feat: enhance logging and memory management across modules

commit 9601c5e44d
parent 47e52d130c
Date: 2025-08-13 23:09:05 +02:00
48 changed files with 1004 additions and 773 deletions
+23 -12
@@ -13,10 +13,17 @@ class BaseNameFormatter(ABC):
    """
    def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None):
-        self.connectors = connectors or ['wa', 'ya', 'ka', 'ba']
+        self.connectors = connectors or ["wa", "ya", "ka", "ba"]
        self.additional_surnames = additional_surnames or [
-            'jean', 'paul', 'marie', 'joseph', 'pierre', 'claude',
-            'andre', 'michel', 'robert'
+            "jean",
+            "paul",
+            "marie",
+            "joseph",
+            "pierre",
+            "claude",
+            "andre",
+            "michel",
+            "robert",
        ]

    @classmethod
@@ -26,7 +33,9 @@ class BaseNameFormatter(ABC):
            return []
        return native_str.strip().split()

-    def create_ner_tags(self, text: str, native_parts: List[str], surname: str) -> List[Tuple[int, int, str]]:
+    def create_ner_tags(
+        self, text: str, native_parts: List[str], surname: str
+    ) -> List[Tuple[int, int, str]]:
        """Create NER entity tags for transformed text"""
        entities = []
        current_pos = 0
@@ -38,15 +47,15 @@ class BaseNameFormatter(ABC):
            # Determine tag based on word content
            if word in native_parts or any(connector in word for connector in self.connectors):
-                tag = 'NATIVE'
+                tag = "NATIVE"
            elif word == surname or word in self.additional_surnames:
-                tag = 'SURNAME'
+                tag = "SURNAME"
            else:
                # Check if it's a compound native word or new surname
                if any(part in word for part in native_parts):
-                    tag = 'NATIVE'
+                    tag = "NATIVE"
                else:
-                    tag = 'SURNAME'
+                    tag = "SURNAME"

            entities.append((start_pos, end_pos, tag))
            current_pos = end_pos + 1  # +1 for space
@@ -54,15 +63,17 @@ class BaseNameFormatter(ABC):
        return entities

    @classmethod
-    def compute_derived_attributes(cls, name: str) -> Dict:
+    def compute_numeric_features(cls, name: str) -> Dict:
        """Compute all derived attributes for the transformed name"""
        words_count = len(name.split()) if name else 0
        length = len(name) if name else 0
        return {
-            'words': words_count,
-            'length': length,
-            'identified_category': NameCategory.SIMPLE if words_count == 3 else NameCategory.COMPOSE,
+            "words": words_count,
+            "length": length,
+            "identified_category": (
+                NameCategory.SIMPLE.value if words_count == 3 else NameCategory.COMPOSE.value
+            ),
        }

    @abstractmethod
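
Note: the rename from compute_derived_attributes to compute_numeric_features also switches the category value from the enum member to its .value, so downstream CSVs hold plain strings. A minimal sketch of the resulting dict, assuming NameCategory is an Enum with string values (the real definition lives elsewhere in the repo and is not shown in this diff):

    # Hypothetical illustration only; NameCategory is assumed, not part of this diff.
    from enum import Enum

    class NameCategory(Enum):
        SIMPLE = "simple"
        COMPOSE = "compose"

    # BaseNameFormatter.compute_numeric_features("kabongo wa ilunga") would then return
    # {"words": 3, "length": 17, "identified_category": "simple"}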
+14 -12
@@ -8,8 +8,8 @@ from processing.ner.formats import BaseNameFormatter

class ConnectorFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
-        native_parts = self.parse_native_components(row['probable_native'])
-        surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
+        native_parts = self.parse_native_components(row["probable_native"])
+        surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
        connector = random.choice(self.connectors)

        # Connect native parts with a random connector
@@ -17,20 +17,22 @@ class ConnectorFormatter(BaseNameFormatter):
            connected_native = f" {connector} ".join(native_parts)
            full_name = f"{connected_native} {surname}".strip()
        else:
-            connected_native = f"{row['probable_native']} {connector} {row['probable_native']}".strip()
+            connected_native = (
+                f"{row['probable_native']} {connector} {row['probable_native']}".strip()
+            )
            full_name = f"{connected_native} {surname}".strip()

        return {
-            'name': full_name,
-            'probable_native': connected_native,
-            'identify_name': connected_native,
-            'probable_surname': surname,
-            'identify_surname': surname,
-            'ner_entities': str(self.create_ner_tags(full_name, native_parts, surname)),
-            'transformation_type': self.transformation_type,
-            **self.compute_derived_attributes(full_name)
+            "name": full_name,
+            "probable_native": connected_native,
+            "identified_name": connected_native,
+            "probable_surname": surname,
+            "identified_surname": surname,
+            "ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
+            "transformation_type": self.transformation_type,
+            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
-        return 'connector_added'
+        return "connector_added"
@@ -8,8 +8,8 @@ from processing.ner.formats import BaseNameFormatter

class ExtendedSurnameFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
-        native_parts = self.parse_native_components(row['probable_native'])
-        original_surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
+        native_parts = self.parse_native_components(row["probable_native"])
+        original_surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""

        # Add random additional surname
        additional_surname = random.choice(self.additional_surnames)
@@ -17,16 +17,16 @@ class ExtendedSurnameFormatter(BaseNameFormatter):
        full_name = f"{row['probable_native']} {combined_surname}".strip()

        return {
-            'name': full_name,
-            'probable_native': row['probable_native'],
-            'identify_name': row['probable_native'],
-            'probable_surname': combined_surname,
-            'identity_surname': combined_surname,
-            'ner_entities': str(self.create_ner_tags(full_name, native_parts, combined_surname)),
-            'transformation_type': self.transformation_type,
-            **self.compute_derived_attributes(full_name)
+            "name": full_name,
+            "probable_native": row["probable_native"],
+            "identified_name": row["probable_native"],
+            "probable_surname": combined_surname,
+            "identified_surname": combined_surname,
+            "ner_entities": str(self.create_ner_tags(full_name, native_parts, combined_surname)),
+            "transformation_type": self.transformation_type,
+            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
-        return 'extended_surname'
+        return "extended_surname"
+11 -11
@@ -7,22 +7,22 @@ from processing.ner.formats import BaseNameFormatter

class NativeOnlyFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
-        native_parts = self.parse_native_components(row['probable_native'])
+        native_parts = self.parse_native_components(row["probable_native"])

        # Only native components
-        full_name = row['probable_native']
+        full_name = row["probable_native"]

        return {
-            'name': full_name,
-            'probable_native': row['probable_native'],
-            'identify_name': row['probable_native'],
-            'probable_surname': '',
-            'identify_surname': '',
-            'ner_entities': str(self.create_ner_tags(full_name, native_parts, '')),
-            'transformation_type': self.transformation_type,
-            **self.compute_derived_attributes(full_name)
+            "name": full_name,
+            "probable_native": row["probable_native"],
+            "identified_name": row["probable_native"],
+            "probable_surname": "",
+            "identified_surname": "",
+            "ner_entities": str(self.create_ner_tags(full_name, native_parts, "")),
+            "transformation_type": self.transformation_type,
+            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
-        return 'native_only'
+        return "native_only"
+11 -11
@@ -7,23 +7,23 @@ from processing.ner.formats import BaseNameFormatter

class OriginalFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
-        native_parts = self.parse_native_components(row['probable_native'])
-        surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
+        native_parts = self.parse_native_components(row["probable_native"])
+        surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""

        # Keep original order: native components + surname
        full_name = f"{row['probable_native']} {surname}".strip()

        return {
-            'name': full_name,
-            'probable_native': row['probable_native'],
-            'identify_name': row['probable_native'],
-            'probable_surname': surname,
-            'identify_surname': surname,
-            'ner_entities': str(self.create_ner_tags(full_name, native_parts, surname)),
-            'transformation_type': self.transformation_type,
-            **self.compute_derived_attributes(full_name)
+            "name": full_name,
+            "probable_native": row["probable_native"],
+            "identified_name": row["probable_native"],
+            "probable_surname": surname,
+            "identified_surname": surname,
+            "ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
+            "transformation_type": self.transformation_type,
+            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
-        return 'original'
+        return "original"
@@ -7,23 +7,23 @@ from processing.ner.formats import BaseNameFormatter

class PositionFlippedFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
-        native_parts = self.parse_native_components(row['probable_native'])
-        surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
+        native_parts = self.parse_native_components(row["probable_native"])
+        surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""

        # Flip order: surname + native components
        full_name = f"{surname} {row['probable_native']}".strip()

        return {
-            'name': full_name,
-            'probable_native': row['probable_native'],
-            'identify_name': row['probable_native'],
-            'probable_surname': surname,
-            'identify_surname': surname,
-            'ner_entities': str(self.create_ner_tags(full_name, native_parts, surname)),
-            'transformation_type': self.transformation_type,
-            **self.compute_derived_attributes(full_name)
+            "name": full_name,
+            "probable_native": row["probable_native"],
+            "identified_name": row["probable_native"],
+            "probable_surname": surname,
+            "identified_surname": surname,
+            "ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
+            "transformation_type": self.transformation_type,
+            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
-        return 'position_flipped'
+        return "position_flipped"
+12 -12
@@ -7,24 +7,24 @@ from processing.ner.formats import BaseNameFormatter

class ReducedNativeFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
-        native_parts = self.parse_native_components(row['probable_native'])
-        surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
+        native_parts = self.parse_native_components(row["probable_native"])
+        surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""

        # Keep only first native component + surname
-        reduced_native = native_parts[0] if len(native_parts) > 1 else row['probable_native']
+        reduced_native = native_parts[0] if len(native_parts) > 1 else row["probable_native"]
        full_name = f"{reduced_native} {surname}".strip()

        return {
-            'name': full_name,
-            'probable_native': reduced_native,
-            'identify_name': reduced_native,
-            'probable_surname': surname,
-            'identify_surname': surname,
-            'ner_entities': str(self.create_ner_tags(full_name, [reduced_native], surname)),
-            'transformation_type': self.transformation_type,
-            **self.compute_derived_attributes(full_name)
+            "name": full_name,
+            "probable_native": reduced_native,
+            "identified_name": reduced_native,
+            "probable_surname": surname,
+            "identified_surname": surname,
+            "ner_entities": str(self.create_ner_tags(full_name, [reduced_native], surname)),
+            "transformation_type": self.transformation_type,
+            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
-        return 'reduced_native'
+        return "reduced_native"
+122 -168
@@ -10,189 +10,143 @@ from spacy.util import filter_spans
from core.config import PipelineConfig
from core.utils import get_data_file_path
+from core.utils.data_loader import DataLoader


class NERDataBuilder:
    def __init__(self, config: PipelineConfig):
        self.config = config
+        self.data_loader = DataLoader(config)

-    @classmethod
-    def parse_entities(cls, entities_str):
-        """Parse entity string (tuple format or JSON) into spaCy-style tuples."""
-        if not entities_str or entities_str in ["[]", "", "nan"]:
-            return []
-        entities_str = str(entities_str).strip()
-        # Handle different formats
-        try:
-            # Try to parse as Python literal (tuples or lists)
-            if entities_str.startswith("[(") and entities_str.endswith(")]"):
-                # Standard tuple format: [(0, 6, 'NATIVE'), ...]
-                return ast.literal_eval(entities_str)
-            elif entities_str.startswith("[[") and entities_str.endswith("]]"):
-                # Nested list format: [[0, 6, 'NATIVE'], ...]
-                nested_list = ast.literal_eval(entities_str)
-                return [(start, end, label) for start, end, label in nested_list]
-            elif entities_str.startswith("[{") and entities_str.endswith("}]"):
-                # JSON format: [{"start": 0, "end": 6, "label": "NATIVE"}, ...]
-                json_entities = json.loads(entities_str)
-                return [(e["start"], e["end"], e["label"]) for e in json_entities]
-            else:
-                # Try general ast.literal_eval for other formats
-                parsed = ast.literal_eval(entities_str)
-                if isinstance(parsed, list):
-                    # Convert any list format to tuples
-                    result = []
-                    for item in parsed:
-                        if isinstance(item, (list, tuple)) and len(item) == 3:
-                            result.append((item[0], item[1], item[2]))
-                    return result
-        except (ValueError, SyntaxError, json.JSONDecodeError) as e:
-            logging.warning(f"Failed to parse entities: {entities_str} ({e})")
-            return []
-        logging.warning(f"Unknown entity format: {entities_str}")
-        return []
+    @staticmethod
+    def _parse_entities(series: pd.Series) -> pd.Series:
+        """Vectorized parse of entity strings."""
+
+        def _parse(entities_str):
+            if not entities_str or entities_str in ["[]", "", "nan"]:
+                return []
+            entities_str = str(entities_str).strip()
+            try:
+                if entities_str.startswith("[(") and entities_str.endswith(")]"):
+                    return ast.literal_eval(entities_str)
+                elif entities_str.startswith("[[") and entities_str.endswith("]]"):
+                    return [tuple(e) for e in ast.literal_eval(entities_str)]
+                elif entities_str.startswith("[{") and entities_str.endswith("}]"):
+                    return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
+                else:
+                    parsed = ast.literal_eval(entities_str)
+                    return [
+                        tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3
+                    ]
+            except (ValueError, SyntaxError, json.JSONDecodeError):
+                return []
+
+        return series.map(_parse)

-    @classmethod
-    def validate_entities(cls, entities, text):
-        """Validate and sort entity tuples, removing overlaps and invalid spans."""
-        if not entities or not text:
-            return []
-        text = str(text).strip()
-        if not text:
-            return []
-        # Filter out invalid entities
-        valid_entities = []
-        for entity in entities:
-            if not isinstance(entity, (list, tuple)) or len(entity) != 3:
-                logging.warning(f"Invalid entity format: {entity}")
-                continue
-            start, end, label = entity
-            # Ensure start/end are integers
-            try:
-                start = int(start)
-                end = int(end)
-            except (ValueError, TypeError):
-                logging.warning(f"Invalid start/end positions: {entity}")
-                continue
-            # Ensure label is string
-            if not isinstance(label, str):
-                logging.warning(f"Invalid label type: {entity}")
-                continue
-            # Check bounds
-            if not (0 <= start < end <= len(text)):
-                logging.warning(f"Entity span out of bounds: {entity} for text '{text}' (length {len(text)})")
-                continue
-            # Check that span contains actual text
-            span_text = text[start:end].strip()
-            if not span_text:
-                logging.warning(f"Empty span: {entity} in text '{text}'")
-                continue
-            valid_entities.append((start, end, label))
-        if not valid_entities:
-            return []
-        # Sort by start position
-        valid_entities.sort(key=lambda x: (x[0], x[1]))
-        # Remove overlapping entities (keep the first one)
-        filtered = []
-        for start, end, label in valid_entities:
-            # Check for overlap with already added entities
-            has_overlap = False
-            for e_start, e_end, _ in filtered:
-                if not (end <= e_start or start >= e_end):
-                    has_overlap = True
-                    logging.warning(
-                        f"Removing overlapping entity ({start}, {end}, '{label}') "
-                        f"conflicts with ({e_start}, {e_end}) in '{text}'"
-                    )
-                    break
-            if not has_overlap:
-                filtered.append((start, end, label))
-        return filtered
+    @staticmethod
+    def _validate_entities(texts: pd.Series, entities_series: pd.Series) -> pd.Series:
+        """Vectorized entity validation."""
+
+        def _validate(text, entities):
+            if not entities or not text:
+                return []
+            text = str(text).strip()
+            valid = []
+            for ent in entities:
+                if not isinstance(ent, (list, tuple)) or len(ent) != 3:
+                    continue
+                start, end, label = ent
+                try:
+                    start, end = int(start), int(end)
+                except (ValueError, TypeError):
+                    continue
+                if not isinstance(label, str):
+                    continue
+                if not (0 <= start < end <= len(text)):
+                    continue
+                if not text[start:end].strip():
+                    continue
+                valid.append((start, end, label))
+            if not valid:
+                return []
+            valid.sort(key=lambda x: (x[0], x[1]))
+            # remove overlaps
+            filtered, last_end = [], -1
+            for s, e, l in valid:
+                if s >= last_end:
+                    filtered.append((s, e, l))
+                    last_end = e
+            return filtered
+
+        return pd.Series(map(_validate, texts, entities_series), index=texts.index)

-    @classmethod
-    def create_doc(cls, text, entities, nlp):
-        """Create a spaCy Doc object with entities added."""
-        doc = nlp(text)
-        ents = []
-        for start, end, label in entities:
-            span = doc.char_span(start, end, label=label, alignment_mode="contract") \
-                or doc.char_span(start, end, label=label, alignment_mode="strict")
-            if span:
-                ents.append(span)
-            else:
-                logging.warning(f"Could not create span ({start}, {end}, '{label}') in '{text}'")
-        doc.ents = filter_spans(ents) if ents else []
-        return doc
+    @staticmethod
+    def _create_docs(nlp, texts, entities):
+        """Batch create spaCy Docs."""
+        docs = []
+        for text, ents in zip(texts, entities):
+            doc = nlp(text)
+            spans = []
+            for start, end, label in ents:
+                span = doc.char_span(
+                    start, end, label=label, alignment_mode="contract"
+                ) or doc.char_span(start, end, label=label, alignment_mode="strict")
+                if span:
+                    spans.append(span)
+            doc.ents = filter_spans(spans)
+            docs.append(doc)
+        return docs

-    def build(self, data: pd.DataFrame = None) -> int:
-        """Build the dataset for NER training."""
-        logging.info("Building dataset for NER training")
-        try:
-            df = pd.read_csv(get_data_file_path("names_featured.csv", self.config)) \
-                if data is None \
-                else data
-            ner_df = df[df["ner_tagged"] == 1].copy()
-            if ner_df.empty:
-                logging.error("No NER tagged data found in the CSV")
-                return 1
-            logging.info(f"Found {len(ner_df)} NER tagged entries")
-            nlp = spacy.blank("fr")
-            doc_bin, training_data = DocBin(), []
-            processed_count, skipped_count = 0, 0
-            for _, row in ner_df.iterrows():
-                text = str(row.get("name", "")).strip()
-                if not text:
-                    continue
-                entities = self.parse_entities(row.get("ner_entities", "[]"))
-                entities = self.validate_entities(entities, text)
-                training_data.append((text, {"entities": entities}))
-                try:
-                    doc_bin.add(self.create_doc(text, entities, nlp))
-                    processed_count += 1
-                except Exception as e:
-                    logging.error(f"Error processing '{text}': {e}")
-                    skipped_count += 1
-                    continue
-            if not training_data:
-                logging.error("No valid training examples generated")
-                return 1
-            json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
-            spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]
-            with open(json_path, "w", encoding="utf-8") as f:
-                json.dump(training_data, f, ensure_ascii=False, indent=None)
-            doc_bin.to_disk(spacy_path)
-            logging.info(f"Processed: {processed_count}, Skipped: {skipped_count}")
-            logging.info(f"Saved NER data in json format to {json_path}")
-            logging.info(f"Saved NER data in spaCy format to {spacy_path}")
-            return 0
-        except Exception as e:
-            logging.error(f"Failed to build NER dataset: {e}", exc_info=True)
-            return 1
+    def build(self) -> int:
+        input_filepath = get_data_file_path(
+            self.config.data.output_files["engineered"], self.config
+        )
+        df = self.data_loader.load_csv_complete(input_filepath)
+        df = df[["name", "ner_tagged", "ner_entities"]]
+
+        # Filter early
+        ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
+        if ner_df.empty:
+            logging.error("No NER tagged data found")
+            return 1
+        total_rows = len(df)
+        del df  # No need to keep in memory
+        logging.info(f"Found {len(ner_df)} NER tagged entries")
+
+        nlp = spacy.blank("fr")
+        # Vectorized parsing + validation
+        parsed_entities = self._parse_entities(ner_df["ner_entities"])
+        validated_entities = self._validate_entities(ner_df["name"], parsed_entities)
+
+        # Drop rows with no valid entities
+        mask = validated_entities.map(bool)
+        ner_df = ner_df.loc[mask]
+        validated_entities = validated_entities.loc[mask]
+        if ner_df.empty:
+            logging.error("No valid training examples after validation")
+            return 1
+
+        # Prepare training data
+        training_data = list(
+            zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
+        )
+
+        # Create spaCy DocBin in batch
+        docs = self._create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
+        doc_bin = DocBin(docs=docs)
+
+        # Save
+        json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
+        spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]
+        with open(json_path, "w", encoding="utf-8") as f:
+            json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
+        doc_bin.to_disk(spacy_path)
+
+        logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
+        logging.info(f"Saved NER JSON to {json_path}")
+        logging.info(f"Saved NER spacy to {spacy_path}")
+        return 0
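
Note: the new _parse_entities accepts three explicit string encodings plus a generic literal_eval fallback; illustrative inputs that all normalize to the same tuple list:

    s1 = "[(0, 7, 'NATIVE'), (8, 13, 'SURNAME')]"        # Python tuple literal
    s2 = '[[0, 7, "NATIVE"], [8, 13, "SURNAME"]]'        # nested lists
    s3 = '[{"start": 0, "end": 7, "label": "NATIVE"}]'   # JSON objects
    # each parses to [(0, 7, "NATIVE"), ...]; unparseable strings fall back to []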
+66 -53
@@ -1,9 +1,14 @@
import random
from typing import List
+import logging

import numpy as np
import pandas as pd
+from tqdm import tqdm

+from core.config import PipelineConfig
+from core.utils import get_data_file_path
+from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
from processing.ner.formats.connectors_format import ConnectorFormatter
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
from processing.ner.formats.native_only_format import NativeOnlyFormatter
@@ -18,50 +23,64 @@ class NEREngineering:
    and encourage sequence characteristic learning.
    """

-    def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None):
-        self.connectors = connectors or ['wa', 'ya', 'ka', 'ba', 'la']
-        self.additional_surnames = additional_surnames or [
-            'jean', 'paul', 'marie', 'joseph', 'pierre', 'claude',
-            'andre', 'michel', 'robert'
+    def __init__(self, config: PipelineConfig):
+        self.config = config
+        self.data_loader = DataLoader(config)
+        self.connectors = ["wa", "ya", "ka", "ba", "la"]
+        self.additional_surnames = [
+            "jean",
+            "paul",
+            "marie",
+            "joseph",
+            "pierre",
+            "claude",
+            "andre",
+            "michel",
+            "robert",
        ]
+        random.seed(self.config.data.random_seed)
+        np.random.seed(self.config.data.random_seed)

        # Initialize format classes
        self.formatters = {
-            'original': OriginalFormatter(self.connectors, self.additional_surnames),
-            'native_only': NativeOnlyFormatter(self.connectors, self.additional_surnames),
-            'position_flipped': PositionFlippedFormatter(self.connectors, self.additional_surnames),
-            'reduced_native': ReducedNativeFormatter(self.connectors, self.additional_surnames),
-            'connector_added': ConnectorFormatter(self.connectors, self.additional_surnames),
-            'extended_surname': ExtendedSurnameFormatter(self.connectors, self.additional_surnames)
+            "original": OriginalFormatter(self.connectors, self.additional_surnames),
+            "native_only": NativeOnlyFormatter(self.connectors, self.additional_surnames),
+            "position_flipped": PositionFlippedFormatter(self.connectors, self.additional_surnames),
+            "reduced_native": ReducedNativeFormatter(self.connectors, self.additional_surnames),
+            "connector_added": ConnectorFormatter(self.connectors, self.additional_surnames),
+            "extended_surname": ExtendedSurnameFormatter(self.connectors, self.additional_surnames),
        }

-    @classmethod
-    def load_ner_data(cls, filepath: str) -> pd.DataFrame:
+    def load_data(self) -> pd.DataFrame:
        """Load and filter NER-tagged data from CSV file"""
-        df = pd.read_csv(filepath)
+        filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
+        df = self.data_loader.load_csv_complete(filepath)

        # Filter only NER-tagged rows
-        ner_data = df[df['ner_tagged'] == 1].copy()
-        print(f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records")
+        ner_data = df[df["ner_tagged"] == 1].copy()
+        logging.info(f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records")
        return ner_data

-    def engineer_dataset(self, df: pd.DataFrame, random_seed: int = 42) -> pd.DataFrame:
-        """
-        Apply feature engineering transformations according to the specified rules:
-        - First 25%: original format
-        - Second 25%: remove surname
-        - Third 25%: flip positions
-        - Fourth 10%: reduce native components
-        - Fifth 10%: add connectors
-        - Last 5%: extend surnames
-        """
-        random.seed(random_seed)
-        np.random.seed(random_seed)
+    def compute(self) -> None:
+        logging.info("Applying feature engineering transformations...")
+        input_filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
+        output_filepath = get_data_file_path(
+            self.config.data.output_files["engineered"], self.config
+        )

-        # Shuffle the dataset
-        df_shuffled = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
-        total_rows = len(df_shuffled)
+        df = self.data_loader.load_csv_complete(input_filepath)
+        ner_df = df[df["ner_tagged"] == 1].copy()
+        logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records")
+        del df  # No need to keep in memory
+        ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
+            drop=True
+        )
+        total_rows = len(ner_df)

        # Calculate split points
        split_25_1 = int(total_rows * 0.25)
@@ -71,37 +90,31 @@ class NEREngineering:
        split_10_2 = int(total_rows * 0.95)

        # Define transformation groups
-        transformation_groups = [
-            (0, split_25_1, 'original'),
-            (split_25_1, split_25_2, 'native_only'),
-            (split_25_2, split_25_3, 'position_flipped'),
-            (split_25_3, split_10_1, 'reduced_native'),
-            (split_10_1, split_10_2, 'connector_added'),
-            (split_10_2, total_rows, 'extended_surname')
+        groups = [
+            (0, split_25_1, "original"),  # First 25%: original format
+            (split_25_1, split_25_2, "native_only"),  # Second 25%: remove surname
+            (split_25_2, split_25_3, "position_flipped"),  # Third 25%: flip positions
+            (split_25_3, split_10_1, "reduced_native"),  # Fourth 10%: reduce native components
+            (split_10_1, split_10_2, "connector_added"),  # Fifth 10%: add connectors
+            (split_10_2, total_rows, "extended_surname"),  # Last 5%: extend surnames
        ]

-        print("Dataset splits:")
-        for start, end, trans_type in transformation_groups:
-            print(f"Group {trans_type}: {start} to {end} ({end - start} rows)")
+        for start, end, trans_type in groups:
+            logging.info(f"Group {trans_type}: {start} to {end} ({end - start} rows)")

        # Process each group
-        engineered_rows = []
-        for start, end, formatter_key in transformation_groups:
+        rows = []
+        for start, end, formatter_key in groups:
            formatter = self.formatters[formatter_key]
-            for idx in range(start, end):
-                row = df_shuffled.iloc[idx]
+            for idx in tqdm(range(start, end), desc=f"Processing {formatter_key}"):
+                row = ner_df.iloc[idx]
                transformed = formatter.transform(row)
                # Keep original columns and add transformed ones
                new_row = row.to_dict()
                new_row.update(transformed)
-                engineered_rows.append(new_row)
+                rows.append(new_row)

-        return pd.DataFrame(engineered_rows)
-
-    @classmethod
-    def save_engineered_dataset(cls, df: pd.DataFrame, output_path: str):
-        """Save the engineered dataset to CSV file"""
-        df.to_csv(output_path, index=False)
-        print(f"Engineered dataset saved to {output_path}")
+        self.data_loader.save_csv(pd.DataFrame(rows), output_filepath)
+        logging.info(f"Engineered dataset saved to {output_filepath}")
+69 -45
@@ -48,7 +48,7 @@ class NERNameModel:
        logging.info(f"Loading training data from {data_path}")

-        with open(data_path, 'r', encoding='utf-8') as f:
+        with open(data_path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        # Validate and clean training data
@@ -58,7 +58,9 @@ class NERNameModel:
        for i, item in enumerate(raw_data):
            try:
                if not isinstance(item, (list, tuple)) or len(item) != 2:
-                    logging.warning(f"Skipping invalid training example format at index {i}: {item}")
+                    logging.warning(
+                        f"Skipping invalid training example format at index {i}: {item}"
+                    )
                    skipped_count += 1
                    continue
@@ -83,20 +85,27 @@ class NERNameModel:
                    # String format from tagger: "[(0, 6, 'NATIVE'), ...]"
                    try:
                        import ast
                        entities = ast.literal_eval(entities_raw)
                        if not isinstance(entities, list):
-                            logging.warning(f"Parsed entities is not a list at index {i}: {entities}")
+                            logging.warning(
+                                f"Parsed entities is not a list at index {i}: {entities}"
+                            )
                            skipped_count += 1
                            continue
                    except (ValueError, SyntaxError) as e:
-                        logging.warning(f"Failed to parse entity string at index {i}: {entities_raw} ({e})")
+                        logging.warning(
+                            f"Failed to parse entity string at index {i}: {entities_raw} ({e})"
+                        )
                        skipped_count += 1
                        continue
                elif isinstance(entities_raw, list):
                    # Already in list format
                    entities = entities_raw
                else:
-                    logging.warning(f"Skipping invalid entities format at index {i}: {entities_raw}")
+                    logging.warning(
+                        f"Skipping invalid entities format at index {i}: {entities_raw}"
+                    )
                    skipped_count += 1
                    continue
@@ -110,16 +119,20 @@ class NERNameModel:
                    start, end, label = entity

                    # Validate entity components
-                    if (not isinstance(start, int) or not isinstance(end, int) or
-                        not isinstance(label, str) or start >= end or
-                        start < 0 or end > len(text)):
+                    if (
+                        not isinstance(start, int)
+                        or not isinstance(end, int)
+                        or not isinstance(label, str)
+                        or start >= end
+                        or start < 0
+                        or end > len(text)
+                    ):
                        logging.warning(f"Skipping invalid entity bounds in '{text}': {entity}")
                        continue

                    # Check for overlaps with already validated entities
                    has_overlap = any(
-                        start < v_end and end > v_start
-                        for v_start, v_end, _ in valid_entities
+                        start < v_end and end > v_start for v_start, v_end, _ in valid_entities
                    )

                    if has_overlap:
@@ -128,8 +141,10 @@ class NERNameModel:
                    # Validate that the span doesn't contain spaces (matching tagger validation)
                    span_text = text[start:end]
-                    if not span_text or span_text != span_text.strip() or ' ' in span_text:
-                        logging.warning(f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'")
+                    if not span_text or span_text != span_text.strip() or " " in span_text:
+                        logging.warning(
+                            f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
+                        )
                        continue

                    valid_entities.append((start, end, label))
@@ -148,7 +163,9 @@ class NERNameModel:
                skipped_count += 1
                continue

-        logging.info(f"Loaded {len(valid_data)} valid training examples, skipped {skipped_count} invalid ones")
+        logging.info(
+            f"Loaded {len(valid_data)} valid training examples, skipped {skipped_count} invalid ones"
+        )

        if not valid_data:
            raise ValueError("No valid training examples found in the data")
@@ -156,15 +173,17 @@ class NERNameModel:
        return valid_data

    def train(
-            self,
-            data: List[Tuple[str, Dict]],
-            epochs: int = 5,
-            batch_size: int = 16,
-            dropout_rate: float = 0.2,
+        self,
+        data: List[Tuple[str, Dict]],
+        epochs: int = 5,
+        batch_size: int = 16,
+        dropout_rate: float = 0.2,
    ) -> None:
        """Train the NER model"""
        logging.info(f"Starting NER training with {len(data)} examples")
-        logging.info(f"Training parameters: epochs={epochs}, batch_size={batch_size}, dropout={dropout_rate}")
+        logging.info(
+            f"Training parameters: epochs={epochs}, batch_size={batch_size}, dropout={dropout_rate}"
+        )

        if self.nlp is None:
            raise ValueError("Model not initialized. Call create_blank_model() first.")
@@ -184,16 +203,15 @@ class NERNameModel:
                doc = self.nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                examples.append(example)
-                logging.info(f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}")
+                logging.info(
+                    f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}"
+                )

            # Train in batches
            batches = minibatch(examples, size=batch_size)
            for batch in batches:
                self.nlp.update(
-                    batch,
-                    losses=losses,
-                    drop=dropout_rate,
-                    sgd=self.nlp.create_optimizer()
+                    batch, losses=losses, drop=dropout_rate, sgd=self.nlp.create_optimizer()
                )
                logging.info(f"Training batch with {len(batch)} examples, current losses: {losses}")
@@ -208,7 +226,7 @@ class NERNameModel:
            "training_examples": len(data),
            "loss_history": losses_history,
            "batch_size": batch_size,
-            "dropout_rate": dropout_rate
+            "dropout_rate": dropout_rate,
        }

        logging.info(f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}")
@@ -225,7 +243,10 @@ class NERNameModel:
        predicted_entities = 0
        actual_entities = 0

-        entity_stats = {"NATIVE": {"tp": 0, "fp": 0, "fn": 0}, "SURNAME": {"tp": 0, "fp": 0, "fn": 0}}
+        entity_stats = {
+            "NATIVE": {"tp": 0, "fp": 0, "fn": 0},
+            "SURNAME": {"tp": 0, "fp": 0, "fn": 0},
+        }

        for text, annotations in test_data:
            # Get actual entities
@@ -259,7 +280,9 @@ class NERNameModel:
        # Calculate overall metrics
        precision = correct_entities / predicted_entities if predicted_entities > 0 else 0
        recall = correct_entities / actual_entities if actual_entities > 0 else 0
-        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+        f1_score = (
+            2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+        )

        # Calculate per-label metrics
        label_metrics = {}
@@ -268,14 +291,16 @@ class NERNameModel:
            label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            label_f1 = (
-                2 * (label_precision * label_recall) / (label_precision + label_recall)) \
-                if (label_precision + label_recall) > 0 else 0
+                (2 * (label_precision * label_recall) / (label_precision + label_recall))
+                if (label_precision + label_recall) > 0
+                else 0
+            )

            label_metrics[label] = {
                "precision": label_precision,
                "recall": label_recall,
                "f1_score": label_f1,
-                "support": tp + fn
+                "support": tp + fn,
            }

        evaluation_results = {
@@ -286,9 +311,9 @@ class NERNameModel:
                "total_examples": total_examples,
                "correct_entities": correct_entities,
                "predicted_entities": predicted_entities,
-                "actual_entities": actual_entities
+                "actual_entities": actual_entities,
            },
-            "by_label": label_metrics
+            "by_label": label_metrics,
        }

        logging.info(f"NER Evaluation completed. Overall F1: {f1_score:.4f}")
@@ -309,7 +334,7 @@ class NERNameModel:
        # Save training statistics
        stats_path = model_dir / "training_stats.json"
-        with open(stats_path, 'w', encoding='utf-8') as f:
+        with open(stats_path, "w", encoding="utf-8") as f:
            json.dump(self.training_stats, f, indent=2)

        logging.info(f"NER Model saved to {model_dir}")
@@ -328,7 +353,7 @@ class NERNameModel:
        # Load training statistics if available
        stats_path = Path(model_path) / "training_stats.json"
        if stats_path.exists():
-            with open(stats_path, 'r', encoding='utf-8') as f:
+            with open(stats_path, "r", encoding="utf-8") as f:
                self.training_stats = json.load(f)

        logging.info("NER Model loaded successfully")
@@ -342,15 +367,14 @@ class NERNameModel:
        entities = []
        for ent in doc.ents:
-            entities.append({
-                "text": ent.text,
-                "label": ent.label_,
-                "start": ent.start_char,
-                "end": ent.end_char,
-                "confidence": getattr(ent, 'score', None)  # If confidence scores are available
-            })
+            entities.append(
+                {
+                    "text": ent.text,
+                    "label": ent.label_,
+                    "start": ent.start_char,
+                    "end": ent.end_char,
+                    "confidence": getattr(ent, "score", None),  # If confidence scores are available
+                }
+            )

-        return {
-            "text": text,
-            "entities": entities
-        }
+        return {"text": text, "entities": entities}
+25 -13
@@ -3,7 +3,9 @@ import logging

class NERNameTagger:
-    def tag_name(self, name: str, probable_native: str, probable_surname: str) -> Union[Dict[str, Any], None]:
+    def tag_name(
+        self, name: str, probable_native: str, probable_surname: str
+    ) -> Union[Dict[str, Any], None]:
        """Create a single NER training example using probable_native and probable_surname"""
        if not name or not probable_native or not probable_surname:
            return None
@@ -56,9 +58,10 @@ class NERNameTagger:
                    continue

                # Check if this is a word boundary match and doesn't overlap
-                if (self._is_word_boundary_match(name, pos, end_pos) and
-                        not has_overlap(pos, end_pos)):
-                    entities.append((pos, end_pos, 'NATIVE'))
+                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
+                    pos, end_pos
+                ):
+                    entities.append((pos, end_pos, "NATIVE"))
                    used_spans.append((pos, end_pos))
                    break  # Only take the first non-overlapping occurrence
@@ -84,16 +87,19 @@ class NERNameTagger:
                    start_pos = pos + 1
                    continue

-                if (self._is_word_boundary_match(name, pos, end_pos) and
-                        not has_overlap(pos, end_pos)):
-                    entities.append((pos, end_pos, 'SURNAME'))
+                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
+                    pos, end_pos
+                ):
+                    entities.append((pos, end_pos, "SURNAME"))
                    used_spans.append((pos, end_pos))
                    break

                start_pos = pos + 1

        if not entities:
-            logging.warning(f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'")
+            logging.warning(
+                f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
+            )
            return None

        # Sort entities by position and validate
@@ -104,7 +110,9 @@ class NERNameTagger:
        for start, end, label in entities:
            # Check bounds
            if not (0 <= start < end <= len(name)):
-                logging.warning(f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'")
+                logging.warning(
+                    f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
+                )
                continue

            # Check for overlaps with already validated entities
@@ -114,8 +122,10 @@ class NERNameTagger:
            # CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
            span_text = name[start:end]
-            if not span_text or span_text != span_text.strip() or ' ' in span_text:
-                logging.warning(f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'")
+            if not span_text or span_text != span_text.strip() or " " in span_text:
+                logging.warning(
+                    f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
+                )
                continue

            validated_entities.append((start, end, label))
@@ -129,7 +139,7 @@ class NERNameTagger:
        return {
            "entities": entities_str,
-            "spans": validated_entities  # Keep the original tuples for internal use
+            "spans": validated_entities,  # Keep the original tuples for internal use
        }

    @classmethod
@@ -154,6 +164,7 @@ class NERNameTagger:
        """Validate that entity annotations are correct for a given name"""
        try:
            import ast
+
            entities = ast.literal_eval(entities_str)

            # Check for overlaps and valid bounds
@@ -182,10 +193,11 @@ class NERNameTagger:
    @classmethod
    def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
        """Extract the actual text for each entity type"""
-        result = {'NATIVE': [], 'SURNAME': []}
+        result = {"NATIVE": [], "SURNAME": []}
        try:
            import ast
+
            entities = ast.literal_eval(entities_str)
            for start, end, label in entities: