feat: enhance logging and memory management across modules

2025-08-13 23:09:05 +02:00
parent 47e52d130c
commit 9601c5e44d
48 changed files with 1004 additions and 773 deletions
@@ -9,6 +9,7 @@ import pandas as pd
 from pydantic import BaseModel

 from core.config.pipeline_config import PipelineConfig
+from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
 from processing.batch.batch_config import BatchConfig


@@ -37,10 +38,11 @@ class PipelineStep(ABC):
    """Abstract base class for pipeline steps"""

    def __init__(
-            self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
+        self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
    ):
        self.name = name
        self.pipeline_config = pipeline_config
+        self.data_loader = DataLoader(pipeline_config)

        # Use provided batch_config or create default from pipeline config
        if batch_config is None:
@@ -53,6 +55,11 @@ class PipelineStep(ABC):
        self.batch_config = batch_config
        self.state = PipelineState()

+    @property
+    def requires_batch_mutation(self) -> bool:
+        """Whether this step modifies the batch data; this base implementation returns False"""
+        return False
+
    @abstractmethod
    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process a single batch of data"""
@@ -108,12 +115,12 @@ class PipelineStep(ABC):
    def save_batch(self, batch: pd.DataFrame, batch_id: int):
        """Save processed batch to checkpoint"""
        checkpoint_path = self.get_checkpoint_path(batch_id)
-        batch.to_csv(checkpoint_path, index=False)
+        self.data_loader.save_csv(batch, checkpoint_path)
        logging.info(f"Saved batch {batch_id} to {checkpoint_path}")

    def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
        """Load processed batch from checkpoint"""
        checkpoint_path = self.get_checkpoint_path(batch_id)
        if os.path.exists(checkpoint_path):
-            return pd.read_csv(checkpoint_path)
+            return self.data_loader.load_csv_complete(checkpoint_path)
        return None
@@ -2,11 +2,10 @@ import numpy as np
 import pandas as pd

 from core.config.pipeline_config import PipelineConfig
-from processing.steps.feature_extraction_step import Gender
-from core.utils.data_loader import DataLoader
-
+from core.utils.region_mapper import RegionMapper
 from processing.batch.batch_config import BatchConfig
 from processing.steps import PipelineStep
+from processing.steps.feature_extraction_step import Gender


 class DataSplittingStep(PipelineStep):
@@ -20,7 +19,6 @@ class DataSplittingStep(PipelineStep):
            use_multiprocessing=False,
        )
        super().__init__("data_splitting", pipeline_config, batch_config)
-        self.data_loader = DataLoader(pipeline_config)
        self.eval_indices = None

    def determine_eval_indices(self, total_size: int) -> set:
@@ -33,9 +31,9 @@ class DataSplittingStep(PipelineStep):

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process batch for data splitting - no modification needed"""
-        return batch.copy()
+        return batch

-    def save_splits(self, df: pd.DataFrame) -> None:
+    def split(self, df: pd.DataFrame) -> None:
        """Save the split datasets based on configuration"""
        output_files = self.pipeline_config.data.output_files
        data_dir = self.pipeline_config.paths.data_dir
@@ -52,9 +50,14 @@ class DataSplittingStep(PipelineStep):
        else:
            self.data_loader.save_csv(df, data_dir / output_files["featured"])

+        if self.pipeline_config.data.split_by_province:
+            for province in RegionMapper.get_provinces():
+                df_region = df[df.province == province]
+                self.data_loader.save_csv(df_region, data_dir / "provinces" / f"{province}.csv")
+
        if self.pipeline_config.data.split_by_gender:
-            df_males = df[df["sex"] == Gender.MALE.value]
-            df_females = df[df["sex"] == Gender.FEMALE.value]
+            df_males = df[df.sex == Gender.MALE.value]
+            df_females = df[df.sex == Gender.FEMALE.value]

            self.data_loader.save_csv(df_males, data_dir / output_files["males"])
            self.data_loader.save_csv(df_females, data_dir / output_files["females"])
@@ -1,5 +1,7 @@
+import gc
 import logging
 from enum import Enum
+from typing import Dict, Any

 import pandas as pd

@@ -27,10 +29,15 @@ class FeatureExtractionStep(PipelineStep):
        self.region_mapper = RegionMapper()
        self.name_tagger = NERNameTagger()

+    @property
+    def requires_batch_mutation(self) -> bool:
+        """True: this step adds new columns to the batch (property, like the PipelineStep base)"""
+        return True
+
    @classmethod
    def validate_gender(cls, gender: str) -> Gender:
        """Validate and normalize gender value"""
-        gender_lower = gender.lower().strip()
+        gender_lower = str(gender).lower().strip()
        if gender_lower in ["m", "male", "homme", "masculin"]:
            return Gender.MALE
        elif gender_lower in ["f", "female", "femme", "féminin"]:
@@ -41,68 +48,144 @@ class FeatureExtractionStep(PipelineStep):
    @classmethod
    def get_name_category(cls, word_count: int) -> NameCategory:
        """Determine name category based on word count"""
-        if word_count == 3:
-            return NameCategory.SIMPLE
-        else:
-            return NameCategory.COMPOSE
+        return NameCategory.SIMPLE if word_count == 3 else NameCategory.COMPOSE

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Extract features from names in batch"""
        logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")

-        batch = batch.copy()
+        result = batch.copy()
+        numeric_features = self._compute_numeric_features(result["name"])
+        result = result.assign(**numeric_features)

-        # Basic features
-        batch["words"] = batch["name"].str.count(" ") + 1
-        batch["length"] = batch["name"].str.len()
+        # Initialize features columns with optimal dtypes
+        features_columns = self._initialize_features_columns(len(result))
+        result = result.assign(**features_columns)

-        # Handle year column
-        if "year" in batch.columns:
-            batch["year"] = pd.to_numeric(batch["year"], errors="coerce").astype("Int64")
+        self._assign_probable_names(result)
+        self._process_simple_names(result)
+        result["identified_category"] = self._assign_identified_category(result["words"])

-        # Initialize new columns
-        batch["probable_native"] = None
-        batch["probable_surname"] = None
-        batch["identified_name"] = None
-        batch["identified_surname"] = None
-        batch["ner_entities"] = None
-        batch["ner_tagged"] = 0
-        batch["annotated"] = 0
+        if "year" in result.columns:
+            result["year"] = pd.to_numeric(result["year"], errors="coerce").astype("Int16")

-        # Vectorized category assignment
-        batch["identified_category"] = batch["words"].apply(
-            lambda x: self.get_name_category(x).value
+        if "region" in result.columns:
+            result["province"] = self.region_mapper.map(result["region"])
+            result["province"] = result["province"].astype("category")
+
+        if "sex" in result.columns:
+            result["sex"] = self._normalize_gender(result["sex"])
+
+        # Apply final dtype optimizations
+        result = self._optimize_dtypes(result)
+
+        # Cleanup
+        del numeric_features, features_columns
+        if batch_id % 10 == 0:  # Periodic cleanup
+            gc.collect()
+
+        return result
+
+    @classmethod
+    def _compute_numeric_features(cls, series: pd.Series) -> Dict[str, pd.Series]:
+        """Return vectorized "words" (space count + 1) and "length" (character count) Series"""
+        return {
+            "words": (series.str.count(" ") + 1).astype("Int8"),
+            "length": series.str.len().astype("Int16"),
+        }
+
+    @classmethod
+    def _initialize_features_columns(cls, size: int) -> Dict[str, Any]:
+        """Return index-free placeholder arrays of length size (assign() index-aligns Series)"""
+        return {
+            "probable_native": pd.array([None] * size, dtype="string"),
+            "probable_surname": pd.array([None] * size, dtype="string"),
+            "identified_name": pd.array([None] * size, dtype="string"),
+            "identified_surname": pd.array([None] * size, dtype="string"),
+            "ner_entities": pd.array([None] * size, dtype="string"),
+            "ner_tagged": pd.array([0] * size, dtype="Int8"),
+            "annotated": pd.array([0] * size, dtype="Int8"),
+        }
+
+    @classmethod
+    def _assign_probable_names(cls, df: pd.DataFrame) -> None:
+        """For names of 2+ words, set probable_native (all but last) and probable_surname in place"""
+
+        name_splits = df["name"].str.split()
+        mask = name_splits.str.len() >= 2
+
+        df.loc[mask, "probable_native"] = name_splits[mask].apply(
+            lambda x: " ".join(x[:-1]) if isinstance(x, list) else None
+        )
+        df.loc[mask, "probable_surname"] = name_splits[mask].apply(
+            lambda x: x[-1] if isinstance(x, list) else None
        )

-        # Assign probable_native and probable_surname for all names
-        name_splits = batch["name"].str.split()
-        batch["probable_native"] = name_splits.apply(
-            lambda x: " ".join(x[:-1]) if isinstance(x, list) and len(x) >= 2 else None
-        )
-        batch["probable_surname"] = name_splits.apply(
-            lambda x: x[-1] if isinstance(x, list) and len(x) >= 2 else None
-        )
+    def _assign_identified_category(self, series: pd.Series) -> pd.Series:
+        """Map each word count to its NameCategory value and return a categorical Series"""
+        return series.map(lambda x: self.get_name_category(x).value).astype("category")

-        # Auto-assign for 3-word names
-        three_word_mask = batch["words"] == 3
-        batch.loc[three_word_mask, "identified_name"] = batch.loc[three_word_mask, "probable_native"]
-        batch.loc[three_word_mask, "identified_surname"] = batch.loc[three_word_mask, "probable_surname"]
-        batch.loc[three_word_mask, "annotated"] = 1
+    def _process_simple_names(self, df: pd.DataFrame) -> None:
+        """Auto-annotate 3-word names from the probable columns, then NER-tag each such row"""
+        mask = df["words"] == 3

-        # Tag names with NER entities
-        three_word_rows = batch[three_word_mask]
+        if not mask.any():
+            return
+
+        df.loc[mask, "identified_name"] = df.loc[mask, "probable_native"]
+        df.loc[mask, "identified_surname"] = df.loc[mask, "probable_surname"]
+        df.loc[mask, "annotated"] = 1
+
+        # NER tagging for 3-word names
+        three_word_rows = df[mask]
        for idx, row in three_word_rows.iterrows():
-            entity = self.name_tagger.tag_name(row['name'], row['identified_name'], row['identified_surname'])
+            try:
+                entity = self.name_tagger.tag_name(
+                    row["name"], row["identified_name"], row["identified_surname"]
+                )

-            if entity:
-                batch.at[idx, "ner_entities"] = entity["entities"]
-                batch.at[idx, "ner_tagged"] = 1
+                if entity:
+                    df.at[idx, "ner_entities"] = str(entity["entities"])
+                    df.at[idx, "ner_tagged"] = 1
+            except Exception as e:
+                logging.warning(f"NER tagging failed for row {idx}: {e}")

-        # Map regions to provinces
-        batch["province"] = self.region_mapper.map_regions_vectorized(batch["region"])
+    def _normalize_gender(self, series: pd.Series) -> pd.Series:
+        gender_mapping = {
+            "m": "m",
+            "male": "m",
+            "homme": "m",
+            "masculin": "m",
+            "f": "f",
+            "female": "f",
+            "femme": "f",
+            "féminin": "f",
+        }

-        # Normalize gender
-        if "sex" in batch.columns:
-            batch["sex"] = batch["sex"].apply(lambda x: self.validate_gender(str(x)).value)
+        # Lowercase and strip, then map; values missing from gender_mapping become NaN
+        normalized = series.astype(str).str.lower().str.strip().map(gender_mapping)
+        return normalized.astype("category")

-        return batch
+    @classmethod
+    def _optimize_dtypes(cls, df: pd.DataFrame) -> pd.DataFrame:
+        categories = ["province", "identified_category", "sex"]
+
+        for col in categories:
+            if col in df.columns and df[col].dtype != "category":
+                df[col] = df[col].astype("category")
+
+        # Ensure string columns are proper string dtype
+        string_cols = [
+            "name",
+            "probable_native",
+            "probable_surname",
+            "identified_name",
+            "identified_surname",
+            "ner_entities",
+        ]
+
+        for col in string_cols:
+            if col in df.columns and df[col].dtype == "object":
+                df[col] = df[col].astype("string")
+
+        return df
@@ -24,8 +24,7 @@ class LLMAnnotationStep(PipelineStep):
        batch_config = BatchConfig(
            batch_size=pipeline_config.processing.batch_size,
            max_workers=min(
-                self.llm_config.max_concurrent_requests,
-                pipeline_config.processing.max_workers
+                self.llm_config.max_concurrent_requests, pipeline_config.processing.max_workers
            ),
            checkpoint_interval=pipeline_config.processing.checkpoint_interval,
            use_multiprocessing=pipeline_config.processing.use_multiprocessing,
@@ -98,7 +97,7 @@ class LLMAnnotationStep(PipelineStep):

                # Exponential backoff with jitter
                if attempt < self.llm_config.retry_attempts - 1:
-                    wait_time = (2 ** attempt) + (time.time() % 1)
+                    wait_time = (2**attempt) + (time.time() % 1)
                    time.sleep(min(wait_time, 10))

        self.failed_requests += 1
@@ -156,6 +155,8 @@ class LLMAnnotationStep(PipelineStep):
                        batch.loc[idx, "annotated"] = 0

        # Ensure proper data types
-        batch["annotated"] = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
+        batch["annotated"] = (
+            pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
+        )

        return batch
@@ -6,8 +6,8 @@ from typing import Dict
 import pandas as pd

 from core.config.pipeline_config import PipelineConfig
-from processing.steps import PipelineStep, NameAnnotation
 from processing.ner.ner_name_model import NERNameModel
+from processing.steps import PipelineStep, NameAnnotation


 class NERAnnotationStep(PipelineStep):
@@ -63,7 +63,7 @@ class NERAnnotationStep(PipelineStep):

                # Get NER predictions
                prediction = self.ner_trainer.predict(name.lower())
-                entities = prediction.get('entities', [])
+                entities = prediction.get("entities", [])

                elapsed_time = time.time() - start_time

@@ -72,15 +72,15 @@ class NERAnnotationStep(PipelineStep):
                surname_parts = []

                for entity in entities:
-                    if entity['label'] == 'NATIVE':
-                        native_parts.append(entity['text'])
-                    elif entity['label'] == 'SURNAME':
-                        surname_parts.append(entity['text'])
+                    if entity["label"] == "NATIVE":
+                        native_parts.append(entity["text"])
+                    elif entity["label"] == "SURNAME":
+                        surname_parts.append(entity["text"])

                # Create annotation result in same format as LLM step
                annotation = NameAnnotation(
                    identified_name=" ".join(native_parts) if native_parts else None,
-                    identified_surname=" ".join(surname_parts) if surname_parts else None
+                    identified_surname=" ".join(surname_parts) if surname_parts else None,
                )

                result = {
@@ -159,6 +159,8 @@ class NERAnnotationStep(PipelineStep):
                        batch.loc[idx, "annotated"] = 0

        # Ensure proper data types
-        batch["annotated"] = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
+        batch["annotated"] = (
+            pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
+        )

        return batch