feat: enhance logging and memory management across modules

2025-08-13 23:09:05 +02:00
parent 47e52d130c
commit 9601c5e44d
48 changed files with 1004 additions and 773 deletions
+60 -16
@@ -1,17 +1,41 @@
+import gc
 import logging
 from pathlib import Path
-from typing import Optional, Union, Iterator
+from typing import Optional, Union, Iterator, Dict
 
 import pandas as pd
 
 from core.config.pipeline_config import PipelineConfig
 
+OPTIMIZED_DTYPES = {
+    # Numeric columns with appropriate bit-width
+    "year": "Int16",  # Years fit in a 16-bit integer
+    "words": "Int8",  # Word counts typically < 128
+    "length": "Int16",  # Name lengths fit in 16-bit
+    "annotated": "Int8",  # Binary flag (0/1)
+    "ner_tagged": "Int8",  # Binary flag (0/1)
+    # Categorical columns (memory efficient for repeated values)
+    "sex": "category",
+    "province": "category",
+    "region": "category",
+    "identified_category": "category",
+    "transformation_type": "category",
+    # String columns with proper string dtype
+    "name": "string",
+    "probable_native": "string",
+    "probable_surname": "string",
+    "identified_name": "string",
+    "identified_surname": "string",
+    "ner_entities": "string",
+}
+
 class DataLoader:
     """Reusable data loading utilities"""
 
-    def __init__(self, config: PipelineConfig):
+    def __init__(self, config: PipelineConfig, custom_dtypes: Optional[Dict] = None):
         self.config = config
+        self.dtypes = {**OPTIMIZED_DTYPES, **(custom_dtypes or {})}
 
     def load_csv_chunked(
         self, filepath: Union[str, Path], chunk_size: Optional[int] = None
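
The dtype map above is where most of the memory savings come from: nullable fixed-width integers and categoricals are far smaller than the default int64/object columns. A minimal, self-contained sketch of the effect (column names and values are illustrative, not the project's data):

import pandas as pd

# Compare default dtypes against the optimized ones on a small frame.
df = pd.DataFrame({"year": [1890, 1950, 2020] * 1000, "sex": ["m", "f", "f"] * 1000})
default_bytes = df.memory_usage(deep=True).sum()
optimized = df.astype({"year": "Int16", "sex": "category"})
optimized_bytes = optimized.memory_usage(deep=True).sum()
print(default_bytes, optimized_bytes)  # the Int16/category version is several times smaller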
@@ -19,19 +43,23 @@ class DataLoader:
"""Load CSV file in chunks for memory efficiency"""
chunk_size = chunk_size or self.config.processing.chunk_size
encodings = self.config.processing.encoding_options
filepath = Path(filepath)
for encoding in encodings:
try:
logging.info(f"Attempting to read {filepath} with encoding: {encoding}")
logging.info(f"Reading {filepath} with encoding: {encoding}")
# Read with optimal dtypes
chunk_iter = pd.read_csv(
filepath, encoding=encoding, chunksize=chunk_size, on_bad_lines="skip"
filepath,
encoding=encoding,
chunksize=chunk_size,
on_bad_lines="skip",
dtype=self.dtypes,
)
for i, chunk in enumerate(chunk_iter):
logging.debug(f"Processing chunk {i+1}")
logging.debug(f"Processing optimized chunk {i + 1}")
yield chunk
logging.info(f"Successfully read {filepath} with encoding: {encoding}")
@@ -44,12 +72,20 @@ class DataLoader:
         raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")
 
     def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
-        """Load complete CSV file into memory with size limiting and balancing"""
-        chunks = list(self.load_csv_chunked(filepath))
+        """Load complete CSV with memory optimization"""
+        chunks = []
+        for chunk in self.load_csv_chunked(filepath):
+            chunks.append(chunk)
 
         if not chunks:
             return pd.DataFrame()
 
-        df = pd.concat(chunks, ignore_index=True)
+        logging.info(f"Concatenating {len(chunks)} optimized chunks")
+        df = pd.concat(chunks, ignore_index=True, copy=False)
+
+        # Clean up chunks from memory
+        del chunks
+        gc.collect()
 
         # Apply dataset size limiting if configured
         if self.config.data.max_dataset_size is not None:
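
The concat-then-release pattern keeps the chunk list and the concatenated frame alive together only briefly: deleting the list drops the last references to the chunk frames, and the explicit gc.collect() prompts collection right away. A minimal sketch of the same pattern:

import gc

import pandas as pd

# Build throwaway chunks, concatenate once, then release the list.
chunks = [pd.DataFrame({"x": range(1_000)}) for _ in range(5)]
df = pd.concat(chunks, ignore_index=True, copy=False)
del chunks  # chunk frames become unreachable here
gc.collect()  # reclaim promptly instead of waiting for the next collection
print(len(df))  # 5000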
@@ -87,27 +123,35 @@ class DataLoader:
         balanced_samples = []
         for i, sex in enumerate(sex_values):
-            sex_df = df[df["sex"] == sex]
+            # Use boolean indexing instead of creating temporary DataFrames
+            sex_mask = df["sex"] == sex
+            sex_indices = df[sex_mask].index
 
             # Distribute remaining samples to first categories
             current_samples = samples_per_sex + (1 if i < remaining_samples else 0)
-            current_samples = min(current_samples, len(sex_df))
+            current_samples = min(current_samples, len(sex_indices))
 
             if current_samples > 0:
-                sample = sex_df.sample(n=current_samples, random_state=self.config.data.random_seed + i)
-                balanced_samples.append(sample)
+                # Sample indices instead of DataFrames
+                sampled_indices = pd.Series(sex_indices).sample(
+                    n=current_samples, random_state=self.config.data.random_seed + i
+                )
+                balanced_samples.extend(sampled_indices.tolist())
                 logging.info(f"Sampled {current_samples} records for sex '{sex}'")
 
         if not balanced_samples:
             logging.warning("No balanced samples could be created, using random sampling")
             return df.sample(n=max_size, random_state=self.config.data.random_seed)
 
-        result = pd.concat(balanced_samples, ignore_index=True)
+        # Create result using iloc with indices (no copying until final step)
+        result = df.iloc[balanced_samples].copy()
 
         # Shuffle the final result
-        result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(drop=True)
+        result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
+            drop=True
+        )
 
-        logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total records")
+        logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total")
         return result
 
     @classmethod
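
One caveat on the new sampling path: sex_indices holds index labels while iloc is positional, so the two agree only because load_csv_complete concatenates with ignore_index=True and therefore yields a default RangeIndex. A self-contained sketch of the index-based balancing under that assumption (toy data, hypothetical seed):

import pandas as pd

# Labels equal positions on a RangeIndex, so iloc works on the sampled labels.
df = pd.DataFrame({"sex": ["m", "m", "f", "f", "f"], "name": list("abcde")})
picked = []
for i, sex in enumerate(df["sex"].unique()):
    indices = df.index[df["sex"] == sex]
    sampled = pd.Series(indices).sample(n=2, random_state=42 + i)
    picked.extend(sampled.tolist())
result = df.iloc[picked].copy()  # a single copy at the very end, as in the diff
print(result)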
-2
@@ -1,3 +1 @@
+3 -7
@@ -9,14 +9,9 @@ class RegionMapper:
     def __init__(self, mapping: Optional[Dict] = None):
         self.mapping = mapping or REGION_MAPPING
 
-    def map_region_to_province(self, region: str) -> str:
-        """Map a region to its province"""
-        region_lower = str(region).lower().strip()
-        return self.mapping.get(region_lower, ("AUTRES", "AUTRES"))[1].lower()
-
-    def map_regions_vectorized(self, regions: pd.Series) -> pd.Series:
+    def map(self, series: pd.Series) -> pd.Series:
         """Vectorized region to province mapping"""
-        return regions.str.lower().map(
+        return series.str.lower().map(
             lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
         )
@@ -34,6 +29,7 @@ class RegionMapper:
"sud-kivu",
"kasai-occidental",
"kasai-oriental",
"autres",
]
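
A hypothetical usage sketch for the renamed mapper (the mapping entries are illustrative stand-ins for the project's REGION_MAPPING tuples, and RegionMapper is assumed to be in scope):

import pandas as pd

# Two illustrative entries; the real table lives in REGION_MAPPING.
mapping = {"sud-kivu": ("RDC", "SUD-KIVU"), "nord-kivu": ("RDC", "NORD-KIVU")}
mapper = RegionMapper(mapping=mapping)
provinces = mapper.map(pd.Series(["Sud-Kivu", "inconnu"]))
print(provinces.tolist())  # ['sud-kivu', 'autres']; unknown regions fall back to AUTRES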