feat: enhance logging and memory management across modules

2025-08-13 23:09:05 +02:00
parent 47e52d130c
commit 9601c5e44d
48 changed files with 1004 additions and 773 deletions
+60 -16
@@ -1,17 +1,41 @@
+import gc
 import logging
 from pathlib import Path
-from typing import Optional, Union, Iterator
+from typing import Optional, Union, Iterator, Dict
 
 import pandas as pd
 
 from core.config.pipeline_config import PipelineConfig
 
+OPTIMIZED_DTYPES = {
+    # Numeric columns with appropriate bit-width
+    "year": "Int16",  # Years fit in a 16-bit integer
+    "words": "Int8",  # Word counts typically < 128
+    "length": "Int16",  # Name lengths fit in 16-bit
+    "annotated": "Int8",  # Binary flag (0/1)
+    "ner_tagged": "Int8",  # Binary flag (0/1)
+    # Categorical columns (memory efficient for repeated values)
+    "sex": "category",
+    "province": "category",
+    "region": "category",
+    "identified_category": "category",
+    "transformation_type": "category",
+    # String columns with proper string dtype
+    "name": "string",
+    "probable_native": "string",
+    "probable_surname": "string",
+    "identified_name": "string",
+    "identified_surname": "string",
+    "ner_entities": "string",
+}
+
 class DataLoader:
     """Reusable data loading utilities"""
 
-    def __init__(self, config: PipelineConfig):
+    def __init__(self, config: PipelineConfig, custom_dtypes: Optional[Dict] = None):
         self.config = config
+        self.dtypes = {**OPTIMIZED_DTYPES, **(custom_dtypes or {})}
 
     def load_csv_chunked(
         self, filepath: Union[str, Path], chunk_size: Optional[int] = None
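
The dtype map above is where most of the memory savings come from: nullable fixed-width integers and categoricals are far smaller than the default int64/object columns. A minimal, self-contained sketch of the effect (column names and values are illustrative, not the project's data):

import pandas as pd

# Compare default dtypes against the optimized ones on a small frame.
df = pd.DataFrame({"year": [1890, 1950, 2020] * 1000, "sex": ["m", "f", "f"] * 1000})
default_bytes = df.memory_usage(deep=True).sum()
optimized = df.astype({"year": "Int16", "sex": "category"})
optimized_bytes = optimized.memory_usage(deep=True).sum()
print(default_bytes, optimized_bytes)  # the Int16/category version is several times smaller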
@@ -19,19 +43,23 @@ class DataLoader:
"""Load CSV file in chunks for memory efficiency"""
chunk_size = chunk_size or self.config.processing.chunk_size
encodings = self.config.processing.encoding_options
filepath = Path(filepath)
for encoding in encodings:
try:
logging.info(f"Attempting to read {filepath} with encoding: {encoding}")
logging.info(f"Reading {filepath} with encoding: {encoding}")
# Read with optimal dtypes
chunk_iter = pd.read_csv(
filepath, encoding=encoding, chunksize=chunk_size, on_bad_lines="skip"
filepath,
encoding=encoding,
chunksize=chunk_size,
on_bad_lines="skip",
dtype=self.dtypes,
)
for i, chunk in enumerate(chunk_iter):
logging.debug(f"Processing chunk {i+1}")
logging.debug(f"Processing optimized chunk {i + 1}")
yield chunk
logging.info(f"Successfully read {filepath} with encoding: {encoding}")
@@ -44,12 +72,20 @@ class DataLoader:
         raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")
 
     def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
-        """Load complete CSV file into memory with size limiting and balancing"""
-        chunks = list(self.load_csv_chunked(filepath))
+        """Load complete CSV with memory optimization"""
+        chunks = []
+        for chunk in self.load_csv_chunked(filepath):
+            chunks.append(chunk)
 
         if not chunks:
             return pd.DataFrame()
 
-        df = pd.concat(chunks, ignore_index=True)
+        logging.info(f"Concatenating {len(chunks)} optimized chunks")
+        df = pd.concat(chunks, ignore_index=True, copy=False)
+
+        # Clean up chunks from memory
+        del chunks
+        gc.collect()
 
         # Apply dataset size limiting if configured
         if self.config.data.max_dataset_size is not None:
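
The concat-then-release pattern keeps the chunk list and the concatenated frame alive together only briefly: deleting the list drops the last references to the chunk frames, and the explicit gc.collect() prompts collection right away. A minimal sketch of the same pattern:

import gc

import pandas as pd

# Build throwaway chunks, concatenate once, then release the list.
chunks = [pd.DataFrame({"x": range(1_000)}) for _ in range(5)]
df = pd.concat(chunks, ignore_index=True, copy=False)
del chunks  # chunk frames become unreachable here
gc.collect()  # reclaim promptly instead of waiting for the next collection
print(len(df))  # 5000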
@@ -87,27 +123,35 @@ class DataLoader:
         balanced_samples = []
         for i, sex in enumerate(sex_values):
-            sex_df = df[df["sex"] == sex]
+            # Use boolean indexing instead of creating temporary DataFrames
+            sex_mask = df["sex"] == sex
+            sex_indices = df[sex_mask].index
 
             # Distribute remaining samples to first categories
             current_samples = samples_per_sex + (1 if i < remaining_samples else 0)
-            current_samples = min(current_samples, len(sex_df))
+            current_samples = min(current_samples, len(sex_indices))
 
             if current_samples > 0:
-                sample = sex_df.sample(n=current_samples, random_state=self.config.data.random_seed + i)
-                balanced_samples.append(sample)
+                # Sample indices instead of DataFrames
+                sampled_indices = pd.Series(sex_indices).sample(
+                    n=current_samples, random_state=self.config.data.random_seed + i
+                )
+                balanced_samples.extend(sampled_indices.tolist())
                 logging.info(f"Sampled {current_samples} records for sex '{sex}'")
 
         if not balanced_samples:
             logging.warning("No balanced samples could be created, using random sampling")
             return df.sample(n=max_size, random_state=self.config.data.random_seed)
 
-        result = pd.concat(balanced_samples, ignore_index=True)
+        # Create result using iloc with indices (no copying until final step)
+        result = df.iloc[balanced_samples].copy()
 
         # Shuffle the final result
-        result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(drop=True)
+        result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
+            drop=True
+        )
 
-        logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total records")
+        logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total")
         return result
 
     @classmethod
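
One caveat on the new sampling path: sex_indices holds index labels while iloc is positional, so the two agree only because load_csv_complete concatenates with ignore_index=True and therefore yields a default RangeIndex. A self-contained sketch of the index-based balancing under that assumption (toy data, hypothetical seed):

import pandas as pd

# Labels equal positions on a RangeIndex, so iloc works on the sampled labels.
df = pd.DataFrame({"sex": ["m", "m", "f", "f", "f"], "name": list("abcde")})
picked = []
for i, sex in enumerate(df["sex"].unique()):
    indices = df.index[df["sex"] == sex]
    sampled = pd.Series(indices).sample(n=2, random_state=42 + i)
    picked.extend(sampled.tolist())
result = df.iloc[picked].copy()  # a single copy at the very end, as in the diff
print(result)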
-2
@@ -1,3 +1 @@
+3 -7
@@ -9,14 +9,9 @@ class RegionMapper:
     def __init__(self, mapping: Optional[Dict] = None):
         self.mapping = mapping or REGION_MAPPING
 
-    def map_region_to_province(self, region: str) -> str:
-        """Map a region to its province"""
-        region_lower = str(region).lower().strip()
-        return self.mapping.get(region_lower, ("AUTRES", "AUTRES"))[1].lower()
-
-    def map_regions_vectorized(self, regions: pd.Series) -> pd.Series:
+    def map(self, series: pd.Series) -> pd.Series:
         """Vectorized region to province mapping"""
-        return regions.str.lower().map(
+        return series.str.lower().map(
             lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
         )
@@ -34,6 +29,7 @@ class RegionMapper:
"sud-kivu",
"kasai-occidental",
"kasai-oriental",
"autres",
]
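
A hypothetical usage sketch for the renamed mapper (the mapping entries are illustrative stand-ins for the project's REGION_MAPPING tuples, and RegionMapper is assumed to be in scope):

import pandas as pd

# Two illustrative entries; the real table lives in REGION_MAPPING.
mapping = {"sud-kivu": ("RDC", "SUD-KIVU"), "nord-kivu": ("RDC", "NORD-KIVU")}
mapper = RegionMapper(mapping=mapping)
provinces = mapper.map(pd.Series(["Sud-Kivu", "inconnu"]))
print(provinces.tolist())  # ['sud-kivu', 'autres']; unknown regions fall back to AUTRES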