feat: enhance logging and memory management across modules

This commit is contained in:
2025-08-13 23:09:05 +02:00
parent 47e52d130c
commit 9601c5e44d
48 changed files with 1004 additions and 773 deletions
+131 -48
View File
@@ -1,5 +1,7 @@
import gc
import logging
from enum import Enum
from typing import Dict, Any
import pandas as pd
@@ -27,10 +29,15 @@ class FeatureExtractionStep(PipelineStep):
self.region_mapper = RegionMapper()
self.name_tagger = NERNameTagger()
@classmethod
def requires_batch_mutation(cls) -> bool:
"""This step creates new columns, so mutation is required"""
return True
@classmethod
def validate_gender(cls, gender: str) -> Gender:
"""Validate and normalize gender value"""
gender_lower = gender.lower().strip()
gender_lower = str(gender).lower().strip()
if gender_lower in ["m", "male", "homme", "masculin"]:
return Gender.MALE
elif gender_lower in ["f", "female", "femme", "féminin"]:
@@ -41,68 +48,144 @@ class FeatureExtractionStep(PipelineStep):
@classmethod
def get_name_category(cls, word_count: int) -> NameCategory:
    """Classify a name by its word count: exactly 3 words is SIMPLE, anything else COMPOSE."""
    if word_count == 3:
        return NameCategory.SIMPLE
    return NameCategory.COMPOSE
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
    """Extract name-derived features for one batch and return the enriched frame.

    The input is copied, never mutated. Numeric features and empty feature
    columns are attached first, then the in-place enrichment passes run,
    and finally dtypes are compacted.
    """
    logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")
    frame = batch.copy()

    # Vectorized word/length counts, then the empty feature columns
    # (created with compact nullable dtypes up front).
    numeric_features = self._compute_numeric_features(frame["name"])
    frame = frame.assign(**numeric_features)
    features_columns = self._initialize_features_columns(len(frame))
    frame = frame.assign(**features_columns)

    # In-place enrichment passes.
    self._assign_probable_names(frame)
    self._process_simple_names(frame)
    frame["identified_category"] = self._assign_identified_category(frame["words"])

    # Optional columns: only normalized when present in this batch.
    if "year" in frame.columns:
        frame["year"] = pd.to_numeric(frame["year"], errors="coerce").astype("Int16")
    if "region" in frame.columns:
        frame["province"] = self.region_mapper.map(frame["region"])
        frame["province"] = frame["province"].astype("category")
    if "sex" in frame.columns:
        frame["sex"] = self._normalize_gender(frame["sex"])

    frame = self._optimize_dtypes(frame)

    # Drop intermediate references; sweep the GC every 10th batch to keep
    # long pipeline runs from accumulating garbage.
    del numeric_features, features_columns
    if batch_id % 10 == 0:
        gc.collect()
    return frame
@classmethod
def _compute_numeric_features(cls, series: pd.Series) -> Dict[str, pd.Series]:
"""Calculate basic features in vectorized manner"""
return {
"words": (series.str.count(" ") + 1).astype("Int8"),
"length": series.str.len().astype("Int16"),
}
@classmethod
def _initialize_features_columns(cls, size: int) -> Dict[str, Any]:
"""Initialize new columns with optimal dtypes"""
return {
"probable_native": pd.Series([None] * size, dtype="string"),
"probable_surname": pd.Series([None] * size, dtype="string"),
"identified_name": pd.Series([None] * size, dtype="string"),
"identified_surname": pd.Series([None] * size, dtype="string"),
"ner_entities": pd.Series([None] * size, dtype="string"),
"ner_tagged": pd.Series([0] * size, dtype="Int8"),
"annotated": pd.Series([0] * size, dtype="Int8"),
}
@classmethod
def _assign_probable_names(cls, df: pd.DataFrame) -> None:
"""Assign probable native and surname names efficiently"""
name_splits = df["name"].str.split()
mask = name_splits.str.len() >= 2
df.loc[mask, "probable_native"] = name_splits[mask].apply(
lambda x: " ".join(x[:-1]) if isinstance(x, list) else None
)
df.loc[mask, "probable_surname"] = name_splits[mask].apply(
lambda x: x[-1] if isinstance(x, list) else None
)
# Assign probable_native and probable_surname for all names
name_splits = batch["name"].str.split()
batch["probable_native"] = name_splits.apply(
lambda x: " ".join(x[:-1]) if isinstance(x, list) and len(x) >= 2 else None
)
batch["probable_surname"] = name_splits.apply(
lambda x: x[-1] if isinstance(x, list) and len(x) >= 2 else None
)
def _assign_identified_category(self, series: pd.Series) -> pd.Series:
    """Translate per-row word counts into category labels (categorical dtype)."""
    labels = series.map(lambda count: self.get_name_category(count).value)
    return labels.astype("category")
def _process_simple_names(self, df: pd.DataFrame) -> None:
"""Process 3-word names efficiently with vectorized operations"""
mask = df["words"] == 3
# Tag names with NER entities
three_word_rows = batch[three_word_mask]
if not mask.any():
return
df.loc[mask, "identified_name"] = df.loc[mask, "probable_native"]
df.loc[mask, "identified_surname"] = df.loc[mask, "probable_surname"]
df.loc[mask, "annotated"] = 1
# NER tagging for 3-word names
three_word_rows = df[mask]
for idx, row in three_word_rows.iterrows():
entity = self.name_tagger.tag_name(row['name'], row['identified_name'], row['identified_surname'])
try:
entity = self.name_tagger.tag_name(
row["name"], row["identified_name"], row["identified_surname"]
)
if entity:
batch.at[idx, "ner_entities"] = entity["entities"]
batch.at[idx, "ner_tagged"] = 1
if entity:
df.at[idx, "ner_entities"] = str(entity["entities"])
df.at[idx, "ner_tagged"] = 1
except Exception as e:
logging.warning(f"NER tagging failed for row {idx}: {e}")
# Map regions to provinces
batch["province"] = self.region_mapper.map_regions_vectorized(batch["region"])
def _normalize_gender(self, series: pd.Series) -> pd.Series:
gender_mapping = {
"m": "m",
"male": "m",
"homme": "m",
"masculin": "m",
"f": "f",
"female": "f",
"femme": "f",
"féminin": "f",
}
# Normalize gender
if "sex" in batch.columns:
batch["sex"] = batch["sex"].apply(lambda x: self.validate_gender(str(x)).value)
# Apply mapping with error handling
normalized = series.astype(str).str.lower().str.strip().map(gender_mapping)
return normalized.astype("category")
return batch
@classmethod
def _optimize_dtypes(cls, df: pd.DataFrame) -> pd.DataFrame:
categories = ["province", "identified_category", "sex"]
for col in categories:
if col in df.columns and df[col].dtype != "category":
df[col] = df[col].astype("category")
# Ensure string columns are proper string dtype
string_cols = [
"name",
"probable_native",
"probable_surname",
"identified_name",
"identified_surname",
"ner_entities",
]
for col in string_cols:
if col in df.columns and df[col].dtype == "object":
df[col] = df[col].astype("string")
return df