# File listing metadata: 192 lines, 6.7 KiB, Python.
import gc
import logging
from enum import Enum
from typing import Any, ClassVar, Dict, Optional

import pandas as pd

from core.config.pipeline_config import PipelineConfig
from core.utils.region_mapper import RegionMapper
from processing.ner.ner_name_tagger import NERNameTagger
from processing.steps import PipelineStep


class Gender(Enum):
    """Canonical gender codes used in the normalised ``sex`` column."""

    # Single-letter codes; these are the values stored in the output data.
    MALE = "m"
    FEMALE = "f"


class NameCategory(Enum):
    """Category label stored in the ``identified_category`` column."""

    # Assigned by FeatureExtractionStep.get_name_category to three-word names.
    SIMPLE = "simple"
    # Assigned to names with any other word count.
    COMPOSE = "compose"


class FeatureExtractionStep(PipelineStep):
    """Configuration-driven feature extraction step.

    For each batch the step:

    * derives word and character counts from the ``name`` column,
    * splits every multi-word name into a probable native part (all words but
      the last) and a probable surname (the last word),
    * marks three-word names as annotated and runs the NER tagger on them,
    * labels every row with a :class:`NameCategory`,
    * normalises the optional ``year``, ``region`` and ``sex`` columns.
    """

    # Accepted spellings (lower-cased, stripped) for each gender. Shared by
    # validate_gender and _normalize_gender so the two cannot drift apart.
    _GENDER_ALIASES: ClassVar[Dict[str, Gender]] = {
        "m": Gender.MALE,
        "male": Gender.MALE,
        "homme": Gender.MALE,
        "masculin": Gender.MALE,
        "f": Gender.FEMALE,
        "female": Gender.FEMALE,
        "femme": Gender.FEMALE,
        "féminin": Gender.FEMALE,
    }

    # Word count that makes a name "simple" (see get_name_category).
    _SIMPLE_NAME_WORDS: ClassVar[int] = 3

    def __init__(self, pipeline_config: PipelineConfig):
        """Register the step as ``"feature_extraction"`` and build its helpers."""
        super().__init__("feature_extraction", pipeline_config)
        self.region_mapper = RegionMapper()
        self.name_tagger = NERNameTagger()

    @classmethod
    def requires_batch_mutation(cls) -> bool:
        """This step creates new columns, so mutation is required."""
        return True

    @classmethod
    def validate_gender(cls, gender: str) -> Gender:
        """Validate and normalize a single gender value.

        Args:
            gender: Raw value; it is stringified, lower-cased and stripped
                before being looked up.

        Returns:
            The matching :class:`Gender` member.

        Raises:
            ValueError: If the value is not one of the accepted spellings.
        """
        key = str(gender).lower().strip()
        match = cls._GENDER_ALIASES.get(key)
        if match is None:
            raise ValueError(f"Unknown gender: {gender}")
        return match

    @classmethod
    def get_name_category(cls, word_count: int) -> NameCategory:
        """Determine name category based on word count.

        Exactly three words is ``SIMPLE``; every other count is ``COMPOSE``.
        """
        if word_count == cls._SIMPLE_NAME_WORDS:
            return NameCategory.SIMPLE
        return NameCategory.COMPOSE

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Extract features from names in batch.

        Args:
            batch: Input rows. Must contain a ``name`` column; ``year``,
                ``region`` and ``sex`` are processed when present.
            batch_id: Batch sequence number, used for logging and to decide
                when to trigger a garbage collection.

        Returns:
            A copy of ``batch`` with the feature columns added; the input
            frame itself is not modified.
        """
        logging.info(
            "Extracting features for batch %s with %s rows", batch_id, len(batch)
        )

        result = batch.copy()
        numeric_features = self._compute_numeric_features(result["name"])
        result = result.assign(**numeric_features)

        # Initialize features columns with optimal dtypes. The batch index is
        # passed explicitly: DataFrame.assign aligns Series on index, so
        # placeholders built on a default RangeIndex would turn the 0 defaults
        # into <NA> for any batch whose index is not 0..n-1.
        features_columns = self._initialize_features_columns(
            len(result), index=result.index
        )
        result = result.assign(**features_columns)

        self._assign_probable_names(result)
        self._process_simple_names(result)
        result["identified_category"] = self._assign_identified_category(
            result["words"]
        )

        if "year" in result.columns:
            # Unparseable years become <NA> instead of raising.
            result["year"] = pd.to_numeric(result["year"], errors="coerce").astype(
                "Int16"
            )

        if "region" in result.columns:
            # NOTE(review): RegionMapper.map is defined elsewhere; presumably it
            # returns one province per input row - confirm.
            result["province"] = self.region_mapper.map(result["region"])
            result["province"] = result["province"].astype("category")

        if "sex" in result.columns:
            result["sex"] = self._normalize_gender(result["sex"])

        # Apply final dtype optimizations
        result = self._optimize_dtypes(result)

        # Cleanup
        del numeric_features, features_columns
        if batch_id % 10 == 0:  # Periodic cleanup
            gc.collect()

        return result

    @classmethod
    def _compute_numeric_features(cls, series: pd.Series) -> Dict[str, pd.Series]:
        """Calculate basic features in vectorized manner.

        Returns a dict with:

        * ``words``: number of whitespace-separated tokens (``Int8``),
        * ``length``: number of characters (``Int16``).

        Words are counted with ``str.split()``, the same tokenisation used by
        ``_assign_probable_names``, so repeated, leading or trailing spaces do
        not inflate the count. Missing names yield ``<NA>`` in both columns.
        """
        return {
            "words": series.str.split().str.len().astype("Int8"),
            "length": series.str.len().astype("Int16"),
        }

    @classmethod
    def _initialize_features_columns(
        cls, size: int, index: Optional[pd.Index] = None
    ) -> Dict[str, Any]:
        """Initialize new columns with optimal dtypes.

        Args:
            size: Number of rows in each placeholder column.
            index: Index to give the placeholder Series. Pass the target
                frame's index when the result is used with
                ``DataFrame.assign``. Defaults to a ``RangeIndex``.

        Returns:
            Mapping of column name to placeholder Series: text columns are
            all-missing ``string`` Series, flag columns are all-zero ``Int8``.
        """

        def blank(fill: Any, dtype: str) -> pd.Series:
            return pd.Series([fill] * size, index=index, dtype=dtype)

        return {
            "probable_native": blank(None, "string"),
            "probable_surname": blank(None, "string"),
            "identified_name": blank(None, "string"),
            "identified_surname": blank(None, "string"),
            "ner_entities": blank(None, "string"),
            "ner_tagged": blank(0, "Int8"),
            "annotated": blank(0, "Int8"),
        }

    @classmethod
    def _assign_probable_names(cls, df: pd.DataFrame) -> None:
        """Assign probable native and surname names efficiently.

        For every name with at least two words, writes all words but the last
        (space-joined) to ``probable_native`` and the last word to
        ``probable_surname``. Other rows are left untouched. Modifies ``df``
        in place.
        """
        tokens = df["name"].str.split()
        # Missing names give NaN lengths, which compare False here.
        has_surname = tokens.str.len() >= 2
        if not has_surname.any():
            return

        multi_word = tokens[has_surname]
        df.loc[has_surname, "probable_native"] = multi_word.str[:-1].str.join(" ")
        df.loc[has_surname, "probable_surname"] = multi_word.str[-1]

    def _assign_identified_category(self, series: pd.Series) -> pd.Series:
        """Assign identified category based on word count.

        Args:
            series: Word counts, possibly containing missing values.

        Returns:
            Categorical Series of :class:`NameCategory` values. Missing counts
            stay missing rather than being passed to ``get_name_category``,
            where comparing ``pd.NA`` in a boolean context would raise.
        """
        labels = [
            None if pd.isna(count) else self.get_name_category(count).value
            for count in series
        ]
        return pd.Series(
            labels, index=series.index, name=series.name, dtype="object"
        ).astype("category")

    def _process_simple_names(self, df: pd.DataFrame) -> None:
        """Process 3-word names: copy the probable split and run NER tagging.

        For rows whose ``words`` equals three, copies ``probable_native`` and
        ``probable_surname`` into ``identified_name`` / ``identified_surname``,
        sets ``annotated`` to 1, and calls the NER tagger. A truthy tagger
        result is stored (stringified ``entities``) and ``ner_tagged`` is set
        to 1. Tagger failures are logged and skipped so one bad row does not
        abort the batch. Modifies ``df`` in place.
        """
        # A nullable comparison can contain <NA>; treat unknown counts as
        # "not simple" so the mask is a plain boolean array.
        is_simple = (
            (df["words"] == self._SIMPLE_NAME_WORDS).fillna(False).astype(bool)
        )
        if not is_simple.any():
            return

        df.loc[is_simple, "identified_name"] = df.loc[is_simple, "probable_native"]
        df.loc[is_simple, "identified_surname"] = df.loc[
            is_simple, "probable_surname"
        ]
        df.loc[is_simple, "annotated"] = 1

        # NER tagging for 3-word names
        simple_rows = df.loc[
            is_simple, ["name", "identified_name", "identified_surname"]
        ]
        for idx, name, native, surname in simple_rows.itertuples(
            index=True, name=None
        ):
            try:
                # NOTE(review): tag_name is defined elsewhere; the code below
                # assumes a truthy result is a mapping with an "entities" key.
                entity = self.name_tagger.tag_name(name, native, surname)
                if entity:
                    df.at[idx, "ner_entities"] = str(entity["entities"])
                    df.at[idx, "ner_tagged"] = 1
            except Exception as e:
                # Deliberate best-effort: log and continue with the next row.
                logging.warning("NER tagging failed for row %s: %s", idx, e)

    def _normalize_gender(self, series: pd.Series) -> pd.Series:
        """Map raw gender spellings to ``"m"`` / ``"f"`` as a categorical.

        Values are stringified, lower-cased and stripped before lookup;
        anything not in the alias table (including missing values) becomes
        missing in the result.
        """
        gender_mapping = {
            alias: gender.value for alias, gender in self._GENDER_ALIASES.items()
        }
        normalized = series.astype(str).str.lower().str.strip().map(gender_mapping)
        return normalized.astype("category")

    @classmethod
    def _optimize_dtypes(cls, df: pd.DataFrame) -> pd.DataFrame:
        """Cast known columns to compact dtypes.

        ``province``, ``identified_category`` and ``sex`` become ``category``;
        the listed text columns become ``string`` when still ``object``.
        Columns that are absent are skipped. ``df`` is modified in place and
        also returned.
        """
        categories = ["province", "identified_category", "sex"]
        for col in categories:
            if col in df.columns and df[col].dtype != "category":
                df[col] = df[col].astype("category")

        # Ensure string columns are proper string dtype
        string_cols = [
            "name",
            "probable_native",
            "probable_surname",
            "identified_name",
            "identified_surname",
            "ner_entities",
        ]
        for col in string_cols:
            if col in df.columns and df[col].dtype == "object":
                df[col] = df[col].astype("string")

        return df