import logging
from enum import Enum

import pandas as pd

from core.config.pipeline_config import PipelineConfig
from core.utils.region_mapper import RegionMapper
from processing.ner.ner_name_tagger import NERNameTagger
from processing.steps import PipelineStep


class Gender(Enum):
    """Canonical single-letter gender codes produced by gender validation."""

    MALE = "m"
    FEMALE = "f"


class NameCategory(Enum):
    """Name categories assigned from the number of words in a name."""

    SIMPLE = "simple"
    COMPOSE = "compose"


class FeatureExtractionStep(PipelineStep):
    """Pipeline step that derives name features for each batch of rows."""

    def __init__(self, pipeline_config: PipelineConfig):
        """Register the step as "feature_extraction" and create its helpers.

        Args:
            pipeline_config: Configuration forwarded to the base step.
        """
        super().__init__("feature_extraction", pipeline_config)
        self.region_mapper = RegionMapper()
        self.name_tagger = NERNameTagger()

    @classmethod
    def validate_gender(cls, gender: str) -> Gender:
        """Map a free-form gender label onto a ``Gender`` member.

        The label is lower-cased and stripped before being compared against
        the accepted English and French spellings.

        Args:
            gender: Raw gender label.

        Returns:
            ``Gender.MALE`` or ``Gender.FEMALE``.

        Raises:
            ValueError: If the label matches none of the accepted spellings.
        """
        label = gender.lower().strip()
        if label in ("m", "male", "homme", "masculin"):
            return Gender.MALE
        if label in ("f", "female", "femme", "féminin"):
            return Gender.FEMALE
        raise ValueError(f"Unknown gender: {gender}")

    @classmethod
    def get_name_category(cls, word_count: int) -> NameCategory:
        """Return the category for a name with ``word_count`` words.

        Exactly three words yields ``NameCategory.SIMPLE``; every other
        count yields ``NameCategory.COMPOSE``.
        """
        return NameCategory.SIMPLE if word_count == 3 else NameCategory.COMPOSE

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Return a copy of ``batch`` enriched with name-derived features.

        Reads the ``name`` and ``region`` columns (both required) and, when
        present, ``year`` and ``sex``. Adds or overwrites: ``words``,
        ``length``, ``probable_native``, ``probable_surname``,
        ``identified_name``, ``identified_surname``, ``ner_entities``,
        ``ner_tagged``, ``annotated``, ``identified_category`` and
        ``province``. The input frame itself is not modified.

        Args:
            batch: Rows to process.
            batch_id: Batch identifier, used only in the log message.

        Returns:
            The enriched copy of ``batch``.

        Raises:
            ValueError: If a ``sex`` value is not a recognised gender label.
        """
        logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")

        def category_value(word_count):
            # Stored as the plain string value of the enum member.
            return self.get_name_category(word_count).value

        def leading_words(tokens):
            # Everything but the last token; needs at least two tokens.
            if isinstance(tokens, list) and len(tokens) >= 2:
                return " ".join(tokens[:-1])
            return None

        def last_word(tokens):
            # Final token; needs at least two tokens.
            if isinstance(tokens, list) and len(tokens) >= 2:
                return tokens[-1]
            return None

        def normalized_sex(raw):
            # NOTE(review): missing values are stringified first (e.g. "nan"),
            # so they raise ValueError here -- confirm this is intended.
            return self.validate_gender(str(raw)).value

        frame = batch.copy()

        # Word count (single spaces + 1) and character length of each name.
        # NOTE(review): this counts literal spaces while the split further
        # down uses any whitespace; the two can disagree on names with
        # repeated or leading/trailing spaces -- verify upstream cleaning.
        frame["words"] = frame["name"].str.count(" ") + 1
        frame["length"] = frame["name"].str.len()

        # Coerce year to a nullable integer; unparseable values become <NA>.
        if "year" in frame.columns:
            numeric_year = pd.to_numeric(frame["year"], errors="coerce")
            frame["year"] = numeric_year.astype("Int64")

        # Create the output columns up front with their default values.
        for empty_column in (
            "probable_native",
            "probable_surname",
            "identified_name",
            "identified_surname",
            "ner_entities",
        ):
            frame[empty_column] = None
        for flag_column in ("ner_tagged", "annotated"):
            frame[flag_column] = 0

        # Category derived from the word count of each row.
        frame["identified_category"] = frame["words"].apply(category_value)

        # Split every name on whitespace: last token -> probable surname,
        # remaining tokens -> probable native name.
        tokens_per_name = frame["name"].str.split()
        frame["probable_native"] = tokens_per_name.apply(leading_words)
        frame["probable_surname"] = tokens_per_name.apply(last_word)

        # Names counted as three words are accepted as-is and marked annotated.
        is_three_words = frame["words"] == 3
        for target, source in (
            ("identified_name", "probable_native"),
            ("identified_surname", "probable_surname"),
        ):
            frame.loc[is_three_words, target] = frame.loc[is_three_words, source]
        frame.loc[is_three_words, "annotated"] = 1

        # Ask the NER tagger for entities on the auto-annotated rows only.
        auto_annotated = frame[is_three_words]
        for row_label, record in auto_annotated.iterrows():
            tagged = self.name_tagger.tag_name(
                record["name"],
                record["identified_name"],
                record["identified_surname"],
            )
            if not tagged:
                continue
            frame.at[row_label, "ner_entities"] = tagged["entities"]
            frame.at[row_label, "ner_tagged"] = 1

        # Province comes from the region mapper.
        # NOTE(review): "region" is read unconditionally, unlike "year" and
        # "sex" -- a missing column raises KeyError.
        frame["province"] = self.region_mapper.map_regions_vectorized(frame["region"])

        # Replace raw sex labels with the canonical Gender values.
        if "sex" in frame.columns:
            frame["sex"] = frame["sex"].apply(normalized_sex)

        return frame