# drc-ners-nlp/processing/steps/feature_extraction_step.py

import gc
import logging
from enum import Enum
from typing import Dict, Any

import pandas as pd

from core.config.pipeline_config import PipelineConfig
from core.utils.region_mapper import RegionMapper
from processing.ner.ner_name_tagger import NERNameTagger
from processing.steps import PipelineStep


class Gender(Enum):
    """Normalized gender codes returned by ``FeatureExtractionStep.validate_gender``."""

    # Single-letter codes; the same letters are produced by _normalize_gender.
    MALE = "m"
    FEMALE = "f"


class NameCategory(Enum):
    """Name categories whose values fill the "identified_category" column."""

    # Chosen by get_name_category when the word count is exactly 3.
    SIMPLE = "simple"
    # Chosen by get_name_category for every other word count.
    COMPOSE = "compose"


class FeatureExtractionStep(PipelineStep):
    """Configuration-driven feature extraction step"""

    def __init__(self, pipeline_config: PipelineConfig) -> None:
        """Create the step together with the helpers it relies on.

        Args:
            pipeline_config: Pipeline configuration, forwarded unchanged to
                the ``PipelineStep`` constructor.
        """
        # "feature_extraction" is presumably the step name expected by the
        # base class -- confirm against PipelineStep.
        super().__init__("feature_extraction", pipeline_config)
        # Used by process_batch to derive "province" from "region".
        self.region_mapper = RegionMapper()
        # Used by _process_simple_names to tag three-word names.
        self.name_tagger = NERNameTagger()

    @classmethod
    def requires_batch_mutation(cls) -> bool:
        """Report whether this step needs to mutate its batches.

        Returns:
            Always ``True``: this step creates new columns, so mutation is
            required.
        """
        return True

    @classmethod
    def validate_gender(cls, gender: str) -> Gender:
        """Validate and normalize gender value"""
        gender_lower = str(gender).lower().strip()
        if gender_lower in ["m", "male", "homme", "masculin"]:
            return Gender.MALE
        elif gender_lower in ["f", "female", "femme", "féminin"]:
            return Gender.FEMALE
        else:
            raise ValueError(f"Unknown gender: {gender}")

    @classmethod
    def get_name_category(cls, word_count: int) -> NameCategory:
        """Classify a name by its number of words.

        Args:
            word_count: Number of words in the name.

        Returns:
            ``NameCategory.SIMPLE`` when ``word_count`` equals 3, otherwise
            ``NameCategory.COMPOSE``.
        """
        if word_count == 3:
            return NameCategory.SIMPLE
        return NameCategory.COMPOSE

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Extract name features for one batch and return them in a new frame.

        The batch is copied first, so the caller's frame is not modified.

        Args:
            batch: Rows to process. A "name" column is required; "year",
                "region" and "sex" are normalized only when present.
            batch_id: Batch number, used in the log message and to trigger a
                garbage collection on every tenth batch.

        Returns:
            A copy of ``batch`` extended with the word/length counts, the
            probable and identified name parts, the NER columns, the
            "identified_category" column and, when "region" exists, a
            categorical "province" column.
        """
        logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")

        result = batch.copy()
        numeric_features = self._compute_numeric_features(result["name"])
        result = result.assign(**numeric_features)

        # Initialize features columns with optimal dtypes. The placeholders
        # are built on a fresh 0..n-1 index, whereas a batch sliced out of a
        # larger frame keeps its original labels. Assigning a Series aligns on
        # the index, which would silently turn the 0 defaults of "ner_tagged"
        # and "annotated" into <NA>; passing the underlying arrays assigns the
        # values by position instead.
        placeholders = self._initialize_features_columns(len(result))
        features_columns = {
            column: values.array if isinstance(values, pd.Series) else values
            for column, values in placeholders.items()
        }
        result = result.assign(**features_columns)

        self._assign_probable_names(result)
        self._process_simple_names(result)
        result["identified_category"] = self._assign_identified_category(result["words"])

        if "year" in result.columns:
            # Unparseable years become <NA> rather than raising.
            result["year"] = pd.to_numeric(result["year"], errors="coerce").astype("Int16")

        if "region" in result.columns:
            result["province"] = self.region_mapper.map(result["region"])
            result["province"] = result["province"].astype("category")

        if "sex" in result.columns:
            result["sex"] = self._normalize_gender(result["sex"])

        # Apply final dtype optimizations
        result = self._optimize_dtypes(result)

        # Cleanup
        del numeric_features, features_columns, placeholders
        if batch_id % 10 == 0:  # Periodic cleanup
            gc.collect()

        return result

    @classmethod
    def _compute_numeric_features(cls, series: pd.Series) -> Dict[str, pd.Series]:
        """Calculate basic features in vectorized manner"""
        return {
            "words": (series.str.count(" ") + 1).astype("Int8"),
            "length": series.str.len().astype("Int16"),
        }

    @classmethod
    def _initialize_features_columns(cls, size: int) -> Dict[str, Any]:
        """Initialize new columns with optimal dtypes"""
        return {
            "probable_native": pd.Series([None] * size, dtype="string"),
            "probable_surname": pd.Series([None] * size, dtype="string"),
            "identified_name": pd.Series([None] * size, dtype="string"),
            "identified_surname": pd.Series([None] * size, dtype="string"),
            "ner_entities": pd.Series([None] * size, dtype="string"),
            "ner_tagged": pd.Series([0] * size, dtype="Int8"),
            "annotated": pd.Series([0] * size, dtype="Int8"),
        }

    @classmethod
    def _assign_probable_names(cls, df: pd.DataFrame) -> None:
        """Assign probable native and surname names efficiently"""

        name_splits = df["name"].str.split()
        mask = name_splits.str.len() >= 2

        df.loc[mask, "probable_native"] = name_splits[mask].apply(
            lambda x: " ".join(x[:-1]) if isinstance(x, list) else None
        )
        df.loc[mask, "probable_surname"] = name_splits[mask].apply(
            lambda x: x[-1] if isinstance(x, list) else None
        )

    def _assign_identified_category(self, series: pd.Series) -> pd.Series:
        """Assign identified category based on word count"""
        return series.map(lambda x: self.get_name_category(x).value).astype("category")

    def _process_simple_names(self, df: pd.DataFrame) -> None:
        """Process 3-word names efficiently with vectorized operations"""
        mask = df["words"] == 3

        if not mask.any():
            return

        df.loc[mask, "identified_name"] = df.loc[mask, "probable_native"]
        df.loc[mask, "identified_surname"] = df.loc[mask, "probable_surname"]
        df.loc[mask, "annotated"] = 1

        # NER tagging for 3-word names
        three_word_rows = df[mask]
        for idx, row in three_word_rows.iterrows():
            try:
                entity = self.name_tagger.tag_name(
                    row["name"], row["identified_name"], row["identified_surname"]
                )

                if entity:
                    df.at[idx, "ner_entities"] = str(entity["entities"])
                    df.at[idx, "ner_tagged"] = 1
            except Exception as e:
                logging.warning(f"NER tagging failed for row {idx}: {e}")

    def _normalize_gender(self, series: pd.Series) -> pd.Series:
        gender_mapping = {
            "m": "m",
            "male": "m",
            "homme": "m",
            "masculin": "m",
            "f": "f",
            "female": "f",
            "femme": "f",
            "féminin": "f",
        }

        # Apply mapping with error handling
        normalized = series.astype(str).str.lower().str.strip().map(gender_mapping)
        return normalized.astype("category")

    @classmethod
    def _optimize_dtypes(cls, df: pd.DataFrame) -> pd.DataFrame:
        categories = ["province", "identified_category", "sex"]

        for col in categories:
            if col in df.columns and df[col].dtype != "category":
                df[col] = df[col].astype("category")

        # Ensure string columns are proper string dtype
        string_cols = [
            "name",
            "probable_native",
            "probable_surname",
            "identified_name",
            "identified_surname",
            "ner_entities",
        ]

        for col in string_cols:
            if col in df.columns and df[col].dtype == "object":
                df[col] = df[col].astype("string")

        return df