Files
drc-ners-nlp/research/experiment/feature_extractor.py
T
2025-09-21 16:23:44 +02:00

91 lines
3.1 KiB
Python

from enum import Enum
from typing import List, Dict, Any, Union
import pandas as pd
class FeatureType(Enum):
    """Enumeration of the kinds of features that can be derived from names.

    Each member's value is the string identifier of that feature.
    """

    # Whole names and name parts
    FULL_NAME = "full_name"
    NATIVE_NAME = "native_name"
    SURNAME = "surname"
    FIRST_WORD = "first_word"
    LAST_WORD = "last_word"

    # Size measures
    NAME_LENGTH = "name_length"
    WORD_COUNT = "word_count"

    # Location
    PROVINCE = "province"

    # N-gram inputs
    CHAR_NGRAMS = "char_ngrams"
    WORD_NGRAMS = "word_ngrams"

    # Name affixes
    NAME_ENDINGS = "name_endings"
    NAME_BEGINNINGS = "name_beginnings"
class FeatureExtractor:
    """Build a feature table from a DataFrame of name records.

    Every configured ``FeatureType`` is turned into one column of the output,
    named after the enum member's ``value``, by reading one or more columns of
    the input frame and replacing missing values with a neutral default.

    Input columns read, depending on the configured feature types: ``name``,
    ``identified_name``, ``probable_native``, ``identified_surname``,
    ``probable_surname``, ``words`` and ``province``. A column that is absent
    from the input surfaces as the ``KeyError`` raised by pandas.

    Recognised ``feature_params`` keys:
        ending_length: number of trailing characters kept for
            ``NAME_ENDINGS`` (default 3).
        beginning_length: number of leading characters kept for
            ``NAME_BEGINNINGS`` (default 3).
    """

    def __init__(
        self,
        feature_types: List[FeatureType],
        feature_params: Union[Dict[str, Any], None] = None,
    ):
        """Store the feature configuration.

        Args:
            feature_types: Features to extract, in output-column order.
            feature_params: Optional tuning parameters (see class docstring).
                ``None`` or an empty dict means "use the defaults".
        """
        self.feature_types = feature_types
        self.feature_params = feature_params or {}

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract all configured features.

        Args:
            df: Input records; must contain the columns required by the
                configured feature types.

        Returns:
            A new DataFrame sharing ``df``'s index with one column per
            configured feature type. Listing the same feature type twice
            simply overwrites the same column.
        """
        features_df = pd.DataFrame(index=df.index)

        for feature_type in self.feature_types:
            feature_data = self._extract_single_feature(df, feature_type)

            if isinstance(feature_data, pd.DataFrame):
                # Multi-column features keep their own column names.
                features_df = pd.concat([features_df, feature_data], axis=1)
            else:
                features_df[feature_type.value] = feature_data

        return features_df

    def _extract_single_feature(
        self, df: pd.DataFrame, feature_type: FeatureType
    ) -> Union[pd.Series, pd.DataFrame]:
        """Extract a single type of feature.

        Args:
            df: Input records.
            feature_type: The feature to compute.

        Returns:
            The feature values aligned with ``df``'s index.

        Raises:
            ValueError: If ``feature_type`` is not a known ``FeatureType``.
            KeyError: If a column required by the feature is missing.
        """
        # The n-gram features are produced later by the model's vectorizer,
        # so here they only need the raw name text, exactly like FULL_NAME.
        if feature_type in (
            FeatureType.FULL_NAME,
            FeatureType.CHAR_NGRAMS,
            FeatureType.WORD_NGRAMS,
        ):
            return df["name"].fillna("")

        if feature_type == FeatureType.NATIVE_NAME:
            # Prefer the identified value, fall back to the probable one.
            return df["identified_name"].fillna(df["probable_native"]).fillna("")

        if feature_type == FeatureType.SURNAME:
            # Prefer the identified value, fall back to the probable one.
            return df["identified_surname"].fillna(df["probable_surname"]).fillna("")

        if feature_type == FeatureType.FIRST_WORD:
            # Names with no words yield NaN from the list lookup, hence fillna.
            return df["name"].str.split().str[0].fillna("")

        if feature_type == FeatureType.LAST_WORD:
            return df["name"].str.split().str[-1].fillna("")

        if feature_type == FeatureType.NAME_LENGTH:
            return df["name"].str.len().fillna(0)

        if feature_type == FeatureType.WORD_COUNT:
            # Reads the existing "words" column (presumably a precomputed word
            # count -- confirm against the data loader); missing entries
            # default to 1.
            return df["words"].fillna(1)

        if feature_type == FeatureType.PROVINCE:
            return df["province"].fillna("unknown")

        if feature_type == FeatureType.NAME_ENDINGS:
            n = self.feature_params.get("ending_length", 3)
            if n == 0:
                # ``s[-0:]`` is ``s[0:]`` (the whole string), so a zero-length
                # suffix needs an explicit empty slice to mirror the
                # zero-length prefix behaviour of NAME_BEGINNINGS.
                return df["name"].str[:0].fillna("")
            return df["name"].str[-n:].fillna("")

        if feature_type == FeatureType.NAME_BEGINNINGS:
            n = self.feature_params.get("beginning_length", 3)
            return df["name"].str[:n].fillna("")

        raise ValueError(f"Unknown feature type: {feature_type}")