91 lines
3.1 KiB
Python
91 lines
3.1 KiB
Python
from enum import Enum
|
|
from typing import List, Dict, Any, Union
|
|
|
|
import pandas as pd
|
|
|
|
|
|
class FeatureType(Enum):
    """Enumeration of the name-derived features an extractor can produce.

    Each member's value is the string used as the output column name for
    that feature.
    """

    # Name strings
    FULL_NAME = "full_name"
    NATIVE_NAME = "native_name"
    SURNAME = "surname"

    # Individual words of the name
    FIRST_WORD = "first_word"
    LAST_WORD = "last_word"

    # Numeric descriptors
    NAME_LENGTH = "name_length"
    WORD_COUNT = "word_count"

    # Categorical context
    PROVINCE = "province"

    # N-gram features
    CHAR_NGRAMS = "char_ngrams"
    WORD_NGRAMS = "word_ngrams"

    # Fixed-length affixes of the name
    NAME_ENDINGS = "name_endings"
    NAME_BEGINNINGS = "name_beginnings"
|
|
|
|
|
|
class FeatureExtractor:
    """Extract different types of features from name data.

    The extractor is configured with a list of ``FeatureType`` members and
    an optional parameter dictionary. Parameters read by this class:

    * ``"ending_length"`` (default 3): number of trailing characters used
      for ``FeatureType.NAME_ENDINGS``.
    * ``"beginning_length"`` (default 3): number of leading characters used
      for ``FeatureType.NAME_BEGINNINGS``.
    """

    def __init__(
        self,
        feature_types: List[FeatureType],
        feature_params: Union[Dict[str, Any], None] = None,
    ):
        """Store the feature configuration.

        Args:
            feature_types: Features to extract, in output column order.
            feature_params: Optional tuning parameters (see class docstring).
                ``None`` or any other falsy value is replaced by an empty
                dict.
        """
        self.feature_types = feature_types
        self.feature_params = feature_params or {}

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract all configured features.

        Args:
            df: Input frame. Only the columns required by the configured
                feature types are accessed.

        Returns:
            A new DataFrame sharing ``df``'s index. Series-valued features
            are stored under the feature type's ``value``; DataFrame-valued
            features are concatenated column-wise as they are.

        Raises:
            ValueError: If a configured feature type is not recognised.
            KeyError: If a column needed by a configured feature is missing.
        """
        features_df = pd.DataFrame(index=df.index)

        for feature_type in self.feature_types:
            feature_data = self._extract_single_feature(df, feature_type)

            if isinstance(feature_data, pd.DataFrame):
                features_df = pd.concat([features_df, feature_data], axis=1)
            else:
                features_df[feature_type.value] = feature_data

        return features_df

    def _extract_single_feature(
        self, df: pd.DataFrame, feature_type: FeatureType
    ) -> Union[pd.Series, pd.DataFrame]:
        """Extract a single type of feature.

        Columns read from ``df`` depend on ``feature_type``: ``"name"`` for
        most features, ``"identified_name"`` / ``"probable_native"`` for
        NATIVE_NAME, ``"identified_surname"`` / ``"probable_surname"`` for
        SURNAME, ``"words"`` for WORD_COUNT and ``"province"`` for PROVINCE.

        Raises:
            ValueError: If ``feature_type`` is not a known ``FeatureType``.
        """
        # CHAR_NGRAMS and WORD_NGRAMS return the raw name text; the actual
        # n-gram expansion is handled by the model's vectorizer.
        if feature_type in (
            FeatureType.FULL_NAME,
            FeatureType.CHAR_NGRAMS,
            FeatureType.WORD_NGRAMS,
        ):
            return df["name"].fillna("")

        if feature_type == FeatureType.NATIVE_NAME:
            # Use the identified value, falling back to the probable one.
            return df["identified_name"].fillna(df["probable_native"]).fillna("")

        if feature_type == FeatureType.SURNAME:
            # Use the identified value, falling back to the probable one.
            return df["identified_surname"].fillna(df["probable_surname"]).fillna("")

        if feature_type == FeatureType.FIRST_WORD:
            # Names with no words yield NaN from .str[0]; map those to "".
            return df["name"].str.split().str[0].fillna("")

        if feature_type == FeatureType.LAST_WORD:
            return df["name"].str.split().str[-1].fillna("")

        if feature_type == FeatureType.NAME_LENGTH:
            return df["name"].str.len().fillna(0)

        if feature_type == FeatureType.WORD_COUNT:
            # Taken from the existing "words" column; missing counts become 1.
            return df["words"].fillna(1)

        if feature_type == FeatureType.PROVINCE:
            return df["province"].fillna("unknown")

        if feature_type == FeatureType.NAME_ENDINGS:
            n = self.feature_params.get("ending_length", 3)
            if n == 0:
                # -0 == 0, so str[-0:] would return the whole name rather
                # than an empty suffix. Slice explicitly to get "" and stay
                # consistent with NAME_BEGINNINGS, where [:0] is empty.
                return df["name"].str[:0].fillna("")
            return df["name"].str[-n:].fillna("")

        if feature_type == FeatureType.NAME_BEGINNINGS:
            n = self.feature_params.get("beginning_length", 3)
            return df["name"].str[:n].fillna("")

        raise ValueError(f"Unknown feature type: {feature_type}")
|