refactoring: add initial pipeline configuration and model classes
This commit is contained in:
@@ -0,0 +1,90 @@
|
||||
from enum import Enum
|
||||
from typing import List, Dict, Any, Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class FeatureType(Enum):
    """Kinds of features that can be derived from a name record.

    Each member's value is the string used as the output column name when the
    feature is extracted. Declaration order is preserved by ``Enum`` iteration.
    """

    # Whole-name and name-part features.
    FULL_NAME = "full_name"
    NATIVE_NAME = "native_name"
    SURNAME = "surname"
    FIRST_WORD = "first_word"
    LAST_WORD = "last_word"

    # Numeric summaries of the name.
    NAME_LENGTH = "name_length"
    WORD_COUNT = "word_count"

    # Categorical context.
    PROVINCE = "province"

    # Text features intended for n-gram vectorization.
    CHAR_NGRAMS = "char_ngrams"
    WORD_NGRAMS = "word_ngrams"

    # Fixed-length suffix / prefix of the name.
    NAME_ENDINGS = "name_endings"
    NAME_BEGINNINGS = "name_beginnings"
|
||||
|
||||
|
||||
class FeatureExtractor:
    """Extract different types of features from name data.

    Every configured ``FeatureType`` yields one column in the frame returned
    by ``extract_features``; the column is named after the enum member's value.

    Args:
        feature_types: Feature types to extract, in output-column order.
        feature_params: Optional tuning values. The keys read by this class
            are ``"ending_length"`` and ``"beginning_length"`` (each defaults
            to 3 when absent). ``None`` or an empty dict means "all defaults".
    """

    def __init__(
        self,
        feature_types: List[FeatureType],
        feature_params: Union[Dict[str, Any], None] = None,
    ):
        self.feature_types = feature_types
        self.feature_params = feature_params or {}

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract all configured features.

        Args:
            df: Input records. Which columns must be present depends on the
                configured feature types (see ``_extract_single_feature``).

        Returns:
            A new DataFrame sharing ``df``'s index, with one column per
            configured feature type (or several, if an extractor returns a
            DataFrame, in which case its columns are appended as-is).
        """
        features_df = pd.DataFrame(index=df.index)

        for feature_type in self.feature_types:
            feature_data = self._extract_single_feature(df, feature_type)

            if isinstance(feature_data, pd.DataFrame):
                # Multi-column result: append its columns side by side.
                features_df = pd.concat([features_df, feature_data], axis=1)
            else:
                features_df[feature_type.value] = feature_data

        return features_df

    def _extract_single_feature(
        self, df: pd.DataFrame, feature_type: FeatureType
    ) -> Union[pd.Series, pd.DataFrame]:
        """Extract a single type of feature.

        Columns read from ``df`` per feature type:
            * FULL_NAME, CHAR_NGRAMS, WORD_NGRAMS, FIRST_WORD, LAST_WORD,
              NAME_LENGTH, NAME_ENDINGS, NAME_BEGINNINGS: ``"name"``
            * NATIVE_NAME: ``"identified_name"``, ``"probable_native"``
            * SURNAME: ``"identified_surname"``, ``"probable_surname"``
            * WORD_COUNT: ``"words"``
            * PROVINCE: ``"province"``

        Missing values are replaced with ``""`` for text features, ``0`` for
        NAME_LENGTH, ``1`` for WORD_COUNT and ``"unknown"`` for PROVINCE.

        Raises:
            ValueError: If ``feature_type`` is not a known ``FeatureType``.
            KeyError: Raised by pandas when a required column is absent.
        """
        # CHAR_NGRAMS / WORD_NGRAMS only pass the raw name through; the
        # n-gram expansion itself is handled by the model's vectorizer.
        if feature_type in (
            FeatureType.FULL_NAME,
            FeatureType.CHAR_NGRAMS,
            FeatureType.WORD_NGRAMS,
        ):
            return df["name"].fillna("")

        if feature_type == FeatureType.NATIVE_NAME:
            # Prefer the identified value, fall back to the probable one.
            return df["identified_name"].fillna(df["probable_native"]).fillna("")

        if feature_type == FeatureType.SURNAME:
            return df["identified_surname"].fillna(df["probable_surname"]).fillna("")

        if feature_type == FeatureType.FIRST_WORD:
            # Empty names split to an empty list, so ``.str[0]`` yields NaN,
            # which the trailing fillna turns into "".
            return df["name"].str.split().str[0].fillna("")

        if feature_type == FeatureType.LAST_WORD:
            return df["name"].str.split().str[-1].fillna("")

        if feature_type == FeatureType.NAME_LENGTH:
            return df["name"].str.len().fillna(0)

        if feature_type == FeatureType.WORD_COUNT:
            return df["words"].fillna(1)

        if feature_type == FeatureType.PROVINCE:
            return df["province"].fillna("unknown")

        if feature_type == FeatureType.NAME_ENDINGS:
            n = self.feature_params.get("ending_length", 3)
            if n == 0:
                # ``s[-0:]`` is ``s[0:]`` (the whole string), so a zero-length
                # suffix must be taken with an explicit empty slice to stay
                # consistent with NAME_BEGINNINGS, where ``s[:0]`` is "".
                return df["name"].str[:0].fillna("")
            return df["name"].str[-n:].fillna("")

        if feature_type == FeatureType.NAME_BEGINNINGS:
            n = self.feature_params.get("beginning_length", 3)
            return df["name"].str[:n].fillna("")

        raise ValueError(f"Unknown feature type: {feature_type}")
|
||||
Reference in New Issue
Block a user