refactor: update configuration loading and ensure directory existence across modules
@@ -11,6 +11,12 @@ from research.traditional_model import TraditionalModel
 class LightGBMModel(TraditionalModel):
     """LightGBM with engineered features"""
 
     def __init__(self, config):
         super().__init__(config)
+        # Store vectorizers and encoders to ensure consistent feature space
+        self.vectorizers = {}
+        self.label_encoders = {}
 
     def build_model(self) -> BaseEstimator:
         params = self.config.model_params
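The two dicts added above are what the reworked transform in the next hunk relies on: each vectorizer and encoder is fit once and then reused, so train-time and predict-time feature matrices line up. A minimal standalone sketch of that behavior, not part of the commit (the names are invented; only the CountVectorizer settings come from the diff):

from sklearn.feature_extraction.text import CountVectorizer

train_names = ["anna schmidt", "john smith"]  # hypothetical training data
new_names = ["li wei"]                        # hypothetical prediction data

vec = CountVectorizer(analyzer="char", ngram_range=(2, 3), max_features=50)
train_feats = vec.fit_transform(train_names).toarray()  # fit once, as on the first call
new_feats = vec.transform(new_names).toarray()          # reuse, as on later calls

# Reusing the fitted vectorizer keeps the feature width identical,
# which is what "consistent feature space" means here.
assert train_feats.shape[1] == new_feats.shape[1]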
@@ -33,19 +39,58 @@ class LightGBMModel(TraditionalModel):
             column = X[feature_type.value]
 
             if feature_type.value in ["name_length", "word_count"]:
                 # Numerical features
                 features.append(column.fillna(0).values.reshape(-1, 1))
             elif feature_type.value in ["full_name", "native_name", "surname"]:
-                # Character n-grams for text features
-                vectorizer = CountVectorizer(
-                    analyzer="char", ngram_range=(2, 3), max_features=50
-                )
-                char_features = vectorizer.fit_transform(
-                    column.fillna("").astype(str)
-                ).toarray()
+                # Character-level features for names
+                feature_key = f"vectorizer_{feature_type.value}"
+
+                if feature_key not in self.vectorizers:
+                    # First time - create and fit vectorizer
+                    self.vectorizers[feature_key] = CountVectorizer(
+                        analyzer="char", ngram_range=(2, 3), max_features=50
+                    )
+                    char_features = self.vectorizers[feature_key].fit_transform(
+                        column.fillna("").astype(str)
+                    ).toarray()
+                else:
+                    # Subsequent times - use existing vectorizer
+                    char_features = self.vectorizers[feature_key].transform(
+                        column.fillna("").astype(str)
+                    ).toarray()
+
                 features.append(char_features)
             else:
-                le = LabelEncoder()
-                encoded = le.fit_transform(column.fillna("unknown").astype(str))
+                # Categorical features
+                feature_key = f"encoder_{feature_type.value}"
+
+                if feature_key not in self.label_encoders:
+                    # First time - create and fit encoder
+                    self.label_encoders[feature_key] = LabelEncoder()
+                    encoded = self.label_encoders[feature_key].fit_transform(
+                        column.fillna("unknown").astype(str)
+                    )
+                else:
+                    # Subsequent times - use existing encoder
+                    # Handle unseen labels by mapping them to a default value
+                    column_clean = column.fillna("unknown").astype(str)
+
+                    # Get the classes the encoder was trained on
+                    known_classes = set(self.label_encoders[feature_key].classes_)
+
+                    # Map unseen values to "unknown" if it exists, otherwise to the first class
+                    if "unknown" in known_classes:
+                        default_class = "unknown"
+                    else:
+                        default_class = self.label_encoders[feature_key].classes_[0]
+
+                    # Replace unseen values with default
+                    column_mapped = column_clean.apply(
+                        lambda x: x if x in known_classes else default_class
+                    )
+
+                    encoded = self.label_encoders[feature_key].transform(column_mapped)
+
                 features.append(encoded.reshape(-1, 1))
 
         return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
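The unseen-label handling in the else branch is the subtle part: sklearn's LabelEncoder.transform raises ValueError on labels it was not fit on, so the diff remaps them to a known default first. A self-contained sketch of that same mapping, not part of the commit (the class values are invented):

import pandas as pd
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(["kanji", "latin", "unknown"])            # classes seen during training

column = pd.Series(["latin", "cyrillic", None])  # "cyrillic" and NaN were never seen
column_clean = column.fillna("unknown").astype(str)

known_classes = set(le.classes_)
default_class = "unknown" if "unknown" in known_classes else le.classes_[0]
column_mapped = column_clean.apply(lambda x: x if x in known_classes else default_class)

print(le.transform(column_mapped))  # [1 2 2] - unseen values fall back to "unknown"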