refactor: add initial pipeline configuration and model classes
@@ -0,0 +1,56 @@
from typing import Any

import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from research.neural_network_model import NeuralNetworkModel


class BiGRUModel(NeuralNetworkModel):
    """Bidirectional GRU model for name classification"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        params = kwargs
        model = Sequential(
            [
                Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
                Bidirectional(
                    GRU(
                        params.get("gru_units", 32),
                        return_sequences=True,
                        dropout=params.get("dropout", 0.2),
                    )
                ),
                Bidirectional(GRU(params.get("gru_units", 32), dropout=params.get("dropout", 0.2))),
                Dense(64, activation="relu"),
                Dropout(params.get("dropout", 0.5)),
                Dense(2, activation="softmax"),
            ]
        )

        model.compile(
            loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        return model

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_data = []
        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_data.extend(X[feature_type.value].astype(str).tolist())

        if not text_data:
            raise ValueError("No text data found in the provided DataFrame.")

        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(text_data)

        # Only the first len(X) entries are sequenced; any additional feature
        # columns contribute to tokenizer fitting but not to the returned matrix.
        sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
        max_len = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=max_len, padding="post")
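Review note: the two halves above are meant to be wired together by NeuralNetworkModel, which is not shown in this diff. A minimal sketch of the expected call order, assuming a `config` object carrying `features` and `model_params` and a `tokenizer` attribute initialized to None; everything outside this file is an assumption:

model = BiGRUModel(config)
X_seq = model.prepare_features(X_train)           # fits self.tokenizer on first use
vocab_size = len(model.tokenizer.word_index) + 1  # +1 reserves index 0 for padding
net = model.build_model_with_vocab(vocab_size, max_len=X_seq.shape[1], gru_units=32)
net.fit(X_seq, y_train, epochs=5, validation_split=0.1)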
@@ -0,0 +1,75 @@
from typing import Any

import numpy as np
import pandas as pd
from tensorflow.keras.layers import (
    Embedding,
    Conv1D,
    MaxPooling1D,
    GlobalMaxPooling1D,
    Dense,
    Dropout,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from research.neural_network_model import NeuralNetworkModel


class CNNModel(NeuralNetworkModel):
    """1D Convolutional Neural Network for character patterns"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 20, **kwargs) -> Any:
        """Build CNN model with known vocabulary size"""
        params = kwargs
        model = Sequential(
            [
                Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
                Conv1D(
                    filters=params.get("filters", 64),
                    kernel_size=params.get("kernel_size", 3),
                    activation="relu",
                ),
                MaxPooling1D(pool_size=2),
                Conv1D(
                    filters=params.get("filters", 64),
                    kernel_size=params.get("kernel_size", 3),
                    activation="relu",
                ),
                GlobalMaxPooling1D(),
                Dense(64, activation="relu"),
                Dropout(params.get("dropout", 0.5)),
                Dense(2, activation="softmax"),
            ]
        )

        model.compile(
            loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        return model

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Prepare sequences for CNN using extracted features"""
        # X here contains the features already extracted by FeatureExtractor.
        # Get text data from extracted features - use character level for CNN.
        text_data = []
        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_data.extend(X[feature_type.value].astype(str).tolist())

        if not text_data:
            # Fallback - should not happen if FeatureExtractor is properly configured
            text_data = [""] * len(X)

        # Initialize character-level tokenizer
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=True, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(text_data)

        sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
        max_len = self.config.model_params.get("max_len", 20)  # Longer for character level

        return pad_sequences(sequences, maxlen=max_len, padding="post")
@@ -0,0 +1,97 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from research.experiment import ExperimentConfig
from research.traditional_model import TraditionalModel


class EnsembleModel(TraditionalModel):
    """Ensemble model combining multiple base models"""

    @property
    def architecture(self) -> str:
        """Return the architecture type"""
        return "ensemble"

    def __init__(self, config: ExperimentConfig):
        super().__init__(config)
        self.base_models = []
        self.model_weights = None

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params
        base_model_types = params.get(
            "base_models", ["logistic_regression", "random_forest", "naive_bayes"]
        )

        # Create base models with simplified configs
        estimators = []
        for model_type in base_model_types:
            if model_type == "logistic_regression":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            CountVectorizer(analyzer="char", ngram_range=(2, 4), max_features=5000),
                        ),
                        (
                            "classifier",
                            LogisticRegression(max_iter=1000, random_state=self.config.random_seed),
                        ),
                    ]
                )
                estimators.append(("logistic_regression", model))

            elif model_type == "random_forest":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            TfidfVectorizer(analyzer="char", ngram_range=(2, 3), max_features=3000),
                        ),
                        (
                            "classifier",
                            RandomForestClassifier(
                                n_estimators=50, random_state=self.config.random_seed
                            ),
                        ),
                    ]
                )
                estimators.append(("rf", model))

            elif model_type == "naive_bayes":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            CountVectorizer(analyzer="char", ngram_range=(1, 3), max_features=4000),
                        ),
                        ("classifier", MultinomialNB()),
                    ]
                )
                estimators.append(("nb", model))

        voting_type = params.get("voting", "soft")  # 'hard' or 'soft'
        return VotingClassifier(estimators=estimators, voting=voting_type)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_features = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_features.append(X[feature_type.value].astype(str))

        if not text_features:
            raise ValueError("No text data found in the provided DataFrame.")

        if len(text_features) == 1:
            return text_features[0].values
        else:
            combined = text_features[0].astype(str)
            for feature in text_features[1:]:
                combined = combined + " " + feature.astype(str)
            return combined.values
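With voting="soft", VotingClassifier averages each pipeline's predict_proba and takes the argmax, which is why every base pipeline above ends in a probability-capable classifier. A hedged usage sketch; the `config` object and data splits are assumptions, not part of this diff:

wrapper = EnsembleModel(config)
wrapper.config.model_params["base_models"] = ["logistic_regression", "naive_bayes"]
clf = wrapper.build_model()
clf.fit(wrapper.prepare_features(X_train), y_train)
proba = clf.predict_proba(wrapper.prepare_features(X_test))  # soft-vote average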
@@ -0,0 +1,51 @@
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

from research.traditional_model import TraditionalModel


class LightGBMModel(TraditionalModel):
    """LightGBM with engineered features"""

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params

        return lgb.LGBMClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", -1),
            learning_rate=params.get("learning_rate", 0.1),
            num_leaves=params.get("num_leaves", 31),
            subsample=params.get("subsample", 0.8),
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
            verbose=-1,
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        features = []

        # Cache per-column vectorizers so train and inference share one feature
        # space; refitting on each call would silently misalign the columns.
        if not hasattr(self, "_vectorizers"):
            self._vectorizers = {}

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                column = X[feature_type.value]

                if feature_type.value in ["name_length", "word_count"]:
                    features.append(column.fillna(0).values.reshape(-1, 1))
                elif feature_type.value in ["full_name", "native_name", "surname"]:
                    # Character n-grams for text features
                    if feature_type.value not in self._vectorizers:
                        self._vectorizers[feature_type.value] = CountVectorizer(
                            analyzer="char", ngram_range=(2, 3), max_features=50
                        ).fit(column.fillna("").astype(str))
                    char_features = self._vectorizers[feature_type.value].transform(
                        column.fillna("").astype(str)
                    ).toarray()
                    features.append(char_features)
                else:
                    # NOTE: the encoder is refit on each call, so integer codes
                    # are not guaranteed stable across calls.
                    le = LabelEncoder()
                    encoded = le.fit_transform(column.fillna("unknown").astype(str))
                    features.append(encoded.reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
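For orientation, the hstacked matrix above mixes column blocks of very different widths. A sketch under an assumed config with a numeric, a name, and a categorical feature; `config` and the splits are hypothetical:

wrapper = LightGBMModel(config)
X_mat = wrapper.prepare_features(X_train)  # e.g. 1 numeric + up to 50 n-gram + 1 encoded column
clf = wrapper.build_model()
clf.fit(X_mat, y_train)
preds = clf.predict(wrapper.prepare_features(X_test))  # reuses the cached vectorizers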
@@ -0,0 +1,44 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from research.traditional_model import TraditionalModel


class LogisticRegressionModel(TraditionalModel):
    """Logistic Regression with character n-grams"""

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params
        vectorizer = CountVectorizer(
            analyzer="char",
            ngram_range=params.get("ngram_range", (2, 5)),
            max_features=params.get("max_features", 10000),
        )

        classifier = LogisticRegression(
            max_iter=params.get("max_iter", 1000), random_state=self.config.random_seed
        )

        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_features = []

        # Collect text-based features from the extracted features DataFrame
        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_features.append(X[feature_type.value].astype(str))

        if not text_features:
            raise ValueError("No text data found in the provided DataFrame.")

        # Combine text features
        if len(text_features) == 1:
            return text_features[0].values
        else:
            # Concatenate multiple text features with separator
            combined = text_features[0].astype(str)
            for feature in text_features[1:]:
                combined = combined + " " + feature.astype(str)
            return combined.values
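Because the pipeline vectorizes raw strings itself, prepare_features only needs to return one string per row. A self-contained toy illustration of the character n-gram idea (made-up names and labels, not the project's data):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("vectorizer", CountVectorizer(analyzer="char", ngram_range=(2, 5))),
    ("classifier", LogisticRegression(max_iter=1000)),
])
# Each name becomes a bag of 2-5 character substrings, so shared affixes
# carry signal even for surnames never seen in training.
pipe.fit(["Ivanov", "Petrova", "Smith", "Johnson"], [1, 1, 0, 0])
print(pipe.predict(["Sidorov"]))  # classify an unseen name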
@@ -0,0 +1,52 @@
from typing import Any

import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from research.neural_network_model import NeuralNetworkModel


class LSTMModel(NeuralNetworkModel):
    """LSTM model for sequence learning"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        params = kwargs
        model = Sequential(
            [
                Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
                Bidirectional(LSTM(params.get("lstm_units", 32), return_sequences=True)),
                Bidirectional(LSTM(params.get("lstm_units", 32))),
                Dense(64, activation="relu"),
                Dense(2, activation="softmax"),
            ]
        )

        model.compile(
            loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        return model

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_data = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_data.extend(X[feature_type.value].astype(str).tolist())

        if not text_data:
            raise ValueError("No text data found in the provided DataFrame.")

        # Initialize tokenizer if needed
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(text_data)

        # Convert to sequences
        sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
        max_len = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=max_len, padding="post")
@@ -0,0 +1,39 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from research.traditional_model import TraditionalModel


class NaiveBayesModel(TraditionalModel):
    """Multinomial Naive Bayes with character n-grams"""

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params
        vectorizer = CountVectorizer(
            analyzer="char",
            ngram_range=params.get("ngram_range", (1, 4)),
            max_features=params.get("max_features", 8000),
        )

        classifier = MultinomialNB(alpha=params.get("alpha", 1.0))

        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_features = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_features.append(X[feature_type.value].astype(str))

        if not text_features:
            raise ValueError("No text data found in the provided DataFrame.")

        if len(text_features) == 1:
            return text_features[0].values
        else:
            combined = text_features[0].astype(str)
            for feature in text_features[1:]:
                combined = combined + " " + feature.astype(str)
            return combined.values
@@ -0,0 +1,40 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

from research.traditional_model import TraditionalModel


class RandomForestModel(TraditionalModel):
    """Random Forest with engineered features"""

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params

        return RandomForestClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", None),
            random_state=self.config.random_seed,
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        features = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                column = X[feature_type.value]

                # Handle different feature types
                if feature_type.value in ["name_length", "word_count"]:
                    # Numerical features
                    features.append(column.fillna(0).values.reshape(-1, 1))
                else:
                    # Categorical features (encode them). NOTE: the encoder is
                    # refit on each call, so codes may shift between calls.
                    le = LabelEncoder()
                    encoded = le.fit_transform(column.fillna("unknown").astype(str))
                    features.append(encoded.reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
@@ -0,0 +1,45 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from research.traditional_model import TraditionalModel


class SVMModel(TraditionalModel):
    """Support Vector Machine with character n-grams (RBF kernel by default)"""

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params
        vectorizer = TfidfVectorizer(
            analyzer="char",
            ngram_range=params.get("ngram_range", (2, 4)),
            max_features=params.get("max_features", 5000),
        )

        classifier = SVC(
            kernel=params.get("kernel", "rbf"),
            C=params.get("C", 1.0),
            gamma=params.get("gamma", "scale"),
            probability=True,  # Enable probability prediction
            random_state=self.config.random_seed,
        )

        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_features = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_features.append(X[feature_type.value].astype(str))

        if not text_features:
            raise ValueError("No text data found in the provided DataFrame.")

        if len(text_features) == 1:
            return text_features[0].values
        else:
            combined = text_features[0].astype(str)
            for feature in text_features[1:]:
                combined = combined + " " + feature.astype(str)
            return combined.values
@@ -0,0 +1,82 @@
from typing import Any

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import (
    Input,
    Embedding,
    Dense,
    GlobalAveragePooling1D,
    MultiHeadAttention,
    Dropout,
    LayerNormalization,
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from research.neural_network_model import NeuralNetworkModel


class TransformerModel(NeuralNetworkModel):
    """Transformer-based model"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        params = kwargs

        # Build Transformer model
        inputs = Input(shape=(max_len,))
        x = Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64))(inputs)

        # Add positional encoding: embed the indices 0..max_len-1 and broadcast
        # the (max_len, dim) table over the batch dimension
        positions = tf.range(start=0, limit=max_len, delta=1)
        pos_embedding = Embedding(input_dim=max_len, output_dim=params.get("embedding_dim", 64))(
            positions
        )
        x = x + pos_embedding

        x = self._transformer_encoder(x, params)
        x = GlobalAveragePooling1D()(x)
        x = Dense(32, activation="relu")(x)
        outputs = Dense(2, activation="softmax")(x)

        model = Model(inputs, outputs)
        model.compile(
            optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
        )
        return model

    @staticmethod
    def _transformer_encoder(x, cfg_params):
        """Transformer encoder block: self-attention and feed-forward sublayers,
        each wrapped in dropout, a residual connection, and layer normalization."""
        attn = MultiHeadAttention(
            num_heads=cfg_params.get("transformer_num_heads", 2),
            key_dim=cfg_params.get("transformer_head_size", 64),
        )(x, x)
        x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(attn))

        ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
        ff = Dense(x.shape[-1])(ff)  # project back so the residual add is shape-compatible
        return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(ff))

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_data = []
        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_data.extend(X[feature_type.value].astype(str).tolist())

        if not text_data:
            raise ValueError("No text data found in the provided DataFrame.")

        # Initialize tokenizer if needed
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(oov_token="<OOV>")
            self.tokenizer.fit_on_texts(text_data)

        # Convert to sequences
        sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
        max_len = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=max_len, padding="post")
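A quick standalone shape check of the encoder block (hypothetical sizes; runs the block on symbolic tensors outside the full model):

from tensorflow.keras.layers import Input, Embedding

tokens = Input(shape=(6,))
x = Embedding(input_dim=100, output_dim=64)(tokens)
y = TransformerModel._transformer_encoder(x, {"transformer_num_heads": 2})
print(y.shape)  # (None, 6, 64): Dense(x.shape[-1]) restores the dim so residual adds line up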
@@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

from research.traditional_model import TraditionalModel


class XGBoostModel(TraditionalModel):
    """XGBoost with engineered features and character n-gram counts"""

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params

        return xgb.XGBClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", 6),
            learning_rate=params.get("learning_rate", 0.1),
            subsample=params.get("subsample", 0.8),
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
            eval_metric="logloss",
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        features = []

        # Cache per-column vectorizers so train and inference share one feature space
        if not hasattr(self, "_vectorizers"):
            self._vectorizers = {}

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                column = X[feature_type.value]

                if feature_type.value in ["name_length", "word_count"]:
                    # Numerical features
                    features.append(column.fillna(0).values.reshape(-1, 1))
                elif feature_type.value in ["full_name", "native_name", "surname"]:
                    # Character-level features for names
                    if feature_type.value not in self._vectorizers:
                        self._vectorizers[feature_type.value] = CountVectorizer(
                            analyzer="char", ngram_range=(2, 3), max_features=100
                        ).fit(column.fillna("").astype(str))
                    char_features = self._vectorizers[feature_type.value].transform(
                        column.fillna("").astype(str)
                    ).toarray()
                    features.append(char_features)
                else:
                    # Categorical features; the encoder is refit per call (codes may shift)
                    le = LabelEncoder()
                    encoded = le.fit_transform(column.fillna("unknown").astype(str))
                    features.append(encoded.reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)