feat: document models
This commit is contained in:
@@ -17,17 +17,38 @@ class BiGRUModel(NeuralNetworkModel):
|
||||
params = kwargs
|
||||
model = Sequential(
|
||||
[
|
||||
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
|
||||
# Mask padding tokens so recurrent layers ignore them; fix input length
|
||||
# for better shape inference and to support masking through the stack.
|
||||
Embedding(
|
||||
input_dim=vocab_size,
|
||||
output_dim=params.get("embedding_dim", 64),
|
||||
input_length=max_len,
|
||||
mask_zero=True,
|
||||
),
|
||||
# First recurrent block returns full sequences to allow stacking.
|
||||
# Moderate dropout + optional recurrent_dropout to reduce overfitting
|
||||
# on short names while retaining temporal signal.
|
||||
Bidirectional(
|
||||
GRU(
|
||||
params.get("gru_units", 32),
|
||||
return_sequences=True,
|
||||
dropout=params.get("dropout", 0.2),
|
||||
recurrent_dropout=params.get("recurrent_dropout", 0.0),
|
||||
)
|
||||
),
|
||||
Bidirectional(GRU(params.get("gru_units", 32), dropout=params.get("dropout", 0.2))),
|
||||
# Second GRU summarizes to the last hidden state (no return_sequences),
|
||||
# capturing bidirectional context efficiently for classification.
|
||||
Bidirectional(
|
||||
GRU(
|
||||
params.get("gru_units", 32),
|
||||
dropout=params.get("dropout", 0.2),
|
||||
recurrent_dropout=params.get("recurrent_dropout", 0.0),
|
||||
)
|
||||
),
|
||||
# Small dense head; ReLU + dropout for capacity and regularization.
|
||||
Dense(64, activation="relu"),
|
||||
Dropout(params.get("dropout", 0.5)),
|
||||
# Two-way softmax for binary gender classification.
|
||||
Dense(2, activation="softmax"),
|
||||
]
|
||||
)
|
||||
@@ -38,19 +59,13 @@ class BiGRUModel(NeuralNetworkModel):
|
||||
return model
|
||||
|
||||
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
    """Turn the configured text columns of ``X`` into padded token sequences.

    Args:
        X: Frame holding (some of) the columns named by ``self.config.features``.

    Returns:
        The output of ``pad_sequences`` for one sequence per row of ``X``,
        post-padded to ``max_len`` (``model_params["max_len"]``, default 6).

    Raises:
        ValueError: Propagated from ``_collect_text_corpus`` when none of the
            configured feature columns is present in ``X``.
    """
    # One combined string per record. Building the corpus through the shared
    # helper keeps the number of sequences equal to len(X), so no slicing of
    # the corpus is needed afterwards.
    text_data = self._collect_text_corpus(X)

    # Fit the word-level tokenizer only on first use; later calls reuse the
    # already-fitted vocabulary stored on the instance.
    if self.tokenizer is None:
        self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
        self.tokenizer.fit_on_texts(text_data)

    sequences = self.tokenizer.texts_to_sequences(text_data)
    max_len = self.config.model_params.get("max_len", 6)

    return pad_sequences(sequences, maxlen=max_len, padding="post")
|
||||
|
||||
@@ -9,6 +9,7 @@ from tensorflow.keras.layers import (
|
||||
GlobalMaxPooling1D,
|
||||
Dense,
|
||||
Dropout,
|
||||
SpatialDropout1D,
|
||||
)
|
||||
from tensorflow.keras.models import Sequential
|
||||
|
||||
@@ -24,21 +25,33 @@ class CNNModel(NeuralNetworkModel):
|
||||
params = kwargs
|
||||
model = Sequential(
|
||||
[
|
||||
# Learn char/subword embeddings; spatial dropout regularizes across channels
|
||||
# to make the model robust to noisy characters and transliteration.
|
||||
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
|
||||
SpatialDropout1D(rate=params.get("embedding_dropout", 0.1)),
|
||||
# Small kernels capture short n-gram like patterns; padding='same' keeps
|
||||
# sequence length stable for simpler pooling behavior.
|
||||
Conv1D(
|
||||
filters=params.get("filters", 64),
|
||||
kernel_size=params.get("kernel_size", 3),
|
||||
activation="relu",
|
||||
padding="same",
|
||||
),
|
||||
# Downsample to gain some position invariance and reduce computation.
|
||||
MaxPooling1D(pool_size=2),
|
||||
# Second conv layer to compose higher-level motifs (e.g., suffix+vowel).
|
||||
Conv1D(
|
||||
filters=params.get("filters", 64),
|
||||
kernel_size=params.get("kernel_size", 3),
|
||||
activation="relu",
|
||||
padding="same",
|
||||
),
|
||||
# Global max pooling picks strongest motif evidence anywhere in the name.
|
||||
GlobalMaxPooling1D(),
|
||||
# Compact dense head with dropout to control overfitting.
|
||||
Dense(64, activation="relu"),
|
||||
Dropout(params.get("dropout", 0.5)),
|
||||
# Two-way softmax for binary classification.
|
||||
Dense(2, activation="softmax"),
|
||||
]
|
||||
)
|
||||
@@ -55,21 +68,14 @@ class CNNModel(NeuralNetworkModel):
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
|
||||
# Get text data from extracted features - use character level for CNN
|
||||
text_data = []
|
||||
for feature_type in self.config.features:
|
||||
if feature_type.value in X.columns:
|
||||
text_data.extend(X[feature_type.value].astype(str).tolist())
|
||||
|
||||
if not text_data:
|
||||
# Fallback - should not happen if FeatureExtractor is properly configured
|
||||
text_data = [""] * len(X)
|
||||
text_data = self._collect_text_corpus(X)
|
||||
|
||||
# Initialize character-level tokenizer
|
||||
if self.tokenizer is None:
|
||||
self.tokenizer = Tokenizer(char_level=True, lower=True, oov_token="<OOV>")
|
||||
self.tokenizer.fit_on_texts(text_data)
|
||||
|
||||
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
|
||||
sequences = self.tokenizer.texts_to_sequences(text_data)
|
||||
max_len = self.config.model_params.get("max_len", 20) # Longer for character level
|
||||
|
||||
return pad_sequences(sequences, maxlen=max_len, padding="post")
|
||||
|
||||
@@ -31,7 +31,8 @@ class EnsembleModel(TraditionalModel):
|
||||
"base_models", ["logistic_regression", "random_forest", "naive_bayes"]
|
||||
)
|
||||
|
||||
# Create base models with simplified configs
|
||||
# Create base models with simplified configs; diverse vectorizers/classifiers
|
||||
# encourage complementary errors that voting can average out.
|
||||
estimators = []
|
||||
for model_type in base_model_types:
|
||||
if model_type == "logistic_regression":
|
||||
@@ -78,8 +79,10 @@ class EnsembleModel(TraditionalModel):
|
||||
)
|
||||
estimators.append((f"nb", model))
|
||||
|
||||
# Soft voting averages probabilities (preferred when members are calibrated);
|
||||
# hard voting uses majority class. Parallelize member predictions.
|
||||
voting_type = params.get("voting", "soft") # 'hard' or 'soft'
|
||||
return VotingClassifier(estimators=estimators, voting=voting_type)
|
||||
return VotingClassifier(estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1))
|
||||
|
||||
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
||||
text_features = []
|
||||
|
||||
@@ -20,6 +20,8 @@ class LightGBMModel(TraditionalModel):
|
||||
def build_model(self) -> BaseEstimator:
|
||||
params = self.config.model_params
|
||||
|
||||
# Leaf-wise boosted trees excel on sparse/categorical mixes; binary objective
|
||||
# and parallelism improve training speed for this task.
|
||||
return lgb.LGBMClassifier(
|
||||
n_estimators=params.get("n_estimators", 100),
|
||||
max_depth=params.get("max_depth", -1),
|
||||
@@ -28,6 +30,8 @@ class LightGBMModel(TraditionalModel):
|
||||
subsample=params.get("subsample", 0.8),
|
||||
colsample_bytree=params.get("colsample_bytree", 0.8),
|
||||
random_state=self.config.random_seed,
|
||||
objective=params.get("objective", "binary"),
|
||||
n_jobs=params.get("n_jobs", -1),
|
||||
verbose=2,
|
||||
)
|
||||
|
||||
|
||||
@@ -13,14 +13,23 @@ class LogisticRegressionModel(TraditionalModel):
|
||||
|
||||
def build_model(self) -> BaseEstimator:
    """Build the character n-gram + logistic regression pipeline.

    All hyper-parameters are read from ``self.config.model_params`` and fall
    back to the defaults shown below when a key is absent.

    Returns:
        A ``Pipeline`` with a ``"vectorizer"`` step (``CountVectorizer``)
        followed by a ``"classifier"`` step (``LogisticRegression``).
    """
    params = self.config.model_params

    # Character n-grams are strong signals for names; (2,5) balances
    # capturing prefixes/suffixes with tractable feature size.
    vectorizer = CountVectorizer(
        analyzer="char",
        ngram_range=params.get("ngram_range", (2, 5)),
        max_features=params.get("max_features", 10000),
    )

    # liblinear handles sparse, small-to-medium problems well; class_weight
    # can mitigate imbalance.
    # NOTE(review): scikit-learn documents n_jobs as having no effect with the
    # liblinear solver - confirm whether forwarding it here is intended.
    classifier = LogisticRegression(
        max_iter=params.get("max_iter", 1000),
        random_state=self.config.random_seed,
        verbose=2,
        solver=params.get("solver", "liblinear"),
        n_jobs=params.get("n_jobs", -1),
        class_weight=params.get("class_weight", None),
    )

    return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
|
||||
|
||||
@@ -2,7 +2,7 @@ from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
|
||||
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
|
||||
from tensorflow.keras.models import Sequential
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
@@ -17,10 +17,35 @@ class LSTMModel(NeuralNetworkModel):
|
||||
params = kwargs
|
||||
model = Sequential(
|
||||
[
|
||||
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
|
||||
Bidirectional(LSTM(params.get("lstm_units", 32), return_sequences=True)),
|
||||
Bidirectional(LSTM(params.get("lstm_units", 32))),
|
||||
# Mask padding tokens; required for LSTM to ignore padded timesteps.
|
||||
Embedding(
|
||||
input_dim=vocab_size,
|
||||
output_dim=params.get("embedding_dim", 64),
|
||||
input_length=max_len,
|
||||
mask_zero=True,
|
||||
),
|
||||
# Stacked bidirectional LSTMs: first returns sequences to feed the next.
|
||||
# Dropout/recurrent_dropout mitigate overfitting on short sequences.
|
||||
Bidirectional(
|
||||
LSTM(
|
||||
params.get("lstm_units", 32),
|
||||
return_sequences=True,
|
||||
dropout=params.get("dropout", 0.2),
|
||||
recurrent_dropout=params.get("recurrent_dropout", 0.0),
|
||||
)
|
||||
),
|
||||
# Second LSTM condenses sequence to a fixed vector for classification.
|
||||
Bidirectional(
|
||||
LSTM(
|
||||
params.get("lstm_units", 32),
|
||||
dropout=params.get("dropout", 0.2),
|
||||
recurrent_dropout=params.get("recurrent_dropout", 0.0),
|
||||
)
|
||||
),
|
||||
# Compact dense head with dropout; sufficient capacity for name signals.
|
||||
Dense(64, activation="relu"),
|
||||
Dropout(params.get("dropout", 0.5)),
|
||||
# Two-way softmax for binary classification.
|
||||
Dense(2, activation="softmax"),
|
||||
]
|
||||
)
|
||||
@@ -31,14 +56,7 @@ class LSTMModel(NeuralNetworkModel):
|
||||
return model
|
||||
|
||||
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
||||
text_data = []
|
||||
|
||||
for feature_type in self.config.features:
|
||||
if feature_type.value in X.columns:
|
||||
text_data.extend(X[feature_type.value].astype(str).tolist())
|
||||
|
||||
if not text_data:
|
||||
raise ValueError("No text data found in the provided DataFrame.")
|
||||
text_data = self._collect_text_corpus(X)
|
||||
|
||||
# Initialize tokenizer if needed
|
||||
if self.tokenizer is None:
|
||||
@@ -46,7 +64,7 @@ class LSTMModel(NeuralNetworkModel):
|
||||
self.tokenizer.fit_on_texts(text_data)
|
||||
|
||||
# Convert to sequences
|
||||
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
|
||||
sequences = self.tokenizer.texts_to_sequences(text_data)
|
||||
max_len = self.config.model_params.get("max_len", 6)
|
||||
|
||||
return pad_sequences(sequences, maxlen=max_len, padding="post")
|
||||
|
||||
@@ -13,12 +13,15 @@ class NaiveBayesModel(TraditionalModel):
|
||||
|
||||
def build_model(self) -> BaseEstimator:
    """Assemble the character n-gram + Multinomial Naive Bayes pipeline.

    Hyper-parameters come from ``self.config.model_params``; missing keys
    fall back to the defaults used below.

    Returns:
        A ``Pipeline`` with a ``"vectorizer"`` step (``CountVectorizer``)
        followed by a ``"classifier"`` step (``MultinomialNB``).
    """
    cfg = self.config.model_params

    # Bag of character n-grams; the (1, 4) default keeps unigrams for
    # coverage and longer n-grams for suffix/prefix cues.
    ngram_range = cfg.get("ngram_range", (1, 4))
    max_features = cfg.get("max_features", 8000)

    # alpha is the additive smoothing term that counters zero counts for
    # rare n-grams.
    alpha = cfg.get("alpha", 1.0)

    steps = [
        (
            "vectorizer",
            CountVectorizer(
                analyzer="char",
                ngram_range=ngram_range,
                max_features=max_features,
            ),
        ),
        ("classifier", MultinomialNB(alpha=alpha)),
    ]
    return Pipeline(steps)
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
from typing import Dict
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.base import BaseEstimator
|
||||
@@ -10,15 +12,23 @@ from research.traditional_model import TraditionalModel
|
||||
class RandomForestModel(TraditionalModel):
|
||||
"""Random Forest with engineered features"""
|
||||
|
||||
def __init__(self, config):
    """Initialise the base model and an empty registry of label encoders."""
    super().__init__(config)
    # Encoders live on the instance so that categorical mappings stay
    # consistent between calls; the registry starts out empty.
    self.label_encoders: Dict[str, LabelEncoder] = dict()
|
||||
|
||||
def build_model(self) -> BaseEstimator:
    """Create the random forest classifier from the configured parameters.

    ``n_estimators``, ``max_depth`` and ``n_jobs`` are read from
    ``self.config.model_params`` (defaults: 100, ``None``, -1); the random
    state comes from ``self.config.random_seed`` and verbosity is fixed at 2.

    Returns:
        An unfitted ``RandomForestClassifier``.
    """
    cfg = self.config.model_params

    # Collect the keyword arguments first so the constructor call stays short.
    # n_jobs defaults to -1 to parallelise across trees.
    forest_kwargs = {
        "n_estimators": cfg.get("n_estimators", 100),
        "max_depth": cfg.get("max_depth", None),
        "random_state": self.config.random_seed,
        "verbose": 2,
        "n_jobs": cfg.get("n_jobs", -1),
    }
    return RandomForestClassifier(**forest_kwargs)
|
||||
|
||||
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
||||
@@ -33,9 +43,24 @@ class RandomForestModel(TraditionalModel):
|
||||
# Numerical features
|
||||
features.append(column.fillna(0).values.reshape(-1, 1))
|
||||
else:
|
||||
# Categorical features (encode them)
|
||||
le = LabelEncoder()
|
||||
encoded = le.fit_transform(column.fillna("unknown").astype(str))
|
||||
# Categorical features (encode them persistently)
|
||||
feature_key = f"encoder_{feature_type.value}"
|
||||
|
||||
if feature_key not in self.label_encoders:
|
||||
self.label_encoders[feature_key] = LabelEncoder()
|
||||
encoded = self.label_encoders[feature_key].fit_transform(
|
||||
column.fillna("unknown").astype(str)
|
||||
)
|
||||
else:
|
||||
encoder = self.label_encoders[feature_key]
|
||||
column_clean = column.fillna("unknown").astype(str)
|
||||
known_classes = set(encoder.classes_)
|
||||
default_class = "unknown" if "unknown" in known_classes else encoder.classes_[0]
|
||||
column_mapped = column_clean.apply(
|
||||
lambda value: value if value in known_classes else default_class
|
||||
)
|
||||
encoded = encoder.transform(column_mapped)
|
||||
|
||||
features.append(encoded.reshape(-1, 1))
|
||||
|
||||
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
|
||||
|
||||
@@ -13,17 +13,23 @@ class SVMModel(TraditionalModel):
|
||||
|
||||
def build_model(self) -> BaseEstimator:
|
||||
params = self.config.model_params
|
||||
# TF-IDF downweights very common patterns; char n-grams (2,4) are effective
|
||||
# for distinguishing name morphology under RBF kernels.
|
||||
vectorizer = TfidfVectorizer(
|
||||
analyzer="char",
|
||||
ngram_range=params.get("ngram_range", (2, 4)),
|
||||
max_features=params.get("max_features", 5000),
|
||||
)
|
||||
|
||||
# RBF kernel captures non-linear interactions between n-grams; probability=True
|
||||
# adds calibration at some cost. Larger cache helps speed kernel computations.
|
||||
classifier = SVC(
|
||||
kernel=params.get("kernel", "rbf"),
|
||||
C=params.get("C", 1.0),
|
||||
gamma=params.get("gamma", "scale"),
|
||||
probability=True, # Enable probability prediction
|
||||
class_weight=params.get("class_weight", None),
|
||||
cache_size=params.get("cache_size", 1000),
|
||||
random_state=self.config.random_seed,
|
||||
verbose=2,
|
||||
)
|
||||
|
||||
@@ -27,7 +27,12 @@ class TransformerModel(NeuralNetworkModel):
|
||||
|
||||
# Build Transformer model
|
||||
inputs = Input(shape=(max_len,))
|
||||
x = Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64))(inputs)
|
||||
x = Embedding(
|
||||
input_dim=vocab_size,
|
||||
output_dim=params.get("embedding_dim", 64),
|
||||
input_length=max_len,
|
||||
mask_zero=True,
|
||||
)(inputs)
|
||||
|
||||
# Add positional encoding
|
||||
positions = tf.range(start=0, limit=max_len, delta=1)
|
||||
@@ -39,6 +44,7 @@ class TransformerModel(NeuralNetworkModel):
|
||||
x = self._transformer_encoder(x, params)
|
||||
x = GlobalAveragePooling1D()(x)
|
||||
x = Dense(32, activation="relu")(x)
|
||||
x = Dropout(params.get("dropout", 0.1))(x)
|
||||
outputs = Dense(2, activation="softmax")(x)
|
||||
|
||||
model = Model(inputs, outputs)
|
||||
@@ -54,6 +60,7 @@ class TransformerModel(NeuralNetworkModel):
|
||||
attn = MultiHeadAttention(
|
||||
num_heads=cfg_params.get("transformer_num_heads", 2),
|
||||
key_dim=cfg_params.get("transformer_head_size", 64),
|
||||
dropout=cfg_params.get("attn_dropout", 0.1),
|
||||
)(x, x)
|
||||
x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(attn))
|
||||
|
||||
@@ -62,13 +69,7 @@ class TransformerModel(NeuralNetworkModel):
|
||||
return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(ff))
|
||||
|
||||
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
||||
text_data = []
|
||||
for feature_type in self.config.features:
|
||||
if feature_type.value in X.columns:
|
||||
text_data.extend(X[feature_type.value].astype(str).tolist())
|
||||
|
||||
if not text_data:
|
||||
raise ValueError("No text data found in the provided DataFrame.")
|
||||
text_data = self._collect_text_corpus(X)
|
||||
|
||||
# Initialize tokenizer if needed
|
||||
if self.tokenizer is None:
|
||||
@@ -76,7 +77,7 @@ class TransformerModel(NeuralNetworkModel):
|
||||
self.tokenizer.fit_on_texts(text_data)
|
||||
|
||||
# Convert to sequences
|
||||
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
|
||||
sequences = self.tokenizer.texts_to_sequences(text_data)
|
||||
max_len = self.config.model_params.get("max_len", 6)
|
||||
|
||||
return pad_sequences(sequences, maxlen=max_len, padding="post")
|
||||
|
||||
@@ -20,6 +20,8 @@ class XGBoostModel(TraditionalModel):
|
||||
def build_model(self) -> BaseEstimator:
|
||||
params = self.config.model_params
|
||||
|
||||
# Histogram-based trees and parallelism provide fast training; default
|
||||
# logloss metric suits binary classification of gender.
|
||||
return xgb.XGBClassifier(
|
||||
n_estimators=params.get("n_estimators", 100),
|
||||
max_depth=params.get("max_depth", 6),
|
||||
@@ -28,6 +30,8 @@ class XGBoostModel(TraditionalModel):
|
||||
colsample_bytree=params.get("colsample_bytree", 0.8),
|
||||
random_state=self.config.random_seed,
|
||||
eval_metric="logloss",
|
||||
n_jobs=params.get("n_jobs", -1),
|
||||
tree_method=params.get("tree_method", "hist"),
|
||||
verbosity=2,
|
||||
)
|
||||
|
||||
|
||||
@@ -82,6 +82,25 @@ class NeuralNetworkModel(BaseModel):
|
||||
self.is_fitted = True
|
||||
return self
|
||||
|
||||
def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
    """Build one text string per row of ``X`` from the configured features.

    Only features from ``self.config.features`` whose ``value`` names a
    column of ``X`` are used. Missing cells become empty strings and every
    cell is converted to ``str``.

    Args:
        X: Frame to read the text columns from.

    Returns:
        A list with one entry per row. With a single matching column the
        entries are that column's values; with several, the non-empty cell
        values of each row are joined by a single space.

    Raises:
        ValueError: If none of the configured features is a column of ``X``.
    """
    present = [feat.value for feat in self.config.features if feat.value in X.columns]
    if not present:
        raise ValueError("No configured text features found in the provided DataFrame.")

    cleaned = X[present].fillna("").astype(str)

    if len(present) > 1:
        # Drop empty cells before joining so no stray separators appear.
        return [
            " ".join(cell for cell in record if cell)
            for record in cleaned.itertuples(index=False)
        ]

    # Single column: return it as-is, empty strings included.
    return cleaned.iloc[:, 0].tolist()
|
||||
|
||||
def cross_validate(
|
||||
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
|
||||
) -> dict[str, np.floating[Any]]:
|
||||
@@ -145,6 +164,9 @@ class NeuralNetworkModel(BaseModel):
|
||||
"""Generate learning curve data for the model"""
|
||||
logging.info(f"Generating learning curve for {self.__class__.__name__}")
|
||||
|
||||
if train_sizes is None:
|
||||
train_sizes = [0.1, 0.3, 0.5, 0.7, 1.0]
|
||||
|
||||
learning_curve_data = {
|
||||
"train_sizes": [],
|
||||
"train_scores": [],
|
||||
|
||||
Reference in New Issue
Block a user