feat: document models

This commit is contained in:
2025-09-20 23:35:54 +02:00
parent dd2a9f2711
commit e41b15a863
13 changed files with 256 additions and 47 deletions
+25 -10
View File
@@ -17,17 +17,38 @@ class BiGRUModel(NeuralNetworkModel):
params = kwargs
model = Sequential(
[
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
# Mask padding tokens so recurrent layers ignore them; fix input length
# for better shape inference and to support masking through the stack.
Embedding(
input_dim=vocab_size,
output_dim=params.get("embedding_dim", 64),
input_length=max_len,
mask_zero=True,
),
# First recurrent block returns full sequences to allow stacking.
# Moderate dropout + optional recurrent_dropout to reduce overfitting
# on short names while retaining temporal signal.
Bidirectional(
GRU(
params.get("gru_units", 32),
return_sequences=True,
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
Bidirectional(GRU(params.get("gru_units", 32), dropout=params.get("dropout", 0.2))),
# Second GRU summarizes to the last hidden state (no return_sequences),
# capturing bidirectional context efficiently for classification.
Bidirectional(
GRU(
params.get("gru_units", 32),
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Small dense head; ReLU + dropout for capacity and regularization.
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary gender classification.
Dense(2, activation="softmax"),
]
)
@@ -38,19 +59,13 @@ class BiGRUModel(NeuralNetworkModel):
return model
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_data.extend(X[feature_type.value].astype(str).tolist())
if not text_data:
raise ValueError("No text data found in the provided DataFrame.")
text_data = self._collect_text_corpus(X)
if self.tokenizer is None:
self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
+15 -9
View File
@@ -9,6 +9,7 @@ from tensorflow.keras.layers import (
GlobalMaxPooling1D,
Dense,
Dropout,
SpatialDropout1D,
)
from tensorflow.keras.models import Sequential
@@ -24,21 +25,33 @@ class CNNModel(NeuralNetworkModel):
params = kwargs
model = Sequential(
[
# Learn char/subword embeddings; spatial dropout regularizes across channels
# to make the model robust to noisy characters and transliteration.
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
SpatialDropout1D(rate=params.get("embedding_dropout", 0.1)),
# Small kernels capture short n-gram like patterns; padding='same' keeps
# sequence length stable for simpler pooling behavior.
Conv1D(
filters=params.get("filters", 64),
kernel_size=params.get("kernel_size", 3),
activation="relu",
padding="same",
),
# Downsample to gain some position invariance and reduce computation.
MaxPooling1D(pool_size=2),
# Second conv layer to compose higher-level motifs (e.g., suffix+vowel).
Conv1D(
filters=params.get("filters", 64),
kernel_size=params.get("kernel_size", 3),
activation="relu",
padding="same",
),
# Global max pooling picks strongest motif evidence anywhere in the name.
GlobalMaxPooling1D(),
# Compact dense head with dropout to control overfitting.
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary classification.
Dense(2, activation="softmax"),
]
)
@@ -55,21 +68,14 @@ class CNNModel(NeuralNetworkModel):
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Get text data from extracted features - use character level for CNN
text_data = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_data.extend(X[feature_type.value].astype(str).tolist())
if not text_data:
# Fallback - should not happen if FeatureExtractor is properly configured
text_data = [""] * len(X)
text_data = self._collect_text_corpus(X)
# Initialize character-level tokenizer
if self.tokenizer is None:
self.tokenizer = Tokenizer(char_level=True, lower=True, oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 20) # Longer for character level
return pad_sequences(sequences, maxlen=max_len, padding="post")
+5 -2
View File
@@ -31,7 +31,8 @@ class EnsembleModel(TraditionalModel):
"base_models", ["logistic_regression", "random_forest", "naive_bayes"]
)
# Create base models with simplified configs
# Create base models with simplified configs; diverse vectorizers/classifiers
# encourage complementary errors that voting can average out.
estimators = []
for model_type in base_model_types:
if model_type == "logistic_regression":
@@ -78,8 +79,10 @@ class EnsembleModel(TraditionalModel):
)
estimators.append((f"nb", model))
# Soft voting averages probabilities (preferred when members are calibrated);
# hard voting uses majority class. Parallelize member predictions.
voting_type = params.get("voting", "soft") # 'hard' or 'soft'
return VotingClassifier(estimators=estimators, voting=voting_type)
return VotingClassifier(estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1))
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
+4
View File
@@ -20,6 +20,8 @@ class LightGBMModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Leaf-wise boosted trees excel on sparse/categorical mixes; binary objective
# and parallelism improve training speed for this task.
return lgb.LGBMClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", -1),
@@ -28,6 +30,8 @@ class LightGBMModel(TraditionalModel):
subsample=params.get("subsample", 0.8),
colsample_bytree=params.get("colsample_bytree", 0.8),
random_state=self.config.random_seed,
objective=params.get("objective", "binary"),
n_jobs=params.get("n_jobs", -1),
verbose=2,
)
+10 -1
View File
@@ -13,14 +13,23 @@ class LogisticRegressionModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Character n-grams are strong signals for names; (2,5) balances
# capturing prefixes/suffixes with tractable feature size.
vectorizer = CountVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 5)),
max_features=params.get("max_features", 10000),
)
# liblinear handles sparse, small-to-medium problems well; n_jobs parallelizes
# OvR across classes (no effect for binary). class_weight can mitigate imbalance.
classifier = LogisticRegression(
max_iter=params.get("max_iter", 1000), random_state=self.config.random_seed, verbose=2
max_iter=params.get("max_iter", 1000),
random_state=self.config.random_seed,
verbose=2,
solver=params.get("solver", "liblinear"),
n_jobs=params.get("n_jobs", -1),
class_weight=params.get("class_weight", None),
)
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
+31 -13
View File
@@ -2,7 +2,7 @@ from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
@@ -17,10 +17,35 @@ class LSTMModel(NeuralNetworkModel):
params = kwargs
model = Sequential(
[
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
Bidirectional(LSTM(params.get("lstm_units", 32), return_sequences=True)),
Bidirectional(LSTM(params.get("lstm_units", 32))),
# Mask padding tokens; required for LSTM to ignore padded timesteps.
Embedding(
input_dim=vocab_size,
output_dim=params.get("embedding_dim", 64),
input_length=max_len,
mask_zero=True,
),
# Stacked bidirectional LSTMs: first returns sequences to feed the next.
# Dropout/recurrent_dropout mitigate overfitting on short sequences.
Bidirectional(
LSTM(
params.get("lstm_units", 32),
return_sequences=True,
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Second LSTM condenses sequence to a fixed vector for classification.
Bidirectional(
LSTM(
params.get("lstm_units", 32),
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Compact dense head with dropout; sufficient capacity for name signals.
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary classification.
Dense(2, activation="softmax"),
]
)
@@ -31,14 +56,7 @@ class LSTMModel(NeuralNetworkModel):
return model
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_data.extend(X[feature_type.value].astype(str).tolist())
if not text_data:
raise ValueError("No text data found in the provided DataFrame.")
text_data = self._collect_text_corpus(X)
# Initialize tokenizer if needed
if self.tokenizer is None:
@@ -46,7 +64,7 @@ class LSTMModel(NeuralNetworkModel):
self.tokenizer.fit_on_texts(text_data)
# Convert to sequences
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
+3
View File
@@ -13,12 +13,15 @@ class NaiveBayesModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
    """Assemble the character n-gram CountVectorizer + Multinomial NB pipeline.

    Returns:
        A two-step sklearn ``Pipeline`` (``vectorizer`` -> ``classifier``)
        configured from ``self.config.model_params``.
    """
    params = self.config.model_params
    # Bag-of-character-ngrams aligns with Multinomial NB assumptions; (1,4)
    # includes unigrams for coverage and higher n for suffix/prefix cues.
    steps = [
        (
            "vectorizer",
            CountVectorizer(
                analyzer="char",
                ngram_range=params.get("ngram_range", (1, 4)),
                max_features=params.get("max_features", 8000),
            ),
        ),
        (
            # Laplace smoothing (alpha) counters zero counts for rare n-grams.
            "classifier",
            MultinomialNB(alpha=params.get("alpha", 1.0)),
        ),
    ]
    return Pipeline(steps)
+28 -3
View File
@@ -1,3 +1,5 @@
from typing import Dict
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
@@ -10,15 +12,23 @@ from research.traditional_model import TraditionalModel
class RandomForestModel(TraditionalModel):
"""Random Forest with engineered features"""
def __init__(self, config):
super().__init__(config)
# Persist encoders so categorical mappings stay consistent.
self.label_encoders: Dict[str, LabelEncoder] = {}
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Tree ensemble is robust to mixed numeric/categorical encodings; parallelize
# across trees for speed. Keep depth moderate for generalisation.
return RandomForestClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", None),
random_state=self.config.random_seed,
verbose=2,
n_jobs=params.get("n_jobs", -1),
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
@@ -33,9 +43,24 @@ class RandomForestModel(TraditionalModel):
# Numerical features
features.append(column.fillna(0).values.reshape(-1, 1))
else:
# Categorical features (encode them)
le = LabelEncoder()
encoded = le.fit_transform(column.fillna("unknown").astype(str))
# Categorical features (encode them persistently)
feature_key = f"encoder_{feature_type.value}"
if feature_key not in self.label_encoders:
self.label_encoders[feature_key] = LabelEncoder()
encoded = self.label_encoders[feature_key].fit_transform(
column.fillna("unknown").astype(str)
)
else:
encoder = self.label_encoders[feature_key]
column_clean = column.fillna("unknown").astype(str)
known_classes = set(encoder.classes_)
default_class = "unknown" if "unknown" in known_classes else encoder.classes_[0]
column_mapped = column_clean.apply(
lambda value: value if value in known_classes else default_class
)
encoded = encoder.transform(column_mapped)
features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
+6
View File
@@ -13,17 +13,23 @@ class SVMModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# TF-IDF downweights very common patterns; char n-grams (2,4) are effective
# for distinguishing name morphology under RBF kernels.
vectorizer = TfidfVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 4)),
max_features=params.get("max_features", 5000),
)
# RBF kernel captures non-linear interactions between n-grams; probability=True
# adds calibration at some cost. Larger cache helps speed kernel computations.
classifier = SVC(
kernel=params.get("kernel", "rbf"),
C=params.get("C", 1.0),
gamma=params.get("gamma", "scale"),
probability=True, # Enable probability prediction
class_weight=params.get("class_weight", None),
cache_size=params.get("cache_size", 1000),
random_state=self.config.random_seed,
verbose=2,
)
+10 -9
View File
@@ -27,7 +27,12 @@ class TransformerModel(NeuralNetworkModel):
# Build Transformer model
inputs = Input(shape=(max_len,))
x = Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64))(inputs)
x = Embedding(
input_dim=vocab_size,
output_dim=params.get("embedding_dim", 64),
input_length=max_len,
mask_zero=True,
)(inputs)
# Add positional encoding
positions = tf.range(start=0, limit=max_len, delta=1)
@@ -39,6 +44,7 @@ class TransformerModel(NeuralNetworkModel):
x = self._transformer_encoder(x, params)
x = GlobalAveragePooling1D()(x)
x = Dense(32, activation="relu")(x)
x = Dropout(params.get("dropout", 0.1))(x)
outputs = Dense(2, activation="softmax")(x)
model = Model(inputs, outputs)
@@ -54,6 +60,7 @@ class TransformerModel(NeuralNetworkModel):
attn = MultiHeadAttention(
num_heads=cfg_params.get("transformer_num_heads", 2),
key_dim=cfg_params.get("transformer_head_size", 64),
dropout=cfg_params.get("attn_dropout", 0.1),
)(x, x)
x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(attn))
@@ -62,13 +69,7 @@ class TransformerModel(NeuralNetworkModel):
return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(ff))
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_data.extend(X[feature_type.value].astype(str).tolist())
if not text_data:
raise ValueError("No text data found in the provided DataFrame.")
text_data = self._collect_text_corpus(X)
# Initialize tokenizer if needed
if self.tokenizer is None:
@@ -76,7 +77,7 @@ class TransformerModel(NeuralNetworkModel):
self.tokenizer.fit_on_texts(text_data)
# Convert to sequences
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
+4
View File
@@ -20,6 +20,8 @@ class XGBoostModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Histogram-based trees and parallelism provide fast training; default
# logloss metric suits binary classification of gender.
return xgb.XGBClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", 6),
@@ -28,6 +30,8 @@ class XGBoostModel(TraditionalModel):
colsample_bytree=params.get("colsample_bytree", 0.8),
random_state=self.config.random_seed,
eval_metric="logloss",
n_jobs=params.get("n_jobs", -1),
tree_method=params.get("tree_method", "hist"),
verbosity=2,
)
+22
View File
@@ -82,6 +82,25 @@ class NeuralNetworkModel(BaseModel):
self.is_fitted = True
return self
def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
"""Combine configured textual features into one string per record."""
column_names = [feature.value for feature in self.config.features if feature.value in X.columns]
if not column_names:
raise ValueError("No configured text features found in the provided DataFrame.")
text_frame = X[column_names].fillna("").astype(str)
if len(column_names) == 1:
return text_frame.iloc[:, 0].tolist()
combined_rows = []
for row in text_frame.itertuples(index=False):
tokens = [value for value in row if value]
combined_rows.append(" ".join(tokens))
return combined_rows
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> dict[str, np.floating[Any]]:
@@ -145,6 +164,9 @@ class NeuralNetworkModel(BaseModel):
"""Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}")
if train_sizes is None:
train_sizes = [0.1, 0.3, 0.5, 0.7, 1.0]
learning_curve_data = {
"train_sizes": [],
"train_scores": [],