feat: document models

This commit is contained in:
2025-09-20 23:35:54 +02:00
parent dd2a9f2711
commit e41b15a863
13 changed files with 256 additions and 47 deletions
+25 -10
View File
@@ -17,17 +17,38 @@ class BiGRUModel(NeuralNetworkModel):
params = kwargs
model = Sequential(
[
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
# Mask padding tokens so recurrent layers ignore them; fix input length
# for better shape inference and to support masking through the stack.
Embedding(
input_dim=vocab_size,
output_dim=params.get("embedding_dim", 64),
input_length=max_len,
mask_zero=True,
),
# First recurrent block returns full sequences to allow stacking.
# Moderate dropout + optional recurrent_dropout to reduce overfitting
# on short names while retaining temporal signal.
Bidirectional(
GRU(
params.get("gru_units", 32),
return_sequences=True,
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
Bidirectional(GRU(params.get("gru_units", 32), dropout=params.get("dropout", 0.2))),
# Second GRU summarizes to the last hidden state (no return_sequences),
# capturing bidirectional context efficiently for classification.
Bidirectional(
GRU(
params.get("gru_units", 32),
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Small dense head; ReLU + dropout for capacity and regularization.
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary gender classification.
Dense(2, activation="softmax"),
]
)
@@ -38,19 +59,13 @@ class BiGRUModel(NeuralNetworkModel):
return model
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_data.extend(X[feature_type.value].astype(str).tolist())
if not text_data:
raise ValueError("No text data found in the provided DataFrame.")
text_data = self._collect_text_corpus(X)
if self.tokenizer is None:
self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
+15 -9
View File
@@ -9,6 +9,7 @@ from tensorflow.keras.layers import (
GlobalMaxPooling1D,
Dense,
Dropout,
SpatialDropout1D,
)
from tensorflow.keras.models import Sequential
@@ -24,21 +25,33 @@ class CNNModel(NeuralNetworkModel):
params = kwargs
model = Sequential(
[
# Learn char/subword embeddings; spatial dropout regularizes across channels
# to make the model robust to noisy characters and transliteration.
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
SpatialDropout1D(rate=params.get("embedding_dropout", 0.1)),
# Small kernels capture short n-gram like patterns; padding='same' keeps
# sequence length stable for simpler pooling behavior.
Conv1D(
filters=params.get("filters", 64),
kernel_size=params.get("kernel_size", 3),
activation="relu",
padding="same",
),
# Downsample to gain some position invariance and reduce computation.
MaxPooling1D(pool_size=2),
# Second conv layer to compose higher-level motifs (e.g., suffix+vowel).
Conv1D(
filters=params.get("filters", 64),
kernel_size=params.get("kernel_size", 3),
activation="relu",
padding="same",
),
# Global max pooling picks strongest motif evidence anywhere in the name.
GlobalMaxPooling1D(),
# Compact dense head with dropout to control overfitting.
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary classification.
Dense(2, activation="softmax"),
]
)
@@ -55,21 +68,14 @@ class CNNModel(NeuralNetworkModel):
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Get text data from extracted features - use character level for CNN
text_data = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_data.extend(X[feature_type.value].astype(str).tolist())
if not text_data:
# Fallback - should not happen if FeatureExtractor is properly configured
text_data = [""] * len(X)
text_data = self._collect_text_corpus(X)
# Initialize character-level tokenizer
if self.tokenizer is None:
self.tokenizer = Tokenizer(char_level=True, lower=True, oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 20) # Longer for character level
return pad_sequences(sequences, maxlen=max_len, padding="post")
+5 -2
View File
@@ -31,7 +31,8 @@ class EnsembleModel(TraditionalModel):
"base_models", ["logistic_regression", "random_forest", "naive_bayes"]
)
# Create base models with simplified configs
# Create base models with simplified configs; diverse vectorizers/classifiers
# encourage complementary errors that voting can average out.
estimators = []
for model_type in base_model_types:
if model_type == "logistic_regression":
@@ -78,8 +79,10 @@ class EnsembleModel(TraditionalModel):
)
estimators.append((f"nb", model))
# Soft voting averages probabilities (preferred when members are calibrated);
# hard voting uses majority class. Parallelize member predictions.
voting_type = params.get("voting", "soft") # 'hard' or 'soft'
return VotingClassifier(estimators=estimators, voting=voting_type)
return VotingClassifier(estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1))
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
+4
View File
@@ -20,6 +20,8 @@ class LightGBMModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Leaf-wise boosted trees excel on sparse/categorical mixes; binary objective
# and parallelism improve training speed for this task.
return lgb.LGBMClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", -1),
@@ -28,6 +30,8 @@ class LightGBMModel(TraditionalModel):
subsample=params.get("subsample", 0.8),
colsample_bytree=params.get("colsample_bytree", 0.8),
random_state=self.config.random_seed,
objective=params.get("objective", "binary"),
n_jobs=params.get("n_jobs", -1),
verbose=2,
)
+10 -1
View File
@@ -13,14 +13,23 @@ class LogisticRegressionModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Character n-grams are strong signals for names; (2,5) balances
# capturing prefixes/suffixes with tractable feature size.
vectorizer = CountVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 5)),
max_features=params.get("max_features", 10000),
)
# liblinear handles sparse, small-to-medium problems well; n_jobs parallelizes
# OvR across classes (no effect for binary). class_weight can mitigate imbalance.
classifier = LogisticRegression(
max_iter=params.get("max_iter", 1000), random_state=self.config.random_seed, verbose=2
max_iter=params.get("max_iter", 1000),
random_state=self.config.random_seed,
verbose=2,
solver=params.get("solver", "liblinear"),
n_jobs=params.get("n_jobs", -1),
class_weight=params.get("class_weight", None),
)
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
+31 -13
View File
@@ -2,7 +2,7 @@ from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
@@ -17,10 +17,35 @@ class LSTMModel(NeuralNetworkModel):
params = kwargs
model = Sequential(
[
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
Bidirectional(LSTM(params.get("lstm_units", 32), return_sequences=True)),
Bidirectional(LSTM(params.get("lstm_units", 32))),
# Mask padding tokens; required for LSTM to ignore padded timesteps.
Embedding(
input_dim=vocab_size,
output_dim=params.get("embedding_dim", 64),
input_length=max_len,
mask_zero=True,
),
# Stacked bidirectional LSTMs: first returns sequences to feed the next.
# Dropout/recurrent_dropout mitigate overfitting on short sequences.
Bidirectional(
LSTM(
params.get("lstm_units", 32),
return_sequences=True,
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Second LSTM condenses sequence to a fixed vector for classification.
Bidirectional(
LSTM(
params.get("lstm_units", 32),
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Compact dense head with dropout; sufficient capacity for name signals.
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary classification.
Dense(2, activation="softmax"),
]
)
@@ -31,14 +56,7 @@ class LSTMModel(NeuralNetworkModel):
return model
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_data.extend(X[feature_type.value].astype(str).tolist())
if not text_data:
raise ValueError("No text data found in the provided DataFrame.")
text_data = self._collect_text_corpus(X)
# Initialize tokenizer if needed
if self.tokenizer is None:
@@ -46,7 +64,7 @@ class LSTMModel(NeuralNetworkModel):
self.tokenizer.fit_on_texts(text_data)
# Convert to sequences
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
+3
View File
@@ -13,12 +13,15 @@ class NaiveBayesModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
    """Assemble the character n-gram CountVectorizer + Multinomial NB pipeline.

    Returns:
        A two-step sklearn ``Pipeline`` (``vectorizer`` -> ``classifier``)
        configured from ``self.config.model_params``.
    """
    params = self.config.model_params
    # Bag-of-character-ngrams aligns with Multinomial NB assumptions; (1,4)
    # includes unigrams for coverage and higher n for suffix/prefix cues.
    steps = [
        (
            "vectorizer",
            CountVectorizer(
                analyzer="char",
                ngram_range=params.get("ngram_range", (1, 4)),
                max_features=params.get("max_features", 8000),
            ),
        ),
        (
            # Laplace smoothing (alpha) counters zero counts for rare n-grams.
            "classifier",
            MultinomialNB(alpha=params.get("alpha", 1.0)),
        ),
    ]
    return Pipeline(steps)
+28 -3
View File
@@ -1,3 +1,5 @@
from typing import Dict
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
@@ -10,15 +12,23 @@ from research.traditional_model import TraditionalModel
class RandomForestModel(TraditionalModel):
"""Random Forest with engineered features"""
def __init__(self, config):
super().__init__(config)
# Persist encoders so categorical mappings stay consistent.
self.label_encoders: Dict[str, LabelEncoder] = {}
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Tree ensemble is robust to mixed numeric/categorical encodings; parallelize
# across trees for speed. Keep depth moderate for generalisation.
return RandomForestClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", None),
random_state=self.config.random_seed,
verbose=2,
n_jobs=params.get("n_jobs", -1),
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
@@ -33,9 +43,24 @@ class RandomForestModel(TraditionalModel):
# Numerical features
features.append(column.fillna(0).values.reshape(-1, 1))
else:
# Categorical features (encode them)
le = LabelEncoder()
encoded = le.fit_transform(column.fillna("unknown").astype(str))
# Categorical features (encode them persistently)
feature_key = f"encoder_{feature_type.value}"
if feature_key not in self.label_encoders:
self.label_encoders[feature_key] = LabelEncoder()
encoded = self.label_encoders[feature_key].fit_transform(
column.fillna("unknown").astype(str)
)
else:
encoder = self.label_encoders[feature_key]
column_clean = column.fillna("unknown").astype(str)
known_classes = set(encoder.classes_)
default_class = "unknown" if "unknown" in known_classes else encoder.classes_[0]
column_mapped = column_clean.apply(
lambda value: value if value in known_classes else default_class
)
encoded = encoder.transform(column_mapped)
features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
+6
View File
@@ -13,17 +13,23 @@ class SVMModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# TF-IDF downweights very common patterns; char n-grams (2,4) are effective
# for distinguishing name morphology under RBF kernels.
vectorizer = TfidfVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 4)),
max_features=params.get("max_features", 5000),
)
# RBF kernel captures non-linear interactions between n-grams; probability=True
# adds calibration at some cost. Larger cache helps speed kernel computations.
classifier = SVC(
kernel=params.get("kernel", "rbf"),
C=params.get("C", 1.0),
gamma=params.get("gamma", "scale"),
probability=True, # Enable probability prediction
class_weight=params.get("class_weight", None),
cache_size=params.get("cache_size", 1000),
random_state=self.config.random_seed,
verbose=2,
)
+10 -9
View File
@@ -27,7 +27,12 @@ class TransformerModel(NeuralNetworkModel):
# Build Transformer model
inputs = Input(shape=(max_len,))
x = Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64))(inputs)
x = Embedding(
input_dim=vocab_size,
output_dim=params.get("embedding_dim", 64),
input_length=max_len,
mask_zero=True,
)(inputs)
# Add positional encoding
positions = tf.range(start=0, limit=max_len, delta=1)
@@ -39,6 +44,7 @@ class TransformerModel(NeuralNetworkModel):
x = self._transformer_encoder(x, params)
x = GlobalAveragePooling1D()(x)
x = Dense(32, activation="relu")(x)
x = Dropout(params.get("dropout", 0.1))(x)
outputs = Dense(2, activation="softmax")(x)
model = Model(inputs, outputs)
@@ -54,6 +60,7 @@ class TransformerModel(NeuralNetworkModel):
attn = MultiHeadAttention(
num_heads=cfg_params.get("transformer_num_heads", 2),
key_dim=cfg_params.get("transformer_head_size", 64),
dropout=cfg_params.get("attn_dropout", 0.1),
)(x, x)
x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(attn))
@@ -62,13 +69,7 @@ class TransformerModel(NeuralNetworkModel):
return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(ff))
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_data.extend(X[feature_type.value].astype(str).tolist())
if not text_data:
raise ValueError("No text data found in the provided DataFrame.")
text_data = self._collect_text_corpus(X)
# Initialize tokenizer if needed
if self.tokenizer is None:
@@ -76,7 +77,7 @@ class TransformerModel(NeuralNetworkModel):
self.tokenizer.fit_on_texts(text_data)
# Convert to sequences
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
+4
View File
@@ -20,6 +20,8 @@ class XGBoostModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Histogram-based trees and parallelism provide fast training; default
# logloss metric suits binary classification of gender.
return xgb.XGBClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", 6),
@@ -28,6 +30,8 @@ class XGBoostModel(TraditionalModel):
colsample_bytree=params.get("colsample_bytree", 0.8),
random_state=self.config.random_seed,
eval_metric="logloss",
n_jobs=params.get("n_jobs", -1),
tree_method=params.get("tree_method", "hist"),
verbosity=2,
)
+22
View File
@@ -82,6 +82,25 @@ class NeuralNetworkModel(BaseModel):
self.is_fitted = True
return self
def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
"""Combine configured textual features into one string per record."""
column_names = [feature.value for feature in self.config.features if feature.value in X.columns]
if not column_names:
raise ValueError("No configured text features found in the provided DataFrame.")
text_frame = X[column_names].fillna("").astype(str)
if len(column_names) == 1:
return text_frame.iloc[:, 0].tolist()
combined_rows = []
for row in text_frame.itertuples(index=False):
tokens = [value for value in row if value]
combined_rows.append(" ".join(tokens))
return combined_rows
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> dict[str, np.floating[Any]]:
@@ -145,6 +164,9 @@ class NeuralNetworkModel(BaseModel):
"""Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}")
if train_sizes is None:
train_sizes = [0.1, 0.3, 0.5, 0.7, 1.0]
learning_curve_data = {
"train_sizes": [],
"train_scores": [],