feat: document models

2025-09-20 23:35:54 +02:00
parent dd2a9f2711
commit e41b15a863
13 changed files with 256 additions and 47 deletions
+31 -13
@@ -2,7 +2,7 @@ from typing import Any
 import numpy as np
 import pandas as pd
-from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
+from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
 from tensorflow.keras.models import Sequential
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 from tensorflow.keras.preprocessing.text import Tokenizer
@@ -17,10 +17,35 @@ class LSTMModel(NeuralNetworkModel):
         params = kwargs
         model = Sequential(
             [
-                Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
-                Bidirectional(LSTM(params.get("lstm_units", 32), return_sequences=True)),
-                Bidirectional(LSTM(params.get("lstm_units", 32))),
+                # Mask padding tokens; required for LSTM to ignore padded timesteps.
+                Embedding(
+                    input_dim=vocab_size,
+                    output_dim=params.get("embedding_dim", 64),
+                    input_length=max_len,
+                    mask_zero=True,
+                ),
+                # Stacked bidirectional LSTMs: first returns sequences to feed the next.
+                # Dropout/recurrent_dropout mitigate overfitting on short sequences.
+                Bidirectional(
+                    LSTM(
+                        params.get("lstm_units", 32),
+                        return_sequences=True,
+                        dropout=params.get("dropout", 0.2),
+                        recurrent_dropout=params.get("recurrent_dropout", 0.0),
+                    )
+                ),
+                # Second LSTM condenses sequence to a fixed vector for classification.
+                Bidirectional(
+                    LSTM(
+                        params.get("lstm_units", 32),
+                        dropout=params.get("dropout", 0.2),
+                        recurrent_dropout=params.get("recurrent_dropout", 0.0),
+                    )
+                ),
+                # Compact dense head with dropout; sufficient capacity for name signals.
+                Dense(64, activation="relu"),
+                Dropout(params.get("dropout", 0.5)),
+                # Two-way softmax for binary classification.
+                Dense(2, activation="softmax"),
             ]
         )
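For context, here is a minimal standalone sketch of the architecture this hunk builds. The vocab_size and max_len values are hypothetical stand-ins; in the class they come from the fitted tokenizer and self.config:

from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

vocab_size = 10_000  # assumption: len(tokenizer.word_index) + 1 in the real model
max_len = 6          # matches the prepare_features default below

model = Sequential([
    # mask_zero=True lets the downstream LSTMs skip padded timesteps.
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len, mask_zero=True),
    Bidirectional(LSTM(32, return_sequences=True, dropout=0.2)),
    Bidirectional(LSTM(32, dropout=0.2)),
    Dense(64, activation="relu"),
    Dropout(0.5),
    Dense(2, activation="softmax"),
])
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.build(input_shape=(None, max_len))
model.summary()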
@@ -31,14 +56,7 @@ class LSTMModel(NeuralNetworkModel):
         return model
 
     def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
-        text_data = []
-        for feature_type in self.config.features:
-            if feature_type.value in X.columns:
-                text_data.extend(X[feature_type.value].astype(str).tolist())
-
-        if not text_data:
-            raise ValueError("No text data found in the provided DataFrame.")
+        text_data = self._collect_text_corpus(X)
 
         # Initialize tokenizer if needed
        if self.tokenizer is None:
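The inline column-gathering logic above was moved into a _collect_text_corpus helper. Its body lies outside this diff, but judging from the removed lines it is presumably equivalent to this sketch:

def _collect_text_corpus(self, X: pd.DataFrame) -> list[str]:
    """Concatenate all configured text columns into one corpus, in feature order."""
    text_data: list[str] = []
    for feature_type in self.config.features:
        if feature_type.value in X.columns:
            text_data.extend(X[feature_type.value].astype(str).tolist())
    if not text_data:
        raise ValueError("No text data found in the provided DataFrame.")
    return text_data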
@@ -46,7 +64,7 @@ class LSTMModel(NeuralNetworkModel):
             self.tokenizer.fit_on_texts(text_data)
 
         # Convert to sequences
-        sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
+        sequences = self.tokenizer.texts_to_sequences(text_data)
         max_len = self.config.model_params.get("max_len", 6)
         return pad_sequences(sequences, maxlen=max_len, padding="post")
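With the [: len(X)] slice gone, the full corpus returned by _collect_text_corpus is tokenized rather than only its first len(X) entries. A quick illustration of the padding step; the character-level tokenizer here is an assumption (the actual tokenizer configuration is not shown in this diff), though it is plausible given that max_len defaults to 6:

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Assumption: character-level tokenization of short name strings.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(["maria", "jon"])
sequences = tokenizer.texts_to_sequences(["maria", "jon"])

# padding="post" appends zeros, which Embedding(mask_zero=True) then masks.
print(pad_sequences(sequences, maxlen=6, padding="post"))
# [[2 1 3 4 1 0]
#  [5 6 7 0 0 0]]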