feat: document models
@@ -2,7 +2,7 @@ from typing import Any
 import numpy as np
 import pandas as pd
-from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
+from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
 from tensorflow.keras.models import Sequential
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 from tensorflow.keras.preprocessing.text import Tokenizer
 
@@ -17,10 +17,35 @@ class LSTMModel(NeuralNetworkModel):
         params = kwargs
         model = Sequential(
             [
-                Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
-                Bidirectional(LSTM(params.get("lstm_units", 32), return_sequences=True)),
-                Bidirectional(LSTM(params.get("lstm_units", 32))),
+                # Mask padding tokens; required for the LSTM to ignore padded timesteps.
+                Embedding(
+                    input_dim=vocab_size,
+                    output_dim=params.get("embedding_dim", 64),
+                    input_length=max_len,
+                    mask_zero=True,
+                ),
+                # Stacked bidirectional LSTMs: the first returns sequences to feed the next.
+                # Dropout/recurrent_dropout mitigate overfitting on short sequences.
+                Bidirectional(
+                    LSTM(
+                        params.get("lstm_units", 32),
+                        return_sequences=True,
+                        dropout=params.get("dropout", 0.2),
+                        recurrent_dropout=params.get("recurrent_dropout", 0.0),
+                    )
+                ),
+                # The second LSTM condenses the sequence to a fixed vector for classification.
+                Bidirectional(
+                    LSTM(
+                        params.get("lstm_units", 32),
+                        dropout=params.get("dropout", 0.2),
+                        recurrent_dropout=params.get("recurrent_dropout", 0.0),
+                    )
+                ),
+                # Compact dense head with dropout; sufficient capacity for name signals.
                 Dense(64, activation="relu"),
+                Dropout(params.get("dropout", 0.5)),
+                # Two-way softmax for binary classification.
                 Dense(2, activation="softmax"),
             ]
         )
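
For context, a minimal standalone sketch of the architecture this hunk builds. The vocab_size and max_len values are illustrative stand-ins (the real ones come from the tokenizer and config), and the hyperparameters mirror the diff's defaults:

import numpy as np
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

vocab_size, max_len = 1000, 6  # illustrative stand-ins, not the project's values

model = Sequential(
    [
        # mask_zero=True lets the LSTMs skip zero-padded timesteps.
        Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len, mask_zero=True),
        Bidirectional(LSTM(32, return_sequences=True, dropout=0.2)),
        Bidirectional(LSTM(32, dropout=0.2)),
        Dense(64, activation="relu"),
        Dropout(0.5),
        Dense(2, activation="softmax"),
    ]
)

dummy = np.array([[5, 9, 2, 0, 0, 0]])  # one post-padded sequence of length max_len
print(model.predict(dummy).shape)  # (1, 2): per-class probabilities
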
@@ -31,14 +56,7 @@ class LSTMModel(NeuralNetworkModel):
         return model
 
     def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
-        text_data = []
-
-        for feature_type in self.config.features:
-            if feature_type.value in X.columns:
-                text_data.extend(X[feature_type.value].astype(str).tolist())
-
-        if not text_data:
-            raise ValueError("No text data found in the provided DataFrame.")
+        text_data = self._collect_text_corpus(X)
 
         # Initialize tokenizer if needed
         if self.tokenizer is None:
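
The inline corpus-gathering loop appears to have moved into a _collect_text_corpus helper whose body is not shown in this diff. A standalone sketch consistent with the deleted lines (the actual private method may differ):

import pandas as pd

def collect_text_corpus(X: pd.DataFrame, feature_columns: list[str]) -> list[str]:
    # Concatenate every configured text column into one corpus,
    # mirroring the removed inline loop.
    text_data: list[str] = []
    for column in feature_columns:
        if column in X.columns:
            text_data.extend(X[column].astype(str).tolist())
    if not text_data:
        raise ValueError("No text data found in the provided DataFrame.")
    return text_data

print(collect_text_corpus(pd.DataFrame({"first_name": ["Anna", "Ben"]}), ["first_name"]))
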
@@ -46,7 +64,7 @@ class LSTMModel(NeuralNetworkModel):
             self.tokenizer.fit_on_texts(text_data)
 
         # Convert to sequences
-        sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
+        sequences = self.tokenizer.texts_to_sequences(text_data)
         max_len = self.config.model_params.get("max_len", 6)
 
         return pad_sequences(sequences, maxlen=max_len, padding="post")
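
The one-line change above stops slicing the corpus to the first len(X) entries before vectorizing, so all collected text is converted. A quick sketch of the resulting tokenize-and-pad flow; the corpus and the char_level setting are assumptions, since the real Tokenizer configuration is not shown in this hunk:

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

texts = ["anna", "ben", "carol"]  # made-up corpus

tokenizer = Tokenizer(char_level=True)  # char_level is an assumption
tokenizer.fit_on_texts(texts)

padded = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=6, padding="post")
print(padded.shape)  # (3, 6): every row post-padded/truncated to max_len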