From 63e23d660068d1a4791cdd78e70d7de66880e312 Mon Sep 17 00:00:00 2001
From: bernard-ng
Date: Sun, 21 Sep 2025 13:10:07 +0200
Subject: [PATCH] fix: normalize hyper params

---
 config/pipeline.development.yaml     |  2 +-
 research/models/bigru_model.py       |  3 +--
 research/models/cnn_model.py         |  2 +-
 research/models/lstm_model.py        |  3 +--
 research/models/naive_bayes_model.py |  4 ++--
 research/models/transformer_model.py |  4 ++--
 research/neural_network_model.py     | 13 ++++++-------
 research/traditional_model.py        | 11 ++++++++++-
 8 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/config/pipeline.development.yaml b/config/pipeline.development.yaml
index 40c43af..7ccdf3b 100644
--- a/config/pipeline.development.yaml
+++ b/config/pipeline.development.yaml
@@ -29,7 +29,7 @@ llm:
 # Data handling configuration
 data:
   split_evaluation: false
-  max_dataset_size: null
+  max_dataset_size: 100_000
   balance_by_sex: true
 
 # Enhanced logging for development
diff --git a/research/models/bigru_model.py b/research/models/bigru_model.py
index 7cbc21f..9954f1c 100644
--- a/research/models/bigru_model.py
+++ b/research/models/bigru_model.py
@@ -13,7 +13,7 @@ from research.neural_network_model import NeuralNetworkModel
 class BiGRUModel(NeuralNetworkModel):
     """Bidirectional GRU model for name classification"""
 
-    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
+    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
         params = kwargs
         model = Sequential(
             [
@@ -22,7 +22,6 @@ class BiGRUModel(NeuralNetworkModel):
                 Embedding(
                     input_dim=vocab_size,
                     output_dim=params.get("embedding_dim", 64),
-                    input_length=max_len,
                     mask_zero=True,
                 ),
                 # First recurrent block returns full sequences to allow stacking.
diff --git a/research/models/cnn_model.py b/research/models/cnn_model.py
index b955a75..8763d6f 100644
--- a/research/models/cnn_model.py
+++ b/research/models/cnn_model.py
@@ -21,7 +21,7 @@ from research.neural_network_model import NeuralNetworkModel
 class CNNModel(NeuralNetworkModel):
     """1D Convolutional Neural Network for character patterns"""
 
-    def build_model_with_vocab(self, vocab_size: int, max_len: int = 20, **kwargs) -> Any:
+    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
         """Build CNN model with known vocabulary size"""
         params = kwargs
 
diff --git a/research/models/lstm_model.py b/research/models/lstm_model.py
index 28e378c..7d2cb3c 100644
--- a/research/models/lstm_model.py
+++ b/research/models/lstm_model.py
@@ -13,7 +13,7 @@ from research.neural_network_model import NeuralNetworkModel
 class LSTMModel(NeuralNetworkModel):
     """LSTM model for sequence learning"""
 
-    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
+    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
         params = kwargs
         model = Sequential(
             [
@@ -21,7 +21,6 @@ class LSTMModel(NeuralNetworkModel):
                 Embedding(
                     input_dim=vocab_size,
                     output_dim=params.get("embedding_dim", 64),
-                    input_length=max_len,
                     mask_zero=True,
                 ),
                 # Stacked bidirectional LSTMs: first returns sequences to feed the next.
diff --git a/research/models/naive_bayes_model.py b/research/models/naive_bayes_model.py
index becad50..d377d7f 100644
--- a/research/models/naive_bayes_model.py
+++ b/research/models/naive_bayes_model.py
@@ -17,7 +17,7 @@ class NaiveBayesModel(TraditionalModel):
-        # includes unigrams for coverage and higher n for suffix/prefix cues.
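+        # character bigrams through 5-grams capture suffix/prefix cues without unigram noise.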
         vectorizer = CountVectorizer(
             analyzer="char",
-            ngram_range=params.get("ngram_range", (1, 4)),
+            ngram_range=params.get("ngram_range", (2, 5)),
             max_features=params.get("max_features", 8000),
         )
 
diff --git a/research/models/transformer_model.py b/research/models/transformer_model.py
index aef751d..6524c23 100644
--- a/research/models/transformer_model.py
+++ b/research/models/transformer_model.py
@@ -22,21 +22,21 @@ from research.neural_network_model import NeuralNetworkModel
 class TransformerModel(NeuralNetworkModel):
     """Transformer-based model"""
 
-    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
+    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
         params = kwargs
+        max_len = params.get("max_len", 8)
 
         # Build Transformer model
         inputs = Input(shape=(max_len,))
         x = Embedding(
             input_dim=vocab_size,
             output_dim=params.get("embedding_dim", 64),
-            input_length=max_len,
             mask_zero=True,
         )(inputs)
 
         # Add positional encoding
         positions = tf.range(start=0, limit=max_len, delta=1)
         pos_embedding = Embedding(input_dim=max_len, output_dim=params.get("embedding_dim", 64))(
             positions
         )
         x = x + pos_embedding
diff --git a/research/neural_network_model.py b/research/neural_network_model.py
index b48b47a..66c7c03 100644
--- a/research/neural_network_model.py
+++ b/research/neural_network_model.py
@@ -52,22 +52,21 @@ class NeuralNetworkModel(BaseModel):
         logging.info(f"Vocabulary size: {vocab_size}")
 
-        # Get additional model parameters
-        max_len = self.config.model_params.get("max_len", 6)
-
-        self.model = self.build_model_with_vocab(
-            vocab_size=vocab_size, max_len=max_len, **self.config.model_params
-        )
+        # Architecture hyper-parameters now come solely from model_params
+        self.model = self.build_model_with_vocab(vocab_size=vocab_size, **self.config.model_params)
 
         # Train the neural network
         logging.info(
             f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
         )
+        logging.info(f"Sample input: {X_prepared[0]}")
+        logging.info(f"Model parameters: {self.config.model_params}")
+
         history = self.model.fit(
             X_prepared,
             y_encoded,
             epochs=self.config.model_params.get("epochs", 10),
             batch_size=self.config.model_params.get("batch_size", 64),
-            validation_split=0.1,
+            validation_split=self.config.model_params.get("validation_split", 0.1),
             verbose=2,
         )
 
diff --git a/research/traditional_model.py b/research/traditional_model.py
index bd10ec8..c8884b5 100644
--- a/research/traditional_model.py
+++ b/research/traditional_model.py
@@ -59,9 +59,18 @@ class TraditionalModel(BaseModel):
             f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
         )
 
-        self.model.fit(X_prepared, y_encoded)
+        logging.info(f"Sample input: {X_prepared[0]}")
+        logging.info(f"Model parameters: {self.config.model_params}")
+
+        self.model.fit(X_prepared, y_encoded)
         self.is_fitted = True
 
+        # Scikit-learn's fit() returns the estimator itself, not a Keras-style
+        # History object, so record the training score directly.
+        self.training_history = {
+            "accuracy": [self.model.score(X_prepared, y_encoded)],
+        }
+
         return self
 
     def cross_validate(self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> Dict[str, float]:
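
Review note: with max_len removed from every build_model_with_vocab()
signature, all architecture hyper-parameters must now arrive through
config.model_params. A minimal standalone sketch of that contract
(illustrative only: the dict-returning function and vocab_size=40 are
made up; the defaults mirror the ones this patch normalizes):

    from typing import Any

    def build_model_with_vocab(vocab_size: int, **kwargs: Any) -> dict:
        # Each hyper-parameter is read from kwargs with an explicit default.
        return {
            "vocab_size": vocab_size,
            "embedding_dim": kwargs.get("embedding_dim", 64),
            "max_len": kwargs.get("max_len", 8),  # consumed by the Transformer only
        }

    # fit() can forward the whole mapping unchanged; extra keys such as
    # epochs or batch_size are simply ignored by the builder.
    model_params = {"embedding_dim": 64, "max_len": 8, "epochs": 10,
                    "batch_size": 64, "validation_split": 0.1}
    print(build_model_with_vocab(vocab_size=40, **model_params))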