fix: normalize hyperparameters
@@ -29,7 +29,7 @@ llm:
 # Data handling configuration
 data:
   split_evaluation: false
-  max_dataset_size: null
+  max_dataset_size: 100_000
   balance_by_sex: true
 
 # Enhanced logging for development
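Note on the new default: `100_000` relies on underscore digit separators, which YAML 1.1 loaders such as PyYAML resolve to the integer 100000, while a strict YAML 1.2 loader reads the same token as the string "100_000". A quick check, assuming PyYAML is the loader in use here:

    # Hedged check: assumes PyYAML (YAML 1.1 resolver). A pure YAML 1.2 loader
    # would hand back the string "100_000" instead of an int.
    import yaml

    cfg = yaml.safe_load("max_dataset_size: 100_000")
    print(cfg, type(cfg["max_dataset_size"]))  # {'max_dataset_size': 100000} <class 'int'>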
@@ -13,7 +13,7 @@ from research.neural_network_model import NeuralNetworkModel
 class BiGRUModel(NeuralNetworkModel):
     """Bidirectional GRU model for name classification"""
 
-    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
+    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
         params = kwargs
         model = Sequential(
             [
@@ -22,7 +22,6 @@ class BiGRUModel(NeuralNetworkModel):
                 Embedding(
                     input_dim=vocab_size,
                     output_dim=params.get("embedding_dim", 64),
-                    input_length=max_len,
                     mask_zero=True,
                 ),
                 # First recurrent block returns full sequences to allow stacking.
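Dropping `input_length` lines up with Keras 3, where `Embedding` no longer accepts that argument; with `mask_zero=True` the padding positions are masked and the sequence length is taken from the input tensor itself. A minimal sketch (assumes TensorFlow with Keras 3; toy ids, not the project's data):

    # Embedding without input_length infers sequence length from the batch;
    # mask_zero=True masks the zero-padded positions downstream.
    import numpy as np
    from tensorflow import keras

    emb = keras.layers.Embedding(input_dim=40, output_dim=64, mask_zero=True)
    batch = np.array([[3, 7, 2, 0, 0, 0]])  # zero-padded token ids
    print(emb(batch).shape)                 # (1, 6, 64)

The same removal applies to the LSTM hunk below.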
@@ -21,7 +21,7 @@ from research.neural_network_model import NeuralNetworkModel
 class CNNModel(NeuralNetworkModel):
     """1D Convolutional Neural Network for character patterns"""
 
-    def build_model_with_vocab(self, vocab_size: int, max_len: int = 20, **kwargs) -> Any:
+    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
         """Build CNN model with known vocabulary size"""
 
         params = kwargs
@@ -13,7 +13,7 @@ from research.neural_network_model import NeuralNetworkModel
 class LSTMModel(NeuralNetworkModel):
     """LSTM model for sequence learning"""
 
-    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
+    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
         params = kwargs
         model = Sequential(
             [
@@ -21,7 +21,6 @@ class LSTMModel(NeuralNetworkModel):
                 Embedding(
                     input_dim=vocab_size,
                     output_dim=params.get("embedding_dim", 64),
-                    input_length=max_len,
                     mask_zero=True,
                 ),
                 # Stacked bidirectional LSTMs: first returns sequences to feed the next.
@@ -17,7 +17,7 @@ class NaiveBayesModel(TraditionalModel):
         # includes unigrams for coverage and higher n for suffix/prefix cues.
         vectorizer = CountVectorizer(
             analyzer="char",
-            ngram_range=params.get("ngram_range", (1, 4)),
+            ngram_range=params.get("ngram_range", (2, 5)),
             max_features=params.get("max_features", 8000),
         )
 
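Worth noting: with the default moved to (2, 5) the vectorizer no longer emits unigrams at all, so the unchanged comment above about "unigrams for coverage" is now stale. A quick look at what the new range produces (toy inputs, assumed defaults):

    # Char n-grams of length 2 through 5: bigrams and longer, no single chars.
    from sklearn.feature_extraction.text import CountVectorizer

    vec = CountVectorizer(analyzer="char", ngram_range=(2, 5), max_features=8000)
    vec.fit(["maria", "mario"])
    print(sorted(vec.get_feature_names_out())[:6])  # ['ar', 'ari', 'aria', ...]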
@@ -22,21 +22,21 @@ from research.neural_network_model import NeuralNetworkModel
 class TransformerModel(NeuralNetworkModel):
     """Transformer-based model"""
 
-    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
+    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
         params = kwargs
 
         # Build Transformer model
-        inputs = Input(shape=(max_len,))
+        inputs = Input(shape=(params.get("max_len", 8),))
         x = Embedding(
             input_dim=vocab_size,
             output_dim=params.get("embedding_dim", 64),
-            input_length=max_len,
+            input_length=params.get("max_len", 8),
             mask_zero=True,
         )(inputs)
 
         # Add positional encoding
-        positions = tf.range(start=0, limit=max_len, delta=1)
-        pos_embedding = Embedding(input_dim=max_len, output_dim=params.get("embedding_dim", 64))(
+        positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
+        pos_embedding = Embedding(input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64))(
             positions
         )
         x = x + pos_embedding
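Since `params.get("max_len", 8)` is now evaluated at several separate sites, the fallback values can silently drift apart; hoisting the lookup into one local keeps them in sync. A runnable sketch of the positional-encoding arithmetic this hunk builds (dimensions and ids assumed, not the project's data):

    # Token embeddings plus a learned position embedding for ids 0..max_len-1;
    # the (max_len, dim) position table broadcasts over the batch axis.
    import numpy as np
    import tensorflow as tf
    from tensorflow.keras.layers import Embedding

    max_len, vocab_size, embedding_dim = 8, 40, 64  # hoisted once, used everywhere
    tok = Embedding(vocab_size, embedding_dim, mask_zero=True)
    pos = Embedding(input_dim=max_len, output_dim=embedding_dim)

    batch = np.array([[3, 7, 2, 0, 0, 0, 0, 0], [5, 1, 0, 0, 0, 0, 0, 0]])
    positions = tf.range(start=0, limit=max_len, delta=1)
    x = tok(batch) + pos(positions)  # (2, 8, 64) + (8, 64) -> (2, 8, 64)
    print(x.shape)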
@@ -52,22 +52,21 @@ class NeuralNetworkModel(BaseModel):
         logging.info(f"Vocabulary size: {vocab_size}")
 
         # Get additional model parameters
-        max_len = self.config.model_params.get("max_len", 6)
+        self.model = self.build_model_with_vocab(vocab_size=vocab_size, **self.config.model_params)
 
-        self.model = self.build_model_with_vocab(
-            vocab_size=vocab_size, max_len=max_len, **self.config.model_params
-        )
 
         # Train the neural network
         logging.info(
             f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
         )
+        logging.info(X_prepared[0])
+        logging.info(f"Model parameters: {self.config.model_params}")
 
         history = self.model.fit(
             X_prepared,
             y_encoded,
             epochs=self.config.model_params.get("epochs", 10),
             batch_size=self.config.model_params.get("batch_size", 64),
-            validation_split=0.1,
+            validation_split=self.config.model_params.get("validation_split", 0.1),
             verbose=2,
         )
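The refactor works because everything in `model_params` is forwarded as keyword arguments and each builder pulls what it needs out of `kwargs`; keys a builder ignores, such as `epochs` or `batch_size`, simply sit unused in `params`. A toy mirror of the call chain (names assumed from the diff):

    # Hypothetical sketch: max_len travels inside model_params instead of
    # being plumbed through as an explicit named argument.
    from typing import Any

    def build_model_with_vocab(vocab_size: int, **kwargs: Any):
        params = kwargs
        return {"vocab": vocab_size, "max_len": params.get("max_len", 8)}

    model_params = {"max_len": 12, "embedding_dim": 64, "epochs": 10}
    print(build_model_with_vocab(vocab_size=40, **model_params))
    # {'vocab': 40, 'max_len': 12}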
@@ -59,9 +59,19 @@ class TraditionalModel(BaseModel):
             f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
         )
 
-        self.model.fit(X_prepared, y_encoded)
+        logging.info(X_prepared[0])
+        logging.info(f"Model parameters: {self.config.model_params}")
+
+        history = self.model.fit(X_prepared, y_encoded)
         self.is_fitted = True
 
+        self.training_history = {
+            "accuracy": history.history["accuracy"],
+            "loss": history.history["loss"],
+            "val_accuracy": history.history.get("val_accuracy", []),
+            "val_loss": history.history.get("val_loss", []),
+        }
+
         return self
 
     def cross_validate(self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> Dict[str, float]:
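One caveat with the new bookkeeping: if `self.model` is a scikit-learn estimator (as the `CountVectorizer` usage above suggests), `fit()` returns the estimator itself, which has no `.history` attribute, so the `history.history[...]` lookups would raise at runtime. A defensive variant of the added lines, hedged on that assumption:

    # Guarded sketch: tolerate estimators whose fit() does not return a
    # Keras-style History object (sklearn's fit() returns the estimator).
    history = self.model.fit(X_prepared, y_encoded)
    hist = getattr(history, "history", None) or {}
    self.training_history = {
        "accuracy": hist.get("accuracy", []),
        "loss": hist.get("loss", []),
        "val_accuracy": hist.get("val_accuracy", []),
        "val_loss": hist.get("val_loss", []),
    }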