feat: balanced dataset loading

2025-06-30 01:32:10 +02:00
parent eb139ee09a
commit 0888d94596
9 changed files with 306 additions and 614 deletions
@@ -1,15 +1,12 @@
-import argparse
-import logging
 import os
 from dataclasses import dataclass
-from typing import Tuple, Optional
+from typing import Tuple

 import numpy as np
 import pandas as pd
 import tensorflow as tf
 from sklearn.metrics import (
-    accuracy_score, precision_recall_fscore_support,
-    classification_report, confusion_matrix
+    accuracy_score
 )
 from sklearn.model_selection import train_test_split, StratifiedKFold
 from sklearn.preprocessing import LabelEncoder
@@ -23,56 +20,11 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences
 from tensorflow.keras.preprocessing.text import Tokenizer

 from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
-
-logging.basicConfig(level=logging.INFO, format=">> %(message)s")
+from ners.gender.models import BaseConfig, load_config, evaluate_proba, logging


@dataclass
-class Config:
-    """
-    Configuration data class used to store settings and parameters for a machine learning or deep
-    learning model.
-
-    This class allows the user to specify various parameters such as dataset path, size of input,
-    model architecture details like embedding dimensions, transformer configurations, training settings
-    like batch size and epochs, and validation and testing settings. The attributes provide flexibility
-    to customize model configuration and training processes.
-
-    :ivar dataset_path: The file path to the dataset.
-    :type dataset_path: str
-    :ivar size: Optional size parameter, can be used to specify sample size or custom
-        configuration based on the user's requirement.
-    :type size: Optional[int]
-    :ivar max_len: Maximum sequence length for input data, used often in text or sequence
-        processing.
-    :type max_len: int
-    :ivar embedding_dim: The dimensionality of embeddings used in the model.
-    :type embedding_dim: int
-    :ivar transformer_head_size: The size of each transformer attention head.
-    :type transformer_head_size: int
-    :ivar transformer_num_heads: The number of attention heads in the transformer model.
-    :type transformer_num_heads: int
-    :ivar transformer_ff_dim: The dimensionality of the feed-forward network in the transformer.
-    :type transformer_ff_dim: int
-    :ivar dropout: Dropout rate used for regularization during training.
-    :type dropout: float
-    :ivar batch_size: Batch size used for training and validation.
-    :type batch_size: int
-    :ivar epochs: Number of epochs for model training.
-    :type epochs: int
-    :ivar test_size: Proportion of the dataset to be used for testing.
-    :type test_size: float
-    :ivar random_state: Random seed value for reproducibility.
-    :type random_state: int
-    :ivar threshold: Threshold value for model predictions or classification.
-    :type threshold: float
-    :ivar cv: Cross-validation configuration, if applicable.
-    :type cv: Optional[int]
-    :ivar save: Boolean flag indicating whether to save the model after training.
-    :type save: bool
-    """
-    dataset_path: str
-    size: Optional[int]
+class Config(BaseConfig):
    max_len: int = 6
    embedding_dim: int = 64
    transformer_head_size: int = 64
@@ -80,38 +32,21 @@ class Config:
    transformer_ff_dim: int = 128
    dropout: float = 0.1
    batch_size: int = 64
-    epochs: int = 10
-    test_size: float = 0.2
-    random_state: int = 42
-    threshold: float = 0.5
-    cv: Optional[int] = None
-    save: bool = False


 def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
-    Load and preprocess data for model training or evaluation. This function handles the
-    loading of a dataset in CSV format, applies preprocessing to clean and normalize
-    the input data, tokenizes text features, and encodes categorical labels.
-
-    The preprocessed data is prepared as padded sequences and encoded labels, which
-    can be directly used as inputs for machine learning models. Tokenizer and LabelEncoder
-    are returned to ensure consistency between training and inference stages.
-
-    :param cfg: Configuration object containing dataset path, size of the
-                dataset to load, and maximum length for padding sequences.
-    :type cfg: Config
-    :return: A tuple containing padded input sequences for the model, encoded labels,
-             the tokenizer used for text sequences, and the encoder used for labels.
-    :rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]
+    Load and preprocess the dataset for training a Transformer model.
+    Reads the CSV dataset (passing cfg.size and cfg.balanced to load_csv_dataset),
+    tokenizes the names, post-pads the sequences to cfg.max_len, and encodes the
+    labels. Returns the padded sequences, encoded labels, tokenizer, and label encoder.
    """
    logging.info("Loading and preprocessing data")
-    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"])
-    df["name"] = df["name"].str.lower().str.strip()
-    df["sex"] = df["sex"].str.lower().str.strip()
+    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))

    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])
+
    sequences = tokenizer.texts_to_sequences(df["name"])
    padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")

@@ -122,18 +57,8 @@ def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, La

 def transformer_encoder(x, cfg: Config):
    """
-    Transforms input tensor using a single Transformer encoder block with attention and feedforward
-    layers. The encoder applies multi-head attention to the input tensor, adds the output to
-    the original tensor for residual connection, and normalizes it. Subsequently, the processed
-    tensor passes through a feedforward network with added dropout and normalization.
-
-    :param x: Input tensor to be transformed.
-    :type x: TensorFlow tensor
-    :param cfg: Configuration object containing Transformer hyperparameters such as the number of
-        attention heads, head size, feedforward dimension, and dropout rate.
-    :type cfg: Config
-    :return: Transformed tensor resulting from applying the Transformer encoder block.
-    :rtype: TensorFlow tensor
+    Transformer encoder block that applies multi-head attention and feed-forward
+    neural network layers with residual connections and layer normalization.
    """
    attn = MultiHeadAttention(num_heads=cfg.transformer_num_heads, key_dim=cfg.transformer_head_size)(x, x)
    x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attn))
@@ -145,18 +70,10 @@ def transformer_encoder(x, cfg: Config):

 def build_model(cfg: Config, vocab_size: int) -> Model:
    """
-    Builds a Transformer-based model using Keras/TensorFlow components. The model
-    is designed for classification tasks, utilizing embedding layers with positional
-    encoding, a Transformer encoder block, and fully connected layers for
-    output generation.
-
-    :param cfg: Configuration object containing model-specific hyperparameters
-        such as maximum sequence length, embedding dimensions, etc.
-    :type cfg: Config
-    :param vocab_size: The size of the vocabulary for the embedding layer.
-    :type vocab_size: int
-    :return: A compiled Keras model, ready for training and evaluation.
-    :rtype: Model
+    Builds a Transformer-based model aimed at sequence processing tasks.
+    The model includes an embedding layer integrating positional encodings
+    and a Transformer encoder, followed by a global pooling layer,
+    a dense hidden layer, and a softmax output layer.
    """
    logging.info("Building Transformer model")
    inputs = Input(shape=(cfg.max_len,))
@@ -177,54 +94,11 @@ def build_model(cfg: Config, vocab_size: int) -> Model:
    return model


-def evaluate_proba(y_true, y_proba, threshold, class_names):
-    """
-    Evaluates the performance of a binary classification model by calculating accuracy,
-    precision, recall, F1 score, confusion matrix, and generates a classification
-    report. This function takes the true labels, predicted probabilities, a decision
-    threshold, and class names to assist in the evaluation.
-
-    :param y_true: Ground truth (correct) target values.
-    :type y_true: array-like of shape (n_samples,)
-    :param y_proba: Predicted probabilities for each class. Expected to be an array
-        where the second column corresponds to the probability of the positive class.
-    :type y_proba: array-like of shape (n_samples, 2)
-    :param threshold: Decision threshold for classifying a sample as positive
-        or negative based on predicted probabilities.
-    :type threshold: float
-    :param class_names: List of class names for labeling the classification report.
-    :type class_names: list of str
-    :return: None. Outputs performance metrics and confusion matrix to the logging
-        system and the console.
-    """
-    y_pred = (y_proba[:, 1] >= threshold).astype(int)
-    acc = accuracy_score(y_true, y_pred)
-    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
-    cm = confusion_matrix(y_true, y_pred)
-
-    logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
-    print("Confusion Matrix:\n", cm)
-    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
-
-
 def cross_validate(cfg: Config, X, y, vocab_size: int):
    """
-    Evaluate the performance of a model using K-fold cross-validation. This function takes
-    configuration settings, input data, target labels, and vocabulary size to perform the
-    specified number of cross-validation folds with a stratified approach. For each fold,
-    it builds a new model, trains it, predicts the validation set, and calculates accuracy.
-
-    :param cfg: The configuration object containing hyperparameters and settings for
-                cross-validation, random state, and training.
-    :type cfg: Config
-    :param X: The input data samples provided as a dataset.
-    :type X: numpy.ndarray
-    :param y: The target labels corresponding to the input data samples.
-    :type y: numpy.ndarray
-    :param vocab_size: The size of the vocabulary, used to configure the language model.
-    :type vocab_size: int
-    :return: A list containing accuracy scores from each fold in the cross-validation process.
-    :rtype: list
+    Performs cross-validation using the given configuration, dataset, and specified vocabulary size. This function
+    splits the dataset into stratified folds, trains a model on each fold, and evaluates its performance on validation
+    data. The overall mean and standard deviation of accuracies across all folds are logged.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
@@ -247,14 +121,11 @@ def cross_validate(cfg: Config, X, y, vocab_size: int):

 def save_artifacts(model, tokenizer, encoder):
    """
-    Saves the machine learning model and its associated artifacts such as tokenizer and
-    label encoder to predefined file paths. This function ensures that the model and
-    artifacts can be reloaded later for inference or further use.
-
-    :param model: The machine learning model to be saved.
-    :param tokenizer: The tokenizer used for preparing data for the model.
-    :param encoder: The label encoder used for encoding target labels.
-    :return: None
+    Saves the model and associated artifacts to the designated directory. The model
+    is serialized and saved in a `.keras` file, while the tokenizer and label
+    encoder are serialized into `.pkl` files. If the directory does not exist, it
+    is created automatically. This function also logs the completion of the
+    operation.
    """
    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    model.save(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
@@ -266,21 +137,7 @@ def save_artifacts(model, tokenizer, encoder):


 def main():
-    parser = argparse.ArgumentParser(description="Train Transformer model for name-based gender classification")
-    parser.add_argument("--dataset", type=str, default="names.csv")
-    parser.add_argument("--size", type=int)
-    parser.add_argument("--threshold", type=float, default=0.5)
-    parser.add_argument("--cv", type=int)
-    parser.add_argument("--save", action="store_true")
-    args = parser.parse_args()
-
-    cfg = Config(
-        dataset_path=args.dataset,
-        size=args.size,
-        threshold=args.threshold,
-        cv=args.cv,
-        save=args.save
-    )
+    cfg = Config(**vars(load_config("Transformer model")))

    X, y, tokenizer, encoder = load_and_prepare(cfg)
    vocab_size = len(tokenizer.word_index) + 1