feat: balanced dataset loading

2025-06-30 01:32:10 +02:00
parent eb139ee09a
commit 0888d94596
9 changed files with 306 additions and 614 deletions
+82
@@ -0,0 +1,82 @@
import argparse
import logging
from dataclasses import dataclass
from typing import Optional
from sklearn.metrics import (
accuracy_score, precision_recall_fscore_support,
classification_report, confusion_matrix
)
logging.basicConfig(level=logging.INFO, format=">> %(message)s")
def evaluate_proba(y_true, y_proba, threshold, class_names):
y_pred = (y_proba[:, 1] >= threshold).astype(int)
acc = accuracy_score(y_true, y_pred)
pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
cm = confusion_matrix(y_true, y_pred)
logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
@dataclass
class BaseConfig:
"""
Represents the base configuration for a dataset and its associated parameters.
This class serves as a foundational configuration handler to encapsulate
dataset-related parameters and options. It allows customization of dataset
behavior, including threshold values, size, cross-validation settings, and
whether to save derived configurations. It can also manage configurations
for balanced datasets if necessary.
"""
dataset_path: str = "names_featured.csv"
size: Optional[int] = None
threshold: float = 0.5
cv: Optional[int] = None
save: bool = False
balanced: bool = False
epochs: int = 10
test_size: float = 0.2
random_state: int = 42
def load_config(description: str) -> BaseConfig:
"""
Parses command-line arguments and loads the configuration for the logistic regression model.
This function sets up an argument parser for various command-line options including
the dataset path, dataset size, dataset balancing, classification threshold,
cross-validation folds, and saving the model and its associated artifacts. Once parsed,
it transfers the configurations to a ``BaseConfig`` instance and returns it.
"""
parser = argparse.ArgumentParser(description=description)
parser.add_argument("--dataset", type=str, default="names_featured.csv", help="Path to the dataset file")
parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
parser.add_argument("--save", action="store_true", help="Save the model and artifacts after training")
parser.add_argument("--epochs", type=int, default=10, help="Number of epochs for training")
parser.add_argument("--test_size", type=float, default=0.2, help="Proportion of the dataset to include in the test split")
parser.add_argument("--random_state", type=int, default=42, help="Random seed for reproducibility")
args = parser.parse_args()
return BaseConfig(
dataset_path=args.dataset,
size=args.size,
threshold=args.threshold,
cv=args.cv,
save=args.save,
balanced=args.balanced,
epochs=args.epochs,
test_size=args.test_size,
random_state=args.random_state
)
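The three training scripts below consume this loader by unpacking the returned ``BaseConfig`` into their own ``Config`` subclasses (see ``Config(**vars(load_config(...)))`` in each ``main``). A minimal sketch of that pattern, assuming a dataclass subclass with one illustrative extra field:

from dataclasses import dataclass

@dataclass
class Config(BaseConfig):
    # Model-specific options layered on top of the shared ones (illustrative).
    max_iter: int = 1000

# vars() yields the BaseConfig field dict, which Config accepts because
# dataclass inheritance carries the parent fields over.
cfg = Config(**vars(load_config("example model")))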
+19 -113
@@ -1,8 +1,6 @@
import argparse
import logging
import os
from dataclasses import dataclass
from typing import Tuple, Optional
from typing import Tuple
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
@@ -16,54 +14,20 @@ from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
logging.basicConfig(level=logging.INFO, format=">> %(message)s")
from ners.gender.models import BaseConfig, load_config, logging
@dataclass
class Config:
dataset_path: str
size: Optional[int]
test_size: float = 0.2
class Config(BaseConfig):
ngram_range: Tuple[int, int] = (2, 5)
max_iter: int = 1000
random_state: int = 42
threshold: float = 0.5
cv: Optional[int] = None
save: bool = False
def load_and_clean_data(cfg: Config) -> Tuple[pd.Series, pd.Series]:
"""
Load and clean dataset as specified by the provided configuration. This function reads
a CSV dataset from the path specified in the configuration, processes it to remove
missing values from key columns ('name' and 'sex'), and cleans string data in these
columns by converting them to lowercase and stripping whitespace. The cleaned data
is then returned as two separate pandas Series objects.
:param cfg: Configuration object specifying the dataset path and size
:type cfg: Config
:return: A tuple containing cleaned `name` and `sex` data as pandas Series objects
:rtype: Tuple[pd.Series, pd.Series]
"""
logging.info(f"Loading dataset from {cfg.dataset_path}")
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size))
df = df.dropna(subset=["name", "sex"])
df["name"] = df["name"].str.lower().str.strip()
df["sex"] = df["sex"].str.lower().str.strip()
return df["name"], df["sex"]
def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
"""
Encode the labels of a given pandas Series using a LabelEncoder. This process maps categorical
labels to integers, which is particularly useful for machine learning models that require numerical
input data.
:param y: A pandas Series of categorical labels to be encoded.
:type y: pd.Series
:return: A tuple containing the encoded labels as a pandas Series and the fitted LabelEncoder object.
:rtype: Tuple[pd.Series, LabelEncoder]
Encode the labels using a LabelEncoder. This function takes a pandas Series of labels,
fits a LabelEncoder to the labels, and transforms them into a numerical format suitable
for model training. The transformed labels and the fitted encoder are returned.
"""
logging.info("Encoding labels")
encoder = LabelEncoder()
@@ -73,21 +37,11 @@ def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
def build_model(cfg: Config) -> Pipeline:
"""
Builds a machine learning pipeline for text classification.
This function constructs and returns a scikit-learn pipeline that consists of
a `CountVectorizer` and a `LogisticRegression` classifier. The vectorizer
leverages character-level n-grams based on the provided configuration, and the
logistic regression model is trained with a maximum number of iterations defined
in the configuration. This pipeline is used for processing text data and training
classification models.
:param cfg: Configuration object containing the n-gram range and the maximum
number of iterations for the logistic regression model.
:type cfg: Config
:return: A scikit-learn pipeline with a `CountVectorizer` and `LogisticRegression`
based on the provided configuration.
:rtype: Pipeline
Build a logistic regression model pipeline with a character-level CountVectorizer.
The pipeline consists of a CountVectorizer that transforms the input text into
character n-grams, followed by a Logistic Regression classifier. The n-gram range
and maximum iterations for the logistic regression can be configured through the
provided configuration object.
"""
return make_pipeline(
CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range),
@@ -95,7 +49,7 @@ def build_model(cfg: Config) -> Pipeline:
)
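The resulting pipeline can be exercised end to end; a minimal sketch with the Config defaults inlined and toy training data (names and labels below are illustrative):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(
    CountVectorizer(analyzer="char", ngram_range=(2, 5)),  # character n-grams of names
    LogisticRegression(max_iter=1000),
)
pipe.fit(["anna", "john", "maria", "peter"], [0, 1, 0, 1])  # toy labels
print(pipe.predict_proba(["lena"]))  # class probabilities for a new name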
def evaluate_probabilities(y_true, y_proba, threshold: float, class_names):
def evaluate_proba(y_true, y_proba, threshold: float, class_names):
"""
Evaluates the performance of a classification model using a specified threshold
for predicted probabilities. Computes metrics such as accuracy, precision,
@@ -104,19 +58,6 @@ def evaluate_probabilities(y_true, y_proba, threshold: float, class_names):
Logs the evaluation metrics at the specified threshold and prints the confusion
matrix and classification report.
:param y_true: Ground truth (correct) labels.
:type y_true: array-like
:param y_proba: Predicted probabilities for each class, where each row
corresponds to an instance and contains probabilities for each target class.
:type y_proba: numpy.ndarray
:param threshold: The threshold on predicted probabilities to determine
class membership for each instance.
:type threshold: float
:param class_names: List of class names for the target variable used in the
classification report.
:type class_names: list of str
:return: None
"""
logging.info(f"Evaluating at threshold = {threshold}")
y_pred = (y_proba[:, 1] >= threshold).astype(int)
@@ -135,16 +76,6 @@ def cross_validate(cfg: Config, X, y) -> None:
Performs k-fold cross-validation on the provided dataset using the configuration and
logs the results including individual fold scores, mean accuracy, and the standard
deviation of the scores.
:param cfg: Configuration object containing cross-validation settings such as the
number of folds to use in the cross-validation (`cv`).
:type cfg: Config
:param X: Input feature matrix for the dataset to be used for cross-validation.
:type X: Any
:param y: Target labels corresponding to the input feature matrix `X`.
:type y: Any
:return: This function does not return any value. Results are logged.
:rtype: None
"""
logging.info(f"Running {cfg.cv}-fold cross-validation")
pipeline = build_model(cfg)
@@ -153,21 +84,9 @@ def cross_validate(cfg: Config, X, y) -> None:
logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")
def save_artifacts(model, encoder, cfg: Config):
def save_artifacts(model, encoder):
"""
Saves machine learning model and label encoder artifacts to specified directories
within the gender models' directory. This function ensures that the model and encoder
are serialized and stored as pickle files. It uses the specified configuration settings
to locate the appropriate directory for storing the files.
:param model: The machine learning model object to be saved.
:type model: Any
:param encoder: The label encoder object used for data preprocessing.
:type encoder: Any
:param cfg: Configuration object containing application-specific settings regarding
paths and directories.
:type cfg: Config
:return: None
Saves the trained model and label encoder artifacts to the specified directory.
"""
save_pickle(model, os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
@@ -176,23 +95,10 @@ def save_artifacts(model, encoder, cfg: Config):
def main():
parser = argparse.ArgumentParser(description="Train a gender classifier on names")
parser.add_argument("--dataset", type=str, default="names.csv", help="Path to dataset")
parser.add_argument("--size", type=int, help="Number of rows to load")
parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for binary decision")
parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
parser.add_argument("--save", action="store_true", help="Save the model and encoder")
args = parser.parse_args()
cfg = Config(**vars(load_config("logistic regression model")))
cfg = Config(
dataset_path=args.dataset,
size=args.size,
threshold=args.threshold,
cv=args.cv,
save=args.save
)
X_raw, y_raw = load_and_clean_data(cfg)
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
X_raw, y_raw = df["name"], df["sex"]
y_encoded, encoder = encode_labels(y_raw)
if cfg.cv:
@@ -207,10 +113,10 @@ def main():
model.fit(X_train, y_train)
y_proba = model.predict_proba(X_test)
evaluate_probabilities(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
if cfg.save:
save_artifacts(model, encoder, cfg)
save_artifacts(model, encoder)
if __name__ == "__main__":
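The new third argument to ``load_csv_dataset`` in ``main`` above carries the balanced-loading flag this commit introduces; its implementation lives in ``misc`` and is not part of this diff. A plausible sketch of class-balanced downsampling, assuming pandas (the helper below is hypothetical, not the repository's code):

import pandas as pd

def balance_classes(df: pd.DataFrame, label_col: str = "sex", random_state: int = 42) -> pd.DataFrame:
    # Hypothetical: downsample every class to the size of the smallest one.
    n = df[label_col].value_counts().min()
    return (
        df.groupby(label_col, group_keys=False)
          .apply(lambda g: g.sample(n=n, random_state=random_state))
          .reset_index(drop=True)
    )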
+23 -150
@@ -1,13 +1,11 @@
import argparse
import logging
import os
from dataclasses import dataclass
from typing import Tuple, Optional
from typing import Tuple
import numpy as np
import pandas as pd
from sklearn.metrics import (
accuracy_score, classification_report, precision_recall_fscore_support, confusion_matrix
accuracy_score
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
@@ -18,82 +16,25 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
logging.basicConfig(level=logging.INFO, format=">> %(message)s")
from ners.gender.models import load_config, BaseConfig, evaluate_proba, logging
@dataclass
class Config:
"""
Configuration for the machine learning model and its training process.
This class encapsulates the configuration options necessary for initializing,
training, and evaluating a machine learning model. It allows flexibility
in specifying dataset details, model parameters, training settings, and
options for evaluation. Attributes include paths, numerical parameters,
and flags that guide the model's behavior.
:ivar dataset_path: Path to the dataset file.
:type dataset_path: str
:ivar size: Optional size of the dataset to use. If None, use the full dataset.
:type size: Optional[int]
:ivar max_len: Maximum length of sequences used in the model.
:type max_len: int
:ivar embedding_dim: Dimensionality of the embedding layer.
:type embedding_dim: int
:ivar lstm_units: Number of LSTM units in the model.
:type lstm_units: int
:ivar batch_size: Batch size to use during training.
:type batch_size: int
:ivar epochs: Number of epochs for model training.
:type epochs: int
:ivar test_size: Fraction of data to use for testing.
:type test_size: float
:ivar random_state: Seed for random number generation to ensure reproducibility.
:type random_state: int
:ivar threshold: Decision threshold for binary classification tasks.
:type threshold: float
:ivar cv: Number of cross-validation folds. If None, no cross-validation is used.
:type cv: Optional[int]
:ivar save: Flag indicating whether to save the trained model.
:type save: bool
"""
dataset_path: str
size: Optional[int] = None
class Config(BaseConfig):
max_len: int = 6
embedding_dim: int = 64
lstm_units: int = 32
batch_size: int = 64
epochs: int = 10
test_size: float = 0.2
random_state: int = 42
threshold: float = 0.5
cv: Optional[int] = None
save: bool = False
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
"""
Load and preprocess the dataset based on the provided configuration.
This function performs a series of operations including loading the dataset
from the specified path, cleaning and preprocessing data (e.g., converting
to lowercase, stripping whitespace, handling missing values), tokenizing names
using a tokenizer, and encoding the labels using a label encoder. The final processed
data and tools (tokenizer and label encoder) are returned for further use.
:param cfg: Config object containing dataset parameters such as dataset path, size, and
maximum sequence length.
:type cfg: Config
:return: A tuple containing processed padded sequences (numpy ndarray), corresponding
encoded labels (numpy ndarray), tokenizer object used for preprocessing names,
and label encoder object used for encoding labels.
:rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]
Loads and preprocesses data for text classification by tokenizing text data, encoding labels, and padding sequences.
This function expects a dataset file path, prepares the tokenizer to process text input, and encodes labels for
model training. The resulting outputs are ready for input into a machine learning pipeline.
"""
logging.info("Loading and preprocessing data")
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"])
df["name"] = df["name"].str.lower().str.strip()
df["sex"] = df["sex"].str.lower().str.strip()
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
tokenizer.fit_on_texts(df["name"])
@@ -107,6 +48,12 @@ def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, La
def build_model(cfg: Config, vocab_size: int) -> Sequential:
"""
Builds and compiles a Sequential LSTM-based model. The model consists of an
embedding layer, two bidirectional LSTM layers, a dense hidden layer with ReLU
activation, and an output layer with a softmax activation function. The model
is compiled using sparse categorical crossentropy loss and the Adam optimizer.
"""
logging.info("Building LSTM model")
model = Sequential([
Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim),
@@ -119,60 +66,12 @@ def build_model(cfg: Config, vocab_size: int) -> Sequential:
return model
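The hunk cuts off after the Embedding layer; a sketch of the architecture the docstring describes, assuming the Keras imports above (the hidden-layer and output widths are assumptions, since those lines are hidden):

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

def build_lstm_sketch(vocab_size: int, embedding_dim: int = 64, lstm_units: int = 32) -> Sequential:
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        Bidirectional(LSTM(lstm_units, return_sequences=True)),  # first BiLSTM keeps the sequence
        Bidirectional(LSTM(lstm_units)),                         # second BiLSTM collapses it
        Dense(32, activation="relu"),                            # dense hidden layer
        Dense(2, activation="softmax"),                          # softmax output
    ])
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model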
def evaluate_proba(y_true, y_proba, threshold, class_names):
"""
Evaluate the performance of a binary classification model by calculating key metrics and printing
a detailed classification report.
This function thresholds the predicted probabilities to produce binary predictions and calculates
metrics such as accuracy, precision, recall, and F1 score. It also generates a confusion matrix
and a classification report for the model's performance. Additionally, metrics are logged and
informational outputs are printed.
:param y_true: Ground truth binary labels. Must be a 1-dimensional array or list of integers.
:param y_proba: Predicted probabilities for each class from the model. It is a 2-dimensional array
where the second dimension represents class probabilities for each sample.
:param threshold: Threshold value for converting probabilities into binary predictions. Should be
a float between 0 and 1.
:param class_names: List of class names corresponding to the binary labels. Used for labeling the
classification report.
:return: None
"""
y_pred = (y_proba[:, 1] >= threshold).astype(int)
acc = accuracy_score(y_true, y_pred)
pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
cm = confusion_matrix(y_true, y_pred)
logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
def cross_validate(cfg: Config, X, y, vocab_size: int):
"""
Performs k-fold cross-validation on a dataset using a specified model configuration.
This function takes a dataset and corresponding labels, splits the dataset into
k folds (based on the `cv` attribute of the provided configuration object), and
performs cross-validation using the specified deep learning model. The model is
built and trained on the training subset for each fold, and the validation subset
is used to compute accuracy scores. Finally, it logs the individual fold accuracies
and the overall mean accuracy with its standard deviation.
:param cfg: Configuration object containing the parameters for cross-validation,
model training, and other settings. `cv` specifies the number of folds,
and other attributes such as `epochs`, `batch_size`, and `random_state`
dictate the training and reproducibility behavior.
:type cfg: Config
:param X: Feature data for the dataset. Assumes the input is compatible with the
model configuration.
:param y: True labels corresponding to the dataset. The order should correspond
to the feature set `X`.
:param vocab_size: Total vocabulary size used for building the model. Determines
the structure of the model input.
:type vocab_size: int
:return: A list containing the accuracy scores for each fold.
:rtype: List[float]
Performs cross-validation on the given dataset using the specified model configuration.
The function uses StratifiedKFold cross-validator to split the dataset into training and
validation sets for each fold. For each fold, it trains the model, evaluates its accuracy
on the validation data, and logs the fold-wise and overall results.
"""
logging.info(f"Running {cfg.cv}-fold cross-validation")
skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
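The fold loop itself falls outside the visible hunk; a sketch of the per-fold training and accuracy logging the docstring describes (the argmax over softmax outputs is an assumption):

import numpy as np
from sklearn.metrics import accuracy_score

accs = []
for fold, (tr, va) in enumerate(skf.split(X, y), start=1):
    model = build_model(cfg, vocab_size)
    model.fit(X[tr], y[tr], epochs=cfg.epochs, batch_size=cfg.batch_size, verbose=0)
    y_pred = np.argmax(model.predict(X[va]), axis=1)
    accs.append(accuracy_score(y[va], y_pred))
    logging.info(f"Fold {fold} accuracy: {accs[-1]:.4f}")
logging.info(f"Mean accuracy: {np.mean(accs):.4f}, Std: {np.std(accs):.4f}")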
@@ -195,23 +94,11 @@ def cross_validate(cfg: Config, X, y, vocab_size: int):
def save_artifacts(model, tokenizer, encoder):
"""
Save the model, tokenizer, and label encoder artifacts to predefined file paths
within the GENDER_MODELS_DIR directory. The function ensures that the model is
saved in H5 format, while the tokenizer and encoder are serialized using the
Pickle module. It logs a message indicating the completion of the saving process.
Saves the given model, tokenizer, and encoder artifacts to a predefined directory.
:param model: The machine learning model object to be saved.
:type model: Any
:param tokenizer: The tokenizer object used in preprocessing, to be serialized
for future use.
:type tokenizer: Any
:param encoder: The label encoder object used for encoding labels during
training, to be serialized for future use.
:type encoder: Any
:return: None
The function ensures that the specified directory for saving artifacts exists,
then serializes the model, tokenizer, and encoder using appropriate formats. It
also logs the success of the operation to notify the user of the action taken.
"""
os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
model.save(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
@@ -223,21 +110,7 @@ def save_artifacts(model, tokenizer, encoder):
def main():
parser = argparse.ArgumentParser(description="Train BiLSTM model for name-based gender classification")
parser.add_argument("--dataset", type=str, default="names.csv")
parser.add_argument("--size", type=int)
parser.add_argument("--threshold", type=float, default=0.5)
parser.add_argument("--cv", type=int)
parser.add_argument("--save", action="store_true")
args = parser.parse_args()
cfg = Config(
dataset_path=args.dataset,
size=args.size,
threshold=args.threshold,
cv=args.cv,
save=args.save
)
cfg = Config(**vars(load_config("Long Short-Term Memory (LSTM) model")))
X, y, tokenizer, encoder = load_and_prepare(cfg)
vocab_size = len(tokenizer.word_index) + 1
+25 -168
@@ -1,15 +1,12 @@
import argparse
import logging
import os
from dataclasses import dataclass
from typing import Tuple, Optional
from typing import Tuple
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import (
accuracy_score, precision_recall_fscore_support,
classification_report, confusion_matrix
accuracy_score
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
@@ -23,56 +20,11 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
logging.basicConfig(level=logging.INFO, format=">> %(message)s")
from ners.gender.models import BaseConfig, load_config, evaluate_proba, logging
@dataclass
class Config:
"""
Configuration data class used to store settings and parameters for a machine learning or deep
learning model.
This class allows the user to specify various parameters such as dataset path, size of input,
model architecture details like embedding dimensions, transformer configurations, training settings
like batch size and epochs, and validation and testing settings. The attributes provide flexibility
to customize model configuration and training processes.
:ivar dataset_path: The file path to the dataset.
:type dataset_path: str
:ivar size: Optional size parameter, can be used to specify sample size or custom
configuration based on the user's requirement.
:type size: Optional[int]
:ivar max_len: Maximum sequence length for input data, used often in text or sequence
processing.
:type max_len: int
:ivar embedding_dim: The dimensionality of embeddings used in the model.
:type embedding_dim: int
:ivar transformer_head_size: The size of each transformer attention head.
:type transformer_head_size: int
:ivar transformer_num_heads: The number of attention heads in the transformer model.
:type transformer_num_heads: int
:ivar transformer_ff_dim: The dimensionality of the feed-forward network in the transformer.
:type transformer_ff_dim: int
:ivar dropout: Dropout rate used for regularization during training.
:type dropout: float
:ivar batch_size: Batch size used for training and validation.
:type batch_size: int
:ivar epochs: Number of epochs for model training.
:type epochs: int
:ivar test_size: Proportion of the dataset to be used for testing.
:type test_size: float
:ivar random_state: Random seed value for reproducibility.
:type random_state: int
:ivar threshold: Threshold value for model predictions or classification.
:type threshold: float
:ivar cv: Cross-validation configuration, if applicable.
:type cv: Optional[int]
:ivar save: Boolean flag indicating whether to save the model after training.
:type save: bool
"""
dataset_path: str
size: Optional[int]
class Config(BaseConfig):
max_len: int = 6
embedding_dim: int = 64
transformer_head_size: int = 64
@@ -80,38 +32,21 @@ class Config:
transformer_ff_dim: int = 128
dropout: float = 0.1
batch_size: int = 64
epochs: int = 10
test_size: float = 0.2
random_state: int = 42
threshold: float = 0.5
cv: Optional[int] = None
save: bool = False
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
"""
Load and preprocess data for model training or evaluation. This function handles the
loading of a dataset in CSV format, applies preprocessing to clean and normalize
the input data, tokenizes text features, and encodes categorical labels.
The preprocessed data is prepared as padded sequences and encoded labels, which
can be directly used as inputs for machine learning models. Tokenizer and LabelEncoder
are returned to ensure consistency between training and inference stages.
:param cfg: Configuration object containing dataset path, size of the
dataset to load, and maximum length for padding sequences.
:type cfg: Config
:return: A tuple containing padded input sequences for the model, encoded labels,
the tokenizer used for text sequences, and the encoder used for labels.
:rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]
Load and preprocess the dataset for training a Transformer model.
This function reads a CSV dataset, tokenizes the names, pads the sequences,
and encodes the labels. It returns the padded sequences, encoded labels,
tokenizer, and label encoder.
"""
logging.info("Loading and preprocessing data")
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"])
df["name"] = df["name"].str.lower().str.strip()
df["sex"] = df["sex"].str.lower().str.strip()
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df["name"])
sequences = tokenizer.texts_to_sequences(df["name"])
padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")
@@ -122,18 +57,8 @@ def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, La
def transformer_encoder(x, cfg: Config):
"""
Transforms input tensor using a single Transformer encoder block with attention and feedforward
layers. The encoder applies multi-head attention to the input tensor, adds the output to
the original tensor for residual connection, and normalizes it. Subsequently, the processed
tensor passes through a feedforward network with added dropout and normalization.
:param x: Input tensor to be transformed.
:type x: TensorFlow tensor
:param cfg: Configuration object containing Transformer hyperparameters such as the number of
attention heads, head size, feedforward dimension, and dropout rate.
:type cfg: Config
:return: Transformed tensor resulting from applying the Transformer encoder block.
:rtype: TensorFlow tensor
Transformer encoder block that applies multi-head attention and feed-forward
neural network layers with residual connections and layer normalization.
"""
attn = MultiHeadAttention(num_heads=cfg.transformer_num_heads, key_dim=cfg.transformer_head_size)(x, x)
x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attn))
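The feed-forward half of the block sits outside the visible hunk; a sketch of the complete encoder pattern the docstring describes, assuming the Keras layers imported above (head size, FFN width, and dropout come from the Config fields shown earlier; the wiring of the hidden half is an assumption):

from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Dense

def transformer_encoder_sketch(x, num_heads: int, head_size: int = 64, ff_dim: int = 128, dropout: float = 0.1):
    # Attention sub-layer: residual connection plus layer norm (visible above).
    attn = MultiHeadAttention(num_heads=num_heads, key_dim=head_size)(x, x)
    x = LayerNormalization(epsilon=1e-6)(x + Dropout(dropout)(attn))
    # Feed-forward sub-layer, also residual + norm (hidden by the hunk).
    ff = Dense(ff_dim, activation="relu")(x)
    ff = Dense(x.shape[-1])(ff)
    return LayerNormalization(epsilon=1e-6)(x + Dropout(dropout)(ff))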
@@ -145,18 +70,10 @@ def transformer_encoder(x, cfg: Config):
def build_model(cfg: Config, vocab_size: int) -> Model:
"""
Builds a Transformer-based model using Keras/TensorFlow components. The model
is designed for classification tasks, utilizing embedding layers with positional
encoding, a Transformer encoder block, and fully connected layers for
output generation.
:param cfg: Configuration object containing model-specific hyperparameters
such as maximum sequence length, embedding dimensions, etc.
:type cfg: Config
:param vocab_size: The size of the vocabulary for the embedding layer.
:type vocab_size: int
:return: A compiled Keras model, ready for training and evaluation.
:rtype: Model
Builds a Transformer-based model aimed at sequence processing tasks.
The model includes an embedding layer integrating positional encodings
and a Transformer encoder, followed by a global pooling layer,
a dense hidden layer, and a softmax output layer.
"""
logging.info("Building Transformer model")
inputs = Input(shape=(cfg.max_len,))
@@ -177,54 +94,11 @@ def build_model(cfg: Config, vocab_size: int) -> Model:
return model
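Similarly, the middle of build_model is split across hunks; a sketch of the model the docstring outlines, omitting the positional-encoding detail it mentions (layer widths beyond the Config fields are assumptions):

from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D, Dense

def build_transformer_sketch(cfg, vocab_size: int) -> Model:
    inputs = Input(shape=(cfg.max_len,))
    x = Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim)(inputs)  # positional encoding omitted
    x = transformer_encoder(x, cfg)        # the encoder block defined above
    x = GlobalAveragePooling1D()(x)        # pool over the sequence
    x = Dense(32, activation="relu")(x)    # dense hidden layer
    outputs = Dense(2, activation="softmax")(x)  # softmax output
    model = Model(inputs, outputs)
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model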
def evaluate_proba(y_true, y_proba, threshold, class_names):
"""
Evaluates the performance of a binary classification model by calculating accuracy,
precision, recall, F1 score, confusion matrix, and generates a classification
report. This function takes the true labels, predicted probabilities, a decision
threshold, and class names to assist in the evaluation.
:param y_true: Ground truth (correct) target values.
:type y_true: array-like of shape (n_samples,)
:param y_proba: Predicted probabilities for each class. Expected to be an array
where the second column corresponds to the probability of the positive class.
:type y_proba: array-like of shape (n_samples, 2)
:param threshold: Decision threshold for classifying a sample as positive
or negative based on predicted probabilities.
:type threshold: float
:param class_names: List of class names for labeling the classification report.
:type class_names: list of str
:return: None. Outputs performance metrics and confusion matrix to the logging
system and the console.
"""
y_pred = (y_proba[:, 1] >= threshold).astype(int)
acc = accuracy_score(y_true, y_pred)
pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
cm = confusion_matrix(y_true, y_pred)
logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
def cross_validate(cfg: Config, X, y, vocab_size: int):
"""
Evaluate the performance of a model using K-fold cross-validation. This function takes
configuration settings, input data, target labels, and vocabulary size to perform the
specified number of cross-validation folds with a stratified approach. For each fold,
it builds a new model, trains it, predicts the validation set, and calculates accuracy.
:param cfg: The configuration object containing hyperparameters and settings for
cross-validation, random state, and training.
:type cfg: Config
:param X: The input data samples provided as a dataset.
:type X: numpy.ndarray
:param y: The target labels corresponding to the input data samples.
:type y: numpy.ndarray
:param vocab_size: The size of the vocabulary, used to configure the language model.
:type vocab_size: int
:return: A list containing accuracy scores from each fold in the cross-validation process.
:rtype: list
Performs cross-validation using the given configuration, dataset, and specified vocabulary size. This function
splits the dataset into stratified folds, trains a model on each fold, and evaluates its performance on validation
data. The overall mean and standard deviation of accuracies across all folds are logged.
"""
logging.info(f"Running {cfg.cv}-fold cross-validation")
skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
@@ -247,14 +121,11 @@ def cross_validate(cfg: Config, X, y, vocab_size: int):
def save_artifacts(model, tokenizer, encoder):
"""
Saves the machine learning model and its associated artifacts such as tokenizer and
label encoder to predefined file paths. This function ensures that the model and
artifacts can be reloaded later for inference or further use.
:param model: The machine learning model to be saved.
:param tokenizer: The tokenizer used for preparing data for the model.
:param encoder: The label encoder used for encoding target labels.
:return: None
Saves the model and associated artifacts to the designated directory. The model
is serialized and saved in a `.keras` file, while the tokenizer and label
encoder are serialized into `.pkl` files. If the directory does not exist, it
is created automatically. This function also logs the completion of the
operation.
"""
os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
model.save(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
@@ -266,21 +137,7 @@ def save_artifacts(model, tokenizer, encoder):
def main():
parser = argparse.ArgumentParser(description="Train Transformer model for name-based gender classification")
parser.add_argument("--dataset", type=str, default="names.csv")
parser.add_argument("--size", type=int)
parser.add_argument("--threshold", type=float, default=0.5)
parser.add_argument("--cv", type=int)
parser.add_argument("--save", action="store_true")
args = parser.parse_args()
cfg = Config(
dataset_path=args.dataset,
size=args.size,
threshold=args.threshold,
cv=args.cv,
save=args.save
)
cfg = Config(**vars(load_config("Transformer model")))
X, y, tokenizer, encoder = load_and_prepare(cfg)
vocab_size = len(tokenizer.word_index) + 1