From 25f1df46d89fd5d4e1a83e9e6020a59c93694b38 Mon Sep 17 00:00:00 2001
From: bernard-ng
Date: Sat, 21 Jun 2025 10:35:48 +0200
Subject: [PATCH] feat: improve inference for logreg model

---
 .gitignore                        |   2 +-
 ners/gender/models/__init__.py    |   0
 ners/gender/models/bilstm.py      | 274 +++++++++++++++++++++++++
 ners/gender/models/regression.py  | 222 +++++++++++++++++++++
 ners/gender/models/transformer.py | 320 ++++++++++++++++++++++++++++++
 5 files changed, 817 insertions(+), 1 deletion(-)
 create mode 100644 ners/gender/models/__init__.py
 create mode 100644 ners/gender/models/bilstm.py
 create mode 100644 ners/gender/models/regression.py
 create mode 100644 ners/gender/models/transformer.py

diff --git a/.gitignore b/.gitignore
index 82cdb45..1068dc2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,7 @@
 __pycache__/
 .ipynb_checkpoints/
 *.pyc
-models/
+/models/
 .env.local
 var/
 /dataset/
diff --git a/ners/gender/models/__init__.py b/ners/gender/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ners/gender/models/bilstm.py b/ners/gender/models/bilstm.py
new file mode 100644
index 0000000..4ee268e
--- /dev/null
+++ b/ners/gender/models/bilstm.py
@@ -0,0 +1,274 @@
+import argparse
+import logging
+import os
+import pickle
+from dataclasses import dataclass
+from typing import Tuple, Optional
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics import (
+    accuracy_score, classification_report, precision_recall_fscore_support, confusion_matrix
+)
+from sklearn.model_selection import train_test_split, StratifiedKFold
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras.callbacks import ProgbarLogger
+from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+
+from misc import GENDER_MODELS_DIR, load_csv_dataset
+
+logging.basicConfig(level=logging.INFO, format=">> %(message)s")
+
+
+@dataclass
+class Config:
+    """
+    Configuration for the machine learning model and its training process.
+
+    This class encapsulates the configuration options necessary for initializing,
+    training, and evaluating the model: dataset selection, model hyperparameters,
+    training settings, and evaluation options.
+
+    :ivar dataset_path: Path to the dataset file.
+    :type dataset_path: str
+    :ivar size: Optional size of the dataset to use. If None, use the full dataset.
+    :type size: Optional[int]
+    :ivar max_len: Maximum length of sequences used in the model.
+    :type max_len: int
+    :ivar embedding_dim: Dimensionality of the embedding layer.
+    :type embedding_dim: int
+    :ivar lstm_units: Number of LSTM units in the model.
+    :type lstm_units: int
+    :ivar batch_size: Batch size to use during training.
+    :type batch_size: int
+    :ivar epochs: Number of epochs for model training.
+    :type epochs: int
+    :ivar test_size: Fraction of data to use for testing.
+    :type test_size: float
+    :ivar random_state: Seed for random number generation to ensure reproducibility.
+    :type random_state: int
+    :ivar threshold: Decision threshold for binary classification tasks.
+    :type threshold: float
+    :ivar cv: Number of cross-validation folds. If None, no cross-validation is used.
+    :type cv: Optional[int]
+    :ivar save: Flag indicating whether to save the trained model.
+    :type save: bool
+    """
+    dataset_path: str
+    size: Optional[int] = None
+    max_len: int = 6
+    embedding_dim: int = 64
+    lstm_units: int = 32
+    batch_size: int = 64
+    epochs: int = 10
+    test_size: float = 0.2
+    random_state: int = 42
+    threshold: float = 0.5
+    cv: Optional[int] = None
+    save: bool = False
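+
+# Illustrative usage (not part of the CLI flow): Config is a plain dataclass,
+# so hyperparameters can also be overridden directly in code, e.g.
+#   cfg = Config(dataset_path="names.csv", epochs=20, cv=5)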
+
+
+def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
+    """
+    Load and preprocess the dataset based on the provided configuration.
+
+    This function loads the dataset from the specified path, cleans it
+    (lowercasing, stripping whitespace, dropping rows with missing values),
+    tokenizes the names, pads the resulting sequences, and encodes the labels.
+    The fitted tokenizer and label encoder are returned for reuse at inference time.
+
+    :param cfg: Config object containing dataset parameters such as dataset path, size, and
+        maximum sequence length.
+    :type cfg: Config
+    :return: A tuple containing processed padded sequences (numpy ndarray), corresponding
+        encoded labels (numpy ndarray), the tokenizer used for preprocessing names,
+        and the label encoder used for encoding labels.
+    :rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]
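+
+    Example (illustrative; assumes the CSV provides ``name`` and ``sex`` columns)::
+
+        X, y, tokenizer, encoder = load_and_prepare(Config(dataset_path="names.csv"))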
+    """
+    logging.info("Loading and preprocessing data")
+    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"])
+    df["name"] = df["name"].str.lower().str.strip()
+    df["sex"] = df["sex"].str.lower().str.strip()
+
+    tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
+    tokenizer.fit_on_texts(df["name"])
+    sequences = tokenizer.texts_to_sequences(df["name"])
+    padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")
+
+    label_encoder = LabelEncoder()
+    labels = label_encoder.fit_transform(df["sex"])
+
+    return padded, labels, tokenizer, label_encoder
+
+
+def build_model(cfg: Config, vocab_size: int) -> Sequential:
+    logging.info("Building LSTM model")
+    model = Sequential([
+        Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim),
+        Bidirectional(LSTM(cfg.lstm_units)),
+        Dense(32, activation="relu"),
+        Dense(2, activation="softmax")
+    ])
+    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
+    return model
+
+
+def evaluate_proba(y_true, y_proba, threshold, class_names):
+    """
+    Evaluate the performance of a binary classification model by calculating key metrics
+    and printing a detailed classification report.
+
+    This function thresholds the predicted probabilities to produce binary predictions and
+    calculates accuracy, precision, recall, and F1 score, along with a confusion matrix and
+    a classification report. Metrics are logged and the matrix and report are printed.
+
+    :param y_true: Ground truth binary labels. Must be a 1-dimensional array or list of integers.
+    :param y_proba: Predicted probabilities for each class from the model. It is a 2-dimensional
+        array where the second dimension represents class probabilities for each sample.
+    :param threshold: Threshold value for converting probabilities into binary predictions.
+        Should be a float between 0 and 1.
+    :param class_names: List of class names corresponding to the binary labels. Used for
+        labeling the classification report.
+    :return: None
+    """
+    y_pred = (y_proba[:, 1] >= threshold).astype(int)
+    acc = accuracy_score(y_true, y_pred)
+    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
+    cm = confusion_matrix(y_true, y_pred)
+
+    logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
+    print("Confusion Matrix:\n", cm)
+    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
+
+
+def cross_validate(cfg: Config, X, y, vocab_size: int):
+    """
+    Performs k-fold cross-validation on a dataset using the specified model configuration.
+
+    The dataset is split into ``cfg.cv`` stratified folds; for each fold a fresh model is
+    built, trained on the training subset, and scored on the validation subset. Individual
+    fold accuracies and the overall mean accuracy with its standard deviation are logged.
+
+    :param cfg: Configuration object containing the parameters for cross-validation,
+        model training, and other settings. `cv` specifies the number of folds,
+        and other attributes such as `epochs`, `batch_size`, and `random_state`
+        dictate the training and reproducibility behavior.
+    :type cfg: Config
+    :param X: Feature data for the dataset. Assumes the input is compatible with the
+        model configuration.
+    :param y: True labels corresponding to the dataset. The order should correspond
+        to the feature set `X`.
+    :param vocab_size: Total vocabulary size used for building the model. Determines
+        the structure of the model input.
+    :type vocab_size: int
+    :return: None. Fold accuracies and the aggregate statistics are logged.
+    :rtype: None
+    """
+    logging.info(f"Running {cfg.cv}-fold cross-validation")
+    skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
+    accuracies = []
+
+    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
+        logging.info(f"Fold {fold + 1}")
+        model = build_model(cfg, vocab_size)
+        model.fit(X[train_idx], y[train_idx],
+                  epochs=cfg.epochs,
+                  batch_size=cfg.batch_size,
+                  verbose=0)
+        y_pred = model.predict(X[val_idx])
+        acc = accuracy_score(y[val_idx], y_pred.argmax(axis=1))
+        accuracies.append(acc)
+        logging.info(f"Fold {fold + 1} Accuracy: {acc:.4f}")
+
+    logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
+
+
+def save_artifacts(model, tokenizer, encoder):
+    """
+    Save the model, tokenizer, and label encoder artifacts to predefined file paths
+    within the GENDER_MODELS_DIR directory. The model is saved in H5 format, while
+    the tokenizer and encoder are serialized with pickle.
+
+    :param model: The machine learning model object to be saved.
+    :type model: Any
+    :param tokenizer: The tokenizer object used in preprocessing, to be serialized
+        for future use.
+    :type tokenizer: Any
+    :param encoder: The label encoder object used for encoding labels during
+        training, to be serialized for future use.
+    :type encoder: Any
+    :return: None
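+
+    Example (illustrative; reloading the artifacts later for inference)::
+
+        from tensorflow.keras.models import load_model
+        model = load_model(os.path.join(GENDER_MODELS_DIR, "BiLSTM_model.h5"))
+        with open(os.path.join(GENDER_MODELS_DIR, "BiLSTM_tokenizer.pkl"), "rb") as f:
+            tokenizer = pickle.load(f)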
+    """
+    model_path = os.path.join(GENDER_MODELS_DIR, "BiLSTM_model.h5")
+    tokenizer_path = os.path.join(GENDER_MODELS_DIR, "BiLSTM_tokenizer.pkl")
+    encoder_path = os.path.join(GENDER_MODELS_DIR, "BiLSTM_label_encoder.pkl")
+
+    model.save(model_path)
+    with open(tokenizer_path, "wb") as f:
+        pickle.dump(tokenizer, f)
+    with open(encoder_path, "wb") as f:
+        pickle.dump(encoder, f)
+    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Train BiLSTM model for name-based gender classification")
+    parser.add_argument("--dataset", type=str, default="names.csv")
+    parser.add_argument("--size", type=int)
+    parser.add_argument("--threshold", type=float, default=0.5)
+    parser.add_argument("--cv", type=int)
+    parser.add_argument("--save", action="store_true")
+    args = parser.parse_args()
+
+    cfg = Config(
+        dataset_path=args.dataset,
+        size=args.size,
+        threshold=args.threshold,
+        cv=args.cv,
+        save=args.save
+    )
+
+    X, y, tokenizer, encoder = load_and_prepare(cfg)
+    vocab_size = len(tokenizer.word_index) + 1
+
+    if cfg.cv:
+        cross_validate(cfg, X, y, vocab_size)
+        return
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
+    )
+
+    model = build_model(cfg, vocab_size)
+    model.summary()
+
+    logging.info("Training model")
+    model.fit(X_train, y_train,
+              validation_split=0.1,
+              epochs=cfg.epochs,
+              batch_size=cfg.batch_size,
+              callbacks=[ProgbarLogger()])
+
+    y_proba = model.predict(X_test)
+    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
+
+    if cfg.save:
+        save_artifacts(model, tokenizer, encoder)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ners/gender/models/regression.py b/ners/gender/models/regression.py
new file mode 100644
index 0000000..9f13631
--- /dev/null
+++ b/ners/gender/models/regression.py
@@ -0,0 +1,222 @@
+import argparse
+import logging
+import os
+import pickle
+from dataclasses import dataclass
+from typing import Tuple, Optional
+
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import (
+    accuracy_score, classification_report, confusion_matrix,
+    precision_recall_fscore_support
+)
+from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
+from sklearn.pipeline import make_pipeline, Pipeline
+from sklearn.preprocessing import LabelEncoder
+
+from misc import GENDER_MODELS_DIR, load_csv_dataset
+
+logging.basicConfig(level=logging.INFO, format=">> %(message)s")
+
+
+@dataclass
+class Config:
+    dataset_path: str
+    size: Optional[int] = None
+    test_size: float = 0.2
+    ngram_range: Tuple[int, int] = (2, 5)
+    max_iter: int = 1000
+    random_state: int = 42
+    threshold: float = 0.5
+    cv: Optional[int] = None
+    save: bool = False
+
+
+def load_and_clean_data(cfg: Config) -> Tuple[pd.Series, pd.Series]:
+    """
+    Load and clean the dataset specified by the provided configuration. This function reads
+    a CSV dataset from the path given in the configuration, drops rows with missing values
+    in the key columns ('name' and 'sex'), and normalizes both columns by lowercasing and
+    stripping whitespace. The cleaned data is returned as two separate pandas Series.
+
+    :param cfg: Configuration object specifying the dataset path and size
+    :type cfg: Config
+    :return: A tuple containing cleaned `name` and `sex` data as pandas Series objects
+    :rtype: Tuple[pd.Series, pd.Series]
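+
+    Example (illustrative)::
+
+        names, sexes = load_and_clean_data(Config(dataset_path="names.csv"))
+        # both Series are lowercased and stripped, e.g. " John " -> "john"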
+    """
+    logging.info(f"Loading dataset from {cfg.dataset_path}")
+    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size))
+    df = df.dropna(subset=["name", "sex"])
+    df["name"] = df["name"].str.lower().str.strip()
+    df["sex"] = df["sex"].str.lower().str.strip()
+    return df["name"], df["sex"]
+
+
+def encode_labels(y: pd.Series) -> Tuple[np.ndarray, LabelEncoder]:
+    """
+    Encode the labels of a given pandas Series using a LabelEncoder. This maps categorical
+    labels to integers, as required by models that expect numerical targets.
+
+    :param y: A pandas Series of categorical labels to be encoded.
+    :type y: pd.Series
+    :return: A tuple containing the encoded labels as a NumPy array and the fitted
+        LabelEncoder object.
+    :rtype: Tuple[np.ndarray, LabelEncoder]
+    """
+    logging.info("Encoding labels")
+    encoder = LabelEncoder()
+    y_encoded = encoder.fit_transform(y)
+    return y_encoded, encoder
+
+
+def build_model(cfg: Config) -> Pipeline:
+    """
+    Builds a machine learning pipeline for text classification.
+
+    The pipeline chains a `CountVectorizer` over character-level n-grams (range taken
+    from the configuration) with a `LogisticRegression` classifier whose maximum number
+    of iterations is also configurable.
+
+    :param cfg: Configuration object containing the n-gram range and the maximum
+        number of iterations for the logistic regression model.
+    :type cfg: Config
+    :return: A scikit-learn pipeline with a `CountVectorizer` and `LogisticRegression`
+        based on the provided configuration.
+    :rtype: Pipeline
+    """
+    return make_pipeline(
+        CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range),
+        LogisticRegression(max_iter=cfg.max_iter)
+    )
+
+
+def evaluate_probabilities(y_true, y_proba, threshold: float, class_names):
+    """
+    Evaluates the performance of a classification model using a specified threshold
+    for predicted probabilities. Computes accuracy, precision, recall, F1-score, and
+    the confusion matrix, and generates a classification report with per-class metrics.
+
+    Logs the evaluation metrics at the specified threshold and prints the confusion
+    matrix and classification report.
+
+    :param y_true: Ground truth (correct) labels.
+    :type y_true: array-like
+    :param y_proba: Predicted probabilities for each class, where each row
+        corresponds to an instance and contains probabilities for each target class.
+    :type y_proba: numpy.ndarray
+    :param threshold: The threshold on predicted probabilities to determine
+        class membership for each instance.
+    :type threshold: float
+    :param class_names: List of class names for the target variable used in the
+        classification report.
+    :type class_names: list of str
+    :return: None
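+
+    Example (illustrative): with ``threshold=0.5`` and positive-class probabilities
+    ``[0.2, 0.8]``, the predictions are ``[0, 1]``; raising the threshold to 0.9
+    yields ``[0, 0]``.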
+    """
+    logging.info(f"Evaluating at threshold = {threshold}")
+    y_pred = (y_proba[:, 1] >= threshold).astype(int)
+    acc = accuracy_score(y_true, y_pred)
+    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
+    cm = confusion_matrix(y_true, y_pred)
+
+    logging.info(f"Accuracy: {acc:.4f}")
+    logging.info(f"Precision: {pr:.4f}, Recall: {rc:.4f}, F1-score: {f1:.4f}")
+    print("Confusion Matrix:\n", cm)
+    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
+
+
+def cross_validate(cfg: Config, X, y) -> None:
+    """
+    Performs k-fold cross-validation on the provided dataset using the configuration and
+    logs the results including individual fold scores, mean accuracy, and the standard
+    deviation of the scores.
+
+    :param cfg: Configuration object containing cross-validation settings such as the
+        number of folds to use in the cross-validation (`cv`).
+    :type cfg: Config
+    :param X: Input feature matrix for the dataset to be used for cross-validation.
+    :type X: Any
+    :param y: Target labels corresponding to the input feature matrix `X`.
+    :type y: Any
+    :return: This function does not return any value. Results are logged.
+    :rtype: None
+    """
+    logging.info(f"Running {cfg.cv}-fold cross-validation")
+    pipeline = build_model(cfg)
+    scores = cross_val_score(pipeline, X, y, cv=StratifiedKFold(n_splits=cfg.cv), scoring="accuracy")
+    logging.info(f"Cross-validation scores: {scores}")
+    logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")
+
+
+def save_artifacts(model, encoder):
+    """
+    Saves the trained model and label encoder as pickle files inside the gender
+    models directory so that they can be reloaded later for inference.
+
+    :param model: The machine learning model object to be saved.
+    :type model: Any
+    :param encoder: The label encoder object used for data preprocessing.
+    :type encoder: Any
+    :return: None
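+
+    Example (illustrative; reloading the pipeline for inference)::
+
+        with open(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"), "rb") as f:
+            model = pickle.load(f)
+        proba = model.predict_proba(["amina"])  # the pipeline vectorizes raw strings itself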
+    """
+    model_path = os.path.join(GENDER_MODELS_DIR, "regression_model.pkl")
+    encoder_path = os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl")
+
+    with open(model_path, "wb") as f:
+        pickle.dump(model, f)
+    with open(encoder_path, "wb") as f:
+        pickle.dump(encoder, f)
+    logging.info(f"Saved model to: {model_path}")
+    logging.info(f"Saved label encoder to: {encoder_path}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Train a gender classifier on names")
+    parser.add_argument("--dataset", type=str, default="names.csv", help="Path to dataset")
+    parser.add_argument("--size", type=int, help="Number of rows to load")
+    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for binary decision")
+    parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
+    parser.add_argument("--save", action="store_true", help="Save the model and encoder")
+    args = parser.parse_args()
+
+    cfg = Config(
+        dataset_path=args.dataset,
+        size=args.size,
+        threshold=args.threshold,
+        cv=args.cv,
+        save=args.save
+    )
+
+    X_raw, y_raw = load_and_clean_data(cfg)
+    y_encoded, encoder = encode_labels(y_raw)
+
+    if cfg.cv:
+        cross_validate(cfg, X_raw, y_encoded)
+        return
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X_raw, y_encoded, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y_encoded
+    )
+
+    model = build_model(cfg)
+    model.fit(X_train, y_train)
+
+    y_proba = model.predict_proba(X_test)
+    evaluate_probabilities(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
+
+    if cfg.save:
+        save_artifacts(model, encoder)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ners/gender/models/transformer.py b/ners/gender/models/transformer.py
new file mode 100644
index 0000000..d6e94bf
--- /dev/null
+++ b/ners/gender/models/transformer.py
@@ -0,0 +1,320 @@
+import argparse
+import logging
+import os
+import pickle
+from dataclasses import dataclass
+from typing import Tuple, Optional
+
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from sklearn.metrics import (
+    accuracy_score, precision_recall_fscore_support,
+    classification_report, confusion_matrix
+)
+from sklearn.model_selection import train_test_split, StratifiedKFold
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras.callbacks import ProgbarLogger
+from tensorflow.keras.layers import (
+    Input, Embedding, Dense, GlobalAveragePooling1D,
+    MultiHeadAttention, Dropout, LayerNormalization
+)
+from tensorflow.keras.models import Model
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+
+from misc import GENDER_MODELS_DIR, load_csv_dataset
+
+logging.basicConfig(level=logging.INFO, format=">> %(message)s")
+
+
+@dataclass
+class Config:
+    """
+    Configuration data class storing the settings and parameters for the Transformer model.
+
+    It covers dataset selection, model architecture details such as embedding dimensions
+    and transformer configuration, training settings such as batch size and epochs, and
+    validation and testing settings.
+
+    :ivar dataset_path: The file path to the dataset.
+    :type dataset_path: str
+    :ivar size: Optional number of rows to load; if None, the full dataset is used.
+    :type size: Optional[int]
+    :ivar max_len: Maximum sequence length for input data.
+    :type max_len: int
+    :ivar embedding_dim: The dimensionality of embeddings used in the model.
+    :type embedding_dim: int
+    :ivar transformer_head_size: The size of each transformer attention head.
+    :type transformer_head_size: int
+    :ivar transformer_num_heads: The number of attention heads in the transformer model.
+    :type transformer_num_heads: int
+    :ivar transformer_ff_dim: The dimensionality of the feed-forward network in the transformer.
+    :type transformer_ff_dim: int
+    :ivar dropout: Dropout rate used for regularization during training.
+    :type dropout: float
+    :ivar batch_size: Batch size used for training and validation.
+    :type batch_size: int
+    :ivar epochs: Number of epochs for model training.
+    :type epochs: int
+    :ivar test_size: Proportion of the dataset to be used for testing.
+    :type test_size: float
+    :ivar random_state: Random seed value for reproducibility.
+    :type random_state: int
+    :ivar threshold: Threshold value for model predictions or classification.
+    :type threshold: float
+    :ivar cv: Number of cross-validation folds, if applicable.
+    :type cv: Optional[int]
+    :ivar save: Boolean flag indicating whether to save the model after training.
+    :type save: bool
+    """
+    dataset_path: str
+    size: Optional[int] = None
+    max_len: int = 6
+    embedding_dim: int = 64
+    transformer_head_size: int = 64
+    transformer_num_heads: int = 2
+    transformer_ff_dim: int = 128
+    dropout: float = 0.1
+    batch_size: int = 64
+    epochs: int = 10
+    test_size: float = 0.2
+    random_state: int = 42
+    threshold: float = 0.5
+    cv: Optional[int] = None
+    save: bool = False
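+
+# Illustrative note: MultiHeadAttention below uses key_dim=transformer_head_size
+# per head, so with these defaults each of the 2 heads projects the 64-dim
+# embeddings into 64-dim query/key spaces before attending.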
+
+
+def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
+    """
+    Load and preprocess data for model training or evaluation. This function handles the
+    loading of a dataset in CSV format, applies preprocessing to clean and normalize
+    the input data, tokenizes text features, and encodes categorical labels.
+
+    The preprocessed data is prepared as padded sequences and encoded labels, which
+    can be directly used as inputs for machine learning models. The Tokenizer and
+    LabelEncoder are returned to ensure consistency between training and inference stages.
+
+    :param cfg: Configuration object containing dataset path, size of the
+        dataset to load, and maximum length for padding sequences.
+    :type cfg: Config
+    :return: A tuple containing padded input sequences for the model, encoded labels,
+        the tokenizer used for text sequences, and the encoder used for labels.
+    :rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]
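+
+    Example (illustrative; reusing the tokenizer at inference time)::
+
+        X, y, tokenizer, encoder = load_and_prepare(cfg)
+        new_X = pad_sequences(tokenizer.texts_to_sequences(["amina"]),
+                              maxlen=cfg.max_len, padding="post")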
+    """
+    logging.info("Loading and preprocessing data")
+    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"])
+    df["name"] = df["name"].str.lower().str.strip()
+    df["sex"] = df["sex"].str.lower().str.strip()
+
+    tokenizer = Tokenizer(oov_token="<OOV>")
+    tokenizer.fit_on_texts(df["name"])
+    sequences = tokenizer.texts_to_sequences(df["name"])
+    padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")
+
+    encoder = LabelEncoder()
+    labels = encoder.fit_transform(df["sex"])
+    return padded, labels, tokenizer, encoder
+
+
+def transformer_encoder(x, cfg: Config):
+    """
+    Transforms the input tensor using a single Transformer encoder block with attention
+    and feed-forward layers. The encoder applies multi-head attention to the input tensor,
+    adds the output to the original tensor as a residual connection, and normalizes it.
+    The result then passes through a feed-forward network, again with dropout, a residual
+    connection, and normalization.
+
+    :param x: Input tensor to be transformed.
+    :type x: TensorFlow tensor
+    :param cfg: Configuration object containing Transformer hyperparameters such as the
+        number of attention heads, head size, feed-forward dimension, and dropout rate.
+    :type cfg: Config
+    :return: Transformed tensor resulting from applying the Transformer encoder block.
+    :rtype: TensorFlow tensor
+    """
+    attn = MultiHeadAttention(num_heads=cfg.transformer_num_heads, key_dim=cfg.transformer_head_size)(x, x)
+    x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attn))
+
+    ff = Dense(cfg.transformer_ff_dim, activation="relu")(x)
+    ff = Dense(x.shape[-1])(ff)
+    return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(ff))
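+
+# Illustrative shape check (assumed defaults): a (batch, max_len=6, embedding_dim=64)
+# input leaves the block with the same shape, since the attention output and the
+# feed-forward projection back to x.shape[-1] are both added residually to the input.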
+
+
+def build_model(cfg: Config, vocab_size: int) -> Model:
+    """
+    Builds a Transformer-based model using Keras/TensorFlow components. The model
+    is designed for classification tasks, combining an embedding layer with positional
+    encoding, a Transformer encoder block, and fully connected layers for the output.
+
+    :param cfg: Configuration object containing model-specific hyperparameters
+        such as maximum sequence length, embedding dimensions, etc.
+    :type cfg: Config
+    :param vocab_size: The size of the vocabulary for the embedding layer.
+    :type vocab_size: int
+    :return: A compiled Keras model, ready for training and evaluation.
+    :rtype: Model
+    """
+    logging.info("Building Transformer model")
+    inputs = Input(shape=(cfg.max_len,))
+    x = Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim)(inputs)
+
+    # Add positional encoding
+    positions = tf.range(start=0, limit=cfg.max_len, delta=1)
+    pos_embedding = Embedding(input_dim=cfg.max_len, output_dim=cfg.embedding_dim)(positions)
+    x = x + pos_embedding
+
+    x = transformer_encoder(x, cfg)
+    x = GlobalAveragePooling1D()(x)
+    x = Dense(32, activation="relu")(x)
+    outputs = Dense(2, activation="softmax")(x)
+
+    model = Model(inputs, outputs)
+    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
+    return model
+
+
+def evaluate_proba(y_true, y_proba, threshold, class_names):
+    """
+    Evaluates the performance of a binary classification model by calculating accuracy,
+    precision, recall, F1 score, and the confusion matrix, and generates a classification
+    report. Takes the true labels, predicted probabilities, a decision threshold, and
+    class names.
+
+    :param y_true: Ground truth (correct) target values.
+    :type y_true: array-like of shape (n_samples,)
+    :param y_proba: Predicted probabilities for each class. Expected to be an array
+        where the second column corresponds to the probability of the positive class.
+    :type y_proba: array-like of shape (n_samples, 2)
+    :param threshold: Decision threshold for classifying a sample as positive
+        or negative based on predicted probabilities.
+    :type threshold: float
+    :param class_names: List of class names for labeling the classification report.
+    :type class_names: list of str
+    :return: None. Outputs performance metrics and the confusion matrix to the logging
+        system and the console.
+    """
+    y_pred = (y_proba[:, 1] >= threshold).astype(int)
+    acc = accuracy_score(y_true, y_pred)
+    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
+    cm = confusion_matrix(y_true, y_pred)
+
+    logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
+    print("Confusion Matrix:\n", cm)
+    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
+
+
+def cross_validate(cfg: Config, X, y, vocab_size: int):
+    """
+    Evaluates the model using stratified K-fold cross-validation. For each fold, a fresh
+    model is built, trained on the training subset, and scored on the validation subset.
+    Fold accuracies and the overall mean accuracy with its standard deviation are logged.
+
+    :param cfg: The configuration object containing hyperparameters and settings for
+        cross-validation, random state, and training.
+    :type cfg: Config
+    :param X: The input data samples provided as a dataset.
+    :type X: numpy.ndarray
+    :param y: The target labels corresponding to the input data samples.
+    :type y: numpy.ndarray
+    :param vocab_size: The size of the vocabulary, used to configure the embedding layer.
+    :type vocab_size: int
+    :return: None. Fold accuracies and the aggregate statistics are logged.
+    :rtype: None
+    """
+    logging.info(f"Running {cfg.cv}-fold cross-validation")
+    skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
+    accuracies = []
+
+    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
+        logging.info(f"Fold {fold + 1}")
+        model = build_model(cfg, vocab_size)
+        model.fit(X[train_idx], y[train_idx],
+                  epochs=cfg.epochs,
+                  batch_size=cfg.batch_size,
+                  verbose=0)
+        y_pred = model.predict(X[val_idx])
+        acc = accuracy_score(y[val_idx], y_pred.argmax(axis=1))
+        accuracies.append(acc)
+        logging.info(f"Fold {fold + 1} Accuracy: {acc:.4f}")
+
+    logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
+
+
+def save_artifacts(model, tokenizer, encoder):
+    """
+    Saves the trained model and its associated artifacts (tokenizer and label encoder)
+    to predefined file paths, so that they can be reloaded later for inference.
+
+    :param model: The machine learning model to be saved.
+    :param tokenizer: The tokenizer used for preparing data for the model.
+    :param encoder: The label encoder used for encoding target labels.
+    :return: None
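+
+    Example (illustrative; reloading for inference)::
+
+        from tensorflow.keras.models import load_model
+        model = load_model(os.path.join(GENDER_MODELS_DIR, "transformer.h5"))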
+    """
+    model_path = os.path.join(GENDER_MODELS_DIR, "transformer.h5")
+    tokenizer_path = os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl")
+    encoder_path = os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl")
+
+    model.save(model_path)
+    with open(tokenizer_path, "wb") as f:
+        pickle.dump(tokenizer, f)
+    with open(encoder_path, "wb") as f:
+        pickle.dump(encoder, f)
+    logging.info("Model and artifacts saved.")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Train Transformer model for name-based gender classification")
+    parser.add_argument("--dataset", type=str, default="names.csv")
+    parser.add_argument("--size", type=int)
+    parser.add_argument("--threshold", type=float, default=0.5)
+    parser.add_argument("--cv", type=int)
+    parser.add_argument("--save", action="store_true")
+    args = parser.parse_args()
+
+    cfg = Config(
+        dataset_path=args.dataset,
+        size=args.size,
+        threshold=args.threshold,
+        cv=args.cv,
+        save=args.save
+    )
+
+    X, y, tokenizer, encoder = load_and_prepare(cfg)
+    vocab_size = len(tokenizer.word_index) + 1
+
+    if cfg.cv:
+        cross_validate(cfg, X, y, vocab_size)
+        return
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
+    )
+
+    model = build_model(cfg, vocab_size)
+    model.summary()
+
+    logging.info("Training Transformer model")
+    model.fit(
+        X_train, y_train,
+        validation_split=0.1,
+        epochs=cfg.epochs,
+        batch_size=cfg.batch_size,
+        callbacks=[ProgbarLogger()]
+    )
+
+    y_proba = model.predict(X_test)
+    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
+
+    if cfg.save:
+        save_artifacts(model, tokenizer, encoder)
+
+
+if __name__ == "__main__":
+    main()