feat: improve inference for logreg model
@@ -4,7 +4,7 @@
 __pycache__/
 .ipynb_checkpoints/
 *.pyc
-models/
+/models/
 .env.local
 var/
 /dataset/
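The only change in this hunk is `models/` becoming `/models/`: the leading slash anchors the pattern to the repository root, so only the top-level `models/` directory is ignored and nested `models/` directories elsewhere in the tree remain tracked.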
@@ -0,0 +1,274 @@
import argparse
import logging
import os
import pickle
from dataclasses import dataclass
from typing import Tuple, Optional

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, classification_report, precision_recall_fscore_support, confusion_matrix
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ProgbarLogger
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from misc import GENDER_MODELS_DIR, load_csv_dataset

logging.basicConfig(level=logging.INFO, format=">> %(message)s")

@dataclass
class Config:
    """
    Configuration for the machine learning model and its training process.

    This class encapsulates the configuration options necessary for initializing,
    training, and evaluating a machine learning model. It allows flexibility
    in specifying dataset details, model parameters, training settings, and
    options for evaluation. Attributes include paths, numerical parameters,
    and flags that guide the model's behavior.

    :ivar dataset_path: Path to the dataset file.
    :type dataset_path: str
    :ivar size: Optional size of the dataset to use. If None, use the full dataset.
    :type size: Optional[int]
    :ivar max_len: Maximum length of sequences used in the model.
    :type max_len: int
    :ivar embedding_dim: Dimensionality of the embedding layer.
    :type embedding_dim: int
    :ivar lstm_units: Number of LSTM units in the model.
    :type lstm_units: int
    :ivar batch_size: Batch size to use during training.
    :type batch_size: int
    :ivar epochs: Number of epochs for model training.
    :type epochs: int
    :ivar test_size: Fraction of data to use for testing.
    :type test_size: float
    :ivar random_state: Seed for random number generation to ensure reproducibility.
    :type random_state: int
    :ivar threshold: Decision threshold for binary classification tasks.
    :type threshold: float
    :ivar cv: Number of cross-validation folds. If None, no cross-validation is used.
    :type cv: Optional[int]
    :ivar save: Flag indicating whether to save the trained model.
    :type save: bool
    """
    dataset_path: str
    size: Optional[int] = None
    max_len: int = 6
    embedding_dim: int = 64
    lstm_units: int = 32
    batch_size: int = 64
    epochs: int = 10
    test_size: float = 0.2
    random_state: int = 42
    threshold: float = 0.5
    cv: Optional[int] = None
    save: bool = False

def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Load and preprocess the dataset based on the provided configuration.

    This function performs a series of operations including loading the dataset
    from the specified path, cleaning and preprocessing data (e.g., converting
    to lowercase, stripping whitespace, handling missing values), tokenizing names
    using a tokenizer, and encoding the labels using a label encoder. The final processed
    data and tools (tokenizer and label encoder) are returned for further use.

    :param cfg: Config object containing dataset parameters such as dataset path, size, and
        maximum sequence length.
    :type cfg: Config
    :return: A tuple containing processed padded sequences (numpy ndarray), corresponding
        encoded labels (numpy ndarray), tokenizer object used for preprocessing names,
        and label encoder object used for encoding labels.
    :rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]
    """
    logging.info("Loading and preprocessing data")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"])
    df["name"] = df["name"].str.lower().str.strip()
    df["sex"] = df["sex"].str.lower().str.strip()

    tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])
    sequences = tokenizer.texts_to_sequences(df["name"])
    padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df["sex"])

    return padded, labels, tokenizer, label_encoder

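# Illustration (hypothetical data, not from the commit): with char_level=False,
# a whole name is a single word-level token, so tokenizer.texts_to_sequences(["anna"])
# yields something like [[7]], and pad_sequences(..., maxlen=6, padding="post")
# turns it into [7, 0, 0, 0, 0, 0] -- most of the max_len=6 slots are padding.
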
def build_model(cfg: Config, vocab_size: int) -> Sequential:
    logging.info("Building LSTM model")
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim),
        Bidirectional(LSTM(cfg.lstm_units)),
        Dense(32, activation="relu"),
        Dense(2, activation="softmax")
    ])
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

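# Shape sketch for a batch of size B (derived from the defaults above, not stated
# in the commit): input (B, max_len) -> Embedding (B, max_len, 64)
# -> Bidirectional LSTM (B, 2 * lstm_units = 64) -> Dense (B, 32) -> softmax (B, 2).
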
def evaluate_proba(y_true, y_proba, threshold, class_names):
    """
    Evaluate the performance of a binary classification model by calculating key metrics and printing
    a detailed classification report.

    This function thresholds the predicted probabilities to produce binary predictions and calculates
    metrics such as accuracy, precision, recall, and F1 score. It also generates a confusion matrix
    and a classification report for the model's performance. Additionally, metrics are logged and
    informational outputs are printed.

    :param y_true: Ground truth binary labels. Must be a 1-dimensional array or list of integers.
    :param y_proba: Predicted probabilities for each class from the model. It is a 2-dimensional array
        where the second dimension represents class probabilities for each sample.
    :param threshold: Threshold value for converting probabilities into binary predictions. Should be
        a float between 0 and 1.
    :param class_names: List of class names corresponding to the binary labels. Used for labeling the
        classification report.
    :return: None
    """
    # Vectorized thresholding; the scalar conditional `1 if y_proba[:, 1] >= threshold else 0`
    # would raise "truth value of an array is ambiguous" on NumPy arrays.
    y_pred = (y_proba[:, 1] >= threshold).astype(int)
    acc = accuracy_score(y_true, y_pred)
    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    cm = confusion_matrix(y_true, y_pred)

    logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
    print("Confusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))

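# Example of the threshold semantics (hypothetical probabilities): with
# threshold=0.5, a row [0.3, 0.7] is predicted as class 1; raising the threshold
# to 0.8 flips that same row to class 0, trading recall on class 1 for precision.
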
def cross_validate(cfg: Config, X, y, vocab_size: int):
    """
    Performs k-fold cross-validation on a dataset using a specified model configuration.

    This function takes a dataset and corresponding labels, splits the dataset into
    k folds (based on the `cv` attribute of the provided configuration object), and
    performs cross-validation using the specified deep learning model. The model is
    built and trained on the training subset for each fold, and the validation subset
    is used to compute accuracy scores. Finally, it logs the individual fold accuracies
    and the overall mean accuracy with its standard deviation.

    :param cfg: Configuration object containing the parameters for cross-validation,
        model training, and other settings. `cv` specifies the number of folds,
        and other attributes such as `epochs`, `batch_size`, and `random_state`
        dictate the training and reproducibility behavior.
    :type cfg: Config
    :param X: Feature data for the dataset. Assumes the input is compatible with the
        model configuration.
    :param y: True labels corresponding to the dataset. The order should correspond
        to the feature set `X`.
    :param vocab_size: Total vocabulary size used for building the model. Determines
        the structure of the model input.
    :type vocab_size: int
    :return: A list containing the accuracy scores for each fold.
    :rtype: List[float]
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
    accuracies = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        logging.info(f"Fold {fold + 1}")
        model = build_model(cfg, vocab_size)
        model.fit(X[train_idx], y[train_idx],
                  epochs=cfg.epochs,
                  batch_size=cfg.batch_size,
                  verbose=0)
        y_pred = model.predict(X[val_idx])
        acc = accuracy_score(y[val_idx], y_pred.argmax(axis=1))
        accuracies.append(acc)
        logging.info(f"Fold {fold + 1} Accuracy: {acc:.4f}")

    logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
    return accuracies  # match the documented List[float] return

def save_artifacts(model, tokenizer, encoder):
    """
    Save the model, tokenizer, and label encoder artifacts to predefined file paths
    within the GENDER_MODELS_DIR directory. The function ensures that the model is
    saved in H5 format, while the tokenizer and encoder are serialized using the
    Pickle module. It logs a message indicating the completion of the saving process.

    :param model: The machine learning model object to be saved.
    :type model: Any
    :param tokenizer: The tokenizer object used in preprocessing, to be serialized
        for future use.
    :type tokenizer: Any
    :param encoder: The label encoder object used for encoding labels during
        training, to be serialized for future use.
    :type encoder: Any
    :return: None
    """
    model_path = os.path.join(GENDER_MODELS_DIR, "BiLSTM_model.h5")
    tokenizer_path = os.path.join(GENDER_MODELS_DIR, "BiLSTM_tokenizer.pkl")
    encoder_path = os.path.join(GENDER_MODELS_DIR, "BiLSTM_label_encoder.pkl")

    model.save(model_path)
    with open(tokenizer_path, "wb") as f:
        pickle.dump(tokenizer, f)
    with open(encoder_path, "wb") as f:
        pickle.dump(encoder, f)
    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")

def main():
    parser = argparse.ArgumentParser(description="Train BiLSTM model for name-based gender classification")
    parser.add_argument("--dataset", type=str, default="names.csv")
    parser.add_argument("--size", type=int)
    parser.add_argument("--threshold", type=float, default=0.5)
    parser.add_argument("--cv", type=int)
    parser.add_argument("--save", action="store_true")
    args = parser.parse_args()

    cfg = Config(
        dataset_path=args.dataset,
        size=args.size,
        threshold=args.threshold,
        cv=args.cv,
        save=args.save
    )

    X, y, tokenizer, encoder = load_and_prepare(cfg)
    vocab_size = len(tokenizer.word_index) + 1

    if cfg.cv:
        cross_validate(cfg, X, y, vocab_size)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
    )

    model = build_model(cfg, vocab_size)
    model.summary()

    logging.info("Training model")
    model.fit(X_train, y_train,
              validation_split=0.1,
              epochs=cfg.epochs,
              batch_size=cfg.batch_size,
              callbacks=[ProgbarLogger()])

    y_proba = model.predict(X_test)
    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, tokenizer, encoder)


if __name__ == "__main__":
    main()
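# Usage sketch (hypothetical invocation; the script name is not shown in the diff,
# and the CSV must expose `name` and `sex` columns via misc.load_csv_dataset):
#   python <this_script>.py --dataset names.csv --size 50000 --cv 5
#   python <this_script>.py --dataset names.csv --threshold 0.6 --save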
@@ -0,0 +1,222 @@
import argparse
import logging
import os
import pickle
from dataclasses import dataclass
from typing import Tuple, Optional

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder

from misc import GENDER_MODELS_DIR, load_csv_dataset

logging.basicConfig(level=logging.INFO, format=">> %(message)s")

@dataclass
class Config:
    dataset_path: str
    size: Optional[int]
    test_size: float = 0.2
    ngram_range: Tuple[int, int] = (2, 5)
    max_iter: int = 1000
    random_state: int = 42
    threshold: float = 0.5
    cv: Optional[int] = None
    save: bool = False

def load_and_clean_data(cfg: Config) -> Tuple[pd.Series, pd.Series]:
    """
    Load and clean the dataset as specified by the provided configuration. This function reads
    a CSV dataset from the path specified in the configuration, processes it to remove
    missing values from key columns ('name' and 'sex'), and cleans string data in these
    columns by converting them to lowercase and stripping whitespace. The cleaned data
    is then returned as two separate pandas Series objects.

    :param cfg: Configuration object specifying the dataset path and size
    :type cfg: Config
    :return: A tuple containing cleaned `name` and `sex` data as pandas Series objects
    :rtype: Tuple[pd.Series, pd.Series]
    """
    logging.info(f"Loading dataset from {cfg.dataset_path}")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size))
    df = df.dropna(subset=["name", "sex"])
    df["name"] = df["name"].str.lower().str.strip()
    df["sex"] = df["sex"].str.lower().str.strip()
    return df["name"], df["sex"]

def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
    """
    Encode the labels of a given pandas Series using a LabelEncoder. This process maps categorical
    labels to integers, which is particularly useful for machine learning models that require
    numerical input data.

    :param y: A pandas Series of categorical labels to be encoded.
    :type y: pd.Series
    :return: A tuple containing the encoded labels (as returned by ``fit_transform``)
        and the fitted LabelEncoder object.
    :rtype: Tuple[pd.Series, LabelEncoder]
    """
    logging.info("Encoding labels")
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)
    return y_encoded, encoder

def build_model(cfg: Config) -> Pipeline:
    """
    Builds a machine learning pipeline for text classification.

    This function constructs and returns a scikit-learn pipeline that consists of
    a `CountVectorizer` and a `LogisticRegression` classifier. The vectorizer
    leverages character-level n-grams based on the provided configuration, and the
    logistic regression model is trained with a maximum number of iterations defined
    in the configuration. This pipeline is used for processing text data and training
    classification models.

    :param cfg: Configuration object containing the n-gram range and the maximum
        number of iterations for the logistic regression model.
    :type cfg: Config
    :return: A scikit-learn pipeline with a `CountVectorizer` and `LogisticRegression`
        based on the provided configuration.
    :rtype: Pipeline
    """
    return make_pipeline(
        CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range),
        LogisticRegression(max_iter=cfg.max_iter)
    )

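# Illustration (hypothetical name, not from the commit): with analyzer="char" and
# ngram_range=(2, 5), the name "anna" is featurized as its character n-grams
# "an", "nn", "na", "ann", "nna", "anna", whose counts the logistic regression
# weighs to score each class.
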
def evaluate_probabilities(y_true, y_proba, threshold: float, class_names):
    """
    Evaluates the performance of a classification model using a specified threshold
    for predicted probabilities. Computes metrics such as accuracy, precision,
    recall, F1-score, and the confusion matrix. Also generates a classification
    report with detailed metrics for each class.

    Logs the evaluation metrics at the specified threshold and prints the confusion
    matrix and classification report.

    :param y_true: Ground truth (correct) labels.
    :type y_true: array-like
    :param y_proba: Predicted probabilities for each class, where each row
        corresponds to an instance and contains probabilities for each target class.
    :type y_proba: numpy.ndarray
    :param threshold: The threshold on predicted probabilities to determine
        class membership for each instance.
    :type threshold: float
    :param class_names: List of class names for the target variable used in the
        classification report.
    :type class_names: list of str
    :return: None
    """
    logging.info(f"Evaluating at threshold = {threshold}")
    y_pred = (y_proba[:, 1] >= threshold).astype(int)
    acc = accuracy_score(y_true, y_pred)
    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    cm = confusion_matrix(y_true, y_pred)

    logging.info(f"Accuracy: {acc:.4f}")
    logging.info(f"Precision: {pr:.4f}, Recall: {rc:.4f}, F1-score: {f1:.4f}")
    print("Confusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))

def cross_validate(cfg: Config, X, y) -> None:
    """
    Performs k-fold cross-validation on the provided dataset using the configuration and
    logs the results including individual fold scores, mean accuracy, and the standard
    deviation of the scores.

    :param cfg: Configuration object containing cross-validation settings such as the
        number of folds to use in the cross-validation (`cv`).
    :type cfg: Config
    :param X: Input feature matrix for the dataset to be used for cross-validation.
    :type X: Any
    :param y: Target labels corresponding to the input feature matrix `X`.
    :type y: Any
    :return: This function does not return any value. Results are logged.
    :rtype: None
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    pipeline = build_model(cfg)
    scores = cross_val_score(pipeline, X, y, cv=StratifiedKFold(n_splits=cfg.cv), scoring="accuracy")
    logging.info(f"Cross-validation scores: {scores}")
    logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")

def save_artifacts(model, encoder, cfg: Config):
    """
    Saves the machine learning model and label encoder artifacts as pickle files
    within the gender models directory (GENDER_MODELS_DIR). The model and encoder
    are serialized with the Pickle module; the output paths are fixed, and the
    `cfg` parameter is accepted for interface consistency.

    :param model: The machine learning model object to be saved.
    :type model: Any
    :param encoder: The label encoder object used for data preprocessing.
    :type encoder: Any
    :param cfg: Configuration object containing application-specific settings.
    :type cfg: Config
    :return: None
    """
    model_path = os.path.join(GENDER_MODELS_DIR, "regression_model.pkl")
    encoder_path = os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl")

    with open(model_path, "wb") as f:
        pickle.dump(model, f)
    with open(encoder_path, "wb") as f:
        pickle.dump(encoder, f)
    logging.info(f"Saved model to: {model_path}")
    logging.info(f"Saved label encoder to: {encoder_path}")

def main():
    parser = argparse.ArgumentParser(description="Train a gender classifier on names")
    parser.add_argument("--dataset", type=str, default="names.csv", help="Path to dataset")
    parser.add_argument("--size", type=int, help="Number of rows to load")
    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for binary decision")
    parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
    parser.add_argument("--save", action="store_true", help="Save the model and encoder")
    args = parser.parse_args()

    cfg = Config(
        dataset_path=args.dataset,
        size=args.size,
        threshold=args.threshold,
        cv=args.cv,
        save=args.save
    )

    X_raw, y_raw = load_and_clean_data(cfg)
    y_encoded, encoder = encode_labels(y_raw)

    if cfg.cv:
        cross_validate(cfg, X_raw, y_encoded)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X_raw, y_encoded, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y_encoded
    )

    model = build_model(cfg)
    model.fit(X_train, y_train)

    y_proba = model.predict_proba(X_test)
    evaluate_probabilities(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, encoder, cfg)


if __name__ == "__main__":
    main()
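# Usage sketch (hypothetical invocation; same dataset assumptions as the BiLSTM
# script above):
#   python <this_script>.py --dataset names.csv --cv 5
#   python <this_script>.py --dataset names.csv --threshold 0.55 --save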
@@ -0,0 +1,320 @@
import argparse
import logging
import os
import pickle
from dataclasses import dataclass
from typing import Tuple, Optional

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    classification_report, confusion_matrix
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ProgbarLogger
from tensorflow.keras.layers import (
    Input, Embedding, Dense, GlobalAveragePooling1D,
    MultiHeadAttention, Dropout, LayerNormalization
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from misc import GENDER_MODELS_DIR, load_csv_dataset

logging.basicConfig(level=logging.INFO, format=">> %(message)s")

@dataclass
class Config:
    """
    Configuration data class used to store settings and parameters for a machine learning or deep
    learning model.

    This class allows the user to specify various parameters such as dataset path, size of input,
    model architecture details like embedding dimensions, transformer configurations, training settings
    like batch size and epochs, and validation and testing settings. The attributes provide flexibility
    to customize model configuration and training processes.

    :ivar dataset_path: The file path to the dataset.
    :type dataset_path: str
    :ivar size: Optional sample size; if None, the full dataset is used.
    :type size: Optional[int]
    :ivar max_len: Maximum sequence length for input data, used often in text or sequence
        processing.
    :type max_len: int
    :ivar embedding_dim: The dimensionality of embeddings used in the model.
    :type embedding_dim: int
    :ivar transformer_head_size: The size of each transformer attention head.
    :type transformer_head_size: int
    :ivar transformer_num_heads: The number of attention heads in the transformer model.
    :type transformer_num_heads: int
    :ivar transformer_ff_dim: The dimensionality of the feed-forward network in the transformer.
    :type transformer_ff_dim: int
    :ivar dropout: Dropout rate used for regularization during training.
    :type dropout: float
    :ivar batch_size: Batch size used for training and validation.
    :type batch_size: int
    :ivar epochs: Number of epochs for model training.
    :type epochs: int
    :ivar test_size: Proportion of the dataset to be used for testing.
    :type test_size: float
    :ivar random_state: Random seed value for reproducibility.
    :type random_state: int
    :ivar threshold: Threshold value for model predictions or classification.
    :type threshold: float
    :ivar cv: Number of cross-validation folds, if applicable.
    :type cv: Optional[int]
    :ivar save: Boolean flag indicating whether to save the model after training.
    :type save: bool
    """
    dataset_path: str
    size: Optional[int]
    max_len: int = 6
    embedding_dim: int = 64
    transformer_head_size: int = 64
    transformer_num_heads: int = 2
    transformer_ff_dim: int = 128
    dropout: float = 0.1
    batch_size: int = 64
    epochs: int = 10
    test_size: float = 0.2
    random_state: int = 42
    threshold: float = 0.5
    cv: Optional[int] = None
    save: bool = False

def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Load and preprocess data for model training or evaluation. This function handles the
    loading of a dataset in CSV format, applies preprocessing to clean and normalize
    the input data, tokenizes text features, and encodes categorical labels.

    The preprocessed data is prepared as padded sequences and encoded labels, which
    can be directly used as inputs for machine learning models. Tokenizer and LabelEncoder
    are returned to ensure consistency between training and inference stages.

    :param cfg: Configuration object containing dataset path, size of the
        dataset to load, and maximum length for padding sequences.
    :type cfg: Config
    :return: A tuple containing padded input sequences for the model, encoded labels,
        the tokenizer used for text sequences, and the encoder used for labels.
    :rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]
    """
    logging.info("Loading and preprocessing data")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"])
    df["name"] = df["name"].str.lower().str.strip()
    df["sex"] = df["sex"].str.lower().str.strip()

    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])
    sequences = tokenizer.texts_to_sequences(df["name"])
    padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")

    encoder = LabelEncoder()
    labels = encoder.fit_transform(df["sex"])
    return padded, labels, tokenizer, encoder

def transformer_encoder(x, cfg: Config):
    """
    Transforms the input tensor using a single Transformer encoder block with attention and feedforward
    layers. The encoder applies multi-head attention to the input tensor, adds the output to
    the original tensor for a residual connection, and normalizes it. Subsequently, the processed
    tensor passes through a feedforward network with added dropout and normalization.

    :param x: Input tensor to be transformed.
    :type x: TensorFlow tensor
    :param cfg: Configuration object containing Transformer hyperparameters such as the number of
        attention heads, head size, feedforward dimension, and dropout rate.
    :type cfg: Config
    :return: Transformed tensor resulting from applying the Transformer encoder block.
    :rtype: TensorFlow tensor
    """
    attn = MultiHeadAttention(num_heads=cfg.transformer_num_heads, key_dim=cfg.transformer_head_size)(x, x)
    x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attn))

    ff = Dense(cfg.transformer_ff_dim, activation="relu")(x)
    ff = Dense(x.shape[-1])(ff)
    return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(ff))

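# The block above follows the standard post-norm residual pattern:
#   x -> MultiHeadAttention(x, x) -> Dropout -> Add(x) -> LayerNorm
#     -> Dense(ff_dim, relu) -> Dense(embed_dim) -> Dropout -> Add -> LayerNorm
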
def build_model(cfg: Config, vocab_size: int) -> Model:
    """
    Builds a Transformer-based model using Keras/TensorFlow components. The model
    is designed for classification tasks, utilizing embedding layers with positional
    encoding, a Transformer encoder block, and fully connected layers for
    output generation.

    :param cfg: Configuration object containing model-specific hyperparameters
        such as maximum sequence length, embedding dimensions, etc.
    :type cfg: Config
    :param vocab_size: The size of the vocabulary for the embedding layer.
    :type vocab_size: int
    :return: A compiled Keras model, ready for training and evaluation.
    :rtype: Model
    """
    logging.info("Building Transformer model")
    inputs = Input(shape=(cfg.max_len,))
    x = Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim)(inputs)

    # Add positional encoding
    positions = tf.range(start=0, limit=cfg.max_len, delta=1)
    pos_embedding = Embedding(input_dim=cfg.max_len, output_dim=cfg.embedding_dim)(positions)
    x = x + pos_embedding

    x = transformer_encoder(x, cfg)
    x = GlobalAveragePooling1D()(x)
    x = Dense(32, activation="relu")(x)
    outputs = Dense(2, activation="softmax")(x)

    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

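# Shape sketch for a batch of size B (derived from the defaults above): input
# (B, 6) -> token + learned positional embeddings (B, 6, 64) -> encoder block
# (B, 6, 64) -> GlobalAveragePooling1D (B, 64) -> Dense (B, 32) -> softmax (B, 2).
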
def evaluate_proba(y_true, y_proba, threshold, class_names):
    """
    Evaluates the performance of a binary classification model by calculating accuracy,
    precision, recall, F1 score, confusion matrix, and generates a classification
    report. This function takes the true labels, predicted probabilities, a decision
    threshold, and class names to assist in the evaluation.

    :param y_true: Ground truth (correct) target values.
    :type y_true: array-like of shape (n_samples,)
    :param y_proba: Predicted probabilities for each class. Expected to be an array
        where the second column corresponds to the probability of the positive class.
    :type y_proba: array-like of shape (n_samples, 2)
    :param threshold: Decision threshold for classifying a sample as positive
        or negative based on predicted probabilities.
    :type threshold: float
    :param class_names: List of class names for labeling the classification report.
    :type class_names: list of str
    :return: None. Outputs performance metrics and confusion matrix to the logging
        system and the console.
    """
    # Vectorized thresholding; the scalar conditional `1 if ... else 0` would raise
    # "truth value of an array is ambiguous" on NumPy arrays.
    y_pred = (y_proba[:, 1] >= threshold).astype(int)
    acc = accuracy_score(y_true, y_pred)
    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
    cm = confusion_matrix(y_true, y_pred)

    logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
    print("Confusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))

def cross_validate(cfg: Config, X, y, vocab_size: int):
    """
    Evaluate the performance of a model using K-fold cross-validation. This function takes
    configuration settings, input data, target labels, and vocabulary size to perform the
    specified number of cross-validation folds with a stratified approach. For each fold,
    it builds a new model, trains it, predicts the validation set, and calculates accuracy.

    :param cfg: The configuration object containing hyperparameters and settings for
        cross-validation, random state, and training.
    :type cfg: Config
    :param X: The input data samples provided as a dataset.
    :type X: numpy.ndarray
    :param y: The target labels corresponding to the input data samples.
    :type y: numpy.ndarray
    :param vocab_size: The size of the vocabulary, used to configure the language model.
    :type vocab_size: int
    :return: A list containing accuracy scores from each fold in the cross-validation process.
    :rtype: list
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
    accuracies = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        logging.info(f"Fold {fold + 1}")
        model = build_model(cfg, vocab_size)
        model.fit(X[train_idx], y[train_idx],
                  epochs=cfg.epochs,
                  batch_size=cfg.batch_size,
                  verbose=0)
        y_pred = model.predict(X[val_idx])
        acc = accuracy_score(y[val_idx], y_pred.argmax(axis=1))
        accuracies.append(acc)
        logging.info(f"Fold {fold + 1} Accuracy: {acc:.4f}")

    logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
    return accuracies  # match the documented list return

def save_artifacts(model, tokenizer, encoder):
    """
    Saves the machine learning model and its associated artifacts such as tokenizer and
    label encoder to predefined file paths. This function ensures that the model and
    artifacts can be reloaded later for inference or further use.

    :param model: The machine learning model to be saved.
    :param tokenizer: The tokenizer used for preparing data for the model.
    :param encoder: The label encoder used for encoding target labels.
    :return: None
    """
    model_path = os.path.join(GENDER_MODELS_DIR, "transformer.h5")
    tokenizer_path = os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl")
    encoder_path = os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl")

    model.save(model_path)
    with open(tokenizer_path, "wb") as f:
        pickle.dump(tokenizer, f)
    with open(encoder_path, "wb") as f:
        pickle.dump(encoder, f)
    logging.info("Model and artifacts saved.")

def main():
    parser = argparse.ArgumentParser(description="Train Transformer model for name-based gender classification")
    parser.add_argument("--dataset", type=str, default="names.csv")
    parser.add_argument("--size", type=int)
    parser.add_argument("--threshold", type=float, default=0.5)
    parser.add_argument("--cv", type=int)
    parser.add_argument("--save", action="store_true")
    args = parser.parse_args()

    cfg = Config(
        dataset_path=args.dataset,
        size=args.size,
        threshold=args.threshold,
        cv=args.cv,
        save=args.save
    )

    X, y, tokenizer, encoder = load_and_prepare(cfg)
    vocab_size = len(tokenizer.word_index) + 1

    if cfg.cv:
        cross_validate(cfg, X, y, vocab_size)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
    )

    model = build_model(cfg, vocab_size)
    model.summary()

    logging.info("Training Transformer model")
    model.fit(
        X_train, y_train,
        validation_split=0.1,
        epochs=cfg.epochs,
        batch_size=cfg.batch_size,
        callbacks=[ProgbarLogger()]
    )

    y_proba = model.predict(X_test)
    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, tokenizer, encoder)


if __name__ == "__main__":
    main()
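# Usage sketch (hypothetical invocation; same dataset assumptions as the scripts above):
#   python <this_script>.py --dataset names.csv --cv 5
#   python <this_script>.py --dataset names.csv --size 100000 --save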