From 25f1df46d89fd5d4e1a83e9e6020a59c93694b38 Mon Sep 17 00:00:00 2001
From: bernard-ng
Date: Sat, 21 Jun 2025 10:35:48 +0200
Subject: [PATCH] feat: improve inference for logreg model

---
 .gitignore                        |   2 +-
 ners/gender/models/__init__.py    |   0
 ners/gender/models/bilstm.py      | 274 +++++++++++++++++++++++++
 ners/gender/models/regression.py  | 222 +++++++++++++++++++++
 ners/gender/models/transformer.py | 320 ++++++++++++++++++++++++++++++
 5 files changed, 817 insertions(+), 1 deletion(-)
 create mode 100644 ners/gender/models/__init__.py
 create mode 100644 ners/gender/models/bilstm.py
 create mode 100644 ners/gender/models/regression.py
 create mode 100644 ners/gender/models/transformer.py

diff --git a/.gitignore b/.gitignore
index 82cdb45..1068dc2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,7 @@
 __pycache__/
 .ipynb_checkpoints/
 *.pyc
-models/
+/models/
 .env.local
 var/
 /dataset/
diff --git a/ners/gender/models/__init__.py b/ners/gender/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ners/gender/models/bilstm.py b/ners/gender/models/bilstm.py
new file mode 100644
index 0000000..4ee268e
--- /dev/null
+++ b/ners/gender/models/bilstm.py
@@ -0,0 +1,274 @@
+import argparse
+import logging
+import os
+import pickle
+from dataclasses import dataclass
+from typing import Tuple, Optional
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics import (
+    accuracy_score, classification_report, precision_recall_fscore_support, confusion_matrix
+)
+from sklearn.model_selection import train_test_split, StratifiedKFold
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras.callbacks import ProgbarLogger
+from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+
+from misc import GENDER_MODELS_DIR, load_csv_dataset
+
+logging.basicConfig(level=logging.INFO, format=">> %(message)s")
+
+
+@dataclass
+class Config:
+    """
+    Configuration for the machine learning model and its training process.
+
+    This class encapsulates the configuration options necessary for initializing,
+    training, and evaluating the model: dataset selection, model hyperparameters,
+    training settings, and evaluation options.
+
+    :ivar dataset_path: Path to the dataset file.
+    :type dataset_path: str
+    :ivar size: Optional size of the dataset to use. If None, use the full dataset.
+    :type size: Optional[int]
+    :ivar max_len: Maximum length of sequences used in the model.
+    :type max_len: int
+    :ivar embedding_dim: Dimensionality of the embedding layer.
+    :type embedding_dim: int
+    :ivar lstm_units: Number of LSTM units in the model.
+    :type lstm_units: int
+    :ivar batch_size: Batch size to use during training.
+    :type batch_size: int
+    :ivar epochs: Number of epochs for model training.
+    :type epochs: int
+    :ivar test_size: Fraction of data to use for testing.
+    :type test_size: float
+    :ivar random_state: Seed for random number generation to ensure reproducibility.
+    :type random_state: int
+    :ivar threshold: Decision threshold for binary classification tasks.
+    :type threshold: float
+    :ivar cv: Number of cross-validation folds. If None, no cross-validation is used.
+    :type cv: Optional[int]
+    :ivar save: Flag indicating whether to save the trained model.
+    :type save: bool
+    """
+    dataset_path: str
+    size: Optional[int] = None
+    max_len: int = 6
+    embedding_dim: int = 64
+    lstm_units: int = 32
+    batch_size: int = 64
+    epochs: int = 10
+    test_size: float = 0.2
+    random_state: int = 42
+    threshold: float = 0.5
+    cv: Optional[int] = None
+    save: bool = False
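+
+# Illustrative usage (not part of the CLI flow): Config is a plain dataclass,
+# so hyperparameters can also be overridden directly in code, e.g.
+#   cfg = Config(dataset_path="names.csv", epochs=20, cv=5)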
+
+
+def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
+    """
+    Load and preprocess the dataset based on the provided configuration.
+
+    This function loads the dataset from the specified path, cleans it
+    (lowercasing, stripping whitespace, dropping rows with missing values),
+    tokenizes the names, pads the resulting sequences, and encodes the labels.
+    The fitted tokenizer and label encoder are returned for reuse at inference time.
+
+    :param cfg: Config object containing dataset parameters such as dataset path, size, and
+        maximum sequence length.
+    :type cfg: Config
+    :return: A tuple containing processed padded sequences (numpy ndarray), corresponding
+        encoded labels (numpy ndarray), the tokenizer used for preprocessing names,
+        and the label encoder used for encoding labels.
+    :rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]
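+
+    Example (illustrative; assumes the CSV provides ``name`` and ``sex`` columns)::
+
+        X, y, tokenizer, encoder = load_and_prepare(Config(dataset_path="names.csv"))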
+    """
+    logging.info("Loading and preprocessing data")
+    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"])
+    df["name"] = df["name"].str.lower().str.strip()
+    df["sex"] = df["sex"].str.lower().str.strip()
+
+    tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
+    tokenizer.fit_on_texts(df["name"])
+    sequences = tokenizer.texts_to_sequences(df["name"])
+    padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")
+
+    label_encoder = LabelEncoder()
+    labels = label_encoder.fit_transform(df["sex"])
+
+    return padded, labels, tokenizer, label_encoder
+
+
+def build_model(cfg: Config, vocab_size: int) -> Sequential:
+    logging.info("Building LSTM model")
+    model = Sequential([
+        Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim),
+        Bidirectional(LSTM(cfg.lstm_units)),
+        Dense(32, activation="relu"),
+        Dense(2, activation="softmax")
+    ])
+    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
+    return model
+
+
+def evaluate_proba(y_true, y_proba, threshold, class_names):
+    """
+    Evaluate the performance of a binary classification model by calculating key metrics
+    and printing a detailed classification report.
+
+    This function thresholds the predicted probabilities to produce binary predictions and
+    calculates accuracy, precision, recall, and F1 score, along with a confusion matrix and
+    a classification report. Metrics are logged and the matrix and report are printed.
+
+    :param y_true: Ground truth binary labels. Must be a 1-dimensional array or list of integers.
+    :param y_proba: Predicted probabilities for each class from the model. It is a 2-dimensional
+        array where the second dimension represents class probabilities for each sample.
+    :param threshold: Threshold value for converting probabilities into binary predictions.
+        Should be a float between 0 and 1.
+    :param class_names: List of class names corresponding to the binary labels. Used for
+        labeling the classification report.
+    :return: None
+    """
+    y_pred = (y_proba[:, 1] >= threshold).astype(int)
+    acc = accuracy_score(y_true, y_pred)
+    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
+    cm = confusion_matrix(y_true, y_pred)
+
+    logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
+    print("Confusion Matrix:\n", cm)
+    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
+
+
+def cross_validate(cfg: Config, X, y, vocab_size: int):
+    """
+    Performs k-fold cross-validation on a dataset using the specified model configuration.
+
+    The dataset is split into ``cfg.cv`` stratified folds; for each fold a fresh model is
+    built, trained on the training subset, and scored on the validation subset. Individual
+    fold accuracies and the overall mean accuracy with its standard deviation are logged.
+
+    :param cfg: Configuration object containing the parameters for cross-validation,
+        model training, and other settings. `cv` specifies the number of folds,
+        and other attributes such as `epochs`, `batch_size`, and `random_state`
+        dictate the training and reproducibility behavior.
+    :type cfg: Config
+    :param X: Feature data for the dataset. Assumes the input is compatible with the
+        model configuration.
+    :param y: True labels corresponding to the dataset. The order should correspond
+        to the feature set `X`.
+    :param vocab_size: Total vocabulary size used for building the model. Determines
+        the structure of the model input.
+    :type vocab_size: int
+    :return: None. Fold accuracies and the aggregate statistics are logged.
+    :rtype: None
+    """
+    logging.info(f"Running {cfg.cv}-fold cross-validation")
+    skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
+    accuracies = []
+
+    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
+        logging.info(f"Fold {fold + 1}")
+        model = build_model(cfg, vocab_size)
+        model.fit(X[train_idx], y[train_idx],
+                  epochs=cfg.epochs,
+                  batch_size=cfg.batch_size,
+                  verbose=0)
+        y_pred = model.predict(X[val_idx])
+        acc = accuracy_score(y[val_idx], y_pred.argmax(axis=1))
+        accuracies.append(acc)
+        logging.info(f"Fold {fold + 1} Accuracy: {acc:.4f}")
+
+    logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
+
+
+def save_artifacts(model, tokenizer, encoder):
+    """
+    Save the model, tokenizer, and label encoder artifacts to predefined file paths
+    within the GENDER_MODELS_DIR directory. The model is saved in H5 format, while
+    the tokenizer and encoder are serialized with pickle.
+
+    :param model: The machine learning model object to be saved.
+    :type model: Any
+    :param tokenizer: The tokenizer object used in preprocessing, to be serialized
+        for future use.
+    :type tokenizer: Any
+    :param encoder: The label encoder object used for encoding labels during
+        training, to be serialized for future use.
+    :type encoder: Any
+    :return: None
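+
+    Example (illustrative; reloading the artifacts later for inference)::
+
+        from tensorflow.keras.models import load_model
+        model = load_model(os.path.join(GENDER_MODELS_DIR, "BiLSTM_model.h5"))
+        with open(os.path.join(GENDER_MODELS_DIR, "BiLSTM_tokenizer.pkl"), "rb") as f:
+            tokenizer = pickle.load(f)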
+    """
+    model_path = os.path.join(GENDER_MODELS_DIR, "BiLSTM_model.h5")
+    tokenizer_path = os.path.join(GENDER_MODELS_DIR, "BiLSTM_tokenizer.pkl")
+    encoder_path = os.path.join(GENDER_MODELS_DIR, "BiLSTM_label_encoder.pkl")
+
+    model.save(model_path)
+    with open(tokenizer_path, "wb") as f:
+        pickle.dump(tokenizer, f)
+    with open(encoder_path, "wb") as f:
+        pickle.dump(encoder, f)
+    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Train BiLSTM model for name-based gender classification")
+    parser.add_argument("--dataset", type=str, default="names.csv")
+    parser.add_argument("--size", type=int)
+    parser.add_argument("--threshold", type=float, default=0.5)
+    parser.add_argument("--cv", type=int)
+    parser.add_argument("--save", action="store_true")
+    args = parser.parse_args()
+
+    cfg = Config(
+        dataset_path=args.dataset,
+        size=args.size,
+        threshold=args.threshold,
+        cv=args.cv,
+        save=args.save
+    )
+
+    X, y, tokenizer, encoder = load_and_prepare(cfg)
+    vocab_size = len(tokenizer.word_index) + 1
+
+    if cfg.cv:
+        cross_validate(cfg, X, y, vocab_size)
+        return
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
+    )
+
+    model = build_model(cfg, vocab_size)
+    model.summary()
+
+    logging.info("Training model")
+    model.fit(X_train, y_train,
+              validation_split=0.1,
+              epochs=cfg.epochs,
+              batch_size=cfg.batch_size,
+              callbacks=[ProgbarLogger()])
+
+    y_proba = model.predict(X_test)
+    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
+
+    if cfg.save:
+        save_artifacts(model, tokenizer, encoder)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ners/gender/models/regression.py b/ners/gender/models/regression.py
new file mode 100644
index 0000000..9f13631
--- /dev/null
+++ b/ners/gender/models/regression.py
@@ -0,0 +1,222 @@
+import argparse
+import logging
+import os
+import pickle
+from dataclasses import dataclass
+from typing import Tuple, Optional
+
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import (
+    accuracy_score, classification_report, confusion_matrix,
+    precision_recall_fscore_support
+)
+from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
+from sklearn.pipeline import make_pipeline, Pipeline
+from sklearn.preprocessing import LabelEncoder
+
+from misc import GENDER_MODELS_DIR, load_csv_dataset
+
+logging.basicConfig(level=logging.INFO, format=">> %(message)s")
+
+
+@dataclass
+class Config:
+    dataset_path: str
+    size: Optional[int] = None
+    test_size: float = 0.2
+    ngram_range: Tuple[int, int] = (2, 5)
+    max_iter: int = 1000
+    random_state: int = 42
+    threshold: float = 0.5
+    cv: Optional[int] = None
+    save: bool = False
+
+
+def load_and_clean_data(cfg: Config) -> Tuple[pd.Series, pd.Series]:
+    """
+    Load and clean the dataset specified by the provided configuration. This function reads
+    a CSV dataset from the path given in the configuration, drops rows with missing values
+    in the key columns ('name' and 'sex'), and normalizes both columns by lowercasing and
+    stripping whitespace. The cleaned data is returned as two separate pandas Series.
+
+    :param cfg: Configuration object specifying the dataset path and size
+    :type cfg: Config
+    :return: A tuple containing cleaned `name` and `sex` data as pandas Series objects
+    :rtype: Tuple[pd.Series, pd.Series]
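+
+    Example (illustrative)::
+
+        names, sexes = load_and_clean_data(Config(dataset_path="names.csv"))
+        # both Series are lowercased and stripped, e.g. " John " -> "john"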
+    """
+    logging.info(f"Loading dataset from {cfg.dataset_path}")
+    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size))
+    df = df.dropna(subset=["name", "sex"])
+    df["name"] = df["name"].str.lower().str.strip()
+    df["sex"] = df["sex"].str.lower().str.strip()
+    return df["name"], df["sex"]
+
+
+def encode_labels(y: pd.Series) -> Tuple[np.ndarray, LabelEncoder]:
+    """
+    Encode the labels of a given pandas Series using a LabelEncoder. This maps categorical
+    labels to integers, as required by models that expect numerical targets.
+
+    :param y: A pandas Series of categorical labels to be encoded.
+    :type y: pd.Series
+    :return: A tuple containing the encoded labels as a NumPy array and the fitted
+        LabelEncoder object.
+    :rtype: Tuple[np.ndarray, LabelEncoder]
+    """
+    logging.info("Encoding labels")
+    encoder = LabelEncoder()
+    y_encoded = encoder.fit_transform(y)
+    return y_encoded, encoder
+
+
+def build_model(cfg: Config) -> Pipeline:
+    """
+    Builds a machine learning pipeline for text classification.
+
+    The pipeline chains a `CountVectorizer` over character-level n-grams (range taken
+    from the configuration) with a `LogisticRegression` classifier whose maximum number
+    of iterations is also configurable.
+
+    :param cfg: Configuration object containing the n-gram range and the maximum
+        number of iterations for the logistic regression model.
+    :type cfg: Config
+    :return: A scikit-learn pipeline with a `CountVectorizer` and `LogisticRegression`
+        based on the provided configuration.
+    :rtype: Pipeline
+    """
+    return make_pipeline(
+        CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range),
+        LogisticRegression(max_iter=cfg.max_iter)
+    )
+
+
+def evaluate_probabilities(y_true, y_proba, threshold: float, class_names):
+    """
+    Evaluates the performance of a classification model using a specified threshold
+    for predicted probabilities. Computes accuracy, precision, recall, F1-score, and
+    the confusion matrix, and generates a classification report with per-class metrics.
+
+    Logs the evaluation metrics at the specified threshold and prints the confusion
+    matrix and classification report.
+
+    :param y_true: Ground truth (correct) labels.
+    :type y_true: array-like
+    :param y_proba: Predicted probabilities for each class, where each row
+        corresponds to an instance and contains probabilities for each target class.
+    :type y_proba: numpy.ndarray
+    :param threshold: The threshold on predicted probabilities to determine
+        class membership for each instance.
+    :type threshold: float
+    :param class_names: List of class names for the target variable used in the
+        classification report.
+    :type class_names: list of str
+    :return: None
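+
+    Example (illustrative): with ``threshold=0.5`` and positive-class probabilities
+    ``[0.2, 0.8]``, the predictions are ``[0, 1]``; raising the threshold to 0.9
+    yields ``[0, 0]``.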
+    """
+    logging.info(f"Evaluating at threshold = {threshold}")
+    y_pred = (y_proba[:, 1] >= threshold).astype(int)
+    acc = accuracy_score(y_true, y_pred)
+    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
+    cm = confusion_matrix(y_true, y_pred)
+
+    logging.info(f"Accuracy: {acc:.4f}")
+    logging.info(f"Precision: {pr:.4f}, Recall: {rc:.4f}, F1-score: {f1:.4f}")
+    print("Confusion Matrix:\n", cm)
+    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
+
+
+def cross_validate(cfg: Config, X, y) -> None:
+    """
+    Performs k-fold cross-validation on the provided dataset using the configuration and
+    logs the results including individual fold scores, mean accuracy, and the standard
+    deviation of the scores.
+
+    :param cfg: Configuration object containing cross-validation settings such as the
+        number of folds to use in the cross-validation (`cv`).
+    :type cfg: Config
+    :param X: Input feature matrix for the dataset to be used for cross-validation.
+    :type X: Any
+    :param y: Target labels corresponding to the input feature matrix `X`.
+    :type y: Any
+    :return: This function does not return any value. Results are logged.
+    :rtype: None
+    """
+    logging.info(f"Running {cfg.cv}-fold cross-validation")
+    pipeline = build_model(cfg)
+    scores = cross_val_score(pipeline, X, y, cv=StratifiedKFold(n_splits=cfg.cv), scoring="accuracy")
+    logging.info(f"Cross-validation scores: {scores}")
+    logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")
+
+
+def save_artifacts(model, encoder):
+    """
+    Saves the trained model and label encoder as pickle files inside the gender
+    models directory so that they can be reloaded later for inference.
+
+    :param model: The machine learning model object to be saved.
+    :type model: Any
+    :param encoder: The label encoder object used for data preprocessing.
+    :type encoder: Any
+    :return: None
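+
+    Example (illustrative; reloading the pipeline for inference)::
+
+        with open(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"), "rb") as f:
+            model = pickle.load(f)
+        proba = model.predict_proba(["amina"])  # the pipeline vectorizes raw strings itself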
+    """
+    model_path = os.path.join(GENDER_MODELS_DIR, "regression_model.pkl")
+    encoder_path = os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl")
+
+    with open(model_path, "wb") as f:
+        pickle.dump(model, f)
+    with open(encoder_path, "wb") as f:
+        pickle.dump(encoder, f)
+    logging.info(f"Saved model to: {model_path}")
+    logging.info(f"Saved label encoder to: {encoder_path}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Train a gender classifier on names")
+    parser.add_argument("--dataset", type=str, default="names.csv", help="Path to dataset")
+    parser.add_argument("--size", type=int, help="Number of rows to load")
+    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for binary decision")
+    parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
+    parser.add_argument("--save", action="store_true", help="Save the model and encoder")
+    args = parser.parse_args()
+
+    cfg = Config(
+        dataset_path=args.dataset,
+        size=args.size,
+        threshold=args.threshold,
+        cv=args.cv,
+        save=args.save
+    )
+
+    X_raw, y_raw = load_and_clean_data(cfg)
+    y_encoded, encoder = encode_labels(y_raw)
+
+    if cfg.cv:
+        cross_validate(cfg, X_raw, y_encoded)
+        return
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X_raw, y_encoded, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y_encoded
+    )
+
+    model = build_model(cfg)
+    model.fit(X_train, y_train)
+
+    y_proba = model.predict_proba(X_test)
+    evaluate_probabilities(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
+
+    if cfg.save:
+        save_artifacts(model, encoder)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ners/gender/models/transformer.py b/ners/gender/models/transformer.py
new file mode 100644
index 0000000..d6e94bf
--- /dev/null
+++ b/ners/gender/models/transformer.py
@@ -0,0 +1,320 @@
+import argparse
+import logging
+import os
+import pickle
+from dataclasses import dataclass
+from typing import Tuple, Optional
+
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from sklearn.metrics import (
+    accuracy_score, precision_recall_fscore_support,
+    classification_report, confusion_matrix
+)
+from sklearn.model_selection import train_test_split, StratifiedKFold
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras.callbacks import ProgbarLogger
+from tensorflow.keras.layers import (
+    Input, Embedding, Dense, GlobalAveragePooling1D,
+    MultiHeadAttention, Dropout, LayerNormalization
+)
+from tensorflow.keras.models import Model
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+
+from misc import GENDER_MODELS_DIR, load_csv_dataset
+
+logging.basicConfig(level=logging.INFO, format=">> %(message)s")
+
+
+@dataclass
+class Config:
+    """
+    Configuration data class storing the settings and parameters for the Transformer model.
+
+    It covers dataset selection, model architecture details such as embedding dimensions
+    and transformer configuration, training settings such as batch size and epochs, and
+    validation and testing settings.
+
+    :ivar dataset_path: The file path to the dataset.
+    :type dataset_path: str
+    :ivar size: Optional number of rows to load; if None, the full dataset is used.
+    :type size: Optional[int]
+    :ivar max_len: Maximum sequence length for input data.
+    :type max_len: int
+    :ivar embedding_dim: The dimensionality of embeddings used in the model.
+    :type embedding_dim: int
+    :ivar transformer_head_size: The size of each transformer attention head.
+    :type transformer_head_size: int
+    :ivar transformer_num_heads: The number of attention heads in the transformer model.
+    :type transformer_num_heads: int
+    :ivar transformer_ff_dim: The dimensionality of the feed-forward network in the transformer.
+    :type transformer_ff_dim: int
+    :ivar dropout: Dropout rate used for regularization during training.
+    :type dropout: float
+    :ivar batch_size: Batch size used for training and validation.
+    :type batch_size: int
+    :ivar epochs: Number of epochs for model training.
+    :type epochs: int
+    :ivar test_size: Proportion of the dataset to be used for testing.
+    :type test_size: float
+    :ivar random_state: Random seed value for reproducibility.
+    :type random_state: int
+    :ivar threshold: Threshold value for model predictions or classification.
+    :type threshold: float
+    :ivar cv: Number of cross-validation folds, if applicable.
+    :type cv: Optional[int]
+    :ivar save: Boolean flag indicating whether to save the model after training.
+    :type save: bool
+    """
+    dataset_path: str
+    size: Optional[int] = None
+    max_len: int = 6
+    embedding_dim: int = 64
+    transformer_head_size: int = 64
+    transformer_num_heads: int = 2
+    transformer_ff_dim: int = 128
+    dropout: float = 0.1
+    batch_size: int = 64
+    epochs: int = 10
+    test_size: float = 0.2
+    random_state: int = 42
+    threshold: float = 0.5
+    cv: Optional[int] = None
+    save: bool = False
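+
+# Illustrative note: MultiHeadAttention below uses key_dim=transformer_head_size
+# per head, so with these defaults each of the 2 heads projects the 64-dim
+# embeddings into 64-dim query/key spaces before attending.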
+
+
+def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
+    """
+    Load and preprocess data for model training or evaluation. This function handles the
+    loading of a dataset in CSV format, applies preprocessing to clean and normalize
+    the input data, tokenizes text features, and encodes categorical labels.
+
+    The preprocessed data is prepared as padded sequences and encoded labels, which
+    can be directly used as inputs for machine learning models. The Tokenizer and
+    LabelEncoder are returned to ensure consistency between training and inference stages.
+
+    :param cfg: Configuration object containing dataset path, size of the
+        dataset to load, and maximum length for padding sequences.
+    :type cfg: Config
+    :return: A tuple containing padded input sequences for the model, encoded labels,
+        the tokenizer used for text sequences, and the encoder used for labels.
+    :rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]
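+
+    Example (illustrative; reusing the tokenizer at inference time)::
+
+        X, y, tokenizer, encoder = load_and_prepare(cfg)
+        new_X = pad_sequences(tokenizer.texts_to_sequences(["amina"]),
+                              maxlen=cfg.max_len, padding="post")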
+    """
+    logging.info("Loading and preprocessing data")
+    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"])
+    df["name"] = df["name"].str.lower().str.strip()
+    df["sex"] = df["sex"].str.lower().str.strip()
+
+    tokenizer = Tokenizer(oov_token="<OOV>")
+    tokenizer.fit_on_texts(df["name"])
+    sequences = tokenizer.texts_to_sequences(df["name"])
+    padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")
+
+    encoder = LabelEncoder()
+    labels = encoder.fit_transform(df["sex"])
+    return padded, labels, tokenizer, encoder
+
+
+def transformer_encoder(x, cfg: Config):
+    """
+    Transforms the input tensor using a single Transformer encoder block with attention
+    and feed-forward layers. The encoder applies multi-head attention to the input tensor,
+    adds the output to the original tensor as a residual connection, and normalizes it.
+    The result then passes through a feed-forward network, again with dropout, a residual
+    connection, and normalization.
+
+    :param x: Input tensor to be transformed.
+    :type x: TensorFlow tensor
+    :param cfg: Configuration object containing Transformer hyperparameters such as the
+        number of attention heads, head size, feed-forward dimension, and dropout rate.
+    :type cfg: Config
+    :return: Transformed tensor resulting from applying the Transformer encoder block.
+    :rtype: TensorFlow tensor
+    """
+    attn = MultiHeadAttention(num_heads=cfg.transformer_num_heads, key_dim=cfg.transformer_head_size)(x, x)
+    x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attn))
+
+    ff = Dense(cfg.transformer_ff_dim, activation="relu")(x)
+    ff = Dense(x.shape[-1])(ff)
+    return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(ff))
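+
+# Illustrative shape check (assumed defaults): a (batch, max_len=6, embedding_dim=64)
+# input leaves the block with the same shape, since the attention output and the
+# feed-forward projection back to x.shape[-1] are both added residually to the input.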
+
+
+def build_model(cfg: Config, vocab_size: int) -> Model:
+    """
+    Builds a Transformer-based model using Keras/TensorFlow components. The model
+    is designed for classification tasks, combining an embedding layer with positional
+    encoding, a Transformer encoder block, and fully connected layers for the output.
+
+    :param cfg: Configuration object containing model-specific hyperparameters
+        such as maximum sequence length, embedding dimensions, etc.
+    :type cfg: Config
+    :param vocab_size: The size of the vocabulary for the embedding layer.
+    :type vocab_size: int
+    :return: A compiled Keras model, ready for training and evaluation.
+    :rtype: Model
+    """
+    logging.info("Building Transformer model")
+    inputs = Input(shape=(cfg.max_len,))
+    x = Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim)(inputs)
+
+    # Add positional encoding
+    positions = tf.range(start=0, limit=cfg.max_len, delta=1)
+    pos_embedding = Embedding(input_dim=cfg.max_len, output_dim=cfg.embedding_dim)(positions)
+    x = x + pos_embedding
+
+    x = transformer_encoder(x, cfg)
+    x = GlobalAveragePooling1D()(x)
+    x = Dense(32, activation="relu")(x)
+    outputs = Dense(2, activation="softmax")(x)
+
+    model = Model(inputs, outputs)
+    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
+    return model
+
+
+def evaluate_proba(y_true, y_proba, threshold, class_names):
+    """
+    Evaluates the performance of a binary classification model by calculating accuracy,
+    precision, recall, F1 score, and the confusion matrix, and generates a classification
+    report. Takes the true labels, predicted probabilities, a decision threshold, and
+    class names.
+
+    :param y_true: Ground truth (correct) target values.
+    :type y_true: array-like of shape (n_samples,)
+    :param y_proba: Predicted probabilities for each class. Expected to be an array
+        where the second column corresponds to the probability of the positive class.
+    :type y_proba: array-like of shape (n_samples, 2)
+    :param threshold: Decision threshold for classifying a sample as positive
+        or negative based on predicted probabilities.
+    :type threshold: float
+    :param class_names: List of class names for labeling the classification report.
+    :type class_names: list of str
+    :return: None. Outputs performance metrics and the confusion matrix to the logging
+        system and the console.
+    """
+    y_pred = (y_proba[:, 1] >= threshold).astype(int)
+    acc = accuracy_score(y_true, y_pred)
+    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
+    cm = confusion_matrix(y_true, y_pred)
+
+    logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
+    print("Confusion Matrix:\n", cm)
+    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
+
+
+def cross_validate(cfg: Config, X, y, vocab_size: int):
+    """
+    Evaluates the model using stratified K-fold cross-validation. For each fold, a fresh
+    model is built, trained on the training subset, and scored on the validation subset.
+    Fold accuracies and the overall mean accuracy with its standard deviation are logged.
+
+    :param cfg: The configuration object containing hyperparameters and settings for
+        cross-validation, random state, and training.
+    :type cfg: Config
+    :param X: The input data samples provided as a dataset.
+    :type X: numpy.ndarray
+    :param y: The target labels corresponding to the input data samples.
+    :type y: numpy.ndarray
+    :param vocab_size: The size of the vocabulary, used to configure the embedding layer.
+    :type vocab_size: int
+    :return: None. Fold accuracies and the aggregate statistics are logged.
+    :rtype: None
+    """
+    logging.info(f"Running {cfg.cv}-fold cross-validation")
+    skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
+    accuracies = []
+
+    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
+        logging.info(f"Fold {fold + 1}")
+        model = build_model(cfg, vocab_size)
+        model.fit(X[train_idx], y[train_idx],
+                  epochs=cfg.epochs,
+                  batch_size=cfg.batch_size,
+                  verbose=0)
+        y_pred = model.predict(X[val_idx])
+        acc = accuracy_score(y[val_idx], y_pred.argmax(axis=1))
+        accuracies.append(acc)
+        logging.info(f"Fold {fold + 1} Accuracy: {acc:.4f}")
+
+    logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
+
+
+def save_artifacts(model, tokenizer, encoder):
+    """
+    Saves the trained model and its associated artifacts (tokenizer and label encoder)
+    to predefined file paths, so that they can be reloaded later for inference.
+
+    :param model: The machine learning model to be saved.
+    :param tokenizer: The tokenizer used for preparing data for the model.
+    :param encoder: The label encoder used for encoding target labels.
+    :return: None
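+
+    Example (illustrative; reloading for inference)::
+
+        from tensorflow.keras.models import load_model
+        model = load_model(os.path.join(GENDER_MODELS_DIR, "transformer.h5"))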
+    """
+    model_path = os.path.join(GENDER_MODELS_DIR, "transformer.h5")
+    tokenizer_path = os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl")
+    encoder_path = os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl")
+
+    model.save(model_path)
+    with open(tokenizer_path, "wb") as f:
+        pickle.dump(tokenizer, f)
+    with open(encoder_path, "wb") as f:
+        pickle.dump(encoder, f)
+    logging.info("Model and artifacts saved.")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Train Transformer model for name-based gender classification")
+    parser.add_argument("--dataset", type=str, default="names.csv")
+    parser.add_argument("--size", type=int)
+    parser.add_argument("--threshold", type=float, default=0.5)
+    parser.add_argument("--cv", type=int)
+    parser.add_argument("--save", action="store_true")
+    args = parser.parse_args()
+
+    cfg = Config(
+        dataset_path=args.dataset,
+        size=args.size,
+        threshold=args.threshold,
+        cv=args.cv,
+        save=args.save
+    )
+
+    X, y, tokenizer, encoder = load_and_prepare(cfg)
+    vocab_size = len(tokenizer.word_index) + 1
+
+    if cfg.cv:
+        cross_validate(cfg, X, y, vocab_size)
+        return
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
+    )
+
+    model = build_model(cfg, vocab_size)
+    model.summary()
+
+    logging.info("Training Transformer model")
+    model.fit(
+        X_train, y_train,
+        validation_split=0.1,
+        epochs=cfg.epochs,
+        batch_size=cfg.batch_size,
+        callbacks=[ProgbarLogger()]
+    )
+
+    y_proba = model.predict(X_test)
+    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
+
+    if cfg.save:
+        save_artifacts(model, tokenizer, encoder)
+
+
+if __name__ == "__main__":
+    main()