"""Train and evaluate a bidirectional LSTM for name-based gender classification."""

import argparse
import logging
import os
from dataclasses import dataclass
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_recall_fscore_support,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ProgbarLogger
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle

logging.basicConfig(level=logging.INFO, format=">> %(message)s")


@dataclass
class Config:
    """
    Configuration for the model and its training/evaluation run.

    :ivar dataset_path: Path to the dataset file.
    :type dataset_path: str
    :ivar size: Optional size passed to the dataset loader. If None, no size
        is requested.
    :type size: Optional[int]
    :ivar max_len: Length every token sequence is padded to.
    :type max_len: int
    :ivar embedding_dim: Dimensionality of the embedding layer.
    :type embedding_dim: int
    :ivar lstm_units: Number of units in each LSTM layer.
    :type lstm_units: int
    :ivar batch_size: Batch size to use during training.
    :type batch_size: int
    :ivar epochs: Number of epochs for model training.
    :type epochs: int
    :ivar test_size: Fraction of data held out for testing.
    :type test_size: float
    :ivar random_state: Seed used for the train/test split and the
        cross-validation fold shuffling.
    :type random_state: int
    :ivar threshold: Decision threshold applied to the class-1 probability.
    :type threshold: float
    :ivar cv: Number of cross-validation folds. If None (or 0), a single
        train/test run is performed instead.
    :type cv: Optional[int]
    :ivar save: Flag indicating whether to save the trained model.
    :type save: bool
    """

    dataset_path: str
    size: Optional[int] = None
    max_len: int = 6
    embedding_dim: int = 64
    lstm_units: int = 32
    batch_size: int = 64
    epochs: int = 10
    test_size: float = 0.2
    random_state: int = 42
    threshold: float = 0.5
    cv: Optional[int] = None
    save: bool = False


def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Load the dataset and turn it into model-ready arrays.

    Rows missing ``name`` or ``sex`` are dropped; both columns are lowercased
    and stripped. Names are tokenized (``char_level=False``), converted to
    integer sequences and post-padded to ``cfg.max_len``. The ``sex`` column
    is encoded with a ``LabelEncoder``.

    :param cfg: Config object providing ``dataset_path``, ``size`` and
        ``max_len``.
    :type cfg: Config
    :return: Padded sequences, encoded labels, the fitted tokenizer and the
        fitted label encoder.
    :rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]
    """
    logging.info("Loading and preprocessing data")

    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"])
    df["name"] = df["name"].str.lower().str.strip()
    df["sex"] = df["sex"].str.lower().str.strip()

    # NOTE(review): the empty-string OOV token looks like it may have lost a
    # placeholder such as "<OOV>" somewhere along the way -- confirm intent.
    tokenizer = Tokenizer(char_level=False, lower=True, oov_token="")
    tokenizer.fit_on_texts(df["name"])
    sequences = tokenizer.texts_to_sequences(df["name"])
    padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df["sex"])

    return padded, labels, tokenizer, label_encoder


def build_model(cfg: Config, vocab_size: int) -> Sequential:
    """
    Build and compile the classifier.

    The network is an embedding layer followed by two stacked bidirectional
    LSTM layers, a 64-unit ReLU dense layer and a 2-unit softmax output. It is
    compiled with sparse categorical cross-entropy, the Adam optimizer and an
    accuracy metric.

    :param cfg: Config object providing ``embedding_dim`` and ``lstm_units``.
    :type cfg: Config
    :param vocab_size: Input dimension of the embedding layer.
    :type vocab_size: int
    :return: The compiled model.
    :rtype: Sequential
    """
    logging.info("Building LSTM model")
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim),
        # The first recurrent layer returns the full sequence so that the
        # second one can consume it.
        Bidirectional(LSTM(cfg.lstm_units, return_sequences=True)),
        Bidirectional(LSTM(cfg.lstm_units)),
        Dense(64, activation="relu"),
        Dense(2, activation="softmax"),
    ])
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model


def evaluate_proba(y_true, y_proba, threshold: float, class_names) -> None:
    """
    Report binary-classification metrics for predicted probabilities.

    A sample is predicted as class 1 when the probability in column 1 of
    ``y_proba`` is at least ``threshold``, otherwise as class 0. Accuracy,
    precision, recall and F1 are logged; the confusion matrix and the
    classification report are printed.

    :param y_true: Ground truth integer labels.
    :param y_proba: 2-dimensional array of class probabilities; column 1 is
        the one compared against ``threshold``.
    :param threshold: Cut-off applied to the class-1 probability.
    :param class_names: Class names used to label the classification report.
    :return: None
    """
    y_pred = (y_proba[:, 1] >= threshold).astype(int)

    acc = accuracy_score(y_true, y_pred)
    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    cm = confusion_matrix(y_true, y_pred)

    logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
    print("Confusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))


def cross_validate(cfg: Config, X, y, vocab_size: int) -> List[float]:
    """
    Run stratified k-fold cross-validation and report per-fold accuracy.

    For each of the ``cfg.cv`` shuffled, stratified folds a fresh model is
    built, trained on the training indices and scored on the validation
    indices using the arg-max of the predicted probabilities. Each fold's
    accuracy, and finally the mean and standard deviation, are logged.

    :param cfg: Config object providing ``cv``, ``random_state``, ``epochs``
        and ``batch_size`` (plus the model parameters used by
        :func:`build_model`).
    :type cfg: Config
    :param X: Feature array, indexed by the fold index arrays.
    :param y: Label array in the same order as ``X``.
    :param vocab_size: Vocabulary size passed to :func:`build_model`.
    :type vocab_size: int
    :return: A list containing the accuracy score of each fold, in fold order.
    :rtype: List[float]
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)

    accuracies = []
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        logging.info(f"Fold {fold + 1}")

        # A new model per fold so that no weights carry over between folds.
        model = build_model(cfg, vocab_size)
        model.fit(X[train_idx], y[train_idx], epochs=cfg.epochs, batch_size=cfg.batch_size, verbose=0)

        y_pred = model.predict(X[val_idx])
        acc = accuracy_score(y[val_idx], y_pred.argmax(axis=1))
        accuracies.append(acc)
        logging.info(f"Fold {fold + 1} Accuracy: {acc:.4f}")

    logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
    # Hand the per-fold scores back to the caller, as the documented contract states.
    return accuracies


def save_artifacts(model, tokenizer, encoder):
    """
    Save the model, tokenizer and label encoder under ``GENDER_MODELS_DIR``.

    The directory is created if it does not exist. The model is written with
    ``model.save`` to ``lstm_model.keras``; the tokenizer and the encoder are
    written with ``save_pickle`` to ``lstm_tokenizer.pkl`` and
    ``lstm_label_encoder.pkl``. A log message is emitted once everything has
    been written.

    :param model: The trained model object to be saved.
    :type model: Any
    :param tokenizer: The tokenizer used in preprocessing.
    :type tokenizer: Any
    :param encoder: The label encoder used for encoding labels.
    :type encoder: Any
    :return: None
    """
    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    model.save(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
    save_pickle(tokenizer, os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
    save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl"))
    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")


def main():
    """
    Command-line entry point.

    Parses the arguments, prepares the data and then either runs
    cross-validation (when ``--cv`` is given) or trains on a stratified
    train/test split, evaluates on the test part and optionally saves the
    artifacts (``--save``).
    """
    parser = argparse.ArgumentParser(description="Train BiLSTM model for name-based gender classification")
    parser.add_argument("--dataset", type=str, default="names.csv")
    parser.add_argument("--size", type=int)
    parser.add_argument("--threshold", type=float, default=0.5)
    parser.add_argument("--cv", type=int)
    parser.add_argument("--save", action="store_true")
    args = parser.parse_args()

    cfg = Config(
        dataset_path=args.dataset,
        size=args.size,
        threshold=args.threshold,
        cv=args.cv,
        save=args.save
    )

    X, y, tokenizer, encoder = load_and_prepare(cfg)
    # +1 on top of the number of entries in the tokenizer's word index.
    vocab_size = len(tokenizer.word_index) + 1

    if cfg.cv:
        # Cross-validation mode only reports scores; nothing is saved.
        cross_validate(cfg, X, y, vocab_size)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
    )

    model = build_model(cfg, vocab_size)
    model.summary()

    logging.info("Training model")
    model.fit(X_train, y_train, validation_split=0.1, epochs=cfg.epochs, batch_size=cfg.batch_size,
              callbacks=[ProgbarLogger()])

    y_proba = model.predict(X_test)
    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, tokenizer, encoder)


if __name__ == "__main__":
    main()