import argparse import logging import os import pickle from dataclasses import dataclass from typing import Tuple, Optional import pandas as pd from sklearn.feature_extraction.text import CountVectorizer from sklearn.linear_model import LogisticRegression from sklearn.metrics import ( accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support ) from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score from sklearn.pipeline import make_pipeline, Pipeline from sklearn.preprocessing import LabelEncoder from misc import GENDER_MODELS_DIR, load_csv_dataset logging.basicConfig(level=logging.INFO, format=">> %(message)s") @dataclass class Config: dataset_path: str size: Optional[int] test_size: float = 0.2 ngram_range: Tuple[int, int] = (2, 5) max_iter: int = 1000 random_state: int = 42 threshold: float = 0.5 cv: Optional[int] = None save: bool = False def load_and_clean_data(cfg: Config) -> Tuple[pd.Series, pd.Series]: """ Load and clean dataset as specified by the provided configuration. This function reads a CSV dataset from the path specified in the configuration, processes it to remove missing values from key columns ('name' and 'sex'), and cleans string data in these columns by converting them to lowercase and stripping whitespace. The cleaned data is then returned as two separate pandas Series objects. :param cfg: Configuration object specifying the dataset path and size :type cfg: Config :return: A tuple containing cleaned `name` and `sex` data as pandas Series objects :rtype: Tuple[pd.Series, pd.Series] """ logging.info(f"Loading dataset from {cfg.dataset_path}") df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)) df = df.dropna(subset=["name", "sex"]) df["name"] = df["name"].str.lower().str.strip() df["sex"] = df["sex"].str.lower().str.strip() return df["name"], df["sex"] def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]: """ Encode the labels of a given pandas Series using a LabelEncoder. This process maps categorical labels to integers, which is particularly useful for machine learning models that require numerical input data. :param y: A pandas Series of categorical labels to be encoded. :type y: pd.Series :return: A tuple containing the encoded labels as a pandas Series and the fitted LabelEncoder object. :rtype: Tuple[pd.Series, LabelEncoder] """ logging.info("Encoding labels") encoder = LabelEncoder() y_encoded = encoder.fit_transform(y) return y_encoded, encoder def build_model(cfg: Config) -> Pipeline: """ Builds a machine learning pipeline for text classification. This function constructs and returns a scikit-learn pipeline that consists of a `CountVectorizer` and a `LogisticRegression` classifier. The vectorizer leverages character-level n-grams based on the provided configuration, and the logistic regression model is trained with a maximum number of iterations defined in the configuration. This pipeline is used for processing text data and training classification models. :param cfg: Configuration object containing the n-gram range and the maximum number of iterations for the logistic regression model. :type cfg: Config :return: A scikit-learn pipeline with a `CountVectorizer` and `LogisticRegression` based on the provided configuration. :rtype: Pipeline """ return make_pipeline( CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range), LogisticRegression(max_iter=cfg.max_iter) ) def evaluate_probabilities(y_true, y_proba, threshold: float, class_names): """ Evaluates the performance of a classification model using a specified threshold for predicted probabilities. Computes metrics such as accuracy, precision, recall, F1-score, and the confusion matrix. Also generates a classification report with detailed metrics for each class. Logs the evaluation metrics at the specified threshold and prints the confusion matrix and classification report. :param y_true: Ground truth (correct) labels. :type y_true: array-like :param y_proba: Predicted probabilities for each class, where each row corresponds to an instance and contains probabilities for each target class. :type y_proba: numpy.ndarray :param threshold: The threshold on predicted probabilities to determine class membership for each instance. :type threshold: float :param class_names: List of class names for the target variable used in the classification report. :type class_names: list of str :return: None """ logging.info(f"Evaluating at threshold = {threshold}") y_pred = (y_proba[:, 1] >= threshold).astype(int) acc = accuracy_score(y_true, y_pred) pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary') cm = confusion_matrix(y_true, y_pred) logging.info(f"Accuracy: {acc:.4f}") logging.info(f"Precision: {pr:.4f}, Recall: {rc:.4f}, F1-score: {f1:.4f}") print("Confusion Matrix:\n", cm) print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names)) def cross_validate(cfg: Config, X, y) -> None: """ Performs k-fold cross-validation on the provided dataset using the configuration and logs the results including individual fold scores, mean accuracy, and the standard deviation of the scores. :param cfg: Configuration object containing cross-validation settings such as the number of folds to use in the cross-validation (`cv`). :type cfg: Config :param X: Input feature matrix for the dataset to be used for cross-validation. :type X: Any :param y: Target labels corresponding to the input feature matrix `X`. :type y: Any :return: This function does not return any value. Results are logged. :rtype: None """ logging.info(f"Running {cfg.cv}-fold cross-validation") pipeline = build_model(cfg) scores = cross_val_score(pipeline, X, y, cv=StratifiedKFold(n_splits=cfg.cv), scoring="accuracy") logging.info(f"Cross-validation scores: {scores}") logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}") def save_artifacts(model, encoder, cfg: Config): """ Saves machine learning model and label encoder artifacts to specified directories within the gender models' directory. This function ensures that the model and encoder are serialized and stored as pickle files. It uses the specified configuration settings to locate the appropriate directory for storing the files. :param model: The machine learning model object to be saved. :type model: Any :param encoder: The label encoder object used for data preprocessing. :type encoder: Any :param cfg: Configuration object containing application-specific settings regarding paths and directories. :type cfg: Config :return: None """ model_path = os.path.join(GENDER_MODELS_DIR, "regression_model.pkl") encoder_path = os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl") with open(model_path, "wb") as f: pickle.dump(model, f) with open(encoder_path, "wb") as f: pickle.dump(encoder, f) logging.info(f"Saved model to: {model_path}") logging.info(f"Saved label encoder to: {encoder_path}") def main(): parser = argparse.ArgumentParser(description="Train a gender classifier on names") parser.add_argument("--dataset", type=str, default="names.csv", help="Path to dataset") parser.add_argument("--size", type=int, help="Number of rows to load") parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for binary decision") parser.add_argument("--cv", type=int, help="Number of folds for cross-validation") parser.add_argument("--save", action="store_true", help="Save the model and encoder") args = parser.parse_args() cfg = Config( dataset_path=args.dataset, size=args.size, threshold=args.threshold, cv=args.cv, save=args.save ) X_raw, y_raw = load_and_clean_data(cfg) y_encoded, encoder = encode_labels(y_raw) if cfg.cv: cross_validate(cfg, X_raw, y_encoded) return X_train, X_test, y_train, y_test = train_test_split( X_raw, y_encoded, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y_encoded ) model = build_model(cfg) model.fit(X_train, y_train) y_proba = model.predict_proba(X_test) evaluate_probabilities(y_test, y_proba, cfg.threshold, class_names=encoder.classes_) if cfg.save: save_artifacts(model, encoder, cfg) if __name__ == "__main__": main()