docs: add gender inference instructions

2025-06-21 10:53:02 +02:00
parent 25f1df46d8
commit 88bb2f207e
3 changed files with 23 additions and 0 deletions
@@ -0,0 +1,222 @@
+import argparse
+import logging
+import os
+import pickle
+from dataclasses import dataclass
+from typing import Tuple, Optional
+
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import (
+    accuracy_score, classification_report, confusion_matrix,
+    precision_recall_fscore_support
+)
+from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
+from sklearn.pipeline import make_pipeline, Pipeline
+from sklearn.preprocessing import LabelEncoder
+
+from misc import GENDER_MODELS_DIR, load_csv_dataset
+
+logging.basicConfig(level=logging.INFO, format=">> %(message)s")
+
+@dataclass
+class Config:
+    dataset_path: str
+    size: Optional[int]
+    test_size: float = 0.2
+    ngram_range: Tuple[int, int] = (2, 5)
+    max_iter: int = 1000
+    random_state: int = 42
+    threshold: float = 0.5
+    cv: Optional[int] = None
+    save: bool = False
+
+
+def load_and_clean_data(cfg: Config) -> Tuple[pd.Series, pd.Series]:
+    """
+    Load and clean dataset as specified by the provided configuration. This function reads
+    a CSV dataset from the path specified in the configuration, processes it to remove
+    missing values from key columns ('name' and 'sex'), and cleans string data in these
+    columns by converting them to lowercase and stripping whitespace. The cleaned data
+    is then returned as two separate pandas Series objects.
+
+    :param cfg: Configuration object specifying the dataset path and size
+    :type cfg: Config
+    :return: A tuple containing cleaned `name` and `sex` data as pandas Series objects
+    :rtype: Tuple[pd.Series, pd.Series]
+    """
+    logging.info(f"Loading dataset from {cfg.dataset_path}")
+    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size))
+    df = df.dropna(subset=["name", "sex"])
+    df["name"] = df["name"].str.lower().str.strip()
+    df["sex"] = df["sex"].str.lower().str.strip()
+    return df["name"], df["sex"]
+
+
+def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
+    """
+    Encode the labels of a given pandas Series using a LabelEncoder. This process maps categorical
+    labels to integers, which is particularly useful for machine learning models that require numerical
+    input data.
+
+    :param y: A pandas Series of categorical labels to be encoded.
+    :type y: pd.Series
+    :return: A tuple containing the encoded labels as a pandas Series and the fitted LabelEncoder object.
+    :rtype: Tuple[pd.Series, LabelEncoder]
+    """
+    logging.info("Encoding labels")
+    encoder = LabelEncoder()
+    y_encoded = encoder.fit_transform(y)
+    return y_encoded, encoder
+
+
+def build_model(cfg: Config) -> Pipeline:
+    """
+    Builds a machine learning pipeline for text classification.
+
+    This function constructs and returns a scikit-learn pipeline that consists of
+    a `CountVectorizer` and a `LogisticRegression` classifier. The vectorizer
+    leverages character-level n-grams based on the provided configuration, and the
+    logistic regression model is trained with a maximum number of iterations defined
+    in the configuration. This pipeline is used for processing text data and training
+    classification models.
+
+    :param cfg: Configuration object containing the n-gram range and the maximum
+                number of iterations for the logistic regression model.
+    :type cfg: Config
+    :return: A scikit-learn pipeline with a `CountVectorizer` and `LogisticRegression`
+             based on the provided configuration.
+    :rtype: Pipeline
+    """
+    return make_pipeline(
+        CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range),
+        LogisticRegression(max_iter=cfg.max_iter)
+    )
+
+
+def evaluate_probabilities(y_true, y_proba, threshold: float, class_names):
+    """
+    Evaluates the performance of a classification model using a specified threshold
+    for predicted probabilities. Computes metrics such as accuracy, precision,
+    recall, F1-score, and the confusion matrix. Also generates a classification
+    report with detailed metrics for each class.
+
+    Logs the evaluation metrics at the specified threshold and prints the confusion
+    matrix and classification report.
+
+    :param y_true: Ground truth (correct) labels.
+    :type y_true: array-like
+    :param y_proba: Predicted probabilities for each class, where each row
+        corresponds to an instance and contains probabilities for each target class.
+    :type y_proba: numpy.ndarray
+    :param threshold: The threshold on predicted probabilities to determine
+        class membership for each instance.
+    :type threshold: float
+    :param class_names: List of class names for the target variable used in the
+        classification report.
+    :type class_names: list of str
+    :return: None
+    """
+    logging.info(f"Evaluating at threshold = {threshold}")
+    y_pred = (y_proba[:, 1] >= threshold).astype(int)
+    acc = accuracy_score(y_true, y_pred)
+    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
+    cm = confusion_matrix(y_true, y_pred)
+
+    logging.info(f"Accuracy: {acc:.4f}")
+    logging.info(f"Precision: {pr:.4f}, Recall: {rc:.4f}, F1-score: {f1:.4f}")
+    print("Confusion Matrix:\n", cm)
+    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
+
+
+def cross_validate(cfg: Config, X, y) -> None:
+    """
+    Performs k-fold cross-validation on the provided dataset using the configuration and
+    logs the results including individual fold scores, mean accuracy, and the standard
+    deviation of the scores.
+
+    :param cfg: Configuration object containing cross-validation settings such as the
+        number of folds to use in the cross-validation (`cv`).
+    :type cfg: Config
+    :param X: Input feature matrix for the dataset to be used for cross-validation.
+    :type X: Any
+    :param y: Target labels corresponding to the input feature matrix `X`.
+    :type y: Any
+    :return: This function does not return any value. Results are logged.
+    :rtype: None
+    """
+    logging.info(f"Running {cfg.cv}-fold cross-validation")
+    pipeline = build_model(cfg)
+    scores = cross_val_score(pipeline, X, y, cv=StratifiedKFold(n_splits=cfg.cv), scoring="accuracy")
+    logging.info(f"Cross-validation scores: {scores}")
+    logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")
+
+
+def save_artifacts(model, encoder, cfg: Config):
+    """
+    Saves machine learning model and label encoder artifacts to specified directories
+    within the gender models' directory. This function ensures that the model and encoder
+    are serialized and stored as pickle files. It uses the specified configuration settings
+    to locate the appropriate directory for storing the files.
+
+    :param model: The machine learning model object to be saved.
+    :type model: Any
+    :param encoder: The label encoder object used for data preprocessing.
+    :type encoder: Any
+    :param cfg: Configuration object containing application-specific settings regarding
+        paths and directories.
+    :type cfg: Config
+    :return: None
+    """
+    model_path = os.path.join(GENDER_MODELS_DIR, "regression_model.pkl")
+    encoder_path = os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl")
+
+    with open(model_path, "wb") as f:
+        pickle.dump(model, f)
+    with open(encoder_path, "wb") as f:
+        pickle.dump(encoder, f)
+    logging.info(f"Saved model to: {model_path}")
+    logging.info(f"Saved label encoder to: {encoder_path}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Train a gender classifier on names")
+    parser.add_argument("--dataset", type=str, default="names.csv", help="Path to dataset")
+    parser.add_argument("--size", type=int, help="Number of rows to load")
+    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for binary decision")
+    parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
+    parser.add_argument("--save", action="store_true", help="Save the model and encoder")
+    args = parser.parse_args()
+
+    cfg = Config(
+        dataset_path=args.dataset,
+        size=args.size,
+        threshold=args.threshold,
+        cv=args.cv,
+        save=args.save
+    )
+
+    X_raw, y_raw = load_and_clean_data(cfg)
+    y_encoded, encoder = encode_labels(y_raw)
+
+    if cfg.cv:
+        cross_validate(cfg, X_raw, y_encoded)
+        return
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X_raw, y_encoded, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y_encoded
+    )
+
+    model = build_model(cfg)
+    model.fit(X_train, y_train)
+
+    y_proba = model.predict_proba(X_test)
+    evaluate_probabilities(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
+
+    if cfg.save:
+        save_artifacts(model, encoder, cfg)
+
+
+if __name__ == "__main__":
+    main()