feat: improve inference for logreg model

This commit is contained in:
2025-06-21 10:34:26 +02:00
parent 33d096f8ff
commit a46a5f7924
5 changed files with 356 additions and 5 deletions
+14 -5
View File
@@ -1,6 +1,7 @@
import csv
import json
import os
import pickle
from datetime import datetime
from typing import Optional
@@ -10,12 +11,10 @@ DATA_DIR = os.path.join(ROOT_DIR, 'dataset')
MODELS_DIR = os.path.join(ROOT_DIR, 'models')
GENDER_MODELS_DIR = os.path.join(MODELS_DIR, 'gender')
GENDER_RESULT_DIR = os.path.join(ROOT_DIR, 'gender', 'results')
NER_MODELS_DIR = os.path.join(MODELS_DIR, 'ner')
# Training
TRAINING_EPOCHS = 5
MODEL_NAME = f"./models/ners-{datetime.now().strftime('%Y%m%d%H%M%S')}"
NER_RESULT_DIR = os.path.join(ROOT_DIR, 'ner', 'results')
def clean_spacing(filename: str) -> Optional[str]:
try:
@@ -57,3 +56,13 @@ def save_json_dataset(data: list, path: str) -> None:
print(f">> Saving JSON dataset to {path}")
with open(os.path.join(DATA_DIR, path), "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, separators=(',', ':'))
def save_pickle(obj, path):
    """Pickle *obj* to *path*, creating the parent directory if needed.

    :param obj: any picklable object.
    :param path: destination file path; its dirname is created with
        ``exist_ok=True`` before writing.
    """
    parent = os.path.dirname(path)
    os.makedirs(parent, exist_ok=True)
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)
def load_pickle(path: str):
    """Read and return the pickled object stored at *path*.

    :param path: file previously written with :func:`save_pickle`.
    :return: the deserialized object.
    """
    with open(path, "rb") as fh:
        obj = pickle.load(fh)
    return obj
+192
View File
@@ -0,0 +1,192 @@
import argparse
import json
import os
import pandas as pd
import tensorflow as tf
from sklearn.metrics import (
accuracy_score, precision_recall_fscore_support, confusion_matrix
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_json_dataset, load_pickle, GENDER_RESULT_DIR
def load_dataset(path="names.csv", size=None):
    """
    Load and normalize the names dataset from a CSV file.

    Reads up to *size* rows (all rows when ``None``), drops records that are
    missing either "name" or "sex", and lowercases/strips both columns so
    downstream encoders see a consistent format.

    :param path: path of the CSV dataset, defaults to "names.csv".
    :type path: str
    :param size: optional cap on the number of rows to load.
    :type size: Optional[int]
    :return: cleaned DataFrame with normalized "name" and "sex" columns.
    :rtype: pandas.DataFrame
    """
    frame = pd.DataFrame(load_csv_dataset(path, size))
    frame = frame.dropna(subset=["name", "sex"])
    for column in ("name", "sex"):
        frame[column] = frame[column].str.lower().str.strip()
    return frame
def evaluate_logreg(df, threshold):
    """
    Evaluates the pickled logistic-regression gender model on *df*.

    Loads the pre-trained pipeline and label encoder, encodes the true
    labels, predicts class probabilities for the names, and thresholds the
    positive-class probability element-wise.

    :param df: data with a "name" column (inputs) and a "sex" column
        (true labels).
    :type df: pandas.DataFrame
    :param threshold: probabilities >= this value are classified as the
        positive class.
    :type threshold: float
    :return: tuple of (encoded true labels, binary predictions,
        positive-class probabilities, encoder class names).
    :rtype: tuple
    """
    model = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
    X = df["name"].tolist()
    y_true = encoder.transform(df["sex"])
    proba = model.predict_proba(X)
    # Fix: `1 if proba[:, 1] >= threshold else 0` evaluates the truth value of
    # a whole array and raises ValueError; threshold element-wise instead
    # (same idiom as the companion predict script).
    y_pred = (proba[:, 1] >= threshold).astype(int)
    return y_true, y_pred, proba[:, 1], encoder.classes_
def evaluate_lstm(df, threshold, max_len=6):
    """
    Evaluates the pre-trained BiLSTM gender model on *df*.

    Loads the saved Keras model, tokenizer and label encoder, tokenizes and
    pads the "name" column to *max_len*, encodes the "sex" column, and
    thresholds the positive-class probability element-wise.

    :param df: data with "name" (text) and "sex" (true label) columns.
    :type df: pandas.DataFrame
    :param threshold: decision threshold on the positive-class probability.
    :type threshold: float
    :param max_len: pad/truncate length for tokenized names, default 6.
    :type max_len: int
    :return: tuple of (encoded true labels, binary predictions,
        positive-class probabilities, encoder class names).
    :rtype: tuple
    """
    model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "BiLSTM_model.h5"))
    tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "BiLSTM_tokenizer.pkl"))
    encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "BiLSTM_label_encoder.pkl"))
    sequences = tokenizer.texts_to_sequences(df["name"])
    X = pad_sequences(sequences, maxlen=max_len, padding="post")
    y_true = encoder.transform(df["sex"])
    # NOTE(review): indexing proba[:, 1] assumes the model outputs two class
    # probabilities per row — confirm against the training architecture.
    proba = model.predict(X)
    # Fix: `1 if proba[:, 1] >= threshold else 0` evaluates the truth value of
    # a whole array and raises ValueError; threshold element-wise instead.
    y_pred = (proba[:, 1] >= threshold).astype(int)
    return y_true, y_pred, proba[:, 1], encoder.classes_
def evaluate_transformer(df, threshold, max_len=6):
    """
    Evaluates the pre-trained transformer gender model on *df*.

    Loads the saved Keras model, tokenizer and label encoder, tokenizes and
    pads the "name" column to *max_len*, encodes the "sex" column, and
    thresholds the positive-class probability element-wise.

    :param df: data with "name" (text) and "sex" (true label) columns.
    :type df: pandas.DataFrame
    :param threshold: decision threshold on the positive-class probability.
    :type threshold: float
    :param max_len: pad/truncate length for tokenized names, default 6.
    :type max_len: int
    :return: tuple of (encoded true labels, binary predictions,
        positive-class probabilities, encoder class names).
    :rtype: tuple
    """
    model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.h5"))
    tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
    encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl"))
    sequences = tokenizer.texts_to_sequences(df["name"])
    X = pad_sequences(sequences, maxlen=max_len, padding="post")
    y_true = encoder.transform(df["sex"])
    # NOTE(review): indexing proba[:, 1] assumes the model outputs two class
    # probabilities per row — confirm against the training architecture.
    proba = model.predict(X)
    # Fix: `1 if proba[:, 1] >= threshold else 0` evaluates the truth value of
    # a whole array and raises ValueError; threshold element-wise instead.
    y_pred = (proba[:, 1] >= threshold).astype(int)
    return y_true, y_pred, proba[:, 1], encoder.classes_
def compute_metrics(y_true, y_pred, y_proba, class_names):
    """
    Builds a binary-classification metrics report.

    Computes accuracy, precision, recall, F1 (binary averaging) and the
    confusion matrix for the given labels.

    :param y_true: ground-truth labels.
    :type y_true: list or numpy.ndarray
    :param y_pred: predicted labels.
    :type y_pred: list or numpy.ndarray
    :param y_proba: positive-class probabilities (currently unused; kept for
        interface compatibility with callers).
    :type y_proba: list or numpy.ndarray
    :param class_names: class names for the confusion-matrix labels.
    :type class_names: numpy.ndarray
    :return: dict with accuracy, precision, recall, f1 and confusion matrix.
    :rtype: dict
    """
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )
    report = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "confusion_matrix": {
            "labels": class_names.tolist(),
            "matrix": confusion_matrix(y_true, y_pred).tolist(),
        },
    }
    return report
def main():
    """CLI entry point: evaluate one gender model on a CSV dataset and save
    the metrics report as JSON under the gender results directory."""
    parser = argparse.ArgumentParser(description="Evaluate gender prediction model")
    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
    parser.add_argument("--dataset", default="names.csv")
    parser.add_argument("--size", type=int)
    parser.add_argument("--threshold", type=float, default=0.5)
    args = parser.parse_args()

    df = load_dataset(args.dataset, args.size)
    # Dispatch table instead of an if/elif chain; argparse `choices` already
    # restricts --model, so the ValueError below is a defensive guard.
    evaluators = {
        "logreg": evaluate_logreg,
        "lstm": evaluate_lstm,
        "transformer": evaluate_transformer,
    }
    evaluate = evaluators.get(args.model)
    if evaluate is None:
        raise ValueError(f"Unknown model: {args.model}")
    y_true, y_pred, y_proba, classes = evaluate(df, args.threshold)

    results = compute_metrics(y_true, y_pred, y_proba, classes)
    save_json_dataset(results, os.path.join(GENDER_RESULT_DIR, f'{args.model}_eval'))
if __name__ == "__main__":
    main()
+150
View File
@@ -0,0 +1,150 @@
import argparse
import os
from typing import List
import tensorflow as tf
from sklearn.pipeline import Pipeline
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from misc import GENDER_MODELS_DIR, load_pickle
def predict_logreg(names: List[str], threshold: float):
    """
    Predict gender labels for *names* with the pickled logistic-regression
    pipeline.

    Normalizes each name (lowercase, stripped), obtains class probabilities
    from the model, and thresholds the positive-class probability to pick a
    label via the stored label encoder.

    :param names: names to classify.
    :param threshold: probabilities >= this value map to the positive class.
    :return: tuple of (decoded labels, per-class probability array).
    """
    model: Pipeline = load_pickle(
        os.path.join(GENDER_MODELS_DIR, "regression_model.pkl")
    )
    label_encoder = load_pickle(
        os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl")
    )
    cleaned = [raw.lower().strip() for raw in names]
    proba = model.predict_proba(cleaned)
    positive = (proba[:, 1] >= threshold).astype(int)
    labels = label_encoder.inverse_transform(positive)
    return labels, proba
def predict_lstm(names: List[str], threshold: float, max_len=6):
    """
    Predict gender labels for *names* with the pre-trained BiLSTM model.

    Loads the saved Keras model, tokenizer and label encoder, normalizes and
    tokenizes the names, pads the sequences to *max_len*, and thresholds the
    positive-class probability to pick a label.

    :param names: names to classify.
    :type names: List[str]
    :param threshold: probabilities >= this value map to the positive class.
    :type threshold: float
    :param max_len: pad/truncate length for tokenized names, default 6.
    :type max_len: int, optional
    :return: tuple of (decoded labels, per-class probability array).
    :rtype: Tuple[numpy.ndarray, numpy.ndarray]
    """
    model = tf.keras.models.load_model(
        os.path.join(GENDER_MODELS_DIR, "BiLSTM_model.h5")
    )
    tokenizer: Tokenizer = load_pickle(
        os.path.join(GENDER_MODELS_DIR, "BiLSTM_tokenizer.pkl")
    )
    label_encoder = load_pickle(
        os.path.join(GENDER_MODELS_DIR, "BiLSTM_label_encoder.pkl")
    )
    cleaned = [raw.lower().strip() for raw in names]
    padded = pad_sequences(
        tokenizer.texts_to_sequences(cleaned), maxlen=max_len, padding="post"
    )
    proba = model.predict(padded)
    positive = (proba[:, 1] >= threshold).astype(int)
    labels = label_encoder.inverse_transform(positive)
    return labels, proba
def predict_transformer(names: List[str], threshold: float, max_len=6):
    """
    Predict gender labels for *names* with the pre-trained transformer model.

    Loads the saved Keras model, tokenizer and label encoder, normalizes and
    tokenizes the names, pads the sequences to *max_len*, and thresholds the
    positive-class probability to pick a label.

    :param names: names to classify.
    :type names: List[str]
    :param threshold: probabilities >= this value map to the positive class.
    :type threshold: float
    :param max_len: pad/truncate length for tokenized names, default 6.
    :type max_len: int, optional
    :return: tuple of (decoded labels, per-class probability array).
    :rtype: Tuple[List[str], numpy.ndarray]
    """
    model = tf.keras.models.load_model(
        os.path.join(GENDER_MODELS_DIR, "transformer.h5")
    )
    tokenizer: Tokenizer = load_pickle(
        os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl")
    )
    label_encoder = load_pickle(
        os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl")
    )
    cleaned = [raw.lower().strip() for raw in names]
    padded = pad_sequences(
        tokenizer.texts_to_sequences(cleaned), maxlen=max_len, padding="post"
    )
    proba = model.predict(padded)
    positive = (proba[:, 1] >= threshold).astype(int)
    labels = label_encoder.inverse_transform(positive)
    return labels, proba
def main():
    """CLI entry point: predict gender for one or more names with the chosen
    model and print each name's label and class probabilities."""
    parser = argparse.ArgumentParser(description="Predict gender from names using trained model")
    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
    parser.add_argument("--name", nargs="+", required=True, help="One or more names")
    parser.add_argument("--threshold", type=float, default=0.5, help="Threshold for classification")
    args = parser.parse_args()

    # argparse `choices` already restricts --model; the ValueError is defensive.
    predictors = {
        "logreg": predict_logreg,
        "lstm": predict_lstm,
        "transformer": predict_transformer,
    }
    predict = predictors.get(args.model)
    if predict is None:
        raise ValueError(f"Unsupported model type: {args.model}")
    labels, proba = predict(args.name, args.threshold)

    for name, label, p in zip(args.name, labels, proba):
        # Fix: the original printed f"{name}{labels[i]}" with no separator,
        # fusing the name and label (a "→" glyph was likely lost).
        # NOTE(review): p[0]/p[1] are assumed to be the female/male class
        # probabilities in label-encoder order — confirm against training.
        print(f"{name} -> {label} | P(f): {p[0]:.2f} | P(m): {p[1]:.2f}")
if __name__ == "__main__":
    main()