feat: balanced dataset loading

2025-06-30 01:32:10 +02:00
parent eb139ee09a
commit 0888d94596
9 changed files with 306 additions and 614 deletions
@@ -1,8 +1,6 @@
 import argparse
-import json
 import os

-import pandas as pd
 import tensorflow as tf
 from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, confusion_matrix
@@ -12,47 +10,12 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences
 from misc import GENDER_MODELS_DIR, load_csv_dataset, save_json_dataset, load_pickle, GENDER_RESULT_DIR


-def load_dataset(path="names.csv", size=None):
-    """
-    Loads a dataset from a CSV file, processes it to remove missing values
-    and standardizes the case and formatting of specific columns.
-
-    :param path: The path to the CSV file containing the dataset. Defaults to "names.csv".
-    :type path: str
-    :param size: The number of rows to load from the dataset. If None, the whole dataset is loaded.
-    :type size: Optional[int]
-    :return: A pandas DataFrame with the processed dataset where missing values in the
-             'name' and 'sex' columns are removed, and the text in these columns is
-             converted to lowercase and stripped of leading/trailing whitespace.
-    :rtype: pandas.DataFrame
-    """
-    df = pd.DataFrame(load_csv_dataset(path, size)).dropna(subset=["name", "sex"])
-    df["name"] = df["name"].str.lower().str.strip()
-    df["sex"] = df["sex"].str.lower().str.strip()
-    return df
-
-
 def evaluate_logreg(df, threshold):
    """
    Evaluates a logistic regression model with the given DataFrame and threshold. The function loads
    a pre-trained model and label encoder, transforms the input data into the required format, and
    performs predictions. It returns the true labels, predicted labels, predicted probabilities, and
    the encoder class labels.
-
-    :param df: Input data containing a column "name" for names to evaluate and a column "sex"
-        for true labels.
-        Type: pandas.DataFrame
-
-    :param threshold: Threshold value used for classifying the predictions. Probabilities greater
-        than or equal to this value are classified into the positive class.
-        Type: float
-
-    :return: A tuple containing:
-        - y_true: True labels after encoding.
-        - y_pred: Predicted binary class labels based on the threshold.
-        - proba[:, 1]: Probability values for the positive class.
-        - encoder.classes_: Labels used by the label encoder.
-        Type: tuple (numpy.ndarray, int, numpy.ndarray, numpy.ndarray)
    """
    model = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
@@ -60,7 +23,7 @@ def evaluate_logreg(df, threshold):
    X = df["name"].tolist()
    y_true = encoder.transform(df["sex"])
    proba = model.predict_proba(X)
-    y_pred = 1 if proba[:, 1] >= threshold else 0
+    y_pred = (proba[:, 1] >= threshold).astype(int)
    return y_true, y_pred, proba[:, 1], encoder.classes_


@@ -68,23 +31,6 @@ def evaluate_lstm(df, threshold, max_len=6):
    """
    Evaluates the predictions of a pre-trained BiLSTM model on the given dataset and
    returns the true labels, predicted labels, prediction probabilities, and class names.
-
-    :param df: Input DataFrame containing the data for evaluation.
-               The DataFrame must have two columns: "name" containing
-               the input text data and "sex" containing the true labels.
-    :type df: Pandas.DataFrame
-    :param threshold: Decision threshold for determining binary classification
-                      outcome based on model's prediction probabilities.
-    :type threshold: Float
-    :param max_len: The maximum length of input sequences. Used to pad or truncate
-                    tokenized sequences. Default value is 6.
-    :type max_len: Int
-    :return: A tuple containing the following elements:
-             - y_true: The true labels from the input DataFrame.
-             - y_pred: The predicted binary labels according to the decision threshold.
-             - proba: Prediction probabilities for the positive class, as output by the model.
-             - encoder.classes_: An array of class names corresponding to the label encoding.
-    :rtype: Tuple
    """
    model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
    tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
@@ -94,7 +40,7 @@ def evaluate_lstm(df, threshold, max_len=6):
    X = pad_sequences(sequences, maxlen=max_len, padding="post")
    y_true = encoder.transform(df["sex"])
    proba = model.predict(X)
-    y_pred = 1 if proba[:, 1] >= threshold else 0
+    y_pred = (proba[:, 1] >= threshold).astype(int)
    return y_true, y_pred, proba[:, 1], encoder.classes_


@@ -105,18 +51,6 @@ def evaluate_transformer(df, threshold, max_len=6):
    tokenizing and padding the "name" column and encodes the "sex" column to numerical format.
    The function then predicts the probabilities for the given names using the transformer model
    and generates predictions based on the specified threshold.
-
-    :param df: Pandas DataFrame containing a "name" column with strings to be evaluated
-        and a "sex" column with corresponding target labels.
-    :type df: Pd.DataFrame
-    :param threshold: Threshold value used to determine binary classification labels
-        from predicted probabilities.
-    :type threshold: Float
-    :param max_len: Maximum length for padded sequences, default is 6.
-    :type max_len: Int, optional
-    :return: A tuple containing the ground truth labels, predicted labels, predicted
-        probabilities for the positive class, and a list of the label classes.
-    :rtype: Tuple
    """
    model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
    tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
@@ -126,7 +60,7 @@ def evaluate_transformer(df, threshold, max_len=6):
    X = pad_sequences(sequences, maxlen=max_len, padding="post")
    y_true = encoder.transform(df["sex"])
    proba = model.predict(X)
-    y_pred = 1 if proba[:, 1] >= threshold else 0
+    y_pred = (proba[:, 1] >= threshold).astype(int)
    return y_true, y_pred, proba[:, 1], encoder.classes_


@@ -135,19 +69,6 @@ def compute_metrics(y_true, y_pred, y_proba, class_names):
    Computes classification metrics for given true and predicted labels, along with
    class probabilities and class names. The function calculates accuracy, precision,
    recall, F1 score, and confusion matrix for evaluating model performance.
-
-    :param y_true: Ground truth (correct) labels.
-    :type y_true: list or numpy.ndarray
-    :param y_pred: Predicted labels, as returned by a classifier.
-    :type y_pred: list or numpy.ndarray
-    :param y_proba: Predicted probabilities for positive class.
-    :type y_proba: list or numpy.ndarray
-    :param class_names: Names of the classes corresponding to labels in the confusion
-        matrix.
-    :type class_names: numpy.ndarray
-    :return: A dictionary containing computed accuracy, precision, recall, F1 score,
-        and confusion matrix with labels and matrix elements.
-    :rtype: dict
    """
    acc = accuracy_score(y_true, y_pred)
    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
@@ -168,20 +89,22 @@ def compute_metrics(y_true, y_pred, y_proba, class_names):
 def main():
    parser = argparse.ArgumentParser(description="Evaluate gender prediction model")
    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
-    parser.add_argument("--dataset", default="names.csv")
-    parser.add_argument("--size", type=int)
-    parser.add_argument("--threshold", type=float, default=0.5)
+    parser.add_argument("--dataset", default="names_featured.csv", help="Path to the dataset CSV file")
+    parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
+    parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
+    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
    args = parser.parse_args()

-    df = load_dataset(args.dataset, args.size)
+    df = load_csv_dataset(args.dataset, args.size, args.balanced)

-    if args.model == "logreg":
-        y_true, y_pred, y_proba, classes = evaluate_logreg(df, args.threshold)
-    elif args.model == "lstm":
-        y_true, y_pred, y_proba, classes = evaluate_lstm(df, args.threshold)
-    elif args.model == "transformer":
-        y_true, y_pred, y_proba, classes = evaluate_transformer(df, args.threshold)
-    else:
+    model_funcs = {
+        "logreg": evaluate_logreg,
+        "lstm": evaluate_lstm,
+        "transformer": evaluate_transformer,
+    }
+    try:
+        y_true, y_pred, y_proba, classes = model_funcs[args.model](df, args.threshold)
+    except KeyError:
        raise ValueError(f"Unknown model: {args.model}")

    results = compute_metrics(y_true, y_pred, y_proba, classes)