feat: improve inference for logreg model

This commit is contained in:
2025-06-21 10:34:26 +02:00
parent 33d096f8ff
commit a46a5f7924
5 changed files with 356 additions and 5 deletions
+14 -5
View File
@@ -1,6 +1,7 @@
import csv
import json
import os
import pickle
from datetime import datetime
from typing import Optional
@@ -10,12 +11,10 @@ DATA_DIR = os.path.join(ROOT_DIR, 'dataset')
MODELS_DIR = os.path.join(ROOT_DIR, 'models')
GENDER_MODELS_DIR = os.path.join(MODELS_DIR, 'gender')
GENDER_RESULT_DIR = os.path.join(ROOT_DIR, 'gender', 'results')
NER_MODELS_DIR = os.path.join(MODELS_DIR, 'ner')
# Training
TRAINING_EPOCHS = 5
MODEL_NAME = f"./models/ners-{datetime.now().strftime('%Y%m%d%H%M%S')}"
NER_RESULT_DIR = os.path.join(ROOT_DIR, 'ner', 'results')
def clean_spacing(filename: str) -> Optional[str]:
try:
@@ -57,3 +56,13 @@ def save_json_dataset(data: list, path: str) -> None:
print(f">> Saving JSON dataset to {path}")
with open(os.path.join(DATA_DIR, path), "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, separators=(',', ':'))
def save_pickle(obj, path):
    """Pickle *obj* to *path*, creating the parent directory if needed.

    :param obj: any picklable object.
    :param path: destination file path; its dirname is created with
        ``exist_ok=True`` before writing.
    """
    parent = os.path.dirname(path)
    os.makedirs(parent, exist_ok=True)
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)
def load_pickle(path: str):
    """Read and return the pickled object stored at *path*.

    :param path: file previously written with :func:`save_pickle`.
    :return: the deserialized object.
    """
    with open(path, "rb") as fh:
        obj = pickle.load(fh)
    return obj
+192
View File
@@ -0,0 +1,192 @@
import argparse
import json
import os
import pandas as pd
import tensorflow as tf
from sklearn.metrics import (
accuracy_score, precision_recall_fscore_support, confusion_matrix
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_json_dataset, load_pickle, GENDER_RESULT_DIR
def load_dataset(path="names.csv", size=None):
    """
    Load and normalize the names dataset from a CSV file.

    Reads up to *size* rows (all rows when ``None``), drops records that are
    missing either "name" or "sex", and lowercases/strips both columns so
    downstream encoders see a consistent format.

    :param path: path of the CSV dataset, defaults to "names.csv".
    :type path: str
    :param size: optional cap on the number of rows to load.
    :type size: Optional[int]
    :return: cleaned DataFrame with normalized "name" and "sex" columns.
    :rtype: pandas.DataFrame
    """
    frame = pd.DataFrame(load_csv_dataset(path, size))
    frame = frame.dropna(subset=["name", "sex"])
    for column in ("name", "sex"):
        frame[column] = frame[column].str.lower().str.strip()
    return frame
def evaluate_logreg(df, threshold):
    """
    Evaluates the pickled logistic-regression gender model on *df*.

    Loads the pre-trained pipeline and label encoder, encodes the true
    labels, predicts class probabilities for the names, and thresholds the
    positive-class probability element-wise.

    :param df: data with a "name" column (inputs) and a "sex" column
        (true labels).
    :type df: pandas.DataFrame
    :param threshold: probabilities >= this value are classified as the
        positive class.
    :type threshold: float
    :return: tuple of (encoded true labels, binary predictions,
        positive-class probabilities, encoder class names).
    :rtype: tuple
    """
    model = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
    X = df["name"].tolist()
    y_true = encoder.transform(df["sex"])
    proba = model.predict_proba(X)
    # Fix: `1 if proba[:, 1] >= threshold else 0` evaluates the truth value of
    # a whole array and raises ValueError; threshold element-wise instead
    # (same idiom as the companion predict script).
    y_pred = (proba[:, 1] >= threshold).astype(int)
    return y_true, y_pred, proba[:, 1], encoder.classes_
def evaluate_lstm(df, threshold, max_len=6):
    """
    Evaluates the pre-trained BiLSTM gender model on *df*.

    Loads the saved Keras model, tokenizer and label encoder, tokenizes and
    pads the "name" column to *max_len*, encodes the "sex" column, and
    thresholds the positive-class probability element-wise.

    :param df: data with "name" (text) and "sex" (true label) columns.
    :type df: pandas.DataFrame
    :param threshold: decision threshold on the positive-class probability.
    :type threshold: float
    :param max_len: pad/truncate length for tokenized names, default 6.
    :type max_len: int
    :return: tuple of (encoded true labels, binary predictions,
        positive-class probabilities, encoder class names).
    :rtype: tuple
    """
    model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "BiLSTM_model.h5"))
    tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "BiLSTM_tokenizer.pkl"))
    encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "BiLSTM_label_encoder.pkl"))
    sequences = tokenizer.texts_to_sequences(df["name"])
    X = pad_sequences(sequences, maxlen=max_len, padding="post")
    y_true = encoder.transform(df["sex"])
    # NOTE(review): indexing proba[:, 1] assumes the model outputs two class
    # probabilities per row — confirm against the training architecture.
    proba = model.predict(X)
    # Fix: `1 if proba[:, 1] >= threshold else 0` evaluates the truth value of
    # a whole array and raises ValueError; threshold element-wise instead.
    y_pred = (proba[:, 1] >= threshold).astype(int)
    return y_true, y_pred, proba[:, 1], encoder.classes_
def evaluate_transformer(df, threshold, max_len=6):
    """
    Evaluates the pre-trained transformer gender model on *df*.

    Loads the saved Keras model, tokenizer and label encoder, tokenizes and
    pads the "name" column to *max_len*, encodes the "sex" column, and
    thresholds the positive-class probability element-wise.

    :param df: data with "name" (text) and "sex" (true label) columns.
    :type df: pandas.DataFrame
    :param threshold: decision threshold on the positive-class probability.
    :type threshold: float
    :param max_len: pad/truncate length for tokenized names, default 6.
    :type max_len: int
    :return: tuple of (encoded true labels, binary predictions,
        positive-class probabilities, encoder class names).
    :rtype: tuple
    """
    model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.h5"))
    tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
    encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl"))
    sequences = tokenizer.texts_to_sequences(df["name"])
    X = pad_sequences(sequences, maxlen=max_len, padding="post")
    y_true = encoder.transform(df["sex"])
    # NOTE(review): indexing proba[:, 1] assumes the model outputs two class
    # probabilities per row — confirm against the training architecture.
    proba = model.predict(X)
    # Fix: `1 if proba[:, 1] >= threshold else 0` evaluates the truth value of
    # a whole array and raises ValueError; threshold element-wise instead.
    y_pred = (proba[:, 1] >= threshold).astype(int)
    return y_true, y_pred, proba[:, 1], encoder.classes_
def compute_metrics(y_true, y_pred, y_proba, class_names):
    """
    Builds a binary-classification metrics report.

    Computes accuracy, precision, recall, F1 (binary averaging) and the
    confusion matrix for the given labels.

    :param y_true: ground-truth labels.
    :type y_true: list or numpy.ndarray
    :param y_pred: predicted labels.
    :type y_pred: list or numpy.ndarray
    :param y_proba: positive-class probabilities (currently unused; kept for
        interface compatibility with callers).
    :type y_proba: list or numpy.ndarray
    :param class_names: class names for the confusion-matrix labels.
    :type class_names: numpy.ndarray
    :return: dict with accuracy, precision, recall, f1 and confusion matrix.
    :rtype: dict
    """
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )
    report = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "confusion_matrix": {
            "labels": class_names.tolist(),
            "matrix": confusion_matrix(y_true, y_pred).tolist(),
        },
    }
    return report
def main():
    """CLI entry point: evaluate one gender model on a CSV dataset and save
    the metrics report as JSON under the gender results directory."""
    parser = argparse.ArgumentParser(description="Evaluate gender prediction model")
    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
    parser.add_argument("--dataset", default="names.csv")
    parser.add_argument("--size", type=int)
    parser.add_argument("--threshold", type=float, default=0.5)
    args = parser.parse_args()

    df = load_dataset(args.dataset, args.size)
    # Dispatch table instead of an if/elif chain; argparse `choices` already
    # restricts --model, so the ValueError below is a defensive guard.
    evaluators = {
        "logreg": evaluate_logreg,
        "lstm": evaluate_lstm,
        "transformer": evaluate_transformer,
    }
    evaluate = evaluators.get(args.model)
    if evaluate is None:
        raise ValueError(f"Unknown model: {args.model}")
    y_true, y_pred, y_proba, classes = evaluate(df, args.threshold)

    results = compute_metrics(y_true, y_pred, y_proba, classes)
    save_json_dataset(results, os.path.join(GENDER_RESULT_DIR, f'{args.model}_eval'))
if __name__ == "__main__":
    main()
+150
View File
@@ -0,0 +1,150 @@
import argparse
import os
from typing import List
import tensorflow as tf
from sklearn.pipeline import Pipeline
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from misc import GENDER_MODELS_DIR, load_pickle
def predict_logreg(names: List[str], threshold: float):
    """
    Predict gender labels for *names* with the pickled logistic-regression
    pipeline.

    Normalizes each name (lowercase, stripped), obtains class probabilities
    from the model, and thresholds the positive-class probability to pick a
    label via the stored label encoder.

    :param names: names to classify.
    :param threshold: probabilities >= this value map to the positive class.
    :return: tuple of (decoded labels, per-class probability array).
    """
    model: Pipeline = load_pickle(
        os.path.join(GENDER_MODELS_DIR, "regression_model.pkl")
    )
    label_encoder = load_pickle(
        os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl")
    )
    cleaned = [raw.lower().strip() for raw in names]
    proba = model.predict_proba(cleaned)
    positive = (proba[:, 1] >= threshold).astype(int)
    labels = label_encoder.inverse_transform(positive)
    return labels, proba
def predict_lstm(names: List[str], threshold: float, max_len=6):
    """
    Predict gender labels for *names* with the pre-trained BiLSTM model.

    Loads the saved Keras model, tokenizer and label encoder, normalizes and
    tokenizes the names, pads the sequences to *max_len*, and thresholds the
    positive-class probability to pick a label.

    :param names: names to classify.
    :type names: List[str]
    :param threshold: probabilities >= this value map to the positive class.
    :type threshold: float
    :param max_len: pad/truncate length for tokenized names, default 6.
    :type max_len: int, optional
    :return: tuple of (decoded labels, per-class probability array).
    :rtype: Tuple[numpy.ndarray, numpy.ndarray]
    """
    model = tf.keras.models.load_model(
        os.path.join(GENDER_MODELS_DIR, "BiLSTM_model.h5")
    )
    tokenizer: Tokenizer = load_pickle(
        os.path.join(GENDER_MODELS_DIR, "BiLSTM_tokenizer.pkl")
    )
    label_encoder = load_pickle(
        os.path.join(GENDER_MODELS_DIR, "BiLSTM_label_encoder.pkl")
    )
    cleaned = [raw.lower().strip() for raw in names]
    padded = pad_sequences(
        tokenizer.texts_to_sequences(cleaned), maxlen=max_len, padding="post"
    )
    proba = model.predict(padded)
    positive = (proba[:, 1] >= threshold).astype(int)
    labels = label_encoder.inverse_transform(positive)
    return labels, proba
def predict_transformer(names: List[str], threshold: float, max_len=6):
    """
    Predict gender labels for *names* with the pre-trained transformer model.

    Loads the saved Keras model, tokenizer and label encoder, normalizes and
    tokenizes the names, pads the sequences to *max_len*, and thresholds the
    positive-class probability to pick a label.

    :param names: names to classify.
    :type names: List[str]
    :param threshold: probabilities >= this value map to the positive class.
    :type threshold: float
    :param max_len: pad/truncate length for tokenized names, default 6.
    :type max_len: int, optional
    :return: tuple of (decoded labels, per-class probability array).
    :rtype: Tuple[List[str], numpy.ndarray]
    """
    model = tf.keras.models.load_model(
        os.path.join(GENDER_MODELS_DIR, "transformer.h5")
    )
    tokenizer: Tokenizer = load_pickle(
        os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl")
    )
    label_encoder = load_pickle(
        os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl")
    )
    cleaned = [raw.lower().strip() for raw in names]
    padded = pad_sequences(
        tokenizer.texts_to_sequences(cleaned), maxlen=max_len, padding="post"
    )
    proba = model.predict(padded)
    positive = (proba[:, 1] >= threshold).astype(int)
    labels = label_encoder.inverse_transform(positive)
    return labels, proba
def main():
    """CLI entry point: predict gender for one or more names with the chosen
    model and print each name's label and class probabilities."""
    parser = argparse.ArgumentParser(description="Predict gender from names using trained model")
    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
    parser.add_argument("--name", nargs="+", required=True, help="One or more names")
    parser.add_argument("--threshold", type=float, default=0.5, help="Threshold for classification")
    args = parser.parse_args()

    # argparse `choices` already restricts --model; the ValueError is defensive.
    predictors = {
        "logreg": predict_logreg,
        "lstm": predict_lstm,
        "transformer": predict_transformer,
    }
    predict = predictors.get(args.model)
    if predict is None:
        raise ValueError(f"Unsupported model type: {args.model}")
    labels, proba = predict(args.name, args.threshold)

    for name, label, p in zip(args.name, labels, proba):
        # Fix: the original printed f"{name}{labels[i]}" with no separator,
        # fusing the name and label (a "→" glyph was likely lost).
        # NOTE(review): p[0]/p[1] are assumed to be the female/male class
        # probabilities in label-encoder order — confirm against training.
        print(f"{name} -> {label} | P(f): {p[0]:.2f} | P(m): {p[1]:.2f}")
if __name__ == "__main__":
    main()