diff --git a/misc/__init__.py b/misc/__init__.py
index 2f1f320..b669c35 100644
--- a/misc/__init__.py
+++ b/misc/__init__.py
@@ -1,6 +1,7 @@
 import csv
 import json
 import os
+import pickle
 from datetime import datetime
 from typing import Optional
 
@@ -10,12 +11,10 @@ DATA_DIR = os.path.join(ROOT_DIR, 'dataset')
 
 MODELS_DIR = os.path.join(ROOT_DIR, 'models')
 GENDER_MODELS_DIR = os.path.join(MODELS_DIR, 'gender')
+GENDER_RESULT_DIR = os.path.join(ROOT_DIR, 'gender', 'results')
+
 NER_MODELS_DIR = os.path.join(MODELS_DIR, 'ner')
-
-# Training
-TRAINING_EPOCHS = 5
-MODEL_NAME = f"./models/ners-{datetime.now().strftime('%Y%m%d%H%M%S')}"
-
+NER_RESULT_DIR = os.path.join(ROOT_DIR, 'ner', 'results')
 
 def clean_spacing(filename: str) -> Optional[str]:
     try:
@@ -57,3 +56,20 @@ def save_json_dataset(data: list, path: str) -> None:
     print(f">> Saving JSON dataset to {path}")
     with open(os.path.join(DATA_DIR, path), "w", encoding="utf-8") as f:
         json.dump(data, f, ensure_ascii=False, separators=(',', ':'))
+
+
+def save_pickle(obj, path):
+    """Serialize ``obj`` to ``path`` with pickle, creating parent directories if needed."""
+    # ``dirname`` is "" for a bare filename and ``os.makedirs("")`` raises, so fall back to ".".
+    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
+    with open(path, "wb") as f:
+        pickle.dump(obj, f)
+
+
+def load_pickle(path: str):
+    """Load and return the object pickled at ``path``.
+
+    NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
+    """
+    with open(path, "rb") as f:
+        return pickle.load(f)
diff --git a/processing/__init__.py b/ners/__init__.py
similarity index 100%
rename from processing/__init__.py
rename to ners/__init__.py
diff --git a/processing/gender/__init__.py b/ners/gender/__init__.py
similarity index 100%
rename from processing/gender/__init__.py
rename to ners/gender/__init__.py
diff --git a/ners/gender/eval.py b/ners/gender/eval.py
new file mode 100644
index 0000000..1e9533a
--- /dev/null
+++ b/ners/gender/eval.py
@@ -0,0 +1,192 @@
+import argparse
+import json
+import os
+
+import pandas as pd
+import tensorflow as tf
+from sklearn.metrics import (
+    accuracy_score, precision_recall_fscore_support, confusion_matrix
+)
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+from misc import GENDER_MODELS_DIR, load_csv_dataset, save_json_dataset, load_pickle, GENDER_RESULT_DIR
+
+
+def load_dataset(path="names.csv", size=None):
+    """
+    Loads a dataset from a CSV file, processes it to remove missing values
+    and standardizes the case and formatting of specific columns.
+
+    :param path: The path to the CSV file containing the dataset. Defaults to "names.csv".
+    :type path: str
+    :param size: The number of rows to load from the dataset. If None, the whole dataset is loaded.
+    :type size: Optional[int]
+    :return: A pandas DataFrame with the processed dataset where missing values in the
+             'name' and 'sex' columns are removed, and the text in these columns is
+             converted to lowercase and stripped of leading/trailing whitespace.
+    :rtype: pandas.DataFrame
+    """
+    df = pd.DataFrame(load_csv_dataset(path, size)).dropna(subset=["name", "sex"])
+    df["name"] = df["name"].str.lower().str.strip()
+    df["sex"] = df["sex"].str.lower().str.strip()
+    return df
+
+
+def evaluate_logreg(df, threshold):
+    """
+    Evaluates a logistic regression model with the given DataFrame and threshold. The function loads
+    a pre-trained model and label encoder, transforms the input data into the required format, and
+    performs predictions. It returns the true labels, predicted labels, predicted probabilities, and
+    the encoder class labels.
+
+    :param df: Input data containing a column "name" for names to evaluate and a column "sex"
+               for true labels.
+               Type: pandas.DataFrame
+
+    :param threshold: Threshold value used for classifying the predictions. Probabilities greater
+                      than or equal to this value are classified into the positive class.
+                      Type: float
+
+    :return: A tuple containing:
+             - y_true: True labels after encoding.
+             - y_pred: Predicted binary class labels (one per row) based on the threshold.
+             - proba[:, 1]: Probability values for the positive class.
+             - encoder.classes_: Labels used by the label encoder.
+             Type: tuple (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)
+    """
+    model = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
+    encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
+
+    X = df["name"].tolist()
+    y_true = encoder.transform(df["sex"])
+    proba = model.predict_proba(X)
+    y_pred = (proba[:, 1] >= threshold).astype(int)  # element-wise; `1 if array else 0` raises
+    return y_true, y_pred, proba[:, 1], encoder.classes_
+
+
+def evaluate_lstm(df, threshold, max_len=6):
+    """
+    Evaluates the predictions of a pre-trained BiLSTM model on the given dataset and
+    returns the true labels, predicted labels, prediction probabilities, and class names.
+
+    :param df: Input DataFrame containing the data for evaluation.
+               The DataFrame must have two columns: "name" containing
+               the input text data and "sex" containing the true labels.
+    :type df: pandas.DataFrame
+    :param threshold: Decision threshold for determining binary classification
+                      outcome based on model's prediction probabilities.
+    :type threshold: float
+    :param max_len: The maximum length of input sequences. Used to pad or truncate
+                    tokenized sequences. Default value is 6.
+    :type max_len: int
+    :return: A tuple containing the following elements:
+             - y_true: The true labels from the input DataFrame.
+             - y_pred: The predicted binary labels according to the decision threshold.
+             - proba: Prediction probabilities for the positive class, as output by the model.
+             - encoder.classes_: An array of class names corresponding to the label encoding.
+    :rtype: tuple
+    """
+    model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "BiLSTM_model.h5"))
+    tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "BiLSTM_tokenizer.pkl"))
+    encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "BiLSTM_label_encoder.pkl"))
+
+    sequences = tokenizer.texts_to_sequences(df["name"])
+    X = pad_sequences(sequences, maxlen=max_len, padding="post")
+    y_true = encoder.transform(df["sex"])
+    proba = model.predict(X)
+    y_pred = (proba[:, 1] >= threshold).astype(int)  # element-wise; `1 if array else 0` raises
+    return y_true, y_pred, proba[:, 1], encoder.classes_
+
+
+def evaluate_transformer(df, threshold, max_len=6):
+    """
+    Evaluates the transformer model for gender prediction. The function loads a pre-trained
+    transformer model, tokenizer, and label encoder. It processes the input dataframe by
+    tokenizing and padding the "name" column and encodes the "sex" column to numerical format.
+    The function then predicts the probabilities for the given names using the transformer model
+    and generates predictions based on the specified threshold.
+
+    :param df: Pandas DataFrame containing a "name" column with strings to be evaluated
+               and a "sex" column with corresponding target labels.
+    :type df: pandas.DataFrame
+    :param threshold: Threshold value used to determine binary classification labels
+                      from predicted probabilities.
+    :type threshold: float
+    :param max_len: Maximum length for padded sequences, default is 6.
+    :type max_len: int, optional
+    :return: A tuple containing the ground truth labels, predicted labels, predicted
+             probabilities for the positive class, and a list of the label classes.
+    :rtype: tuple
+    """
+    model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.h5"))
+    tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
+    encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl"))
+
+    sequences = tokenizer.texts_to_sequences(df["name"])
+    X = pad_sequences(sequences, maxlen=max_len, padding="post")
+    y_true = encoder.transform(df["sex"])
+    proba = model.predict(X)
+    y_pred = (proba[:, 1] >= threshold).astype(int)  # element-wise; `1 if array else 0` raises
+    return y_true, y_pred, proba[:, 1], encoder.classes_
+
+
+def compute_metrics(y_true, y_pred, y_proba, class_names):
+    """
+    Computes classification metrics for given true and predicted labels, along with
+    class probabilities and class names. The function calculates accuracy, precision,
+    recall, F1 score, and confusion matrix for evaluating model performance.
+
+    :param y_true: Ground truth (correct) labels.
+    :type y_true: list or numpy.ndarray
+    :param y_pred: Predicted labels, as returned by a classifier.
+    :type y_pred: list or numpy.ndarray
+    :param y_proba: Predicted probabilities for positive class.
+    :type y_proba: list or numpy.ndarray
+    :param class_names: Names of the classes corresponding to labels in the confusion
+                        matrix.
+    :type class_names: numpy.ndarray
+    :return: A dictionary containing computed accuracy, precision, recall, F1 score,
+             and confusion matrix with labels and matrix elements.
+    :rtype: dict
+    """
+    acc = accuracy_score(y_true, y_pred)
+    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
+    cm = confusion_matrix(y_true, y_pred).tolist()
+
+    return {
+        "accuracy": acc,
+        "precision": pr,
+        "recall": rc,
+        "f1": f1,
+        "confusion_matrix": {
+            "labels": class_names.tolist(),
+            "matrix": cm
+        }
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Evaluate gender prediction model")
+    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
+    parser.add_argument("--dataset", default="names.csv")
+    parser.add_argument("--size", type=int)
+    parser.add_argument("--threshold", type=float, default=0.5)
+    args = parser.parse_args()
+
+    df = load_dataset(args.dataset, args.size)
+
+    if args.model == "logreg":
+        y_true, y_pred, y_proba, classes = evaluate_logreg(df, args.threshold)
+    elif args.model == "lstm":
+        y_true, y_pred, y_proba, classes = evaluate_lstm(df, args.threshold)
+    elif args.model == "transformer":
+        y_true, y_pred, y_proba, classes = evaluate_transformer(df, args.threshold)
+    else:
+        raise ValueError(f"Unknown model: {args.model}")
+
+    results = compute_metrics(y_true, y_pred, y_proba, classes)
+    save_json_dataset(results, os.path.join(GENDER_RESULT_DIR, f'{args.model}_eval'))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ners/gender/predict.py b/ners/gender/predict.py
new file mode 100644
index 0000000..46a44c0
--- /dev/null
+++ b/ners/gender/predict.py
@@ -0,0 +1,150 @@
+import argparse
+import os
+from typing import List
+
+import tensorflow as tf
+from sklearn.pipeline import Pipeline
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+
+from misc import GENDER_MODELS_DIR, load_pickle
+
+
+def predict_logreg(names: List[str], threshold: float):
+    """
+    Predict gender labels for given names using a logistic regression model.
+
+    The function takes in a list of names and predicts the gender labels
+    based on a logistic regression model. A probabilistic threshold is used
+    to classify the names into one of the defined labels.
+
+    :param names:
+        A list of names for which the gender needs to be predicted. Each
+        name must be a string.
+    :param threshold:
+        A float value representing the threshold for classification. Names
+        with predicted probabilities greater than or equal to this value
+        will be classified into the positive class.
+    :return:
+        A tuple containing the predicted gender labels and their
+        corresponding probabilities. The first element of the tuple is a
+        list of predicted labels, while the second element is an array of
+        probability scores for each label.
+    """
+    model_path = os.path.join(GENDER_MODELS_DIR, "regression_model.pkl")
+    encoder_path = os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl")
+
+    model: Pipeline = load_pickle(model_path)
+    label_encoder = load_pickle(encoder_path)
+
+    X = [name.lower().strip() for name in names]
+    proba = model.predict_proba(X)
+    pred = (proba[:, 1] >= threshold).astype(int)
+    labels = label_encoder.inverse_transform(pred)
+    return labels, proba
+
+
+def predict_lstm(names: List[str], threshold: float, max_len=6):
+    """
+    Predicts gender labels and probabilities for a list of names using a pre-trained BiLSTM model.
+
+    The function loads the model, tokenizer, and label encoder, performs preprocessing on the input
+    names, and then uses the loaded model to predict gender probabilities. Based on the threshold
+    value, it determines the predicted gender labels.
+
+    :param names: List of names to be classified.
+    :type names: List[str]
+    :param threshold: Probability threshold for classifying gender. If the predicted probability for the
+        'positive' class is greater than or equal to this threshold, it is classified accordingly.
+    :type threshold: float
+    :param max_len: Maximum length for name sequences. Names longer than this will be truncated, and shorter
+        ones will be padded. Default value is 6.
+    :type max_len: int, optional
+
+    :return: A tuple containing predicted labels and associated probabilities. Labels are the predicted gender
+        categories, and probabilities are the prediction scores for each input name.
+    :rtype: Tuple[numpy.ndarray, numpy.ndarray]
+    """
+    model_path = os.path.join(GENDER_MODELS_DIR, "BiLSTM_model.h5")
+    tokenizer_path = os.path.join(GENDER_MODELS_DIR, "BiLSTM_tokenizer.pkl")
+    encoder_path = os.path.join(GENDER_MODELS_DIR, "BiLSTM_label_encoder.pkl")
+
+    model = tf.keras.models.load_model(model_path)
+    tokenizer: Tokenizer = load_pickle(tokenizer_path)
+    label_encoder = load_pickle(encoder_path)
+
+    X = tokenizer.texts_to_sequences([n.lower().strip() for n in names])
+    X = pad_sequences(X, maxlen=max_len, padding="post")
+    proba = model.predict(X)
+    pred = (proba[:, 1] >= threshold).astype(int)
+    labels = label_encoder.inverse_transform(pred)
+    return labels, proba
+
+
+def predict_transformer(names: List[str], threshold: float, max_len=6):
+    """
+    Predicts gender labels for the provided names using a pre-trained transformer model.
+
+    This function loads a pre-trained transformer model along with its tokenizer and label
+    encoder, converts input names into tokenized sequences, and processes them to generate
+    gender predictions. The function returns the predicted labels and the associated
+    probabilities for each sample.
+
+    :param names: List of names to predict gender labels for.
+    :type names: List[str]
+    :param threshold: Threshold value to determine the prediction class. Probability values
+        above or equal to the threshold will be assigned to one class, and those below to
+        another.
+    :type threshold: float
+    :param max_len: Maximum length for the sequences. Names will be truncated or padded to
+        this length during processing, default is 6.
+    :type max_len: int, optional
+    :return: A tuple containing two elements: a list of predicted gender labels as strings
+        and a NumPy array of probabilities for each gender class (where the first index
+        corresponds to one class, and the second index corresponds to another).
+    :rtype: Tuple[List[str], numpy.ndarray]
+    """
+    model_path = os.path.join(GENDER_MODELS_DIR, "transformer.h5")
+    tokenizer_path = os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl")
+    encoder_path = os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl")
+
+    model = tf.keras.models.load_model(model_path)
+    tokenizer: Tokenizer = load_pickle(tokenizer_path)
+    label_encoder = load_pickle(encoder_path)
+
+    X = tokenizer.texts_to_sequences([n.lower().strip() for n in names])
+    X = pad_sequences(X, maxlen=max_len, padding="post")
+    proba = model.predict(X)
+    pred = (proba[:, 1] >= threshold).astype(int)
+    labels = label_encoder.inverse_transform(pred)
+    return labels, proba
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Predict gender from names using trained model")
+    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
+    parser.add_argument("--name", nargs="+", required=True, help="One or more names")
+    parser.add_argument("--threshold", type=float, default=0.5, help="Threshold for classification")
+    args = parser.parse_args()
+
+    model = args.model
+    names = args.name
+    threshold = args.threshold
+
+    if model == "logreg":
+        labels, proba = predict_logreg(names, threshold)
+    elif model == "lstm":
+        labels, proba = predict_lstm(names, threshold)
+    elif model == "transformer":
+        labels, proba = predict_transformer(names, threshold)
+    else:
+        raise ValueError(f"Unsupported model type: {model}")
+
+    for i, name in enumerate(names):
+        p_female = proba[i][0]
+        p_male = proba[i][1]
+        print(f"{name} → {labels[i]} | P(f): {p_female:.2f} | P(m): {p_male:.2f}")
+
+
+if __name__ == "__main__":
+    main()