refactor: include province and annotation pipeline

This commit is contained in:
2025-07-24 12:50:30 +02:00
parent da7b09dab3
commit e2536c1899
18 changed files with 402 additions and 355 deletions
View File
+115
View File
@@ -0,0 +1,115 @@
import argparse
import os
import tensorflow as tf
from sklearn.metrics import (
accuracy_score, precision_recall_fscore_support, confusion_matrix
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_json_dataset, load_pickle, GENDER_RESULT_DIR
def evaluate_logreg(df, threshold):
    """
    Evaluate the saved logistic-regression gender model on `df`.

    Loads the pickled sklearn pipeline and label encoder from
    GENDER_MODELS_DIR, scores the "name" column, and thresholds the
    probability of class index 1.

    Returns (y_true, y_pred, positive-class probabilities, encoder classes).
    """
    pipeline = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    label_enc = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
    names = df["name"].tolist()
    gold = label_enc.transform(df["sex"])
    # NOTE(review): column 1 is treated as the positive class — verify this
    # matches the label encoder's class ordering.
    pos_proba = pipeline.predict_proba(names)[:, 1]
    predicted = (pos_proba >= threshold).astype(int)
    return gold, predicted, pos_proba, label_enc.classes_
def evaluate_lstm(df, threshold, max_len=6):
    """
    Evaluate the saved BiLSTM gender model on `df`.

    Loads the Keras model plus its pickled tokenizer and label encoder from
    GENDER_MODELS_DIR, tokenizes and pads the "name" column to `max_len`,
    and thresholds the probability of class index 1.

    Returns (y_true, y_pred, positive-class probabilities, encoder classes).
    """
    net = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
    tok = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
    label_enc = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl"))
    padded = pad_sequences(tok.texts_to_sequences(df["name"]), maxlen=max_len, padding="post")
    gold = label_enc.transform(df["sex"])
    pos_proba = net.predict(padded)[:, 1]
    predicted = (pos_proba >= threshold).astype(int)
    return gold, predicted, pos_proba, label_enc.classes_
def evaluate_transformer(df, threshold, max_len=6):
    """
    Evaluate the saved Transformer gender model on `df`.

    Loads the Keras model plus its pickled tokenizer and label encoder from
    GENDER_MODELS_DIR, tokenizes and pads the "name" column to `max_len`,
    and thresholds the probability of class index 1.

    Returns (y_true, y_pred, positive-class probabilities, encoder classes).
    """
    net = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
    tok = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
    label_enc = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl"))
    padded = pad_sequences(tok.texts_to_sequences(df["name"]), maxlen=max_len, padding="post")
    gold = label_enc.transform(df["sex"])
    pos_proba = net.predict(padded)[:, 1]
    predicted = (pos_proba >= threshold).astype(int)
    return gold, predicted, pos_proba, label_enc.classes_
def compute_metrics(y_true, y_pred, y_proba, class_names):
    """
    Build a JSON-serializable metrics dict for binary classification:
    accuracy, precision, recall, F1 (binary average), and the confusion
    matrix with its class labels.

    NOTE(review): `y_proba` is accepted but unused here — kept for interface
    stability with callers.
    """
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "confusion_matrix": {
            # class_names is expected to be an ndarray (encoder.classes_).
            "labels": class_names.tolist(),
            "matrix": confusion_matrix(y_true, y_pred).tolist(),
        },
    }
def main():
    """
    CLI entry point: evaluate a trained gender model on a CSV dataset.

    Loads the dataset, dispatches to the evaluation function selected by
    --model, computes metrics, and writes them as JSON to
    GENDER_RESULT_DIR/<model>_eval.
    """
    parser = argparse.ArgumentParser(description="Evaluate gender prediction model")
    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
    parser.add_argument("--dataset", default="names_evaluation.csv", help="Path to the dataset CSV file")
    parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
    parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
    args = parser.parse_args()
    df = load_csv_dataset(args.dataset, args.size, args.balanced)
    model_funcs = {
        "logreg": evaluate_logreg,
        "lstm": evaluate_lstm,
        "transformer": evaluate_transformer,
    }
    # argparse `choices` already guarantees args.model is a valid key, so no
    # try/except is needed. The previous `except KeyError` was harmful: a
    # KeyError raised *inside* an evaluate_* function (e.g. a missing
    # DataFrame column) would have been misreported as "Unknown model".
    y_true, y_pred, y_proba, classes = model_funcs[args.model](df, args.threshold)
    results = compute_metrics(y_true, y_pred, y_proba, classes)
    save_json_dataset(results, os.path.join(GENDER_RESULT_DIR, f'{args.model}_eval'))
if __name__ == "__main__":
    main()
+80
View File
@@ -0,0 +1,80 @@
import argparse
from dataclasses import dataclass
from typing import Optional
from sklearn.metrics import (
accuracy_score, precision_recall_fscore_support,
classification_report, confusion_matrix
)
from misc import logging
def evaluate_proba(y_true, y_proba, threshold, class_names):
    """
    Threshold positive-class probabilities and report binary-classification
    metrics: logs accuracy/precision/recall/F1, then prints the confusion
    matrix and the per-class classification report.
    """
    y_pred = (y_proba[:, 1] >= threshold).astype(int)
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
    logging.info(f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
@dataclass
class BaseConfig:
    """
    Shared configuration for the gender-model training/evaluation scripts.

    Encapsulates dataset selection, split/threshold parameters, optional
    cross-validation, and whether to persist trained artifacts. Model-specific
    Config dataclasses subclass this with their own hyperparameters.
    """
    # Path to the input CSV dataset.
    dataset_path: str = "names_featured.csv"
    # Optional cap on the number of rows loaded (None = load everything).
    size: Optional[int] = None
    # Probability threshold for assigning the positive class.
    threshold: float = 0.5
    # Number of cross-validation folds; None disables cross-validation.
    cv: Optional[int] = None
    # Persist the trained model and artifacts after training.
    save: bool = False
    # Load a class-balanced variant of the dataset.
    balanced: bool = False
    # Training epochs (used by the neural models).
    epochs: int = 10
    # Fraction of the data held out for the test split.
    test_size: float = 0.2
    # Random seed for reproducible splits and training.
    random_state: int = 42
def load_config(description: str) -> BaseConfig:
    """
    Parse the shared command-line options and return them as a BaseConfig.

    `description` is the help text shown by the CLI's --help output.

    Bug fix: `argparse.ArgumentParser(description)` passed the text as the
    first positional parameter `prog` (overriding the program name shown in
    usage messages) instead of the parser description; it is now passed
    explicitly as `description=`.
    """
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("--dataset", type=str, default="names_featured.csv", help="Path to the dataset file")
    parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
    parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
    parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
    parser.add_argument("--save", action="store_true", help="Save the model and artifacts after training")
    parser.add_argument("--epochs", type=int, default=10, help="Number of epochs for training")
    parser.add_argument("--test_size", type=float, default=0.2, help="Proportion of the dataset to include in the test split")
    parser.add_argument("--random_state", type=int, default=42, help="Random seed for reproducibility")
    args = parser.parse_args()
    return BaseConfig(
        dataset_path=args.dataset,
        size=args.size,
        threshold=args.threshold,
        cv=args.cv,
        save=args.save,
        balanced=args.balanced,
        epochs=args.epochs,
        test_size=args.test_size,
        random_state=args.random_state
    )
+123
View File
@@ -0,0 +1,123 @@
import os
from dataclasses import dataclass
from typing import Tuple
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
accuracy_score, classification_report, confusion_matrix,
precision_recall_fscore_support
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import BaseConfig, load_config, logging
@dataclass
class Config(BaseConfig):
    # Character n-gram range (min_n, max_n) for the CountVectorizer.
    ngram_range: Tuple[int, int] = (2, 5)
    # Maximum solver iterations for LogisticRegression convergence.
    max_iter: int = 1000
def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
    """
    Fit a LabelEncoder on `y` and return (encoded labels, fitted encoder).

    NOTE(review): the annotated return of pd.Series is loose —
    LabelEncoder.fit_transform actually returns a numpy array.
    """
    logging.info("Encoding labels")
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(y), label_encoder
def build_model(cfg: Config) -> Pipeline:
    """
    Assemble the classification pipeline: a character-level CountVectorizer
    (n-gram range from cfg) feeding a LogisticRegression classifier
    (max_iter from cfg).
    """
    vectorizer = CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range)
    classifier = LogisticRegression(max_iter=cfg.max_iter)
    return make_pipeline(vectorizer, classifier)
def evaluate_proba(y_true, y_proba, threshold: float, class_names):
    """
    Threshold positive-class probabilities and report binary-classification
    metrics: logs accuracy, precision, recall and F1, then prints the
    confusion matrix and the per-class classification report.

    NOTE(review): near-duplicate of pipeline.gender.models.evaluate_proba —
    consider importing the shared helper instead of keeping a local copy.
    """
    logging.info(f"Evaluating at threshold = {threshold}")
    y_pred = (y_proba[:, 1] >= threshold).astype(int)
    logging.info(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    logging.info(f"Precision: {pr:.4f}, Recall: {rc:.4f}, F1-score: {f1:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
def cross_validate(cfg: Config, X, y) -> None:
    """
    Run stratified cfg.cv-fold cross-validation with a fresh pipeline and
    log the per-fold accuracy scores, their mean, and standard deviation.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    fold_scores = cross_val_score(
        build_model(cfg), X, y,
        cv=StratifiedKFold(n_splits=cfg.cv),
        scoring="accuracy",
    )
    logging.info(f"Cross-validation scores: {fold_scores}")
    logging.info(f"Mean accuracy: {fold_scores.mean():.4f}, Std: {fold_scores.std():.4f}")
def save_artifacts(model, encoder):
    """
    Persist the trained pipeline and label encoder to GENDER_MODELS_DIR.

    Creates the target directory if it does not exist — the LSTM and
    transformer trainers already do this; previously this function could
    fail on a fresh checkout with no models directory.
    """
    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    save_pickle(model, os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
def main():
    """
    Train/evaluate the logistic-regression gender model from the CLI.

    With --cv, runs cross-validation only; otherwise trains on a stratified
    train/test split, evaluates at the configured threshold, and optionally
    persists the artifacts (--save).
    """
    cfg = Config(**vars(load_config("logistic regression model")))
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
    names, sexes = df["name"], df["sex"]
    labels, encoder = encode_labels(sexes)
    if cfg.cv:
        # Cross-validation mode: report fold scores and stop.
        cross_validate(cfg, names, labels)
        return
    X_train, X_test, y_train, y_test = train_test_split(
        names, labels, test_size=cfg.test_size,
        random_state=cfg.random_state, stratify=labels,
    )
    classifier = build_model(cfg)
    classifier.fit(X_train, y_train)
    evaluate_proba(y_test, classifier.predict_proba(X_test), cfg.threshold, class_names=encoder.classes_)
    if cfg.save:
        save_artifacts(classifier, encoder)
if __name__ == "__main__":
    main()
+144
View File
@@ -0,0 +1,144 @@
import os
from dataclasses import dataclass
from typing import Tuple
import numpy as np
import pandas as pd
from sklearn.metrics import (
accuracy_score
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ProgbarLogger
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import load_config, BaseConfig, evaluate_proba, logging
@dataclass
class Config(BaseConfig):
    # Padded sequence length for tokenized names.
    max_len: int = 6
    # Dimensionality of the Embedding layer output.
    embedding_dim: int = 64
    # Hidden units per (bidirectional) LSTM layer.
    lstm_units: int = 32
    # Mini-batch size for training.
    batch_size: int = 64
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Load the CSV dataset and turn it into model-ready arrays.

    Fits a Keras Tokenizer on the "name" column, pads the integer sequences
    to cfg.max_len, and label-encodes the "sex" column.

    Returns (padded sequences, encoded labels, tokenizer, label encoder).
    """
    logging.info("Loading and preprocessing data")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
    # NOTE(review): char_level=False tokenizes on words, so a single name is
    # one token — confirm this is intended rather than character-level input.
    tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])
    padded = pad_sequences(tokenizer.texts_to_sequences(df["name"]),
                           maxlen=cfg.max_len, padding="post")
    label_encoder = LabelEncoder()
    encoded = label_encoder.fit_transform(df["sex"])
    return padded, encoded, tokenizer, label_encoder
def build_model(cfg: Config, vocab_size: int) -> Sequential:
    """
    Build and compile the BiLSTM classifier: embedding, two bidirectional
    LSTM layers, a ReLU hidden layer, and a 2-way softmax head, compiled
    with sparse categorical crossentropy and Adam.
    """
    logging.info("Building LSTM model")
    layers = [
        Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim),
        Bidirectional(LSTM(cfg.lstm_units, return_sequences=True)),
        Bidirectional(LSTM(cfg.lstm_units)),
        Dense(64, activation="relu"),
        Dense(2, activation="softmax"),
    ]
    network = Sequential(layers)
    network.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return network
def cross_validate(cfg: Config, X, y, vocab_size: int):
    """
    Stratified cfg.cv-fold cross-validation: train a fresh model per fold,
    measure validation accuracy, and log per-fold and aggregate results.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    splitter = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
    fold_accuracies = []
    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(X, y), start=1):
        logging.info(f"Fold {fold_no}")
        net = build_model(cfg, vocab_size)
        net.fit(X[train_idx], y[train_idx],
                epochs=cfg.epochs,
                batch_size=cfg.batch_size,
                verbose=0)
        val_pred = net.predict(X[val_idx]).argmax(axis=1)
        fold_acc = accuracy_score(y[val_idx], val_pred)
        fold_accuracies.append(fold_acc)
        logging.info(f"Fold {fold_no} Accuracy: {fold_acc:.4f}")
    logging.info(f"Mean accuracy: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")
def save_artifacts(model, tokenizer, encoder):
    """
    Write the LSTM model (.keras) plus its tokenizer and label encoder
    (.pkl) to GENDER_MODELS_DIR, creating the directory if needed, and log
    the save location.
    """
    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    model.save(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
    for artifact, filename in (
        (tokenizer, "lstm_tokenizer.pkl"),
        (encoder, "lstm_label_encoder.pkl"),
    ):
        save_pickle(artifact, os.path.join(GENDER_MODELS_DIR, filename))
    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
def main():
    """
    Train/evaluate the BiLSTM gender model from the CLI.

    With --cv, runs cross-validation only; otherwise trains on a stratified
    split, evaluates at the configured threshold, and optionally saves the
    model, tokenizer and encoder (--save).
    """
    cfg = Config(**vars(load_config("Long Short-Term Memory (LSTM) model")))
    X, y, tokenizer, encoder = load_and_prepare(cfg)
    # word_index is 1-based; +1 leaves room for the padding index 0.
    vocab_size = len(tokenizer.word_index) + 1
    if cfg.cv:
        cross_validate(cfg, X, y, vocab_size)
        return
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
    )
    net = build_model(cfg, vocab_size)
    net.summary()
    logging.info("Training model")
    net.fit(
        X_train, y_train,
        validation_split=0.1,
        epochs=cfg.epochs,
        batch_size=cfg.batch_size,
        callbacks=[ProgbarLogger()],
    )
    evaluate_proba(y_test, net.predict(X_test), cfg.threshold, class_names=encoder.classes_)
    if cfg.save:
        save_artifacts(net, tokenizer, encoder)
if __name__ == "__main__":
    main()
+173
View File
@@ -0,0 +1,173 @@
import os
from dataclasses import dataclass
from typing import Tuple
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import (
accuracy_score
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ProgbarLogger
from tensorflow.keras.layers import (
Input, Embedding, Dense, GlobalAveragePooling1D,
MultiHeadAttention, Dropout, LayerNormalization
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import BaseConfig, load_config, evaluate_proba, logging
@dataclass
class Config(BaseConfig):
    # Padded sequence length for tokenized names.
    max_len: int = 6
    # Dimensionality of token (and positional) embeddings.
    embedding_dim: int = 64
    # key_dim of each attention head.
    transformer_head_size: int = 64
    # Number of attention heads.
    transformer_num_heads: int = 2
    # Hidden size of the feed-forward sublayer.
    transformer_ff_dim: int = 128
    # Dropout rate applied after attention and feed-forward sublayers.
    dropout: float = 0.1
    # Mini-batch size for training.
    batch_size: int = 64
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Load the CSV dataset and turn it into Transformer-ready arrays.

    Fits a Keras Tokenizer on the "name" column, pads the integer sequences
    to cfg.max_len, and label-encodes the "sex" column.

    Returns (padded sequences, encoded labels, tokenizer, label encoder).
    """
    logging.info("Loading and preprocessing data")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
    # NOTE(review): default word-level tokenization — see the LSTM trainer;
    # confirm character-level input is not intended.
    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])
    padded = pad_sequences(tokenizer.texts_to_sequences(df["name"]),
                           maxlen=cfg.max_len, padding="post")
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(df["sex"])
    return padded, encoded, tokenizer, encoder
def transformer_encoder(x, cfg: Config):
    """
    One post-norm Transformer encoder block: multi-head self-attention and a
    two-layer feed-forward network, each followed by dropout, a residual
    add, and LayerNormalization.
    """
    attention_out = MultiHeadAttention(
        num_heads=cfg.transformer_num_heads,
        key_dim=cfg.transformer_head_size,
    )(x, x)
    x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attention_out))
    hidden = Dense(cfg.transformer_ff_dim, activation="relu")(x)
    projected = Dense(x.shape[-1])(hidden)
    return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(projected))
def build_model(cfg: Config, vocab_size: int) -> Model:
    """
    Build and compile the Transformer classifier: token embeddings plus
    learned positional embeddings, one encoder block, global average
    pooling, a ReLU hidden layer, and a 2-way softmax head.
    """
    logging.info("Building Transformer model")
    inputs = Input(shape=(cfg.max_len,))
    token_emb = Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim)(inputs)
    # Learned positional encoding, added to the token embeddings.
    position_ids = tf.range(start=0, limit=cfg.max_len, delta=1)
    position_emb = Embedding(input_dim=cfg.max_len, output_dim=cfg.embedding_dim)(position_ids)
    encoded = transformer_encoder(token_emb + position_emb, cfg)
    pooled = GlobalAveragePooling1D()(encoded)
    hidden = Dense(32, activation="relu")(pooled)
    outputs = Dense(2, activation="softmax")(hidden)
    network = Model(inputs, outputs)
    network.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return network
def cross_validate(cfg: Config, X, y, vocab_size: int):
    """
    Stratified cfg.cv-fold cross-validation: train a fresh model per fold,
    measure validation accuracy, and log per-fold and aggregate results.

    NOTE(review): identical logic to the LSTM trainer's cross_validate —
    consider hoisting into a shared helper.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    splitter = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
    fold_accuracies = []
    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(X, y), start=1):
        logging.info(f"Fold {fold_no}")
        net = build_model(cfg, vocab_size)
        net.fit(X[train_idx], y[train_idx],
                epochs=cfg.epochs,
                batch_size=cfg.batch_size,
                verbose=0)
        val_pred = net.predict(X[val_idx]).argmax(axis=1)
        fold_acc = accuracy_score(y[val_idx], val_pred)
        fold_accuracies.append(fold_acc)
        logging.info(f"Fold {fold_no} Accuracy: {fold_acc:.4f}")
    logging.info(f"Mean accuracy: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")
def save_artifacts(model, tokenizer, encoder):
    """
    Write the Transformer model (.keras) plus its tokenizer and label
    encoder (.pkl) to GENDER_MODELS_DIR, creating the directory if needed,
    and log the save location.
    """
    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    model.save(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
    for artifact, filename in (
        (tokenizer, "transformer_tokenizer.pkl"),
        (encoder, "transformer_label_encoder.pkl"),
    ):
        save_pickle(artifact, os.path.join(GENDER_MODELS_DIR, filename))
    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
def main():
    """
    Train/evaluate the Transformer gender model from the CLI.

    With --cv, runs cross-validation only; otherwise trains on a stratified
    split, evaluates at the configured threshold, and optionally saves the
    model, tokenizer and encoder (--save).
    """
    cfg = Config(**vars(load_config("Transformer model")))
    X, y, tokenizer, encoder = load_and_prepare(cfg)
    # word_index is 1-based; +1 leaves room for the padding index 0.
    vocab_size = len(tokenizer.word_index) + 1
    if cfg.cv:
        cross_validate(cfg, X, y, vocab_size)
        return
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
    )
    net = build_model(cfg, vocab_size)
    net.summary()
    logging.info("Training Transformer model")
    net.fit(
        X_train, y_train,
        validation_split=0.1,
        epochs=cfg.epochs,
        batch_size=cfg.batch_size,
        callbacks=[ProgbarLogger()]
    )
    evaluate_proba(y_test, net.predict(X_test), cfg.threshold, class_names=encoder.classes_)
    if cfg.save:
        save_artifacts(net, tokenizer, encoder)
if __name__ == "__main__":
    main()
+107
View File
@@ -0,0 +1,107 @@
import argparse
import os
from typing import List
import tensorflow as tf
from sklearn.pipeline import Pipeline
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from misc import GENDER_MODELS_DIR, load_pickle
def predict_logreg(names: List[str], threshold: float):
    """
    Predict gender labels for `names` with the pickled logistic-regression
    pipeline. Names are lower-cased and stripped before scoring; class 1 is
    assigned when its probability meets `threshold`.

    Returns (decoded labels, full probability matrix).
    """
    pipeline: Pipeline = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
    cleaned = [name.lower().strip() for name in names]
    proba = pipeline.predict_proba(cleaned)
    predictions = (proba[:, 1] >= threshold).astype(int)
    return label_encoder.inverse_transform(predictions), proba
def predict_lstm(names: List[str], threshold: float, max_len=6):
    """
    Predict gender labels for `names` with the saved BiLSTM model.

    Loads the Keras model plus its pickled tokenizer and label encoder,
    lower-cases/strips and tokenizes the names, pads them to `max_len`, and
    assigns class 1 when its probability meets `threshold`.

    Returns (decoded labels, full probability matrix).
    """
    net = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
    tok: Tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl"))
    cleaned = [n.lower().strip() for n in names]
    padded = pad_sequences(tok.texts_to_sequences(cleaned), maxlen=max_len, padding="post")
    proba = net.predict(padded)
    predictions = (proba[:, 1] >= threshold).astype(int)
    return label_encoder.inverse_transform(predictions), proba
def predict_transformer(names: List[str], threshold: float, max_len=6):
    """
    Predict gender labels for `names` with the saved Transformer model.

    Loads the Keras model plus its pickled tokenizer and label encoder,
    lower-cases/strips and tokenizes the names, pads them to `max_len`, and
    assigns class 1 when its probability meets `threshold`.

    Returns (decoded labels, full probability matrix).
    """
    net = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
    tok: Tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl"))
    cleaned = [n.lower().strip() for n in names]
    padded = pad_sequences(tok.texts_to_sequences(cleaned), maxlen=max_len, padding="post")
    proba = net.predict(padded)
    predictions = (proba[:, 1] >= threshold).astype(int)
    return label_encoder.inverse_transform(predictions), proba
def main():
    """
    CLI entry point: predict gender for one or more names.

    Dispatches to the predictor selected by --model and prints, per name,
    the predicted label and both class probabilities.
    """
    parser = argparse.ArgumentParser(description="Predict gender from names using trained model")
    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
    parser.add_argument("--names", nargs="+", required=True, help="One or more names")
    parser.add_argument("--threshold", type=float, default=0.5, help="Threshold for classification")
    args = parser.parse_args()
    model_funcs = {
        "logreg": predict_logreg,
        "lstm": predict_lstm,
        "transformer": predict_transformer,
    }
    # argparse `choices` guarantees args.model is a valid key, so the former
    # try/except KeyError is unnecessary — worse, a KeyError raised inside a
    # predictor would have been misreported as "Unsupported model type".
    labels, proba = model_funcs[args.model](args.names, args.threshold)
    for i, name in enumerate(args.names):
        # NOTE(review): assumes label-encoder class order is [female, male];
        # verify against encoder.classes_ before trusting the P(f)/P(m) tags.
        p_female = proba[i][0]
        p_male = proba[i][1]
        print(f"{name}{labels[i]} | P(f): {p_female:.2f} | P(m): {p_male:.2f}")
if __name__ == "__main__":
    main()