124 lines
4.5 KiB
Python
124 lines
4.5 KiB
Python
import os
from dataclasses import dataclass
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder

from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import BaseConfig, load_config, logging
|
|
|
|
|
|
@dataclass
class Config(BaseConfig):
    """Settings for the logistic regression model, layered on top of ``BaseConfig``.

    Both fields are read by ``build_model`` when it assembles the pipeline.
    """

    # (min_n, max_n) passed as ``ngram_range`` to the character-level CountVectorizer.
    ngram_range: Tuple[int, int] = (2, 5)
    # Passed as ``max_iter`` to LogisticRegression.
    max_iter: int = 1_000
|
|
|
|
|
|
def encode_labels(y: pd.Series) -> Tuple[np.ndarray, LabelEncoder]:
    """
    Encode class labels as integers with a freshly fitted ``LabelEncoder``.

    Args:
        y: Labels to encode.

    Returns:
        A ``(y_encoded, encoder)`` pair. ``y_encoded`` is the output of
        ``LabelEncoder.fit_transform`` -- a NumPy array of integer codes, not a
        pandas Series -- and ``encoder`` is the fitted encoder, whose ``classes_``
        attribute maps the codes back to the original labels.
    """
    logging.info("Encoding labels")
    encoder = LabelEncoder()
    # fit_transform learns the label set and encodes ``y`` in one call.
    y_encoded = encoder.fit_transform(y)
    return y_encoded, encoder
|
|
|
|
|
|
def build_model(cfg: Config) -> Pipeline:
    """
    Assemble the untrained text-classification pipeline.

    The pipeline has two steps: a ``CountVectorizer`` using the ``"char"``
    analyzer with ``cfg.ngram_range``, followed by a ``LogisticRegression``
    limited to ``cfg.max_iter`` iterations.

    Args:
        cfg: Configuration providing ``ngram_range`` and ``max_iter``.

    Returns:
        The unfitted scikit-learn ``Pipeline``.
    """
    vectorizer = CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range)
    classifier = LogisticRegression(max_iter=cfg.max_iter)
    return make_pipeline(vectorizer, classifier)
|
|
|
|
|
|
def evaluate_proba(y_true, y_proba, threshold: float, class_names):
    """
    Score thresholded probability predictions against the true labels.

    A sample is predicted as class 1 when the value in column 1 of ``y_proba``
    is at least ``threshold``; otherwise it is predicted as class 0.

    Accuracy, precision, recall and F1 (``average="binary"``) are written to the
    log; the confusion matrix and the per-class classification report are
    printed to stdout.

    Args:
        y_true: True encoded labels.
        y_proba: Two-dimensional array of class probabilities; column 1 is used.
        threshold: Minimum column-1 probability for predicting class 1.
        class_names: Display names passed to ``classification_report`` as
            ``target_names``.
    """
    logging.info(f"Evaluating at threshold = {threshold}")

    positive_scores = y_proba[:, 1]
    y_pred = (positive_scores >= threshold).astype(int)

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )
    matrix = confusion_matrix(y_true, y_pred)

    logging.info(f"Accuracy: {accuracy:.4f}")
    logging.info(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f_score:.4f}")

    print("Confusion Matrix:\n", matrix)
    report = classification_report(y_true, y_pred, target_names=class_names)
    print("\nClassification Report:\n", report)
|
|
|
|
|
|
def cross_validate(cfg: Config, X, y) -> None:
    """
    Run stratified k-fold cross-validation and log the accuracy results.

    A fresh pipeline from ``build_model`` is scored with ``cross_val_score``
    over ``StratifiedKFold(n_splits=cfg.cv)``. The per-fold scores, their mean
    and their standard deviation are written to the log; nothing is returned.

    Args:
        cfg: Configuration; ``cfg.cv`` is the number of folds.
        X: Input samples.
        y: Encoded labels.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")

    estimator = build_model(cfg)
    folds = StratifiedKFold(n_splits=cfg.cv)
    fold_scores = cross_val_score(estimator, X, y, cv=folds, scoring="accuracy")

    logging.info(f"Cross-validation scores: {fold_scores}")
    mean_accuracy = fold_scores.mean()
    spread = fold_scores.std()
    logging.info(f"Mean accuracy: {mean_accuracy:.4f}, Std: {spread:.4f}")
|
|
|
|
|
|
def save_artifacts(model, encoder):
    """
    Pickle the trained model and its label encoder under ``GENDER_MODELS_DIR``.

    The model is written to ``regression_model.pkl`` and the encoder to
    ``regression_label_encoder.pkl``, in that order, via ``save_pickle``.

    Args:
        model: Trained model to persist.
        encoder: Fitted label encoder to persist.
    """
    artifacts = (
        (model, "regression_model.pkl"),
        (encoder, "regression_label_encoder.pkl"),
    )
    for artifact, filename in artifacts:
        save_pickle(artifact, os.path.join(GENDER_MODELS_DIR, filename))

    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
|
|
|
|
|
|
def main():
    """
    Train and evaluate the logistic regression model, or cross-validate it.

    Steps:
      1. Build a ``Config`` from the attributes of the object returned by ``load_config``.
      2. Load the dataset and take the ``name`` column as input and ``sex`` as label.
      3. If ``cfg.cv`` is truthy, run cross-validation and stop.
      4. Otherwise make a stratified train/test split, fit the pipeline,
         evaluate it at ``cfg.threshold`` and, when ``cfg.save`` is set,
         persist the model and the encoder.
    """
    loaded = load_config("logistic regression model")
    cfg = Config(**vars(loaded))

    records = load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced)
    df = pd.DataFrame(records)
    names = df["name"]
    labels = df["sex"]
    y_encoded, encoder = encode_labels(labels)

    # Cross-validation mode is exclusive: no final model is trained or saved.
    if cfg.cv:
        cross_validate(cfg, names, y_encoded)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        names,
        y_encoded,
        test_size=cfg.test_size,
        random_state=cfg.random_state,
        stratify=y_encoded,
    )

    # Pipeline.fit returns the fitted pipeline itself.
    model = build_model(cfg).fit(X_train, y_train)

    test_proba = model.predict_proba(X_test)
    evaluate_proba(y_test, test_proba, cfg.threshold, class_names=encoder.classes_)

    if not cfg.save:
        return
    save_artifacts(model, encoder)
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the workflow only when executed as a script, not when imported.
    main()
|