feat: balanced dataset loading

2025-06-30 01:32:10 +02:00
parent eb139ee09a
commit 0888d94596
9 changed files with 306 additions and 614 deletions
+19 -113
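The `balanced` flag this commit threads through `load_csv_dataset` (used in the `main()` hunk below) is implemented elsewhere in the repository. As a rough illustration only, a loader with that signature could balance by downsampling each `sex` class to the minority-class count; the names and behaviour here are assumptions, not the project's `misc.load_csv_dataset`:

import pandas as pd
from typing import Optional

def load_csv_dataset(path: str, size: Optional[int] = None, balanced: bool = False) -> pd.DataFrame:
    # Hypothetical sketch of the helper from misc.py; the real implementation may differ.
    df = pd.read_csv(path)
    if balanced:
        # Downsample every class to the size of the smallest one so the model
        # is not dominated by the majority label.
        n = df["sex"].value_counts().min()
        df = (
            df.groupby("sex", group_keys=False)
            .apply(lambda g: g.sample(n=n, random_state=42))
            .reset_index(drop=True)
        )
    if size:
        df = df.head(size)
    return df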
@@ -1,8 +1,6 @@
import argparse
import logging
import os
from dataclasses import dataclass
from typing import Tuple, Optional
from typing import Tuple
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
@@ -16,54 +14,20 @@ from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
logging.basicConfig(level=logging.INFO, format=">> %(message)s")
from ners.gender.models import BaseConfig, load_config, logging
@dataclass
class Config:
    dataset_path: str
    size: Optional[int]
    test_size: float = 0.2
class Config(BaseConfig):
    ngram_range: Tuple[int, int] = (2, 5)
    max_iter: int = 1000
    random_state: int = 42
    threshold: float = 0.5
    cv: Optional[int] = None
    save: bool = False
def load_and_clean_data(cfg: Config) -> Tuple[pd.Series, pd.Series]:
"""
Load and clean dataset as specified by the provided configuration. This function reads
a CSV dataset from the path specified in the configuration, processes it to remove
missing values from key columns ('name' and 'sex'), and cleans string data in these
columns by converting them to lowercase and stripping whitespace. The cleaned data
is then returned as two separate pandas Series objects.
:param cfg: Configuration object specifying the dataset path and size
:type cfg: Config
:return: A tuple containing cleaned `name` and `sex` data as pandas Series objects
:rtype: Tuple[pd.Series, pd.Series]
"""
logging.info(f"Loading dataset from {cfg.dataset_path}")
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size))
df = df.dropna(subset=["name", "sex"])
df["name"] = df["name"].str.lower().str.strip()
df["sex"] = df["sex"].str.lower().str.strip()
return df["name"], df["sex"]
def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
"""
Encode the labels of a given pandas Series using a LabelEncoder. This process maps categorical
labels to integers, which is particularly useful for machine learning models that require numerical
input data.
:param y: A pandas Series of categorical labels to be encoded.
:type y: pd.Series
:return: A tuple containing the encoded labels as a pandas Series and the fitted LabelEncoder object.
:rtype: Tuple[pd.Series, LabelEncoder]
Encode the labels using a LabelEncoder. This function takes a pandas Series of labels,
fits a LabelEncoder to the labels, and transforms them into a numerical format suitable
for model training. The transformed labels and the fitted encoder are returned.
"""
logging.info("Encoding labels")
encoder = LabelEncoder()
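For reference, the `LabelEncoder` step simply maps the string labels to integers (standard scikit-learn behaviour, shown here on toy data):

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(["f", "m", "m", "f"])
print(y_encoded)          # [0 1 1 0]
print(encoder.classes_)   # ['f' 'm'] -- later passed to the report as class_names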
@@ -73,21 +37,11 @@ def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
def build_model(cfg: Config) -> Pipeline:
"""
Builds a machine learning pipeline for text classification.
This function constructs and returns a scikit-learn pipeline that consists of
a `CountVectorizer` and a `LogisticRegression` classifier. The vectorizer
leverages character-level n-grams based on the provided configuration, and the
logistic regression model is trained with a maximum number of iterations defined
in the configuration. This pipeline is used for processing text data and training
classification models.
:param cfg: Configuration object containing the n-gram range and the maximum
number of iterations for the logistic regression model.
:type cfg: Config
:return: A scikit-learn pipeline with a `CountVectorizer` and `LogisticRegression`
based on the provided configuration.
:rtype: Pipeline
Build a logistic regression model pipeline with a character-level CountVectorizer.
The pipeline consists of a CountVectorizer that transforms the input text into
character n-grams, followed by a Logistic Regression classifier. The n-gram range
and maximum iterations for the logistic regression can be configured through the
provided configuration object.
"""
return make_pipeline(
CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range),
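To make the character n-gram idea concrete, this is what the vectorizer extracts from a single name with the default `ngram_range=(2, 5)` (plain `CountVectorizer` behaviour, toy input):

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(analyzer="char", ngram_range=(2, 5))
vec.fit(["anna"])
print(vec.get_feature_names_out())
# ['an' 'ann' 'anna' 'na' 'nn' 'nna']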
@@ -95,7 +49,7 @@ def build_model(cfg: Config) -> Pipeline:
    )
def evaluate_probabilities(y_true, y_proba, threshold: float, class_names):
def evaluate_proba(y_true, y_proba, threshold: float, class_names):
    """
    Evaluates the performance of a classification model using a specified threshold
    for predicted probabilities. Computes metrics such as accuracy, precision,
@@ -104,19 +58,6 @@ def evaluate_probabilities(y_true, y_proba, threshold: float, class_names):
    Logs the evaluation metrics at the specified threshold and prints the confusion
    matrix and classification report.
    :param y_true: Ground truth (correct) labels.
    :type y_true: array-like
    :param y_proba: Predicted probabilities for each class, where each row
        corresponds to an instance and contains probabilities for each target class.
    :type y_proba: numpy.ndarray
    :param threshold: The threshold on predicted probabilities to determine
        class membership for each instance.
    :type threshold: float
    :param class_names: List of class names for the target variable used in the
        classification report.
    :type class_names: list of str
    :return: None
    """
    logging.info(f"Evaluating at threshold = {threshold}")
    y_pred = (y_proba[:, 1] >= threshold).astype(int)
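A small illustration of the thresholding step above, on made-up probabilities (column 1 is treated as the positive class, exactly as in `y_pred = (y_proba[:, 1] >= threshold)`):

import numpy as np

y_proba = np.array([[0.30, 0.70],
                    [0.55, 0.45],
                    [0.48, 0.52]])
print((y_proba[:, 1] >= 0.5).astype(int))  # [1 0 1]
print((y_proba[:, 1] >= 0.6).astype(int))  # [1 0 0] -- a stricter threshold drops the borderline case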
@@ -135,16 +76,6 @@ def cross_validate(cfg: Config, X, y) -> None:
    Performs k-fold cross-validation on the provided dataset using the configuration and
    logs the results including individual fold scores, mean accuracy, and the standard
    deviation of the scores.
    :param cfg: Configuration object containing cross-validation settings such as the
        number of folds to use in the cross-validation (`cv`).
    :type cfg: Config
    :param X: Input feature matrix for the dataset to be used for cross-validation.
    :type X: Any
    :param y: Target labels corresponding to the input feature matrix `X`.
    :type y: Any
    :return: This function does not return any value. Results are logged.
    :rtype: None
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    pipeline = build_model(cfg)
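The fold scores logged below presumably come from scikit-learn's `cross_val_score`; the exact call is outside this hunk, so the following self-contained sketch on toy data is an assumption about how the pipeline is scored:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

X = ["anna", "maria", "sofia", "lucy", "john", "peter", "mark", "tom"]
y = [0, 0, 0, 0, 1, 1, 1, 1]  # toy labels: 0 = f, 1 = m
pipeline = make_pipeline(
    CountVectorizer(analyzer="char", ngram_range=(2, 5)),
    LogisticRegression(max_iter=1000),
)
scores = cross_val_score(pipeline, X, y, cv=2, scoring="accuracy")
print(scores, scores.mean(), scores.std())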
@@ -153,21 +84,9 @@ def cross_validate(cfg: Config, X, y) -> None:
logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")
def save_artifacts(model, encoder, cfg: Config):
def save_artifacts(model, encoder):
"""
Saves machine learning model and label encoder artifacts to specified directories
within the gender models' directory. This function ensures that the model and encoder
are serialized and stored as pickle files. It uses the specified configuration settings
to locate the appropriate directory for storing the files.
:param model: The machine learning model object to be saved.
:type model: Any
:param encoder: The label encoder object used for data preprocessing.
:type encoder: Any
:param cfg: Configuration object containing application-specific settings regarding
paths and directories.
:type cfg: Config
:return: None
Saves the trained model and label encoder artifacts to the specified directory.
"""
save_pickle(model, os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
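`save_pickle` lives in `misc` and is not part of this diff; a typical helper with that name would look roughly like this (an assumption, not the project's code):

import os
import pickle

def save_pickle(obj, path: str) -> None:
    # Hypothetical helper: serialize an object to disk, creating parent directories if needed.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)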
@@ -176,23 +95,10 @@ def save_artifacts(model, encoder, cfg: Config):
def main():
    parser = argparse.ArgumentParser(description="Train a gender classifier on names")
    parser.add_argument("--dataset", type=str, default="names.csv", help="Path to dataset")
    parser.add_argument("--size", type=int, help="Number of rows to load")
    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for binary decision")
    parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
    parser.add_argument("--save", action="store_true", help="Save the model and encoder")
    args = parser.parse_args()
    cfg = Config(**vars(load_config("logistic regression model")))
    cfg = Config(
        dataset_path=args.dataset,
        size=args.size,
        threshold=args.threshold,
        cv=args.cv,
        save=args.save
    )
    X_raw, y_raw = load_and_clean_data(cfg)
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
    X_raw, y_raw = df["name"], df["sex"]
    y_encoded, encoder = encode_labels(y_raw)
    if cfg.cv:
@@ -207,10 +113,10 @@ def main():
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)
    evaluate_probabilities(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
    if cfg.save:
        save_artifacts(model, encoder, cfg)
        save_artifacts(model, encoder)
if __name__ == "__main__":
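The new construction `Config(**vars(load_config("logistic regression model")))` only works because `vars()` turns the loaded config object's attributes into keyword arguments; a toy illustration of that mechanism, where `SimpleNamespace` stands in for whatever `load_config` actually returns:

from dataclasses import dataclass
from types import SimpleNamespace

@dataclass
class Config:
    dataset_path: str = "names.csv"
    balanced: bool = False

loaded = SimpleNamespace(dataset_path="other.csv", balanced=True)  # stand-in for load_config(...)
cfg = Config(**vars(loaded))
print(cfg)  # Config(dataset_path='other.csv', balanced=True)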