feat: balanced dataset loading

2025-06-30 01:32:10 +02:00
parent eb139ee09a
commit 0888d94596
9 changed files with 306 additions and 614 deletions
+19 -113
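The `balanced` flag this commit threads through `load_csv_dataset` (used in the `main()` hunk below) is implemented elsewhere in the repository. As a rough illustration only, a loader with that signature could balance by downsampling each `sex` class to the minority-class count; the names and behaviour here are assumptions, not the project's `misc.load_csv_dataset`:

import pandas as pd
from typing import Optional

def load_csv_dataset(path: str, size: Optional[int] = None, balanced: bool = False) -> pd.DataFrame:
    # Hypothetical sketch of the helper from misc.py; the real implementation may differ.
    df = pd.read_csv(path)
    if balanced:
        # Downsample every class to the size of the smallest one so the model
        # is not dominated by the majority label.
        n = df["sex"].value_counts().min()
        df = (
            df.groupby("sex", group_keys=False)
            .apply(lambda g: g.sample(n=n, random_state=42))
            .reset_index(drop=True)
        )
    if size:
        df = df.head(size)
    return df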
@@ -1,8 +1,6 @@
import argparse
import logging
import os
from dataclasses import dataclass
from typing import Tuple, Optional
from typing import Tuple
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
@@ -16,54 +14,20 @@ from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
logging.basicConfig(level=logging.INFO, format=">> %(message)s")
from ners.gender.models import BaseConfig, load_config, logging
@dataclass
class Config:
    dataset_path: str
    size: Optional[int]
    test_size: float = 0.2
class Config(BaseConfig):
    ngram_range: Tuple[int, int] = (2, 5)
    max_iter: int = 1000
    random_state: int = 42
    threshold: float = 0.5
    cv: Optional[int] = None
    save: bool = False
def load_and_clean_data(cfg: Config) -> Tuple[pd.Series, pd.Series]:
"""
Load and clean dataset as specified by the provided configuration. This function reads
a CSV dataset from the path specified in the configuration, processes it to remove
missing values from key columns ('name' and 'sex'), and cleans string data in these
columns by converting them to lowercase and stripping whitespace. The cleaned data
is then returned as two separate pandas Series objects.
:param cfg: Configuration object specifying the dataset path and size
:type cfg: Config
:return: A tuple containing cleaned `name` and `sex` data as pandas Series objects
:rtype: Tuple[pd.Series, pd.Series]
"""
logging.info(f"Loading dataset from {cfg.dataset_path}")
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size))
df = df.dropna(subset=["name", "sex"])
df["name"] = df["name"].str.lower().str.strip()
df["sex"] = df["sex"].str.lower().str.strip()
return df["name"], df["sex"]
def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
"""
Encode the labels of a given pandas Series using a LabelEncoder. This process maps categorical
labels to integers, which is particularly useful for machine learning models that require numerical
input data.
:param y: A pandas Series of categorical labels to be encoded.
:type y: pd.Series
:return: A tuple containing the encoded labels as a pandas Series and the fitted LabelEncoder object.
:rtype: Tuple[pd.Series, LabelEncoder]
Encode the labels using a LabelEncoder. This function takes a pandas Series of labels,
fits a LabelEncoder to the labels, and transforms them into a numerical format suitable
for model training. The transformed labels and the fitted encoder are returned.
"""
logging.info("Encoding labels")
encoder = LabelEncoder()
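For reference, the `LabelEncoder` step simply maps the string labels to integers (standard scikit-learn behaviour, shown here on toy data):

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(["f", "m", "m", "f"])
print(y_encoded)          # [0 1 1 0]
print(encoder.classes_)   # ['f' 'm'] -- later passed to the report as class_names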
@@ -73,21 +37,11 @@ def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
def build_model(cfg: Config) -> Pipeline:
"""
Builds a machine learning pipeline for text classification.
This function constructs and returns a scikit-learn pipeline that consists of
a `CountVectorizer` and a `LogisticRegression` classifier. The vectorizer
leverages character-level n-grams based on the provided configuration, and the
logistic regression model is trained with a maximum number of iterations defined
in the configuration. This pipeline is used for processing text data and training
classification models.
:param cfg: Configuration object containing the n-gram range and the maximum
number of iterations for the logistic regression model.
:type cfg: Config
:return: A scikit-learn pipeline with a `CountVectorizer` and `LogisticRegression`
based on the provided configuration.
:rtype: Pipeline
Build a logistic regression model pipeline with a character-level CountVectorizer.
The pipeline consists of a CountVectorizer that transforms the input text into
character n-grams, followed by a Logistic Regression classifier. The n-gram range
and maximum iterations for the logistic regression can be configured through the
provided configuration object.
"""
return make_pipeline(
CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range),
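To make the character n-gram idea concrete, this is what the vectorizer extracts from a single name with the default `ngram_range=(2, 5)` (plain `CountVectorizer` behaviour, toy input):

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(analyzer="char", ngram_range=(2, 5))
vec.fit(["anna"])
print(vec.get_feature_names_out())
# ['an' 'ann' 'anna' 'na' 'nn' 'nna']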
@@ -95,7 +49,7 @@ def build_model(cfg: Config) -> Pipeline:
    )
def evaluate_probabilities(y_true, y_proba, threshold: float, class_names):
def evaluate_proba(y_true, y_proba, threshold: float, class_names):
    """
    Evaluates the performance of a classification model using a specified threshold
    for predicted probabilities. Computes metrics such as accuracy, precision,
@@ -104,19 +58,6 @@ def evaluate_probabilities(y_true, y_proba, threshold: float, class_names):
    Logs the evaluation metrics at the specified threshold and prints the confusion
    matrix and classification report.
    :param y_true: Ground truth (correct) labels.
    :type y_true: array-like
    :param y_proba: Predicted probabilities for each class, where each row
        corresponds to an instance and contains probabilities for each target class.
    :type y_proba: numpy.ndarray
    :param threshold: The threshold on predicted probabilities to determine
        class membership for each instance.
    :type threshold: float
    :param class_names: List of class names for the target variable used in the
        classification report.
    :type class_names: list of str
    :return: None
    """
    logging.info(f"Evaluating at threshold = {threshold}")
    y_pred = (y_proba[:, 1] >= threshold).astype(int)
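A small illustration of the thresholding step above, on made-up probabilities (column 1 is treated as the positive class, exactly as in `y_pred = (y_proba[:, 1] >= threshold)`):

import numpy as np

y_proba = np.array([[0.30, 0.70],
                    [0.55, 0.45],
                    [0.48, 0.52]])
print((y_proba[:, 1] >= 0.5).astype(int))  # [1 0 1]
print((y_proba[:, 1] >= 0.6).astype(int))  # [1 0 0] -- a stricter threshold drops the borderline case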
@@ -135,16 +76,6 @@ def cross_validate(cfg: Config, X, y) -> None:
    Performs k-fold cross-validation on the provided dataset using the configuration and
    logs the results including individual fold scores, mean accuracy, and the standard
    deviation of the scores.
    :param cfg: Configuration object containing cross-validation settings such as the
        number of folds to use in the cross-validation (`cv`).
    :type cfg: Config
    :param X: Input feature matrix for the dataset to be used for cross-validation.
    :type X: Any
    :param y: Target labels corresponding to the input feature matrix `X`.
    :type y: Any
    :return: This function does not return any value. Results are logged.
    :rtype: None
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    pipeline = build_model(cfg)
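The fold scores logged below presumably come from scikit-learn's `cross_val_score`; the exact call is outside this hunk, so the following self-contained sketch on toy data is an assumption about how the pipeline is scored:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

X = ["anna", "maria", "sofia", "lucy", "john", "peter", "mark", "tom"]
y = [0, 0, 0, 0, 1, 1, 1, 1]  # toy labels: 0 = f, 1 = m
pipeline = make_pipeline(
    CountVectorizer(analyzer="char", ngram_range=(2, 5)),
    LogisticRegression(max_iter=1000),
)
scores = cross_val_score(pipeline, X, y, cv=2, scoring="accuracy")
print(scores, scores.mean(), scores.std())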
@@ -153,21 +84,9 @@ def cross_validate(cfg: Config, X, y) -> None:
logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")
def save_artifacts(model, encoder, cfg: Config):
def save_artifacts(model, encoder):
"""
Saves machine learning model and label encoder artifacts to specified directories
within the gender models' directory. This function ensures that the model and encoder
are serialized and stored as pickle files. It uses the specified configuration settings
to locate the appropriate directory for storing the files.
:param model: The machine learning model object to be saved.
:type model: Any
:param encoder: The label encoder object used for data preprocessing.
:type encoder: Any
:param cfg: Configuration object containing application-specific settings regarding
paths and directories.
:type cfg: Config
:return: None
Saves the trained model and label encoder artifacts to the specified directory.
"""
save_pickle(model, os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
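`save_pickle` lives in `misc` and is not part of this diff; a typical helper with that name would look roughly like this (an assumption, not the project's code):

import os
import pickle

def save_pickle(obj, path: str) -> None:
    # Hypothetical helper: serialize an object to disk, creating parent directories if needed.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)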
@@ -176,23 +95,10 @@ def save_artifacts(model, encoder, cfg: Config):
def main():
    parser = argparse.ArgumentParser(description="Train a gender classifier on names")
    parser.add_argument("--dataset", type=str, default="names.csv", help="Path to dataset")
    parser.add_argument("--size", type=int, help="Number of rows to load")
    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for binary decision")
    parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
    parser.add_argument("--save", action="store_true", help="Save the model and encoder")
    args = parser.parse_args()
    cfg = Config(**vars(load_config("logistic regression model")))
    cfg = Config(
        dataset_path=args.dataset,
        size=args.size,
        threshold=args.threshold,
        cv=args.cv,
        save=args.save
    )
    X_raw, y_raw = load_and_clean_data(cfg)
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
    X_raw, y_raw = df["name"], df["sex"]
    y_encoded, encoder = encode_labels(y_raw)
    if cfg.cv:
@@ -207,10 +113,10 @@ def main():
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)
    evaluate_probabilities(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
    if cfg.save:
        save_artifacts(model, encoder, cfg)
        save_artifacts(model, encoder)
if __name__ == "__main__":
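The new construction `Config(**vars(load_config("logistic regression model")))` only works because `vars()` turns the loaded config object's attributes into keyword arguments; a toy illustration of that mechanism, where `SimpleNamespace` stands in for whatever `load_config` actually returns:

from dataclasses import dataclass
from types import SimpleNamespace

@dataclass
class Config:
    dataset_path: str = "names.csv"
    balanced: bool = False

loaded = SimpleNamespace(dataset_path="other.csv", balanced=True)  # stand-in for load_config(...)
cfg = Config(**vars(loaded))
print(cfg)  # Config(dataset_path='other.csv', balanced=True)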