diff --git a/README.md b/README.md
index f039234..b77e455 100644
--- a/README.md
+++ b/README.md
@@ -13,28 +13,73 @@
 cd drc-ners-nlp
 python3 -m venv .venv
 source .venv/bin/activate
 cp .env .env.local
-make download
 pip install -r requirements.txt
 ```
 
 ## Gender Inference
 
-### 1. Training
-
+### 1. Dataset Preparation
 ```bash
-python -m ners.gender.models.lstm --dataset names.csv --size 1000000 --save
-python -m ners.gender.models.logreg --dataset names.csv --size 1000000 --save
-python -m ners.gender.models.transformer --dataset names.csv --size 1000000 --save
+python -m processing.gender.prepare
 ```
 
-### 2. Evaluation
+### 2. Training
+Arguments:
+
+| Name           | Description                                      | Default            |
+|----------------|--------------------------------------------------|--------------------|
+| --dataset      | Path to the dataset file                         | names_featured.csv |
+| --size         | Number of samples to use (None for full dataset) | None               |
+| --threshold    | Probability threshold for gender classification  | 0.5                |
+| --cv           | Number of cross-validation folds                 | None               |
+| --save         | Whether to save the trained model                | False              |
+| --balanced     | Whether to balance the dataset                   | False              |
+| --epochs       | Number of training epochs                        | 10                 |
+| --test_size    | Proportion of data to use as test set            | 0.2                |
+| --random_state | Random seed for reproducibility                  | 42                 |
+
+
+Examples:
+
+```bash
+python -m ners.gender.models.lstm --size 1000000 --save
+python -m ners.gender.models.logreg --size 1000000 --save
+python -m ners.gender.models.transformer --size 1000000 --save
+```
+
+### 3. 
Evaluation + + +Arguments: + +| Name | Description | Default | +|------------|-----------------------------------------------|----------------------| +| --model | Model type: logreg, lstm, or transformer | (required) | +| --dataset | Path to the dataset CSV file | names_featured.csv | +| --size | Number of rows to load from the dataset | None | +| --balanced | Load balanced dataset | False | +| --threshold| Probability threshold for classification | 0.5 | + +Examples: + ```bash python -m ners.gender.eval --dataset eval.csv --model logreg --threshold 0.5 --size 20000 python -m ners.gender.eval --dataset eval.csv --model lstm python -m ners.gender.eval --dataset eval.csv --model transformer ``` -### 3. Inference +### 4. Inference + +Arguments: + +| Name | Description | Default | +|-------------|------------------------------------------|-----------| +| --model | Model type: logreg, lstm, or transformer | (required)| +| --names | One or more names | (required)| +| --threshold | Threshold for classification | 0.5 | + +Examples: + ```bash python -m ners.gender.predict --model logreg --name "Tshisekedi" python -m ners.gender.predict --model lstm --name "Ilunga" "Albert" "Ilunga Albert" --threshold 0.7 diff --git a/misc/__init__.py b/misc/__init__.py index 0f038e7..e393613 100644 --- a/misc/__init__.py +++ b/misc/__init__.py @@ -4,6 +4,7 @@ import json import os import pickle from typing import Optional +from typing import List, Dict # Paths ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -17,15 +18,6 @@ NER_MODELS_DIR = os.path.join(MODELS_DIR, 'ner') NER_RESULT_DIR = os.path.join(ROOT_DIR, 'ner', 'results') -def clean_spacing(filename: str) -> Optional[str]: - try: - with open(os.path.join(DATA_DIR, filename), 'r', encoding='utf8') as f: - content = f.read() - return content.translate(str.maketrans({'\00': ' ', ' ': ' '})) - except Exception as e: - return None - - def load_json_dataset(path: str) -> list: print(f">> Loading JSON dataset from 
{path}") with open(os.path.join(DATA_DIR, path), "r", encoding="utf-8") as f: @@ -40,30 +32,35 @@ def save_csv_dataset(data: list, path: str) -> None: writer.writerows(data) -def load_csv_dataset(path: str, limit: int = None) -> list: +def load_csv_dataset(path: str, limit: int = None, balanced: bool = False) -> List[Dict[str, str]]: print(f">> Loading CSV dataset from {path}") - data = [] - encodings = ['utf-8', 'utf-16', 'latin1'] - for enc in encodings: - try: - with open(os.path.join(DATA_DIR, path), "r", encoding=enc, errors="replace") as f: - raw_text = f.read().replace('\x00', '') + file_path = os.path.join(DATA_DIR, path) + with open(file_path, "r", encoding="utf-8", errors="replace", newline="") as f: + raw_text = f.read().replace('\x00', '') - csv_buffer = io.StringIO(raw_text) - reader = csv.DictReader(csv_buffer) - print(f">> Detected fieldnames: {reader.fieldnames}") + reader = csv.DictReader(io.StringIO(raw_text)) + print(f">> Detected fieldnames: {reader.fieldnames}") - for row in reader: - data.append(row) - if limit and len(data) >= limit: - break - print(f">> Successfully loaded with encoding: {enc}") - return data - except Exception as e: - print(f">> Failed with encoding: {enc}, error: {e}") + if balanced: + by_sex = {'m': [], 'f': []} + for row in reader: + sex = row.get("sex", "").lower() + if sex in by_sex: + by_sex[sex].append(row) + min_len = min(len(by_sex['m']), len(by_sex['f'])) + if limit: + min_len = min(min_len, limit // 2) + data = by_sex['m'][:min_len] + by_sex['f'][:min_len] + else: + data = [] + for i, row in enumerate(reader): + data.append(row) + if limit and i + 1 >= limit: + break - raise UnicodeDecodeError("load_csv_dataset", path, 0, 0, "Unable to decode file with common encodings.") + print(">> Successfully loaded with UTF-8 encoding") + return data def save_json_dataset(data: list, path: str) -> None: diff --git a/ners/gender/eval.py b/ners/gender/eval.py index cd71123..dbd6e09 100644 --- a/ners/gender/eval.py +++ 
b/ners/gender/eval.py @@ -1,8 +1,6 @@ import argparse -import json import os -import pandas as pd import tensorflow as tf from sklearn.metrics import ( accuracy_score, precision_recall_fscore_support, confusion_matrix @@ -12,47 +10,12 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences from misc import GENDER_MODELS_DIR, load_csv_dataset, save_json_dataset, load_pickle, GENDER_RESULT_DIR -def load_dataset(path="names.csv", size=None): - """ - Loads a dataset from a CSV file, processes it to remove missing values - and standardizes the case and formatting of specific columns. - - :param path: The path to the CSV file containing the dataset. Defaults to "names.csv". - :type path: str - :param size: The number of rows to load from the dataset. If None, the whole dataset is loaded. - :type size: Optional[int] - :return: A pandas DataFrame with the processed dataset where missing values in the - 'name' and 'sex' columns are removed, and the text in these columns is - converted to lowercase and stripped of leading/trailing whitespace. - :rtype: pandas.DataFrame - """ - df = pd.DataFrame(load_csv_dataset(path, size)).dropna(subset=["name", "sex"]) - df["name"] = df["name"].str.lower().str.strip() - df["sex"] = df["sex"].str.lower().str.strip() - return df - - def evaluate_logreg(df, threshold): """ Evaluates a logistic regression model with the given DataFrame and threshold. The function loads a pre-trained model and label encoder, transforms the input data into the required format, and performs predictions. It returns the true labels, predicted labels, predicted probabilities, and the encoder class labels. - - :param df: Input data containing a column "name" for names to evaluate and a column "sex" - for true labels. - Type: pandas.DataFrame - - :param threshold: Threshold value used for classifying the predictions. Probabilities greater - than or equal to this value are classified into the positive class. 
- Type: float - - :return: A tuple containing: - - y_true: True labels after encoding. - - y_pred: Predicted binary class labels based on the threshold. - - proba[:, 1]: Probability values for the positive class. - - encoder.classes_: Labels used by the label encoder. - Type: tuple (numpy.ndarray, int, numpy.ndarray, numpy.ndarray) """ model = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl")) encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl")) @@ -60,7 +23,7 @@ def evaluate_logreg(df, threshold): X = df["name"].tolist() y_true = encoder.transform(df["sex"]) proba = model.predict_proba(X) - y_pred = 1 if proba[:, 1] >= threshold else 0 + y_pred = (proba[:, 1] >= threshold).astype(int) return y_true, y_pred, proba[:, 1], encoder.classes_ @@ -68,23 +31,6 @@ def evaluate_lstm(df, threshold, max_len=6): """ Evaluates the predictions of a pre-trained BiLSTM model on the given dataset and returns the true labels, predicted labels, prediction probabilities, and class names. - - :param df: Input DataFrame containing the data for evaluation. - The DataFrame must have two columns: "name" containing - the input text data and "sex" containing the true labels. - :type df: Pandas.DataFrame - :param threshold: Decision threshold for determining binary classification - outcome based on model's prediction probabilities. - :type threshold: Float - :param max_len: The maximum length of input sequences. Used to pad or truncate - tokenized sequences. Default value is 6. - :type max_len: Int - :return: A tuple containing the following elements: - - y_true: The true labels from the input DataFrame. - - y_pred: The predicted binary labels according to the decision threshold. - - proba: Prediction probabilities for the positive class, as output by the model. - - encoder.classes_: An array of class names corresponding to the label encoding. 
- :rtype: Tuple """ model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras")) tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl")) @@ -94,7 +40,7 @@ def evaluate_lstm(df, threshold, max_len=6): X = pad_sequences(sequences, maxlen=max_len, padding="post") y_true = encoder.transform(df["sex"]) proba = model.predict(X) - y_pred = 1 if proba[:, 1] >= threshold else 0 + y_pred = (proba[:, 1] >= threshold).astype(int) return y_true, y_pred, proba[:, 1], encoder.classes_ @@ -105,18 +51,6 @@ def evaluate_transformer(df, threshold, max_len=6): tokenizing and padding the "name" column and encodes the "sex" column to numerical format. The function then predicts the probabilities for the given names using the transformer model and generates predictions based on the specified threshold. - - :param df: Pandas DataFrame containing a "name" column with strings to be evaluated - and a "sex" column with corresponding target labels. - :type df: Pd.DataFrame - :param threshold: Threshold value used to determine binary classification labels - from predicted probabilities. - :type threshold: Float - :param max_len: Maximum length for padded sequences, default is 6. - :type max_len: Int, optional - :return: A tuple containing the ground truth labels, predicted labels, predicted - probabilities for the positive class, and a list of the label classes. 
- :rtype: Tuple """ model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.keras")) tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl")) @@ -126,7 +60,7 @@ def evaluate_transformer(df, threshold, max_len=6): X = pad_sequences(sequences, maxlen=max_len, padding="post") y_true = encoder.transform(df["sex"]) proba = model.predict(X) - y_pred = 1 if proba[:, 1] >= threshold else 0 + y_pred = (proba[:, 1] >= threshold).astype(int) return y_true, y_pred, proba[:, 1], encoder.classes_ @@ -135,19 +69,6 @@ def compute_metrics(y_true, y_pred, y_proba, class_names): Computes classification metrics for given true and predicted labels, along with class probabilities and class names. The function calculates accuracy, precision, recall, F1 score, and confusion matrix for evaluating model performance. - - :param y_true: Ground truth (correct) labels. - :type y_true: list or numpy.ndarray - :param y_pred: Predicted labels, as returned by a classifier. - :type y_pred: list or numpy.ndarray - :param y_proba: Predicted probabilities for positive class. - :type y_proba: list or numpy.ndarray - :param class_names: Names of the classes corresponding to labels in the confusion - matrix. - :type class_names: numpy.ndarray - :return: A dictionary containing computed accuracy, precision, recall, F1 score, - and confusion matrix with labels and matrix elements. 
-    :rtype: dict
     """
     acc = accuracy_score(y_true, y_pred)
     pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
@@ -168,20 +89,22 @@
 def main():
     parser = argparse.ArgumentParser(description="Evaluate gender prediction model")
     parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
-    parser.add_argument("--dataset", default="names.csv")
-    parser.add_argument("--size", type=int)
-    parser.add_argument("--threshold", type=float, default=0.5)
+    parser.add_argument("--dataset", default="names_featured.csv", help="Path to the dataset CSV file")
+    parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
+    parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
+    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
     args = parser.parse_args()
 
-    df = load_dataset(args.dataset, args.size)
+    # load_csv_dataset returns a list of dicts, but the evaluate_* helpers index
+    # DataFrame columns (df["name"].tolist(), df["sex"]); wrap the rows in a
+    # DataFrame and restore the cleaning the removed load_dataset() performed.
+    # pandas is imported locally because this patch also removed the
+    # module-level `import pandas as pd`.
+    import pandas as pd
+    df = pd.DataFrame(load_csv_dataset(args.dataset, args.size, args.balanced))
+    df = df.dropna(subset=["name", "sex"])
+    df["name"] = df["name"].str.lower().str.strip()
+    df["sex"] = df["sex"].str.lower().str.strip()
 
-    if args.model == "logreg":
-        y_true, y_pred, y_proba, classes = evaluate_logreg(df, args.threshold)
-    elif args.model == "lstm":
-        y_true, y_pred, y_proba, classes = evaluate_lstm(df, args.threshold)
-    elif args.model == "transformer":
-        y_true, y_pred, y_proba, classes = evaluate_transformer(df, args.threshold)
-    else:
-        raise ValueError(f"Unknown model: {args.model}")
+    # argparse `choices` already rejects unknown models, so a plain dict lookup
+    # is safe; wrapping the call in try/except KeyError would mask KeyErrors
+    # raised *inside* the evaluate_* helpers as "Unknown model".
+    model_funcs = {
+        "logreg": evaluate_logreg,
+        "lstm": evaluate_lstm,
+        "transformer": evaluate_transformer,
+    }
+    y_true, y_pred, y_proba, classes = model_funcs[args.model](df, args.threshold)
 
     results = compute_metrics(y_true, y_pred, y_proba, classes)
diff --git a/ners/gender/models/__init__.py b/ners/gender/models/__init__.py
index e69de29..b2cdf24 100644
--- a/ners/gender/models/__init__.py
+++ b/ners/gender/models/__init__.py
@@ -0,0 +1,82 @@
+import argparse
+import logging
+from dataclasses import 
dataclass +from typing import Optional + +from sklearn.metrics import ( + accuracy_score, precision_recall_fscore_support, + classification_report, confusion_matrix +) + +logging.basicConfig(level=logging.INFO, format=">> %(message)s") + + +def evaluate_proba(y_true, y_proba, threshold, class_names): + y_pred = (y_proba[:, 1] >= threshold).astype(int) + acc = accuracy_score(y_true, y_pred) + pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary") + cm = confusion_matrix(y_true, y_pred) + + logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}") + print("Confusion Matrix:\n", cm) + print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names)) + + +@dataclass +class BaseConfig: + """ + Represents the base configuration for a dataset and its associated parameters. + + This class serves as a foundational configuration handler to encapsulate + dataset-related parameters and options. It allows customization of dataset + behavior, including threshold values, size, cross-validation settings, and + whether to save derived configurations. It can also manage configurations + for balanced datasets if necessary. + """ + dataset_path: str = "names_featured.csv" + size: Optional[int] = None + threshold: float = 0.5 + cv: Optional[int] = None + save: bool = False + balanced: bool = False + + epochs: int = 10 + test_size: float = 0.2 + random_state: int = 42 + + +def load_config(description: str) -> BaseConfig: + """ + Parses command-line arguments and loads the configuration for the logistic regression model. + + This function sets up an argument parser for various command-line options including + the dataset path, dataset size, dataset balancing, classification threshold, + cross-validation folds, and saving the model and its associated artifacts. Once parsed, + it transfers the configurations to a ``BaseConfig`` instance and returns it. 
+ """ + parser = argparse.ArgumentParser(description) + + parser.add_argument("--dataset", type=str, default="names_featured.csv", help="Path to the dataset file") + parser.add_argument("--size", type=int, help="Number of rows to load from the dataset") + parser.add_argument("--balanced", action="store_true", help="Load balanced dataset") + parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification") + parser.add_argument("--cv", type=int, help="Number of folds for cross-validation") + parser.add_argument("--save", action="store_true", help="Save the model and artifacts after training") + + parser.add_argument("--epochs", type=int, default=10, help="Number of epochs for training") + parser.add_argument("--test_size", type=float, default=0.2, help="Proportion of the dataset to include in the test split") + parser.add_argument("--random_state", type=int, default=42, help="Random seed for reproducibility") + + args = parser.parse_args() + + return BaseConfig( + dataset_path=args.dataset, + size=args.size, + threshold=args.threshold, + cv=args.cv, + save=args.save, + balanced=args.balanced, + epochs=args.epochs, + test_size=args.test_size, + random_state=args.random_state + ) diff --git a/ners/gender/models/logreg.py b/ners/gender/models/logreg.py index 5aed7dc..1be933e 100644 --- a/ners/gender/models/logreg.py +++ b/ners/gender/models/logreg.py @@ -1,8 +1,6 @@ -import argparse -import logging import os from dataclasses import dataclass -from typing import Tuple, Optional +from typing import Tuple import pandas as pd from sklearn.feature_extraction.text import CountVectorizer @@ -16,54 +14,20 @@ from sklearn.pipeline import make_pipeline, Pipeline from sklearn.preprocessing import LabelEncoder from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle - -logging.basicConfig(level=logging.INFO, format=">> %(message)s") +from ners.gender.models import BaseConfig, load_config, logging @dataclass -class Config: - 
dataset_path: str - size: Optional[int] - test_size: float = 0.2 +class Config(BaseConfig): ngram_range: Tuple[int, int] = (2, 5) max_iter: int = 1000 - random_state: int = 42 - threshold: float = 0.5 - cv: Optional[int] = None - save: bool = False - - -def load_and_clean_data(cfg: Config) -> Tuple[pd.Series, pd.Series]: - """ - Load and clean dataset as specified by the provided configuration. This function reads - a CSV dataset from the path specified in the configuration, processes it to remove - missing values from key columns ('name' and 'sex'), and cleans string data in these - columns by converting them to lowercase and stripping whitespace. The cleaned data - is then returned as two separate pandas Series objects. - - :param cfg: Configuration object specifying the dataset path and size - :type cfg: Config - :return: A tuple containing cleaned `name` and `sex` data as pandas Series objects - :rtype: Tuple[pd.Series, pd.Series] - """ - logging.info(f"Loading dataset from {cfg.dataset_path}") - df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)) - df = df.dropna(subset=["name", "sex"]) - df["name"] = df["name"].str.lower().str.strip() - df["sex"] = df["sex"].str.lower().str.strip() - return df["name"], df["sex"] def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]: """ - Encode the labels of a given pandas Series using a LabelEncoder. This process maps categorical - labels to integers, which is particularly useful for machine learning models that require numerical - input data. - - :param y: A pandas Series of categorical labels to be encoded. - :type y: pd.Series - :return: A tuple containing the encoded labels as a pandas Series and the fitted LabelEncoder object. - :rtype: Tuple[pd.Series, LabelEncoder] + Encode the labels using a LabelEncoder. This function takes a pandas Series of labels, + fits a LabelEncoder to the labels, and transforms them into a numerical format suitable + for model training. 
The transformed labels and the fitted encoder are returned. """ logging.info("Encoding labels") encoder = LabelEncoder() @@ -73,21 +37,11 @@ def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]: def build_model(cfg: Config) -> Pipeline: """ - Builds a machine learning pipeline for text classification. - - This function constructs and returns a scikit-learn pipeline that consists of - a `CountVectorizer` and a `LogisticRegression` classifier. The vectorizer - leverages character-level n-grams based on the provided configuration, and the - logistic regression model is trained with a maximum number of iterations defined - in the configuration. This pipeline is used for processing text data and training - classification models. - - :param cfg: Configuration object containing the n-gram range and the maximum - number of iterations for the logistic regression model. - :type cfg: Config - :return: A scikit-learn pipeline with a `CountVectorizer` and `LogisticRegression` - based on the provided configuration. - :rtype: Pipeline + Build a logistic regression model pipeline with a character-level CountVectorizer. + The pipeline consists of a CountVectorizer that transforms the input text into + character n-grams, followed by a Logistic Regression classifier. The n-gram range + and maximum iterations for the logistic regression can be configured through the + provided configuration object. """ return make_pipeline( CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range), @@ -95,7 +49,7 @@ def build_model(cfg: Config) -> Pipeline: ) -def evaluate_probabilities(y_true, y_proba, threshold: float, class_names): +def evaluate_proba(y_true, y_proba, threshold: float, class_names): """ Evaluates the performance of a classification model using a specified threshold for predicted probabilities. 
Computes metrics such as accuracy, precision, @@ -104,19 +58,6 @@ def evaluate_probabilities(y_true, y_proba, threshold: float, class_names): Logs the evaluation metrics at the specified threshold and prints the confusion matrix and classification report. - - :param y_true: Ground truth (correct) labels. - :type y_true: array-like - :param y_proba: Predicted probabilities for each class, where each row - corresponds to an instance and contains probabilities for each target class. - :type y_proba: numpy.ndarray - :param threshold: The threshold on predicted probabilities to determine - class membership for each instance. - :type threshold: float - :param class_names: List of class names for the target variable used in the - classification report. - :type class_names: list of str - :return: None """ logging.info(f"Evaluating at threshold = {threshold}") y_pred = (y_proba[:, 1] >= threshold).astype(int) @@ -135,16 +76,6 @@ def cross_validate(cfg: Config, X, y) -> None: Performs k-fold cross-validation on the provided dataset using the configuration and logs the results including individual fold scores, mean accuracy, and the standard deviation of the scores. - - :param cfg: Configuration object containing cross-validation settings such as the - number of folds to use in the cross-validation (`cv`). - :type cfg: Config - :param X: Input feature matrix for the dataset to be used for cross-validation. - :type X: Any - :param y: Target labels corresponding to the input feature matrix `X`. - :type y: Any - :return: This function does not return any value. Results are logged. 
- :rtype: None """ logging.info(f"Running {cfg.cv}-fold cross-validation") pipeline = build_model(cfg) @@ -153,21 +84,9 @@ def cross_validate(cfg: Config, X, y) -> None: logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}") -def save_artifacts(model, encoder, cfg: Config): +def save_artifacts(model, encoder): """ - Saves machine learning model and label encoder artifacts to specified directories - within the gender models' directory. This function ensures that the model and encoder - are serialized and stored as pickle files. It uses the specified configuration settings - to locate the appropriate directory for storing the files. - - :param model: The machine learning model object to be saved. - :type model: Any - :param encoder: The label encoder object used for data preprocessing. - :type encoder: Any - :param cfg: Configuration object containing application-specific settings regarding - paths and directories. - :type cfg: Config - :return: None + Saves the trained model and label encoder artifacts to the specified directory. 
""" save_pickle(model, os.path.join(GENDER_MODELS_DIR, "regression_model.pkl")) save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl")) @@ -176,23 +95,10 @@ def save_artifacts(model, encoder, cfg: Config): def main(): - parser = argparse.ArgumentParser(description="Train a gender classifier on names") - parser.add_argument("--dataset", type=str, default="names.csv", help="Path to dataset") - parser.add_argument("--size", type=int, help="Number of rows to load") - parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for binary decision") - parser.add_argument("--cv", type=int, help="Number of folds for cross-validation") - parser.add_argument("--save", action="store_true", help="Save the model and encoder") - args = parser.parse_args() + cfg = Config(**vars(load_config("logistic regression model"))) - cfg = Config( - dataset_path=args.dataset, - size=args.size, - threshold=args.threshold, - cv=args.cv, - save=args.save - ) - - X_raw, y_raw = load_and_clean_data(cfg) + df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced)) + X_raw, y_raw = df["name"], df["sex"] y_encoded, encoder = encode_labels(y_raw) if cfg.cv: @@ -207,10 +113,10 @@ def main(): model.fit(X_train, y_train) y_proba = model.predict_proba(X_test) - evaluate_probabilities(y_test, y_proba, cfg.threshold, class_names=encoder.classes_) + evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_) if cfg.save: - save_artifacts(model, encoder, cfg) + save_artifacts(model, encoder) if __name__ == "__main__": diff --git a/ners/gender/models/lstm.py b/ners/gender/models/lstm.py index 48750ea..536aeac 100644 --- a/ners/gender/models/lstm.py +++ b/ners/gender/models/lstm.py @@ -1,13 +1,11 @@ -import argparse -import logging import os from dataclasses import dataclass -from typing import Tuple, Optional +from typing import Tuple import numpy as np import pandas as pd from sklearn.metrics import ( - 
accuracy_score, classification_report, precision_recall_fscore_support, confusion_matrix + accuracy_score ) from sklearn.model_selection import train_test_split, StratifiedKFold from sklearn.preprocessing import LabelEncoder @@ -18,82 +16,25 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences from tensorflow.keras.preprocessing.text import Tokenizer from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle - -logging.basicConfig(level=logging.INFO, format=">> %(message)s") +from ners.gender.models import load_config, BaseConfig, evaluate_proba, logging @dataclass -class Config: - """ - Configuration for the machine learning model and its training process. - - This class encapsulates the configuration options necessary for initializing, - training, and evaluating a machine learning model. It allows flexibility - in specifying dataset details, model parameters, training settings, and - options for evaluation. Attributes include paths, numerical parameters, - and flags that guide the model's behavior. - - :ivar dataset_path: Path to the dataset file. - :type dataset_path: str - :ivar size: Optional size of the dataset to use. If None, use the full dataset. - :type size: Optional[int] - :ivar max_len: Maximum length of sequences used in the model. - :type max_len: int - :ivar embedding_dim: Dimensionality of the embedding layer. - :type embedding_dim: int - :ivar lstm_units: Number of LSTM units in the model. - :type lstm_units: int - :ivar batch_size: Batch size to use during training. - :type batch_size: int - :ivar epochs: Number of epochs for model training. - :type epochs: int - :ivar test_size: Fraction of data to use for testing. - :type test_size: float - :ivar random_state: Seed for random number generation to ensure reproducibility. - :type random_state: int - :ivar threshold: Decision threshold for binary classification tasks. - :type threshold: float - :ivar cv: Number of cross-validation folds. If None, no cross-validation is used. 
- :type cv: Optional[int] - :ivar save: Flag indicating whether to save the trained model. - :type save: bool - """ - dataset_path: str - size: Optional[int] = None +class Config(BaseConfig): max_len: int = 6 embedding_dim: int = 64 lstm_units: int = 32 batch_size: int = 64 - epochs: int = 10 - test_size: float = 0.2 - random_state: int = 42 - threshold: float = 0.5 - cv: Optional[int] = None - save: bool = False def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]: """ - Load and preprocess the dataset based on the provided configuration. - - This function performs a series of operations including loading the dataset - from the specified path, cleaning and preprocessing data (e.g., converting - to lowercase, stripping whitespace, handling missing values), tokenizing names - using a tokenizer, and encoding the labels using a label encoder. The final processed - data and tools (tokenizer and label encoder) are returned for further use. - - :param cfg: Config object containing dataset parameters such as dataset path, size, and - maximum sequence length. - :type cfg: Config - :return: A tuple containing processed padded sequences (numpy ndarray), corresponding - encoded labels (numpy ndarray), tokenizer object used for preprocessing names, - and label encoder object used for encoding labels. - :rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder] + Loads and preprocesses data for text classification by tokenizing text data, encoding labels, and padding sequences. + This function expects a dataset file path, prepares the tokenizer to process text input, and encodes labels for + model training. The resulting outputs are ready for input into a machine learning pipeline. 
""" logging.info("Loading and preprocessing data") - df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"]) - df["name"] = df["name"].str.lower().str.strip() - df["sex"] = df["sex"].str.lower().str.strip() + df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced)) tokenizer = Tokenizer(char_level=False, lower=True, oov_token="") tokenizer.fit_on_texts(df["name"]) @@ -107,6 +48,12 @@ def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, La def build_model(cfg: Config, vocab_size: int) -> Sequential: + """ + Builds and compiles a Sequential LSTM-based model. The model consists of an + embedding layer, two bidirectional LSTM layers, a dense hidden layer with ReLU + activation, and an output layer with a softmax activation function. The model + is compiled using sparse categorical crossentropy loss and the Adam optimizer. + """ logging.info("Building LSTM model") model = Sequential([ Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim), @@ -119,60 +66,12 @@ def build_model(cfg: Config, vocab_size: int) -> Sequential: return model -def evaluate_proba(y_true, y_proba, threshold, class_names): - """ - Evaluate the performance of a binary classification model by calculating key metrics and printing - a detailed classification report. - - This function thresholds the predicted probabilities to produce binary predictions and calculates - metrics such as accuracy, precision, recall, and F1 score. It also generates a confusion matrix - and a classification report for the model's performance. Additionally, metrics are logged and - informational outputs are printed. - - :param y_true: Ground truth binary labels. Must be a 1-dimensional array or list of integers. - :param y_proba: Predicted probabilities for each class from the model. It is a 2-dimensional array - where the second dimension represents class probabilities for each sample. 
- :param threshold: Threshold value for converting probabilities into binary predictions. Should be - a float between 0 and 1. - :param class_names: List of class names corresponding to the binary labels. Used for labeling the - classification report. - :return: None - """ - y_pred = (y_proba[:, 1] >= threshold).astype(int) - acc = accuracy_score(y_true, y_pred) - pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary') - cm = confusion_matrix(y_true, y_pred) - - logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}") - print("Confusion Matrix:\n", cm) - print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names)) - - def cross_validate(cfg: Config, X, y, vocab_size: int): """ - Performs k-fold cross-validation on a dataset using a specified model configuration. - - This function takes a dataset and corresponding labels, splits the dataset into - k folds (based on the `cv` attribute of the provided configuration object), and - performs cross-validation using the specified deep learning model. The model is - built and trained on the training subset for each fold, and the validation subset - is used to compute accuracy scores. Finally, it logs the individual fold accuracies - and the overall mean accuracy with its standard deviation. - - :param cfg: Configuration object containing the parameters for cross-validation, - model training, and other settings. `cv` specifies the number of folds, - and other attributes such as `epochs`, `batch_size`, and `random_state` - dictate the training and reproducibility behavior. - :type cfg: Config - :param X: Feature data for the dataset. Assumes the input is compatible with the - model configuration. - :param y: True labels corresponding to the dataset. The order should correspond - to the feature set `X`. - :param vocab_size: Total vocabulary size used for building the model. Determines - the structure of the model input. 
- :type vocab_size: int - :return: A list containing the accuracy scores for each fold. - :rtype: List[float] + Performs cross-validation on the given dataset using the specified model configuration. + The function uses StratifiedKFold cross-validator to split the dataset into training and + validation sets for each fold. For each fold, it trains the model, evaluates its accuracy + on the validation data, and logs the fold-wise and overall results. """ logging.info(f"Running {cfg.cv}-fold cross-validation") skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state) @@ -195,23 +94,11 @@ def cross_validate(cfg: Config, X, y, vocab_size: int): def save_artifacts(model, tokenizer, encoder): """ - Save the model, tokenizer, and label encoder artifacts to predefined file paths - within the GENDER_MODELS_DIR directory. The function ensures that the model is - saved in H5 format, while the tokenizer and encoder are serialized using the - Pickle module. It logs a message indicating the completion of the saving process. + Saves the given model, tokenizer, and encoder artifacts to a predefined directory. - :param model: The machine learning model object to be saved. - :type model: Any - - :param tokenizer: The tokenizer object used in preprocessing, to be serialized - for future use. - :type tokenizer: Any - - :param encoder: The label encoder object used for encoding labels during - training, to be serialized for future use. - :type encoder: Any - - :return: None + The function ensures that the specified directory for saving artifacts exists, + then serializes the model, tokenizer, and encoder using appropriate formats. It + also logs the success of the operation to notify the user of the action taken. 
""" os.makedirs(GENDER_MODELS_DIR, exist_ok=True) model.save(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras")) @@ -223,21 +110,7 @@ def save_artifacts(model, tokenizer, encoder): def main(): - parser = argparse.ArgumentParser(description="Train BiLSTM model for name-based gender classification") - parser.add_argument("--dataset", type=str, default="names.csv") - parser.add_argument("--size", type=int) - parser.add_argument("--threshold", type=float, default=0.5) - parser.add_argument("--cv", type=int) - parser.add_argument("--save", action="store_true") - args = parser.parse_args() - - cfg = Config( - dataset_path=args.dataset, - size=args.size, - threshold=args.threshold, - cv=args.cv, - save=args.save - ) + cfg = Config(**vars(load_config("Long Short-Term Memory (LSTM) model"))) X, y, tokenizer, encoder = load_and_prepare(cfg) vocab_size = len(tokenizer.word_index) + 1 diff --git a/ners/gender/models/transformer.py b/ners/gender/models/transformer.py index 558feac..911b4ae 100644 --- a/ners/gender/models/transformer.py +++ b/ners/gender/models/transformer.py @@ -1,15 +1,12 @@ -import argparse -import logging import os from dataclasses import dataclass -from typing import Tuple, Optional +from typing import Tuple import numpy as np import pandas as pd import tensorflow as tf from sklearn.metrics import ( - accuracy_score, precision_recall_fscore_support, - classification_report, confusion_matrix + accuracy_score ) from sklearn.model_selection import train_test_split, StratifiedKFold from sklearn.preprocessing import LabelEncoder @@ -23,56 +20,11 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences from tensorflow.keras.preprocessing.text import Tokenizer from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle - -logging.basicConfig(level=logging.INFO, format=">> %(message)s") +from ners.gender.models import BaseConfig, load_config, evaluate_proba, logging @dataclass -class Config: - """ - Configuration data class used to store settings 
and parameters for a machine learning or deep - learning model. - - This class allows the user to specify various parameters such as dataset path, size of input, - model architecture details like embedding dimensions, transformer configurations, training settings - like batch size and epochs, and validation and testing settings. The attributes provide flexibility - to customize model configuration and training processes. - - :ivar dataset_path: The file path to the dataset. - :type dataset_path: str - :ivar size: Optional size parameter, can be used to specify sample size or custom - configuration based on the user's requirement. - :type size: Optional[int] - :ivar max_len: Maximum sequence length for input data, used often in text or sequence - processing. - :type max_len: int - :ivar embedding_dim: The dimensionality of embeddings used in the model. - :type embedding_dim: int - :ivar transformer_head_size: The size of each transformer attention head. - :type transformer_head_size: int - :ivar transformer_num_heads: The number of attention heads in the transformer model. - :type transformer_num_heads: int - :ivar transformer_ff_dim: The dimensionality of the feed-forward network in the transformer. - :type transformer_ff_dim: int - :ivar dropout: Dropout rate used for regularization during training. - :type dropout: float - :ivar batch_size: Batch size used for training and validation. - :type batch_size: int - :ivar epochs: Number of epochs for model training. - :type epochs: int - :ivar test_size: Proportion of the dataset to be used for testing. - :type test_size: float - :ivar random_state: Random seed value for reproducibility. - :type random_state: int - :ivar threshold: Threshold value for model predictions or classification. - :type threshold: float - :ivar cv: Cross-validation configuration, if applicable. - :type cv: Optional[int] - :ivar save: Boolean flag indicating whether to save the model after training. 
- :type save: bool - """ - dataset_path: str - size: Optional[int] +class Config(BaseConfig): max_len: int = 6 embedding_dim: int = 64 transformer_head_size: int = 64 @@ -80,38 +32,21 @@ class Config: transformer_ff_dim: int = 128 dropout: float = 0.1 batch_size: int = 64 - epochs: int = 10 - test_size: float = 0.2 - random_state: int = 42 - threshold: float = 0.5 - cv: Optional[int] = None - save: bool = False def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]: """ - Load and preprocess data for model training or evaluation. This function handles the - loading of a dataset in CSV format, applies preprocessing to clean and normalize - the input data, tokenizes text features, and encodes categorical labels. - - The preprocessed data is prepared as padded sequences and encoded labels, which - can be directly used as inputs for machine learning models. Tokenizer and LabelEncoder - are returned to ensure consistency between training and inference stages. - - :param cfg: Configuration object containing dataset path, size of the - dataset to load, and maximum length for padding sequences. - :type cfg: Config - :return: A tuple containing padded input sequences for the model, encoded labels, - the tokenizer used for text sequences, and the encoder used for labels. - :rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder] + Load and preprocess the dataset for training a Transformer model. + This function reads a CSV dataset, tokenizes the names, pads the sequences, + and encodes the labels. It returns the padded sequences, encoded labels, + tokenizer, and label encoder. 
""" logging.info("Loading and preprocessing data") - df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"]) - df["name"] = df["name"].str.lower().str.strip() - df["sex"] = df["sex"].str.lower().str.strip() + df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced)) tokenizer = Tokenizer(oov_token="") tokenizer.fit_on_texts(df["name"]) + sequences = tokenizer.texts_to_sequences(df["name"]) padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post") @@ -122,18 +57,8 @@ def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, La def transformer_encoder(x, cfg: Config): """ - Transforms input tensor using a single Transformer encoder block with attention and feedforward - layers. The encoder applies multi-head attention to the input tensor, adds the output to - the original tensor for residual connection, and normalizes it. Subsequently, the processed - tensor passes through a feedforward network with added dropout and normalization. - - :param x: Input tensor to be transformed. - :type x: TensorFlow tensor - :param cfg: Configuration object containing Transformer hyperparameters such as the number of - attention heads, head size, feedforward dimension, and dropout rate. - :type cfg: Config - :return: Transformed tensor resulting from applying the Transformer encoder block. - :rtype: TensorFlow tensor + Transformer encoder block that applies multi-head attention and feed-forward + neural network layers with residual connections and layer normalization. """ attn = MultiHeadAttention(num_heads=cfg.transformer_num_heads, key_dim=cfg.transformer_head_size)(x, x) x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attn)) @@ -145,18 +70,10 @@ def transformer_encoder(x, cfg: Config): def build_model(cfg: Config, vocab_size: int) -> Model: """ - Builds a Transformer-based model using Keras/TensorFlow components. 
The model - is designed for classification tasks, utilizing embedding layers with positional - encoding, a Transformer encoder block, and fully connected layers for - output generation. - - :param cfg: Configuration object containing model-specific hyperparameters - such as maximum sequence length, embedding dimensions, etc. - :type cfg: Config - :param vocab_size: The size of the vocabulary for the embedding layer. - :type vocab_size: int - :return: A compiled Keras model, ready for training and evaluation. - :rtype: Model + Builds a Transformer-based model aimed at sequence processing tasks. + The model includes an embedding layer integrating positional encodings + and a Transformer encoder, followed by a global pooling layer, + a dense hidden layer, and a softmax output layer. """ logging.info("Building Transformer model") inputs = Input(shape=(cfg.max_len,)) @@ -177,54 +94,11 @@ def build_model(cfg: Config, vocab_size: int) -> Model: return model -def evaluate_proba(y_true, y_proba, threshold, class_names): - """ - Evaluates the performance of a binary classification model by calculating accuracy, - precision, recall, F1 score, confusion matrix, and generates a classification - report. This function takes the true labels, predicted probabilities, a decision - threshold, and class names to assist in the evaluation. - - :param y_true: Ground truth (correct) target values. - :type y_true: array-like of shape (n_samples,) - :param y_proba: Predicted probabilities for each class. Expected to be an array - where the second column corresponds to the probability of the positive class. - :type y_proba: array-like of shape (n_samples, 2) - :param threshold: Decision threshold for classifying a sample as positive - or negative based on predicted probabilities. - :type threshold: float - :param class_names: List of class names for labeling the classification report. - :type class_names: list of str - :return: None. 
Outputs performance metrics and confusion matrix to the logging - system and the console. - """ - y_pred = (y_proba[:, 1] >= threshold).astype(int) - acc = accuracy_score(y_true, y_pred) - pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary") - cm = confusion_matrix(y_true, y_pred) - - logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}") - print("Confusion Matrix:\n", cm) - print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names)) - - def cross_validate(cfg: Config, X, y, vocab_size: int): """ - Evaluate the performance of a model using K-fold cross-validation. This function takes - configuration settings, input data, target labels, and vocabulary size to perform the - specified number of cross-validation folds with a stratified approach. For each fold, - it builds a new model, trains it, predicts the validation set, and calculates accuracy. - - :param cfg: The configuration object containing hyperparameters and settings for - cross-validation, random state, and training. - :type cfg: Config - :param X: The input data samples provided as a dataset. - :type X: numpy.ndarray - :param y: The target labels corresponding to the input data samples. - :type y: numpy.ndarray - :param vocab_size: The size of the vocabulary, used to configure the language model. - :type vocab_size: int - :return: A list containing accuracy scores from each fold in the cross-validation process. - :rtype: list + Performs cross-validation using the given configuration, dataset, and specified vocabulary size. This function + splits the dataset into stratified folds, trains a model on each fold, and evaluates its performance on validation + data. The overall mean and standard deviation of accuracies across all folds are logged. 
""" logging.info(f"Running {cfg.cv}-fold cross-validation") skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state) @@ -247,14 +121,11 @@ def cross_validate(cfg: Config, X, y, vocab_size: int): def save_artifacts(model, tokenizer, encoder): """ - Saves the machine learning model and its associated artifacts such as tokenizer and - label encoder to predefined file paths. This function ensures that the model and - artifacts can be reloaded later for inference or further use. - - :param model: The machine learning model to be saved. - :param tokenizer: The tokenizer used for preparing data for the model. - :param encoder: The label encoder used for encoding target labels. - :return: None + Saves the model and associated artifacts to the designated directory. The model + is serialized and saved in a `.keras` file, while the tokenizer and label + encoder are serialized into `.pkl` files. If the directory does not exist, it + is created automatically. This function also logs the completion of the + operation. 
""" os.makedirs(GENDER_MODELS_DIR, exist_ok=True) model.save(os.path.join(GENDER_MODELS_DIR, "transformer.keras")) @@ -266,21 +137,7 @@ def save_artifacts(model, tokenizer, encoder): def main(): - parser = argparse.ArgumentParser(description="Train Transformer model for name-based gender classification") - parser.add_argument("--dataset", type=str, default="names.csv") - parser.add_argument("--size", type=int) - parser.add_argument("--threshold", type=float, default=0.5) - parser.add_argument("--cv", type=int) - parser.add_argument("--save", action="store_true") - args = parser.parse_args() - - cfg = Config( - dataset_path=args.dataset, - size=args.size, - threshold=args.threshold, - cv=args.cv, - save=args.save - ) + cfg = Config(**vars(load_config("Transformer model"))) X, y, tokenizer, encoder = load_and_prepare(cfg) vocab_size = len(tokenizer.word_index) + 1 diff --git a/ners/gender/predict.py b/ners/gender/predict.py index d15a637..cde8f18 100644 --- a/ners/gender/predict.py +++ b/ners/gender/predict.py @@ -17,19 +17,6 @@ def predict_logreg(names: List[str], threshold: float): The function takes in a list of names and predicts the gender labels based on a logistic regression model. A probabilistic threshold is used to classify the names into one of the defined labels. - - :param names: - A list of names for which the gender needs to be predicted. Each - name must be a string. - :param threshold: - A float value representing the threshold for classification. Names - with predicted probabilities greater than or equal to this value - will be classified into the positive class. - :return: - A tuple containing the predicted gender labels and their - corresponding probabilities. The first element of the tuple is a - list of predicted labels, while the second element is an array of - probability scores for each label. 
""" model_path = os.path.join(GENDER_MODELS_DIR, "regression_model.pkl") encoder_path = os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl") @@ -51,19 +38,6 @@ def predict_lstm(names: List[str], threshold: float, max_len=6): The function loads the model, tokenizer, and label encoder, performs preprocessing on the input names, and then uses the loaded model to predict gender probabilities. Based on the threshold value, it determines the predicted gender labels. - - :param names: List of names to be classified. - :type names: List[str] - :param threshold: Probability threshold for classifying gender. If the predicted probability for the - 'positive' class is greater than or equal to this threshold, it is classified accordingly. - :type threshold: float - :param max_len: Maximum length for name sequences. Names longer than this will be truncated, and shorter - ones will be padded. Default value is 6. - :type max_len: int, optional - - :return: A tuple containing predicted labels and associated probabilities. Labels are the predicted gender - categories, and probabilities are the prediction scores for each input name. - :rtype: Tuple[numpy.ndarray, numpy.ndarray] """ model_path = os.path.join(GENDER_MODELS_DIR, "lstm_model.keras") tokenizer_path = os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl") @@ -89,20 +63,6 @@ def predict_transformer(names: List[str], threshold: float, max_len=6): encoder, converts input names into tokenized sequences, and processes them to generate gender predictions. The function returns the predicted labels and the associated probabilities for each sample. - - :param names: List of names to predict gender labels for. - :type names: List[str] - :param threshold: Threshold value to determine the prediction class. Probability values - above or equal to the threshold will be assigned to one class, and those below to - another. - :type threshold: float - :param max_len: Maximum length for the sequences. 
Names will be truncated or padded to - this length during processing, default is 6. - :type max_len: int, optional - :return: A tuple containing two elements: a list of predicted gender labels as strings - and a NumPy array of probabilities for each gender class (where the first index - corresponds to one class, and the second index corresponds to another). - :rtype: Tuple[List[str], numpy.ndarray] """ model_path = os.path.join(GENDER_MODELS_DIR, "transformer.keras") tokenizer_path = os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl") @@ -123,24 +83,21 @@ def predict_transformer(names: List[str], threshold: float, max_len=6): def main(): parser = argparse.ArgumentParser(description="Predict gender from names using trained model") parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True) - parser.add_argument("--name", nargs="+", required=True, help="One or more names") + parser.add_argument("--names", nargs="+", required=True, help="One or more names") parser.add_argument("--threshold", type=float, default=0.5, help="Threshold for classification") args = parser.parse_args() - model = args.model - names = args.name - threshold = args.threshold + model_funcs = { + "logreg": predict_logreg, + "lstm": predict_lstm, + "transformer": predict_transformer, + } + try: + labels, proba = model_funcs[args.model](args.names, args.threshold) + except KeyError: + raise ValueError(f"Unsupported model type: {args.model}") - if model == "logreg": - labels, proba = predict_logreg(names, threshold) - elif model == "lstm": - labels, proba = predict_lstm(names, threshold) - elif model == "transformer": - labels, proba = predict_transformer(names, threshold) - else: - raise ValueError(f"Unsupported model type: {model}") - - for i, name in enumerate(names): + for i, name in enumerate(args.names): p_female = proba[i][0] p_male = proba[i][1] print(f"{name} → {labels[i]} | P(f): {p_female:.2f} | P(m): {p_male:.2f}") diff --git 
a/processing/gender/prepare.py b/processing/gender/prepare.py
new file mode 100644
index 0000000..6b14bf5
--- /dev/null
+++ b/processing/gender/prepare.py
@@ -0,0 +1,52 @@
+import os
+
+import pandas as pd
+
+from misc import DATA_DIR
+
+
+def clean(filepath):
+    encodings = ['utf-8', 'utf-16', 'latin1']
+    for enc in encodings:
+        try:
+            print(f">> Trying to read {filepath} with encoding: {enc}")
+            df = pd.read_csv(filepath, encoding=enc, on_bad_lines='skip')
+
+            print(">> Remove null bytes and non-breaking spaces from all string columns")
+            for col in df.select_dtypes(include=['object']).columns:
+                df[col] = df[col].astype(str).str.replace('\x00', ' ', regex=False)
+                df[col] = df[col].str.replace('\u00a0', ' ', regex=False)
+                df[col] = df[col].str.replace(' +', ' ', regex=True)
+
+            print(f">> Successfully read with encoding: {enc}")
+            df = df.dropna(subset=['name', 'sex', 'region'])
+            df.to_csv(filepath, index=False, encoding='utf-8')
+            return df
+        except Exception:
+            continue
+    raise ValueError(f"Unable to decode {filepath} with common encodings.")
+
+
+def main():
+    df = clean(os.path.join(DATA_DIR, 'names.csv'))
+
+    df['name'] = df['name'].str.strip().str.lower()
+    df['words'] = df['name'].str.split().apply(len)
+    df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()
+    df['probable_native'] = df['name'].str.split().apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
+    df['probable_surname'] = df['name'].str.split().apply(lambda x: x[-1] if len(x) > 0 else '')
+
+    print(">> Arranging columns")
+    cols = [c for c in df.columns if c != 'sex'] + ['sex']
+    df = df[cols]
+
+    print(">> Saving featured dataset")
+    df.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
+
+    print(">> Splitting dataset by sex")
+    df[df['sex'].str.lower() == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
+    df[df['sex'].str.lower() == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)
+
+
+if __name__ == 
'__main__': + main()