feat: balanced dataset loading
This commit is contained in:
@@ -13,28 +13,73 @@ cd drc-ners-nlp
|
|||||||
python3 -m venv .venv
|
python3 -m venv .venv
|
||||||
source .venv/bin/activate
|
source .venv/bin/activate
|
||||||
cp .env .env.local
|
cp .env .env.local
|
||||||
make download
|
|
||||||
|
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
## Gender Inference
|
## Gender Inference
|
||||||
### 1. Training
|
### 1. Dataset Preparation
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m ners.gender.models.lstm --dataset names.csv --size 1000000 --save
|
python -m processing.gender.prepare
|
||||||
python -m ners.gender.models.logreg --dataset names.csv --size 1000000 --save
|
|
||||||
python -m ners.gender.models.transformer --dataset names.csv --size 1000000 --save
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### 2. Evaluation
|
### 2. Training
|
||||||
|
Arguments:
|
||||||
|
|
||||||
|
| Name | Description | Default |
|
||||||
|
|----------------|--------------------------------------------------|--------------------|
|
||||||
|
| --dataset_path | Path to the dataset file | names_featured.csv |
|
||||||
|
| --size | Number of samples to use (None for full dataset) | None |
|
||||||
|
| --threshold | Probability threshold for gender classification | 0.5 |
|
||||||
|
| --cv | Number of cross-validation folds | None |
|
||||||
|
| --save | Whether to save the trained model | False |
|
||||||
|
| --balanced | Whether to balance the dataset | False |
|
||||||
|
| --epochs | Number of training epochs | 10 |
|
||||||
|
| --test_size | Proportion of data to use as test set | 0.2 |
|
||||||
|
| --random_state | Random seed for reproducibility | 42 |
|
||||||
|
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m ners.gender.models.lstm --size 1000000 --save
|
||||||
|
python -m ners.gender.models.logreg --size 1000000 --save
|
||||||
|
python -m ners.gender.models.transformer --size 1000000 --save
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Evaluation
|
||||||
|
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
|
||||||
|
| Name | Description | Default |
|
||||||
|
|------------|-----------------------------------------------|----------------------|
|
||||||
|
| --model | Model type: logreg, lstm, or transformer | (required) |
|
||||||
|
| --dataset | Path to the dataset CSV file | names_featured.csv |
|
||||||
|
| --size | Number of rows to load from the dataset | None |
|
||||||
|
| --balanced | Load balanced dataset | False |
|
||||||
|
| --threshold| Probability threshold for classification | 0.5 |
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m ners.gender.eval --dataset eval.csv --model logreg --threshold 0.5 --size 20000
|
python -m ners.gender.eval --dataset eval.csv --model logreg --threshold 0.5 --size 20000
|
||||||
python -m ners.gender.eval --dataset eval.csv --model lstm
|
python -m ners.gender.eval --dataset eval.csv --model lstm
|
||||||
python -m ners.gender.eval --dataset eval.csv --model transformer
|
python -m ners.gender.eval --dataset eval.csv --model transformer
|
||||||
```
|
```
|
||||||
|
|
||||||
### 3. Inference
|
### 4. Inference
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
|
||||||
|
| Name | Description | Default |
|
||||||
|
|-------------|------------------------------------------|-----------|
|
||||||
|
| --model | Model type: logreg, lstm, or transformer | (required)|
|
||||||
|
| --names | One or more names | (required)|
|
||||||
|
| --threshold | Threshold for classification | 0.5 |
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m ners.gender.predict --model logreg --name "Tshisekedi"
|
python -m ners.gender.predict --model logreg --name "Tshisekedi"
|
||||||
python -m ners.gender.predict --model lstm --name "Ilunga" "Albert" "Ilunga Albert" --threshold 0.7
|
python -m ners.gender.predict --model lstm --name "Ilunga" "Albert" "Ilunga Albert" --threshold 0.7
|
||||||
|
|||||||
+20
-23
@@ -4,6 +4,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
from typing import List, Dict
|
||||||
|
|
||||||
# Paths
|
# Paths
|
||||||
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
@@ -17,15 +18,6 @@ NER_MODELS_DIR = os.path.join(MODELS_DIR, 'ner')
|
|||||||
NER_RESULT_DIR = os.path.join(ROOT_DIR, 'ner', 'results')
|
NER_RESULT_DIR = os.path.join(ROOT_DIR, 'ner', 'results')
|
||||||
|
|
||||||
|
|
||||||
def clean_spacing(filename: str) -> Optional[str]:
|
|
||||||
try:
|
|
||||||
with open(os.path.join(DATA_DIR, filename), 'r', encoding='utf8') as f:
|
|
||||||
content = f.read()
|
|
||||||
return content.translate(str.maketrans({'\00': ' ', ' ': ' '}))
|
|
||||||
except Exception as e:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def load_json_dataset(path: str) -> list:
|
def load_json_dataset(path: str) -> list:
|
||||||
print(f">> Loading JSON dataset from {path}")
|
print(f">> Loading JSON dataset from {path}")
|
||||||
with open(os.path.join(DATA_DIR, path), "r", encoding="utf-8") as f:
|
with open(os.path.join(DATA_DIR, path), "r", encoding="utf-8") as f:
|
||||||
@@ -40,30 +32,35 @@ def save_csv_dataset(data: list, path: str) -> None:
|
|||||||
writer.writerows(data)
|
writer.writerows(data)
|
||||||
|
|
||||||
|
|
||||||
def load_csv_dataset(path: str, limit: int = None) -> list:
|
def load_csv_dataset(path: str, limit: int = None, balanced: bool = False) -> List[Dict[str, str]]:
|
||||||
print(f">> Loading CSV dataset from {path}")
|
print(f">> Loading CSV dataset from {path}")
|
||||||
data = []
|
|
||||||
encodings = ['utf-8', 'utf-16', 'latin1']
|
|
||||||
|
|
||||||
for enc in encodings:
|
file_path = os.path.join(DATA_DIR, path)
|
||||||
try:
|
with open(file_path, "r", encoding="utf-8", errors="replace", newline="") as f:
|
||||||
with open(os.path.join(DATA_DIR, path), "r", encoding=enc, errors="replace") as f:
|
|
||||||
raw_text = f.read().replace('\x00', '')
|
raw_text = f.read().replace('\x00', '')
|
||||||
|
|
||||||
csv_buffer = io.StringIO(raw_text)
|
reader = csv.DictReader(io.StringIO(raw_text))
|
||||||
reader = csv.DictReader(csv_buffer)
|
|
||||||
print(f">> Detected fieldnames: {reader.fieldnames}")
|
print(f">> Detected fieldnames: {reader.fieldnames}")
|
||||||
|
|
||||||
|
if balanced:
|
||||||
|
by_sex = {'m': [], 'f': []}
|
||||||
for row in reader:
|
for row in reader:
|
||||||
|
sex = row.get("sex", "").lower()
|
||||||
|
if sex in by_sex:
|
||||||
|
by_sex[sex].append(row)
|
||||||
|
min_len = min(len(by_sex['m']), len(by_sex['f']))
|
||||||
|
if limit:
|
||||||
|
min_len = min(min_len, limit // 2)
|
||||||
|
data = by_sex['m'][:min_len] + by_sex['f'][:min_len]
|
||||||
|
else:
|
||||||
|
data = []
|
||||||
|
for i, row in enumerate(reader):
|
||||||
data.append(row)
|
data.append(row)
|
||||||
if limit and len(data) >= limit:
|
if limit and i + 1 >= limit:
|
||||||
break
|
break
|
||||||
print(f">> Successfully loaded with encoding: {enc}")
|
|
||||||
return data
|
|
||||||
except Exception as e:
|
|
||||||
print(f">> Failed with encoding: {enc}, error: {e}")
|
|
||||||
|
|
||||||
raise UnicodeDecodeError("load_csv_dataset", path, 0, 0, "Unable to decode file with common encodings.")
|
print(">> Successfully loaded with UTF-8 encoding")
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
def save_json_dataset(data: list, path: str) -> None:
|
def save_json_dataset(data: list, path: str) -> None:
|
||||||
|
|||||||
+16
-93
@@ -1,8 +1,6 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import json
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from sklearn.metrics import (
|
from sklearn.metrics import (
|
||||||
accuracy_score, precision_recall_fscore_support, confusion_matrix
|
accuracy_score, precision_recall_fscore_support, confusion_matrix
|
||||||
@@ -12,47 +10,12 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|||||||
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_json_dataset, load_pickle, GENDER_RESULT_DIR
|
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_json_dataset, load_pickle, GENDER_RESULT_DIR
|
||||||
|
|
||||||
|
|
||||||
def load_dataset(path="names.csv", size=None):
|
|
||||||
"""
|
|
||||||
Loads a dataset from a CSV file, processes it to remove missing values
|
|
||||||
and standardizes the case and formatting of specific columns.
|
|
||||||
|
|
||||||
:param path: The path to the CSV file containing the dataset. Defaults to "names.csv".
|
|
||||||
:type path: str
|
|
||||||
:param size: The number of rows to load from the dataset. If None, the whole dataset is loaded.
|
|
||||||
:type size: Optional[int]
|
|
||||||
:return: A pandas DataFrame with the processed dataset where missing values in the
|
|
||||||
'name' and 'sex' columns are removed, and the text in these columns is
|
|
||||||
converted to lowercase and stripped of leading/trailing whitespace.
|
|
||||||
:rtype: pandas.DataFrame
|
|
||||||
"""
|
|
||||||
df = pd.DataFrame(load_csv_dataset(path, size)).dropna(subset=["name", "sex"])
|
|
||||||
df["name"] = df["name"].str.lower().str.strip()
|
|
||||||
df["sex"] = df["sex"].str.lower().str.strip()
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate_logreg(df, threshold):
|
def evaluate_logreg(df, threshold):
|
||||||
"""
|
"""
|
||||||
Evaluates a logistic regression model with the given DataFrame and threshold. The function loads
|
Evaluates a logistic regression model with the given DataFrame and threshold. The function loads
|
||||||
a pre-trained model and label encoder, transforms the input data into the required format, and
|
a pre-trained model and label encoder, transforms the input data into the required format, and
|
||||||
performs predictions. It returns the true labels, predicted labels, predicted probabilities, and
|
performs predictions. It returns the true labels, predicted labels, predicted probabilities, and
|
||||||
the encoder class labels.
|
the encoder class labels.
|
||||||
|
|
||||||
:param df: Input data containing a column "name" for names to evaluate and a column "sex"
|
|
||||||
for true labels.
|
|
||||||
Type: pandas.DataFrame
|
|
||||||
|
|
||||||
:param threshold: Threshold value used for classifying the predictions. Probabilities greater
|
|
||||||
than or equal to this value are classified into the positive class.
|
|
||||||
Type: float
|
|
||||||
|
|
||||||
:return: A tuple containing:
|
|
||||||
- y_true: True labels after encoding.
|
|
||||||
- y_pred: Predicted binary class labels based on the threshold.
|
|
||||||
- proba[:, 1]: Probability values for the positive class.
|
|
||||||
- encoder.classes_: Labels used by the label encoder.
|
|
||||||
Type: tuple (numpy.ndarray, int, numpy.ndarray, numpy.ndarray)
|
|
||||||
"""
|
"""
|
||||||
model = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
|
model = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
|
||||||
encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
|
encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
|
||||||
@@ -60,7 +23,7 @@ def evaluate_logreg(df, threshold):
|
|||||||
X = df["name"].tolist()
|
X = df["name"].tolist()
|
||||||
y_true = encoder.transform(df["sex"])
|
y_true = encoder.transform(df["sex"])
|
||||||
proba = model.predict_proba(X)
|
proba = model.predict_proba(X)
|
||||||
y_pred = 1 if proba[:, 1] >= threshold else 0
|
y_pred = (proba[:, 1] >= threshold).astype(int)
|
||||||
return y_true, y_pred, proba[:, 1], encoder.classes_
|
return y_true, y_pred, proba[:, 1], encoder.classes_
|
||||||
|
|
||||||
|
|
||||||
@@ -68,23 +31,6 @@ def evaluate_lstm(df, threshold, max_len=6):
|
|||||||
"""
|
"""
|
||||||
Evaluates the predictions of a pre-trained BiLSTM model on the given dataset and
|
Evaluates the predictions of a pre-trained BiLSTM model on the given dataset and
|
||||||
returns the true labels, predicted labels, prediction probabilities, and class names.
|
returns the true labels, predicted labels, prediction probabilities, and class names.
|
||||||
|
|
||||||
:param df: Input DataFrame containing the data for evaluation.
|
|
||||||
The DataFrame must have two columns: "name" containing
|
|
||||||
the input text data and "sex" containing the true labels.
|
|
||||||
:type df: Pandas.DataFrame
|
|
||||||
:param threshold: Decision threshold for determining binary classification
|
|
||||||
outcome based on model's prediction probabilities.
|
|
||||||
:type threshold: Float
|
|
||||||
:param max_len: The maximum length of input sequences. Used to pad or truncate
|
|
||||||
tokenized sequences. Default value is 6.
|
|
||||||
:type max_len: Int
|
|
||||||
:return: A tuple containing the following elements:
|
|
||||||
- y_true: The true labels from the input DataFrame.
|
|
||||||
- y_pred: The predicted binary labels according to the decision threshold.
|
|
||||||
- proba: Prediction probabilities for the positive class, as output by the model.
|
|
||||||
- encoder.classes_: An array of class names corresponding to the label encoding.
|
|
||||||
:rtype: Tuple
|
|
||||||
"""
|
"""
|
||||||
model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
|
model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
|
||||||
tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
|
tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
|
||||||
@@ -94,7 +40,7 @@ def evaluate_lstm(df, threshold, max_len=6):
|
|||||||
X = pad_sequences(sequences, maxlen=max_len, padding="post")
|
X = pad_sequences(sequences, maxlen=max_len, padding="post")
|
||||||
y_true = encoder.transform(df["sex"])
|
y_true = encoder.transform(df["sex"])
|
||||||
proba = model.predict(X)
|
proba = model.predict(X)
|
||||||
y_pred = 1 if proba[:, 1] >= threshold else 0
|
y_pred = (proba[:, 1] >= threshold).astype(int)
|
||||||
return y_true, y_pred, proba[:, 1], encoder.classes_
|
return y_true, y_pred, proba[:, 1], encoder.classes_
|
||||||
|
|
||||||
|
|
||||||
@@ -105,18 +51,6 @@ def evaluate_transformer(df, threshold, max_len=6):
|
|||||||
tokenizing and padding the "name" column and encodes the "sex" column to numerical format.
|
tokenizing and padding the "name" column and encodes the "sex" column to numerical format.
|
||||||
The function then predicts the probabilities for the given names using the transformer model
|
The function then predicts the probabilities for the given names using the transformer model
|
||||||
and generates predictions based on the specified threshold.
|
and generates predictions based on the specified threshold.
|
||||||
|
|
||||||
:param df: Pandas DataFrame containing a "name" column with strings to be evaluated
|
|
||||||
and a "sex" column with corresponding target labels.
|
|
||||||
:type df: Pd.DataFrame
|
|
||||||
:param threshold: Threshold value used to determine binary classification labels
|
|
||||||
from predicted probabilities.
|
|
||||||
:type threshold: Float
|
|
||||||
:param max_len: Maximum length for padded sequences, default is 6.
|
|
||||||
:type max_len: Int, optional
|
|
||||||
:return: A tuple containing the ground truth labels, predicted labels, predicted
|
|
||||||
probabilities for the positive class, and a list of the label classes.
|
|
||||||
:rtype: Tuple
|
|
||||||
"""
|
"""
|
||||||
model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
|
model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
|
||||||
tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
|
tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
|
||||||
@@ -126,7 +60,7 @@ def evaluate_transformer(df, threshold, max_len=6):
|
|||||||
X = pad_sequences(sequences, maxlen=max_len, padding="post")
|
X = pad_sequences(sequences, maxlen=max_len, padding="post")
|
||||||
y_true = encoder.transform(df["sex"])
|
y_true = encoder.transform(df["sex"])
|
||||||
proba = model.predict(X)
|
proba = model.predict(X)
|
||||||
y_pred = 1 if proba[:, 1] >= threshold else 0
|
y_pred = (proba[:, 1] >= threshold).astype(int)
|
||||||
return y_true, y_pred, proba[:, 1], encoder.classes_
|
return y_true, y_pred, proba[:, 1], encoder.classes_
|
||||||
|
|
||||||
|
|
||||||
@@ -135,19 +69,6 @@ def compute_metrics(y_true, y_pred, y_proba, class_names):
|
|||||||
Computes classification metrics for given true and predicted labels, along with
|
Computes classification metrics for given true and predicted labels, along with
|
||||||
class probabilities and class names. The function calculates accuracy, precision,
|
class probabilities and class names. The function calculates accuracy, precision,
|
||||||
recall, F1 score, and confusion matrix for evaluating model performance.
|
recall, F1 score, and confusion matrix for evaluating model performance.
|
||||||
|
|
||||||
:param y_true: Ground truth (correct) labels.
|
|
||||||
:type y_true: list or numpy.ndarray
|
|
||||||
:param y_pred: Predicted labels, as returned by a classifier.
|
|
||||||
:type y_pred: list or numpy.ndarray
|
|
||||||
:param y_proba: Predicted probabilities for positive class.
|
|
||||||
:type y_proba: list or numpy.ndarray
|
|
||||||
:param class_names: Names of the classes corresponding to labels in the confusion
|
|
||||||
matrix.
|
|
||||||
:type class_names: numpy.ndarray
|
|
||||||
:return: A dictionary containing computed accuracy, precision, recall, F1 score,
|
|
||||||
and confusion matrix with labels and matrix elements.
|
|
||||||
:rtype: dict
|
|
||||||
"""
|
"""
|
||||||
acc = accuracy_score(y_true, y_pred)
|
acc = accuracy_score(y_true, y_pred)
|
||||||
pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
|
pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
|
||||||
@@ -168,20 +89,22 @@ def compute_metrics(y_true, y_pred, y_proba, class_names):
|
|||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Evaluate gender prediction model")
|
parser = argparse.ArgumentParser(description="Evaluate gender prediction model")
|
||||||
parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
|
parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
|
||||||
parser.add_argument("--dataset", default="names.csv")
|
parser.add_argument("--dataset", default="names_featured.csv", help="Path to the dataset CSV file")
|
||||||
parser.add_argument("--size", type=int)
|
parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
|
||||||
parser.add_argument("--threshold", type=float, default=0.5)
|
parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
|
||||||
|
parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
df = load_dataset(args.dataset, args.size)
|
df = load_csv_dataset(args.dataset, args.size, args.balanced)
|
||||||
|
|
||||||
if args.model == "logreg":
|
model_funcs = {
|
||||||
y_true, y_pred, y_proba, classes = evaluate_logreg(df, args.threshold)
|
"logreg": evaluate_logreg,
|
||||||
elif args.model == "lstm":
|
"lstm": evaluate_lstm,
|
||||||
y_true, y_pred, y_proba, classes = evaluate_lstm(df, args.threshold)
|
"transformer": evaluate_transformer,
|
||||||
elif args.model == "transformer":
|
}
|
||||||
y_true, y_pred, y_proba, classes = evaluate_transformer(df, args.threshold)
|
try:
|
||||||
else:
|
y_true, y_pred, y_proba, classes = model_funcs[args.model](df, args.threshold)
|
||||||
|
except KeyError:
|
||||||
raise ValueError(f"Unknown model: {args.model}")
|
raise ValueError(f"Unknown model: {args.model}")
|
||||||
|
|
||||||
results = compute_metrics(y_true, y_pred, y_proba, classes)
|
results = compute_metrics(y_true, y_pred, y_proba, classes)
|
||||||
|
|||||||
@@ -0,0 +1,82 @@
|
|||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from sklearn.metrics import (
|
||||||
|
accuracy_score, precision_recall_fscore_support,
|
||||||
|
classification_report, confusion_matrix
|
||||||
|
)
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO, format=">> %(message)s")
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_proba(y_true, y_proba, threshold, class_names):
|
||||||
|
y_pred = (y_proba[:, 1] >= threshold).astype(int)
|
||||||
|
acc = accuracy_score(y_true, y_pred)
|
||||||
|
pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
|
||||||
|
cm = confusion_matrix(y_true, y_pred)
|
||||||
|
|
||||||
|
logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
|
||||||
|
print("Confusion Matrix:\n", cm)
|
||||||
|
print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class BaseConfig:
|
||||||
|
"""
|
||||||
|
Represents the base configuration for a dataset and its associated parameters.
|
||||||
|
|
||||||
|
This class serves as a foundational configuration handler to encapsulate
|
||||||
|
dataset-related parameters and options. It allows customization of dataset
|
||||||
|
behavior, including threshold values, size, cross-validation settings, and
|
||||||
|
whether to save derived configurations. It can also manage configurations
|
||||||
|
for balanced datasets if necessary.
|
||||||
|
"""
|
||||||
|
dataset_path: str = "names_featured.csv"
|
||||||
|
size: Optional[int] = None
|
||||||
|
threshold: float = 0.5
|
||||||
|
cv: Optional[int] = None
|
||||||
|
save: bool = False
|
||||||
|
balanced: bool = False
|
||||||
|
|
||||||
|
epochs: int = 10
|
||||||
|
test_size: float = 0.2
|
||||||
|
random_state: int = 42
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(description: str) -> BaseConfig:
|
||||||
|
"""
|
||||||
|
Parses command-line arguments and loads the configuration for the logistic regression model.
|
||||||
|
|
||||||
|
This function sets up an argument parser for various command-line options including
|
||||||
|
the dataset path, dataset size, dataset balancing, classification threshold,
|
||||||
|
cross-validation folds, and saving the model and its associated artifacts. Once parsed,
|
||||||
|
it transfers the configurations to a ``BaseConfig`` instance and returns it.
|
||||||
|
"""
|
||||||
|
parser = argparse.ArgumentParser(description)
|
||||||
|
|
||||||
|
parser.add_argument("--dataset", type=str, default="names_featured.csv", help="Path to the dataset file")
|
||||||
|
parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
|
||||||
|
parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
|
||||||
|
parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
|
||||||
|
parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
|
||||||
|
parser.add_argument("--save", action="store_true", help="Save the model and artifacts after training")
|
||||||
|
|
||||||
|
parser.add_argument("--epochs", type=int, default=10, help="Number of epochs for training")
|
||||||
|
parser.add_argument("--test_size", type=float, default=0.2, help="Proportion of the dataset to include in the test split")
|
||||||
|
parser.add_argument("--random_state", type=int, default=42, help="Random seed for reproducibility")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
return BaseConfig(
|
||||||
|
dataset_path=args.dataset,
|
||||||
|
size=args.size,
|
||||||
|
threshold=args.threshold,
|
||||||
|
cv=args.cv,
|
||||||
|
save=args.save,
|
||||||
|
balanced=args.balanced,
|
||||||
|
epochs=args.epochs,
|
||||||
|
test_size=args.test_size,
|
||||||
|
random_state=args.random_state
|
||||||
|
)
|
||||||
|
|||||||
+19
-113
@@ -1,8 +1,6 @@
|
|||||||
import argparse
|
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Tuple, Optional
|
from typing import Tuple
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
@@ -16,54 +14,20 @@ from sklearn.pipeline import make_pipeline, Pipeline
|
|||||||
from sklearn.preprocessing import LabelEncoder
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
|
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
|
||||||
|
from ners.gender.models import BaseConfig, load_config, logging
|
||||||
logging.basicConfig(level=logging.INFO, format=">> %(message)s")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Config:
|
class Config(BaseConfig):
|
||||||
dataset_path: str
|
|
||||||
size: Optional[int]
|
|
||||||
test_size: float = 0.2
|
|
||||||
ngram_range: Tuple[int, int] = (2, 5)
|
ngram_range: Tuple[int, int] = (2, 5)
|
||||||
max_iter: int = 1000
|
max_iter: int = 1000
|
||||||
random_state: int = 42
|
|
||||||
threshold: float = 0.5
|
|
||||||
cv: Optional[int] = None
|
|
||||||
save: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
def load_and_clean_data(cfg: Config) -> Tuple[pd.Series, pd.Series]:
|
|
||||||
"""
|
|
||||||
Load and clean dataset as specified by the provided configuration. This function reads
|
|
||||||
a CSV dataset from the path specified in the configuration, processes it to remove
|
|
||||||
missing values from key columns ('name' and 'sex'), and cleans string data in these
|
|
||||||
columns by converting them to lowercase and stripping whitespace. The cleaned data
|
|
||||||
is then returned as two separate pandas Series objects.
|
|
||||||
|
|
||||||
:param cfg: Configuration object specifying the dataset path and size
|
|
||||||
:type cfg: Config
|
|
||||||
:return: A tuple containing cleaned `name` and `sex` data as pandas Series objects
|
|
||||||
:rtype: Tuple[pd.Series, pd.Series]
|
|
||||||
"""
|
|
||||||
logging.info(f"Loading dataset from {cfg.dataset_path}")
|
|
||||||
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size))
|
|
||||||
df = df.dropna(subset=["name", "sex"])
|
|
||||||
df["name"] = df["name"].str.lower().str.strip()
|
|
||||||
df["sex"] = df["sex"].str.lower().str.strip()
|
|
||||||
return df["name"], df["sex"]
|
|
||||||
|
|
||||||
|
|
||||||
def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
|
def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
|
||||||
"""
|
"""
|
||||||
Encode the labels of a given pandas Series using a LabelEncoder. This process maps categorical
|
Encode the labels using a LabelEncoder. This function takes a pandas Series of labels,
|
||||||
labels to integers, which is particularly useful for machine learning models that require numerical
|
fits a LabelEncoder to the labels, and transforms them into a numerical format suitable
|
||||||
input data.
|
for model training. The transformed labels and the fitted encoder are returned.
|
||||||
|
|
||||||
:param y: A pandas Series of categorical labels to be encoded.
|
|
||||||
:type y: pd.Series
|
|
||||||
:return: A tuple containing the encoded labels as a pandas Series and the fitted LabelEncoder object.
|
|
||||||
:rtype: Tuple[pd.Series, LabelEncoder]
|
|
||||||
"""
|
"""
|
||||||
logging.info("Encoding labels")
|
logging.info("Encoding labels")
|
||||||
encoder = LabelEncoder()
|
encoder = LabelEncoder()
|
||||||
@@ -73,21 +37,11 @@ def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
|
|||||||
|
|
||||||
def build_model(cfg: Config) -> Pipeline:
|
def build_model(cfg: Config) -> Pipeline:
|
||||||
"""
|
"""
|
||||||
Builds a machine learning pipeline for text classification.
|
Build a logistic regression model pipeline with a character-level CountVectorizer.
|
||||||
|
The pipeline consists of a CountVectorizer that transforms the input text into
|
||||||
This function constructs and returns a scikit-learn pipeline that consists of
|
character n-grams, followed by a Logistic Regression classifier. The n-gram range
|
||||||
a `CountVectorizer` and a `LogisticRegression` classifier. The vectorizer
|
and maximum iterations for the logistic regression can be configured through the
|
||||||
leverages character-level n-grams based on the provided configuration, and the
|
provided configuration object.
|
||||||
logistic regression model is trained with a maximum number of iterations defined
|
|
||||||
in the configuration. This pipeline is used for processing text data and training
|
|
||||||
classification models.
|
|
||||||
|
|
||||||
:param cfg: Configuration object containing the n-gram range and the maximum
|
|
||||||
number of iterations for the logistic regression model.
|
|
||||||
:type cfg: Config
|
|
||||||
:return: A scikit-learn pipeline with a `CountVectorizer` and `LogisticRegression`
|
|
||||||
based on the provided configuration.
|
|
||||||
:rtype: Pipeline
|
|
||||||
"""
|
"""
|
||||||
return make_pipeline(
|
return make_pipeline(
|
||||||
CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range),
|
CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range),
|
||||||
@@ -95,7 +49,7 @@ def build_model(cfg: Config) -> Pipeline:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def evaluate_probabilities(y_true, y_proba, threshold: float, class_names):
|
def evaluate_proba(y_true, y_proba, threshold: float, class_names):
|
||||||
"""
|
"""
|
||||||
Evaluates the performance of a classification model using a specified threshold
|
Evaluates the performance of a classification model using a specified threshold
|
||||||
for predicted probabilities. Computes metrics such as accuracy, precision,
|
for predicted probabilities. Computes metrics such as accuracy, precision,
|
||||||
@@ -104,19 +58,6 @@ def evaluate_probabilities(y_true, y_proba, threshold: float, class_names):
|
|||||||
|
|
||||||
Logs the evaluation metrics at the specified threshold and prints the confusion
|
Logs the evaluation metrics at the specified threshold and prints the confusion
|
||||||
matrix and classification report.
|
matrix and classification report.
|
||||||
|
|
||||||
:param y_true: Ground truth (correct) labels.
|
|
||||||
:type y_true: array-like
|
|
||||||
:param y_proba: Predicted probabilities for each class, where each row
|
|
||||||
corresponds to an instance and contains probabilities for each target class.
|
|
||||||
:type y_proba: numpy.ndarray
|
|
||||||
:param threshold: The threshold on predicted probabilities to determine
|
|
||||||
class membership for each instance.
|
|
||||||
:type threshold: float
|
|
||||||
:param class_names: List of class names for the target variable used in the
|
|
||||||
classification report.
|
|
||||||
:type class_names: list of str
|
|
||||||
:return: None
|
|
||||||
"""
|
"""
|
||||||
logging.info(f"Evaluating at threshold = {threshold}")
|
logging.info(f"Evaluating at threshold = {threshold}")
|
||||||
y_pred = (y_proba[:, 1] >= threshold).astype(int)
|
y_pred = (y_proba[:, 1] >= threshold).astype(int)
|
||||||
@@ -135,16 +76,6 @@ def cross_validate(cfg: Config, X, y) -> None:
|
|||||||
Performs k-fold cross-validation on the provided dataset using the configuration and
|
Performs k-fold cross-validation on the provided dataset using the configuration and
|
||||||
logs the results including individual fold scores, mean accuracy, and the standard
|
logs the results including individual fold scores, mean accuracy, and the standard
|
||||||
deviation of the scores.
|
deviation of the scores.
|
||||||
|
|
||||||
:param cfg: Configuration object containing cross-validation settings such as the
|
|
||||||
number of folds to use in the cross-validation (`cv`).
|
|
||||||
:type cfg: Config
|
|
||||||
:param X: Input feature matrix for the dataset to be used for cross-validation.
|
|
||||||
:type X: Any
|
|
||||||
:param y: Target labels corresponding to the input feature matrix `X`.
|
|
||||||
:type y: Any
|
|
||||||
:return: This function does not return any value. Results are logged.
|
|
||||||
:rtype: None
|
|
||||||
"""
|
"""
|
||||||
logging.info(f"Running {cfg.cv}-fold cross-validation")
|
logging.info(f"Running {cfg.cv}-fold cross-validation")
|
||||||
pipeline = build_model(cfg)
|
pipeline = build_model(cfg)
|
||||||
@@ -153,21 +84,9 @@ def cross_validate(cfg: Config, X, y) -> None:
|
|||||||
logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")
|
logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")
|
||||||
|
|
||||||
|
|
||||||
def save_artifacts(model, encoder, cfg: Config):
|
def save_artifacts(model, encoder):
|
||||||
"""
|
"""
|
||||||
Saves machine learning model and label encoder artifacts to specified directories
|
Saves the trained model and label encoder artifacts to the specified directory.
|
||||||
within the gender models' directory. This function ensures that the model and encoder
|
|
||||||
are serialized and stored as pickle files. It uses the specified configuration settings
|
|
||||||
to locate the appropriate directory for storing the files.
|
|
||||||
|
|
||||||
:param model: The machine learning model object to be saved.
|
|
||||||
:type model: Any
|
|
||||||
:param encoder: The label encoder object used for data preprocessing.
|
|
||||||
:type encoder: Any
|
|
||||||
:param cfg: Configuration object containing application-specific settings regarding
|
|
||||||
paths and directories.
|
|
||||||
:type cfg: Config
|
|
||||||
:return: None
|
|
||||||
"""
|
"""
|
||||||
save_pickle(model, os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
|
save_pickle(model, os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
|
||||||
save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
|
save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
|
||||||
@@ -176,23 +95,10 @@ def save_artifacts(model, encoder, cfg: Config):
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Train a gender classifier on names")
|
cfg = Config(**vars(load_config("logistic regression model")))
|
||||||
parser.add_argument("--dataset", type=str, default="names.csv", help="Path to dataset")
|
|
||||||
parser.add_argument("--size", type=int, help="Number of rows to load")
|
|
||||||
parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for binary decision")
|
|
||||||
parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
|
|
||||||
parser.add_argument("--save", action="store_true", help="Save the model and encoder")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
cfg = Config(
|
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
|
||||||
dataset_path=args.dataset,
|
X_raw, y_raw = df["name"], df["sex"]
|
||||||
size=args.size,
|
|
||||||
threshold=args.threshold,
|
|
||||||
cv=args.cv,
|
|
||||||
save=args.save
|
|
||||||
)
|
|
||||||
|
|
||||||
X_raw, y_raw = load_and_clean_data(cfg)
|
|
||||||
y_encoded, encoder = encode_labels(y_raw)
|
y_encoded, encoder = encode_labels(y_raw)
|
||||||
|
|
||||||
if cfg.cv:
|
if cfg.cv:
|
||||||
@@ -207,10 +113,10 @@ def main():
|
|||||||
model.fit(X_train, y_train)
|
model.fit(X_train, y_train)
|
||||||
|
|
||||||
y_proba = model.predict_proba(X_test)
|
y_proba = model.predict_proba(X_test)
|
||||||
evaluate_probabilities(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
|
evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
|
||||||
|
|
||||||
if cfg.save:
|
if cfg.save:
|
||||||
save_artifacts(model, encoder, cfg)
|
save_artifacts(model, encoder)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
+23
-150
@@ -1,13 +1,11 @@
|
|||||||
import argparse
|
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Tuple, Optional
|
from typing import Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn.metrics import (
|
from sklearn.metrics import (
|
||||||
accuracy_score, classification_report, precision_recall_fscore_support, confusion_matrix
|
accuracy_score
|
||||||
)
|
)
|
||||||
from sklearn.model_selection import train_test_split, StratifiedKFold
|
from sklearn.model_selection import train_test_split, StratifiedKFold
|
||||||
from sklearn.preprocessing import LabelEncoder
|
from sklearn.preprocessing import LabelEncoder
|
||||||
@@ -18,82 +16,25 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|||||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||||
|
|
||||||
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
|
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
|
||||||
|
from ners.gender.models import load_config, BaseConfig, evaluate_proba, logging
|
||||||
logging.basicConfig(level=logging.INFO, format=">> %(message)s")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Config:
|
class Config(BaseConfig):
|
||||||
"""
|
|
||||||
Configuration for the machine learning model and its training process.
|
|
||||||
|
|
||||||
This class encapsulates the configuration options necessary for initializing,
|
|
||||||
training, and evaluating a machine learning model. It allows flexibility
|
|
||||||
in specifying dataset details, model parameters, training settings, and
|
|
||||||
options for evaluation. Attributes include paths, numerical parameters,
|
|
||||||
and flags that guide the model's behavior.
|
|
||||||
|
|
||||||
:ivar dataset_path: Path to the dataset file.
|
|
||||||
:type dataset_path: str
|
|
||||||
:ivar size: Optional size of the dataset to use. If None, use the full dataset.
|
|
||||||
:type size: Optional[int]
|
|
||||||
:ivar max_len: Maximum length of sequences used in the model.
|
|
||||||
:type max_len: int
|
|
||||||
:ivar embedding_dim: Dimensionality of the embedding layer.
|
|
||||||
:type embedding_dim: int
|
|
||||||
:ivar lstm_units: Number of LSTM units in the model.
|
|
||||||
:type lstm_units: int
|
|
||||||
:ivar batch_size: Batch size to use during training.
|
|
||||||
:type batch_size: int
|
|
||||||
:ivar epochs: Number of epochs for model training.
|
|
||||||
:type epochs: int
|
|
||||||
:ivar test_size: Fraction of data to use for testing.
|
|
||||||
:type test_size: float
|
|
||||||
:ivar random_state: Seed for random number generation to ensure reproducibility.
|
|
||||||
:type random_state: int
|
|
||||||
:ivar threshold: Decision threshold for binary classification tasks.
|
|
||||||
:type threshold: float
|
|
||||||
:ivar cv: Number of cross-validation folds. If None, no cross-validation is used.
|
|
||||||
:type cv: Optional[int]
|
|
||||||
:ivar save: Flag indicating whether to save the trained model.
|
|
||||||
:type save: bool
|
|
||||||
"""
|
|
||||||
dataset_path: str
|
|
||||||
size: Optional[int] = None
|
|
||||||
max_len: int = 6
|
max_len: int = 6
|
||||||
embedding_dim: int = 64
|
embedding_dim: int = 64
|
||||||
lstm_units: int = 32
|
lstm_units: int = 32
|
||||||
batch_size: int = 64
|
batch_size: int = 64
|
||||||
epochs: int = 10
|
|
||||||
test_size: float = 0.2
|
|
||||||
random_state: int = 42
|
|
||||||
threshold: float = 0.5
|
|
||||||
cv: Optional[int] = None
|
|
||||||
save: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
|
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
|
||||||
"""
|
"""
|
||||||
Load and preprocess the dataset based on the provided configuration.
|
Loads and preprocesses data for text classification by tokenizing text data, encoding labels, and padding sequences.
|
||||||
|
This function expects a dataset file path, prepares the tokenizer to process text input, and encodes labels for
|
||||||
This function performs a series of operations including loading the dataset
|
model training. The resulting outputs are ready for input into a machine learning pipeline.
|
||||||
from the specified path, cleaning and preprocessing data (e.g., converting
|
|
||||||
to lowercase, stripping whitespace, handling missing values), tokenizing names
|
|
||||||
using a tokenizer, and encoding the labels using a label encoder. The final processed
|
|
||||||
data and tools (tokenizer and label encoder) are returned for further use.
|
|
||||||
|
|
||||||
:param cfg: Config object containing dataset parameters such as dataset path, size, and
|
|
||||||
maximum sequence length.
|
|
||||||
:type cfg: Config
|
|
||||||
:return: A tuple containing processed padded sequences (numpy ndarray), corresponding
|
|
||||||
encoded labels (numpy ndarray), tokenizer object used for preprocessing names,
|
|
||||||
and label encoder object used for encoding labels.
|
|
||||||
:rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]
|
|
||||||
"""
|
"""
|
||||||
logging.info("Loading and preprocessing data")
|
logging.info("Loading and preprocessing data")
|
||||||
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"])
|
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
|
||||||
df["name"] = df["name"].str.lower().str.strip()
|
|
||||||
df["sex"] = df["sex"].str.lower().str.strip()
|
|
||||||
|
|
||||||
tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
|
tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
|
||||||
tokenizer.fit_on_texts(df["name"])
|
tokenizer.fit_on_texts(df["name"])
|
||||||
@@ -107,6 +48,12 @@ def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, La
|
|||||||
|
|
||||||
|
|
||||||
def build_model(cfg: Config, vocab_size: int) -> Sequential:
|
def build_model(cfg: Config, vocab_size: int) -> Sequential:
|
||||||
|
"""
|
||||||
|
Builds and compiles a Sequential LSTM-based model. The model consists of an
|
||||||
|
embedding layer, two bidirectional LSTM layers, a dense hidden layer with ReLU
|
||||||
|
activation, and an output layer with a softmax activation function. The model
|
||||||
|
is compiled using sparse categorical crossentropy loss and the Adam optimizer.
|
||||||
|
"""
|
||||||
logging.info("Building LSTM model")
|
logging.info("Building LSTM model")
|
||||||
model = Sequential([
|
model = Sequential([
|
||||||
Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim),
|
Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim),
|
||||||
@@ -119,60 +66,12 @@ def build_model(cfg: Config, vocab_size: int) -> Sequential:
|
|||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def evaluate_proba(y_true, y_proba, threshold, class_names):
|
|
||||||
"""
|
|
||||||
Evaluate the performance of a binary classification model by calculating key metrics and printing
|
|
||||||
a detailed classification report.
|
|
||||||
|
|
||||||
This function thresholds the predicted probabilities to produce binary predictions and calculates
|
|
||||||
metrics such as accuracy, precision, recall, and F1 score. It also generates a confusion matrix
|
|
||||||
and a classification report for the model's performance. Additionally, metrics are logged and
|
|
||||||
informational outputs are printed.
|
|
||||||
|
|
||||||
:param y_true: Ground truth binary labels. Must be a 1-dimensional array or list of integers.
|
|
||||||
:param y_proba: Predicted probabilities for each class from the model. It is a 2-dimensional array
|
|
||||||
where the second dimension represents class probabilities for each sample.
|
|
||||||
:param threshold: Threshold value for converting probabilities into binary predictions. Should be
|
|
||||||
a float between 0 and 1.
|
|
||||||
:param class_names: List of class names corresponding to the binary labels. Used for labeling the
|
|
||||||
classification report.
|
|
||||||
:return: None
|
|
||||||
"""
|
|
||||||
y_pred = (y_proba[:, 1] >= threshold).astype(int)
|
|
||||||
acc = accuracy_score(y_true, y_pred)
|
|
||||||
pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
|
|
||||||
cm = confusion_matrix(y_true, y_pred)
|
|
||||||
|
|
||||||
logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
|
|
||||||
print("Confusion Matrix:\n", cm)
|
|
||||||
print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
|
|
||||||
|
|
||||||
|
|
||||||
def cross_validate(cfg: Config, X, y, vocab_size: int):
|
def cross_validate(cfg: Config, X, y, vocab_size: int):
|
||||||
"""
|
"""
|
||||||
Performs k-fold cross-validation on a dataset using a specified model configuration.
|
Performs cross-validation on the given dataset using the specified model configuration.
|
||||||
|
The function uses StratifiedKFold cross-validator to split the dataset into training and
|
||||||
This function takes a dataset and corresponding labels, splits the dataset into
|
validation sets for each fold. For each fold, it trains the model, evaluates its accuracy
|
||||||
k folds (based on the `cv` attribute of the provided configuration object), and
|
on the validation data, and logs the fold-wise and overall results.
|
||||||
performs cross-validation using the specified deep learning model. The model is
|
|
||||||
built and trained on the training subset for each fold, and the validation subset
|
|
||||||
is used to compute accuracy scores. Finally, it logs the individual fold accuracies
|
|
||||||
and the overall mean accuracy with its standard deviation.
|
|
||||||
|
|
||||||
:param cfg: Configuration object containing the parameters for cross-validation,
|
|
||||||
model training, and other settings. `cv` specifies the number of folds,
|
|
||||||
and other attributes such as `epochs`, `batch_size`, and `random_state`
|
|
||||||
dictate the training and reproducibility behavior.
|
|
||||||
:type cfg: Config
|
|
||||||
:param X: Feature data for the dataset. Assumes the input is compatible with the
|
|
||||||
model configuration.
|
|
||||||
:param y: True labels corresponding to the dataset. The order should correspond
|
|
||||||
to the feature set `X`.
|
|
||||||
:param vocab_size: Total vocabulary size used for building the model. Determines
|
|
||||||
the structure of the model input.
|
|
||||||
:type vocab_size: int
|
|
||||||
:return: A list containing the accuracy scores for each fold.
|
|
||||||
:rtype: List[float]
|
|
||||||
"""
|
"""
|
||||||
logging.info(f"Running {cfg.cv}-fold cross-validation")
|
logging.info(f"Running {cfg.cv}-fold cross-validation")
|
||||||
skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
|
skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
|
||||||
@@ -195,23 +94,11 @@ def cross_validate(cfg: Config, X, y, vocab_size: int):
|
|||||||
|
|
||||||
def save_artifacts(model, tokenizer, encoder):
|
def save_artifacts(model, tokenizer, encoder):
|
||||||
"""
|
"""
|
||||||
Save the model, tokenizer, and label encoder artifacts to predefined file paths
|
Saves the given model, tokenizer, and encoder artifacts to a predefined directory.
|
||||||
within the GENDER_MODELS_DIR directory. The function ensures that the model is
|
|
||||||
saved in H5 format, while the tokenizer and encoder are serialized using the
|
|
||||||
Pickle module. It logs a message indicating the completion of the saving process.
|
|
||||||
|
|
||||||
:param model: The machine learning model object to be saved.
|
The function ensures that the specified directory for saving artifacts exists,
|
||||||
:type model: Any
|
then serializes the model, tokenizer, and encoder using appropriate formats. It
|
||||||
|
also logs the success of the operation to notify the user of the action taken.
|
||||||
:param tokenizer: The tokenizer object used in preprocessing, to be serialized
|
|
||||||
for future use.
|
|
||||||
:type tokenizer: Any
|
|
||||||
|
|
||||||
:param encoder: The label encoder object used for encoding labels during
|
|
||||||
training, to be serialized for future use.
|
|
||||||
:type encoder: Any
|
|
||||||
|
|
||||||
:return: None
|
|
||||||
"""
|
"""
|
||||||
os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
|
os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
|
||||||
model.save(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
|
model.save(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
|
||||||
@@ -223,21 +110,7 @@ def save_artifacts(model, tokenizer, encoder):
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Train BiLSTM model for name-based gender classification")
|
cfg = Config(**vars(load_config("Long Short-Term Memory (LSTM) model")))
|
||||||
parser.add_argument("--dataset", type=str, default="names.csv")
|
|
||||||
parser.add_argument("--size", type=int)
|
|
||||||
parser.add_argument("--threshold", type=float, default=0.5)
|
|
||||||
parser.add_argument("--cv", type=int)
|
|
||||||
parser.add_argument("--save", action="store_true")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
cfg = Config(
|
|
||||||
dataset_path=args.dataset,
|
|
||||||
size=args.size,
|
|
||||||
threshold=args.threshold,
|
|
||||||
cv=args.cv,
|
|
||||||
save=args.save
|
|
||||||
)
|
|
||||||
|
|
||||||
X, y, tokenizer, encoder = load_and_prepare(cfg)
|
X, y, tokenizer, encoder = load_and_prepare(cfg)
|
||||||
vocab_size = len(tokenizer.word_index) + 1
|
vocab_size = len(tokenizer.word_index) + 1
|
||||||
|
|||||||
@@ -1,15 +1,12 @@
|
|||||||
import argparse
|
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Tuple, Optional
|
from typing import Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from sklearn.metrics import (
|
from sklearn.metrics import (
|
||||||
accuracy_score, precision_recall_fscore_support,
|
accuracy_score
|
||||||
classification_report, confusion_matrix
|
|
||||||
)
|
)
|
||||||
from sklearn.model_selection import train_test_split, StratifiedKFold
|
from sklearn.model_selection import train_test_split, StratifiedKFold
|
||||||
from sklearn.preprocessing import LabelEncoder
|
from sklearn.preprocessing import LabelEncoder
|
||||||
@@ -23,56 +20,11 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|||||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||||
|
|
||||||
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
|
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
|
||||||
|
from ners.gender.models import BaseConfig, load_config, evaluate_proba, logging
|
||||||
logging.basicConfig(level=logging.INFO, format=">> %(message)s")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Config:
|
class Config(BaseConfig):
|
||||||
"""
|
|
||||||
Configuration data class used to store settings and parameters for a machine learning or deep
|
|
||||||
learning model.
|
|
||||||
|
|
||||||
This class allows the user to specify various parameters such as dataset path, size of input,
|
|
||||||
model architecture details like embedding dimensions, transformer configurations, training settings
|
|
||||||
like batch size and epochs, and validation and testing settings. The attributes provide flexibility
|
|
||||||
to customize model configuration and training processes.
|
|
||||||
|
|
||||||
:ivar dataset_path: The file path to the dataset.
|
|
||||||
:type dataset_path: str
|
|
||||||
:ivar size: Optional size parameter, can be used to specify sample size or custom
|
|
||||||
configuration based on the user's requirement.
|
|
||||||
:type size: Optional[int]
|
|
||||||
:ivar max_len: Maximum sequence length for input data, used often in text or sequence
|
|
||||||
processing.
|
|
||||||
:type max_len: int
|
|
||||||
:ivar embedding_dim: The dimensionality of embeddings used in the model.
|
|
||||||
:type embedding_dim: int
|
|
||||||
:ivar transformer_head_size: The size of each transformer attention head.
|
|
||||||
:type transformer_head_size: int
|
|
||||||
:ivar transformer_num_heads: The number of attention heads in the transformer model.
|
|
||||||
:type transformer_num_heads: int
|
|
||||||
:ivar transformer_ff_dim: The dimensionality of the feed-forward network in the transformer.
|
|
||||||
:type transformer_ff_dim: int
|
|
||||||
:ivar dropout: Dropout rate used for regularization during training.
|
|
||||||
:type dropout: float
|
|
||||||
:ivar batch_size: Batch size used for training and validation.
|
|
||||||
:type batch_size: int
|
|
||||||
:ivar epochs: Number of epochs for model training.
|
|
||||||
:type epochs: int
|
|
||||||
:ivar test_size: Proportion of the dataset to be used for testing.
|
|
||||||
:type test_size: float
|
|
||||||
:ivar random_state: Random seed value for reproducibility.
|
|
||||||
:type random_state: int
|
|
||||||
:ivar threshold: Threshold value for model predictions or classification.
|
|
||||||
:type threshold: float
|
|
||||||
:ivar cv: Cross-validation configuration, if applicable.
|
|
||||||
:type cv: Optional[int]
|
|
||||||
:ivar save: Boolean flag indicating whether to save the model after training.
|
|
||||||
:type save: bool
|
|
||||||
"""
|
|
||||||
dataset_path: str
|
|
||||||
size: Optional[int]
|
|
||||||
max_len: int = 6
|
max_len: int = 6
|
||||||
embedding_dim: int = 64
|
embedding_dim: int = 64
|
||||||
transformer_head_size: int = 64
|
transformer_head_size: int = 64
|
||||||
@@ -80,38 +32,21 @@ class Config:
|
|||||||
transformer_ff_dim: int = 128
|
transformer_ff_dim: int = 128
|
||||||
dropout: float = 0.1
|
dropout: float = 0.1
|
||||||
batch_size: int = 64
|
batch_size: int = 64
|
||||||
epochs: int = 10
|
|
||||||
test_size: float = 0.2
|
|
||||||
random_state: int = 42
|
|
||||||
threshold: float = 0.5
|
|
||||||
cv: Optional[int] = None
|
|
||||||
save: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
|
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
|
||||||
"""
|
"""
|
||||||
Load and preprocess data for model training or evaluation. This function handles the
|
Load and preprocess the dataset for training a Transformer model.
|
||||||
loading of a dataset in CSV format, applies preprocessing to clean and normalize
|
This function reads a CSV dataset, tokenizes the names, pads the sequences,
|
||||||
the input data, tokenizes text features, and encodes categorical labels.
|
and encodes the labels. It returns the padded sequences, encoded labels,
|
||||||
|
tokenizer, and label encoder.
|
||||||
The preprocessed data is prepared as padded sequences and encoded labels, which
|
|
||||||
can be directly used as inputs for machine learning models. Tokenizer and LabelEncoder
|
|
||||||
are returned to ensure consistency between training and inference stages.
|
|
||||||
|
|
||||||
:param cfg: Configuration object containing dataset path, size of the
|
|
||||||
dataset to load, and maximum length for padding sequences.
|
|
||||||
:type cfg: Config
|
|
||||||
:return: A tuple containing padded input sequences for the model, encoded labels,
|
|
||||||
the tokenizer used for text sequences, and the encoder used for labels.
|
|
||||||
:rtype: Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]
|
|
||||||
"""
|
"""
|
||||||
logging.info("Loading and preprocessing data")
|
logging.info("Loading and preprocessing data")
|
||||||
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size)).dropna(subset=["name", "sex"])
|
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
|
||||||
df["name"] = df["name"].str.lower().str.strip()
|
|
||||||
df["sex"] = df["sex"].str.lower().str.strip()
|
|
||||||
|
|
||||||
tokenizer = Tokenizer(oov_token="<OOV>")
|
tokenizer = Tokenizer(oov_token="<OOV>")
|
||||||
tokenizer.fit_on_texts(df["name"])
|
tokenizer.fit_on_texts(df["name"])
|
||||||
|
|
||||||
sequences = tokenizer.texts_to_sequences(df["name"])
|
sequences = tokenizer.texts_to_sequences(df["name"])
|
||||||
padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")
|
padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")
|
||||||
|
|
||||||
@@ -122,18 +57,8 @@ def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, La
|
|||||||
|
|
||||||
def transformer_encoder(x, cfg: Config):
|
def transformer_encoder(x, cfg: Config):
|
||||||
"""
|
"""
|
||||||
Transforms input tensor using a single Transformer encoder block with attention and feedforward
|
Transformer encoder block that applies multi-head attention and feed-forward
|
||||||
layers. The encoder applies multi-head attention to the input tensor, adds the output to
|
neural network layers with residual connections and layer normalization.
|
||||||
the original tensor for residual connection, and normalizes it. Subsequently, the processed
|
|
||||||
tensor passes through a feedforward network with added dropout and normalization.
|
|
||||||
|
|
||||||
:param x: Input tensor to be transformed.
|
|
||||||
:type x: TensorFlow tensor
|
|
||||||
:param cfg: Configuration object containing Transformer hyperparameters such as the number of
|
|
||||||
attention heads, head size, feedforward dimension, and dropout rate.
|
|
||||||
:type cfg: Config
|
|
||||||
:return: Transformed tensor resulting from applying the Transformer encoder block.
|
|
||||||
:rtype: TensorFlow tensor
|
|
||||||
"""
|
"""
|
||||||
attn = MultiHeadAttention(num_heads=cfg.transformer_num_heads, key_dim=cfg.transformer_head_size)(x, x)
|
attn = MultiHeadAttention(num_heads=cfg.transformer_num_heads, key_dim=cfg.transformer_head_size)(x, x)
|
||||||
x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attn))
|
x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attn))
|
||||||
@@ -145,18 +70,10 @@ def transformer_encoder(x, cfg: Config):
|
|||||||
|
|
||||||
def build_model(cfg: Config, vocab_size: int) -> Model:
|
def build_model(cfg: Config, vocab_size: int) -> Model:
|
||||||
"""
|
"""
|
||||||
Builds a Transformer-based model using Keras/TensorFlow components. The model
|
Builds a Transformer-based model aimed at sequence processing tasks.
|
||||||
is designed for classification tasks, utilizing embedding layers with positional
|
The model includes an embedding layer integrating positional encodings
|
||||||
encoding, a Transformer encoder block, and fully connected layers for
|
and a Transformer encoder, followed by a global pooling layer,
|
||||||
output generation.
|
a dense hidden layer, and a softmax output layer.
|
||||||
|
|
||||||
:param cfg: Configuration object containing model-specific hyperparameters
|
|
||||||
such as maximum sequence length, embedding dimensions, etc.
|
|
||||||
:type cfg: Config
|
|
||||||
:param vocab_size: The size of the vocabulary for the embedding layer.
|
|
||||||
:type vocab_size: int
|
|
||||||
:return: A compiled Keras model, ready for training and evaluation.
|
|
||||||
:rtype: Model
|
|
||||||
"""
|
"""
|
||||||
logging.info("Building Transformer model")
|
logging.info("Building Transformer model")
|
||||||
inputs = Input(shape=(cfg.max_len,))
|
inputs = Input(shape=(cfg.max_len,))
|
||||||
@@ -177,54 +94,11 @@ def build_model(cfg: Config, vocab_size: int) -> Model:
|
|||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def evaluate_proba(y_true, y_proba, threshold, class_names):
|
|
||||||
"""
|
|
||||||
Evaluates the performance of a binary classification model by calculating accuracy,
|
|
||||||
precision, recall, F1 score, confusion matrix, and generates a classification
|
|
||||||
report. This function takes the true labels, predicted probabilities, a decision
|
|
||||||
threshold, and class names to assist in the evaluation.
|
|
||||||
|
|
||||||
:param y_true: Ground truth (correct) target values.
|
|
||||||
:type y_true: array-like of shape (n_samples,)
|
|
||||||
:param y_proba: Predicted probabilities for each class. Expected to be an array
|
|
||||||
where the second column corresponds to the probability of the positive class.
|
|
||||||
:type y_proba: array-like of shape (n_samples, 2)
|
|
||||||
:param threshold: Decision threshold for classifying a sample as positive
|
|
||||||
or negative based on predicted probabilities.
|
|
||||||
:type threshold: float
|
|
||||||
:param class_names: List of class names for labeling the classification report.
|
|
||||||
:type class_names: list of str
|
|
||||||
:return: None. Outputs performance metrics and confusion matrix to the logging
|
|
||||||
system and the console.
|
|
||||||
"""
|
|
||||||
y_pred = (y_proba[:, 1] >= threshold).astype(int)
|
|
||||||
acc = accuracy_score(y_true, y_pred)
|
|
||||||
pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
|
|
||||||
cm = confusion_matrix(y_true, y_pred)
|
|
||||||
|
|
||||||
logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
|
|
||||||
print("Confusion Matrix:\n", cm)
|
|
||||||
print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
|
|
||||||
|
|
||||||
|
|
||||||
def cross_validate(cfg: Config, X, y, vocab_size: int):
|
def cross_validate(cfg: Config, X, y, vocab_size: int):
|
||||||
"""
|
"""
|
||||||
Evaluate the performance of a model using K-fold cross-validation. This function takes
|
Performs cross-validation using the given configuration, dataset, and specified vocabulary size. This function
|
||||||
configuration settings, input data, target labels, and vocabulary size to perform the
|
splits the dataset into stratified folds, trains a model on each fold, and evaluates its performance on validation
|
||||||
specified number of cross-validation folds with a stratified approach. For each fold,
|
data. The overall mean and standard deviation of accuracies across all folds are logged.
|
||||||
it builds a new model, trains it, predicts the validation set, and calculates accuracy.
|
|
||||||
|
|
||||||
:param cfg: The configuration object containing hyperparameters and settings for
|
|
||||||
cross-validation, random state, and training.
|
|
||||||
:type cfg: Config
|
|
||||||
:param X: The input data samples provided as a dataset.
|
|
||||||
:type X: numpy.ndarray
|
|
||||||
:param y: The target labels corresponding to the input data samples.
|
|
||||||
:type y: numpy.ndarray
|
|
||||||
:param vocab_size: The size of the vocabulary, used to configure the language model.
|
|
||||||
:type vocab_size: int
|
|
||||||
:return: A list containing accuracy scores from each fold in the cross-validation process.
|
|
||||||
:rtype: list
|
|
||||||
"""
|
"""
|
||||||
logging.info(f"Running {cfg.cv}-fold cross-validation")
|
logging.info(f"Running {cfg.cv}-fold cross-validation")
|
||||||
skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
|
skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
|
||||||
@@ -247,14 +121,11 @@ def cross_validate(cfg: Config, X, y, vocab_size: int):
|
|||||||
|
|
||||||
def save_artifacts(model, tokenizer, encoder):
|
def save_artifacts(model, tokenizer, encoder):
|
||||||
"""
|
"""
|
||||||
Saves the machine learning model and its associated artifacts such as tokenizer and
|
Saves the model and associated artifacts to the designated directory. The model
|
||||||
label encoder to predefined file paths. This function ensures that the model and
|
is serialized and saved in a `.keras` file, while the tokenizer and label
|
||||||
artifacts can be reloaded later for inference or further use.
|
encoder are serialized into `.pkl` files. If the directory does not exist, it
|
||||||
|
is created automatically. This function also logs the completion of the
|
||||||
:param model: The machine learning model to be saved.
|
operation.
|
||||||
:param tokenizer: The tokenizer used for preparing data for the model.
|
|
||||||
:param encoder: The label encoder used for encoding target labels.
|
|
||||||
:return: None
|
|
||||||
"""
|
"""
|
||||||
os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
|
os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
|
||||||
model.save(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
|
model.save(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
|
||||||
@@ -266,21 +137,7 @@ def save_artifacts(model, tokenizer, encoder):
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Train Transformer model for name-based gender classification")
|
cfg = Config(**vars(load_config("Transformer model")))
|
||||||
parser.add_argument("--dataset", type=str, default="names.csv")
|
|
||||||
parser.add_argument("--size", type=int)
|
|
||||||
parser.add_argument("--threshold", type=float, default=0.5)
|
|
||||||
parser.add_argument("--cv", type=int)
|
|
||||||
parser.add_argument("--save", action="store_true")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
cfg = Config(
|
|
||||||
dataset_path=args.dataset,
|
|
||||||
size=args.size,
|
|
||||||
threshold=args.threshold,
|
|
||||||
cv=args.cv,
|
|
||||||
save=args.save
|
|
||||||
)
|
|
||||||
|
|
||||||
X, y, tokenizer, encoder = load_and_prepare(cfg)
|
X, y, tokenizer, encoder = load_and_prepare(cfg)
|
||||||
vocab_size = len(tokenizer.word_index) + 1
|
vocab_size = len(tokenizer.word_index) + 1
|
||||||
|
|||||||
+11
-54
@@ -17,19 +17,6 @@ def predict_logreg(names: List[str], threshold: float):
|
|||||||
The function takes in a list of names and predicts the gender labels
|
The function takes in a list of names and predicts the gender labels
|
||||||
based on a logistic regression model. A probabilistic threshold is used
|
based on a logistic regression model. A probabilistic threshold is used
|
||||||
to classify the names into one of the defined labels.
|
to classify the names into one of the defined labels.
|
||||||
|
|
||||||
:param names:
|
|
||||||
A list of names for which the gender needs to be predicted. Each
|
|
||||||
name must be a string.
|
|
||||||
:param threshold:
|
|
||||||
A float value representing the threshold for classification. Names
|
|
||||||
with predicted probabilities greater than or equal to this value
|
|
||||||
will be classified into the positive class.
|
|
||||||
:return:
|
|
||||||
A tuple containing the predicted gender labels and their
|
|
||||||
corresponding probabilities. The first element of the tuple is a
|
|
||||||
list of predicted labels, while the second element is an array of
|
|
||||||
probability scores for each label.
|
|
||||||
"""
|
"""
|
||||||
model_path = os.path.join(GENDER_MODELS_DIR, "regression_model.pkl")
|
model_path = os.path.join(GENDER_MODELS_DIR, "regression_model.pkl")
|
||||||
encoder_path = os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl")
|
encoder_path = os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl")
|
||||||
@@ -51,19 +38,6 @@ def predict_lstm(names: List[str], threshold: float, max_len=6):
|
|||||||
The function loads the model, tokenizer, and label encoder, performs preprocessing on the input
|
The function loads the model, tokenizer, and label encoder, performs preprocessing on the input
|
||||||
names, and then uses the loaded model to predict gender probabilities. Based on the threshold
|
names, and then uses the loaded model to predict gender probabilities. Based on the threshold
|
||||||
value, it determines the predicted gender labels.
|
value, it determines the predicted gender labels.
|
||||||
|
|
||||||
:param names: List of names to be classified.
|
|
||||||
:type names: List[str]
|
|
||||||
:param threshold: Probability threshold for classifying gender. If the predicted probability for the
|
|
||||||
'positive' class is greater than or equal to this threshold, it is classified accordingly.
|
|
||||||
:type threshold: float
|
|
||||||
:param max_len: Maximum length for name sequences. Names longer than this will be truncated, and shorter
|
|
||||||
ones will be padded. Default value is 6.
|
|
||||||
:type max_len: int, optional
|
|
||||||
|
|
||||||
:return: A tuple containing predicted labels and associated probabilities. Labels are the predicted gender
|
|
||||||
categories, and probabilities are the prediction scores for each input name.
|
|
||||||
:rtype: Tuple[numpy.ndarray, numpy.ndarray]
|
|
||||||
"""
|
"""
|
||||||
model_path = os.path.join(GENDER_MODELS_DIR, "lstm_model.keras")
|
model_path = os.path.join(GENDER_MODELS_DIR, "lstm_model.keras")
|
||||||
tokenizer_path = os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl")
|
tokenizer_path = os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl")
|
||||||
@@ -89,20 +63,6 @@ def predict_transformer(names: List[str], threshold: float, max_len=6):
|
|||||||
encoder, converts input names into tokenized sequences, and processes them to generate
|
encoder, converts input names into tokenized sequences, and processes them to generate
|
||||||
gender predictions. The function returns the predicted labels and the associated
|
gender predictions. The function returns the predicted labels and the associated
|
||||||
probabilities for each sample.
|
probabilities for each sample.
|
||||||
|
|
||||||
:param names: List of names to predict gender labels for.
|
|
||||||
:type names: List[str]
|
|
||||||
:param threshold: Threshold value to determine the prediction class. Probability values
|
|
||||||
above or equal to the threshold will be assigned to one class, and those below to
|
|
||||||
another.
|
|
||||||
:type threshold: float
|
|
||||||
:param max_len: Maximum length for the sequences. Names will be truncated or padded to
|
|
||||||
this length during processing, default is 6.
|
|
||||||
:type max_len: int, optional
|
|
||||||
:return: A tuple containing two elements: a list of predicted gender labels as strings
|
|
||||||
and a NumPy array of probabilities for each gender class (where the first index
|
|
||||||
corresponds to one class, and the second index corresponds to another).
|
|
||||||
:rtype: Tuple[List[str], numpy.ndarray]
|
|
||||||
"""
|
"""
|
||||||
model_path = os.path.join(GENDER_MODELS_DIR, "transformer.keras")
|
model_path = os.path.join(GENDER_MODELS_DIR, "transformer.keras")
|
||||||
tokenizer_path = os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl")
|
tokenizer_path = os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl")
|
||||||
@@ -123,24 +83,21 @@ def predict_transformer(names: List[str], threshold: float, max_len=6):
|
|||||||
def main():
    """Command-line entry point: predict gender for one or more names.

    Parses ``--model``, ``--names`` and ``--threshold``, dispatches to the
    matching ``predict_*`` function and prints one result line per name.
    """
    parser = argparse.ArgumentParser(description="Predict gender from names using trained model")
    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
    parser.add_argument("--names", nargs="+", required=True, help="One or more names")
    parser.add_argument("--threshold", type=float, default=0.5, help="Threshold for classification")
    args = parser.parse_args()

    # Dispatch table instead of an if/elif chain. argparse `choices` already
    # guarantees args.model is one of these keys, so no KeyError guard is
    # needed here — wrapping the *call* in try/except KeyError (as before)
    # would also mask KeyErrors raised inside the predict functions and
    # misreport them as "Unsupported model type".
    model_funcs = {
        "logreg": predict_logreg,
        "lstm": predict_lstm,
        "transformer": predict_transformer,
    }
    labels, proba = model_funcs[args.model](args.names, args.threshold)

    for i, name in enumerate(args.names):
        # Each probability row is printed as [index 0 -> P(f), index 1 -> P(m)].
        p_female = proba[i][0]
        p_male = proba[i][1]
        print(f"{name} → {labels[i]} | P(f): {p_female:.2f} | P(m): {p_male:.2f}")
|
||||||
|
|||||||
@@ -0,0 +1,52 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from misc import DATA_DIR
|
||||||
|
|
||||||
|
|
||||||
|
def clean(filepath):
    """Load a CSV of names, scrub encoding artifacts and rewrite it as UTF-8.

    Tries a sequence of common encodings until one parses. String columns are
    cleaned of null bytes and non-breaking spaces, whitespace runs are
    collapsed, and rows missing 'name', 'sex' or 'region' are dropped. The
    cleaned frame is written back to *filepath* (UTF-8) and returned.

    :param filepath: Path to the CSV file to clean in place.
    :return: The cleaned pandas DataFrame.
    :raises ValueError: If the file cannot be read with any known encoding.
    """
    encodings = ['utf-8', 'utf-16', 'latin1']
    for enc in encodings:
        try:
            print(f">> Trying to read {filepath} with encoding: {enc}")
            df = pd.read_csv(filepath, encoding=enc, on_bad_lines='skip')

            print(">> Remove null bytes and non-breaking spaces from all string columns")
            for col in df.select_dtypes(include=['object']).columns:
                df[col] = df[col].astype(str).str.replace('\x00', ' ', regex=False)
                df[col] = df[col].str.replace('\u00a0', ' ', regex=False)
                df[col] = df[col].str.replace(' +', ' ', regex=True)

            print(f">> Successfully read with encoding: {enc}")
            df = df.dropna(subset=['name', 'sex', 'region'])
            df.to_csv(filepath, index=False, encoding='utf-8')
            return df
        except (UnicodeError, pd.errors.ParserError):
            # Wrong encoding for this file (or unparseable under it) — try the
            # next candidate. The original caught bare Exception, which also
            # swallowed unrelated errors (e.g. a missing column in dropna) and
            # misreported them as decode failures.
            continue
    # BUG FIX: the original did `raise UnicodeDecodeError(f"...")`, but that
    # constructor requires five positional arguments, so the raise itself
    # crashed with a TypeError. Raise ValueError with the message instead.
    raise ValueError(f"Unable to decode {filepath} with common encodings.")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Build the featured names dataset and per-sex splits from names.csv.

    Cleans the raw CSV, lowercases and strips names, derives word-count,
    length, probable-native and probable-surname features, then writes
    names_featured.csv plus male/female subsets into DATA_DIR.
    """
    frame = clean(os.path.join(DATA_DIR, 'names.csv'))

    frame['name'] = frame['name'].str.strip().str.lower()
    # Tokenize once and reuse for the word-based features below.
    tokens = frame['name'].str.split()
    frame['words'] = tokens.apply(len)
    frame['length'] = frame['name'].str.replace(' ', '', regex=False).str.len()
    # All tokens but the last are treated as the native name; the last token
    # as the probable surname. Single-word names get an empty native part.
    frame['probable_native'] = tokens.apply(lambda parts: ' '.join(parts[:-1]) if len(parts) > 1 else '')
    frame['probable_surname'] = tokens.apply(lambda parts: parts[-1] if parts else '')

    print(f">> Arranging columns")
    # Move 'sex' (the label) to the last column.
    reordered = [col for col in frame.columns if col != 'sex']
    reordered.append('sex')
    frame = frame[reordered]

    print(f">> Saving featured dataset")
    frame.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)

    print(f">> Splitting dataset by sex")
    sex_lower = frame['sex'].str.lower()
    frame[sex_lower == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
    frame[sex_lower == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)


if __name__ == '__main__':
    main()
|
||||||
Reference in New Issue
Block a user