refactor: include province and annotation pipeline

This commit is contained in:
2025-07-24 12:50:30 +02:00
parent da7b09dab3
commit e2536c1899
18 changed files with 402 additions and 355 deletions
+80
View File
@@ -0,0 +1,80 @@
import argparse
from dataclasses import dataclass
from typing import Optional
from sklearn.metrics import (
accuracy_score, precision_recall_fscore_support,
classification_report, confusion_matrix
)
from misc import logging
def evaluate_proba(y_true, y_proba, threshold, class_names):
    """Threshold positive-class probabilities and report binary metrics.

    Logs accuracy/precision/recall/F1 at the given threshold, then prints
    the confusion matrix and the per-class classification report.
    """
    predictions = (y_proba[:, 1] >= threshold).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, predictions, average="binary"
    )
    accuracy = accuracy_score(y_true, predictions)
    logging.info(f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_true, predictions))
    print("\nClassification Report:\n", classification_report(y_true, predictions, target_names=class_names))
@dataclass
class BaseConfig:
    """
    Base configuration shared by the gender-model training scripts.

    Each field maps one-to-one onto a command-line option parsed by
    ``load_config`` below; model-specific scripts subclass this to add
    their own hyperparameters.
    """
    # Path to the CSV dataset (passed to misc.load_csv_dataset).
    dataset_path: str = "names_featured.csv"
    # Optional cap on the number of rows loaded; None loads everything.
    size: Optional[int] = None
    # Probability cutoff for classifying the positive class.
    threshold: float = 0.5
    # Number of cross-validation folds; None disables CV mode.
    cv: Optional[int] = None
    # Whether to persist the trained model and artifacts after training.
    save: bool = False
    # Whether to load the class-balanced variant of the dataset.
    balanced: bool = False
    # Training epochs (used by the neural models).
    epochs: int = 10
    # Fraction of the dataset held out for the test split.
    test_size: float = 0.2
    # Seed for reproducible splits and shuffling.
    random_state: int = 42
def load_config(description: str, *, argv=None) -> BaseConfig:
    """
    Parse command-line arguments into a ``BaseConfig``.

    Args:
        description: Help text shown by ``--help`` for the calling script.
        argv: Optional explicit argument list; defaults to ``sys.argv[1:]``.
            Passing a list makes the function testable without touching
            the process arguments (backward-compatible addition).

    Returns:
        A ``BaseConfig`` populated from the parsed options.
    """
    # BUG FIX: ArgumentParser's first positional parameter is `prog`, not
    # `description` — the original call set the program name instead of the
    # help description. Pass it by keyword.
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("--dataset", type=str, default="names_featured.csv", help="Path to the dataset file")
    parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
    parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
    parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
    parser.add_argument("--save", action="store_true", help="Save the model and artifacts after training")
    parser.add_argument("--epochs", type=int, default=10, help="Number of epochs for training")
    parser.add_argument("--test_size", type=float, default=0.2, help="Proportion of the dataset to include in the test split")
    parser.add_argument("--random_state", type=int, default=42, help="Random seed for reproducibility")
    # parse_args(None) falls back to sys.argv[1:], preserving old behavior.
    args = parser.parse_args(argv)
    return BaseConfig(
        dataset_path=args.dataset,
        size=args.size,
        threshold=args.threshold,
        cv=args.cv,
        save=args.save,
        balanced=args.balanced,
        epochs=args.epochs,
        test_size=args.test_size,
        random_state=args.random_state
    )
+123
View File
@@ -0,0 +1,123 @@
import os
from dataclasses import dataclass
from typing import Tuple
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
accuracy_score, classification_report, confusion_matrix,
precision_recall_fscore_support
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import BaseConfig, load_config, logging
@dataclass
class Config(BaseConfig):
    # Hyperparameters specific to the logistic-regression model.
    # Character n-gram range (min_n, max_n) for the CountVectorizer.
    ngram_range: Tuple[int, int] = (2, 5)
    # Iteration cap for LogisticRegression's solver.
    max_iter: int = 1000
def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
    """Fit a ``LabelEncoder`` on *y* and return the numeric labels together
    with the fitted encoder (needed later to recover class names)."""
    logging.info("Encoding labels")
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(y), label_encoder
def build_model(cfg: Config) -> Pipeline:
    """Create the classification pipeline: a character-level CountVectorizer
    feeding a LogisticRegression classifier.

    The n-gram range and solver iteration cap are taken from *cfg*.
    """
    vectorizer = CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range)
    classifier = LogisticRegression(max_iter=cfg.max_iter)
    return make_pipeline(vectorizer, classifier)
def evaluate_proba(y_true, y_proba, threshold: float, class_names):
    """Apply *threshold* to positive-class probabilities and report metrics.

    Logs accuracy, precision, recall and F1, then prints the confusion
    matrix and the full per-class classification report.
    """
    logging.info(f"Evaluating at threshold = {threshold}")
    predictions = (y_proba[:, 1] >= threshold).astype(int)
    accuracy = accuracy_score(y_true, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, predictions, average='binary'
    )
    logging.info(f"Accuracy: {accuracy:.4f}")
    logging.info(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_true, predictions))
    print("\nClassification Report:\n", classification_report(y_true, predictions, target_names=class_names))
def cross_validate(cfg: Config, X, y) -> None:
    """Run stratified k-fold cross-validation with a fresh pipeline and log
    the per-fold scores plus mean/std accuracy."""
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    splitter = StratifiedKFold(n_splits=cfg.cv)
    scores = cross_val_score(build_model(cfg), X, y, cv=splitter, scoring="accuracy")
    logging.info(f"Cross-validation scores: {scores}")
    logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")
def save_artifacts(model, encoder):
    """
    Persist the trained pipeline and label encoder under GENDER_MODELS_DIR.
    """
    # CONSISTENCY FIX: the LSTM/Transformer save_artifacts helpers create
    # the directory first; this one previously assumed it already existed
    # and would crash on a fresh checkout.
    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    save_pickle(model, os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
def main():
    """CLI entry point: train and evaluate the logistic-regression model,
    or run cross-validation when ``--cv`` is given."""
    cfg = Config(**vars(load_config("logistic regression model")))
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
    names, raw_labels = df["name"], df["sex"]
    labels, encoder = encode_labels(raw_labels)
    # Cross-validation mode replaces the single train/test evaluation.
    if cfg.cv:
        cross_validate(cfg, names, labels)
        return
    X_train, X_test, y_train, y_test = train_test_split(
        names,
        labels,
        test_size=cfg.test_size,
        random_state=cfg.random_state,
        stratify=labels,
    )
    model = build_model(cfg)
    model.fit(X_train, y_train)
    evaluate_proba(y_test, model.predict_proba(X_test), cfg.threshold, class_names=encoder.classes_)
    if cfg.save:
        save_artifacts(model, encoder)
if __name__ == "__main__":
    main()
+144
View File
@@ -0,0 +1,144 @@
import os
from dataclasses import dataclass
from typing import Tuple
import numpy as np
import pandas as pd
from sklearn.metrics import (
accuracy_score
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ProgbarLogger
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import load_config, BaseConfig, evaluate_proba, logging
@dataclass
class Config(BaseConfig):
    # Hyperparameters specific to the LSTM model.
    # Maximum padded sequence length fed to the network.
    max_len: int = 6
    # Dimensionality of the token embedding vectors.
    embedding_dim: int = 64
    # Hidden units per LSTM direction.
    lstm_units: int = 32
    # Mini-batch size for training.
    batch_size: int = 64
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Load the CSV dataset and prepare model inputs.

    Tokenizes the ``name`` column, pads the sequences to ``cfg.max_len``,
    and label-encodes the ``sex`` column.

    Returns:
        (padded sequences, encoded labels, fitted tokenizer, fitted encoder)
    """
    logging.info("Loading and preprocessing data")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
    # NOTE(review): char_level=False tokenizes whole words, so a single name
    # typically becomes one token while max_len=6 pads the rest with zeros;
    # the regression model uses character n-grams — confirm word-level
    # tokenization is intended here rather than char_level=True.
    tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])
    sequences = tokenizer.texts_to_sequences(df["name"])
    padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df["sex"])
    return padded, labels, tokenizer, label_encoder
def build_model(cfg: Config, vocab_size: int) -> Sequential:
    """Build and compile the bidirectional-LSTM classifier.

    Architecture: Embedding -> BiLSTM (sequences) -> BiLSTM -> Dense(relu)
    -> Dense(2, softmax); compiled with sparse categorical crossentropy and
    the Adam optimizer.
    """
    logging.info("Building LSTM model")
    network = Sequential()
    network.add(Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim))
    network.add(Bidirectional(LSTM(cfg.lstm_units, return_sequences=True)))
    network.add(Bidirectional(LSTM(cfg.lstm_units)))
    network.add(Dense(64, activation="relu"))
    network.add(Dense(2, activation="softmax"))
    network.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return network
def cross_validate(cfg: Config, X, y, vocab_size: int):
    """Stratified k-fold cross-validation: train a fresh model per fold,
    score validation accuracy, and log fold-wise plus summary results."""
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    splitter = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
    fold_scores = []
    for fold, (train_idx, val_idx) in enumerate(splitter.split(X, y), start=1):
        logging.info(f"Fold {fold}")
        model = build_model(cfg, vocab_size)
        model.fit(
            X[train_idx],
            y[train_idx],
            epochs=cfg.epochs,
            batch_size=cfg.batch_size,
            verbose=0,
        )
        fold_acc = accuracy_score(y[val_idx], model.predict(X[val_idx]).argmax(axis=1))
        fold_scores.append(fold_acc)
        logging.info(f"Fold {fold} Accuracy: {fold_acc:.4f}")
    logging.info(f"Mean accuracy: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
def save_artifacts(model, tokenizer, encoder):
    """Persist the LSTM model (.keras) plus its tokenizer and label-encoder
    pickles under GENDER_MODELS_DIR, creating the directory if needed."""
    def _path(filename: str) -> str:
        # Resolve an artifact name inside the models directory.
        return os.path.join(GENDER_MODELS_DIR, filename)

    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    model.save(_path("lstm_model.keras"))
    save_pickle(tokenizer, _path("lstm_tokenizer.pkl"))
    save_pickle(encoder, _path("lstm_label_encoder.pkl"))
    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
def main():
    """CLI entry point: train and evaluate the LSTM model, or run
    cross-validation when ``--cv`` is given."""
    cfg = Config(**vars(load_config("Long Short-Term Memory (LSTM) model")))
    X, y, tokenizer, encoder = load_and_prepare(cfg)
    vocab_size = len(tokenizer.word_index) + 1
    # Cross-validation mode replaces the single train/test evaluation.
    if cfg.cv:
        cross_validate(cfg, X, y, vocab_size)
        return
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=cfg.test_size,
        random_state=cfg.random_state,
        stratify=y,
    )
    model = build_model(cfg, vocab_size)
    model.summary()
    logging.info("Training model")
    model.fit(
        X_train,
        y_train,
        validation_split=0.1,
        epochs=cfg.epochs,
        batch_size=cfg.batch_size,
        callbacks=[ProgbarLogger()],
    )
    evaluate_proba(y_test, model.predict(X_test), cfg.threshold, class_names=encoder.classes_)
    if cfg.save:
        save_artifacts(model, tokenizer, encoder)
if __name__ == "__main__":
    main()
+173
View File
@@ -0,0 +1,173 @@
import os
from dataclasses import dataclass
from typing import Tuple
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import (
accuracy_score
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ProgbarLogger
from tensorflow.keras.layers import (
Input, Embedding, Dense, GlobalAveragePooling1D,
MultiHeadAttention, Dropout, LayerNormalization
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import BaseConfig, load_config, evaluate_proba, logging
@dataclass
class Config(BaseConfig):
    # Hyperparameters specific to the Transformer model.
    # Maximum padded sequence length fed to the network.
    max_len: int = 6
    # Dimensionality of the token/position embedding vectors.
    embedding_dim: int = 64
    # key_dim of each attention head.
    transformer_head_size: int = 64
    # Number of attention heads in the encoder block.
    transformer_num_heads: int = 2
    # Hidden width of the feed-forward sublayer.
    transformer_ff_dim: int = 128
    # Dropout rate applied after attention and feed-forward sublayers.
    dropout: float = 0.1
    # Mini-batch size for training.
    batch_size: int = 64
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Load the CSV dataset and prepare Transformer model inputs.

    Tokenizes the ``name`` column, pads the sequences to ``cfg.max_len``,
    and label-encodes the ``sex`` column.

    Returns:
        (padded sequences, encoded labels, fitted tokenizer, fitted encoder)
    """
    logging.info("Loading and preprocessing data")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
    # NOTE(review): Tokenizer defaults to word-level (char_level=False), so a
    # single name usually becomes one token while max_len=6 pads the rest —
    # confirm char-level tokenization wasn't intended for name data.
    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])
    sequences = tokenizer.texts_to_sequences(df["name"])
    padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")
    encoder = LabelEncoder()
    labels = encoder.fit_transform(df["sex"])
    return padded, labels, tokenizer, encoder
def transformer_encoder(x, cfg: Config):
    """Single Transformer encoder block: multi-head self-attention followed
    by a two-layer feed-forward network, each sublayer wrapped with dropout,
    a residual connection, and LayerNormalization."""
    attention_out = MultiHeadAttention(
        num_heads=cfg.transformer_num_heads,
        key_dim=cfg.transformer_head_size,
    )(x, x)
    x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attention_out))
    hidden = Dense(cfg.transformer_ff_dim, activation="relu")(x)
    projected = Dense(x.shape[-1])(hidden)
    return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(projected))
def build_model(cfg: Config, vocab_size: int) -> Model:
    """Assemble and compile the Transformer classifier.

    Token embeddings are summed with learned positional embeddings, passed
    through one Transformer encoder block, globally average-pooled, and
    classified by a small dense head ending in a 2-way softmax.
    """
    logging.info("Building Transformer model")
    inputs = Input(shape=(cfg.max_len,))
    token_emb = Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim)(inputs)
    # Learned positional encoding added to the token embeddings.
    position_ids = tf.range(start=0, limit=cfg.max_len, delta=1)
    position_emb = Embedding(input_dim=cfg.max_len, output_dim=cfg.embedding_dim)(position_ids)
    encoded = transformer_encoder(token_emb + position_emb, cfg)
    pooled = GlobalAveragePooling1D()(encoded)
    head = Dense(32, activation="relu")(pooled)
    outputs = Dense(2, activation="softmax")(head)
    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model
def cross_validate(cfg: Config, X, y, vocab_size: int):
    """Stratified k-fold cross-validation: train a fresh Transformer per
    fold, score validation accuracy, and log fold-wise plus summary results."""
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    splitter = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
    fold_scores = []
    for fold, (train_idx, val_idx) in enumerate(splitter.split(X, y), start=1):
        logging.info(f"Fold {fold}")
        model = build_model(cfg, vocab_size)
        model.fit(
            X[train_idx],
            y[train_idx],
            epochs=cfg.epochs,
            batch_size=cfg.batch_size,
            verbose=0,
        )
        fold_acc = accuracy_score(y[val_idx], model.predict(X[val_idx]).argmax(axis=1))
        fold_scores.append(fold_acc)
        logging.info(f"Fold {fold} Accuracy: {fold_acc:.4f}")
    logging.info(f"Mean accuracy: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
def save_artifacts(model, tokenizer, encoder):
    """Persist the Transformer model (.keras) plus its tokenizer and
    label-encoder pickles under GENDER_MODELS_DIR, creating the directory
    if needed."""
    def _path(filename: str) -> str:
        # Resolve an artifact name inside the models directory.
        return os.path.join(GENDER_MODELS_DIR, filename)

    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    model.save(_path("transformer.keras"))
    save_pickle(tokenizer, _path("transformer_tokenizer.pkl"))
    save_pickle(encoder, _path("transformer_label_encoder.pkl"))
    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
def main():
    """CLI entry point: train and evaluate the Transformer model, or run
    cross-validation when ``--cv`` is given."""
    cfg = Config(**vars(load_config("Transformer model")))
    X, y, tokenizer, encoder = load_and_prepare(cfg)
    vocab_size = len(tokenizer.word_index) + 1
    # Cross-validation mode replaces the single train/test evaluation.
    if cfg.cv:
        cross_validate(cfg, X, y, vocab_size)
        return
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=cfg.test_size,
        random_state=cfg.random_state,
        stratify=y,
    )
    model = build_model(cfg, vocab_size)
    model.summary()
    logging.info("Training Transformer model")
    model.fit(
        X_train,
        y_train,
        validation_split=0.1,
        epochs=cfg.epochs,
        batch_size=cfg.batch_size,
        callbacks=[ProgbarLogger()],
    )
    evaluate_proba(y_test, model.predict(X_test), cfg.threshold, class_names=encoder.classes_)
    if cfg.save:
        save_artifacts(model, tokenizer, encoder)
if __name__ == "__main__":
    main()