import os
from dataclasses import dataclass
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import load_config, BaseConfig, evaluate_proba, logging


@dataclass
class Config(BaseConfig):
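    # LSTM-specific hyperparameters; shared options such as dataset_path,
    # epochs, cv, test_size, threshold, and save are inherited from BaseConfig.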
    max_len: int = 6
    embedding_dim: int = 64
    lstm_units: int = 32
    batch_size: int = 64


def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Load the dataset referenced by ``cfg.dataset_path`` and prepare it for
    classification: tokenize the name strings, pad the token sequences to
    ``cfg.max_len``, and encode the sex labels as integers. The returned
    arrays are ready to feed into the training pipeline.
    """
    logging.info("Loading and preprocessing data")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))

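    # Word-level tokenization (char_level=False) of the raw name strings;
    # "<OOV>" stands in for tokens unseen during fitting.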
    tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])
    sequences = tokenizer.texts_to_sequences(df["name"])
    padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")

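    # Encode the string sex labels as integer class ids, as required by the
    # sparse_categorical_crossentropy loss used in build_model().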
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df["sex"])

    return padded, labels, tokenizer, label_encoder


def build_model(cfg: Config, vocab_size: int) -> Sequential:
    """
    Builds and compiles a Sequential LSTM-based model. The model consists of an
    embedding layer, two bidirectional LSTM layers, a dense hidden layer with
    ReLU activation, and a softmax output layer. The model is compiled with
    sparse categorical crossentropy loss and the Adam optimizer.
    """
    logging.info("Building LSTM model")
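    # The first BiLSTM returns the full hidden-state sequence
    # (return_sequences=True) so the second BiLSTM has a sequence to consume;
    # the second returns only its final state for the dense head.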
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim),
        Bidirectional(LSTM(cfg.lstm_units, return_sequences=True)),
        Bidirectional(LSTM(cfg.lstm_units)),
        Dense(64, activation="relu"),
        Dense(2, activation="softmax"),
    ])
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model


def cross_validate(cfg: Config, X, y, vocab_size: int):
    """
    Performs cross-validation on the given dataset using the specified model
    configuration. A StratifiedKFold cross-validator splits the dataset into
    training and validation sets for each fold; the model is trained on each
    fold, its accuracy on the validation data is evaluated, and the fold-wise
    and overall results are logged.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
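    # StratifiedKFold preserves the class ratio of y in every fold, keeping
    # fold accuracies comparable even on imbalanced data.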
    skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
    accuracies = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        logging.info(f"Fold {fold + 1}")
        model = build_model(cfg, vocab_size)
        model.fit(X[train_idx], y[train_idx],
                  epochs=cfg.epochs,
                  batch_size=cfg.batch_size,
                  verbose=0)
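        # Collapse softmax probabilities to hard class ids via argmax before
        # scoring accuracy.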
        y_pred = model.predict(X[val_idx])
        acc = accuracy_score(y[val_idx], y_pred.argmax(axis=1))
        accuracies.append(acc)
        logging.info(f"Fold {fold + 1} Accuracy: {acc:.4f}")

    logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")


def save_artifacts(model, tokenizer, encoder):
    """
    Saves the given model, tokenizer, and label encoder to GENDER_MODELS_DIR.

    The function ensures the target directory exists, serializes the model in
    the native Keras format and the tokenizer and encoder as pickles, and
    logs where the artifacts were written.
    """
    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    model.save(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))

    save_pickle(tokenizer, os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
    save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl"))

    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")


def main():
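    # load_config() supplies the shared run options for this model; vars()
    # unpacks them into the module's Config dataclass.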
    cfg = Config(**vars(load_config("Long Short-Term Memory (LSTM) model")))

    X, y, tokenizer, encoder = load_and_prepare(cfg)
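    # word_index is 1-based; +1 reserves index 0, which pad_sequences uses for
    # padding, so the Embedding layer covers every index.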
    vocab_size = len(tokenizer.word_index) + 1

    if cfg.cv:
        cross_validate(cfg, X, y, vocab_size)
        return

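    # No cross-validation requested: train once on a stratified hold-out split.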
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
    )

    model = build_model(cfg, vocab_size)
    model.summary()

    logging.info("Training model")
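    # fit() attaches its own ProgbarLogger when verbose is non-zero, so
    # verbose=1 is all that is needed for the standard progress bar.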
    model.fit(X_train, y_train,
              validation_split=0.1,
              epochs=cfg.epochs,
              batch_size=cfg.batch_size,
              verbose=1)

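    # Score class probabilities on the held-out split with the shared
    # evaluate_proba() helper, using cfg.threshold and readable class names.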
    y_proba = model.predict(X_test)
    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, tokenizer, encoder)


if __name__ == "__main__":
    main()