refactor: include province and annotation pipeline
@@ -0,0 +1,80 @@
import argparse
from dataclasses import dataclass
from typing import Optional

from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    classification_report, confusion_matrix
)

from misc import logging


def evaluate_proba(y_true, y_proba, threshold, class_names):
    # Binarize the positive-class probability at the given threshold and
    # report accuracy, precision/recall/F1 and the confusion matrix.
    y_pred = (y_proba[:, 1] >= threshold).astype(int)
    acc = accuracy_score(y_true, y_pred)
    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
    cm = confusion_matrix(y_true, y_pred)

    logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
    print("Confusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
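
For intuition, a minimal sketch (synthetic probabilities, not from this commit) of how the threshold in evaluate_proba moves predictions between classes:

import numpy as np

y_proba = np.array([[0.40, 0.60],
                    [0.80, 0.20],
                    [0.45, 0.55]])
(y_proba[:, 1] >= 0.5).astype(int)   # -> array([1, 0, 1])
(y_proba[:, 1] >= 0.7).astype(int)   # -> array([0, 0, 0]): a stricter cut-off
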

@dataclass
class BaseConfig:
    """
    Represents the base configuration for a dataset and its associated parameters.

    This class serves as the foundational configuration shared by the training
    scripts. It encapsulates the dataset-related parameters and options: the
    dataset path and size, the classification threshold, cross-validation
    settings, whether to save the trained artifacts, and whether to load a
    balanced dataset.
    """
    dataset_path: str = "names_featured.csv"
    size: Optional[int] = None
    threshold: float = 0.5
    cv: Optional[int] = None
    save: bool = False
    balanced: bool = False

    epochs: int = 10
    test_size: float = 0.2
    random_state: int = 42
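
The model-specific Config subclasses in the files below extend this dataclass and are instantiated as Config(**vars(base)); a minimal sketch of that pattern (the names Base and Child are illustrative):

from dataclasses import dataclass

@dataclass
class Base:
    threshold: float = 0.5

@dataclass
class Child(Base):
    max_iter: int = 1000

base = Base(threshold=0.7)
child = Child(**vars(base))   # -> Child(threshold=0.7, max_iter=1000)
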

def load_config(description: str) -> BaseConfig:
    """
    Parses command-line arguments and loads the shared model configuration.

    This function sets up an argument parser for the common command-line options:
    the dataset path, dataset size, dataset balancing, classification threshold,
    cross-validation folds, and whether to save the model and its associated
    artifacts after training. Once parsed, it transfers the values to a
    ``BaseConfig`` instance and returns it.
    """
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument("--dataset", type=str, default="names_featured.csv", help="Path to the dataset file")
    parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
    parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
    parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
    parser.add_argument("--save", action="store_true", help="Save the model and artifacts after training")

    parser.add_argument("--epochs", type=int, default=10, help="Number of epochs for training")
    parser.add_argument("--test_size", type=float, default=0.2, help="Proportion of the dataset to include in the test split")
    parser.add_argument("--random_state", type=int, default=42, help="Random seed for reproducibility")

    args = parser.parse_args()

    return BaseConfig(
        dataset_path=args.dataset,
        size=args.size,
        threshold=args.threshold,
        cv=args.cv,
        save=args.save,
        balanced=args.balanced,
        epochs=args.epochs,
        test_size=args.test_size,
        random_state=args.random_state
    )
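
A usage sketch of load_config; the invocation in the comment is hypothetical, but the flags match the parser above:

# python -m <training_script> --size 50000 --balanced --threshold 0.6 --cv 5 --save
cfg = load_config("logistic regression model")
# -> BaseConfig(dataset_path="names_featured.csv", size=50000, threshold=0.6,
#               cv=5, save=True, balanced=True, epochs=10, test_size=0.2, random_state=42)
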
@@ -0,0 +1,123 @@
import os
from dataclasses import dataclass
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder

from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import BaseConfig, load_config, logging


@dataclass
class Config(BaseConfig):
    ngram_range: Tuple[int, int] = (2, 5)
    max_iter: int = 1000


def encode_labels(y: pd.Series) -> Tuple[np.ndarray, LabelEncoder]:
    """
    Encode the labels using a LabelEncoder. This function takes a pandas Series of
    labels, fits a LabelEncoder to them, and transforms them into the numerical
    format expected for model training. The transformed labels and the fitted
    encoder are returned.
    """
    logging.info("Encoding labels")
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)
    return y_encoded, encoder
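
For reference, a small sketch of what encode_labels returns for a hypothetical two-class sex column:

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(["F", "M", "F"])   # -> array([0, 1, 0])
encoder.classes_                                     # -> array(['F', 'M'], dtype='<U1')
encoder.inverse_transform([1, 0])                    # -> array(['M', 'F'], dtype='<U1')
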

def build_model(cfg: Config) -> Pipeline:
    """
    Build a logistic regression model pipeline with a character-level CountVectorizer.
    The pipeline consists of a CountVectorizer that transforms the input text into
    character n-grams, followed by a logistic regression classifier. The n-gram range
    and the maximum number of iterations for the logistic regression can be configured
    through the provided configuration object.
    """
    return make_pipeline(
        CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range),
        LogisticRegression(max_iter=cfg.max_iter)
    )
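
To make the featurization concrete, a minimal sketch (made-up name) of the character n-grams that CountVectorizer extracts with the default ngram_range=(2, 5):

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(analyzer="char", ngram_range=(2, 5))
vec.fit(["anna"])
sorted(vec.get_feature_names_out())
# -> ['an', 'ann', 'anna', 'na', 'nn', 'nna']
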

def evaluate_proba(y_true, y_proba, threshold: float, class_names):
    """
    Evaluates the performance of a classification model using a specified threshold
    for predicted probabilities. Computes accuracy, precision, recall, F1-score,
    and the confusion matrix, and generates a classification report with detailed
    metrics for each class.

    Logs the evaluation metrics at the specified threshold and prints the confusion
    matrix and classification report.
    """
    logging.info(f"Evaluating at threshold = {threshold}")
    y_pred = (y_proba[:, 1] >= threshold).astype(int)
    acc = accuracy_score(y_true, y_pred)
    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
    cm = confusion_matrix(y_true, y_pred)

    logging.info(f"Accuracy: {acc:.4f}")
    logging.info(f"Precision: {pr:.4f}, Recall: {rc:.4f}, F1-score: {f1:.4f}")
    print("Confusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))

def cross_validate(cfg: Config, X, y) -> None:
    """
    Performs k-fold cross-validation on the provided dataset using the configuration
    and logs the results, including the individual fold scores, the mean accuracy,
    and the standard deviation of the scores.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    pipeline = build_model(cfg)
    scores = cross_val_score(pipeline, X, y, cv=StratifiedKFold(n_splits=cfg.cv), scoring="accuracy")
    logging.info(f"Cross-validation scores: {scores}")
    logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")


def save_artifacts(model, encoder):
    """
    Saves the trained model and label encoder artifacts to the models directory.
    """
    save_pickle(model, os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))

    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")

def main():
    cfg = Config(**vars(load_config("logistic regression model")))

    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
    X_raw, y_raw = df["name"], df["sex"]
    y_encoded, encoder = encode_labels(y_raw)

    if cfg.cv:
        cross_validate(cfg, X_raw, y_encoded)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X_raw, y_encoded, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y_encoded
    )

    model = build_model(cfg)
    model.fit(X_train, y_train)

    y_proba = model.predict_proba(X_test)
    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, encoder)


if __name__ == "__main__":
    main()
@@ -0,0 +1,144 @@
import os
from dataclasses import dataclass
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ProgbarLogger
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import load_config, BaseConfig, evaluate_proba, logging


@dataclass
class Config(BaseConfig):
    max_len: int = 6
    embedding_dim: int = 64
    lstm_units: int = 32
    batch_size: int = 64

def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Loads and preprocesses the data for classification by tokenizing the names,
    encoding the labels, and padding the sequences. The resulting arrays are ready
    for input into the training pipeline, and the fitted tokenizer and label
    encoder are returned alongside them.
    """
    logging.info("Loading and preprocessing data")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))

    # Word-level tokenization (char_level=False): each whitespace-separated
    # part of a name is mapped to one integer index.
    tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])
    sequences = tokenizer.texts_to_sequences(df["name"])
    padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df["sex"])

    return padded, labels, tokenizer, label_encoder
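
An illustration (made-up rows) of what the tokenizer and pad_sequences produce with max_len=6:

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

tok = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
tok.fit_on_texts(["Maria Ana", "Ion"])
pad_sequences(tok.texts_to_sequences(["Maria Ana", "Ion"]), maxlen=6, padding="post")
# -> [[2 3 0 0 0 0]          ("<OOV>" takes index 1; real tokens start at 2)
#     [4 0 0 0 0 0]]
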

def build_model(cfg: Config, vocab_size: int) -> Sequential:
    """
    Builds and compiles a Sequential LSTM-based model. The model consists of an
    embedding layer, two bidirectional LSTM layers, a dense hidden layer with ReLU
    activation, and a two-unit softmax output layer. The model is compiled with
    sparse categorical crossentropy loss and the Adam optimizer.
    """
    logging.info("Building LSTM model")
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim),
        Bidirectional(LSTM(cfg.lstm_units, return_sequences=True)),
        Bidirectional(LSTM(cfg.lstm_units)),
        Dense(64, activation="relu"),
        Dense(2, activation="softmax")
    ])
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model
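
Because the head is a 2-unit softmax, model.predict returns one probability column per class in encoder.classes_ order, which is exactly what evaluate_proba thresholds; a sketch with made-up outputs:

import numpy as np

y_proba = np.array([[0.91, 0.09],    # one row per name: [P(class 0), P(class 1)]
                    [0.25, 0.75]])
y_proba.argmax(axis=1)               # -> array([0, 1]), as used in cross_validate below
(y_proba[:, 1] >= 0.5).astype(int)   # -> array([0, 1]), as used in evaluate_proba
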

def cross_validate(cfg: Config, X, y, vocab_size: int):
    """
    Performs cross-validation on the given dataset using the specified model
    configuration. The function uses a StratifiedKFold cross-validator to split
    the dataset into training and validation sets for each fold. For each fold,
    it trains the model, evaluates its accuracy on the validation data, and logs
    the fold-wise and overall results.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
    accuracies = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        logging.info(f"Fold {fold + 1}")
        model = build_model(cfg, vocab_size)
        model.fit(X[train_idx], y[train_idx],
                  epochs=cfg.epochs,
                  batch_size=cfg.batch_size,
                  verbose=0)
        y_pred = model.predict(X[val_idx])
        acc = accuracy_score(y[val_idx], y_pred.argmax(axis=1))
        accuracies.append(acc)
        logging.info(f"Fold {fold + 1} Accuracy: {acc:.4f}")

    logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")

def save_artifacts(model, tokenizer, encoder):
    """
    Saves the given model, tokenizer, and encoder artifacts to the models directory.

    The function ensures that the target directory exists, serializes the model in
    the native Keras format and the tokenizer and encoder as pickles, and logs
    where the artifacts were written.
    """
    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    model.save(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))

    save_pickle(tokenizer, os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
    save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl"))

    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")

def main():
    cfg = Config(**vars(load_config("Long Short-Term Memory (LSTM) model")))

    X, y, tokenizer, encoder = load_and_prepare(cfg)
    vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding

    if cfg.cv:
        cross_validate(cfg, X, y, vocab_size)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
    )

    model = build_model(cfg, vocab_size)
    model.summary()

    logging.info("Training model")
    model.fit(X_train, y_train,
              validation_split=0.1,
              epochs=cfg.epochs,
              batch_size=cfg.batch_size,
              callbacks=[ProgbarLogger()])

    y_proba = model.predict(X_test)
    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, tokenizer, encoder)


if __name__ == "__main__":
    main()
@@ -0,0 +1,173 @@
import os
from dataclasses import dataclass
from typing import Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ProgbarLogger
from tensorflow.keras.layers import (
    Input, Embedding, Dense, GlobalAveragePooling1D,
    MultiHeadAttention, Dropout, LayerNormalization
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import BaseConfig, load_config, evaluate_proba, logging


@dataclass
class Config(BaseConfig):
    max_len: int = 6
    embedding_dim: int = 64
    transformer_head_size: int = 64
    transformer_num_heads: int = 2
    transformer_ff_dim: int = 128
    dropout: float = 0.1
    batch_size: int = 64

def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Load and preprocess the dataset for training the Transformer model.
    This function reads a CSV dataset, tokenizes the names, pads the sequences,
    and encodes the labels. It returns the padded sequences, encoded labels,
    tokenizer, and label encoder.
    """
    logging.info("Loading and preprocessing data")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))

    # Word-level tokenization (the Keras default), as in the LSTM pipeline.
    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])

    sequences = tokenizer.texts_to_sequences(df["name"])
    padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")

    encoder = LabelEncoder()
    labels = encoder.fit_transform(df["sex"])
    return padded, labels, tokenizer, encoder

def transformer_encoder(x, cfg: Config):
    """
    Transformer encoder block that applies multi-head attention and a feed-forward
    network, each with a residual connection followed by layer normalization.
    """
    attn = MultiHeadAttention(num_heads=cfg.transformer_num_heads, key_dim=cfg.transformer_head_size)(x, x)
    x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attn))

    ff = Dense(cfg.transformer_ff_dim, activation="relu")(x)
    ff = Dense(x.shape[-1])(ff)  # project back to the input width for the residual sum
    return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(ff))
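
A shape sketch of the block above, run eagerly with the default Config values and the definitions from this file: the encoder maps (batch, max_len, embedding_dim) to the same shape, so the residual additions line up and blocks could be stacked.

import tensorflow as tf

x = tf.random.normal((8, 6, 64))     # (batch, max_len, embedding_dim)
y = transformer_encoder(x, Config())
print(y.shape)                       # (8, 6, 64)
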

def build_model(cfg: Config, vocab_size: int) -> Model:
    """
    Builds a Transformer-based model for sequence classification. The model
    combines a token embedding with a learned positional embedding, applies a
    Transformer encoder block, and finishes with global average pooling,
    a dense hidden layer, and a softmax output layer.
    """
    logging.info("Building Transformer model")
    inputs = Input(shape=(cfg.max_len,))
    x = Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim)(inputs)

    # Add a learned positional encoding: one embedding vector per position
    # 0..max_len-1, broadcast across the batch and summed with the token embeddings.
    positions = tf.range(start=0, limit=cfg.max_len, delta=1)
    pos_embedding = Embedding(input_dim=cfg.max_len, output_dim=cfg.embedding_dim)(positions)
    x = x + pos_embedding

    x = transformer_encoder(x, cfg)
    x = GlobalAveragePooling1D()(x)
    x = Dense(32, activation="relu")(x)
    outputs = Dense(2, activation="softmax")(x)

    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

def cross_validate(cfg: Config, X, y, vocab_size: int):
    """
    Performs cross-validation using the given configuration, dataset, and
    vocabulary size. This function splits the dataset into stratified folds,
    trains a model on each fold, and evaluates its performance on the validation
    data. The mean and standard deviation of the accuracies across all folds
    are logged.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
    accuracies = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        logging.info(f"Fold {fold + 1}")
        model = build_model(cfg, vocab_size)
        model.fit(X[train_idx], y[train_idx],
                  epochs=cfg.epochs,
                  batch_size=cfg.batch_size,
                  verbose=0)
        y_pred = model.predict(X[val_idx])
        acc = accuracy_score(y[val_idx], y_pred.argmax(axis=1))
        accuracies.append(acc)
        logging.info(f"Fold {fold + 1} Accuracy: {acc:.4f}")

    logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")

def save_artifacts(model, tokenizer, encoder):
    """
    Saves the model and associated artifacts to the models directory. The model
    is serialized to a `.keras` file, while the tokenizer and label encoder are
    serialized to `.pkl` files. If the directory does not exist, it is created
    automatically, and the completion of the operation is logged.
    """
    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    model.save(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))

    save_pickle(tokenizer, os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
    save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl"))

    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")

def main():
    cfg = Config(**vars(load_config("Transformer model")))

    X, y, tokenizer, encoder = load_and_prepare(cfg)
    vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding

    if cfg.cv:
        cross_validate(cfg, X, y, vocab_size)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
    )

    model = build_model(cfg, vocab_size)
    model.summary()

    logging.info("Training Transformer model")
    model.fit(
        X_train, y_train,
        validation_split=0.1,
        epochs=cfg.epochs,
        batch_size=cfg.batch_size,
        callbacks=[ProgbarLogger()]
    )

    y_proba = model.predict(X_test)
    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, tokenizer, encoder)


if __name__ == "__main__":
    main()