Files
drc-ners-nlp/train.py
T

153 lines
4.3 KiB
Python

#!.venv/bin/python3
import logging
import argparse
from research.model_trainer import ModelTrainer
def train_baseline_models():
    """Train the full set of baseline models and log their test accuracy.

    A new ``ModelTrainer`` is created, the baseline configurations are
    submitted together under the ``"baseline"`` group, and one summary line
    per experiment is logged for every experiment the tracker returns.

    Returns:
        The experiment ids produced by ``ModelTrainer.train_multiple_models``.
    """
    log = logging.getLogger(__name__)
    log.info("Training Baseline Models with Artifact Saving")

    model_trainer = ModelTrainer()

    def logreg_on(column, max_ngram, vocab_size):
        # The three logistic-regression baselines differ only in the input
        # column, the upper n-gram bound and the feature cap.
        return {
            "model_type": "logistic_regression",
            "features": [column],
            "model_params": {
                "ngram_range": [2, max_ngram],
                "max_features": vocab_size,
            },
        }

    configs = [
        logreg_on("full_name", 5, 10000),
        logreg_on("native_name", 4, 5000),
        logreg_on("surname", 4, 5000),
        {
            "model_type": "random_forest",
            "features": ["name_length", "word_count", "province"],
            "model_params": {"n_estimators": 100, "max_depth": 10},
        },
        {
            "model_type": "svm",
            "features": ["full_name"],
            "model_params": {"kernel": "rbf", "C": 1.0},
        },
        {
            "model_type": "naive_bayes",
            "features": ["full_name"],
            "model_params": {"alpha": 1.0},
        },
    ]

    run_ids = model_trainer.train_multiple_models("baseline", configs)

    # Per-experiment accuracy report; ids the tracker cannot resolve are skipped.
    log.info("\n Training Summary:")
    for run_id in run_ids:
        record = model_trainer.experiment_tracker.get_experiment(run_id)
        if not record:
            continue
        accuracy = record.test_metrics.get("accuracy", 0)
        log.info(f" {record.config.name}: {accuracy:.4f} accuracy")

    return run_ids
def train_neural_networks():
    """Train the LSTM, CNN and transformer models on the ``full_name`` feature.

    A new ``ModelTrainer`` is created and the three configurations are
    submitted together under the ``"neural_networks"`` group.

    Returns:
        The experiment ids produced by ``ModelTrainer.train_multiple_models``.
    """
    logging.info("Training Neural Network Models")
    model_trainer = ModelTrainer()

    lstm_config = {
        "model_type": "lstm",
        "features": ["full_name"],
        "model_params": dict(
            embedding_dim=64,
            lstm_units=32,
            epochs=10,
            batch_size=64,
            max_len=6,
        ),
    }
    cnn_config = {
        "model_type": "cnn",
        "features": ["full_name"],
        "model_params": dict(
            embedding_dim=64,
            filters=64,
            kernel_size=3,
            epochs=10,
            batch_size=64,
            max_len=20,  # Character level
        ),
    }
    transformer_config = {
        "model_type": "transformer",
        "features": ["full_name"],
        "model_params": dict(
            embedding_dim=64,
            transformer_num_heads=2,
            epochs=10,
            batch_size=64,
            max_len=6,
        ),
    }

    return model_trainer.train_multiple_models(
        "neural_networks",
        [lstm_config, cnn_config, transformer_config],
    )
def main():
    """Command-line entry point: train models or list the saved ones.

    Options:
        --mode {baseline,neural,list}
            ``baseline`` runs ``train_baseline_models``, ``neural`` runs
            ``train_neural_networks`` and ``list`` (the default) logs the
            table returned by ``ModelTrainer.list_saved_models``.
        --model-type / --name
            When BOTH are given, a single model of that type is trained on
            the ``full_name`` feature and ``--mode`` is not consulted.

    The single-model request is checked before ``--mode``. ``--mode`` always
    holds one of its three choices (it defaults to ``"list"``), so a check
    placed after the mode branches could never be reached.
    """
    parser = argparse.ArgumentParser(description="Train DRC Names Models")
    parser.add_argument(
        "--mode",
        choices=["baseline", "neural", "list"],
        default="list",
        help="Training mode",
    )
    parser.add_argument("--model-type", type=str, help="Specific model type to train")
    parser.add_argument("--name", type=str, help="Model name")
    args = parser.parse_args()

    # NOTE(review): the messages below go through the root logger at INFO
    # level; they are only visible if logging has been configured elsewhere
    # (presumably by the research package) -- confirm.
    trainer = ModelTrainer()

    if args.model_type and args.name:
        # Explicit single-model request takes precedence over --mode.
        trainer.train_single_model(
            model_name=args.name, model_type=args.model_type, features=["full_name"]
        )
        return

    if args.model_type or args.name:
        # Only half of the pair was supplied: keep the previous behaviour of
        # running the selected mode, but say why the option was ignored.
        logging.warning(
            "--model-type and --name must be given together; "
            "ignoring them and running mode '%s'",
            args.mode,
        )

    if args.mode == "baseline":
        train_baseline_models()
    elif args.mode == "neural":
        train_neural_networks()
    elif args.mode == "list":
        logging.info("📋 Saved Models:")
        # The result exposes ``.empty`` and ``.to_string(index=False)``;
        # presumably a pandas DataFrame -- verify against ModelTrainer.
        saved_models = trainer.list_saved_models()
        if not saved_models.empty:
            logging.info(saved_models.to_string(index=False))
        else:
            logging.info("No saved models found.")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()