153 lines
4.3 KiB
Python
153 lines
4.3 KiB
Python
#!.venv/bin/python3
|
|
import logging
|
|
import argparse
|
|
|
|
from research.model_trainer import ModelTrainer
|
|
|
|
|
|
def train_baseline_models():
    """
    Train every baseline model in one batch and log a test-accuracy summary.

    Six configurations are submitted to ``ModelTrainer.train_multiple_models``
    under the "baseline" group: three logistic regressions (on ``full_name``,
    ``native_name`` and ``surname``), a random forest, an SVM and a naive
    Bayes model. Afterwards one summary line is logged for each experiment id
    for which the trainer's experiment tracker returns a truthy record.

    Returns the experiment ids produced by ``train_multiple_models``.
    """
    log = logging.getLogger(__name__)
    log.info("Training Baseline Models with Artifact Saving")

    model_trainer = ModelTrainer()

    # One row per logistic regression run: (feature column, ngram_range, max_features).
    logistic_settings = (
        ("full_name", [2, 5], 10000),
        ("native_name", [2, 4], 5000),
        ("surname", [2, 4], 5000),
    )
    configs = [
        {
            "model_type": "logistic_regression",
            "features": [column],
            "model_params": {"ngram_range": ngram_range, "max_features": max_features},
        }
        for column, ngram_range, max_features in logistic_settings
    ]
    # The remaining baselines have nothing in common, so they are spelled out.
    configs.extend(
        [
            {
                "model_type": "random_forest",
                "features": ["name_length", "word_count", "province"],
                "model_params": {"n_estimators": 100, "max_depth": 10},
            },
            {
                "model_type": "svm",
                "features": ["full_name"],
                "model_params": {"kernel": "rbf", "C": 1.0},
            },
            {
                "model_type": "naive_bayes",
                "features": ["full_name"],
                "model_params": {"alpha": 1.0},
            },
        ]
    )

    experiment_ids = model_trainer.train_multiple_models("baseline", configs)

    log.info("\n Training Summary:")
    for experiment_id in experiment_ids:
        record = model_trainer.experiment_tracker.get_experiment(experiment_id)
        if not record:
            # Nothing to report for ids the tracker cannot resolve.
            continue
        accuracy = record.test_metrics.get("accuracy", 0)
        log.info(f" {record.config.name}: {accuracy:.4f} accuracy")

    return experiment_ids
|
|
|
|
|
|
def train_neural_networks():
    """
    Train the LSTM, CNN and transformer models as one "neural_networks" batch.

    All three configurations use the ``full_name`` feature; only the
    ``model_params`` differ. Returns the experiment ids produced by
    ``ModelTrainer.train_multiple_models``.
    """
    logging.info("Training Neural Network Models")

    model_trainer = ModelTrainer()

    def full_name_model(model_type, **model_params):
        # Keyword order is kept, so each params dict lists its keys in the
        # order they are written at the call site.
        return {
            "model_type": model_type,
            "features": ["full_name"],
            "model_params": model_params,
        }

    configs = [
        full_name_model(
            "lstm",
            embedding_dim=64,
            lstm_units=32,
            epochs=10,
            batch_size=64,
            max_len=6,
        ),
        full_name_model(
            "cnn",
            embedding_dim=64,
            filters=64,
            kernel_size=3,
            epochs=10,
            batch_size=64,
            max_len=20,  # Character level
        ),
        full_name_model(
            "transformer",
            embedding_dim=64,
            transformer_num_heads=2,
            epochs=10,
            batch_size=64,
            max_len=6,
        ),
    ]

    return model_trainer.train_multiple_models("neural_networks", configs)
|
|
|
|
|
|
def main():
    """
    Command-line entry point for training models or listing saved ones.

    Options:
        --mode {baseline,neural,list}
            Explicit action. When given, it decides what runs and the
            other options are ignored.
        --model-type, --name
            When both are supplied and --mode is omitted, a single model of
            that type is trained under that name on the ``full_name`` feature.

    If none of the above selects an action, the saved models are listed.
    """
    parser = argparse.ArgumentParser(description="Train DRC Names Models")
    # The default is None rather than "list" so that an omitted --mode can be
    # told apart from an explicit "--mode list". With a "list" default one of
    # the mode branches always matched and the single-model branch below could
    # never run.
    parser.add_argument(
        "--mode",
        choices=["baseline", "neural", "list"],
        default=None,
        help="Training mode",
    )
    parser.add_argument("--model-type", type=str, help="Specific model type to train")
    parser.add_argument("--name", type=str, help="Model name")

    args = parser.parse_args()

    trainer = ModelTrainer()

    if args.mode == "baseline":
        train_baseline_models()

    elif args.mode == "neural":
        train_neural_networks()

    elif args.mode is None and args.model_type and args.name:
        # Train specific model
        trainer.train_single_model(
            model_name=args.name, model_type=args.model_type, features=["full_name"]
        )

    else:
        # Explicit "--mode list", or no mode and no complete single-model request.
        # NOTE(review): these messages go through the root logger at INFO level;
        # they are only visible if logging is configured at INFO or lower
        # somewhere outside this function - confirm.
        logging.info("📋 Saved Models:")
        saved_models = trainer.list_saved_models()
        if not saved_models.empty:
            logging.info(saved_models.to_string(index=False))
        else:
            logging.info("No saved models found.")
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|