import json
import logging
from datetime import datetime
from typing import Any, Dict, List, Optional

import pandas as pd

from core.config import get_config
from core.utils.data_loader import DataLoader
from research.experiment import ExperimentConfig, FeatureType
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
from research.model_registry import MODEL_REGISTRY


class ModelTrainer:
    """Comprehensive model training and artifact management."""

    def __init__(self, config=None):
        """Initialize trainer components and ensure the artifacts directory exists.

        Args:
            config: Project configuration object; falls back to ``get_config()``
                when None.
        """
        self.config = config or get_config()
        self.data_loader = DataLoader(self.config)
        self.experiment_runner = ExperimentRunner(self.config)
        self.experiment_tracker = ExperimentTracker(self.config)

        # Setup model artifacts directory up front so later saves never fail
        # on a missing parent path.
        self.models_dir = self.config.paths.models_dir
        self.models_dir.mkdir(parents=True, exist_ok=True)

    def train_single_model(
        self,
        model_name: str,
        model_type: str = "logistic_regression",
        features: Optional[List[str]] = None,
        model_params: Optional[Dict[str, Any]] = None,
        tags: Optional[List[str]] = None,
        save_artifacts: bool = True,
    ) -> str:
        """Train a single model and optionally save its artifacts.

        Args:
            model_name: Human-readable name for the experiment.
            model_type: Registry key selecting the model implementation.
            features: Feature names convertible to ``FeatureType``; defaults
                to ``["full_name"]`` when None.
            model_params: Hyperparameters forwarded to the model (empty dict
                when None).
            tags: Extra tags appended to the default ``["training", model_type]``.
            save_artifacts: When True, persist the trained model's artifacts.

        Returns:
            The experiment ID returned by the experiment runner.
        """
        logging.info(f"Training {model_type} model: {model_name}")

        if features is None:
            features = ["full_name"]
        feature_types = [FeatureType(f) for f in features]

        # Prepare tags - combine default tags with template tags
        default_tags = ["training", model_type]
        experiment_tags = default_tags + (tags or [])

        # Create experiment configuration
        config = ExperimentConfig(
            name=model_name,
            description=f"Training {model_type} model with features: {', '.join(features)}",
            model_type=model_type,
            features=feature_types,
            model_params=model_params or {},
            tags=experiment_tags,
        )

        # Run experiment
        experiment_id = self.experiment_runner.run_experiment(config)

        experiment = self.experiment_tracker.get_experiment(experiment_id)
        if experiment and experiment.test_metrics:
            logging.info("Training completed successfully!")
            logging.info(f"Experiment ID: {experiment_id}")
            logging.info(f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}")
            logging.info(f"Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")

        if save_artifacts:
            self.save_model_artifacts(experiment_id)

        return experiment_id

    def train_multiple_models(
        self,
        base_name: str,
        model_configs: List[Dict[str, Any]],
        save_all: bool = True,
    ) -> List[str]:
        """Train multiple models with different configurations.

        Each config dict must have a ``"model_type"`` key and may supply
        ``"features"`` and ``"model_params"``. Failures for individual models
        are logged and skipped (best-effort batch).

        Args:
            base_name: Prefix used to derive each model's name.
            model_configs: One dict per model to train.
            save_all: Forwarded to ``train_single_model`` as ``save_artifacts``.

        Returns:
            Experiment IDs of the models that trained successfully.
        """
        logging.info(f"Training {len(model_configs)} models...")

        experiment_ids = []
        for i, config in enumerate(model_configs):
            model_name = f"{base_name}_{config['model_type']}_{i + 1}"
            try:
                exp_id = self.train_single_model(
                    model_name=model_name,
                    model_type=config["model_type"],
                    features=config.get("features", ["full_name"]),
                    model_params=config.get("model_params", {}),
                    save_artifacts=save_all,
                )
                experiment_ids.append(exp_id)
            except Exception as e:
                # Deliberate best-effort: one bad config must not abort the batch.
                logging.error(f"Failed to train {model_name}: {e}")
                continue

        logging.info(f"Completed training {len(experiment_ids)} models successfully")
        return experiment_ids

    def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]:
        """Save model artifacts in a structured way for easy loading.

        Writes the serialized model, its config, experiment results, optional
        learning-curve/training-history plots and data, and a metadata index
        under ``models_dir/<experiment_id>/``.

        Args:
            experiment_id: ID of a completed experiment.

        Returns:
            Paths to saved artifacts (plot entries are None when not generated).

        Raises:
            ValueError: If the experiment or its trained model cannot be found.
        """
        experiment = self.experiment_tracker.get_experiment(experiment_id)
        if not experiment:
            raise ValueError(f"Experiment {experiment_id} not found")

        # Create model-specific directory
        model_dir = self.models_dir / experiment_id
        model_dir.mkdir(parents=True, exist_ok=True)

        # Load the trained model
        trained_model = self.experiment_runner.load_experiment_model(experiment_id)
        if not trained_model:
            raise ValueError(f"Could not load model for experiment {experiment_id}")

        # Save complete model with joblib
        model_path = model_dir / "complete_model.joblib"
        trained_model.save(str(model_path))

        # Save model configuration
        config_path = model_dir / "model_config.json"
        with open(config_path, "w") as f:
            json.dump(experiment.config.to_dict(), f, indent=2)

        # Save experiment results (default=str stringifies non-JSON types
        # such as datetimes).
        results_path = model_dir / "experiment_results.json"
        with open(results_path, "w") as f:
            json.dump(experiment.to_dict(), f, indent=2, default=str)

        # Generate and save learning curves (best-effort; failures only warn)
        learning_curve_path = None
        training_history_path = None
        try:
            # Load data for learning curve generation
            data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
            if data_path.exists():
                df = self.data_loader.load_csv_complete(data_path)

                # Generate learning curve. NOTE(review): the full frame
                # (including the target column) is passed as X — confirm the
                # model handles feature selection internally.
                logging.info("Generating learning curve...")
                trained_model.generate_learning_curve(df, df[experiment.config.target_column])

                # Plot and save learning curve
                learning_curve_path = model_dir / "learning_curve.png"
                trained_model.plot_learning_curve(str(learning_curve_path))

                # Plot and save training history (for neural networks)
                if trained_model.training_history:
                    training_history_path = model_dir / "training_history.png"
                    trained_model.plot_training_history(str(training_history_path))

                # Save learning curve data as JSON
                learning_data_path = model_dir / "learning_curve_data.json"
                with open(learning_data_path, "w") as f:
                    json.dump(trained_model.learning_curve_data, f, indent=2)

                # Save training history data as JSON
                if trained_model.training_history:
                    history_data_path = model_dir / "training_history_data.json"
                    with open(history_data_path, "w") as f:
                        json.dump(trained_model.training_history, f, indent=2)
        except Exception as e:
            logging.warning(f"Could not generate learning curves: {e}")

        # Guard against experiments with no recorded test metrics (the
        # training path above treats test_metrics as possibly falsy).
        test_metrics = experiment.test_metrics or {}

        # Save artifacts metadata
        metadata = {
            "experiment_id": experiment_id,
            "model_name": experiment.config.name,
            "model_type": experiment.config.model_type,
            "features": [f.value for f in experiment.config.features],
            "training_date": datetime.now().isoformat(),
            "test_accuracy": test_metrics.get("accuracy", 0),
            "test_f1": test_metrics.get("f1", 0),
            "model_path": str(model_path),
            "config_path": str(config_path),
            "results_path": str(results_path),
            "learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
            "training_history_plot": str(training_history_path) if training_history_path else None,
            "has_learning_curve": bool(trained_model.learning_curve_data),
            "has_training_history": bool(trained_model.training_history),
        }

        metadata_path = model_dir / "metadata.json"
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=2)

        logging.info(f"Model artifacts saved to: {model_dir}")
        logging.info(f"  - Complete model: {model_path.name}")
        logging.info(f"  - Configuration: {config_path.name}")
        logging.info(f"  - Results: {results_path.name}")
        logging.info(f"  - Metadata: {metadata_path.name}")
        if learning_curve_path and learning_curve_path.exists():
            logging.info(f"  - Learning curve: {learning_curve_path.name}")
        if training_history_path and training_history_path.exists():
            logging.info(f"  - Training history: {training_history_path.name}")

        return {
            "model_dir": str(model_dir),
            "model_path": str(model_path),
            "config_path": str(config_path),
            "results_path": str(results_path),
            "metadata_path": str(metadata_path),
            "learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
            "training_history_plot": str(training_history_path) if training_history_path else None,
        }

    def load_trained_model(self, experiment_id: str):
        """Load a previously trained model from saved artifacts.

        Args:
            experiment_id: ID whose artifacts were written by
                ``save_model_artifacts``.

        Returns:
            The deserialized model instance.

        Raises:
            FileNotFoundError: If the artifacts directory has no saved model.
        """
        model_dir = self.models_dir / experiment_id
        model_path = model_dir / "complete_model.joblib"

        if not model_path.exists():
            raise FileNotFoundError(f"Model artifacts not found for experiment {experiment_id}")

        # Load the model class dynamically from the saved metadata.
        metadata_path = model_dir / "metadata.json"
        with open(metadata_path, "r") as f:
            metadata = json.load(f)

        model_type = metadata["model_type"]
        model_class = MODEL_REGISTRY[model_type]

        # Load the complete model
        loaded_model = model_class.load(str(model_path))

        logging.info(f"Loaded model: {metadata['model_name']}")
        logging.info(f"  Type: {model_type}")
        logging.info(f"  Accuracy: {metadata['test_accuracy']:.4f}")

        return loaded_model

    def list_saved_models(self) -> pd.DataFrame:
        """List all saved model artifacts.

        Scans ``models_dir`` for per-experiment ``metadata.json`` files;
        unreadable metadata is logged and skipped.

        Returns:
            DataFrame of display columns sorted by training date (newest
            first); empty DataFrame when no models are found.
        """
        models_data = []
        for model_dir in self.models_dir.iterdir():
            if model_dir.is_dir():
                metadata_path = model_dir / "metadata.json"
                if metadata_path.exists():
                    try:
                        with open(metadata_path, "r") as f:
                            metadata = json.load(f)
                        models_data.append(metadata)
                    except Exception as e:
                        logging.warning(f"Could not read metadata for {model_dir.name}: {e}")

        if not models_data:
            logging.info("No saved models found.")
            return pd.DataFrame()

        df = pd.DataFrame(models_data)

        # Format the display; tolerate older metadata missing some columns.
        display_columns = [
            "model_name",
            "model_type",
            "features",
            "test_accuracy",
            "test_f1",
            "training_date",
        ]
        available_columns = [col for col in display_columns if col in df.columns]
        return df[available_columns].sort_values("training_date", ascending=False)