"""Abstract model base class: prediction, persistence and plotting utilities."""

import logging
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from research.experiment import ExperimentConfig


class BaseModel(ABC):
    """Abstract base class for all models.

    Subclasses implement feature preparation, fitting, cross-validation and
    learning-curve generation; this base class provides prediction,
    probability access, feature-importance extraction, save/load and
    plotting on top of the fitted estimator stored in ``self.model``.
    """

    def __init__(self, config: ExperimentConfig):
        self.config = config
        self.model = None                # underlying estimator, set by fit()
        self.feature_extractor = None    # turns raw rows into a feature frame
        self.label_encoder = None        # maps class labels <-> integer ids
        self.tokenizer = None            # for neural models
        self.is_fitted = False
        self.training_history = {}       # per-epoch metrics (neural models)
        self.learning_curve_data = {}    # output of generate_learning_curve()

    @property
    @abstractmethod
    def architecture(self) -> str:
        """Return the architecture type: 'neural_network', 'traditional', or 'ensemble'"""
        pass

    @abstractmethod
    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Prepare features for training/prediction"""
        pass

    @abstractmethod
    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the model - implemented differently for each architecture"""
        pass

    @abstractmethod
    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> Dict[str, float]:
        # NOTE: previously annotated as ``Dict[str, float] | dict[str, np.floating[Any]]``;
        # np.floating values satisfy the float contract, so the union was redundant.
        """Perform cross-validation and return average scores"""
        pass

    @abstractmethod
    def generate_learning_curve(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        train_sizes: Optional[List[float]] = None,  # was ``List[float] = None`` (wrong for a None default)
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        pass

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Predict class labels for raw input rows.

        Raises:
            ValueError: if called before ``fit``.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
        predictions = self.model.predict(X_prepared)

        # Neural networks emit an (n_samples, n_classes) probability matrix;
        # collapse it to hard class indices before decoding labels.
        if hasattr(predictions, "shape") and len(predictions.shape) > 1:
            predictions = predictions.argmax(axis=1)

        return self.label_encoder.inverse_transform(predictions)

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """Get prediction probabilities if supported.

        Raises:
            ValueError: if called before ``fit``.
            NotImplementedError: if the underlying model cannot produce
                per-class probabilities.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        if hasattr(self.model, "predict_proba"):
            return self.model.predict_proba(X_prepared)
        elif hasattr(self.model, "predict"):
            # Neural networks return the probability matrix directly from predict().
            probabilities = self.model.predict(X_prepared)
            if len(probabilities.shape) == 2 and probabilities.shape[1] > 1:
                return probabilities

        raise NotImplementedError("Model does not support probability predictions")

    def get_feature_importance(self) -> Optional[Dict[str, float]]:
        """Get feature importance if supported by the model.

        Returns:
            A mapping of feature name -> importance score, or ``None`` when
            the underlying model exposes neither importances nor coefficients.
        """
        if hasattr(self.model, "feature_importances_"):
            # Tree-based models.
            importances = self.model.feature_importances_
            feature_names = self._get_feature_names()
            return dict(zip(feature_names, importances))
        elif hasattr(self.model, "coef_"):
            # Linear models: rank by absolute coefficient magnitude.
            coefficients = np.abs(self.model.coef_[0])
            feature_names = self._get_feature_names()
            return dict(zip(feature_names, coefficients))
        elif hasattr(self.model, "named_steps") and "classifier" in self.model.named_steps:
            # sklearn pipelines (e.g. vectorizer + LogisticRegression).
            classifier = self.model.named_steps["classifier"]
            if hasattr(classifier, "coef_"):
                coefficients = np.abs(classifier.coef_[0])
                # Use .get so pipelines without a "vectorizer" step fall
                # through to None instead of raising KeyError (the previous
                # code indexed named_steps["vectorizer"] unconditionally).
                vectorizer = self.model.named_steps.get("vectorizer")
                if vectorizer is not None and hasattr(vectorizer, "get_feature_names_out"):
                    feature_names = vectorizer.get_feature_names_out()
                    # Take top features to avoid too many n-grams.
                    top_indices = np.argsort(coefficients)[-20:]
                    return dict(zip(feature_names[top_indices], coefficients[top_indices]))
        return None

    def _get_feature_names(self) -> List[str]:
        """Get feature names (override in subclasses if needed)"""
        if hasattr(self.model, "feature_names_in_"):
            return list(self.model.feature_names_in_)
        # Arbitrary fallback width; zip() in callers truncates to the
        # shorter sequence, so extra placeholder names are harmless.
        return [f"feature_{i}" for i in range(100)]

    def save(self, path: str):
        """Save the complete model with training history"""
        model_data = {
            "model": self.model,
            "feature_extractor": self.feature_extractor,
            "label_encoder": self.label_encoder,
            "tokenizer": self.tokenizer,
            "config": self.config.to_dict(),
            "is_fitted": self.is_fitted,
            "training_history": self.training_history,
            "learning_curve_data": self.learning_curve_data,
        }
        joblib.dump(model_data, path)

    @classmethod
    def load(cls, path: str) -> "BaseModel":
        """Load a saved model with training history.

        NOTE(review): joblib.load unpickles arbitrary objects — only load
        files from trusted sources.
        """
        model_data = joblib.load(path)

        # Recreate the model instance (ExperimentConfig is imported at module
        # level; the previous in-function re-import was redundant).
        config = ExperimentConfig.from_dict(model_data["config"])
        instance = cls(config)

        # Restore state; .get() keeps older saved files (without the newer
        # keys) loadable.
        instance.model = model_data["model"]
        instance.feature_extractor = model_data["feature_extractor"]
        instance.label_encoder = model_data["label_encoder"]
        instance.tokenizer = model_data.get("tokenizer")
        instance.is_fitted = model_data["is_fitted"]
        instance.training_history = model_data.get("training_history", {})
        instance.learning_curve_data = model_data.get("learning_curve_data", {})

        return instance

    def plot_learning_curve(self, save_path: Optional[str] = None) -> str:
        """Plot and save learning curve.

        Returns:
            ``save_path`` when the figure was written to disk, otherwise ``""``
            (including when no learning-curve data is available).
        """
        if not self.learning_curve_data:
            logging.warning("No learning curve data available")
            return ""

        plt.figure(figsize=(10, 6))

        data = self.learning_curve_data
        train_sizes = data["train_sizes"]
        train_scores = data["train_scores"]
        val_scores = data["val_scores"]
        # Std bands default to zero width when the experiment did not record them.
        train_std = data.get("train_scores_std", [0] * len(train_sizes))
        val_std = data.get("val_scores_std", [0] * len(train_sizes))

        # Plot learning curves with +/- one-std shaded bands.
        plt.plot(train_sizes, train_scores, "o-", color="blue", label="Training Score")
        plt.fill_between(
            train_sizes,
            np.array(train_scores) - np.array(train_std),
            np.array(train_scores) + np.array(train_std),
            alpha=0.1,
            color="blue",
        )
        plt.plot(train_sizes, val_scores, "o-", color="red", label="Validation Score")
        plt.fill_between(
            train_sizes,
            np.array(val_scores) - np.array(val_std),
            np.array(val_scores) + np.array(val_std),
            alpha=0.1,
            color="red",
        )

        plt.xlabel("Training Set Size")
        plt.ylabel("Accuracy Score")
        plt.title(f"Learning Curve - {self.__class__.__name__}")
        plt.legend(loc="best")
        plt.grid(True, alpha=0.3)
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close()
            return save_path
        else:
            plt.show()
            return ""

    def plot_training_history(self, save_path: Optional[str] = None) -> str:
        """Plot training history for neural networks.

        Returns:
            ``save_path`` when the figure was written to disk, otherwise ``""``
            (including when no training history is available).
        """
        if not self.training_history:
            logging.warning("No training history available")
            return ""

        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # Plot accuracy (left panel).
        if "accuracy" in self.training_history:
            axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
            if "val_accuracy" in self.training_history:
                axes[0].plot(self.training_history["val_accuracy"], label="Validation Accuracy")
            axes[0].set_title("Model Accuracy")
            axes[0].set_xlabel("Epoch")
            axes[0].set_ylabel("Accuracy")
            axes[0].legend()
            axes[0].grid(True, alpha=0.3)

        # Plot loss (right panel).
        if "loss" in self.training_history:
            axes[1].plot(self.training_history["loss"], label="Training Loss")
            if "val_loss" in self.training_history:
                axes[1].plot(self.training_history["val_loss"], label="Validation Loss")
            axes[1].set_title("Model Loss")
            axes[1].set_xlabel("Epoch")
            axes[1].set_ylabel("Loss")
            axes[1].legend()
            axes[1].grid(True, alpha=0.3)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close()
            return save_path
        else:
            plt.show()
            return ""