refactoring: add initial pipeline configuration and model classes
This commit is contained in:
@@ -0,0 +1,250 @@
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any, Optional, List
|
||||
|
||||
import joblib
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from research.experiment import ExperimentConfig
|
||||
|
||||
|
||||
class BaseModel(ABC):
    """Abstract base class for all models.

    Concrete subclasses supply the architecture-specific pieces
    (``architecture``, ``prepare_features``, ``fit``, ``cross_validate`` and
    ``generate_learning_curve``); this class provides prediction, feature
    importance, persistence and plotting on top of them.

    Attributes set in ``__init__``:
        config: The ``ExperimentConfig`` the model was created with.
        model: The underlying estimator; ``None`` until a subclass sets it.
        feature_extractor: Object exposing ``extract_features(X)``.
        label_encoder: Object exposing ``inverse_transform(indices)``.
        tokenizer: Tokenizer for neural models, ``None`` otherwise.
        is_fitted: Guards ``predict`` / ``predict_proba``.
        training_history: Training history used for learning curves.
        learning_curve_data: Learning curve experiment data.
    """

    # Last-resort number of generated feature names, used only when the
    # fitted model exposes nothing that reveals its real feature count.
    _DEFAULT_FEATURE_COUNT = 100

    def __init__(self, config: ExperimentConfig):
        """Store ``config`` and initialise all state to its unfitted defaults."""
        self.config = config
        self.model = None
        self.feature_extractor = None
        self.label_encoder = None
        self.tokenizer = None  # For neural models
        self.is_fitted = False
        self.training_history = {}  # Store training history for learning curves
        self.learning_curve_data = {}  # Store learning curve experiment data

    @property
    @abstractmethod
    def architecture(self) -> str:
        """Return the architecture type: 'neural_network', 'traditional', or 'ensemble'"""
        pass

    @abstractmethod
    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Prepare features for training/prediction"""
        pass

    @abstractmethod
    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the model - implemented differently for each architecture"""
        pass

    @abstractmethod
    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> Dict[str, float] | dict[str, np.floating[Any]]:
        """Perform cross-validation and return average scores"""
        pass

    @abstractmethod
    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        pass

    def _transform_inputs(self, X: pd.DataFrame) -> np.ndarray:
        """Run feature extraction and preparation shared by the predict methods.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        features_df = self.feature_extractor.extract_features(X)
        return self.prepare_features(features_df)

    @staticmethod
    def _to_class_indices(predictions: Any) -> Any:
        """Collapse raw model output into one class index per sample.

        Output without a ``shape`` or with a single dimension is returned
        unchanged. Multi-column output is reduced with ``argmax`` over axis 1.
        A single-column ``(n, 1)`` output is special-cased because ``argmax``
        over one column is always 0: floating values are thresholded at 0.5
        (assumes a sigmoid-style probability of the positive class -- TODO
        confirm against the neural subclasses), integer values are flattened.
        """
        shape = getattr(predictions, "shape", None)
        if shape is None or len(shape) <= 1:
            return predictions

        if len(shape) == 2 and shape[1] == 1:
            column = np.asarray(predictions).reshape(-1)
            if np.issubdtype(column.dtype, np.floating):
                return (column > 0.5).astype(int)
            return column

        # Neural network outputs (probabilities)
        return predictions.argmax(axis=1)

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Make predictions and decode them with the label encoder.

        Args:
            X: Raw input rows; passed to the feature extractor first.

        Returns:
            The result of ``label_encoder.inverse_transform`` applied to the
            predicted class indices.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        X_prepared = self._transform_inputs(X)
        class_indices = self._to_class_indices(self.model.predict(X_prepared))
        return self.label_encoder.inverse_transform(class_indices)

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """Get prediction probabilities if supported.

        Uses the model's ``predict_proba`` when present; otherwise accepts the
        output of ``predict`` if it is two-dimensional with more than one
        column.

        Raises:
            ValueError: If the model has not been fitted yet.
            NotImplementedError: If neither source yields probabilities.
        """
        X_prepared = self._transform_inputs(X)

        if hasattr(self.model, "predict_proba"):
            return self.model.predict_proba(X_prepared)

        if hasattr(self.model, "predict"):
            # For neural networks that return probabilities directly
            probabilities = self.model.predict(X_prepared)
            # np.ndim / np.shape also cope with list outputs, which have no .shape
            if np.ndim(probabilities) == 2 and np.shape(probabilities)[1] > 1:
                return probabilities

        raise NotImplementedError("Model does not support probability predictions")

    @staticmethod
    def _first_row_magnitudes(coef: Any) -> np.ndarray:
        """Return absolute coefficients of the first row of ``coef``.

        ``np.atleast_2d`` lets a 1-D ``coef_`` be treated as a single row
        instead of being indexed down to a scalar.
        """
        return np.abs(np.atleast_2d(coef)[0])

    @staticmethod
    def _as_importance_dict(names: Any, values: Any) -> Dict[str, float]:
        """Pair names with values as plain ``str`` -> ``float`` entries."""
        return {str(name): float(value) for name, value in zip(names, values)}

    def get_feature_importance(self) -> Optional[Dict[str, float]]:
        """Get feature importance if supported by the model.

        Checked in order: ``feature_importances_``, ``coef_`` (absolute value
        of the first row), then a pipeline-like model whose ``named_steps``
        contains a ``"classifier"`` with ``coef_`` and a ``"vectorizer"`` with
        ``get_feature_names_out`` (only the 20 largest coefficients are kept).

        Returns:
            Mapping of feature name to importance, or ``None`` when the model
            exposes none of the above.
        """
        model = self.model

        if hasattr(model, "feature_importances_"):
            # For tree-based models
            return self._as_importance_dict(
                self._get_feature_names(), model.feature_importances_
            )

        if hasattr(model, "coef_"):
            # For linear models
            return self._as_importance_dict(
                self._get_feature_names(), self._first_row_magnitudes(model.coef_)
            )

        if hasattr(model, "named_steps") and "classifier" in model.named_steps:
            # For sklearn pipelines (like LogisticRegression with vectorizer)
            steps = model.named_steps
            classifier = steps["classifier"]
            # A pipeline without a "vectorizer" step falls through to None
            # rather than raising KeyError.
            vectorizer = steps["vectorizer"] if "vectorizer" in steps else None
            if hasattr(classifier, "coef_") and hasattr(
                vectorizer, "get_feature_names_out"
            ):
                coefficients = self._first_row_magnitudes(classifier.coef_)
                feature_names = np.asarray(vectorizer.get_feature_names_out())
                # Take top features to avoid too many n-grams
                top_indices = np.argsort(coefficients)[-20:]
                return self._as_importance_dict(
                    feature_names[top_indices], coefficients[top_indices]
                )

        return None

    def _infer_feature_count(self) -> int:
        """Best-effort feature count for generating placeholder names.

        Prefers the length of the arrays that ``get_feature_importance`` pairs
        the names with, then ``n_features_in_``, then the class default.
        """
        model = self.model

        importances = getattr(model, "feature_importances_", None)
        if importances is not None:
            return len(importances)

        coef = getattr(model, "coef_", None)
        if coef is not None:
            return int(np.atleast_1d(coef).shape[-1])

        declared = getattr(model, "n_features_in_", None)
        if declared is not None:
            return int(declared)

        return self._DEFAULT_FEATURE_COUNT

    def _get_feature_names(self) -> List[str]:
        """Get feature names (override in subclasses if needed).

        Returns the model's ``feature_names_in_`` when present. Otherwise
        generates ``feature_<i>`` placeholders sized to the model, so that
        importances beyond the first 100 features are not silently dropped.
        """
        if hasattr(self.model, "feature_names_in_"):
            return list(self.model.feature_names_in_)
        return [f"feature_{i}" for i in range(self._infer_feature_count())]

    def save(self, path: str):
        """Save the complete model with training history.

        Everything is bundled into one dict and written with ``joblib.dump``;
        the config is stored via its ``to_dict()`` form.

        Args:
            path: Destination file path.
        """
        model_data = {
            "model": self.model,
            "feature_extractor": self.feature_extractor,
            "label_encoder": self.label_encoder,
            "tokenizer": self.tokenizer,
            "config": self.config.to_dict(),
            "is_fitted": self.is_fitted,
            "training_history": self.training_history,
            "learning_curve_data": self.learning_curve_data,
        }
        joblib.dump(model_data, path)

    @classmethod
    def load(cls, path: str) -> "BaseModel":
        """Load a saved model with training history.

        Args:
            path: File previously written by ``save``.

        Returns:
            A new instance of ``cls`` with the stored state restored.
        """
        # SECURITY: joblib.load unpickles arbitrary objects and can execute
        # code embedded in the file -- only load files from trusted sources.
        model_data = joblib.load(path)

        # Recreate the model instance
        from research.experiment import ExperimentConfig

        config = ExperimentConfig.from_dict(model_data["config"])
        instance = cls(config)

        # Restore state; the .get() keys tolerate files that lack them.
        instance.model = model_data["model"]
        instance.feature_extractor = model_data["feature_extractor"]
        instance.label_encoder = model_data["label_encoder"]
        instance.tokenizer = model_data.get("tokenizer")
        instance.is_fitted = model_data["is_fitted"]
        instance.training_history = model_data.get("training_history", {})
        instance.learning_curve_data = model_data.get("learning_curve_data", {})

        return instance

    def plot_learning_curve(self, save_path: Optional[str] = None) -> str:
        """Plot and save learning curve.

        Reads ``train_sizes``, ``train_scores`` and ``val_scores`` from
        ``self.learning_curve_data``; ``train_scores_std`` and
        ``val_scores_std`` are optional and default to zeros.

        Args:
            save_path: If given, the figure is written there and closed;
                otherwise it is shown with ``plt.show()``.

        Returns:
            ``save_path`` when the figure was saved, otherwise ``""``.
        """
        if not self.learning_curve_data:
            logging.warning("No learning curve data available")
            return ""

        # Read every required key before creating the figure so that a
        # missing key cannot leave an orphaned figure behind.
        data = self.learning_curve_data
        train_sizes = data["train_sizes"]
        zeros = [0] * len(train_sizes)
        series = (
            (
                data["train_scores"],
                data.get("train_scores_std", zeros),
                "blue",
                "Training Score",
            ),
            (
                data["val_scores"],
                data.get("val_scores_std", zeros),
                "red",
                "Validation Score",
            ),
        )

        fig, ax = plt.subplots(figsize=(10, 6))

        # Plot learning curves: mean line plus a +/- one-std band per series
        for scores, spread, color, label in series:
            centre = np.array(scores)
            band = np.array(spread)
            ax.plot(train_sizes, scores, "o-", color=color, label=label)
            ax.fill_between(
                train_sizes,
                centre - band,
                centre + band,
                alpha=0.1,
                color=color,
            )

        ax.set_xlabel("Training Set Size")
        ax.set_ylabel("Accuracy Score")
        ax.set_title(f"Learning Curve - {self.__class__.__name__}")
        ax.legend(loc="best")
        ax.grid(True, alpha=0.3)
        fig.tight_layout()

        if save_path:
            fig.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close(fig)
            return save_path

        plt.show()
        return ""

    def plot_training_history(self, save_path: Optional[str] = None) -> str:
        """Plot training history for neural networks.

        Draws two panels from ``self.training_history``: accuracy
        (``accuracy`` / ``val_accuracy``) and loss (``loss`` / ``val_loss``).
        Keys that are absent are skipped.

        Args:
            save_path: If given, the figure is written there and closed;
                otherwise it is shown with ``plt.show()``.

        Returns:
            ``save_path`` when the figure was saved, otherwise ``""``.
        """
        if not self.training_history:
            logging.warning("No training history available")
            return ""

        history = self.training_history
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # One panel per metric: (axis, history key, display name)
        panels = ((axes[0], "accuracy", "Accuracy"), (axes[1], "loss", "Loss"))
        for ax, key, display in panels:
            if key in history:
                ax.plot(history[key], label=f"Training {display}")
            if f"val_{key}" in history:
                ax.plot(history[f"val_{key}"], label=f"Validation {display}")
            ax.set_title(f"Model {display}")
            ax.set_xlabel("Epoch")
            ax.set_ylabel(display)
            # legend() warns when nothing labelled was drawn on the axis
            if ax.get_legend_handles_labels()[0]:
                ax.legend()
            ax.grid(True, alpha=0.3)

        fig.tight_layout()

        if save_path:
            fig.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close(fig)
            return save_path

        plt.show()
        return ""
|
||||
Reference in New Issue
Block a user