refactoring: add initial pipeline configuration and model classes
This commit is contained in:
@@ -0,0 +1,250 @@
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any, Optional, List
|
||||
|
||||
import joblib
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from research.experiment import ExperimentConfig
|
||||
|
||||
|
||||
class BaseModel(ABC):
    """Abstract base class for all models.

    Subclasses implement the architecture-specific pieces (feature
    preparation, fitting, cross-validation, learning-curve generation);
    this base class provides the shared prediction, persistence and
    plotting behaviour on top of a fitted ``self.model``.
    """

    def __init__(self, config: ExperimentConfig):
        self.config = config
        self.model = None  # underlying estimator/network, set by fit()
        self.feature_extractor = None  # turns raw input into a feature DataFrame; set during fit()
        self.label_encoder = None  # maps class labels <-> integer codes; set during fit()
        self.tokenizer = None  # For neural models
        self.is_fitted = False
        self.training_history = {}  # Store training history for learning curves
        self.learning_curve_data = {}  # Store learning curve experiment data

    @property
    @abstractmethod
    def architecture(self) -> str:
        """Return the architecture type: 'neural_network', 'traditional', or 'ensemble'"""
        pass

    @abstractmethod
    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Prepare features for training/prediction"""
        pass

    @abstractmethod
    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the model - implemented differently for each architecture"""
        pass

    @abstractmethod
    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> Dict[str, float] | dict[str, np.floating[Any]]:
        """Perform cross-validation and return average scores"""
        pass

    @abstractmethod
    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        pass

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Make predictions and return them as original class labels.

        Raises:
            ValueError: if called before fit().
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        predictions = self.model.predict(X_prepared)

        # Handle different prediction formats
        if hasattr(predictions, "shape") and len(predictions.shape) > 1:
            # Neural network outputs (probabilities)
            predictions = predictions.argmax(axis=1)

        # Map integer codes back to the original class labels.
        return self.label_encoder.inverse_transform(predictions)

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """Get prediction probabilities if supported.

        Raises:
            ValueError: if called before fit().
            NotImplementedError: if the underlying model exposes neither
                predict_proba nor a 2-D probability output from predict().
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        if hasattr(self.model, "predict_proba"):
            return self.model.predict_proba(X_prepared)
        elif hasattr(self.model, "predict"):
            # For neural networks that return probabilities directly
            probabilities = self.model.predict(X_prepared)
            if len(probabilities.shape) == 2 and probabilities.shape[1] > 1:
                return probabilities

        raise NotImplementedError("Model does not support probability predictions")

    def get_feature_importance(self) -> Optional[Dict[str, float]]:
        """Get feature importance if supported by the model.

        Returns a {feature_name: importance} mapping, or None when the
        underlying model exposes no importance information.
        """

        if hasattr(self.model, "feature_importances_"):
            # For tree-based models
            importances = self.model.feature_importances_
            feature_names = self._get_feature_names()
            return dict(zip(feature_names, importances))

        elif hasattr(self.model, "coef_"):
            # For linear models: use absolute coefficient magnitude of the
            # first output as the importance score.
            coefficients = np.abs(self.model.coef_[0])
            feature_names = self._get_feature_names()
            return dict(zip(feature_names, coefficients))

        elif hasattr(self.model, "named_steps") and "classifier" in self.model.named_steps:
            # For sklearn pipelines (like LogisticRegression with vectorizer)
            classifier = self.model.named_steps["classifier"]
            if hasattr(classifier, "coef_"):
                coefficients = np.abs(classifier.coef_[0])
                if hasattr(self.model.named_steps["vectorizer"], "get_feature_names_out"):
                    feature_names = self.model.named_steps["vectorizer"].get_feature_names_out()
                    # Take top features to avoid too many n-grams
                    top_indices = np.argsort(coefficients)[-20:]
                    return dict(zip(feature_names[top_indices], coefficients[top_indices]))

        return None

    def _get_feature_names(self) -> List[str]:
        """Get feature names (override in subclasses if needed)"""
        if hasattr(self.model, "feature_names_in_"):
            return list(self.model.feature_names_in_)
        # NOTE(review): hard-coded cap of 100 placeholder names — assumes the
        # feature matrix has at most 100 columns; subclasses should override.
        return [f"feature_{i}" for i in range(100)]  # Default fallback

    def save(self, path: str):
        """Save the complete model with training history.

        Persists the estimator plus all companion state (feature extractor,
        encoders, config, histories) in a single joblib archive so load()
        can fully restore the instance.
        """

        model_data = {
            "model": self.model,
            "feature_extractor": self.feature_extractor,
            "label_encoder": self.label_encoder,
            "tokenizer": self.tokenizer,
            "config": self.config.to_dict(),
            "is_fitted": self.is_fitted,
            "training_history": self.training_history,
            "learning_curve_data": self.learning_curve_data,
        }
        joblib.dump(model_data, path)

    @classmethod
    def load(cls, path: str) -> "BaseModel":
        """Load a saved model with training history.

        Counterpart of save(); rebuilds an instance of the calling class
        from the joblib archive at *path*.
        """
        model_data = joblib.load(path)

        # Recreate the model instance
        from research.experiment import ExperimentConfig

        config = ExperimentConfig.from_dict(model_data["config"])
        instance = cls(config)

        # Restore state; .get() tolerates archives written by older versions
        # that lack the optional keys.
        instance.model = model_data["model"]
        instance.feature_extractor = model_data["feature_extractor"]
        instance.label_encoder = model_data["label_encoder"]
        instance.tokenizer = model_data.get("tokenizer")
        instance.is_fitted = model_data["is_fitted"]
        instance.training_history = model_data.get("training_history", {})
        instance.learning_curve_data = model_data.get("learning_curve_data", {})

        return instance

    def plot_learning_curve(self, save_path: Optional[str] = None) -> str:
        """Plot and save learning curve.

        Returns the save path when a file was written, otherwise "" (the
        figure is shown interactively, or no data was available).
        """

        if not self.learning_curve_data:
            logging.warning("No learning curve data available")
            return ""

        plt.figure(figsize=(10, 6))

        data = self.learning_curve_data
        train_sizes = data["train_sizes"]
        train_scores = data["train_scores"]
        val_scores = data["val_scores"]
        # Std-dev bands are optional; default to zero width when absent.
        train_std = data.get("train_scores_std", [0] * len(train_sizes))
        val_std = data.get("val_scores_std", [0] * len(train_sizes))

        # Plot learning curves with +/- 1 std shaded bands
        plt.plot(train_sizes, train_scores, "o-", color="blue", label="Training Score")
        plt.fill_between(
            train_sizes,
            np.array(train_scores) - np.array(train_std),
            np.array(train_scores) + np.array(train_std),
            alpha=0.1,
            color="blue",
        )

        plt.plot(train_sizes, val_scores, "o-", color="red", label="Validation Score")
        plt.fill_between(
            train_sizes,
            np.array(val_scores) - np.array(val_std),
            np.array(val_scores) + np.array(val_std),
            alpha=0.1,
            color="red",
        )

        plt.xlabel("Training Set Size")
        plt.ylabel("Accuracy Score")
        plt.title(f"Learning Curve - {self.__class__.__name__}")
        plt.legend(loc="best")
        plt.grid(True, alpha=0.3)
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close()
            return save_path
        else:
            plt.show()
            return ""

    def plot_training_history(self, save_path: Optional[str] = None) -> str:
        """Plot training history for neural networks.

        Draws per-epoch accuracy and loss (train and, when present,
        validation) side by side. Returns the save path when a file was
        written, otherwise "".
        """
        if not self.training_history:
            logging.warning("No training history available")
            return ""

        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # Plot accuracy
        if "accuracy" in self.training_history:
            axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
            if "val_accuracy" in self.training_history:
                axes[0].plot(self.training_history["val_accuracy"], label="Validation Accuracy")
            axes[0].set_title("Model Accuracy")
            axes[0].set_xlabel("Epoch")
            axes[0].set_ylabel("Accuracy")
            axes[0].legend()
            axes[0].grid(True, alpha=0.3)

        # Plot loss
        if "loss" in self.training_history:
            axes[1].plot(self.training_history["loss"], label="Training Loss")
            if "val_loss" in self.training_history:
                axes[1].plot(self.training_history["val_loss"], label="Validation Loss")
            axes[1].set_title("Model Loss")
            axes[1].set_xlabel("Epoch")
            axes[1].set_ylabel("Loss")
            axes[1].legend()
            axes[1].grid(True, alpha=0.3)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close()
            return save_path
        else:
            plt.show()
            return ""
|
||||
@@ -0,0 +1,91 @@
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from enum import Enum
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
import numpy as np
|
||||
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
|
||||
|
||||
from .feature_extractor import FeatureType
|
||||
|
||||
|
||||
@dataclass
class ExperimentConfig:
    """Configuration for a single experiment.

    Captures everything needed to reproduce one run: model choice and
    hyper-parameters, feature set, optional data filters, the train/test
    split parameters and the metrics to report.
    """

    # Experiment metadata
    name: str
    description: str = ""
    tags: List[str] = field(default_factory=list)

    # Model configuration
    model_type: str = "logistic_regression"  # logistic_regression, lstm, transformer, etc.
    model_params: Dict[str, Any] = field(default_factory=dict)

    # Feature configuration
    features: List[FeatureType] = field(default_factory=lambda: [FeatureType.FULL_NAME])
    feature_params: Dict[str, Any] = field(default_factory=dict)

    # Data configuration
    train_data_filter: Optional[Dict[str, Any]] = None  # Filter criteria for training data
    test_data_filter: Optional[Dict[str, Any]] = None
    target_column: str = "sex"

    # Training configuration
    test_size: float = 0.2
    random_seed: int = 42
    cross_validation_folds: int = 5

    # Evaluation configuration
    metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"])

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a serializable dictionary.

        asdict() leaves enum members untouched, so the feature enums are
        replaced by their string values to keep the result JSON-friendly.
        """
        result = asdict(self)
        # Convert enums to strings
        result["features"] = [f.value for f in self.features]
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentConfig":
        """Create a config from a dictionary produced by to_dict().

        The input mapping is left unmodified (a shallow copy is taken
        before feature strings are converted back to FeatureType members).
        """
        data = dict(data)  # do not mutate the caller's dictionary
        if "features" in data:
            data["features"] = [FeatureType(f) for f in data["features"]]
        return cls(**data)
|
||||
|
||||
|
||||
class ExperimentStatus(Enum):
    """Experiment execution status (lifecycle of a tracked run)."""

    PENDING = "pending"  # created but not yet started
    RUNNING = "running"  # currently executing
    COMPLETED = "completed"  # finished successfully
    FAILED = "failed"  # raised an error during execution
    CANCELLED = "cancelled"  # stopped before completion
|
||||
|
||||
|
||||
def calculate_metrics(
    y_true: np.ndarray, y_pred: np.ndarray, metrics: Optional[List[str]] = None
) -> Dict[str, float]:
    """Calculate the specified classification metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with y_true.
        metrics: Metric names to compute; any of "accuracy", "precision",
            "recall", "f1". Defaults to all four when None.

    Returns:
        Mapping of metric name to score. Precision/recall/f1 use the
        weighted average across classes. Unknown metric names are ignored.
    """

    if metrics is None:
        metrics = ["accuracy", "precision", "recall", "f1"]

    results = {}

    if "accuracy" in metrics:
        results["accuracy"] = accuracy_score(y_true, y_pred)

    # precision_recall_fscore_support computes all three scores in one pass,
    # so call it once when any of them is requested.
    if any(m in metrics for m in ["precision", "recall", "f1"]):
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="weighted"
        )

        if "precision" in metrics:
            results["precision"] = precision
        if "recall" in metrics:
            results["recall"] = recall
        if "f1" in metrics:
            results["f1"] = f1

    return results
|
||||
@@ -0,0 +1,56 @@
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, List, Any
|
||||
|
||||
from research.experiment import ExperimentConfig, ExperimentStatus
|
||||
|
||||
|
||||
@dataclass
class ExperimentResult:
    """Results from an experiment execution.

    Combines the originating config with execution metadata, saved-artifact
    paths, computed metrics and data statistics, and supports round-tripping
    through a JSON-friendly dictionary via to_dict()/from_dict().
    """

    experiment_id: str
    config: ExperimentConfig

    # Execution metadata
    start_time: datetime
    end_time: Optional[datetime] = None
    status: ExperimentStatus = ExperimentStatus.PENDING
    error_message: Optional[str] = None

    # Model artifacts
    model_path: Optional[str] = None
    feature_extractor_path: Optional[str] = None

    # Metrics
    train_metrics: Dict[str, float] = field(default_factory=dict)
    test_metrics: Dict[str, float] = field(default_factory=dict)
    cv_metrics: Dict[str, float] = field(default_factory=dict)

    # Additional results
    confusion_matrix: Optional[List[List[int]]] = None
    feature_importance: Optional[Dict[str, float]] = None
    prediction_examples: Optional[List[Dict]] = None

    # Data statistics
    train_size: int = 0
    test_size: int = 0
    class_distribution: Dict[str, int] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a JSON-serializable dictionary.

        Overrides the fields asdict() cannot serialize directly: the nested
        config, both datetimes (ISO-8601 strings) and the status enum.
        """
        result = asdict(self)
        result["config"] = self.config.to_dict()
        result["start_time"] = self.start_time.isoformat()
        result["end_time"] = self.end_time.isoformat() if self.end_time else None
        result["status"] = self.status.value
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentResult":
        """Create a result from a dictionary produced by to_dict().

        The input mapping is left unmodified (a shallow copy is taken before
        the serialized fields are converted back to rich objects).
        """
        data = dict(data)  # do not mutate the caller's dictionary
        data["config"] = ExperimentConfig.from_dict(data["config"])
        data["start_time"] = datetime.fromisoformat(data["start_time"])
        data["end_time"] = datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
        data["status"] = ExperimentStatus(data["status"])
        return cls(**data)
|
||||
@@ -0,0 +1,123 @@
|
||||
from typing import List
|
||||
|
||||
from research.experiment import ExperimentConfig
|
||||
from research.experiment.feature_extractor import FeatureType
|
||||
|
||||
|
||||
class ExperimentBuilder:
    """Helper class to build experiment configurations.

    Each factory method returns a ready-to-run list of ExperimentConfig
    objects for one study (baselines, ablation, name components,
    province-specific runs).
    """

    @staticmethod
    def create_baseline_experiments() -> List[ExperimentConfig]:
        """Create a set of baseline experiments for comparison"""

        return [
            # Full name experiments
            ExperimentConfig(
                name="baseline_logistic_regression_fullname",
                description="Logistic regression with full name",
                model_type="logistic_regression",
                features=[FeatureType.FULL_NAME],
                tags=["baseline", "fullname"],
            ),
            # Native name only
            ExperimentConfig(
                name="baseline_logistic_regression_native",
                description="Logistic regression with native name only",
                model_type="logistic_regression",
                features=[FeatureType.NATIVE_NAME],
                tags=["baseline", "native"],
            ),
            # Surname only
            ExperimentConfig(
                name="baseline_logistic_regression_surname",
                description="Logistic regression with surname only",
                model_type="logistic_regression",
                features=[FeatureType.SURNAME],
                tags=["baseline", "surname"],
            ),
            # Random Forest with engineered features
            ExperimentConfig(
                name="baseline_rf_engineered",
                description="Random Forest with engineered features",
                model_type="random_forest",
                features=[FeatureType.NAME_LENGTH, FeatureType.WORD_COUNT, FeatureType.PROVINCE],
                tags=["baseline", "engineered"],
            ),
        ]

    @staticmethod
    def create_feature_ablation_study() -> List[ExperimentConfig]:
        """Create experiments for feature ablation study.

        Produces one experiment per base feature, each trained with that
        single feature removed, to measure its marginal contribution.
        """
        base_features = [
            FeatureType.FULL_NAME,
            FeatureType.NAME_LENGTH,
            FeatureType.WORD_COUNT,
            FeatureType.PROVINCE,
        ]

        experiments = []

        # Test removing each feature one by one
        for feature_to_remove in base_features:
            remaining_features = [f for f in base_features if f != feature_to_remove]

            experiments.append(
                ExperimentConfig(
                    name=f"ablation_remove_{feature_to_remove.value}",
                    description=f"Ablation study: removed {feature_to_remove.value}",
                    model_type="logistic_regression",
                    features=remaining_features,
                    tags=["ablation", feature_to_remove.value],
                )
            )

        return experiments

    @staticmethod
    def create_name_component_study() -> List[ExperimentConfig]:
        """Create experiments to study different name components.

        One single-feature experiment per name component, to compare their
        individual predictive power.
        """
        experiments = []

        name_components = [
            (FeatureType.FIRST_WORD, "first_word"),
            (FeatureType.LAST_WORD, "last_word"),
            (FeatureType.NATIVE_NAME, "native_name"),
            (FeatureType.SURNAME, "surname"),
            (FeatureType.NAME_BEGINNINGS, "name_beginnings"),
            (FeatureType.NAME_ENDINGS, "name_endings"),
        ]

        for feature, name in name_components:
            experiments.append(
                ExperimentConfig(
                    name=f"component_study_{name}",
                    description=f"Study of {name} for gender prediction",
                    model_type="logistic_regression",
                    features=[feature],
                    tags=["component_study", name],
                )
            )

        return experiments

    @staticmethod
    def create_province_specific_study() -> List[ExperimentConfig]:
        """Create experiments for province-specific analysis.

        Each experiment restricts the training data to one province via
        train_data_filter.
        """
        provinces = ["kinshasa", "bas-congo", "bandundu", "katanga"]  # Add more as needed

        experiments = []

        for province in provinces:
            experiments.append(
                ExperimentConfig(
                    name=f"province_study_{province}",
                    description=f"Gender prediction for {province} province only",
                    model_type="logistic_regression",
                    features=[FeatureType.FULL_NAME],
                    train_data_filter={"province": province},
                    tags=["province_study", province],
                )
            )

        return experiments
|
||||
@@ -0,0 +1,238 @@
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.metrics import confusion_matrix
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from core.config import PipelineConfig
|
||||
from core.utils import get_data_file_path
|
||||
from core.utils.data_loader import DataLoader
|
||||
from research.base_model import BaseModel
|
||||
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
|
||||
from research.experiment.experiment_tracker import ExperimentTracker
|
||||
from research.model_registry import create_model
|
||||
|
||||
|
||||
class ExperimentRunner:
    """Runs and manages experiments.

    Orchestrates one experiment end to end: data loading/filtering, the
    train/test split, model training, evaluation, analysis artifacts and
    result tracking via ExperimentTracker.
    """

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.tracker = ExperimentTracker(self.config)
        self.data_loader = DataLoader(self.config)

    def run_experiment(self, experiment_config: ExperimentConfig) -> str:
        """Run a single experiment and return its experiment ID.

        On failure the experiment is marked FAILED in the tracker and the
        original exception is re-raised.
        """
        # Create experiment
        experiment_id = self.tracker.create_experiment(experiment_config)

        try:
            logging.info(f"Starting experiment: {experiment_id}")
            self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)

            # Load data
            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
            df = self.data_loader.load_csv_complete(data_path)

            # Apply data filters if specified
            df = self._apply_data_filters(df, experiment_config)

            # Prepare target variable; the full frame is passed as X and the
            # model's feature extractor selects the columns it needs.
            y = df[experiment_config.target_column]
            X = df

            # Split data (stratified so class balance is preserved)
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=experiment_config.test_size,
                random_state=experiment_config.random_seed,
                stratify=y,
            )

            # Create and train model
            model = create_model(experiment_config)
            model.fit(X_train, y_train)

            # Make predictions
            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)

            # Calculate metrics
            train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics)
            test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics)

            # Cross-validation if requested
            cv_metrics = {}
            if experiment_config.cross_validation_folds > 1:
                cv_metrics = model.cross_validate(
                    X_train, y_train, experiment_config.cross_validation_folds
                )

            # Additional analysis
            conf_matrix = confusion_matrix(y_test, test_pred).tolist()
            feature_importance = model.get_feature_importance()

            # Create prediction examples
            prediction_examples = self._create_prediction_examples(
                X_test, y_test, test_pred, model, n_examples=10
            )

            # Calculate class distribution
            class_distribution = y.value_counts().to_dict()

            # Save model
            model_path = self._save_model(model, experiment_id)

            # Update experiment with results
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.COMPLETED,
                end_time=datetime.now(),
                model_path=str(model_path),
                train_metrics=train_metrics,
                test_metrics=test_metrics,
                cv_metrics=cv_metrics,
                confusion_matrix=conf_matrix,
                feature_importance=feature_importance,
                prediction_examples=prediction_examples,
                train_size=len(X_train),
                test_size=len(X_test),
                class_distribution=class_distribution,
            )

            logging.info(f"Experiment {experiment_id} completed successfully")
            # Only apply the float format when the metric exists; formatting
            # the fallback string 'N/A' with ':.4f' would raise ValueError.
            accuracy = test_metrics.get("accuracy")
            logging.info(
                "Test accuracy: %s",
                f"{accuracy:.4f}" if accuracy is not None else "N/A",
            )

            return experiment_id

        except Exception as e:
            logging.error(f"Experiment {experiment_id} failed: {str(e)}")
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.FAILED,
                end_time=datetime.now(),
                error_message=str(e),
            )
            raise

    def run_experiment_batch(self, experiments: List[ExperimentConfig]) -> List[str]:
        """Run multiple experiments, skipping (but logging) failures.

        Returns the IDs of the experiments that completed successfully.
        """
        experiment_ids = []

        for i, config in enumerate(experiments):
            logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}")
            try:
                exp_id = self.run_experiment(config)
                experiment_ids.append(exp_id)
            except Exception as e:
                logging.error(f"Failed to run experiment {config.name}: {e}")
                continue

        return experiment_ids

    @classmethod
    def _apply_data_filters(cls, df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame:
        """Apply data filters specified in the experiment config.

        Supports per-column criteria as a list (membership), a dict with
        "min"/"max" bounds, or a scalar (equality). Columns missing from the
        frame are ignored.

        NOTE(review): only config.train_data_filter is applied, and it is
        applied to the whole frame before the train/test split;
        config.test_data_filter is currently unused — confirm intended.
        """
        filtered_df = df.copy()

        # Apply training data filters
        if config.train_data_filter:
            for column, criteria in config.train_data_filter.items():
                if column in filtered_df.columns:
                    if isinstance(criteria, list):
                        filtered_df = filtered_df[filtered_df[column].isin(criteria)]
                    elif isinstance(criteria, dict):
                        if "min" in criteria:
                            filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
                        if "max" in criteria:
                            filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
                    else:
                        filtered_df = filtered_df[filtered_df[column] == criteria]

        return filtered_df

    @classmethod
    def _create_prediction_examples(
        cls,
        X_test: pd.DataFrame,
        y_test: pd.Series,
        predictions: np.ndarray,
        model: BaseModel,
        n_examples: int = 10,
    ) -> List[Dict]:
        """Create prediction examples for analysis.

        Samples up to n_examples rows, half misclassified and half correct,
        recording the name, true/predicted labels and (for traditional
        models) the prediction confidence.
        """
        examples = []

        # Get both correct and incorrect predictions
        correct_mask = y_test == predictions
        incorrect_indices = X_test[~correct_mask].index[: n_examples // 2]
        correct_indices = X_test[correct_mask].index[: n_examples // 2]

        sample_indices = list(incorrect_indices) + list(correct_indices)

        for idx in sample_indices[:n_examples]:
            example = {
                "name": X_test.loc[idx, "name"] if "name" in X_test.columns else "N/A",
                "true_label": y_test.loc[idx],
                # predictions is positional, so translate the label index
                # back to its position in X_test.
                "predicted_label": predictions[X_test.index.get_loc(idx)],
                "correct": y_test.loc[idx] == predictions[X_test.index.get_loc(idx)],
            }

            # Add probability if available
            if model.architecture == "traditional":
                proba = model.predict_proba(X_test.loc[[idx]])
                example["prediction_confidence"] = float(proba.max())

            examples.append(example)

        return examples

    def _save_model(self, model: BaseModel, experiment_id: str) -> Path:
        """Save the trained model under the experiment's model directory."""
        model_dir = self.config.paths.models_dir / "experiments" / experiment_id
        model_dir.mkdir(parents=True, exist_ok=True)

        model_path = model_dir / "model.joblib"
        model.save(str(model_path))

        return model_path

    def load_experiment_model(self, experiment_id: str) -> Optional[BaseModel]:
        """Load the model of a completed experiment, or None if unavailable."""
        experiment = self.tracker.get_experiment(experiment_id)

        if experiment and experiment.model_path:
            return BaseModel.load(experiment.model_path)

        return None

    def compare_experiments(
        self, experiment_ids: List[str], metric: str = "accuracy"
    ) -> pd.DataFrame:
        """Compare experiments, sorted by the given test metric when present."""
        comparison_df = self.tracker.compare_experiments(experiment_ids)

        if f"test_{metric}" in comparison_df.columns:
            comparison_df = comparison_df.sort_values(f"test_{metric}", ascending=False)

        return comparison_df

    def get_feature_analysis(self, experiment_id: str) -> Optional[pd.DataFrame]:
        """Get feature importance for an experiment as a sorted DataFrame.

        Returns None when the experiment is unknown or recorded no feature
        importance.
        """
        experiment = self.tracker.get_experiment(experiment_id)

        if experiment and experiment.feature_importance:
            importance_df = pd.DataFrame(
                [
                    {"feature": feature, "importance": importance}
                    for feature, importance in experiment.feature_importance.items()
                ]
            )
            return importance_df.sort_values("importance", ascending=False)

        return None
|
||||
@@ -0,0 +1,194 @@
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core.config import PipelineConfig, get_config
|
||||
|
||||
from research.experiment import ExperimentConfig, ExperimentStatus
|
||||
from research.experiment.experiement_result import ExperimentResult
|
||||
|
||||
|
||||
class ExperimentTracker:
|
||||
"""Tracks and manages experiments"""
|
||||
|
||||
    def __init__(self, config: Optional[PipelineConfig] = None):
        """Initialize the tracker.

        Creates the experiments directory if needed and loads any
        previously saved results from the on-disk JSON database.
        """
        self.config = config or get_config()
        # All experiment artifacts live under <outputs_dir>/experiments.
        self.experiments_dir = self.config.paths.outputs_dir / "experiments"
        self.experiments_dir.mkdir(parents=True, exist_ok=True)

        # A single JSON file serves as the results "database".
        self.results_db_path = self.experiments_dir / "experiments.json"
        self._results: Dict[str, ExperimentResult] = {}
        self._load_results()
|
||||
|
||||
def _load_results(self):
|
||||
"""Load existing experiment results"""
|
||||
if self.results_db_path.exists():
|
||||
try:
|
||||
with open(self.results_db_path, "r") as f:
|
||||
data = json.load(f)
|
||||
|
||||
for exp_id, exp_data in data.items():
|
||||
self._results[exp_id] = ExperimentResult.from_dict(exp_data)
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to load experiment results: {e}")
|
||||
|
||||
def _save_results(self):
|
||||
"""Save experiment results to disk"""
|
||||
data = {exp_id: result.to_dict() for exp_id, result in self._results.items()}
|
||||
|
||||
with open(self.results_db_path, "w") as f:
|
||||
json.dump(data, f, indent=2, default=str)
|
||||
|
||||
def create_experiment(self, config: ExperimentConfig) -> str:
|
||||
"""Create a new experiment and return its ID"""
|
||||
# Generate experiment ID
|
||||
config_hash = hashlib.md5(
|
||||
json.dumps(config.to_dict(), sort_keys=True).encode()
|
||||
).hexdigest()[:8]
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
experiment_id = f"{config.name}_{timestamp}_{config_hash}"
|
||||
|
||||
# Create result object
|
||||
result = ExperimentResult(
|
||||
experiment_id=experiment_id, config=config, start_time=datetime.now()
|
||||
)
|
||||
|
||||
self._results[experiment_id] = result
|
||||
self._save_results()
|
||||
|
||||
return experiment_id
|
||||
|
||||
def update_experiment(self, experiment_id: str, **updates):
|
||||
"""Update an experiment's results"""
|
||||
if experiment_id in self._results:
|
||||
result = self._results[experiment_id]
|
||||
|
||||
for key, value in updates.items():
|
||||
if hasattr(result, key):
|
||||
setattr(result, key, value)
|
||||
|
||||
self._save_results()
|
||||
|
||||
def get_experiment(self, experiment_id: str) -> Optional[ExperimentResult]:
|
||||
"""Get experiment by ID"""
|
||||
return self._results.get(experiment_id)
|
||||
|
||||
def list_experiments(
|
||||
self,
|
||||
status: Optional[ExperimentStatus] = None,
|
||||
tags: Optional[List[str]] = None,
|
||||
model_type: Optional[str] = None,
|
||||
) -> List[ExperimentResult]:
|
||||
"""List experiments with optional filtering"""
|
||||
results = list(self._results.values())
|
||||
|
||||
if status:
|
||||
results = [r for r in results if r.status == status]
|
||||
|
||||
if tags:
|
||||
results = [r for r in results if any(tag in r.config.tags for tag in tags)]
|
||||
|
||||
if model_type:
|
||||
results = [r for r in results if r.config.model_type == model_type]
|
||||
|
||||
return sorted(results, key=lambda x: x.start_time, reverse=True)
|
||||
|
||||
    def get_best_experiment(
        self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
    ) -> Optional[ExperimentResult]:
        """Get the best experiment based on a metric.

        Args:
            metric: Metric name to rank by (e.g. "accuracy").
            dataset: "test" ranks on test metrics; any other value ranks on
                train metrics.
            filters: Optional dict. Supports "model_type" (exact match) and
                "features" (keep experiments sharing at least one feature).

        Returns:
            The COMPLETED experiment with the highest value of *metric*, or
            None when no completed experiment reports that metric.
        """
        experiments = self.list_experiments()

        if filters:
            # Apply additional filters
            if "model_type" in filters:
                experiments = [
                    e for e in experiments if e.config.model_type == filters["model_type"]
                ]
            if "features" in filters:
                experiments = [
                    e
                    for e in experiments
                    if any(f in e.config.features for f in filters["features"])
                ]

        # Only completed experiments that actually report the metric qualify.
        valid_experiments = []
        for exp in experiments:
            if exp.status == ExperimentStatus.COMPLETED:
                metrics_dict = exp.test_metrics if dataset == "test" else exp.train_metrics
                if metric in metrics_dict:
                    valid_experiments.append((exp, metrics_dict[metric]))

        if not valid_experiments:
            return None

        # Pick the (experiment, score) pair with the highest score.
        return max(valid_experiments, key=lambda x: x[1])[0]
|
||||
|
||||
def compare_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
    """Build a side-by-side comparison table for the given experiments.

    Unknown IDs are silently skipped. Test and cross-validation metrics
    are flattened into "test_*" and "cv_*" columns.
    """
    records = []

    for exp_id in experiment_ids:
        exp = self.get_experiment(exp_id)
        if not exp:
            continue

        record = {
            "experiment_id": exp_id,
            "name": exp.config.name,
            "model_type": exp.config.model_type,
            "features": ",".join(f.value for f in exp.config.features),
            "status": exp.status.value,
            "train_size": exp.train_size,
            "test_size": exp.test_size,
        }
        record.update({f"test_{name}": value for name, value in exp.test_metrics.items()})
        record.update({f"cv_{name}": value for name, value in exp.cv_metrics.items()})
        records.append(record)

    return pd.DataFrame(records)
|
||||
|
||||
def export_results(self, output_path: Optional[Path] = None) -> Path:
    """Write every tracked experiment to a CSV file and return its path.

    When *output_path* is omitted, a timestamped file is created inside
    the experiments directory.
    """
    if output_path is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = self.experiments_dir / f"experiments_export_{stamp}.csv"

    records = []
    for exp in self._results.values():
        record = {
            "experiment_id": exp.experiment_id,
            "name": exp.config.name,
            "description": exp.config.description,
            "model_type": exp.config.model_type,
            "features": ",".join(f.value for f in exp.config.features),
            "status": exp.status.value,
            "start_time": exp.start_time.isoformat(),
            "end_time": exp.end_time.isoformat() if exp.end_time else None,
            "train_size": exp.train_size,
            "test_size": exp.test_size,
        }
        # Flatten test and cross-validation metrics into prefixed columns.
        record.update({f"test_{name}": value for name, value in exp.test_metrics.items()})
        record.update({f"cv_{name}": value for name, value in exp.cv_metrics.items()})
        records.append(record)

    pd.DataFrame(records).to_csv(output_path, index=False)
    return output_path
|
||||
@@ -0,0 +1,90 @@
|
||||
from enum import Enum
|
||||
from typing import List, Dict, Any, Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class FeatureType(Enum):
    """Types of features that can be extracted from names"""

    # Raw text columns taken from the source frame
    FULL_NAME = "full_name"
    NATIVE_NAME = "native_name"
    SURNAME = "surname"
    # Positional word features derived from the full name
    FIRST_WORD = "first_word"
    LAST_WORD = "last_word"
    # Numeric features
    NAME_LENGTH = "name_length"
    WORD_COUNT = "word_count"
    # Categorical column (filled with "unknown" when missing)
    PROVINCE = "province"
    # N-gram features: extraction passes the raw text through and the
    # model's vectorizer performs the actual n-gram expansion
    CHAR_NGRAMS = "char_ngrams"
    WORD_NGRAMS = "word_ngrams"
    # Affix features (configurable lengths via feature_params)
    NAME_ENDINGS = "name_endings"
    NAME_BEGINNINGS = "name_beginnings"
|
||||
|
||||
|
||||
class FeatureExtractor:
    """Extract different types of features from name data"""

    def __init__(self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None):
        self.feature_types = feature_types
        self.feature_params = feature_params or {}

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compute every configured feature and collect them in one frame."""
        out = pd.DataFrame(index=df.index)

        for ftype in self.feature_types:
            extracted = self._extract_single_feature(df, ftype)
            if isinstance(extracted, pd.DataFrame):
                # Multi-column features are appended wholesale.
                out = pd.concat([out, extracted], axis=1)
            else:
                out[ftype.value] = extracted

        return out

    def _extract_single_feature(
        self, df: pd.DataFrame, feature_type: FeatureType
    ) -> Union[pd.Series, pd.DataFrame]:
        """Compute one feature; raises ValueError for unknown feature types."""
        extractors = {
            FeatureType.FULL_NAME: lambda: df["name"].fillna(""),
            FeatureType.NATIVE_NAME: lambda: df["identified_name"]
            .fillna(df["probable_native"])
            .fillna(""),
            FeatureType.SURNAME: lambda: df["identified_surname"]
            .fillna(df["probable_surname"])
            .fillna(""),
            FeatureType.FIRST_WORD: lambda: df["name"].str.split().str[0].fillna(""),
            FeatureType.LAST_WORD: lambda: df["name"].str.split().str[-1].fillna(""),
            FeatureType.NAME_LENGTH: lambda: df["name"].str.len().fillna(0),
            FeatureType.WORD_COUNT: lambda: df["words"].fillna(1),
            FeatureType.PROVINCE: lambda: df["province"].fillna("unknown"),
            FeatureType.NAME_ENDINGS: lambda: df["name"]
            .str[-self.feature_params.get("ending_length", 3):]
            .fillna(""),
            FeatureType.NAME_BEGINNINGS: lambda: df["name"]
            .str[: self.feature_params.get("beginning_length", 3)]
            .fillna(""),
            # N-gram features pass the raw text through; the model's
            # vectorizer performs the actual n-gram expansion.
            FeatureType.CHAR_NGRAMS: lambda: df["name"].fillna(""),
            FeatureType.WORD_NGRAMS: lambda: df["name"].fillna(""),
        }

        maker = extractors.get(feature_type)
        if maker is None:
            raise ValueError(f"Unknown feature type: {feature_type}")
        return maker()
|
||||
@@ -0,0 +1,44 @@
|
||||
from typing import List
|
||||
|
||||
from research.base_model import BaseModel
|
||||
from research.experiment import ExperimentConfig
|
||||
from research.models.bigru_model import BiGRUModel
|
||||
from research.models.cnn_model import CNNModel
|
||||
from research.models.ensemble_model import EnsembleModel
|
||||
from research.models.lightgbm_model import LightGBMModel
|
||||
from research.models.logistic_regression_model import LogisticRegressionModel
|
||||
from research.models.lstm_model import LSTMModel
|
||||
from research.models.naive_bayes_model import NaiveBayesModel
|
||||
from research.models.random_forest_model import RandomForestModel
|
||||
from research.models.svm_model import SVMModel
|
||||
from research.models.transformer_model import TransformerModel
|
||||
from research.models.xgboost_model import XGBoostModel
|
||||
|
||||
# Registry mapping the model_type string (as used in ExperimentConfig)
# to the class implementing it. create_model() resolves through this
# table, and list_available_models() exposes its keys.
MODEL_REGISTRY = {
    "bigru": BiGRUModel,
    "cnn": CNNModel,
    "ensemble": EnsembleModel,
    "lightgbm": LightGBMModel,
    "logistic_regression": LogisticRegressionModel,
    "lstm": LSTMModel,
    "naive_bayes": NaiveBayesModel,
    "random_forest": RandomForestModel,
    "svm": SVMModel,
    "transformer": TransformerModel,
    "xgboost": XGBoostModel,
}
|
||||
|
||||
|
||||
def create_model(config: ExperimentConfig) -> BaseModel:
    """Factory: instantiate the model class registered for config.model_type.

    Raises ValueError when the type is not in MODEL_REGISTRY.
    """
    model_type = config.model_type
    if model_type not in MODEL_REGISTRY:
        raise ValueError(f"Unknown model type: {model_type}")
    return MODEL_REGISTRY[model_type](config)
|
||||
|
||||
|
||||
def list_available_models() -> List[str]:
    """Return every model-type identifier accepted by create_model()."""
    return [name for name in MODEL_REGISTRY]
|
||||
@@ -0,0 +1,281 @@
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Any
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core.config import get_config
|
||||
from core.utils import get_data_file_path
|
||||
from core.utils.data_loader import DataLoader
|
||||
from research.experiment import FeatureType, ExperimentConfig
|
||||
from research.experiment.experiment_runner import ExperimentRunner
|
||||
from research.experiment.experiment_tracker import ExperimentTracker
|
||||
|
||||
|
||||
class ModelTrainer:
|
||||
"""Comprehensive model training and artifact management"""
|
||||
|
||||
def __init__(self, config=None):
    """Wire up data loading, experiment running/tracking and the artifact dir.

    Args:
        config: project configuration object; when falsy/None the global
            configuration from get_config() is used instead.
    """
    self.config = config or get_config()
    self.data_loader = DataLoader(self.config)
    self.experiment_runner = ExperimentRunner(self.config)
    self.experiment_tracker = ExperimentTracker(self.config)
    self.logger = logging.getLogger(__name__)

    # Setup model artifacts directory (created eagerly so later saves
    # never fail on a missing parent)
    self.models_dir = self.config.paths.models_dir
    self.models_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def train_single_model(
    self,
    model_name: str,
    model_type: str = "logistic_regression",
    features: List[str] = None,
    model_params: Dict[str, Any] = None,
    save_artifacts: bool = True,
) -> str:
    """
    Train a single model and save its artifacts.
    Returns the experiment ID.

    Args:
        model_name: human-readable name recorded on the experiment.
        model_type: key into the model registry (e.g. "logistic_regression").
        features: feature names convertible to FeatureType; defaults to
            ["full_name"] when omitted.
        model_params: passed through to the model's constructor config.
        save_artifacts: when True, persist artifacts via save_model_artifacts().
    """
    self.logger.info(f"Training {model_type} model: {model_name}")

    # Default feature set; strings are converted to FeatureType members
    # (FeatureType(f) raises ValueError on unknown names).
    if features is None:
        features = ["full_name"]
    feature_types = [FeatureType(f) for f in features]

    # Create experiment configuration
    config = ExperimentConfig(
        name=model_name,
        description=f"Training {model_type} model with features: {', '.join(features)}",
        model_type=model_type,
        features=feature_types,
        model_params=model_params or {},
        tags=["training", model_type],
    )

    # Run experiment, then fetch its recorded result for reporting
    experiment_id = self.experiment_runner.run_experiment(config)
    experiment = self.experiment_tracker.get_experiment(experiment_id)

    # Success summary is only logged when test metrics were produced
    if experiment and experiment.test_metrics:
        self.logger.info("Training completed successfully!")
        self.logger.info(f"  Experiment ID: {experiment_id}")
        self.logger.info(f"  Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}")
        self.logger.info(f"  Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")

    # NOTE(review): artifacts are saved even when no test metrics were
    # reported — confirm this is intended for failed/partial runs.
    if save_artifacts:
        self.save_model_artifacts(experiment_id)

    return experiment_id
|
||||
|
||||
def train_multiple_models(
    self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
) -> List[str]:
    """
    Train one model per configuration dict; return IDs of successful runs.

    A failure in one configuration is logged and skipped so that a single
    bad config does not abort the whole batch.
    """
    self.logger.info(f"Training {len(model_configs)} models...")

    successful: List[str] = []

    for index, spec in enumerate(model_configs, start=1):
        name = f"{base_name}_{spec['model_type']}_{index}"

        try:
            exp_id = self.train_single_model(
                model_name=name,
                model_type=spec["model_type"],
                features=spec.get("features", ["full_name"]),
                model_params=spec.get("model_params", {}),
                save_artifacts=save_all,
            )
        except Exception as e:
            self.logger.error(f"Failed to train {name}: {e}")
            continue

        successful.append(exp_id)

    self.logger.info(f"Completed training {len(successful)} models successfully")
    return successful
|
||||
|
||||
def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]:
    """
    Save model artifacts in a structured way for easy loading.
    Returns paths to saved artifacts.

    Persists, under models_dir/<experiment_id>/:
      - complete_model.joblib      serialized trained model
      - model_config.json          experiment configuration
      - experiment_results.json    metrics and run information
      - metadata.json              summary consumed by list_saved_models()
      - learning-curve plots/data  best effort; a failure only warns

    Raises:
        ValueError: if the experiment or its trained model cannot be found.
    """
    experiment = self.experiment_tracker.get_experiment(experiment_id)
    if not experiment:
        raise ValueError(f"Experiment {experiment_id} not found")

    # Create model-specific directory
    model_dir = self.models_dir / experiment_id
    model_dir.mkdir(parents=True, exist_ok=True)

    # Load the trained model
    trained_model = self.experiment_runner.load_experiment_model(experiment_id)
    if not trained_model:
        raise ValueError(f"Could not load model for experiment {experiment_id}")

    # Save complete model with joblib
    model_path = model_dir / "complete_model.joblib"
    trained_model.save(str(model_path))

    # Save model configuration. json is imported at module level; the
    # previous function-local "import json" here was redundant and removed.
    config_path = model_dir / "model_config.json"
    with open(config_path, "w") as f:
        json.dump(experiment.config.to_dict(), f, indent=2)

    # Save experiment results (default=str stringifies non-JSON values
    # such as datetimes)
    results_path = model_dir / "experiment_results.json"
    with open(results_path, "w") as f:
        json.dump(experiment.to_dict(), f, indent=2, default=str)

    # Generate and save learning curves — best effort only
    learning_curve_path = None
    training_history_path = None

    try:
        # Load data for learning curve generation
        data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
        if data_path.exists():
            df = self.data_loader.load_csv_complete(data_path)

            # Generate learning curve
            self.logger.info("Generating learning curve...")
            trained_model.generate_learning_curve(df, df[experiment.config.target_column])

            # Plot and save learning curve
            learning_curve_path = model_dir / "learning_curve.png"
            trained_model.plot_learning_curve(str(learning_curve_path))

            # Plot and save training history (for neural networks)
            if trained_model.training_history:
                training_history_path = model_dir / "training_history.png"
                trained_model.plot_training_history(str(training_history_path))

            # Save learning curve data as JSON
            learning_data_path = model_dir / "learning_curve_data.json"
            with open(learning_data_path, "w") as f:
                json.dump(trained_model.learning_curve_data, f, indent=2)

            # Save training history data as JSON
            if trained_model.training_history:
                history_data_path = model_dir / "training_history_data.json"
                with open(history_data_path, "w") as f:
                    json.dump(trained_model.training_history, f, indent=2)

    except Exception as e:
        self.logger.warning(f"Could not generate learning curves: {e}")

    # Save artifacts metadata (summary used by list_saved_models)
    metadata = {
        "experiment_id": experiment_id,
        "model_name": experiment.config.name,
        "model_type": experiment.config.model_type,
        "features": [f.value for f in experiment.config.features],
        "training_date": datetime.now().isoformat(),
        "test_accuracy": experiment.test_metrics.get("accuracy", 0),
        "test_f1": experiment.test_metrics.get("f1", 0),
        "model_path": str(model_path),
        "config_path": str(config_path),
        "results_path": str(results_path),
        "learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
        "training_history_plot": str(training_history_path) if training_history_path else None,
        "has_learning_curve": bool(trained_model.learning_curve_data),
        "has_training_history": bool(trained_model.training_history),
    }

    metadata_path = model_dir / "metadata.json"
    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=2)

    self.logger.info(f"Model artifacts saved to: {model_dir}")
    self.logger.info(f"  - Complete model: {model_path.name}")
    self.logger.info(f"  - Configuration: {config_path.name}")
    self.logger.info(f"  - Results: {results_path.name}")
    self.logger.info(f"  - Metadata: {metadata_path.name}")

    if learning_curve_path and learning_curve_path.exists():
        self.logger.info(f"  - Learning curve: {learning_curve_path.name}")

    if training_history_path and training_history_path.exists():
        self.logger.info(f"  - Training history: {training_history_path.name}")

    return {
        "model_dir": str(model_dir),
        "model_path": str(model_path),
        "config_path": str(config_path),
        "results_path": str(results_path),
        "metadata_path": str(metadata_path),
        "learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
        "training_history_plot": str(training_history_path) if training_history_path else None,
    }
|
||||
|
||||
def load_trained_model(self, experiment_id: str):
    """
    Load a previously trained model from its saved artifacts.

    Raises FileNotFoundError when no artifacts exist for the experiment.
    """
    artifact_dir = self.models_dir / experiment_id
    model_path = artifact_dir / "complete_model.joblib"

    if not model_path.exists():
        raise FileNotFoundError(f"Model artifacts not found for experiment {experiment_id}")

    # The metadata records which concrete model class produced the artifact.
    metadata_path = artifact_dir / "metadata.json"
    with open(metadata_path, "r") as f:
        metadata = json.load(f)

    model_type = metadata["model_type"]
    from research.model_registry import MODEL_REGISTRY

    # Dispatch to that class's load() to deserialize the complete model.
    loaded_model = MODEL_REGISTRY[model_type].load(str(model_path))

    self.logger.info(f"Loaded model: {metadata['model_name']}")
    self.logger.info(f"  Type: {model_type}")
    self.logger.info(f"  Accuracy: {metadata['test_accuracy']:.4f}")

    return loaded_model
|
||||
|
||||
def list_saved_models(self) -> pd.DataFrame:
    """
    List all saved model artifacts as a DataFrame, newest first.

    Directories without readable metadata are skipped with a warning;
    an empty DataFrame is returned when nothing is found.
    """
    records = []

    for entry in self.models_dir.iterdir():
        if not entry.is_dir():
            continue
        meta_file = entry / "metadata.json"
        if not meta_file.exists():
            continue
        try:
            with open(meta_file, "r") as f:
                records.append(json.load(f))
        except Exception as e:
            self.logger.warning(f"Could not read metadata for {entry.name}: {e}")

    if not records:
        self.logger.info("No saved models found.")
        return pd.DataFrame()

    df = pd.DataFrame(records)

    # Show the most useful columns, tolerating metadata files that may
    # lack some of them.
    preferred = [
        "model_name",
        "model_type",
        "features",
        "test_accuracy",
        "test_f1",
        "training_date",
    ]
    present = [col for col in preferred if col in df.columns]

    return df[present].sort_values("training_date", ascending=False)
|
||||
@@ -0,0 +1,56 @@
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
|
||||
from tensorflow.keras.models import Sequential
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
|
||||
from research.neural_network_model import NeuralNetworkModel
|
||||
|
||||
|
||||
class BiGRUModel(NeuralNetworkModel):
    """Bidirectional GRU model for name classification"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        """Assemble and compile the stacked bidirectional-GRU classifier."""
        embedding_dim = kwargs.get("embedding_dim", 64)
        gru_units = kwargs.get("gru_units", 32)
        recurrent_dropout = kwargs.get("dropout", 0.2)

        model = Sequential()
        model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
        model.add(Bidirectional(GRU(gru_units, return_sequences=True, dropout=recurrent_dropout)))
        model.add(Bidirectional(GRU(gru_units, dropout=recurrent_dropout)))
        model.add(Dense(64, activation="relu"))
        # NOTE: the dense-layer dropout deliberately defaults to 0.5 while
        # the GRU dropout defaults to 0.2.
        model.add(Dropout(kwargs.get("dropout", 0.5)))
        model.add(Dense(2, activation="softmax"))

        model.compile(
            loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        return model

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Tokenize configured text features into padded word-index sequences."""
        corpus = []
        for feature_type in self.config.features:
            column = feature_type.value
            if column in X.columns:
                corpus.extend(X[column].astype(str).tolist())

        if not corpus:
            raise ValueError("No text data found in the provided DataFrame.")

        # Fit the tokenizer only on the first call so training and
        # prediction share a single vocabulary.
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(corpus)

        # Only the first len(X) entries are encoded, keeping the output
        # row count aligned with X.
        sequences = self.tokenizer.texts_to_sequences(corpus[: len(X)])
        seq_len = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=seq_len, padding="post")
|
||||
@@ -0,0 +1,75 @@
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tensorflow.keras.layers import (
|
||||
Embedding,
|
||||
Conv1D,
|
||||
MaxPooling1D,
|
||||
GlobalMaxPooling1D,
|
||||
Dense,
|
||||
Dropout,
|
||||
)
|
||||
from tensorflow.keras.models import Sequential
|
||||
|
||||
from research.neural_network_model import NeuralNetworkModel
|
||||
|
||||
|
||||
class CNNModel(NeuralNetworkModel):
    """1D Convolutional Neural Network for character patterns"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 20, **kwargs) -> Any:
        """Build CNN model with known vocabulary size"""
        conv_filters = kwargs.get("filters", 64)
        kernel = kwargs.get("kernel_size", 3)

        model = Sequential()
        model.add(Embedding(input_dim=vocab_size, output_dim=kwargs.get("embedding_dim", 64)))
        model.add(Conv1D(filters=conv_filters, kernel_size=kernel, activation="relu"))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Conv1D(filters=conv_filters, kernel_size=kernel, activation="relu"))
        model.add(GlobalMaxPooling1D())
        model.add(Dense(64, activation="relu"))
        model.add(Dropout(kwargs.get("dropout", 0.5)))
        model.add(Dense(2, activation="softmax"))

        model.compile(
            loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        return model

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Prepare character-level padded sequences from extracted features."""
        # X here contains the features already extracted by FeatureExtractor
        from tensorflow.keras.preprocessing.text import Tokenizer
        from tensorflow.keras.preprocessing.sequence import pad_sequences

        corpus = []
        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                corpus.extend(X[feature_type.value].astype(str).tolist())

        if not corpus:
            # Fallback - should not happen if FeatureExtractor is properly configured
            corpus = [""] * len(X)

        # Character-level tokenizer is fitted once and then reused so that
        # training and prediction share one vocabulary.
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=True, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(corpus)

        sequences = self.tokenizer.texts_to_sequences(corpus[: len(X)])
        seq_len = self.config.model_params.get("max_len", 20)  # Longer for character level

        return pad_sequences(sequences, maxlen=seq_len, padding="post")
|
||||
@@ -0,0 +1,97 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
||||
from research.experiment import ExperimentConfig
|
||||
from research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class EnsembleModel(TraditionalModel):
    """Ensemble model combining multiple base models via hard/soft voting."""

    @property
    def architecture(self) -> str:
        """Return the architecture type"""
        return "ensemble"

    def __init__(self, config: ExperimentConfig):
        super().__init__(config)
        self.base_models = []      # reserved for externally attached base models
        self.model_weights = None  # reserved for optional per-model voting weights

    def build_model(self) -> BaseEstimator:
        """Build a VotingClassifier over the configured base model types.

        model_params["base_models"] selects the members (unknown entries
        are silently ignored); model_params["voting"] selects 'hard' or
        'soft' voting. Each member is a self-contained vectorizer +
        classifier pipeline.

        Fix: the estimator-name literals previously carried a pointless
        f-string prefix (no placeholders); they are plain strings now —
        the values are byte-identical.
        """
        params = self.config.model_params
        base_model_types = params.get(
            "base_models", ["logistic_regression", "random_forest", "naive_bayes"]
        )

        # Create base models with simplified configs
        estimators = []
        for model_type in base_model_types:
            if model_type == "logistic_regression":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            CountVectorizer(analyzer="char", ngram_range=(2, 4), max_features=5000),
                        ),
                        (
                            "classifier",
                            LogisticRegression(max_iter=1000, random_state=self.config.random_seed),
                        ),
                    ]
                )
                estimators.append(("logistic_regression", model))

            elif model_type == "random_forest":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            TfidfVectorizer(analyzer="char", ngram_range=(2, 3), max_features=3000),
                        ),
                        (
                            "classifier",
                            RandomForestClassifier(
                                n_estimators=50, random_state=self.config.random_seed
                            ),
                        ),
                    ]
                )
                estimators.append(("rf", model))

            elif model_type == "naive_bayes":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            CountVectorizer(analyzer="char", ngram_range=(1, 3), max_features=4000),
                        ),
                        ("classifier", MultinomialNB()),
                    ]
                )
                estimators.append(("nb", model))

        voting_type = params.get("voting", "soft")  # 'hard' or 'soft'
        return VotingClassifier(estimators=estimators, voting=voting_type)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Concatenate the configured text feature columns, one string per row.

        NOTE(review): raises IndexError when none of the configured
        feature columns are present — confirm whether a ValueError (as in
        the neural models) would be preferable.
        """
        text_features = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_features.append(X[feature_type.value].astype(str))

        if len(text_features) == 1:
            return text_features[0].values
        else:
            # Join multiple feature columns with a space separator.
            combined = text_features[0].astype(str)
            for feature in text_features[1:]:
                combined = combined + " " + feature.astype(str)
            return combined.values
|
||||
@@ -0,0 +1,51 @@
|
||||
import lightgbm as lgb
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class LightGBMModel(TraditionalModel):
    """LightGBM with engineered features"""

    def build_model(self) -> BaseEstimator:
        """Create an LGBMClassifier configured from model_params (with defaults)."""
        params = self.config.model_params

        return lgb.LGBMClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", -1),
            learning_rate=params.get("learning_rate", 0.1),
            num_leaves=params.get("num_leaves", 31),
            subsample=params.get("subsample", 0.8),
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
            verbose=-1,
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Build a numeric feature matrix from the configured feature columns.

        Fix: vectorizers/encoders are now fitted once (on the first call,
        i.e. training) and cached per feature column, then reused for
        later calls. The previous implementation refit them on every
        call, so prediction-time columns did not match the vocabulary
        learned at training time.

        NOTE(review): with a cached LabelEncoder, categories unseen at
        training time now raise at prediction instead of being silently
        re-encoded — confirm that is the desired contract.
        """
        # Lazily created cache: feature name -> fitted transformer.
        if not hasattr(self, "_column_transformers"):
            self._column_transformers = {}

        blocks = []

        for feature_type in self.config.features:
            name = feature_type.value
            if name not in X.columns:
                continue
            column = X[name]

            if name in ["name_length", "word_count"]:
                # Numeric features pass through directly.
                blocks.append(column.fillna(0).values.reshape(-1, 1))
            elif name in ["full_name", "native_name", "surname"]:
                # Character n-grams for text features
                texts = column.fillna("").astype(str)
                vectorizer = self._column_transformers.get(name)
                if vectorizer is None:
                    vectorizer = CountVectorizer(
                        analyzer="char", ngram_range=(2, 3), max_features=50
                    )
                    vectorizer.fit(texts)
                    self._column_transformers[name] = vectorizer
                blocks.append(vectorizer.transform(texts).toarray())
            else:
                # Remaining columns are label-encoded categoricals.
                values = column.fillna("unknown").astype(str)
                encoder = self._column_transformers.get(name)
                if encoder is None:
                    encoder = LabelEncoder()
                    encoder.fit(values)
                    self._column_transformers[name] = encoder
                blocks.append(encoder.transform(values).reshape(-1, 1))

        return np.hstack(blocks) if blocks else np.array([]).reshape(len(X), 0)
|
||||
@@ -0,0 +1,44 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
||||
from research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class LogisticRegressionModel(TraditionalModel):
    """Logistic Regression with character n-grams"""

    def build_model(self) -> BaseEstimator:
        """Pipeline: character n-gram counts -> LogisticRegression."""
        params = self.config.model_params

        steps = [
            (
                "vectorizer",
                CountVectorizer(
                    analyzer="char",
                    ngram_range=params.get("ngram_range", (2, 5)),
                    max_features=params.get("max_features", 10000),
                ),
            ),
            (
                "classifier",
                LogisticRegression(
                    max_iter=params.get("max_iter", 1000),
                    random_state=self.config.random_seed,
                ),
            ),
        ]
        return Pipeline(steps)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Return one text string per row, joining the configured feature columns."""
        columns = [
            X[ft.value].astype(str)
            for ft in self.config.features
            if ft.value in X.columns
        ]

        if len(columns) == 1:
            return columns[0].values

        # Concatenate multiple text features with a space separator.
        merged = columns[0].astype(str)
        for extra in columns[1:]:
            merged = merged + " " + extra.astype(str)
        return merged.values
|
||||
@@ -0,0 +1,52 @@
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
|
||||
from tensorflow.keras.models import Sequential
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
|
||||
from research.neural_network_model import NeuralNetworkModel
|
||||
|
||||
|
||||
class LSTMModel(NeuralNetworkModel):
    """LSTM model for sequence learning"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        """Assemble and compile the stacked bidirectional-LSTM classifier."""
        embedding_dim = kwargs.get("embedding_dim", 64)
        lstm_units = kwargs.get("lstm_units", 32)

        network = Sequential()
        network.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
        network.add(Bidirectional(LSTM(lstm_units, return_sequences=True)))
        network.add(Bidirectional(LSTM(lstm_units)))
        network.add(Dense(64, activation="relu"))
        network.add(Dense(2, activation="softmax"))

        network.compile(
            loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        return network

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Tokenize configured text features into padded word-index sequences."""
        corpus = []
        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                corpus.extend(X[feature_type.value].astype(str).tolist())

        if not corpus:
            raise ValueError("No text data found in the provided DataFrame.")

        # Fit the tokenizer only once so training and prediction share a
        # single vocabulary.
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(corpus)

        # Only the first len(X) entries are encoded, keeping the output
        # row count aligned with X.
        sequences = self.tokenizer.texts_to_sequences(corpus[: len(X)])
        seq_len = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=seq_len, padding="post")
|
||||
@@ -0,0 +1,39 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
||||
from research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class NaiveBayesModel(TraditionalModel):
    """Multinomial Naive Bayes with character n-grams"""

    def build_model(self) -> BaseEstimator:
        """Pipeline: character n-gram counts -> MultinomialNB."""
        params = self.config.model_params

        steps = [
            (
                "vectorizer",
                CountVectorizer(
                    analyzer="char",
                    ngram_range=params.get("ngram_range", (1, 4)),
                    max_features=params.get("max_features", 8000),
                ),
            ),
            ("classifier", MultinomialNB(alpha=params.get("alpha", 1.0))),
        ]
        return Pipeline(steps)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Return one text string per row, joining the configured feature columns."""
        columns = [
            X[ft.value].astype(str)
            for ft in self.config.features
            if ft.value in X.columns
        ]

        if len(columns) == 1:
            return columns[0].values

        # Concatenate multiple text features with a space separator.
        merged = columns[0].astype(str)
        for extra in columns[1:]:
            merged = merged + " " + extra.astype(str)
        return merged.values
|
||||
@@ -0,0 +1,40 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class RandomForestModel(TraditionalModel):
    """Random Forest with engineered features"""

    def build_model(self) -> BaseEstimator:
        """Build a ``RandomForestClassifier`` from config hyper-parameters.

        Reads ``n_estimators`` (default 100) and ``max_depth`` (default None)
        from ``self.config.model_params``; seeds from ``config.random_seed``.
        """
        params = self.config.model_params

        return RandomForestClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", None),
            random_state=self.config.random_seed,
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Build a numeric feature matrix from numerical and categorical columns.

        Categorical encoders are fitted on first use and cached per column on
        the instance, so the same category maps to the same integer at
        training and prediction time (previously a fresh ``LabelEncoder`` was
        fitted on every call, producing inconsistent encodings across calls).
        Categories unseen at fit time are mapped to the encoder's first class
        so ``transform`` cannot raise at prediction time.
        """
        # Lazily created cache: column name -> fitted LabelEncoder.
        if not hasattr(self, "_column_encoders"):
            self._column_encoders = {}

        features = []

        for feature_type in self.config.features:
            name = feature_type.value
            if name in X.columns:
                column = X[name]

                # Handle different feature types
                if name in ["name_length", "word_count"]:
                    # Numerical features: used as-is, missing values become 0.
                    features.append(column.fillna(0).values.reshape(-1, 1))
                else:
                    # Categorical features: integer-encode with a cached encoder.
                    values = column.fillna("unknown").astype(str)
                    le = self._column_encoders.get(name)
                    if le is None:
                        le = LabelEncoder()
                        le.fit(values)
                        self._column_encoders[name] = le
                    # Replace categories the encoder has never seen with a
                    # known class before transforming.
                    values = values.where(values.isin(le.classes_), le.classes_[0])
                    features.append(le.transform(values).reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
|
||||
@@ -0,0 +1,45 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.svm import SVC
|
||||
|
||||
from research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class SVMModel(TraditionalModel):
    """Support Vector Machine with character n-grams and RBF kernel"""

    def build_model(self) -> BaseEstimator:
        """Build a ``TfidfVectorizer -> SVC`` pipeline from config params.

        Hyper-parameters read from ``self.config.model_params``:
        ``ngram_range`` (default ``(2, 4)``), ``max_features`` (default 5000),
        ``kernel`` (default ``"rbf"``), ``C`` (default 1.0), ``gamma``
        (default ``"scale"``).
        """
        params = self.config.model_params
        vectorizer = TfidfVectorizer(
            analyzer="char",
            ngram_range=params.get("ngram_range", (2, 4)),
            max_features=params.get("max_features", 5000),
        )

        classifier = SVC(
            kernel=params.get("kernel", "rbf"),
            C=params.get("C", 1.0),
            gamma=params.get("gamma", "scale"),
            probability=True,  # Enable probability prediction
            random_state=self.config.random_seed,
        )

        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Return one text string per row, concatenating the configured columns.

        Raises:
            ValueError: If none of the configured feature columns are present
                in ``X`` (previously this surfaced as an opaque ``IndexError``).
        """
        text_features = [
            X[feature_type.value].astype(str)
            for feature_type in self.config.features
            if feature_type.value in X.columns
        ]

        if not text_features:
            raise ValueError("No text data found in the provided DataFrame.")

        if len(text_features) == 1:
            return text_features[0].values

        # Space-join multiple text columns row-wise into a single document.
        combined = text_features[0].astype(str)
        for feature in text_features[1:]:
            combined = combined + " " + feature.astype(str)
        return combined.values
|
||||
@@ -0,0 +1,82 @@
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import tensorflow as tf
|
||||
from tensorflow.keras.layers import (
|
||||
Input,
|
||||
Embedding,
|
||||
Dense,
|
||||
GlobalAveragePooling1D,
|
||||
MultiHeadAttention,
|
||||
Dropout,
|
||||
LayerNormalization,
|
||||
)
|
||||
from tensorflow.keras.models import Model
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
|
||||
from research.neural_network_model import NeuralNetworkModel
|
||||
|
||||
|
||||
class TransformerModel(NeuralNetworkModel):
    """Transformer-based model"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        """Build and compile a small Transformer classifier.

        Args:
            vocab_size: Number of distinct tokens (tokenizer word index + 1).
            max_len: Fixed input sequence length; also bounds the positional
                embedding table.
            **kwargs: Hyper-parameters (``embedding_dim``, ``transformer_*``,
                ``dropout``).

        Returns:
            A compiled ``tf.keras`` model with a 2-way softmax head.
        """
        params = kwargs

        # Build Transformer model
        inputs = Input(shape=(max_len,))
        x = Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64))(inputs)

        # Add learned positional encoding so attention can use token order.
        positions = tf.range(start=0, limit=max_len, delta=1)
        pos_embedding = Embedding(input_dim=max_len, output_dim=params.get("embedding_dim", 64))(
            positions
        )
        x = x + pos_embedding

        x = self._transformer_encoder(x, params)
        x = GlobalAveragePooling1D()(x)
        x = Dense(32, activation="relu")(x)
        outputs = Dense(2, activation="softmax")(x)

        model = Model(inputs, outputs)
        model.compile(
            optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
        )
        return model

    @classmethod
    def _transformer_encoder(cls, x, cfg_params):
        """Transformer encoder block: self-attention then a feed-forward
        sub-layer, each with dropout, a residual connection, and
        LayerNormalization applied after the residual sum."""

        attn = MultiHeadAttention(
            num_heads=cfg_params.get("transformer_num_heads", 2),
            key_dim=cfg_params.get("transformer_head_size", 64),
        )(x, x)
        x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(attn))

        ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
        # Project back to the input width so the residual addition is valid.
        ff = Dense(x.shape[-1])(ff)
        return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(ff))

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Tokenize and pad the configured text columns, one sequence per input row.

        Multiple configured text columns are concatenated per row
        (space-separated) so the output always has exactly ``len(X)`` rows.
        (Previously the columns were appended end-to-end and sliced to
        ``len(X)``, which silently discarded every column after the first.)
        The tokenizer is fitted lazily on first use and reused afterwards.

        Raises:
            ValueError: If none of the configured feature columns are present.
        """
        text_columns = [
            X[feature_type.value].astype(str)
            for feature_type in self.config.features
            if feature_type.value in X.columns
        ]

        if not text_columns:
            raise ValueError("No text data found in the provided DataFrame.")

        # Combine columns row-wise so every row yields exactly one sequence.
        combined = text_columns[0]
        for column in text_columns[1:]:
            combined = combined + " " + column
        texts = combined.tolist()

        # Initialize tokenizer if needed
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(oov_token="<OOV>")
            self.tokenizer.fit_on_texts(texts)

        # Convert to padded integer sequences of fixed length.
        sequences = self.tokenizer.texts_to_sequences(texts)
        max_len = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=max_len, padding="post")
|
||||
@@ -0,0 +1,52 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import xgboost as xgb
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class XGBoostModel(TraditionalModel):
    """XGBoost with engineered features and character embeddings"""

    def build_model(self) -> BaseEstimator:
        """Build an ``xgb.XGBClassifier`` from config hyper-parameters.

        Reads ``n_estimators``, ``max_depth``, ``learning_rate``,
        ``subsample`` and ``colsample_bytree`` from
        ``self.config.model_params``; seeds from ``config.random_seed``.
        """
        params = self.config.model_params

        return xgb.XGBClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", 6),
            learning_rate=params.get("learning_rate", 0.1),
            subsample=params.get("subsample", 0.8),
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
            eval_metric="logloss",
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Build a numeric feature matrix from numeric, name-text and categorical columns.

        Vectorizers and encoders are fitted on first use and cached per
        column on the instance, so prediction-time features use the same
        n-gram vocabulary and category codes as training (previously a fresh
        ``CountVectorizer``/``LabelEncoder`` was fitted on every call, making
        train and predict features inconsistent). Categories unseen at fit
        time are mapped to the encoder's first class.
        """
        # Lazily created cache: column name -> fitted vectorizer or encoder.
        if not hasattr(self, "_column_transformers"):
            self._column_transformers = {}

        features = []

        for feature_type in self.config.features:
            name = feature_type.value
            if name in X.columns:
                column = X[name]

                if name in ["name_length", "word_count"]:
                    # Numerical features: used as-is, missing values become 0.
                    features.append(column.fillna(0).values.reshape(-1, 1))
                elif name in ["full_name", "native_name", "surname"]:
                    # Character-level n-gram features for name columns.
                    text = column.fillna("").astype(str)
                    vectorizer = self._column_transformers.get(name)
                    if vectorizer is None:
                        vectorizer = CountVectorizer(
                            analyzer="char", ngram_range=(2, 3), max_features=100
                        )
                        vectorizer.fit(text)
                        self._column_transformers[name] = vectorizer
                    features.append(vectorizer.transform(text).toarray())
                else:
                    # Categorical features: integer-encode with a cached encoder.
                    values = column.fillna("unknown").astype(str)
                    le = self._column_transformers.get(name)
                    if le is None:
                        le = LabelEncoder()
                        le.fit(values)
                        self._column_transformers[name] = le
                    # Replace unseen categories with a known class before transforming.
                    values = values.where(values.isin(le.classes_), le.classes_[0])
                    features.append(le.transform(values).reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
|
||||
@@ -0,0 +1,201 @@
|
||||
import logging
|
||||
from abc import abstractmethod
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.metrics import accuracy_score
|
||||
from sklearn.metrics import precision_recall_fscore_support
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from research.base_model import BaseModel
|
||||
from research.experiment.feature_extractor import FeatureExtractor
|
||||
|
||||
|
||||
class NeuralNetworkModel(BaseModel):
    """Base class for neural network models (TensorFlow/Keras)"""

    @property
    def architecture(self) -> str:
        return "neural_network"

    @abstractmethod
    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
        """Build neural network model with known vocabulary size"""
        pass

    def _build_fresh_model(self) -> Any:
        """Construct a new untrained model from the fitted tokenizer's vocabulary.

        ``prepare_features`` must have run at least once so ``self.tokenizer``
        exists; otherwise a fallback vocabulary size of 1000 is used.
        """
        vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
        max_len = self.config.model_params.get("max_len", 6)
        return self.build_model_with_vocab(
            vocab_size=vocab_size, max_len=max_len, **self.config.model_params
        )

    def _prepare_training_data(self, X: pd.DataFrame, y: pd.Series):
        """Extract/prepare features and encode labels, creating helpers lazily.

        Returns:
            Tuple of ``(X_prepared, y_encoded)`` as numpy arrays.
        """
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )

        # Extracting + preparing features also initializes the tokenizer.
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)

        return X_prepared, y_encoded

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the neural network model with deferred building.

        The model is built only after feature preparation, because the
        tokenizer (and hence the vocabulary size) does not exist earlier.
        """
        logging.info(f"Training {self.__class__.__name__}")

        X_prepared, y_encoded = self._prepare_training_data(X, y)

        # Now we can build the model with the known vocab size.
        self.model = self._build_fresh_model()

        # Train the neural network
        history = self.model.fit(
            X_prepared,
            y_encoded,
            epochs=self.config.model_params.get("epochs", 10),
            batch_size=self.config.model_params.get("batch_size", 64),
            validation_split=0.1,
            verbose=1,
        )

        # Store training history for later learning-curve plots.
        self.training_history = {
            "accuracy": history.history["accuracy"],
            "loss": history.history["loss"],
            "val_accuracy": history.history.get("val_accuracy", []),
            "val_loss": history.history.get("val_loss", []),
        }

        self.is_fitted = True
        return self

    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> dict[str, np.floating[Any]]:
        """Stratified k-fold cross-validation, training a fresh model per fold.

        Returns:
            Mean and standard deviation for accuracy, precision, recall and
            weighted F1 across the folds.
        """
        X_prepared, y_encoded = self._prepare_training_data(X, y)

        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed)

        accuracies = []
        precisions = []
        recalls = []
        f1_scores = []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)):
            # Create fresh model for each fold.  (Bug fix: this previously
            # called self.build_model(), which neural models do not define —
            # the abstract builder is build_model_with_vocab.)
            fold_model = self._build_fresh_model()

            # Train on fold
            fold_model.fit(
                X_prepared[train_idx],
                y_encoded[train_idx],
                epochs=self.config.model_params.get("epochs", 10),
                batch_size=self.config.model_params.get("batch_size", 32),
                verbose=0,
            )

            # Predict on validation; collapse softmax probabilities to labels.
            y_pred = fold_model.predict(X_prepared[val_idx])
            if len(y_pred.shape) > 1:
                y_pred = y_pred.argmax(axis=1)

            # Calculate metrics
            acc = accuracy_score(y_encoded[val_idx], y_pred)
            prec, rec, f1, _ = precision_recall_fscore_support(
                y_encoded[val_idx], y_pred, average="weighted"
            )

            accuracies.append(acc)
            precisions.append(prec)
            recalls.append(rec)
            f1_scores.append(f1)

        return {
            "accuracy": np.mean(accuracies),
            "accuracy_std": np.std(accuracies),
            "precision": np.mean(precisions),
            "precision_std": np.std(precisions),
            "recall": np.mean(recalls),
            "recall_std": np.std(recalls),
            "f1": np.mean(f1_scores),
            "f1_std": np.std(f1_scores),
        }

    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model.

        Trains 3 fresh models per training-set size for variance estimation
        and evaluates each on a fixed held-out validation split.
        """
        logging.info(f"Generating learning curve for {self.__class__.__name__}")

        if train_sizes is None:
            # Same default grid as the traditional models.  (Bug fix:
            # iterating None previously raised TypeError.)
            train_sizes = [0.1, 0.25, 0.5, 0.75, 1.0]

        learning_curve_data = {
            "train_sizes": [],
            "train_scores": [],
            "val_scores": [],
            "train_scores_std": [],
            "val_scores_std": [],
        }

        # Prepare features/labels BEFORE splitting so the arrays support
        # positional indexing.  (Bug fix: the raw DataFrame was previously
        # split and then indexed positionally, which fails, and features
        # were never prepared for the network at all.)
        X_prepared, y_encoded = self._prepare_training_data(X, y)

        # Split data once for validation
        X_train_full, X_val, y_train_full, y_val = train_test_split(
            X_prepared,
            y_encoded,
            test_size=0.2,
            random_state=self.config.random_seed,
            stratify=y_encoded,
        )

        # Seeded generator so subsampling is reproducible across runs.
        rng = np.random.default_rng(self.config.random_seed)

        for size in train_sizes:
            train_size = int(len(X_train_full) * size)
            if train_size < 10:  # Minimum training size
                continue

            # Sample training data without replacement.
            indices = rng.choice(len(X_train_full), train_size, replace=False)
            X_train_subset = X_train_full[indices]
            y_train_subset = y_train_full[indices]

            # Train multiple models for variance estimation
            train_scores = []
            val_scores = []

            for _ in range(3):  # 3 runs for variance
                # Build a fresh model for each run.  (Bug fix: previously
                # called the non-existent self.build_model().)
                model = self._build_fresh_model()

                model.fit(
                    X_train_subset,
                    y_train_subset,
                    epochs=self.config.model_params.get("epochs", 10),
                    batch_size=self.config.model_params.get("batch_size", 32),
                    validation_data=(X_val, y_val),
                    verbose=0,
                )

                # Evaluate on train subset and held-out validation split.
                train_pred = model.predict(X_train_subset)
                val_pred = model.predict(X_val)

                train_scores.append(accuracy_score(y_train_subset, train_pred.argmax(axis=1)))
                val_scores.append(accuracy_score(y_val, val_pred.argmax(axis=1)))

            learning_curve_data["train_sizes"].append(train_size)
            learning_curve_data["train_scores"].append(np.mean(train_scores))
            learning_curve_data["val_scores"].append(np.mean(val_scores))
            learning_curve_data["train_scores_std"].append(np.std(train_scores))
            learning_curve_data["val_scores_std"].append(np.std(val_scores))

        self.learning_curve_data = learning_curve_data
        return learning_curve_data
|
||||
@@ -0,0 +1,134 @@
|
||||
import logging
|
||||
from abc import abstractmethod
|
||||
from typing import Dict, Any, List
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.model_selection import StratifiedKFold, cross_val_score
|
||||
from sklearn.model_selection import learning_curve
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from research.base_model import BaseModel
|
||||
from research.experiment.feature_extractor import FeatureExtractor
|
||||
|
||||
|
||||
class TraditionalModel(BaseModel):
    """Base class for traditional ML models (scikit-learn compatible)"""

    @property
    def architecture(self) -> str:
        return "traditional"

    @abstractmethod
    def build_model(self) -> BaseEstimator:
        """Build and return the sklearn model instance"""
        pass

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the traditional ML model.

        Lazily creates the estimator, feature extractor and label encoder,
        then trains on the prepared features.
        """
        logging.info(f"Training {self.__class__.__name__}")

        # Build model if not already built
        if self.model is None:
            self.model = self.build_model()

        # Setup feature extraction
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )

        # Extract and prepare features
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        # Encode labels (fit once, reuse afterwards).
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)

        # Train model
        self.model.fit(X_prepared, y_encoded)
        self.is_fitted = True

        return self

    def cross_validate(self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> Dict[str, float]:
        """Stratified k-fold cross-validation over the configured metrics.

        Safe to call before ``fit``: helpers are created lazily.  (Bug fix:
        previously this raised AttributeError on an unfitted instance because
        feature_extractor / label_encoder / model were still None.)
        """
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)

        # cross_val_score clones the estimator, so an unfitted build is fine.
        estimator = self.model if self.model is not None else self.build_model()

        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed)

        # Calculate different metrics
        results = {}

        # Accuracy
        accuracy_scores = cross_val_score(
            estimator, X_prepared, y_encoded, cv=cv, scoring="accuracy"
        )
        results["accuracy"] = accuracy_scores.mean()
        results["accuracy_std"] = accuracy_scores.std()

        # Precision, Recall, F1 (only those requested in the config).
        for metric in ["precision", "recall", "f1"]:
            if metric in self.config.metrics:
                scores = cross_val_score(
                    estimator, X_prepared, y_encoded, cv=cv, scoring=f"{metric}_weighted"
                )
                results[metric] = scores.mean()
                results[f"{metric}_std"] = scores.std()

        return results

    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model.

        Returns an empty dict (with a warning) if sklearn's learning_curve
        fails, e.g. when a fold is too small for stratification.
        """
        logging.info(f"Generating learning curve for {self.__class__.__name__}")

        if train_sizes is None:
            train_sizes = [0.1, 0.25, 0.5, 0.75, 1.0]

        # Prepare features
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        # Encode labels
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)

        try:
            train_sizes_abs, train_scores, val_scores = learning_curve(
                self.build_model(),
                X_prepared,
                y_encoded,
                train_sizes=train_sizes,
                cv=3,  # Use 3-fold CV for speed
                scoring="accuracy",
                # shuffle=True makes random_state effective — sklearn ignores
                # random_state when shuffle is left False.
                shuffle=True,
                random_state=self.config.random_seed,
            )

            learning_curve_data = {
                "train_sizes": train_sizes_abs.tolist(),
                "train_scores": train_scores.mean(axis=1).tolist(),
                "val_scores": val_scores.mean(axis=1).tolist(),
                "train_scores_std": train_scores.std(axis=1).tolist(),
                "val_scores_std": val_scores.std(axis=1).tolist(),
            }
        except Exception as e:
            # Best-effort: learning curves are diagnostics, not critical path.
            logging.warning(f"Could not generate learning curve: {e}")
            return {}

        self.learning_curve_data = learning_curve_data
        return learning_curve_data
|
||||
Reference in New Issue
Block a user