refactoring: add initial pipeline configuration and model classes

This commit is contained in:
2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
View File
+250
View File
@@ -0,0 +1,250 @@
import logging
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, List
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from research.experiment import ExperimentConfig
class BaseModel(ABC):
    """Abstract base class for all models.

    Subclasses implement one of three architectures ('neural_network',
    'traditional', 'ensemble') and are responsible for setting
    ``self.model``, ``self.feature_extractor`` and ``self.label_encoder``
    during :meth:`fit`; the shared prediction/persistence/plotting logic
    here relies on those attributes being populated.
    """

    def __init__(self, config: ExperimentConfig):
        self.config = config
        # Underlying estimator/network; created by the subclass in fit()
        self.model = None
        # Feature extractor used for both training and inference
        self.feature_extractor = None
        # Encodes string labels to integer classes (and back in predict())
        self.label_encoder = None
        self.tokenizer = None  # For neural models
        self.is_fitted = False
        self.training_history = {}  # Store training history for learning curves
        self.learning_curve_data = {}  # Store learning curve experiment data

    @property
    @abstractmethod
    def architecture(self) -> str:
        """Return the architecture type: 'neural_network', 'traditional', or 'ensemble'"""
        pass

    @abstractmethod
    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Prepare features for training/prediction"""
        pass

    @abstractmethod
    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the model - implemented differently for each architecture"""
        pass

    @abstractmethod
    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> Dict[str, float] | dict[str, np.floating[Any]]:
        """Perform cross-validation and return average scores"""
        pass

    @abstractmethod
    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        pass

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Make predictions.

        Args:
            X: Raw input rows; features are extracted internally.

        Returns:
            Labels decoded back to their original (string) form.

        Raises:
            ValueError: If called before :meth:`fit`.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
        predictions = self.model.predict(X_prepared)
        # Handle different prediction formats
        if hasattr(predictions, "shape") and len(predictions.shape) > 1:
            # Neural network outputs (probabilities)
            predictions = predictions.argmax(axis=1)
        return self.label_encoder.inverse_transform(predictions)

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """Get prediction probabilities if supported.

        Raises:
            ValueError: If called before :meth:`fit`.
            NotImplementedError: If the underlying model exposes neither
                ``predict_proba`` nor a 2-D probability output.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
        if hasattr(self.model, "predict_proba"):
            return self.model.predict_proba(X_prepared)
        elif hasattr(self.model, "predict"):
            # For neural networks that return probabilities directly
            probabilities = self.model.predict(X_prepared)
            if len(probabilities.shape) == 2 and probabilities.shape[1] > 1:
                return probabilities
        # Fall through: 1-D outputs (hard labels) are not probabilities
        raise NotImplementedError("Model does not support probability predictions")

    def get_feature_importance(self) -> Optional[Dict[str, float]]:
        """Get feature importance if supported by the model.

        Returns:
            Mapping of feature name to importance/|coefficient|, or ``None``
            when the underlying model exposes no importance information.
        """
        if hasattr(self.model, "feature_importances_"):
            # For tree-based models
            importances = self.model.feature_importances_
            feature_names = self._get_feature_names()
            return dict(zip(feature_names, importances))
        elif hasattr(self.model, "coef_"):
            # For linear models: magnitude of the first class's coefficients
            coefficients = np.abs(self.model.coef_[0])
            feature_names = self._get_feature_names()
            return dict(zip(feature_names, coefficients))
        elif hasattr(self.model, "named_steps") and "classifier" in self.model.named_steps:
            # For sklearn pipelines (like LogisticRegression with vectorizer)
            classifier = self.model.named_steps["classifier"]
            if hasattr(classifier, "coef_"):
                coefficients = np.abs(classifier.coef_[0])
                # NOTE(review): assumes the pipeline step is named "vectorizer" — confirm
                if hasattr(self.model.named_steps["vectorizer"], "get_feature_names_out"):
                    feature_names = self.model.named_steps["vectorizer"].get_feature_names_out()
                    # Take top features to avoid too many n-grams
                    top_indices = np.argsort(coefficients)[-20:]
                    return dict(zip(feature_names[top_indices], coefficients[top_indices]))
        return None

    def _get_feature_names(self) -> List[str]:
        """Get feature names (override in subclasses if needed)"""
        if hasattr(self.model, "feature_names_in_"):
            return list(self.model.feature_names_in_)
        return [f"feature_{i}" for i in range(100)]  # Default fallback

    def save(self, path: str) -> None:
        """Save the complete model with training history.

        Everything needed to restore the model (estimator, feature extractor,
        label encoder, tokenizer, config and histories) is bundled into a
        single joblib artifact.
        """
        model_data = {
            "model": self.model,
            "feature_extractor": self.feature_extractor,
            "label_encoder": self.label_encoder,
            "tokenizer": self.tokenizer,
            "config": self.config.to_dict(),
            "is_fitted": self.is_fitted,
            "training_history": self.training_history,
            "learning_curve_data": self.learning_curve_data,
        }
        joblib.dump(model_data, path)

    @classmethod
    def load(cls, path: str) -> "BaseModel":
        """Load a saved model with training history.

        Acts as an alternate constructor: rebuilds the instance from the
        artifact written by :meth:`save`.
        """
        model_data = joblib.load(path)
        # Recreate the model instance
        # Local import avoids a circular import at module load time
        from research.experiment import ExperimentConfig
        config = ExperimentConfig.from_dict(model_data["config"])
        instance = cls(config)
        # Restore state
        instance.model = model_data["model"]
        instance.feature_extractor = model_data["feature_extractor"]
        instance.label_encoder = model_data["label_encoder"]
        # .get() keeps compatibility with artifacts saved before these keys existed
        instance.tokenizer = model_data.get("tokenizer")
        instance.is_fitted = model_data["is_fitted"]
        instance.training_history = model_data.get("training_history", {})
        instance.learning_curve_data = model_data.get("learning_curve_data", {})
        return instance

    def plot_learning_curve(self, save_path: Optional[str] = None) -> str:
        """Plot and save learning curve.

        Returns:
            ``save_path`` when the figure was written to disk, otherwise
            ``""`` (no data, or figure shown interactively).
        """
        if not self.learning_curve_data:
            logging.warning("No learning curve data available")
            return ""
        plt.figure(figsize=(10, 6))
        data = self.learning_curve_data
        train_sizes = data["train_sizes"]
        train_scores = data["train_scores"]
        val_scores = data["val_scores"]
        # Std-dev bands default to zero when the experiment stored none
        train_std = data.get("train_scores_std", [0] * len(train_sizes))
        val_std = data.get("val_scores_std", [0] * len(train_sizes))
        # Plot learning curves
        plt.plot(train_sizes, train_scores, "o-", color="blue", label="Training Score")
        plt.fill_between(
            train_sizes,
            np.array(train_scores) - np.array(train_std),
            np.array(train_scores) + np.array(train_std),
            alpha=0.1,
            color="blue",
        )
        plt.plot(train_sizes, val_scores, "o-", color="red", label="Validation Score")
        plt.fill_between(
            train_sizes,
            np.array(val_scores) - np.array(val_std),
            np.array(val_scores) + np.array(val_std),
            alpha=0.1,
            color="red",
        )
        plt.xlabel("Training Set Size")
        plt.ylabel("Accuracy Score")
        plt.title(f"Learning Curve - {self.__class__.__name__}")
        plt.legend(loc="best")
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close()
            return save_path
        else:
            plt.show()
            return ""

    def plot_training_history(self, save_path: Optional[str] = None) -> str:
        """Plot training history for neural networks.

        Draws accuracy and loss per epoch side by side. Returns
        ``save_path`` when saved, otherwise ``""``.
        """
        if not self.training_history:
            logging.warning("No training history available")
            return ""
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))
        # Plot accuracy
        if "accuracy" in self.training_history:
            axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
        if "val_accuracy" in self.training_history:
            axes[0].plot(self.training_history["val_accuracy"], label="Validation Accuracy")
        axes[0].set_title("Model Accuracy")
        axes[0].set_xlabel("Epoch")
        axes[0].set_ylabel("Accuracy")
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)
        # Plot loss
        if "loss" in self.training_history:
            axes[1].plot(self.training_history["loss"], label="Training Loss")
        if "val_loss" in self.training_history:
            axes[1].plot(self.training_history["val_loss"], label="Validation Loss")
        axes[1].set_title("Model Loss")
        axes[1].set_xlabel("Epoch")
        axes[1].set_ylabel("Loss")
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)
        plt.tight_layout()
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close()
            return save_path
        else:
            plt.show()
            return ""
+91
View File
@@ -0,0 +1,91 @@
from dataclasses import dataclass, field, asdict
from enum import Enum
from typing import List, Dict, Any, Optional
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from .feature_extractor import FeatureType
@dataclass
class ExperimentConfig:
    """Configuration for a single experiment.

    Captures everything needed to reproduce a run: model choice and
    hyper-parameters, features to extract, data filters, split settings
    and the evaluation metrics to compute.
    """

    # Experiment metadata
    name: str
    description: str = ""
    tags: List[str] = field(default_factory=list)
    # Model configuration
    model_type: str = "logistic_regression"  # logistic_regression, lstm, transformer, etc.
    model_params: Dict[str, Any] = field(default_factory=dict)
    # Feature configuration
    features: List[FeatureType] = field(default_factory=lambda: [FeatureType.FULL_NAME])
    feature_params: Dict[str, Any] = field(default_factory=dict)
    # Data configuration
    train_data_filter: Optional[Dict[str, Any]] = None  # Filter criteria for training data
    test_data_filter: Optional[Dict[str, Any]] = None
    target_column: str = "sex"
    # Training configuration
    test_size: float = 0.2
    random_seed: int = 42
    cross_validation_folds: int = 5
    # Evaluation configuration
    metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"])

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a JSON-serializable dictionary.

        Returns:
            Dict with ``features`` converted from FeatureType enums to
            their string values.
        """
        result = asdict(self)
        # Convert enums to strings
        result["features"] = [f.value for f in self.features]
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentConfig":
        """Create from a dictionary produced by :meth:`to_dict`.

        The caller's dict is left unmodified (a shallow copy is taken
        before the ``features`` strings are converted back to enums).
        """
        # BUGFIX: previously mutated the caller's dict in place
        data = dict(data)
        if "features" in data:
            data["features"] = [FeatureType(f) for f in data["features"]]
        return cls(**data)
class ExperimentStatus(Enum):
    """Experiment execution status"""

    PENDING = "pending"  # created but not started yet
    RUNNING = "running"  # currently executing
    COMPLETED = "completed"  # finished successfully
    FAILED = "failed"  # raised an error during execution
    CANCELLED = "cancelled"  # stopped before completion
def calculate_metrics(
    y_true: np.ndarray, y_pred: np.ndarray, metrics: Optional[List[str]] = None
) -> Dict[str, float]:
    """Calculate the requested classification metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.
        metrics: Names to compute; any of "accuracy", "precision",
            "recall", "f1". Defaults to all four.

    Returns:
        Mapping of metric name to a plain ``float``. Precision, recall
        and F1 use support-weighted averaging.
    """
    if metrics is None:
        metrics = ["accuracy", "precision", "recall", "f1"]
    results = {}
    if "accuracy" in metrics:
        # float() so the declared Dict[str, float] holds (not numpy scalars),
        # keeping results JSON-serializable downstream
        results["accuracy"] = float(accuracy_score(y_true, y_pred))
    if any(m in metrics for m in ["precision", "recall", "f1"]):
        # One sklearn call computes all three; weighted average accounts
        # for class imbalance
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="weighted"
        )
        if "precision" in metrics:
            results["precision"] = float(precision)
        if "recall" in metrics:
            results["recall"] = float(recall)
        if "f1" in metrics:
            results["f1"] = float(f1)
    return results
+56
View File
@@ -0,0 +1,56 @@
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional, Dict, List, Any
from research.experiment import ExperimentConfig, ExperimentStatus
@dataclass
class ExperimentResult:
    """Results from an experiment execution.

    Holds execution metadata, artifact paths, metrics and data statistics
    for one run; round-trips through :meth:`to_dict` / :meth:`from_dict`
    for JSON persistence.
    """

    experiment_id: str
    config: ExperimentConfig
    # Execution metadata
    start_time: datetime
    end_time: Optional[datetime] = None
    status: ExperimentStatus = ExperimentStatus.PENDING
    error_message: Optional[str] = None
    # Model artifacts
    model_path: Optional[str] = None
    feature_extractor_path: Optional[str] = None
    # Metrics
    train_metrics: Dict[str, float] = field(default_factory=dict)
    test_metrics: Dict[str, float] = field(default_factory=dict)
    cv_metrics: Dict[str, float] = field(default_factory=dict)
    # Additional results
    confusion_matrix: Optional[List[List[int]]] = None
    feature_importance: Optional[Dict[str, float]] = None
    prediction_examples: Optional[List[Dict]] = None
    # Data statistics
    train_size: int = 0
    test_size: int = 0
    class_distribution: Dict[str, int] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a JSON-serializable dictionary.

        Datetimes become ISO-8601 strings and the status enum becomes
        its string value.
        """
        result = asdict(self)
        result["config"] = self.config.to_dict()
        result["start_time"] = self.start_time.isoformat()
        result["end_time"] = self.end_time.isoformat() if self.end_time else None
        result["status"] = self.status.value
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentResult":
        """Create from a dictionary produced by :meth:`to_dict`.

        The caller's dict is left unmodified.
        """
        # BUGFIX: previously mutated the caller's dict in place
        data = dict(data)
        data["config"] = ExperimentConfig.from_dict(data["config"])
        data["start_time"] = datetime.fromisoformat(data["start_time"])
        # .get() tolerates records saved without an end_time key
        end_time = data.get("end_time")
        data["end_time"] = datetime.fromisoformat(end_time) if end_time else None
        data["status"] = ExperimentStatus(data["status"])
        return cls(**data)
+123
View File
@@ -0,0 +1,123 @@
from typing import List
from research.experiment import ExperimentConfig
from research.experiment.feature_extractor import FeatureType
class ExperimentBuilder:
    """Helper class to build experiment configurations.

    Each static method returns a ready-to-run list of
    :class:`ExperimentConfig` objects for a themed study.
    """

    @staticmethod
    def create_baseline_experiments() -> List[ExperimentConfig]:
        """Create a set of baseline experiments for comparison"""
        return [
            # Full name experiments
            ExperimentConfig(
                name="baseline_logistic_regression_fullname",
                description="Logistic regression with full name",
                model_type="logistic_regression",
                features=[FeatureType.FULL_NAME],
                tags=["baseline", "fullname"],
            ),
            # Native name only
            ExperimentConfig(
                name="baseline_logistic_regression_native",
                description="Logistic regression with native name only",
                model_type="logistic_regression",
                features=[FeatureType.NATIVE_NAME],
                tags=["baseline", "native"],
            ),
            # Surname only
            ExperimentConfig(
                name="baseline_logistic_regression_surname",
                description="Logistic regression with surname only",
                model_type="logistic_regression",
                features=[FeatureType.SURNAME],
                tags=["baseline", "surname"],
            ),
            # Random Forest with engineered features
            ExperimentConfig(
                name="baseline_rf_engineered",
                description="Random Forest with engineered features",
                model_type="random_forest",
                features=[FeatureType.NAME_LENGTH, FeatureType.WORD_COUNT, FeatureType.PROVINCE],
                tags=["baseline", "engineered"],
            ),
        ]

    @staticmethod
    def create_feature_ablation_study() -> List[ExperimentConfig]:
        """Create experiments for feature ablation study.

        One experiment per base feature, each trained on all base
        features except the removed one.
        """
        base_features = [
            FeatureType.FULL_NAME,
            FeatureType.NAME_LENGTH,
            FeatureType.WORD_COUNT,
            FeatureType.PROVINCE,
        ]
        experiments = []
        # Test removing each feature one by one
        # (was `enumerate`; the index was never used)
        for feature_to_remove in base_features:
            remaining_features = [f for f in base_features if f != feature_to_remove]
            experiments.append(
                ExperimentConfig(
                    name=f"ablation_remove_{feature_to_remove.value}",
                    description=f"Ablation study: removed {feature_to_remove.value}",
                    model_type="logistic_regression",
                    features=remaining_features,
                    tags=["ablation", feature_to_remove.value],
                )
            )
        return experiments

    @staticmethod
    def create_name_component_study() -> List[ExperimentConfig]:
        """Create experiments to study different name components.

        Trains one single-feature model per name component so their
        individual predictive power can be compared.
        """
        experiments = []
        name_components = [
            (FeatureType.FIRST_WORD, "first_word"),
            (FeatureType.LAST_WORD, "last_word"),
            (FeatureType.NATIVE_NAME, "native_name"),
            (FeatureType.SURNAME, "surname"),
            (FeatureType.NAME_BEGINNINGS, "name_beginnings"),
            (FeatureType.NAME_ENDINGS, "name_endings"),
        ]
        for feature, name in name_components:
            experiments.append(
                ExperimentConfig(
                    name=f"component_study_{name}",
                    description=f"Study of {name} for gender prediction",
                    model_type="logistic_regression",
                    features=[feature],
                    tags=["component_study", name],
                )
            )
        return experiments

    @staticmethod
    def create_province_specific_study() -> List[ExperimentConfig]:
        """Create experiments for province-specific analysis.

        Each experiment restricts the training data to a single province
        via ``train_data_filter``.
        """
        provinces = ["kinshasa", "bas-congo", "bandundu", "katanga"]  # Add more as needed
        experiments = []
        for province in provinces:
            experiments.append(
                ExperimentConfig(
                    name=f"province_study_{province}",
                    description=f"Gender prediction for {province} province only",
                    model_type="logistic_regression",
                    features=[FeatureType.FULL_NAME],
                    train_data_filter={"province": province},
                    tags=["province_study", province],
                )
            )
        return experiments
+238
View File
@@ -0,0 +1,238 @@
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from core.config import PipelineConfig
from core.utils import get_data_file_path
from core.utils.data_loader import DataLoader
from research.base_model import BaseModel
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
from research.experiment.experiment_tracker import ExperimentTracker
from research.model_registry import create_model
class ExperimentRunner:
    """Runs and manages experiments.

    Orchestrates a full experiment life-cycle: load and filter data,
    split, train, evaluate, persist the model, and record everything in
    the :class:`ExperimentTracker`.
    """

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.tracker = ExperimentTracker(self.config)
        self.data_loader = DataLoader(self.config)

    def run_experiment(self, experiment_config: ExperimentConfig) -> str:
        """Run a single experiment and return its experiment ID.

        On any failure the experiment is marked FAILED in the tracker and
        the exception is re-raised.
        """
        # Create experiment
        experiment_id = self.tracker.create_experiment(experiment_config)
        try:
            logging.info(f"Starting experiment: {experiment_id}")
            self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
            # Load data
            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
            df = self.data_loader.load_csv_complete(data_path)
            # Apply data filters if specified
            df = self._apply_data_filters(df, experiment_config)
            # Prepare target variable
            y = df[experiment_config.target_column]
            X = df
            # Split data (stratified so class proportions survive the split)
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=experiment_config.test_size,
                random_state=experiment_config.random_seed,
                stratify=y,
            )
            # Create and train model
            model = create_model(experiment_config)
            model.fit(X_train, y_train)
            # Make predictions
            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)
            # Calculate metrics
            train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics)
            test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics)
            # Cross-validation if requested
            cv_metrics = {}
            if experiment_config.cross_validation_folds > 1:
                cv_metrics = model.cross_validate(
                    X_train, y_train, experiment_config.cross_validation_folds
                )
            # Additional analysis
            conf_matrix = confusion_matrix(y_test, test_pred).tolist()
            feature_importance = model.get_feature_importance()
            # Create prediction examples
            prediction_examples = self._create_prediction_examples(
                X_test, y_test, test_pred, model, n_examples=10
            )
            # Calculate class distribution
            class_distribution = y.value_counts().to_dict()
            # Save model
            model_path = self._save_model(model, experiment_id)
            # Update experiment with results
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.COMPLETED,
                end_time=datetime.now(),
                model_path=str(model_path),
                train_metrics=train_metrics,
                test_metrics=test_metrics,
                cv_metrics=cv_metrics,
                confusion_matrix=conf_matrix,
                feature_importance=feature_importance,
                prediction_examples=prediction_examples,
                train_size=len(X_train),
                test_size=len(X_test),
                class_distribution=class_distribution,
            )
            logging.info(f"Experiment {experiment_id} completed successfully")
            # BUGFIX: formatting the 'N/A' fallback with ':.4f' raised
            # ValueError whenever 'accuracy' was not among the metrics
            accuracy = test_metrics.get("accuracy")
            logging.info(
                "Test accuracy: %s",
                f"{accuracy:.4f}" if accuracy is not None else "N/A",
            )
            return experiment_id
        except Exception as e:
            logging.error(f"Experiment {experiment_id} failed: {str(e)}")
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.FAILED,
                end_time=datetime.now(),
                error_message=str(e),
            )
            raise

    def run_experiment_batch(self, experiments: List[ExperimentConfig]) -> List[str]:
        """Run multiple experiments.

        Failures are logged and skipped; returns the IDs of the
        experiments that ran (successfully or not past creation).
        """
        experiment_ids = []
        for i, config in enumerate(experiments):
            logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}")
            try:
                exp_id = self.run_experiment(config)
                experiment_ids.append(exp_id)
            except Exception as e:
                logging.error(f"Failed to run experiment {config.name}: {e}")
                continue
        return experiment_ids

    @classmethod
    def _apply_data_filters(cls, df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame:
        """Apply data filters specified in the experiment config.

        Supported criteria per column: a list (membership), a dict with
        "min"/"max" bounds, or a scalar (equality). Unknown columns are
        silently ignored. Returns a filtered copy.
        """
        filtered_df = df.copy()
        # Apply training data filters
        if config.train_data_filter:
            for column, criteria in config.train_data_filter.items():
                if column in filtered_df.columns:
                    if isinstance(criteria, list):
                        filtered_df = filtered_df[filtered_df[column].isin(criteria)]
                    elif isinstance(criteria, dict):
                        if "min" in criteria:
                            filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
                        if "max" in criteria:
                            filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
                    else:
                        filtered_df = filtered_df[filtered_df[column] == criteria]
        return filtered_df

    @classmethod
    def _create_prediction_examples(
        cls,
        X_test: pd.DataFrame,
        y_test: pd.Series,
        predictions: np.ndarray,
        model: BaseModel,
        n_examples: int = 10,
    ) -> List[Dict]:
        """Create prediction examples for analysis.

        Samples up to ``n_examples`` rows, half incorrect and half
        correct, and records name, true/predicted label and (for
        traditional models) the prediction confidence.
        """
        examples = []
        # Get both correct and incorrect predictions
        correct_mask = y_test == predictions
        incorrect_indices = X_test[~correct_mask].index[: n_examples // 2]
        correct_indices = X_test[correct_mask].index[: n_examples // 2]
        sample_indices = list(incorrect_indices) + list(correct_indices)
        for idx in sample_indices[:n_examples]:
            # Resolve the positional index once per row (was looked up twice)
            pos = X_test.index.get_loc(idx)
            example = {
                "name": X_test.loc[idx, "name"] if "name" in X_test.columns else "N/A",
                "true_label": y_test.loc[idx],
                "predicted_label": predictions[pos],
                "correct": y_test.loc[idx] == predictions[pos],
            }
            # Add probability if available
            if model.architecture == "traditional":
                proba = model.predict_proba(X_test.loc[[idx]])
                example["prediction_confidence"] = float(proba.max())
            examples.append(example)
        return examples

    def _save_model(self, model: BaseModel, experiment_id: str) -> Path:
        """Save the trained model under models/experiments/<id>/model.joblib."""
        model_dir = self.config.paths.models_dir / "experiments" / experiment_id
        model_dir.mkdir(parents=True, exist_ok=True)
        model_path = model_dir / "model.joblib"
        model.save(str(model_path))
        return model_path

    def load_experiment_model(self, experiment_id: str) -> Optional[BaseModel]:
        """Load a model from a completed experiment, or None if unavailable."""
        experiment = self.tracker.get_experiment(experiment_id)
        if experiment and experiment.model_path:
            return BaseModel.load(experiment.model_path)
        return None

    def compare_experiments(
        self, experiment_ids: List[str], metric: str = "accuracy"
    ) -> pd.DataFrame:
        """Compare experiments, sorted descending by ``test_<metric>`` when present."""
        comparison_df = self.tracker.compare_experiments(experiment_ids)
        if f"test_{metric}" in comparison_df.columns:
            comparison_df = comparison_df.sort_values(f"test_{metric}", ascending=False)
        return comparison_df

    def get_feature_analysis(self, experiment_id: str) -> Optional[pd.DataFrame]:
        """Get feature importance for an experiment as a sorted DataFrame.

        Returns None when the experiment is missing or recorded no
        feature importance.
        """
        experiment = self.tracker.get_experiment(experiment_id)
        if experiment and experiment.feature_importance:
            importance_df = pd.DataFrame(
                [
                    {"feature": feature, "importance": importance}
                    for feature, importance in experiment.feature_importance.items()
                ]
            )
            return importance_df.sort_values("importance", ascending=False)
        return None
+194
View File
@@ -0,0 +1,194 @@
import hashlib
import json
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, List
import pandas as pd
from core.config import PipelineConfig, get_config
from research.experiment import ExperimentConfig, ExperimentStatus
from research.experiment.experiement_result import ExperimentResult
class ExperimentTracker:
    """Tracks and manages experiments.

    Persists every :class:`ExperimentResult` to a single JSON file
    (``outputs/experiments/experiments.json``) and offers querying,
    comparison and CSV export on top of that store.
    """

    def __init__(self, config: Optional[PipelineConfig] = None):
        self.config = config or get_config()
        self.experiments_dir = self.config.paths.outputs_dir / "experiments"
        self.experiments_dir.mkdir(parents=True, exist_ok=True)
        self.results_db_path = self.experiments_dir / "experiments.json"
        # In-memory view of the JSON store, keyed by experiment ID
        self._results: Dict[str, ExperimentResult] = {}
        self._load_results()

    def _load_results(self) -> None:
        """Load existing experiment results from the JSON store, if any."""
        if self.results_db_path.exists():
            try:
                with open(self.results_db_path, "r") as f:
                    data = json.load(f)
                for exp_id, exp_data in data.items():
                    self._results[exp_id] = ExperimentResult.from_dict(exp_data)
            except Exception as e:
                # Best-effort: a corrupt store must not prevent startup
                print(f"Warning: Failed to load experiment results: {e}")

    def _save_results(self) -> None:
        """Write all experiment results back to the JSON store."""
        data = {exp_id: result.to_dict() for exp_id, result in self._results.items()}
        with open(self.results_db_path, "w") as f:
            # default=str stringifies anything to_dict left non-serializable
            json.dump(data, f, indent=2, default=str)

    def create_experiment(self, config: ExperimentConfig) -> str:
        """Create a new experiment and return its ID.

        The ID is ``<name>_<timestamp>_<config-hash>``; the hash makes
        identical configs recognizable across runs.
        """
        # Generate experiment ID
        config_hash = hashlib.md5(
            json.dumps(config.to_dict(), sort_keys=True).encode()
        ).hexdigest()[:8]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        experiment_id = f"{config.name}_{timestamp}_{config_hash}"
        # Create result object
        result = ExperimentResult(
            experiment_id=experiment_id, config=config, start_time=datetime.now()
        )
        self._results[experiment_id] = result
        self._save_results()
        return experiment_id

    def update_experiment(self, experiment_id: str, **updates) -> None:
        """Update an experiment's result fields and persist the store.

        Unknown experiment IDs and unknown field names are silently
        ignored.
        """
        if experiment_id in self._results:
            result = self._results[experiment_id]
            for key, value in updates.items():
                if hasattr(result, key):
                    setattr(result, key, value)
            self._save_results()

    def get_experiment(self, experiment_id: str) -> Optional[ExperimentResult]:
        """Get experiment by ID, or None when unknown."""
        return self._results.get(experiment_id)

    def list_experiments(
        self,
        status: Optional[ExperimentStatus] = None,
        tags: Optional[List[str]] = None,
        model_type: Optional[str] = None,
    ) -> List[ExperimentResult]:
        """List experiments with optional filtering.

        Args:
            status: Keep only experiments in this status.
            tags: Keep experiments sharing at least one of these tags.
            model_type: Keep only this model type.

        Returns:
            Matching results, newest first.
        """
        results = list(self._results.values())
        if status:
            results = [r for r in results if r.status == status]
        if tags:
            results = [r for r in results if any(tag in r.config.tags for tag in tags)]
        if model_type:
            results = [r for r in results if r.config.model_type == model_type]
        return sorted(results, key=lambda x: x.start_time, reverse=True)

    def get_best_experiment(
        self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
    ) -> Optional[ExperimentResult]:
        """Get the completed experiment with the highest value of ``metric``.

        Args:
            metric: Metric name to rank by.
            dataset: "test" ranks by test metrics; anything else by train.
            filters: Optional extra filters ("model_type", "features").

        Returns:
            Best matching experiment, or None when nothing qualifies.
        """
        experiments = self.list_experiments()
        if filters:
            # Apply additional filters
            if "model_type" in filters:
                experiments = [
                    e for e in experiments if e.config.model_type == filters["model_type"]
                ]
            if "features" in filters:
                experiments = [
                    e
                    for e in experiments
                    if any(f in e.config.features for f in filters["features"])
                ]
        valid_experiments = []
        for exp in experiments:
            # Only completed runs with the requested metric recorded count
            if exp.status == ExperimentStatus.COMPLETED:
                metrics_dict = exp.test_metrics if dataset == "test" else exp.train_metrics
                if metric in metrics_dict:
                    valid_experiments.append((exp, metrics_dict[metric]))
        if not valid_experiments:
            return None
        return max(valid_experiments, key=lambda x: x[1])[0]

    def compare_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
        """Compare multiple experiments in a DataFrame.

        One row per known ID with config summary plus flattened
        ``test_*`` and ``cv_*`` metric columns; unknown IDs are skipped.
        """
        rows = []
        for exp_id in experiment_ids:
            exp = self.get_experiment(exp_id)
            if exp:
                row = {
                    "experiment_id": exp_id,
                    "name": exp.config.name,
                    "model_type": exp.config.model_type,
                    "features": ",".join([f.value for f in exp.config.features]),
                    "status": exp.status.value,
                    "train_size": exp.train_size,
                    "test_size": exp.test_size,
                }
                # Add metrics
                for metric, value in exp.test_metrics.items():
                    row[f"test_{metric}"] = value
                for metric, value in exp.cv_metrics.items():
                    row[f"cv_{metric}"] = value
                rows.append(row)
        return pd.DataFrame(rows)

    def export_results(self, output_path: Optional[Path] = None) -> Path:
        """Export all results to CSV and return the written path.

        Defaults to a timestamped file inside the experiments directory.
        """
        if output_path is None:
            output_path = (
                self.experiments_dir
                / f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            )
        rows = []
        for exp in self._results.values():
            row = {
                "experiment_id": exp.experiment_id,
                "name": exp.config.name,
                "description": exp.config.description,
                "model_type": exp.config.model_type,
                "features": ",".join([f.value for f in exp.config.features]),
                "status": exp.status.value,
                "start_time": exp.start_time.isoformat(),
                "end_time": exp.end_time.isoformat() if exp.end_time else None,
                "train_size": exp.train_size,
                "test_size": exp.test_size,
            }
            # Add all metrics
            for metric, value in exp.test_metrics.items():
                row[f"test_{metric}"] = value
            for metric, value in exp.cv_metrics.items():
                row[f"cv_{metric}"] = value
            rows.append(row)
        df = pd.DataFrame(rows)
        df.to_csv(output_path, index=False)
        return output_path
+90
View File
@@ -0,0 +1,90 @@
from enum import Enum
from typing import List, Dict, Any, Union
import pandas as pd
class FeatureType(Enum):
    """Types of features that can be extracted from names"""

    # Raw text features (fed to the model's vectorizer as-is)
    FULL_NAME = "full_name"
    NATIVE_NAME = "native_name"
    SURNAME = "surname"
    FIRST_WORD = "first_word"
    LAST_WORD = "last_word"
    # Numeric features
    NAME_LENGTH = "name_length"
    WORD_COUNT = "word_count"
    # Categorical feature
    PROVINCE = "province"
    # N-gram features: the extractor passes the raw name through and the
    # model's own vectorizer builds the n-grams
    CHAR_NGRAMS = "char_ngrams"
    WORD_NGRAMS = "word_ngrams"
    # Affix features (configurable length, see FeatureExtractor params)
    NAME_ENDINGS = "name_endings"
    NAME_BEGINNINGS = "name_beginnings"
class FeatureExtractor:
    """Extract different types of features from name data.

    Assumes the input DataFrame follows the project schema with columns
    such as "name", "identified_name", "probable_native",
    "identified_surname", "probable_surname", "words" and "province" —
    missing columns raise a KeyError at extraction time.
    """

    def __init__(self, feature_types: List[FeatureType], feature_params: Dict[str, Any] | None = None):
        """Configure the extractor.

        Args:
            feature_types: Which features to produce, in order.
            feature_params: Optional tuning knobs, e.g. "ending_length"
                and "beginning_length" for affix features.
        """
        self.feature_types = feature_types
        # `= None` default with eager {} fallback avoids a shared mutable default
        self.feature_params = feature_params or {}

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract all configured features.

        Returns:
            A DataFrame aligned on ``df``'s index, one column per scalar
            feature (named by the enum value) — multi-column features are
            concatenated as-is.
        """
        features_df = pd.DataFrame(index=df.index)
        for feature_type in self.feature_types:
            feature_data = self._extract_single_feature(df, feature_type)
            if isinstance(feature_data, pd.DataFrame):
                features_df = pd.concat([features_df, feature_data], axis=1)
            else:
                features_df[feature_type.value] = feature_data
        return features_df

    def _extract_single_feature(
        self, df: pd.DataFrame, feature_type: FeatureType
    ) -> Union[pd.Series, pd.DataFrame]:
        """Extract a single type of feature.

        Raises:
            ValueError: For an unrecognized ``feature_type``.
        """
        if feature_type == FeatureType.FULL_NAME:
            return df["name"].fillna("")
        elif feature_type == FeatureType.NATIVE_NAME:
            # Prefer the curated column, fall back to the heuristic one
            return df["identified_name"].fillna(df["probable_native"]).fillna("")
        elif feature_type == FeatureType.SURNAME:
            return df["identified_surname"].fillna(df["probable_surname"]).fillna("")
        elif feature_type == FeatureType.FIRST_WORD:
            return df["name"].str.split().str[0].fillna("")
        elif feature_type == FeatureType.LAST_WORD:
            return df["name"].str.split().str[-1].fillna("")
        elif feature_type == FeatureType.NAME_LENGTH:
            return df["name"].str.len().fillna(0)
        elif feature_type == FeatureType.WORD_COUNT:
            # NOTE(review): reads a precomputed "words" column rather than
            # counting from "name" — confirm upstream always provides it
            return df["words"].fillna(1)
        elif feature_type == FeatureType.PROVINCE:
            return df["province"].fillna("unknown")
        elif feature_type == FeatureType.NAME_ENDINGS:
            n = self.feature_params.get("ending_length", 3)
            return df["name"].str[-n:].fillna("")
        elif feature_type == FeatureType.NAME_BEGINNINGS:
            n = self.feature_params.get("beginning_length", 3)
            return df["name"].str[:n].fillna("")
        elif feature_type == FeatureType.CHAR_NGRAMS:
            # This will be handled by the model's vectorizer
            return df["name"].fillna("")
        elif feature_type == FeatureType.WORD_NGRAMS:
            # This will be handled by the model's vectorizer
            return df["name"].fillna("")
        else:
            raise ValueError(f"Unknown feature type: {feature_type}")
+44
View File
@@ -0,0 +1,44 @@
from typing import List
from research.base_model import BaseModel
from research.experiment import ExperimentConfig
from research.models.bigru_model import BiGRUModel
from research.models.cnn_model import CNNModel
from research.models.ensemble_model import EnsembleModel
from research.models.lightgbm_model import LightGBMModel
from research.models.logistic_regression_model import LogisticRegressionModel
from research.models.lstm_model import LSTMModel
from research.models.naive_bayes_model import NaiveBayesModel
from research.models.random_forest_model import RandomForestModel
from research.models.svm_model import SVMModel
from research.models.transformer_model import TransformerModel
from research.models.xgboost_model import XGBoostModel
# Maps ExperimentConfig.model_type strings to their model classes.
# create_model() looks types up here; list_available_models() exposes the keys.
MODEL_REGISTRY = {
    "bigru": BiGRUModel,
    "cnn": CNNModel,
    "ensemble": EnsembleModel,
    "lightgbm": LightGBMModel,
    "logistic_regression": LogisticRegressionModel,
    "lstm": LSTMModel,
    "naive_bayes": NaiveBayesModel,
    "random_forest": RandomForestModel,
    "svm": SVMModel,
    "transformer": TransformerModel,
    "xgboost": XGBoostModel,
}
def create_model(config: ExperimentConfig) -> BaseModel:
    """Instantiate the model class registered for ``config.model_type``.

    Raises:
        ValueError: If the model type is not present in MODEL_REGISTRY.
    """
    if config.model_type not in MODEL_REGISTRY:
        raise ValueError(f"Unknown model type: {config.model_type}")
    return MODEL_REGISTRY[config.model_type](config)
def list_available_models() -> List[str]:
    """Return the model type names accepted by create_model(), in registry order."""
    return [model_name for model_name in MODEL_REGISTRY]
+281
View File
@@ -0,0 +1,281 @@
import json
import logging
from datetime import datetime
from typing import List, Dict, Any
import pandas as pd
from core.config import get_config
from core.utils import get_data_file_path
from core.utils.data_loader import DataLoader
from research.experiment import FeatureType, ExperimentConfig
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
class ModelTrainer:
    """Comprehensive model training and artifact management.

    Wraps the experiment runner/tracker to train models, persist their
    artifacts (serialized model, config, results, learning-curve plots and
    metadata) under ``config.paths.models_dir``, and load them back later.
    """

    def __init__(self, config=None):
        """Create a trainer; falls back to the global config when none given."""
        self.config = config or get_config()
        self.data_loader = DataLoader(self.config)
        self.experiment_runner = ExperimentRunner(self.config)
        self.experiment_tracker = ExperimentTracker(self.config)
        self.logger = logging.getLogger(__name__)
        # Setup model artifacts directory
        self.models_dir = self.config.paths.models_dir
        self.models_dir.mkdir(parents=True, exist_ok=True)

    def train_single_model(
        self,
        model_name: str,
        model_type: str = "logistic_regression",
        features: List[str] = None,
        model_params: Dict[str, Any] = None,
        save_artifacts: bool = True,
    ) -> str:
        """
        Train a single model and save its artifacts.

        Args:
            model_name: Human-readable experiment name.
            model_type: Key into the model registry (see research.model_registry).
            features: Feature names convertible to FeatureType; defaults to
                ["full_name"].
            model_params: Hyper-parameters forwarded to the model.
            save_artifacts: When True, persist all artifacts after training.

        Returns the experiment ID.
        """
        self.logger.info(f"Training {model_type} model: {model_name}")
        if features is None:
            features = ["full_name"]
        feature_types = [FeatureType(f) for f in features]
        # Create experiment configuration
        config = ExperimentConfig(
            name=model_name,
            description=f"Training {model_type} model with features: {', '.join(features)}",
            model_type=model_type,
            features=feature_types,
            model_params=model_params or {},
            tags=["training", model_type],
        )
        # Run experiment
        experiment_id = self.experiment_runner.run_experiment(config)
        experiment = self.experiment_tracker.get_experiment(experiment_id)
        if experiment and experiment.test_metrics:
            self.logger.info("Training completed successfully!")
            self.logger.info(f" Experiment ID: {experiment_id}")
            self.logger.info(f" Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}")
            self.logger.info(f" Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")
        if save_artifacts:
            self.save_model_artifacts(experiment_id)
        return experiment_id

    def train_multiple_models(
        self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
    ) -> List[str]:
        """
        Train multiple models with different configurations.

        Each entry in ``model_configs`` must carry a "model_type" and may
        carry "features" and "model_params".  Failures are logged and
        skipped so one bad config does not abort the batch.
        Returns the experiment IDs of the successfully trained models.
        """
        self.logger.info(f"Training {len(model_configs)} models...")
        experiment_ids = []
        for i, config in enumerate(model_configs):
            model_name = f"{base_name}_{config['model_type']}_{i + 1}"
            try:
                exp_id = self.train_single_model(
                    model_name=model_name,
                    model_type=config["model_type"],
                    features=config.get("features", ["full_name"]),
                    model_params=config.get("model_params", {}),
                    save_artifacts=save_all,
                )
                experiment_ids.append(exp_id)
            except Exception as e:
                # Best-effort batch training: log and move on to the next config.
                self.logger.error(f"Failed to train {model_name}: {e}")
                continue
        self.logger.info(f"Completed training {len(experiment_ids)} models successfully")
        return experiment_ids

    def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]:
        """
        Save model artifacts in a structured way for easy loading.

        Writes the serialized model, config, results, optional learning-curve
        plots/data and a metadata index into ``models_dir/<experiment_id>/``.
        Returns paths to saved artifacts.

        Raises:
            ValueError: if the experiment or its trained model cannot be found.
        """
        experiment = self.experiment_tracker.get_experiment(experiment_id)
        if not experiment:
            raise ValueError(f"Experiment {experiment_id} not found")
        # Create model-specific directory
        model_dir = self.models_dir / experiment_id
        model_dir.mkdir(parents=True, exist_ok=True)
        # Load the trained model
        trained_model = self.experiment_runner.load_experiment_model(experiment_id)
        if not trained_model:
            raise ValueError(f"Could not load model for experiment {experiment_id}")
        # Save complete model with joblib
        model_path = model_dir / "complete_model.joblib"
        trained_model.save(str(model_path))
        # Save model configuration (json is imported at module level)
        config_path = model_dir / "model_config.json"
        with open(config_path, "w") as f:
            json.dump(experiment.config.to_dict(), f, indent=2)
        # Save experiment results
        results_path = model_dir / "experiment_results.json"
        with open(results_path, "w") as f:
            json.dump(experiment.to_dict(), f, indent=2, default=str)
        # Generate and save learning curves (best-effort; failures only warn)
        learning_curve_path = None
        training_history_path = None
        try:
            # Load data for learning curve generation
            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
            if data_path.exists():
                df = self.data_loader.load_csv_complete(data_path)
                # Generate learning curve
                self.logger.info("Generating learning curve...")
                trained_model.generate_learning_curve(df, df[experiment.config.target_column])
                # Plot and save learning curve
                learning_curve_path = model_dir / "learning_curve.png"
                trained_model.plot_learning_curve(str(learning_curve_path))
                # Plot and save training history (for neural networks)
                if trained_model.training_history:
                    training_history_path = model_dir / "training_history.png"
                    trained_model.plot_training_history(str(training_history_path))
                # Save learning curve data as JSON
                learning_data_path = model_dir / "learning_curve_data.json"
                with open(learning_data_path, "w") as f:
                    json.dump(trained_model.learning_curve_data, f, indent=2)
                # Save training history data as JSON
                if trained_model.training_history:
                    history_data_path = model_dir / "training_history_data.json"
                    with open(history_data_path, "w") as f:
                        json.dump(trained_model.training_history, f, indent=2)
        except Exception as e:
            self.logger.warning(f"Could not generate learning curves: {e}")
        # Save artifacts metadata
        # NOTE(review): assumes experiment.test_metrics is a dict here — confirm
        # it can never be None for a completed experiment.
        metadata = {
            "experiment_id": experiment_id,
            "model_name": experiment.config.name,
            "model_type": experiment.config.model_type,
            "features": [f.value for f in experiment.config.features],
            "training_date": datetime.now().isoformat(),
            "test_accuracy": experiment.test_metrics.get("accuracy", 0),
            "test_f1": experiment.test_metrics.get("f1", 0),
            "model_path": str(model_path),
            "config_path": str(config_path),
            "results_path": str(results_path),
            "learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
            "training_history_plot": str(training_history_path) if training_history_path else None,
            "has_learning_curve": bool(trained_model.learning_curve_data),
            "has_training_history": bool(trained_model.training_history),
        }
        metadata_path = model_dir / "metadata.json"
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=2)
        self.logger.info(f"Model artifacts saved to: {model_dir}")
        self.logger.info(f" - Complete model: {model_path.name}")
        self.logger.info(f" - Configuration: {config_path.name}")
        self.logger.info(f" - Results: {results_path.name}")
        self.logger.info(f" - Metadata: {metadata_path.name}")
        if learning_curve_path and learning_curve_path.exists():
            self.logger.info(f" - Learning curve: {learning_curve_path.name}")
        if training_history_path and training_history_path.exists():
            self.logger.info(f" - Training history: {training_history_path.name}")
        return {
            "model_dir": str(model_dir),
            "model_path": str(model_path),
            "config_path": str(config_path),
            "results_path": str(results_path),
            "metadata_path": str(metadata_path),
            "learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
            "training_history_plot": str(training_history_path) if training_history_path else None,
        }

    def load_trained_model(self, experiment_id: str):
        """
        Load a previously trained model from artifacts.

        Resolves the model class via the registry using the saved metadata,
        then restores the complete model from joblib.

        Raises:
            FileNotFoundError: if no artifacts exist for ``experiment_id``.
        """
        model_dir = self.models_dir / experiment_id
        model_path = model_dir / "complete_model.joblib"
        if not model_path.exists():
            raise FileNotFoundError(f"Model artifacts not found for experiment {experiment_id}")
        # Load the model class dynamically
        metadata_path = model_dir / "metadata.json"
        with open(metadata_path, "r") as f:
            metadata = json.load(f)
        model_type = metadata["model_type"]
        from research.model_registry import MODEL_REGISTRY
        model_class = MODEL_REGISTRY[model_type]
        # Load the complete model
        loaded_model = model_class.load(str(model_path))
        self.logger.info(f"Loaded model: {metadata['model_name']}")
        self.logger.info(f" Type: {model_type}")
        self.logger.info(f" Accuracy: {metadata['test_accuracy']:.4f}")
        return loaded_model

    def list_saved_models(self) -> pd.DataFrame:
        """
        List all saved model artifacts.

        Scans ``models_dir`` for per-experiment metadata.json files and
        returns a DataFrame (newest first); empty DataFrame when none exist.
        """
        models_data = []
        for model_dir in self.models_dir.iterdir():
            if model_dir.is_dir():
                metadata_path = model_dir / "metadata.json"
                if metadata_path.exists():
                    try:
                        with open(metadata_path, "r") as f:
                            metadata = json.load(f)
                        models_data.append(metadata)
                    except Exception as e:
                        # Corrupt/unreadable metadata should not abort the listing.
                        self.logger.warning(f"Could not read metadata for {model_dir.name}: {e}")
        if not models_data:
            self.logger.info("No saved models found.")
            return pd.DataFrame()
        df = pd.DataFrame(models_data)
        # Format the display
        display_columns = [
            "model_name",
            "model_type",
            "features",
            "test_accuracy",
            "test_f1",
            "training_date",
        ]
        available_columns = [col for col in display_columns if col in df.columns]
        return df[available_columns].sort_values("training_date", ascending=False)
View File
+56
View File
@@ -0,0 +1,56 @@
from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from research.neural_network_model import NeuralNetworkModel
class BiGRUModel(NeuralNetworkModel):
    """Two-layer bidirectional GRU classifier over tokenized name text."""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        """Assemble and compile the stacked BiGRU network (2-way softmax head)."""
        embedding_dim = kwargs.get("embedding_dim", 64)
        gru_units = kwargs.get("gru_units", 32)
        # Note the differing defaults: GRU-internal dropout defaults to 0.2,
        # the dense-head dropout to 0.5; both read the same "dropout" key.
        gru_dropout = kwargs.get("dropout", 0.2)
        head_dropout = kwargs.get("dropout", 0.5)
        layers = [
            Embedding(input_dim=vocab_size, output_dim=embedding_dim),
            Bidirectional(GRU(gru_units, return_sequences=True, dropout=gru_dropout)),
            Bidirectional(GRU(gru_units, dropout=gru_dropout)),
            Dense(64, activation="relu"),
            Dropout(head_dropout),
            Dense(2, activation="softmax"),
        ]
        network = Sequential(layers)
        network.compile(
            loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        return network

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Tokenize the configured text feature columns into padded sequences.

        Fits a word-level tokenizer on first use and reuses it afterwards.
        """
        corpus = []
        for feature in self.config.features:
            if feature.value in X.columns:
                corpus.extend(X[feature.value].astype(str).tolist())
        if not corpus:
            raise ValueError("No text data found in the provided DataFrame.")
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(corpus)
        # Only the first len(X) collected strings line up row-for-row with X.
        encoded = self.tokenizer.texts_to_sequences(corpus[: len(X)])
        sequence_length = self.config.model_params.get("max_len", 6)
        return pad_sequences(encoded, maxlen=sequence_length, padding="post")
+75
View File
@@ -0,0 +1,75 @@
from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import (
Embedding,
Conv1D,
MaxPooling1D,
GlobalMaxPooling1D,
Dense,
Dropout,
)
from tensorflow.keras.models import Sequential
from research.neural_network_model import NeuralNetworkModel
class CNNModel(NeuralNetworkModel):
    """1D Convolutional Neural Network over character-level sequences."""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 20, **kwargs) -> Any:
        """Build and compile the two-stage Conv1D classifier."""
        embedding_dim = kwargs.get("embedding_dim", 64)
        n_filters = kwargs.get("filters", 64)
        kernel = kwargs.get("kernel_size", 3)
        network = Sequential(
            [
                Embedding(input_dim=vocab_size, output_dim=embedding_dim),
                Conv1D(filters=n_filters, kernel_size=kernel, activation="relu"),
                MaxPooling1D(pool_size=2),
                Conv1D(filters=n_filters, kernel_size=kernel, activation="relu"),
                GlobalMaxPooling1D(),
                Dense(64, activation="relu"),
                Dropout(kwargs.get("dropout", 0.5)),
                Dense(2, activation="softmax"),
            ]
        )
        network.compile(
            loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        return network

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Turn extracted text features into padded character-level sequences.

        ``X`` already contains the columns produced by the FeatureExtractor.
        A char-level tokenizer is fitted on first use and reused afterwards.
        """
        from tensorflow.keras.preprocessing.sequence import pad_sequences
        from tensorflow.keras.preprocessing.text import Tokenizer

        corpus = []
        for feature in self.config.features:
            if feature.value in X.columns:
                corpus.extend(X[feature.value].astype(str).tolist())
        if not corpus:
            # Defensive fallback; a properly configured FeatureExtractor
            # should always yield at least one text column.
            corpus = [""] * len(X)
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=True, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(corpus)
        encoded = self.tokenizer.texts_to_sequences(corpus[: len(X)])
        # Character sequences are longer than word sequences, hence the 20 default.
        sequence_length = self.config.model_params.get("max_len", 20)
        return pad_sequences(encoded, maxlen=sequence_length, padding="post")
+97
View File
@@ -0,0 +1,97 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from research.experiment import ExperimentConfig
from research.traditional_model import TraditionalModel
class EnsembleModel(TraditionalModel):
    """Ensemble model combining multiple base models via a VotingClassifier."""

    @property
    def architecture(self) -> str:
        """Return the architecture type"""
        return "ensemble"

    def __init__(self, config: ExperimentConfig):
        super().__init__(config)
        # NOTE(review): these two attributes are not referenced elsewhere in
        # this class; kept for backward compatibility with serialized models.
        self.base_models = []
        self.model_weights = None

    def build_model(self) -> BaseEstimator:
        """Build a VotingClassifier over per-type text-classification pipelines.

        ``model_params["base_models"]`` selects which base learners to include
        (subset of logistic_regression / random_forest / naive_bayes);
        ``model_params["voting"]`` chooses 'soft' (default) or 'hard' voting.
        Each base learner gets its own char-n-gram vectorizer so raw text can
        be fed straight into the ensemble.
        """
        params = self.config.model_params
        base_model_types = params.get(
            "base_models", ["logistic_regression", "random_forest", "naive_bayes"]
        )
        # Create base models with simplified configs
        estimators = []
        for model_type in base_model_types:
            if model_type == "logistic_regression":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            CountVectorizer(analyzer="char", ngram_range=(2, 4), max_features=5000),
                        ),
                        (
                            "classifier",
                            LogisticRegression(max_iter=1000, random_state=self.config.random_seed),
                        ),
                    ]
                )
                # fixed: these names were pointless f-strings (no placeholders)
                estimators.append(("logistic_regression", model))
            elif model_type == "random_forest":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            TfidfVectorizer(analyzer="char", ngram_range=(2, 3), max_features=3000),
                        ),
                        (
                            "classifier",
                            RandomForestClassifier(
                                n_estimators=50, random_state=self.config.random_seed
                            ),
                        ),
                    ]
                )
                estimators.append(("rf", model))
            elif model_type == "naive_bayes":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            CountVectorizer(analyzer="char", ngram_range=(1, 3), max_features=4000),
                        ),
                        ("classifier", MultinomialNB()),
                    ]
                )
                estimators.append(("nb", model))
        voting_type = params.get("voting", "soft")  # 'hard' or 'soft'
        return VotingClassifier(estimators=estimators, voting=voting_type)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Concatenate the configured text feature columns into one string per row.

        Vectorization happens inside each base pipeline, so this returns raw
        strings (space-joined when several text features are configured).
        """
        text_features = []
        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_features.append(X[feature_type.value].astype(str))
        if len(text_features) == 1:
            return text_features[0].values
        else:
            combined = text_features[0].astype(str)
            for feature in text_features[1:]:
                combined = combined + " " + feature.astype(str)
            return combined.values
+51
View File
@@ -0,0 +1,51 @@
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from research.traditional_model import TraditionalModel
class LightGBMModel(TraditionalModel):
    """LightGBM over a mix of numeric, char-n-gram and categorical features."""

    def build_model(self) -> BaseEstimator:
        """Create the LGBMClassifier from ``config.model_params`` (with defaults)."""
        params = self.config.model_params
        return lgb.LGBMClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", -1),
            learning_rate=params.get("learning_rate", 0.1),
            num_leaves=params.get("num_leaves", 31),
            subsample=params.get("subsample", 0.8),
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
            verbose=-1,  # silence LightGBM's per-iteration logging
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Build the numeric feature matrix for the configured features.

        Transformers (vectorizers / label encoders) are fitted on the FIRST
        call and reused afterwards, so the feature space seen at prediction
        time matches training.  (Previously a fresh transformer was fitted on
        every call, silently producing inconsistent columns between
        ``fit`` and ``predict``.)
        """
        transformers = getattr(self, "_feature_transformers", None)
        if transformers is None:
            transformers = self._feature_transformers = {}
        features = []
        for feature_type in self.config.features:
            name = feature_type.value
            if name not in X.columns:
                continue
            column = X[name]
            if name in ["name_length", "word_count"]:
                # Plain numeric features
                features.append(column.fillna(0).values.reshape(-1, 1))
            elif name in ["full_name", "native_name", "surname"]:
                # Character n-grams for text features
                text = column.fillna("").astype(str)
                vectorizer = transformers.get(name)
                if vectorizer is None:
                    vectorizer = CountVectorizer(
                        analyzer="char", ngram_range=(2, 3), max_features=50
                    )
                    vectorizer.fit(text)
                    transformers[name] = vectorizer
                features.append(vectorizer.transform(text).toarray())
            else:
                # Categorical features: integer-encode, mapping values unseen
                # during fitting onto the first known class instead of raising.
                values = column.fillna("unknown").astype(str)
                encoder = transformers.get(name)
                if encoder is None:
                    encoder = LabelEncoder()
                    encoder.fit(values)
                    transformers[name] = encoder
                known = set(encoder.classes_)
                safe = values.where(values.isin(known), encoder.classes_[0])
                features.append(encoder.transform(safe).reshape(-1, 1))
        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
@@ -0,0 +1,44 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from research.traditional_model import TraditionalModel
class LogisticRegressionModel(TraditionalModel):
    """Logistic Regression over character n-gram counts."""

    def build_model(self) -> BaseEstimator:
        """Return a CountVectorizer -> LogisticRegression pipeline."""
        params = self.config.model_params
        steps = [
            (
                "vectorizer",
                CountVectorizer(
                    analyzer="char",
                    ngram_range=params.get("ngram_range", (2, 5)),
                    max_features=params.get("max_features", 10000),
                ),
            ),
            (
                "classifier",
                LogisticRegression(
                    max_iter=params.get("max_iter", 1000),
                    random_state=self.config.random_seed,
                ),
            ),
        ]
        return Pipeline(steps)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Concatenate the configured text feature columns into one string per row."""
        columns = [
            X[feature.value].astype(str)
            for feature in self.config.features
            if feature.value in X.columns
        ]
        if len(columns) == 1:
            return columns[0].values
        # Join multiple text features with a single-space separator.
        return columns[0].str.cat(columns[1:], sep=" ").values
+52
View File
@@ -0,0 +1,52 @@
from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from research.neural_network_model import NeuralNetworkModel
class LSTMModel(NeuralNetworkModel):
    """Stacked bidirectional LSTM classifier for sequence learning."""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        """Assemble and compile the BiLSTM network (2-way softmax head)."""
        embedding_dim = kwargs.get("embedding_dim", 64)
        units = kwargs.get("lstm_units", 32)
        network = Sequential(
            [
                Embedding(input_dim=vocab_size, output_dim=embedding_dim),
                Bidirectional(LSTM(units, return_sequences=True)),
                Bidirectional(LSTM(units)),
                Dense(64, activation="relu"),
                Dense(2, activation="softmax"),
            ]
        )
        network.compile(
            loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        return network

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Tokenize the configured text feature columns into padded sequences.

        Fits a word-level tokenizer on first use and reuses it afterwards.
        """
        corpus = []
        for feature in self.config.features:
            if feature.value in X.columns:
                corpus.extend(X[feature.value].astype(str).tolist())
        if not corpus:
            raise ValueError("No text data found in the provided DataFrame.")
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(corpus)
        # Only the first len(X) collected strings line up row-for-row with X.
        encoded = self.tokenizer.texts_to_sequences(corpus[: len(X)])
        sequence_length = self.config.model_params.get("max_len", 6)
        return pad_sequences(encoded, maxlen=sequence_length, padding="post")
+39
View File
@@ -0,0 +1,39 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from research.traditional_model import TraditionalModel
class NaiveBayesModel(TraditionalModel):
    """Multinomial Naive Bayes over character n-gram counts."""

    def build_model(self) -> BaseEstimator:
        """Return a CountVectorizer -> MultinomialNB pipeline."""
        params = self.config.model_params
        steps = [
            (
                "vectorizer",
                CountVectorizer(
                    analyzer="char",
                    ngram_range=params.get("ngram_range", (1, 4)),
                    max_features=params.get("max_features", 8000),
                ),
            ),
            ("classifier", MultinomialNB(alpha=params.get("alpha", 1.0))),
        ]
        return Pipeline(steps)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Concatenate the configured text feature columns into one string per row."""
        columns = [
            X[feature.value].astype(str)
            for feature in self.config.features
            if feature.value in X.columns
        ]
        if len(columns) == 1:
            return columns[0].values
        # Join multiple text features with a single-space separator.
        return columns[0].str.cat(columns[1:], sep=" ").values
+40
View File
@@ -0,0 +1,40 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from research.traditional_model import TraditionalModel
class RandomForestModel(TraditionalModel):
    """Random Forest over engineered (numeric + categorical) features."""

    def build_model(self) -> BaseEstimator:
        """Create the RandomForestClassifier from ``config.model_params``."""
        params = self.config.model_params
        return RandomForestClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", None),
            random_state=self.config.random_seed,
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Build the numeric feature matrix for the configured features.

        Label encoders are fitted on the FIRST call and reused afterwards so
        categorical codes are stable between ``fit`` and ``predict``.
        (Previously a fresh LabelEncoder was fitted on every call, so the
        same category could map to different integers at prediction time.)
        """
        encoders = getattr(self, "_feature_encoders", None)
        if encoders is None:
            encoders = self._feature_encoders = {}
        features = []
        for feature_type in self.config.features:
            name = feature_type.value
            if name not in X.columns:
                continue
            column = X[name]
            if name in ["name_length", "word_count"]:
                # Numerical features
                features.append(column.fillna(0).values.reshape(-1, 1))
            else:
                # Categorical features: integer-encode, mapping values unseen
                # during fitting onto the first known class instead of raising.
                values = column.fillna("unknown").astype(str)
                encoder = encoders.get(name)
                if encoder is None:
                    encoder = LabelEncoder()
                    encoder.fit(values)
                    encoders[name] = encoder
                known = set(encoder.classes_)
                safe = values.where(values.isin(known), encoder.classes_[0])
                features.append(encoder.transform(safe).reshape(-1, 1))
        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
+45
View File
@@ -0,0 +1,45 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from research.traditional_model import TraditionalModel
class SVMModel(TraditionalModel):
    """Support Vector Machine with character n-grams and RBF kernel"""

    def build_model(self) -> BaseEstimator:
        """Return a TfidfVectorizer -> SVC pipeline."""
        params = self.config.model_params
        steps = [
            (
                "vectorizer",
                TfidfVectorizer(
                    analyzer="char",
                    ngram_range=params.get("ngram_range", (2, 4)),
                    max_features=params.get("max_features", 5000),
                ),
            ),
            (
                "classifier",
                SVC(
                    kernel=params.get("kernel", "rbf"),
                    C=params.get("C", 1.0),
                    gamma=params.get("gamma", "scale"),
                    probability=True,  # enable probability prediction
                    random_state=self.config.random_seed,
                ),
            ),
        ]
        return Pipeline(steps)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Concatenate the configured text feature columns into one string per row."""
        columns = [
            X[feature.value].astype(str)
            for feature in self.config.features
            if feature.value in X.columns
        ]
        if len(columns) == 1:
            return columns[0].values
        # Join multiple text features with a single-space separator.
        return columns[0].str.cat(columns[1:], sep=" ").values
+82
View File
@@ -0,0 +1,82 @@
from typing import Any
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import (
Input,
Embedding,
Dense,
GlobalAveragePooling1D,
MultiHeadAttention,
Dropout,
LayerNormalization,
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from research.neural_network_model import NeuralNetworkModel
class TransformerModel(NeuralNetworkModel):
    """Transformer-based model.

    A single Transformer encoder block over token embeddings with learned
    positional embeddings, average-pooled and fed to a small dense head with
    a 2-way softmax output.
    """

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        """Build and compile the Transformer classifier.

        Args:
            vocab_size: Tokenizer vocabulary size (embedding input dimension).
            max_len: Fixed sequence length; also sizes the positional embedding.
            **kwargs: Hyper-parameters read with defaults below
                (embedding_dim, transformer_num_heads, transformer_head_size,
                transformer_ff_dim, dropout).
        """
        params = kwargs
        # Build Transformer model
        inputs = Input(shape=(max_len,))
        x = Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64))(inputs)
        # Add positional encoding: a learned position embedding of the same
        # width as the token embedding, added (broadcast over the batch).
        positions = tf.range(start=0, limit=max_len, delta=1)
        pos_embedding = Embedding(input_dim=max_len, output_dim=params.get("embedding_dim", 64))(
            positions
        )
        x = x + pos_embedding
        x = self._transformer_encoder(x, params)
        # Pool away the sequence dimension, then classify.
        x = GlobalAveragePooling1D()(x)
        x = Dense(32, activation="relu")(x)
        outputs = Dense(2, activation="softmax")(x)
        model = Model(inputs, outputs)
        model.compile(
            optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
        )
        return model

    @classmethod
    def _transformer_encoder(cls, x, cfg_params):
        """Transformer encoder block.

        Self-attention and feed-forward sublayers, each wrapped with dropout,
        a residual connection, and layer normalization applied AFTER the
        residual addition — keep this ordering intact.
        """
        attn = MultiHeadAttention(
            num_heads=cfg_params.get("transformer_num_heads", 2),
            key_dim=cfg_params.get("transformer_head_size", 64),
        )(x, x)
        x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(attn))
        ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
        # Project back to the input width so the residual addition is valid.
        ff = Dense(x.shape[-1])(ff)
        return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(ff))

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Tokenize the configured text feature columns into padded sequences.

        A word-level tokenizer is fitted on first call and reused afterwards.
        Only the first len(X) collected strings are encoded so rows line up
        with X; with multiple text features, later columns contribute to the
        vocabulary only.
        """
        text_data = []
        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_data.extend(X[feature_type.value].astype(str).tolist())
        if not text_data:
            raise ValueError("No text data found in the provided DataFrame.")
        # Initialize tokenizer if needed
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(oov_token="<OOV>")
            self.tokenizer.fit_on_texts(text_data)
        # Convert to sequences
        sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
        max_len = self.config.model_params.get("max_len", 6)
        return pad_sequences(sequences, maxlen=max_len, padding="post")
+52
View File
@@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from research.traditional_model import TraditionalModel
class XGBoostModel(TraditionalModel):
    """XGBoost over numeric, char-n-gram and categorical engineered features."""

    def build_model(self) -> BaseEstimator:
        """Create the XGBClassifier from ``config.model_params`` (with defaults)."""
        params = self.config.model_params
        return xgb.XGBClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", 6),
            learning_rate=params.get("learning_rate", 0.1),
            subsample=params.get("subsample", 0.8),
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
            eval_metric="logloss",
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Build the numeric feature matrix for the configured features.

        Transformers (vectorizers / label encoders) are fitted on the FIRST
        call and reused afterwards, so the feature space seen at prediction
        time matches training.  (Previously a fresh transformer was fitted on
        every call, silently producing inconsistent columns between
        ``fit`` and ``predict``.)
        """
        transformers = getattr(self, "_feature_transformers", None)
        if transformers is None:
            transformers = self._feature_transformers = {}
        features = []
        for feature_type in self.config.features:
            name = feature_type.value
            if name not in X.columns:
                continue
            column = X[name]
            if name in ["name_length", "word_count"]:
                # Numerical features
                features.append(column.fillna(0).values.reshape(-1, 1))
            elif name in ["full_name", "native_name", "surname"]:
                # Character-level n-gram features for names
                text = column.fillna("").astype(str)
                vectorizer = transformers.get(name)
                if vectorizer is None:
                    vectorizer = CountVectorizer(
                        analyzer="char", ngram_range=(2, 3), max_features=100
                    )
                    vectorizer.fit(text)
                    transformers[name] = vectorizer
                features.append(vectorizer.transform(text).toarray())
            else:
                # Categorical features: integer-encode, mapping values unseen
                # during fitting onto the first known class instead of raising.
                values = column.fillna("unknown").astype(str)
                encoder = transformers.get(name)
                if encoder is None:
                    encoder = LabelEncoder()
                    encoder.fit(values)
                    transformers[name] = encoder
                known = set(encoder.classes_)
                safe = values.where(values.isin(known), encoder.classes_[0])
                features.append(encoder.transform(safe).reshape(-1, 1))
        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
+201
View File
@@ -0,0 +1,201 @@
import logging
from abc import abstractmethod
from typing import Any, Dict, List
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from research.base_model import BaseModel
from research.experiment.feature_extractor import FeatureExtractor
class NeuralNetworkModel(BaseModel):
    """Base class for neural network models (TensorFlow/Keras).

    Subclasses supply the architecture via ``build_model_with_vocab`` and a
    ``prepare_features`` that fits ``self.tokenizer`` on first use; this class
    handles fitting, cross-validation and learning-curve generation.
    """

    @property
    def architecture(self) -> str:
        return "neural_network"

    @abstractmethod
    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
        """Build neural network model with known vocabulary size"""
        pass

    def _build_fresh_model(self) -> Any:
        """Construct a new, untrained network sized to the fitted tokenizer.

        Mirrors the construction done in ``fit`` so cross-validation folds and
        learning-curve runs train architecturally identical models.
        """
        vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
        max_len = self.config.model_params.get("max_len", 6)
        return self.build_model_with_vocab(
            vocab_size=vocab_size, max_len=max_len, **self.config.model_params
        )

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the neural network model with deferred building.

        The model is only built after feature preparation because the
        vocabulary size is unknown until the tokenizer has been fitted.
        """
        logging.info("Training %s", self.__class__.__name__)
        # Setup feature extraction
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )
        # Extract and prepare features (this will also initialize tokenizer)
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
        # Encode labels
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)
        # Now that the tokenizer exists, the model can be built.
        self.model = self._build_fresh_model()
        # Train the neural network
        history = self.model.fit(
            X_prepared,
            y_encoded,
            epochs=self.config.model_params.get("epochs", 10),
            batch_size=self.config.model_params.get("batch_size", 64),
            validation_split=0.1,
            verbose=1,
        )
        # Store training history for later plotting
        self.training_history = {
            "accuracy": history.history["accuracy"],
            "loss": history.history["loss"],
            "val_accuracy": history.history.get("val_accuracy", []),
            "val_loss": history.history.get("val_loss", []),
        }
        self.is_fitted = True
        return self

    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> dict[str, np.floating[Any]]:
        """Stratified k-fold CV; returns mean/std of accuracy, precision, recall, f1.

        Requires a prior ``fit`` so that the feature extractor, tokenizer and
        label encoder are already fitted.
        """
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
        y_encoded = self.label_encoder.transform(y)
        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed)
        accuracies = []
        precisions = []
        recalls = []
        f1_scores = []
        for train_idx, val_idx in cv.split(X_prepared, y_encoded):
            # Fresh, untrained model per fold.  (Was ``self.build_model()``,
            # which does not exist on this hierarchy and raised AttributeError.)
            fold_model = self._build_fresh_model()
            # Train on fold
            fold_model.fit(
                X_prepared[train_idx],
                y_encoded[train_idx],
                epochs=self.config.model_params.get("epochs", 10),
                batch_size=self.config.model_params.get("batch_size", 32),
                verbose=0,
            )
            # Predict on validation
            y_pred = fold_model.predict(X_prepared[val_idx])
            if len(y_pred.shape) > 1:
                # Softmax probabilities -> class indices
                y_pred = y_pred.argmax(axis=1)
            # Calculate metrics
            acc = accuracy_score(y_encoded[val_idx], y_pred)
            prec, rec, f1, _ = precision_recall_fscore_support(
                y_encoded[val_idx], y_pred, average="weighted"
            )
            accuracies.append(acc)
            precisions.append(prec)
            recalls.append(rec)
            f1_scores.append(f1)
        return {
            "accuracy": np.mean(accuracies),
            "accuracy_std": np.std(accuracies),
            "precision": np.mean(precisions),
            "precision_std": np.std(precisions),
            "recall": np.mean(recalls),
            "recall_std": np.std(recalls),
            "f1": np.mean(f1_scores),
            "f1_std": np.std(f1_scores),
        }

    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
    ) -> Dict[str, Any]:
        """Train on increasing subsets of the data to produce a learning curve.

        Fixes over the previous version: ``train_sizes`` now has a default (it
        was iterated even when None), features/labels are prepared before
        training (raw DataFrames cannot be fed to Keras), subsets are sampled
        from the prepared arrays with positional indexing, and fold models are
        built via ``build_model_with_vocab`` (``self.build_model`` does not
        exist on neural models).  Requires a prior ``fit``.
        """
        logging.info("Generating learning curve for %s", self.__class__.__name__)
        if train_sizes is None:
            train_sizes = [0.1, 0.25, 0.5, 0.75, 1.0]
        # Prepare features/labels once for all subset runs.
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
        y_encoded = self.label_encoder.transform(y)
        learning_curve_data = {
            "train_sizes": [],
            "train_scores": [],
            "val_scores": [],
            "train_scores_std": [],
            "val_scores_std": [],
        }
        # Split data once for validation
        X_train_full, X_val, y_train_full, y_val = train_test_split(
            X_prepared,
            y_encoded,
            test_size=0.2,
            random_state=self.config.random_seed,
            stratify=y_encoded,
        )
        rng = np.random.default_rng(self.config.random_seed)
        for size in train_sizes:
            train_size = int(len(X_train_full) * size)
            if train_size < 10:  # Minimum training size
                continue
            # Sample a random training subset (positional indexing on arrays)
            indices = rng.choice(len(X_train_full), train_size, replace=False)
            X_train_subset = X_train_full[indices]
            y_train_subset = y_train_full[indices]
            # Train multiple models for variance estimation
            train_scores = []
            val_scores = []
            for _ in range(3):  # 3 runs for variance
                # Build and train a fresh model
                model = self._build_fresh_model()
                model.fit(
                    X_train_subset,
                    y_train_subset,
                    epochs=self.config.model_params.get("epochs", 10),
                    batch_size=self.config.model_params.get("batch_size", 32),
                    validation_data=(X_val, y_val),
                    verbose=0,
                )
                # Evaluate on the subset and the held-out validation split
                train_pred = model.predict(X_train_subset)
                val_pred = model.predict(X_val)
                train_scores.append(accuracy_score(y_train_subset, train_pred.argmax(axis=1)))
                val_scores.append(accuracy_score(y_val, val_pred.argmax(axis=1)))
            learning_curve_data["train_sizes"].append(train_size)
            learning_curve_data["train_scores"].append(np.mean(train_scores))
            learning_curve_data["val_scores"].append(np.mean(val_scores))
            learning_curve_data["train_scores_std"].append(np.std(train_scores))
            learning_curve_data["val_scores_std"].append(np.std(val_scores))
        self.learning_curve_data = learning_curve_data
        return learning_curve_data
# ---------------------------------------------------------------------------
# Second file from the diff view (new file, +134 lines): TraditionalModel
# module — begins below with its own imports.
# ---------------------------------------------------------------------------
import logging
from abc import abstractmethod
from typing import Dict, Any, List
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import LabelEncoder
from research.base_model import BaseModel
from research.experiment.feature_extractor import FeatureExtractor
class TraditionalModel(BaseModel):
    """Base class for traditional ML models (scikit-learn compatible).

    Concrete subclasses implement only :meth:`build_model`; feature
    extraction, label encoding, cross-validation and learning-curve
    generation are shared here.
    """

    @property
    def architecture(self) -> str:
        """Architecture tag used by the experiment pipeline."""
        return "traditional"

    @abstractmethod
    def build_model(self) -> BaseEstimator:
        """Build and return the sklearn model instance"""
        pass

    def _ensure_feature_extractor(self) -> FeatureExtractor:
        """Create the feature extractor on first use and return it."""
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )
        return self.feature_extractor

    def _prepare(self, X: pd.DataFrame):
        """Extract features from raw input and convert to model-ready form."""
        features_df = self._ensure_feature_extractor().extract_features(X)
        return self.prepare_features(features_df)

    def _encode_labels(self, y: pd.Series):
        """Fit the label encoder on first use, then transform the labels."""
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            return self.label_encoder.fit_transform(y)
        return self.label_encoder.transform(y)

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the traditional ML model.

        Builds the estimator lazily, extracts/prepares features, encodes
        labels and trains.  Returns self for chaining.
        """
        logging.info(f"Training {self.__class__.__name__}")
        if self.model is None:
            self.model = self.build_model()
        X_prepared = self._prepare(X)
        y_encoded = self._encode_labels(y)
        self.model.fit(X_prepared, y_encoded)
        self.is_fitted = True
        return self

    def cross_validate(self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> Dict[str, float]:
        """Stratified cross-validation returning mean/std per metric.

        Robustness fix: model, feature extractor and label encoder are now
        created lazily (the original assumed fit() had already run and
        crashed with AttributeError on a fresh instance).
        """
        if self.model is None:
            self.model = self.build_model()
        X_prepared = self._prepare(X)
        y_encoded = self._encode_labels(y)

        cv = StratifiedKFold(
            n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
        )

        results: Dict[str, float] = {}

        # Accuracy is always reported; cross_val_score clones the estimator,
        # so the stored model instance is left untouched.
        accuracy_scores = cross_val_score(
            self.model, X_prepared, y_encoded, cv=cv, scoring="accuracy"
        )
        results["accuracy"] = accuracy_scores.mean()
        results["accuracy_std"] = accuracy_scores.std()

        # Remaining metrics only when requested by the experiment config.
        for metric in ["precision", "recall", "f1"]:
            if metric in self.config.metrics:
                scores = cross_val_score(
                    self.model, X_prepared, y_encoded, cv=cv, scoring=f"{metric}_weighted"
                )
                results[metric] = scores.mean()
                results[f"{metric}_std"] = scores.std()
        return results

    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] | None = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model.

        Uses sklearn's ``learning_curve`` with 3-fold CV; returns an empty
        dict (and logs a warning) if curve generation fails.
        """
        logging.info(f"Generating learning curve for {self.__class__.__name__}")
        if train_sizes is None:
            train_sizes = [0.1, 0.25, 0.5, 0.75, 1.0]

        X_prepared = self._prepare(X)
        y_encoded = self._encode_labels(y)

        try:
            # NOTE(review): random_state only takes effect in learning_curve
            # when shuffle=True (default False) — confirm intent.
            train_sizes_abs, train_scores, val_scores = learning_curve(
                self.build_model(),
                X_prepared,
                y_encoded,
                train_sizes=train_sizes,
                cv=3,  # 3-fold CV for speed
                scoring="accuracy",
                random_state=self.config.random_seed,
            )
            learning_curve_data = {
                "train_sizes": train_sizes_abs.tolist(),
                "train_scores": train_scores.mean(axis=1).tolist(),
                "val_scores": val_scores.mean(axis=1).tolist(),
                "train_scores_std": train_scores.std(axis=1).tolist(),
                "val_scores_std": val_scores.std(axis=1).tolist(),
            }
        except Exception as e:
            logging.warning(f"Could not generate learning curve: {e}")
            return {}

        self.learning_curve_data = learning_curve_data
        return learning_curve_data