refactoring: add initial pipeline configuration and model classes
@@ -0,0 +1,91 @@
from dataclasses import dataclass, field, asdict
from enum import Enum
from typing import List, Dict, Any, Optional

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from .feature_extractor import FeatureType


@dataclass
class ExperimentConfig:
    """Configuration for a single experiment"""

    # Experiment metadata
    name: str
    description: str = ""
    tags: List[str] = field(default_factory=list)

    # Model configuration
    model_type: str = "logistic_regression"  # logistic_regression, lstm, transformer, etc.
    model_params: Dict[str, Any] = field(default_factory=dict)

    # Feature configuration
    features: List[FeatureType] = field(default_factory=lambda: [FeatureType.FULL_NAME])
    feature_params: Dict[str, Any] = field(default_factory=dict)

    # Data configuration
    train_data_filter: Optional[Dict[str, Any]] = None  # Filter criteria for training data
    test_data_filter: Optional[Dict[str, Any]] = None
    target_column: str = "sex"

    # Training configuration
    test_size: float = 0.2
    random_seed: int = 42
    cross_validation_folds: int = 5

    # Evaluation configuration
    metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"])

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary for serialization"""
        result = asdict(self)
        # Convert enums to strings
        result["features"] = [f.value for f in self.features]
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentConfig":
        """Create from a dictionary"""
        if "features" in data:
            data["features"] = [FeatureType(f) for f in data["features"]]
        return cls(**data)


class ExperimentStatus(Enum):
    """Experiment execution status"""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"


def calculate_metrics(
    y_true: np.ndarray, y_pred: np.ndarray, metrics: Optional[List[str]] = None
) -> Dict[str, float]:
    """Calculate the specified metrics"""
    if metrics is None:
        metrics = ["accuracy", "precision", "recall", "f1"]

    results = {}

    if "accuracy" in metrics:
        results["accuracy"] = accuracy_score(y_true, y_pred)

    if any(m in metrics for m in ["precision", "recall", "f1"]):
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="weighted"
        )

        if "precision" in metrics:
            results["precision"] = precision
        if "recall" in metrics:
            results["recall"] = recall
        if "f1" in metrics:
            results["f1"] = f1

    return results
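
Note: a minimal usage sketch for the config and metrics helpers above (the package layout is the one implied by the imports in the files below, i.e. this file is importable as research.experiment):

    import numpy as np

    from research.experiment import ExperimentConfig, calculate_metrics
    from research.experiment.feature_extractor import FeatureType

    # Configs survive a dict round-trip; enum features serialize as strings.
    config = ExperimentConfig(name="demo", features=[FeatureType.FULL_NAME])
    restored = ExperimentConfig.from_dict(config.to_dict())
    assert restored.features == [FeatureType.FULL_NAME]

    # calculate_metrics returns only the requested metrics.
    y_true = np.array([0, 1, 1, 0])
    y_pred = np.array([0, 1, 0, 0])
    print(calculate_metrics(y_true, y_pred, ["accuracy", "f1"]))
    # {'accuracy': 0.75, 'f1': 0.733...}  (f1 is support-weighted)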
@@ -0,0 +1,56 @@
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional, Dict, List, Any

from research.experiment import ExperimentConfig, ExperimentStatus


@dataclass
class ExperimentResult:
    """Results from an experiment execution"""

    experiment_id: str
    config: ExperimentConfig

    # Execution metadata
    start_time: datetime
    end_time: Optional[datetime] = None
    status: ExperimentStatus = ExperimentStatus.PENDING
    error_message: Optional[str] = None

    # Model artifacts
    model_path: Optional[str] = None
    feature_extractor_path: Optional[str] = None

    # Metrics
    train_metrics: Dict[str, float] = field(default_factory=dict)
    test_metrics: Dict[str, float] = field(default_factory=dict)
    cv_metrics: Dict[str, float] = field(default_factory=dict)

    # Additional results
    confusion_matrix: Optional[List[List[int]]] = None
    feature_importance: Optional[Dict[str, float]] = None
    prediction_examples: Optional[List[Dict]] = None

    # Data statistics
    train_size: int = 0
    test_size: int = 0
    class_distribution: Dict[str, int] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary for serialization"""
        result = asdict(self)
        result["config"] = self.config.to_dict()
        result["start_time"] = self.start_time.isoformat()
        result["end_time"] = self.end_time.isoformat() if self.end_time else None
        result["status"] = self.status.value
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentResult":
        """Create from a dictionary"""
        data["config"] = ExperimentConfig.from_dict(data["config"])
        data["start_time"] = datetime.fromisoformat(data["start_time"])
        data["end_time"] = datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
        data["status"] = ExperimentStatus(data["status"])
        return cls(**data)
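
Note: a small sketch of the serialization round-trip this class is built for (module path as corrected in the tracker import below; the point illustrated is that to_dict() emits only JSON-safe values):

    import json
    from datetime import datetime

    from research.experiment import ExperimentConfig
    from research.experiment.experiment_result import ExperimentResult

    result = ExperimentResult(
        experiment_id="demo_001",
        config=ExperimentConfig(name="demo"),
        start_time=datetime.now(),
    )

    # ISO timestamps and enum string values make the payload JSON-safe.
    payload = json.dumps(result.to_dict())
    restored = ExperimentResult.from_dict(json.loads(payload))
    assert restored.experiment_id == "demo_001"
    assert restored.status is result.status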
@@ -0,0 +1,123 @@
from typing import List

from research.experiment import ExperimentConfig
from research.experiment.feature_extractor import FeatureType


class ExperimentBuilder:
    """Helper class to build experiment configurations"""

    @staticmethod
    def create_baseline_experiments() -> List[ExperimentConfig]:
        """Create a set of baseline experiments for comparison"""
        return [
            # Full name
            ExperimentConfig(
                name="baseline_logistic_regression_fullname",
                description="Logistic regression with full name",
                model_type="logistic_regression",
                features=[FeatureType.FULL_NAME],
                tags=["baseline", "fullname"],
            ),
            # Native name only
            ExperimentConfig(
                name="baseline_logistic_regression_native",
                description="Logistic regression with native name only",
                model_type="logistic_regression",
                features=[FeatureType.NATIVE_NAME],
                tags=["baseline", "native"],
            ),
            # Surname only
            ExperimentConfig(
                name="baseline_logistic_regression_surname",
                description="Logistic regression with surname only",
                model_type="logistic_regression",
                features=[FeatureType.SURNAME],
                tags=["baseline", "surname"],
            ),
            # Random forest with engineered features
            ExperimentConfig(
                name="baseline_rf_engineered",
                description="Random forest with engineered features",
                model_type="random_forest",
                features=[FeatureType.NAME_LENGTH, FeatureType.WORD_COUNT, FeatureType.PROVINCE],
                tags=["baseline", "engineered"],
            ),
        ]

    @staticmethod
    def create_feature_ablation_study() -> List[ExperimentConfig]:
        """Create experiments for a feature ablation study"""
        base_features = [
            FeatureType.FULL_NAME,
            FeatureType.NAME_LENGTH,
            FeatureType.WORD_COUNT,
            FeatureType.PROVINCE,
        ]

        experiments = []

        # Remove each feature in turn and keep the rest
        for feature_to_remove in base_features:
            remaining_features = [f for f in base_features if f != feature_to_remove]

            experiments.append(
                ExperimentConfig(
                    name=f"ablation_remove_{feature_to_remove.value}",
                    description=f"Ablation study: removed {feature_to_remove.value}",
                    model_type="logistic_regression",
                    features=remaining_features,
                    tags=["ablation", feature_to_remove.value],
                )
            )

        return experiments

    @staticmethod
    def create_name_component_study() -> List[ExperimentConfig]:
        """Create experiments to study different name components"""
        experiments = []

        name_components = [
            (FeatureType.FIRST_WORD, "first_word"),
            (FeatureType.LAST_WORD, "last_word"),
            (FeatureType.NATIVE_NAME, "native_name"),
            (FeatureType.SURNAME, "surname"),
            (FeatureType.NAME_BEGINNINGS, "name_beginnings"),
            (FeatureType.NAME_ENDINGS, "name_endings"),
        ]

        for feature, name in name_components:
            experiments.append(
                ExperimentConfig(
                    name=f"component_study_{name}",
                    description=f"Study of {name} for gender prediction",
                    model_type="logistic_regression",
                    features=[feature],
                    tags=["component_study", name],
                )
            )

        return experiments

    @staticmethod
    def create_province_specific_study() -> List[ExperimentConfig]:
        """Create experiments for province-specific analysis"""
        provinces = ["kinshasa", "bas-congo", "bandundu", "katanga"]  # Add more as needed

        experiments = []

        for province in provinces:
            experiments.append(
                ExperimentConfig(
                    name=f"province_study_{province}",
                    description=f"Gender prediction for {province} province only",
                    model_type="logistic_regression",
                    features=[FeatureType.FULL_NAME],
                    train_data_filter={"province": province},
                    tags=["province_study", province],
                )
            )

        return experiments
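
Note: a sketch of how the builder is meant to be consumed (the module path research.experiment.experiment_builder is an assumption; the class and method names come from the file above):

    from research.experiment.experiment_builder import ExperimentBuilder

    # Baselines plus one ablation run per removed feature.
    experiments = (
        ExperimentBuilder.create_baseline_experiments()
        + ExperimentBuilder.create_feature_ablation_study()
    )
    for config in experiments:
        print(config.name, config.tags)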
@@ -0,0 +1,238 @@
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from core.config import PipelineConfig
from core.utils import get_data_file_path
from core.utils.data_loader import DataLoader
from research.base_model import BaseModel
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
from research.experiment.experiment_tracker import ExperimentTracker
from research.model_registry import create_model


class ExperimentRunner:
    """Runs and manages experiments"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.tracker = ExperimentTracker(self.config)
        self.data_loader = DataLoader(self.config)

    def run_experiment(self, experiment_config: ExperimentConfig) -> str:
        """Run a single experiment and return its experiment ID"""
        experiment_id = self.tracker.create_experiment(experiment_config)

        try:
            logging.info(f"Starting experiment: {experiment_id}")
            self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)

            # Load data
            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
            df = self.data_loader.load_csv_complete(data_path)

            # Apply data filters if specified
            df = self._apply_data_filters(df, experiment_config)

            # Prepare target variable; the model extracts its own features
            # from the full frame
            y = df[experiment_config.target_column]
            X = df

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=experiment_config.test_size,
                random_state=experiment_config.random_seed,
                stratify=y,
            )

            # Create and train model
            model = create_model(experiment_config)
            model.fit(X_train, y_train)

            # Make predictions
            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)

            # Calculate metrics
            train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics)
            test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics)

            # Cross-validation if requested
            cv_metrics = {}
            if experiment_config.cross_validation_folds > 1:
                cv_metrics = model.cross_validate(
                    X_train, y_train, experiment_config.cross_validation_folds
                )

            # Additional analysis
            conf_matrix = confusion_matrix(y_test, test_pred).tolist()
            feature_importance = model.get_feature_importance()

            # Create prediction examples
            prediction_examples = self._create_prediction_examples(
                X_test, y_test, test_pred, model, n_examples=10
            )

            # Calculate class distribution
            class_distribution = y.value_counts().to_dict()

            # Save model
            model_path = self._save_model(model, experiment_id)

            # Update experiment with results
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.COMPLETED,
                end_time=datetime.now(),
                model_path=str(model_path),
                train_metrics=train_metrics,
                test_metrics=test_metrics,
                cv_metrics=cv_metrics,
                confusion_matrix=conf_matrix,
                feature_importance=feature_importance,
                prediction_examples=prediction_examples,
                train_size=len(X_train),
                test_size=len(X_test),
                class_distribution=class_distribution,
            )

            logging.info(f"Experiment {experiment_id} completed successfully")
            if "accuracy" in test_metrics:
                logging.info(f"Test accuracy: {test_metrics['accuracy']:.4f}")

            return experiment_id

        except Exception as e:
            logging.error(f"Experiment {experiment_id} failed: {e}")
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.FAILED,
                end_time=datetime.now(),
                error_message=str(e),
            )
            raise

    def run_experiment_batch(self, experiments: List[ExperimentConfig]) -> List[str]:
        """Run multiple experiments, skipping any that fail"""
        experiment_ids = []

        for i, config in enumerate(experiments):
            logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}")
            try:
                exp_id = self.run_experiment(config)
                experiment_ids.append(exp_id)
            except Exception as e:
                logging.error(f"Failed to run experiment {config.name}: {e}")
                continue

        return experiment_ids

    @staticmethod
    def _apply_data_filters(df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame:
        """Apply the data filters specified in the experiment config"""
        filtered_df = df.copy()

        # Apply training data filters
        if config.train_data_filter:
            for column, criteria in config.train_data_filter.items():
                if column in filtered_df.columns:
                    if isinstance(criteria, list):
                        filtered_df = filtered_df[filtered_df[column].isin(criteria)]
                    elif isinstance(criteria, dict):
                        if "min" in criteria:
                            filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
                        if "max" in criteria:
                            filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
                    else:
                        filtered_df = filtered_df[filtered_df[column] == criteria]

        return filtered_df

    @staticmethod
    def _create_prediction_examples(
        X_test: pd.DataFrame,
        y_test: pd.Series,
        predictions: np.ndarray,
        model: BaseModel,
        n_examples: int = 10,
    ) -> List[Dict]:
        """Create prediction examples for analysis"""
        examples = []

        # Sample both incorrect and correct predictions
        correct_mask = y_test == predictions
        incorrect_indices = X_test[~correct_mask].index[: n_examples // 2]
        correct_indices = X_test[correct_mask].index[: n_examples // 2]

        sample_indices = list(incorrect_indices) + list(correct_indices)

        for idx in sample_indices[:n_examples]:
            example = {
                "name": X_test.loc[idx, "name"] if "name" in X_test.columns else "N/A",
                "true_label": y_test.loc[idx],
                "predicted_label": predictions[X_test.index.get_loc(idx)],
                "correct": y_test.loc[idx] == predictions[X_test.index.get_loc(idx)],
            }

            # Add prediction confidence if the model exposes probabilities
            if model.architecture == "traditional":
                proba = model.predict_proba(X_test.loc[[idx]])
                example["prediction_confidence"] = float(proba.max())

            examples.append(example)

        return examples

    def _save_model(self, model: BaseModel, experiment_id: str) -> Path:
        """Save the trained model"""
        model_dir = self.config.paths.models_dir / "experiments" / experiment_id
        model_dir.mkdir(parents=True, exist_ok=True)

        model_path = model_dir / "model.joblib"
        model.save(str(model_path))

        return model_path

    def load_experiment_model(self, experiment_id: str) -> Optional[BaseModel]:
        """Load the model from a completed experiment"""
        experiment = self.tracker.get_experiment(experiment_id)

        if experiment and experiment.model_path:
            return BaseModel.load(experiment.model_path)

        return None

    def compare_experiments(
        self, experiment_ids: List[str], metric: str = "accuracy"
    ) -> pd.DataFrame:
        """Compare experiments, sorted by the given test metric when available"""
        comparison_df = self.tracker.compare_experiments(experiment_ids)

        if f"test_{metric}" in comparison_df.columns:
            comparison_df = comparison_df.sort_values(f"test_{metric}", ascending=False)

        return comparison_df

    def get_feature_analysis(self, experiment_id: str) -> Optional[pd.DataFrame]:
        """Get the feature importance analysis for an experiment"""
        experiment = self.tracker.get_experiment(experiment_id)

        if experiment and experiment.feature_importance:
            importance_df = pd.DataFrame(
                [
                    {"feature": feature, "importance": importance}
                    for feature, importance in experiment.feature_importance.items()
                ]
            )
            return importance_df.sort_values("importance", ascending=False)

        return None
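
Note: a hedged batch-run sketch (module paths for the runner and builder are assumptions; get_config comes from core.config, as imported by the tracker below):

    from core.config import get_config
    from research.experiment.experiment_builder import ExperimentBuilder
    from research.experiment.experiment_runner import ExperimentRunner

    runner = ExperimentRunner(get_config())

    # Failed experiments are logged and skipped, so one bad config
    # does not abort the whole batch.
    ids = runner.run_experiment_batch(ExperimentBuilder.create_baseline_experiments())
    print(runner.compare_experiments(ids, metric="accuracy"))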
@@ -0,0 +1,194 @@
import hashlib
import json
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, List

import pandas as pd

from core.config import PipelineConfig, get_config
from research.experiment import ExperimentConfig, ExperimentStatus
from research.experiment.experiment_result import ExperimentResult


class ExperimentTracker:
    """Tracks and manages experiments"""

    def __init__(self, config: Optional[PipelineConfig] = None):
        self.config = config or get_config()
        self.experiments_dir = self.config.paths.outputs_dir / "experiments"
        self.experiments_dir.mkdir(parents=True, exist_ok=True)

        self.results_db_path = self.experiments_dir / "experiments.json"
        self._results: Dict[str, ExperimentResult] = {}
        self._load_results()

    def _load_results(self):
        """Load existing experiment results"""
        if self.results_db_path.exists():
            try:
                with open(self.results_db_path, "r") as f:
                    data = json.load(f)

                for exp_id, exp_data in data.items():
                    self._results[exp_id] = ExperimentResult.from_dict(exp_data)
            except Exception as e:
                print(f"Warning: Failed to load experiment results: {e}")

    def _save_results(self):
        """Save experiment results to disk"""
        data = {exp_id: result.to_dict() for exp_id, result in self._results.items()}

        with open(self.results_db_path, "w") as f:
            json.dump(data, f, indent=2, default=str)

    def create_experiment(self, config: ExperimentConfig) -> str:
        """Create a new experiment and return its ID"""
        # Build the ID from the name, a timestamp, and a config hash
        config_hash = hashlib.md5(
            json.dumps(config.to_dict(), sort_keys=True).encode()
        ).hexdigest()[:8]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        experiment_id = f"{config.name}_{timestamp}_{config_hash}"

        # Create the result object
        result = ExperimentResult(
            experiment_id=experiment_id, config=config, start_time=datetime.now()
        )

        self._results[experiment_id] = result
        self._save_results()

        return experiment_id

    def update_experiment(self, experiment_id: str, **updates):
        """Update an experiment's results"""
        if experiment_id in self._results:
            result = self._results[experiment_id]

            for key, value in updates.items():
                if hasattr(result, key):
                    setattr(result, key, value)

            self._save_results()

    def get_experiment(self, experiment_id: str) -> Optional[ExperimentResult]:
        """Get an experiment by ID"""
        return self._results.get(experiment_id)

    def list_experiments(
        self,
        status: Optional[ExperimentStatus] = None,
        tags: Optional[List[str]] = None,
        model_type: Optional[str] = None,
    ) -> List[ExperimentResult]:
        """List experiments with optional filtering, newest first"""
        results = list(self._results.values())

        if status:
            results = [r for r in results if r.status == status]

        if tags:
            results = [r for r in results if any(tag in r.config.tags for tag in tags)]

        if model_type:
            results = [r for r in results if r.config.model_type == model_type]

        return sorted(results, key=lambda x: x.start_time, reverse=True)

    def get_best_experiment(
        self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
    ) -> Optional[ExperimentResult]:
        """Get the best completed experiment by a metric"""
        experiments = self.list_experiments()

        if filters:
            # Apply additional filters
            if "model_type" in filters:
                experiments = [
                    e for e in experiments if e.config.model_type == filters["model_type"]
                ]
            if "features" in filters:
                experiments = [
                    e
                    for e in experiments
                    if any(f in e.config.features for f in filters["features"])
                ]

        valid_experiments = []
        for exp in experiments:
            if exp.status == ExperimentStatus.COMPLETED:
                metrics_dict = exp.test_metrics if dataset == "test" else exp.train_metrics
                if metric in metrics_dict:
                    valid_experiments.append((exp, metrics_dict[metric]))

        if not valid_experiments:
            return None

        return max(valid_experiments, key=lambda x: x[1])[0]

    def compare_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
        """Compare multiple experiments in a DataFrame"""
        rows = []

        for exp_id in experiment_ids:
            exp = self.get_experiment(exp_id)
            if exp:
                row = {
                    "experiment_id": exp_id,
                    "name": exp.config.name,
                    "model_type": exp.config.model_type,
                    "features": ",".join([f.value for f in exp.config.features]),
                    "status": exp.status.value,
                    "train_size": exp.train_size,
                    "test_size": exp.test_size,
                }

                # Add metrics
                for metric, value in exp.test_metrics.items():
                    row[f"test_{metric}"] = value

                for metric, value in exp.cv_metrics.items():
                    row[f"cv_{metric}"] = value

                rows.append(row)

        return pd.DataFrame(rows)

    def export_results(self, output_path: Optional[Path] = None) -> Path:
        """Export all results to CSV"""
        if output_path is None:
            output_path = (
                self.experiments_dir
                / f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            )

        rows = []
        for exp in self._results.values():
            row = {
                "experiment_id": exp.experiment_id,
                "name": exp.config.name,
                "description": exp.config.description,
                "model_type": exp.config.model_type,
                "features": ",".join([f.value for f in exp.config.features]),
                "status": exp.status.value,
                "start_time": exp.start_time.isoformat(),
                "end_time": exp.end_time.isoformat() if exp.end_time else None,
                "train_size": exp.train_size,
                "test_size": exp.test_size,
            }

            # Add all metrics
            for metric, value in exp.test_metrics.items():
                row[f"test_{metric}"] = value

            for metric, value in exp.cv_metrics.items():
                row[f"cv_{metric}"] = value

            rows.append(row)

        df = pd.DataFrame(rows)
        df.to_csv(output_path, index=False)

        return output_path
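
Note: a query sketch against the tracker above (constructed with no arguments, since the config falls back to get_config()):

    from research.experiment import ExperimentStatus
    from research.experiment.experiment_tracker import ExperimentTracker

    tracker = ExperimentTracker()

    # Completed runs only, newest first.
    completed = tracker.list_experiments(status=ExperimentStatus.COMPLETED)

    # Highest test accuracy among logistic-regression runs, or None.
    best = tracker.get_best_experiment(
        metric="accuracy", dataset="test", filters={"model_type": "logistic_regression"}
    )
    if best is not None:
        print(best.experiment_id, best.test_metrics["accuracy"])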
@@ -0,0 +1,90 @@
from enum import Enum
from typing import List, Dict, Any, Optional, Union

import pandas as pd


class FeatureType(Enum):
    """Types of features that can be extracted from names"""

    FULL_NAME = "full_name"
    NATIVE_NAME = "native_name"
    SURNAME = "surname"
    FIRST_WORD = "first_word"
    LAST_WORD = "last_word"
    NAME_LENGTH = "name_length"
    WORD_COUNT = "word_count"
    PROVINCE = "province"
    CHAR_NGRAMS = "char_ngrams"
    WORD_NGRAMS = "word_ngrams"
    NAME_ENDINGS = "name_endings"
    NAME_BEGINNINGS = "name_beginnings"


class FeatureExtractor:
    """Extract different types of features from name data"""

    def __init__(
        self, feature_types: List[FeatureType], feature_params: Optional[Dict[str, Any]] = None
    ):
        self.feature_types = feature_types
        self.feature_params = feature_params or {}

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract all configured features"""
        features_df = pd.DataFrame(index=df.index)

        for feature_type in self.feature_types:
            feature_data = self._extract_single_feature(df, feature_type)

            if isinstance(feature_data, pd.DataFrame):
                features_df = pd.concat([features_df, feature_data], axis=1)
            else:
                features_df[feature_type.value] = feature_data

        return features_df

    def _extract_single_feature(
        self, df: pd.DataFrame, feature_type: FeatureType
    ) -> Union[pd.Series, pd.DataFrame]:
        """Extract a single type of feature"""
        if feature_type == FeatureType.FULL_NAME:
            return df["name"].fillna("")

        elif feature_type == FeatureType.NATIVE_NAME:
            return df["identified_name"].fillna(df["probable_native"]).fillna("")

        elif feature_type == FeatureType.SURNAME:
            return df["identified_surname"].fillna(df["probable_surname"]).fillna("")

        elif feature_type == FeatureType.FIRST_WORD:
            return df["name"].str.split().str[0].fillna("")

        elif feature_type == FeatureType.LAST_WORD:
            return df["name"].str.split().str[-1].fillna("")

        elif feature_type == FeatureType.NAME_LENGTH:
            return df["name"].str.len().fillna(0)

        elif feature_type == FeatureType.WORD_COUNT:
            return df["words"].fillna(1)

        elif feature_type == FeatureType.PROVINCE:
            return df["province"].fillna("unknown")

        elif feature_type == FeatureType.NAME_ENDINGS:
            n = self.feature_params.get("ending_length", 3)
            return df["name"].str[-n:].fillna("")

        elif feature_type == FeatureType.NAME_BEGINNINGS:
            n = self.feature_params.get("beginning_length", 3)
            return df["name"].str[:n].fillna("")

        elif feature_type == FeatureType.CHAR_NGRAMS:
            # Returned as raw text; n-gram vectorization is handled by the model
            return df["name"].fillna("")

        elif feature_type == FeatureType.WORD_NGRAMS:
            # Returned as raw text; n-gram vectorization is handled by the model
            return df["name"].fillna("")

        else:
            raise ValueError(f"Unknown feature type: {feature_type}")
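
Note: a toy extraction sketch for the feature extractor above (the sample rows are illustrative only):

    import pandas as pd

    from research.experiment.feature_extractor import FeatureExtractor, FeatureType

    df = pd.DataFrame({"name": ["Kabila Mwamba", "Tshala"], "province": ["katanga", None]})

    extractor = FeatureExtractor(
        [FeatureType.LAST_WORD, FeatureType.NAME_LENGTH, FeatureType.PROVINCE]
    )
    print(extractor.extract_features(df))
    # Roughly:
    #   last_word  name_length province
    # 0    Mwamba           13  katanga
    # 1    Tshala            6  unknown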