refactoring: add initial pipeline configuration and model classes

This commit is contained in:
2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
+91
View File
@@ -0,0 +1,91 @@
from dataclasses import dataclass, field, asdict
from enum import Enum
from typing import List, Dict, Any, Optional
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from .feature_extractor import FeatureType
@dataclass
class ExperimentConfig:
    """Configuration for a single experiment.

    Serializable via to_dict()/from_dict() so configs can be stored in the
    experiments JSON database and reconstructed later.
    """

    # Experiment metadata
    name: str
    description: str = ""
    tags: List[str] = field(default_factory=list)
    # Model configuration
    model_type: str = "logistic_regression"  # logistic_regression, lstm, transformer, etc.
    model_params: Dict[str, Any] = field(default_factory=dict)
    # Feature configuration
    features: List[FeatureType] = field(default_factory=lambda: [FeatureType.FULL_NAME])
    feature_params: Dict[str, Any] = field(default_factory=dict)
    # Data configuration
    train_data_filter: Optional[Dict[str, Any]] = None  # Filter criteria for training data
    test_data_filter: Optional[Dict[str, Any]] = None
    target_column: str = "sex"
    # Training configuration
    test_size: float = 0.2
    random_seed: int = 42
    cross_validation_folds: int = 5
    # Evaluation configuration
    metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"])

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a JSON-serializable dictionary.

        FeatureType enum members are replaced by their string values so the
        result round-trips through json.
        """
        result = asdict(self)
        # asdict() leaves Enum members untouched; convert them to strings.
        result["features"] = [f.value for f in self.features]
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentConfig":
        """Create a config from a dictionary (inverse of to_dict).

        The input mapping is not modified.
        """
        # Copy first: the original implementation rewrote data["features"]
        # in place, mutating the caller's dictionary — surprising for callers
        # that reuse the dict (e.g. when loading the results database).
        data = dict(data)
        if "features" in data:
            data["features"] = [FeatureType(f) for f in data["features"]]
        return cls(**data)
class ExperimentStatus(Enum):
    """Experiment execution status (lifecycle of a tracked run)."""

    PENDING = "pending"      # created but not yet started
    RUNNING = "running"      # currently executing
    COMPLETED = "completed"  # finished successfully
    FAILED = "failed"        # raised an error during execution
    CANCELLED = "cancelled"  # stopped before completion
def calculate_metrics(
    y_true: np.ndarray, y_pred: np.ndarray, metrics: Optional[List[str]] = None
) -> Dict[str, float]:
    """Calculate the requested classification metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as y_true.
        metrics: Metric names to compute; any of "accuracy", "precision",
            "recall", "f1". Defaults to all four.

    Returns:
        Mapping from metric name to its value as a plain float.
        Precision/recall/F1 use sklearn's "weighted" average, which weights
        per-class scores by class support.
    """
    if metrics is None:
        metrics = ["accuracy", "precision", "recall", "f1"]
    results: Dict[str, float] = {}
    if "accuracy" in metrics:
        # float() keeps the result a plain Python float (not numpy scalar),
        # matching the declared return type and staying JSON-friendly.
        results["accuracy"] = float(accuracy_score(y_true, y_pred))
    # precision/recall/f1 all come from one sklearn call; compute it once
    # if any of the three was requested.
    if any(m in metrics for m in ["precision", "recall", "f1"]):
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="weighted"
        )
        if "precision" in metrics:
            results["precision"] = float(precision)
        if "recall" in metrics:
            results["recall"] = float(recall)
        if "f1" in metrics:
            results["f1"] = float(f1)
    return results
+56
View File
@@ -0,0 +1,56 @@
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional, Dict, List, Any
from research.experiment import ExperimentConfig, ExperimentStatus
@dataclass
class ExperimentResult:
    """Results from an experiment execution.

    Mirrors ExperimentConfig's to_dict()/from_dict() protocol so results can
    be persisted to and restored from the experiments JSON database.
    """

    experiment_id: str
    config: ExperimentConfig
    # Execution metadata
    start_time: datetime
    end_time: Optional[datetime] = None
    status: ExperimentStatus = ExperimentStatus.PENDING
    error_message: Optional[str] = None
    # Model artifacts
    model_path: Optional[str] = None
    feature_extractor_path: Optional[str] = None
    # Metrics
    train_metrics: Dict[str, float] = field(default_factory=dict)
    test_metrics: Dict[str, float] = field(default_factory=dict)
    cv_metrics: Dict[str, float] = field(default_factory=dict)
    # Additional results
    confusion_matrix: Optional[List[List[int]]] = None
    feature_importance: Optional[Dict[str, float]] = None
    prediction_examples: Optional[List[Dict]] = None
    # Data statistics
    train_size: int = 0
    test_size: int = 0
    class_distribution: Dict[str, int] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a JSON-serializable dictionary."""
        result = asdict(self)
        # asdict() recurses into the nested config dataclass but does not
        # convert its FeatureType enums; use the config's own serializer.
        result["config"] = self.config.to_dict()
        result["start_time"] = self.start_time.isoformat()
        result["end_time"] = self.end_time.isoformat() if self.end_time else None
        result["status"] = self.status.value
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentResult":
        """Create a result from a dictionary (inverse of to_dict).

        The input mapping is not modified.
        """
        # Copy first: the original implementation rewrote the caller's dict
        # in place, which corrupts the source data for any later reuse.
        data = dict(data)
        data["config"] = ExperimentConfig.from_dict(data["config"])
        data["start_time"] = datetime.fromisoformat(data["start_time"])
        data["end_time"] = datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
        data["status"] = ExperimentStatus(data["status"])
        return cls(**data)
+123
View File
@@ -0,0 +1,123 @@
from typing import List
from research.experiment import ExperimentConfig
from research.experiment.feature_extractor import FeatureType
class ExperimentBuilder:
    """Helper class to build predefined experiment configurations.

    Each factory is a static method returning a list of ExperimentConfig
    objects ready to be fed to an experiment runner.
    """

    @staticmethod
    def create_baseline_experiments() -> List[ExperimentConfig]:
        """Create a set of baseline experiments for comparison."""
        return [
            # Full name experiments
            ExperimentConfig(
                name="baseline_logistic_regression_fullname",
                description="Logistic regression with full name",
                model_type="logistic_regression",
                features=[FeatureType.FULL_NAME],
                tags=["baseline", "fullname"],
            ),
            # Native name only
            ExperimentConfig(
                name="baseline_logistic_regression_native",
                description="Logistic regression with native name only",
                model_type="logistic_regression",
                features=[FeatureType.NATIVE_NAME],
                tags=["baseline", "native"],
            ),
            # Surname only
            ExperimentConfig(
                name="baseline_logistic_regression_surname",
                description="Logistic regression with surname only",
                model_type="logistic_regression",
                features=[FeatureType.SURNAME],
                tags=["baseline", "surname"],
            ),
            # Random Forest with engineered features
            ExperimentConfig(
                name="baseline_rf_engineered",
                description="Random Forest with engineered features",
                model_type="random_forest",
                features=[FeatureType.NAME_LENGTH, FeatureType.WORD_COUNT, FeatureType.PROVINCE],
                tags=["baseline", "engineered"],
            ),
        ]

    @staticmethod
    def create_feature_ablation_study() -> List[ExperimentConfig]:
        """Create leave-one-out feature ablation experiments.

        For each base feature, builds one experiment trained on all the other
        features, so the contribution of the removed feature can be measured.
        """
        base_features = [
            FeatureType.FULL_NAME,
            FeatureType.NAME_LENGTH,
            FeatureType.WORD_COUNT,
            FeatureType.PROVINCE,
        ]
        experiments = []
        # Remove each feature in turn and train on the remainder.
        # (Dropped the original enumerate(): the index was never used.)
        for feature_to_remove in base_features:
            remaining_features = [f for f in base_features if f != feature_to_remove]
            experiments.append(
                ExperimentConfig(
                    name=f"ablation_remove_{feature_to_remove.value}",
                    description=f"Ablation study: removed {feature_to_remove.value}",
                    model_type="logistic_regression",
                    features=remaining_features,
                    tags=["ablation", feature_to_remove.value],
                )
            )
        return experiments

    @staticmethod
    def create_name_component_study() -> List[ExperimentConfig]:
        """Create experiments that each use a single name component."""
        experiments = []
        name_components = [
            (FeatureType.FIRST_WORD, "first_word"),
            (FeatureType.LAST_WORD, "last_word"),
            (FeatureType.NATIVE_NAME, "native_name"),
            (FeatureType.SURNAME, "surname"),
            (FeatureType.NAME_BEGINNINGS, "name_beginnings"),
            (FeatureType.NAME_ENDINGS, "name_endings"),
        ]
        for feature, name in name_components:
            experiments.append(
                ExperimentConfig(
                    name=f"component_study_{name}",
                    description=f"Study of {name} for gender prediction",
                    model_type="logistic_regression",
                    features=[feature],
                    tags=["component_study", name],
                )
            )
        return experiments

    @staticmethod
    def create_province_specific_study() -> List[ExperimentConfig]:
        """Create per-province experiments (training data filtered by province)."""
        provinces = ["kinshasa", "bas-congo", "bandundu", "katanga"]  # Add more as needed
        experiments = []
        for province in provinces:
            experiments.append(
                ExperimentConfig(
                    name=f"province_study_{province}",
                    description=f"Gender prediction for {province} province only",
                    model_type="logistic_regression",
                    features=[FeatureType.FULL_NAME],
                    train_data_filter={"province": province},
                    tags=["province_study", province],
                )
            )
        return experiments
+238
View File
@@ -0,0 +1,238 @@
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from core.config import PipelineConfig
from core.utils import get_data_file_path
from core.utils.data_loader import DataLoader
from research.base_model import BaseModel
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
from research.experiment.experiment_tracker import ExperimentTracker
from research.model_registry import create_model
class ExperimentRunner:
    """Runs and manages experiments end to end.

    Loads the featured dataset, applies experiment-specific filters, trains
    the configured model, evaluates it, and records all artifacts and metrics
    through the ExperimentTracker.
    """

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.tracker = ExperimentTracker(self.config)
        self.data_loader = DataLoader(self.config)

    def run_experiment(self, experiment_config: ExperimentConfig) -> str:
        """Run a single experiment and return its experiment ID.

        On failure the experiment is marked FAILED (with the error message
        recorded in the tracker) and the exception is re-raised.
        """
        # Create experiment
        experiment_id = self.tracker.create_experiment(experiment_config)
        try:
            logging.info(f"Starting experiment: {experiment_id}")
            self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
            # Load data
            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
            df = self.data_loader.load_csv_complete(data_path)
            # Apply data filters if specified
            df = self._apply_data_filters(df, experiment_config)
            # Prepare target variable
            y = df[experiment_config.target_column]
            # NOTE(review): X is the full frame and still contains the target
            # column; this assumes the model performs its own feature
            # extraction and ignores the target — confirm against the model
            # implementations to rule out target leakage.
            X = df
            # Stratified split preserves class proportions in train and test.
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=experiment_config.test_size,
                random_state=experiment_config.random_seed,
                stratify=y,
            )
            # Create and train model
            model = create_model(experiment_config)
            model.fit(X_train, y_train)
            # Make predictions
            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)
            # Calculate metrics
            train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics)
            test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics)
            # Cross-validation if requested (folds <= 1 disables it)
            cv_metrics = {}
            if experiment_config.cross_validation_folds > 1:
                cv_metrics = model.cross_validate(
                    X_train, y_train, experiment_config.cross_validation_folds
                )
            # Additional analysis
            conf_matrix = confusion_matrix(y_test, test_pred).tolist()
            feature_importance = model.get_feature_importance()
            # Create prediction examples for qualitative inspection
            prediction_examples = self._create_prediction_examples(
                X_test, y_test, test_pred, model, n_examples=10
            )
            # Calculate class distribution over the full (filtered) dataset
            class_distribution = y.value_counts().to_dict()
            # Save model
            model_path = self._save_model(model, experiment_id)
            # Update experiment with results
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.COMPLETED,
                end_time=datetime.now(),
                model_path=str(model_path),
                train_metrics=train_metrics,
                test_metrics=test_metrics,
                cv_metrics=cv_metrics,
                confusion_matrix=conf_matrix,
                feature_importance=feature_importance,
                prediction_examples=prediction_examples,
                train_size=len(X_train),
                test_size=len(X_test),
                class_distribution=class_distribution,
            )
            logging.info(f"Experiment {experiment_id} completed successfully")
            # Bug fix: the original formatted test_metrics.get("accuracy", "N/A")
            # with ":.4f", which raises ValueError whenever accuracy is absent
            # (a str cannot take a float format spec). Only format real numbers.
            accuracy = test_metrics.get("accuracy")
            logging.info(
                "Test accuracy: %s",
                f"{accuracy:.4f}" if accuracy is not None else "N/A",
            )
            return experiment_id
        except Exception as e:
            logging.error(f"Experiment {experiment_id} failed: {str(e)}")
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.FAILED,
                end_time=datetime.now(),
                error_message=str(e),
            )
            raise

    def run_experiment_batch(self, experiments: List[ExperimentConfig]) -> List[str]:
        """Run multiple experiments, skipping (but logging) any that fail.

        Returns the IDs of the experiments that completed successfully.
        """
        experiment_ids = []
        for i, config in enumerate(experiments):
            logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}")
            try:
                exp_id = self.run_experiment(config)
                experiment_ids.append(exp_id)
            except Exception as e:
                # Deliberate best-effort: one failed experiment must not
                # abort the rest of the batch.
                logging.error(f"Failed to run experiment {config.name}: {e}")
                continue
        return experiment_ids

    @classmethod
    def _apply_data_filters(cls, df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame:
        """Apply the experiment's train_data_filter to *df*.

        Supported criteria per column: a list (membership), a dict with
        "min"/"max" bounds (inclusive), or a scalar (equality). Columns not
        present in the frame are silently skipped.
        """
        filtered_df = df.copy()
        # Apply training data filters
        if config.train_data_filter:
            for column, criteria in config.train_data_filter.items():
                if column in filtered_df.columns:
                    if isinstance(criteria, list):
                        filtered_df = filtered_df[filtered_df[column].isin(criteria)]
                    elif isinstance(criteria, dict):
                        if "min" in criteria:
                            filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
                        if "max" in criteria:
                            filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
                    else:
                        filtered_df = filtered_df[filtered_df[column] == criteria]
        return filtered_df

    @classmethod
    def _create_prediction_examples(
        cls,
        X_test: pd.DataFrame,
        y_test: pd.Series,
        predictions: np.ndarray,
        model: BaseModel,
        n_examples: int = 10,
    ) -> List[Dict]:
        """Sample up to *n_examples* predictions (half wrong, half right) for analysis."""
        examples = []
        # Get both correct and incorrect predictions
        correct_mask = y_test == predictions
        incorrect_indices = X_test[~correct_mask].index[: n_examples // 2]
        correct_indices = X_test[correct_mask].index[: n_examples // 2]
        sample_indices = list(incorrect_indices) + list(correct_indices)
        for idx in sample_indices[:n_examples]:
            example = {
                "name": X_test.loc[idx, "name"] if "name" in X_test.columns else "N/A",
                "true_label": y_test.loc[idx],
                # predictions is positional, so map the label index back to
                # its position in X_test.
                "predicted_label": predictions[X_test.index.get_loc(idx)],
                "correct": y_test.loc[idx] == predictions[X_test.index.get_loc(idx)],
            }
            # Add probability if available (only traditional sklearn-style
            # models expose predict_proba here).
            if model.architecture == "traditional":
                proba = model.predict_proba(X_test.loc[[idx]])
                example["prediction_confidence"] = float(proba.max())
            examples.append(example)
        return examples

    def _save_model(self, model: BaseModel, experiment_id: str) -> Path:
        """Persist the trained model under models_dir/experiments/<id>/model.joblib."""
        model_dir = self.config.paths.models_dir / "experiments" / experiment_id
        model_dir.mkdir(parents=True, exist_ok=True)
        model_path = model_dir / "model.joblib"
        model.save(str(model_path))
        return model_path

    def load_experiment_model(self, experiment_id: str) -> Optional[BaseModel]:
        """Load the model from a completed experiment, or None if unavailable."""
        experiment = self.tracker.get_experiment(experiment_id)
        if experiment and experiment.model_path:
            return BaseModel.load(experiment.model_path)
        return None

    def compare_experiments(
        self, experiment_ids: List[str], metric: str = "accuracy"
    ) -> pd.DataFrame:
        """Compare experiments in a DataFrame, sorted by test_<metric> descending."""
        comparison_df = self.tracker.compare_experiments(experiment_ids)
        if f"test_{metric}" in comparison_df.columns:
            comparison_df = comparison_df.sort_values(f"test_{metric}", ascending=False)
        return comparison_df

    def get_feature_analysis(self, experiment_id: str) -> Optional[pd.DataFrame]:
        """Return a feature-importance table for an experiment, or None.

        The frame has columns "feature" and "importance", sorted by
        importance descending.
        """
        experiment = self.tracker.get_experiment(experiment_id)
        if experiment and experiment.feature_importance:
            importance_df = pd.DataFrame(
                [
                    {"feature": feature, "importance": importance}
                    for feature, importance in experiment.feature_importance.items()
                ]
            )
            return importance_df.sort_values("importance", ascending=False)
        return None
+194
View File
@@ -0,0 +1,194 @@
import hashlib
import json
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, List
import pandas as pd
from core.config import PipelineConfig, get_config
from research.experiment import ExperimentConfig, ExperimentStatus
from research.experiment.experiement_result import ExperimentResult
class ExperimentTracker:
    """Tracks and manages experiments.

    Persists all experiment results as a single JSON database at
    <outputs_dir>/experiments/experiments.json and keeps an in-memory mirror
    (self._results) that is re-saved on every mutation.
    """

    def __init__(self, config: Optional[PipelineConfig] = None):
        self.config = config or get_config()
        self.experiments_dir = self.config.paths.outputs_dir / "experiments"
        self.experiments_dir.mkdir(parents=True, exist_ok=True)
        self.results_db_path = self.experiments_dir / "experiments.json"
        # In-memory map: experiment_id -> ExperimentResult
        self._results: Dict[str, ExperimentResult] = {}
        self._load_results()

    def _load_results(self):
        """Load existing experiment results from the JSON database, if any.

        Load failures are reported but not raised, so a corrupt database
        does not prevent the tracker from starting (it will start empty).
        """
        if self.results_db_path.exists():
            try:
                with open(self.results_db_path, "r") as f:
                    data = json.load(f)
                for exp_id, exp_data in data.items():
                    self._results[exp_id] = ExperimentResult.from_dict(exp_data)
            except Exception as e:
                print(f"Warning: Failed to load experiment results: {e}")

    def _save_results(self):
        """Save all experiment results to disk (full rewrite of the database).

        default=str stringifies any value to_dict() did not already make
        JSON-safe (e.g. non-string class_distribution keys).
        """
        data = {exp_id: result.to_dict() for exp_id, result in self._results.items()}
        with open(self.results_db_path, "w") as f:
            json.dump(data, f, indent=2, default=str)

    def create_experiment(self, config: ExperimentConfig) -> str:
        """Create a new experiment and return its ID.

        The ID is "<name>_<timestamp>_<config-hash>": the timestamp keeps
        repeated runs distinct, the 8-char md5 of the config (non-security
        use) identifies identical configurations.
        """
        # Generate experiment ID
        config_hash = hashlib.md5(
            json.dumps(config.to_dict(), sort_keys=True).encode()
        ).hexdigest()[:8]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        experiment_id = f"{config.name}_{timestamp}_{config_hash}"
        # Create result object (status defaults to PENDING)
        result = ExperimentResult(
            experiment_id=experiment_id, config=config, start_time=datetime.now()
        )
        self._results[experiment_id] = result
        self._save_results()
        return experiment_id

    def update_experiment(self, experiment_id: str, **updates):
        """Update an experiment's result fields and persist.

        Unknown experiment IDs and attribute names are silently ignored.
        """
        if experiment_id in self._results:
            result = self._results[experiment_id]
            for key, value in updates.items():
                if hasattr(result, key):
                    setattr(result, key, value)
            self._save_results()

    def get_experiment(self, experiment_id: str) -> Optional[ExperimentResult]:
        """Get experiment by ID, or None if unknown."""
        return self._results.get(experiment_id)

    def list_experiments(
        self,
        status: Optional[ExperimentStatus] = None,
        tags: Optional[List[str]] = None,
        model_type: Optional[str] = None,
    ) -> List[ExperimentResult]:
        """List experiments with optional filtering, newest first.

        Filters combine with AND; the tags filter matches if ANY given tag
        is present on the experiment.
        """
        results = list(self._results.values())
        if status:
            results = [r for r in results if r.status == status]
        if tags:
            results = [r for r in results if any(tag in r.config.tags for tag in tags)]
        if model_type:
            results = [r for r in results if r.config.model_type == model_type]
        return sorted(results, key=lambda x: x.start_time, reverse=True)

    def get_best_experiment(
        self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
    ) -> Optional[ExperimentResult]:
        """Get the best COMPLETED experiment based on a metric.

        NOTE: "best" means highest value, so this is only meaningful for
        metrics where higher is better. Any dataset value other than "test"
        selects train_metrics.
        """
        experiments = self.list_experiments()
        if filters:
            # Apply additional filters
            if "model_type" in filters:
                experiments = [
                    e for e in experiments if e.config.model_type == filters["model_type"]
                ]
            if "features" in filters:
                experiments = [
                    e
                    for e in experiments
                    if any(f in e.config.features for f in filters["features"])
                ]
        # Only completed experiments that actually recorded the metric count.
        valid_experiments = []
        for exp in experiments:
            if exp.status == ExperimentStatus.COMPLETED:
                metrics_dict = exp.test_metrics if dataset == "test" else exp.train_metrics
                if metric in metrics_dict:
                    valid_experiments.append((exp, metrics_dict[metric]))
        if not valid_experiments:
            return None
        return max(valid_experiments, key=lambda x: x[1])[0]

    def compare_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
        """Compare multiple experiments in a DataFrame (one row per known ID).

        Unknown IDs are skipped. Metric columns are prefixed "test_" / "cv_".
        """
        rows = []
        for exp_id in experiment_ids:
            exp = self.get_experiment(exp_id)
            if exp:
                row = {
                    "experiment_id": exp_id,
                    "name": exp.config.name,
                    "model_type": exp.config.model_type,
                    "features": ",".join([f.value for f in exp.config.features]),
                    "status": exp.status.value,
                    "train_size": exp.train_size,
                    "test_size": exp.test_size,
                }
                # Add metrics
                for metric, value in exp.test_metrics.items():
                    row[f"test_{metric}"] = value
                for metric, value in exp.cv_metrics.items():
                    row[f"cv_{metric}"] = value
                rows.append(row)
        return pd.DataFrame(rows)

    def export_results(self, output_path: Optional[Path] = None) -> Path:
        """Export all results to CSV and return the file path.

        Defaults to a timestamped file in the experiments directory.
        """
        if output_path is None:
            output_path = (
                self.experiments_dir
                / f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            )
        rows = []
        for exp in self._results.values():
            row = {
                "experiment_id": exp.experiment_id,
                "name": exp.config.name,
                "description": exp.config.description,
                "model_type": exp.config.model_type,
                "features": ",".join([f.value for f in exp.config.features]),
                "status": exp.status.value,
                "start_time": exp.start_time.isoformat(),
                "end_time": exp.end_time.isoformat() if exp.end_time else None,
                "train_size": exp.train_size,
                "test_size": exp.test_size,
            }
            # Add all metrics
            for metric, value in exp.test_metrics.items():
                row[f"test_{metric}"] = value
            for metric, value in exp.cv_metrics.items():
                row[f"cv_{metric}"] = value
            rows.append(row)
        df = pd.DataFrame(rows)
        df.to_csv(output_path, index=False)
        return output_path
+90
View File
@@ -0,0 +1,90 @@
from enum import Enum
from typing import List, Dict, Any, Union
import pandas as pd
class FeatureType(Enum):
    """Types of features that can be extracted from names.

    The string values are used as serialized identifiers in experiment
    configs and as feature-column names.
    """

    FULL_NAME = "full_name"            # complete raw name string
    NATIVE_NAME = "native_name"        # identified/probable native name part
    SURNAME = "surname"                # identified/probable surname part
    FIRST_WORD = "first_word"          # first whitespace-separated token
    LAST_WORD = "last_word"            # last whitespace-separated token
    NAME_LENGTH = "name_length"        # character count of the name
    WORD_COUNT = "word_count"          # number of words in the name
    PROVINCE = "province"              # province column as categorical feature
    CHAR_NGRAMS = "char_ngrams"        # raw text; vectorized by the model
    WORD_NGRAMS = "word_ngrams"        # raw text; vectorized by the model
    NAME_ENDINGS = "name_endings"      # trailing characters of the name
    NAME_BEGINNINGS = "name_beginnings"  # leading characters of the name
class FeatureExtractor:
"""Extract different types of features from name data"""
def __init__(self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None):
self.feature_types = feature_types
self.feature_params = feature_params or {}
def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Extract all configured features"""
features_df = pd.DataFrame(index=df.index)
for feature_type in self.feature_types:
feature_data = self._extract_single_feature(df, feature_type)
if isinstance(feature_data, pd.DataFrame):
features_df = pd.concat([features_df, feature_data], axis=1)
else:
features_df[feature_type.value] = feature_data
return features_df
def _extract_single_feature(
self, df: pd.DataFrame, feature_type: FeatureType
) -> Union[pd.Series, pd.DataFrame]:
"""Extract a single type of feature"""
if feature_type == FeatureType.FULL_NAME:
return df["name"].fillna("")
elif feature_type == FeatureType.NATIVE_NAME:
return df["identified_name"].fillna(df["probable_native"]).fillna("")
elif feature_type == FeatureType.SURNAME:
return df["identified_surname"].fillna(df["probable_surname"]).fillna("")
elif feature_type == FeatureType.FIRST_WORD:
return df["name"].str.split().str[0].fillna("")
elif feature_type == FeatureType.LAST_WORD:
return df["name"].str.split().str[-1].fillna("")
elif feature_type == FeatureType.NAME_LENGTH:
return df["name"].str.len().fillna(0)
elif feature_type == FeatureType.WORD_COUNT:
return df["words"].fillna(1)
elif feature_type == FeatureType.PROVINCE:
return df["province"].fillna("unknown")
elif feature_type == FeatureType.NAME_ENDINGS:
n = self.feature_params.get("ending_length", 3)
return df["name"].str[-n:].fillna("")
elif feature_type == FeatureType.NAME_BEGINNINGS:
n = self.feature_params.get("beginning_length", 3)
return df["name"].str[:n].fillna("")
elif feature_type == FeatureType.CHAR_NGRAMS:
# This will be handled by the model's vectorizer
return df["name"].fillna("")
elif feature_type == FeatureType.WORD_NGRAMS:
# This will be handled by the model's vectorizer
return df["name"].fillna("")
else:
raise ValueError(f"Unknown feature type: {feature_type}")