refactoring: add initial pipeline configuration and model classes

This commit is contained in:
2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
+91
View File
@@ -0,0 +1,91 @@
from dataclasses import dataclass, field, asdict
from enum import Enum
from typing import List, Dict, Any, Optional
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from .feature_extractor import FeatureType
@dataclass
class ExperimentConfig:
    """Configuration for a single experiment.

    Serializable via to_dict()/from_dict() so configs can be stored in the
    experiments JSON database and reconstructed later.
    """

    # Experiment metadata
    name: str
    description: str = ""
    tags: List[str] = field(default_factory=list)
    # Model configuration
    model_type: str = "logistic_regression"  # logistic_regression, lstm, transformer, etc.
    model_params: Dict[str, Any] = field(default_factory=dict)
    # Feature configuration
    features: List[FeatureType] = field(default_factory=lambda: [FeatureType.FULL_NAME])
    feature_params: Dict[str, Any] = field(default_factory=dict)
    # Data configuration
    train_data_filter: Optional[Dict[str, Any]] = None  # Filter criteria for training data
    test_data_filter: Optional[Dict[str, Any]] = None
    target_column: str = "sex"
    # Training configuration
    test_size: float = 0.2
    random_seed: int = 42
    cross_validation_folds: int = 5
    # Evaluation configuration
    metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"])

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a JSON-serializable dictionary.

        FeatureType enum members are replaced by their string values so the
        result round-trips through json.
        """
        result = asdict(self)
        # asdict() leaves Enum members untouched; convert them to strings.
        result["features"] = [f.value for f in self.features]
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentConfig":
        """Create a config from a dictionary (inverse of to_dict).

        The input mapping is not modified.
        """
        # Copy first: the original implementation rewrote data["features"]
        # in place, mutating the caller's dictionary — surprising for callers
        # that reuse the dict (e.g. when loading the results database).
        data = dict(data)
        if "features" in data:
            data["features"] = [FeatureType(f) for f in data["features"]]
        return cls(**data)
class ExperimentStatus(Enum):
    """Experiment execution status (lifecycle of a tracked run)."""

    PENDING = "pending"      # created but not yet started
    RUNNING = "running"      # currently executing
    COMPLETED = "completed"  # finished successfully
    FAILED = "failed"        # raised an error during execution
    CANCELLED = "cancelled"  # stopped before completion
def calculate_metrics(
    y_true: np.ndarray, y_pred: np.ndarray, metrics: Optional[List[str]] = None
) -> Dict[str, float]:
    """Calculate the requested classification metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as y_true.
        metrics: Metric names to compute; any of "accuracy", "precision",
            "recall", "f1". Defaults to all four.

    Returns:
        Mapping from metric name to its value as a plain float.
        Precision/recall/F1 use sklearn's "weighted" average, which weights
        per-class scores by class support.
    """
    if metrics is None:
        metrics = ["accuracy", "precision", "recall", "f1"]
    results: Dict[str, float] = {}
    if "accuracy" in metrics:
        # float() keeps the result a plain Python float (not numpy scalar),
        # matching the declared return type and staying JSON-friendly.
        results["accuracy"] = float(accuracy_score(y_true, y_pred))
    # precision/recall/f1 all come from one sklearn call; compute it once
    # if any of the three was requested.
    if any(m in metrics for m in ["precision", "recall", "f1"]):
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="weighted"
        )
        if "precision" in metrics:
            results["precision"] = float(precision)
        if "recall" in metrics:
            results["recall"] = float(recall)
        if "f1" in metrics:
            results["f1"] = float(f1)
    return results
+56
View File
@@ -0,0 +1,56 @@
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional, Dict, List, Any
from research.experiment import ExperimentConfig, ExperimentStatus
@dataclass
class ExperimentResult:
    """Results from an experiment execution.

    Mirrors ExperimentConfig's to_dict()/from_dict() protocol so results can
    be persisted to and restored from the experiments JSON database.
    """

    experiment_id: str
    config: ExperimentConfig
    # Execution metadata
    start_time: datetime
    end_time: Optional[datetime] = None
    status: ExperimentStatus = ExperimentStatus.PENDING
    error_message: Optional[str] = None
    # Model artifacts
    model_path: Optional[str] = None
    feature_extractor_path: Optional[str] = None
    # Metrics
    train_metrics: Dict[str, float] = field(default_factory=dict)
    test_metrics: Dict[str, float] = field(default_factory=dict)
    cv_metrics: Dict[str, float] = field(default_factory=dict)
    # Additional results
    confusion_matrix: Optional[List[List[int]]] = None
    feature_importance: Optional[Dict[str, float]] = None
    prediction_examples: Optional[List[Dict]] = None
    # Data statistics
    train_size: int = 0
    test_size: int = 0
    class_distribution: Dict[str, int] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a JSON-serializable dictionary."""
        result = asdict(self)
        # asdict() recurses into the nested config dataclass but does not
        # convert its FeatureType enums; use the config's own serializer.
        result["config"] = self.config.to_dict()
        result["start_time"] = self.start_time.isoformat()
        result["end_time"] = self.end_time.isoformat() if self.end_time else None
        result["status"] = self.status.value
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentResult":
        """Create a result from a dictionary (inverse of to_dict).

        The input mapping is not modified.
        """
        # Copy first: the original implementation rewrote the caller's dict
        # in place, which corrupts the source data for any later reuse.
        data = dict(data)
        data["config"] = ExperimentConfig.from_dict(data["config"])
        data["start_time"] = datetime.fromisoformat(data["start_time"])
        data["end_time"] = datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
        data["status"] = ExperimentStatus(data["status"])
        return cls(**data)
+123
View File
@@ -0,0 +1,123 @@
from typing import List
from research.experiment import ExperimentConfig
from research.experiment.feature_extractor import FeatureType
class ExperimentBuilder:
    """Helper class to build predefined experiment configurations.

    Each factory is a static method returning a list of ExperimentConfig
    objects ready to be fed to an experiment runner.
    """

    @staticmethod
    def create_baseline_experiments() -> List[ExperimentConfig]:
        """Create a set of baseline experiments for comparison."""
        return [
            # Full name experiments
            ExperimentConfig(
                name="baseline_logistic_regression_fullname",
                description="Logistic regression with full name",
                model_type="logistic_regression",
                features=[FeatureType.FULL_NAME],
                tags=["baseline", "fullname"],
            ),
            # Native name only
            ExperimentConfig(
                name="baseline_logistic_regression_native",
                description="Logistic regression with native name only",
                model_type="logistic_regression",
                features=[FeatureType.NATIVE_NAME],
                tags=["baseline", "native"],
            ),
            # Surname only
            ExperimentConfig(
                name="baseline_logistic_regression_surname",
                description="Logistic regression with surname only",
                model_type="logistic_regression",
                features=[FeatureType.SURNAME],
                tags=["baseline", "surname"],
            ),
            # Random Forest with engineered features
            ExperimentConfig(
                name="baseline_rf_engineered",
                description="Random Forest with engineered features",
                model_type="random_forest",
                features=[FeatureType.NAME_LENGTH, FeatureType.WORD_COUNT, FeatureType.PROVINCE],
                tags=["baseline", "engineered"],
            ),
        ]

    @staticmethod
    def create_feature_ablation_study() -> List[ExperimentConfig]:
        """Create leave-one-out feature ablation experiments.

        For each base feature, builds one experiment trained on all the other
        features, so the contribution of the removed feature can be measured.
        """
        base_features = [
            FeatureType.FULL_NAME,
            FeatureType.NAME_LENGTH,
            FeatureType.WORD_COUNT,
            FeatureType.PROVINCE,
        ]
        experiments = []
        # Remove each feature in turn and train on the remainder.
        # (Dropped the original enumerate(): the index was never used.)
        for feature_to_remove in base_features:
            remaining_features = [f for f in base_features if f != feature_to_remove]
            experiments.append(
                ExperimentConfig(
                    name=f"ablation_remove_{feature_to_remove.value}",
                    description=f"Ablation study: removed {feature_to_remove.value}",
                    model_type="logistic_regression",
                    features=remaining_features,
                    tags=["ablation", feature_to_remove.value],
                )
            )
        return experiments

    @staticmethod
    def create_name_component_study() -> List[ExperimentConfig]:
        """Create experiments that each use a single name component."""
        experiments = []
        name_components = [
            (FeatureType.FIRST_WORD, "first_word"),
            (FeatureType.LAST_WORD, "last_word"),
            (FeatureType.NATIVE_NAME, "native_name"),
            (FeatureType.SURNAME, "surname"),
            (FeatureType.NAME_BEGINNINGS, "name_beginnings"),
            (FeatureType.NAME_ENDINGS, "name_endings"),
        ]
        for feature, name in name_components:
            experiments.append(
                ExperimentConfig(
                    name=f"component_study_{name}",
                    description=f"Study of {name} for gender prediction",
                    model_type="logistic_regression",
                    features=[feature],
                    tags=["component_study", name],
                )
            )
        return experiments

    @staticmethod
    def create_province_specific_study() -> List[ExperimentConfig]:
        """Create per-province experiments (training data filtered by province)."""
        provinces = ["kinshasa", "bas-congo", "bandundu", "katanga"]  # Add more as needed
        experiments = []
        for province in provinces:
            experiments.append(
                ExperimentConfig(
                    name=f"province_study_{province}",
                    description=f"Gender prediction for {province} province only",
                    model_type="logistic_regression",
                    features=[FeatureType.FULL_NAME],
                    train_data_filter={"province": province},
                    tags=["province_study", province],
                )
            )
        return experiments
+238
View File
@@ -0,0 +1,238 @@
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from core.config import PipelineConfig
from core.utils import get_data_file_path
from core.utils.data_loader import DataLoader
from research.base_model import BaseModel
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
from research.experiment.experiment_tracker import ExperimentTracker
from research.model_registry import create_model
class ExperimentRunner:
    """Runs and manages experiments end to end.

    Loads the featured dataset, applies experiment-specific filters, trains
    the configured model, evaluates it, and records all artifacts and metrics
    through the ExperimentTracker.
    """

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.tracker = ExperimentTracker(self.config)
        self.data_loader = DataLoader(self.config)

    def run_experiment(self, experiment_config: ExperimentConfig) -> str:
        """Run a single experiment and return its experiment ID.

        On failure the experiment is marked FAILED (with the error message
        recorded in the tracker) and the exception is re-raised.
        """
        # Create experiment
        experiment_id = self.tracker.create_experiment(experiment_config)
        try:
            logging.info(f"Starting experiment: {experiment_id}")
            self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
            # Load data
            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
            df = self.data_loader.load_csv_complete(data_path)
            # Apply data filters if specified
            df = self._apply_data_filters(df, experiment_config)
            # Prepare target variable
            y = df[experiment_config.target_column]
            # NOTE(review): X is the full frame and still contains the target
            # column; this assumes the model performs its own feature
            # extraction and ignores the target — confirm against the model
            # implementations to rule out target leakage.
            X = df
            # Stratified split preserves class proportions in train and test.
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=experiment_config.test_size,
                random_state=experiment_config.random_seed,
                stratify=y,
            )
            # Create and train model
            model = create_model(experiment_config)
            model.fit(X_train, y_train)
            # Make predictions
            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)
            # Calculate metrics
            train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics)
            test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics)
            # Cross-validation if requested (folds <= 1 disables it)
            cv_metrics = {}
            if experiment_config.cross_validation_folds > 1:
                cv_metrics = model.cross_validate(
                    X_train, y_train, experiment_config.cross_validation_folds
                )
            # Additional analysis
            conf_matrix = confusion_matrix(y_test, test_pred).tolist()
            feature_importance = model.get_feature_importance()
            # Create prediction examples for qualitative inspection
            prediction_examples = self._create_prediction_examples(
                X_test, y_test, test_pred, model, n_examples=10
            )
            # Calculate class distribution over the full (filtered) dataset
            class_distribution = y.value_counts().to_dict()
            # Save model
            model_path = self._save_model(model, experiment_id)
            # Update experiment with results
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.COMPLETED,
                end_time=datetime.now(),
                model_path=str(model_path),
                train_metrics=train_metrics,
                test_metrics=test_metrics,
                cv_metrics=cv_metrics,
                confusion_matrix=conf_matrix,
                feature_importance=feature_importance,
                prediction_examples=prediction_examples,
                train_size=len(X_train),
                test_size=len(X_test),
                class_distribution=class_distribution,
            )
            logging.info(f"Experiment {experiment_id} completed successfully")
            # Bug fix: the original formatted test_metrics.get("accuracy", "N/A")
            # with ":.4f", which raises ValueError whenever accuracy is absent
            # (a str cannot take a float format spec). Only format real numbers.
            accuracy = test_metrics.get("accuracy")
            logging.info(
                "Test accuracy: %s",
                f"{accuracy:.4f}" if accuracy is not None else "N/A",
            )
            return experiment_id
        except Exception as e:
            logging.error(f"Experiment {experiment_id} failed: {str(e)}")
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.FAILED,
                end_time=datetime.now(),
                error_message=str(e),
            )
            raise

    def run_experiment_batch(self, experiments: List[ExperimentConfig]) -> List[str]:
        """Run multiple experiments, skipping (but logging) any that fail.

        Returns the IDs of the experiments that completed successfully.
        """
        experiment_ids = []
        for i, config in enumerate(experiments):
            logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}")
            try:
                exp_id = self.run_experiment(config)
                experiment_ids.append(exp_id)
            except Exception as e:
                # Deliberate best-effort: one failed experiment must not
                # abort the rest of the batch.
                logging.error(f"Failed to run experiment {config.name}: {e}")
                continue
        return experiment_ids

    @classmethod
    def _apply_data_filters(cls, df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame:
        """Apply the experiment's train_data_filter to *df*.

        Supported criteria per column: a list (membership), a dict with
        "min"/"max" bounds (inclusive), or a scalar (equality). Columns not
        present in the frame are silently skipped.
        """
        filtered_df = df.copy()
        # Apply training data filters
        if config.train_data_filter:
            for column, criteria in config.train_data_filter.items():
                if column in filtered_df.columns:
                    if isinstance(criteria, list):
                        filtered_df = filtered_df[filtered_df[column].isin(criteria)]
                    elif isinstance(criteria, dict):
                        if "min" in criteria:
                            filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
                        if "max" in criteria:
                            filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
                    else:
                        filtered_df = filtered_df[filtered_df[column] == criteria]
        return filtered_df

    @classmethod
    def _create_prediction_examples(
        cls,
        X_test: pd.DataFrame,
        y_test: pd.Series,
        predictions: np.ndarray,
        model: BaseModel,
        n_examples: int = 10,
    ) -> List[Dict]:
        """Sample up to *n_examples* predictions (half wrong, half right) for analysis."""
        examples = []
        # Get both correct and incorrect predictions
        correct_mask = y_test == predictions
        incorrect_indices = X_test[~correct_mask].index[: n_examples // 2]
        correct_indices = X_test[correct_mask].index[: n_examples // 2]
        sample_indices = list(incorrect_indices) + list(correct_indices)
        for idx in sample_indices[:n_examples]:
            example = {
                "name": X_test.loc[idx, "name"] if "name" in X_test.columns else "N/A",
                "true_label": y_test.loc[idx],
                # predictions is positional, so map the label index back to
                # its position in X_test.
                "predicted_label": predictions[X_test.index.get_loc(idx)],
                "correct": y_test.loc[idx] == predictions[X_test.index.get_loc(idx)],
            }
            # Add probability if available (only traditional sklearn-style
            # models expose predict_proba here).
            if model.architecture == "traditional":
                proba = model.predict_proba(X_test.loc[[idx]])
                example["prediction_confidence"] = float(proba.max())
            examples.append(example)
        return examples

    def _save_model(self, model: BaseModel, experiment_id: str) -> Path:
        """Persist the trained model under models_dir/experiments/<id>/model.joblib."""
        model_dir = self.config.paths.models_dir / "experiments" / experiment_id
        model_dir.mkdir(parents=True, exist_ok=True)
        model_path = model_dir / "model.joblib"
        model.save(str(model_path))
        return model_path

    def load_experiment_model(self, experiment_id: str) -> Optional[BaseModel]:
        """Load the model from a completed experiment, or None if unavailable."""
        experiment = self.tracker.get_experiment(experiment_id)
        if experiment and experiment.model_path:
            return BaseModel.load(experiment.model_path)
        return None

    def compare_experiments(
        self, experiment_ids: List[str], metric: str = "accuracy"
    ) -> pd.DataFrame:
        """Compare experiments in a DataFrame, sorted by test_<metric> descending."""
        comparison_df = self.tracker.compare_experiments(experiment_ids)
        if f"test_{metric}" in comparison_df.columns:
            comparison_df = comparison_df.sort_values(f"test_{metric}", ascending=False)
        return comparison_df

    def get_feature_analysis(self, experiment_id: str) -> Optional[pd.DataFrame]:
        """Return a feature-importance table for an experiment, or None.

        The frame has columns "feature" and "importance", sorted by
        importance descending.
        """
        experiment = self.tracker.get_experiment(experiment_id)
        if experiment and experiment.feature_importance:
            importance_df = pd.DataFrame(
                [
                    {"feature": feature, "importance": importance}
                    for feature, importance in experiment.feature_importance.items()
                ]
            )
            return importance_df.sort_values("importance", ascending=False)
        return None
+194
View File
@@ -0,0 +1,194 @@
import hashlib
import json
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, List
import pandas as pd
from core.config import PipelineConfig, get_config
from research.experiment import ExperimentConfig, ExperimentStatus
from research.experiment.experiement_result import ExperimentResult
class ExperimentTracker:
    """Tracks and manages experiments.

    Persists all experiment results as a single JSON database at
    <outputs_dir>/experiments/experiments.json and keeps an in-memory mirror
    (self._results) that is re-saved on every mutation.
    """

    def __init__(self, config: Optional[PipelineConfig] = None):
        self.config = config or get_config()
        self.experiments_dir = self.config.paths.outputs_dir / "experiments"
        self.experiments_dir.mkdir(parents=True, exist_ok=True)
        self.results_db_path = self.experiments_dir / "experiments.json"
        # In-memory map: experiment_id -> ExperimentResult
        self._results: Dict[str, ExperimentResult] = {}
        self._load_results()

    def _load_results(self):
        """Load existing experiment results from the JSON database, if any.

        Load failures are reported but not raised, so a corrupt database
        does not prevent the tracker from starting (it will start empty).
        """
        if self.results_db_path.exists():
            try:
                with open(self.results_db_path, "r") as f:
                    data = json.load(f)
                for exp_id, exp_data in data.items():
                    self._results[exp_id] = ExperimentResult.from_dict(exp_data)
            except Exception as e:
                print(f"Warning: Failed to load experiment results: {e}")

    def _save_results(self):
        """Save all experiment results to disk (full rewrite of the database).

        default=str stringifies any value to_dict() did not already make
        JSON-safe (e.g. non-string class_distribution keys).
        """
        data = {exp_id: result.to_dict() for exp_id, result in self._results.items()}
        with open(self.results_db_path, "w") as f:
            json.dump(data, f, indent=2, default=str)

    def create_experiment(self, config: ExperimentConfig) -> str:
        """Create a new experiment and return its ID.

        The ID is "<name>_<timestamp>_<config-hash>": the timestamp keeps
        repeated runs distinct, the 8-char md5 of the config (non-security
        use) identifies identical configurations.
        """
        # Generate experiment ID
        config_hash = hashlib.md5(
            json.dumps(config.to_dict(), sort_keys=True).encode()
        ).hexdigest()[:8]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        experiment_id = f"{config.name}_{timestamp}_{config_hash}"
        # Create result object (status defaults to PENDING)
        result = ExperimentResult(
            experiment_id=experiment_id, config=config, start_time=datetime.now()
        )
        self._results[experiment_id] = result
        self._save_results()
        return experiment_id

    def update_experiment(self, experiment_id: str, **updates):
        """Update an experiment's result fields and persist.

        Unknown experiment IDs and attribute names are silently ignored.
        """
        if experiment_id in self._results:
            result = self._results[experiment_id]
            for key, value in updates.items():
                if hasattr(result, key):
                    setattr(result, key, value)
            self._save_results()

    def get_experiment(self, experiment_id: str) -> Optional[ExperimentResult]:
        """Get experiment by ID, or None if unknown."""
        return self._results.get(experiment_id)

    def list_experiments(
        self,
        status: Optional[ExperimentStatus] = None,
        tags: Optional[List[str]] = None,
        model_type: Optional[str] = None,
    ) -> List[ExperimentResult]:
        """List experiments with optional filtering, newest first.

        Filters combine with AND; the tags filter matches if ANY given tag
        is present on the experiment.
        """
        results = list(self._results.values())
        if status:
            results = [r for r in results if r.status == status]
        if tags:
            results = [r for r in results if any(tag in r.config.tags for tag in tags)]
        if model_type:
            results = [r for r in results if r.config.model_type == model_type]
        return sorted(results, key=lambda x: x.start_time, reverse=True)

    def get_best_experiment(
        self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
    ) -> Optional[ExperimentResult]:
        """Get the best COMPLETED experiment based on a metric.

        NOTE: "best" means highest value, so this is only meaningful for
        metrics where higher is better. Any dataset value other than "test"
        selects train_metrics.
        """
        experiments = self.list_experiments()
        if filters:
            # Apply additional filters
            if "model_type" in filters:
                experiments = [
                    e for e in experiments if e.config.model_type == filters["model_type"]
                ]
            if "features" in filters:
                experiments = [
                    e
                    for e in experiments
                    if any(f in e.config.features for f in filters["features"])
                ]
        # Only completed experiments that actually recorded the metric count.
        valid_experiments = []
        for exp in experiments:
            if exp.status == ExperimentStatus.COMPLETED:
                metrics_dict = exp.test_metrics if dataset == "test" else exp.train_metrics
                if metric in metrics_dict:
                    valid_experiments.append((exp, metrics_dict[metric]))
        if not valid_experiments:
            return None
        return max(valid_experiments, key=lambda x: x[1])[0]

    def compare_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
        """Compare multiple experiments in a DataFrame (one row per known ID).

        Unknown IDs are skipped. Metric columns are prefixed "test_" / "cv_".
        """
        rows = []
        for exp_id in experiment_ids:
            exp = self.get_experiment(exp_id)
            if exp:
                row = {
                    "experiment_id": exp_id,
                    "name": exp.config.name,
                    "model_type": exp.config.model_type,
                    "features": ",".join([f.value for f in exp.config.features]),
                    "status": exp.status.value,
                    "train_size": exp.train_size,
                    "test_size": exp.test_size,
                }
                # Add metrics
                for metric, value in exp.test_metrics.items():
                    row[f"test_{metric}"] = value
                for metric, value in exp.cv_metrics.items():
                    row[f"cv_{metric}"] = value
                rows.append(row)
        return pd.DataFrame(rows)

    def export_results(self, output_path: Optional[Path] = None) -> Path:
        """Export all results to CSV and return the file path.

        Defaults to a timestamped file in the experiments directory.
        """
        if output_path is None:
            output_path = (
                self.experiments_dir
                / f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            )
        rows = []
        for exp in self._results.values():
            row = {
                "experiment_id": exp.experiment_id,
                "name": exp.config.name,
                "description": exp.config.description,
                "model_type": exp.config.model_type,
                "features": ",".join([f.value for f in exp.config.features]),
                "status": exp.status.value,
                "start_time": exp.start_time.isoformat(),
                "end_time": exp.end_time.isoformat() if exp.end_time else None,
                "train_size": exp.train_size,
                "test_size": exp.test_size,
            }
            # Add all metrics
            for metric, value in exp.test_metrics.items():
                row[f"test_{metric}"] = value
            for metric, value in exp.cv_metrics.items():
                row[f"cv_{metric}"] = value
            rows.append(row)
        df = pd.DataFrame(rows)
        df.to_csv(output_path, index=False)
        return output_path
+90
View File
@@ -0,0 +1,90 @@
from enum import Enum
from typing import List, Dict, Any, Union
import pandas as pd
class FeatureType(Enum):
    """Types of features that can be extracted from names.

    The string values are used as serialized identifiers in experiment
    configs and as feature-column names.
    """

    FULL_NAME = "full_name"            # complete raw name string
    NATIVE_NAME = "native_name"        # identified/probable native name part
    SURNAME = "surname"                # identified/probable surname part
    FIRST_WORD = "first_word"          # first whitespace-separated token
    LAST_WORD = "last_word"            # last whitespace-separated token
    NAME_LENGTH = "name_length"        # character count of the name
    WORD_COUNT = "word_count"          # number of words in the name
    PROVINCE = "province"              # province column as categorical feature
    CHAR_NGRAMS = "char_ngrams"        # raw text; vectorized by the model
    WORD_NGRAMS = "word_ngrams"        # raw text; vectorized by the model
    NAME_ENDINGS = "name_endings"      # trailing characters of the name
    NAME_BEGINNINGS = "name_beginnings"  # leading characters of the name
class FeatureExtractor:
"""Extract different types of features from name data"""
def __init__(self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None):
self.feature_types = feature_types
self.feature_params = feature_params or {}
def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Extract all configured features"""
features_df = pd.DataFrame(index=df.index)
for feature_type in self.feature_types:
feature_data = self._extract_single_feature(df, feature_type)
if isinstance(feature_data, pd.DataFrame):
features_df = pd.concat([features_df, feature_data], axis=1)
else:
features_df[feature_type.value] = feature_data
return features_df
def _extract_single_feature(
self, df: pd.DataFrame, feature_type: FeatureType
) -> Union[pd.Series, pd.DataFrame]:
"""Extract a single type of feature"""
if feature_type == FeatureType.FULL_NAME:
return df["name"].fillna("")
elif feature_type == FeatureType.NATIVE_NAME:
return df["identified_name"].fillna(df["probable_native"]).fillna("")
elif feature_type == FeatureType.SURNAME:
return df["identified_surname"].fillna(df["probable_surname"]).fillna("")
elif feature_type == FeatureType.FIRST_WORD:
return df["name"].str.split().str[0].fillna("")
elif feature_type == FeatureType.LAST_WORD:
return df["name"].str.split().str[-1].fillna("")
elif feature_type == FeatureType.NAME_LENGTH:
return df["name"].str.len().fillna(0)
elif feature_type == FeatureType.WORD_COUNT:
return df["words"].fillna(1)
elif feature_type == FeatureType.PROVINCE:
return df["province"].fillna("unknown")
elif feature_type == FeatureType.NAME_ENDINGS:
n = self.feature_params.get("ending_length", 3)
return df["name"].str[-n:].fillna("")
elif feature_type == FeatureType.NAME_BEGINNINGS:
n = self.feature_params.get("beginning_length", 3)
return df["name"].str[:n].fillna("")
elif feature_type == FeatureType.CHAR_NGRAMS:
# This will be handled by the model's vectorizer
return df["name"].fillna("")
elif feature_type == FeatureType.WORD_NGRAMS:
# This will be handled by the model's vectorizer
return df["name"].fillna("")
else:
raise ValueError(f"Unknown feature type: {feature_type}")