refactoring: uv

2025-10-05 18:14:15 +02:00
parent f3b06fbd07
commit 9dd4f759b3
120 changed files with 5525 additions and 3366 deletions
+3
@@ -0,0 +1,3 @@
"""DRC NERS NLP package."""
__all__: list[str] = []
+226
@@ -0,0 +1,226 @@
from __future__ import annotations
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional
import typer
from ners.core.config import setup_config, PipelineConfig
app = typer.Typer(help="DRC NERS command-line interface", no_args_is_help=True)
# -------------------------
# Pipeline commands
# -------------------------
pipeline_app = typer.Typer(help="Data processing pipeline")
app.add_typer(pipeline_app, name="pipeline")
@pipeline_app.command("run")
def pipeline_run(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
"""Run the full processing pipeline."""
from ners.main import run_pipeline as _run_pipeline
cfg = setup_config(config_path=config, env=env)
code = _run_pipeline(cfg)
raise typer.Exit(code)
# -------------------------
# NER commands
# -------------------------
ner_app = typer.Typer(help="NER dataset and model")
app.add_typer(ner_app, name="ner")
def _load_config(config: Optional[Path], env: str) -> PipelineConfig:
return setup_config(config_path=config, env=env)
@ner_app.command("feature")
def ner_feature(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
from ners.ner import feature as _feature
cfg = _load_config(config, env)
_feature(cfg)
@ner_app.command("build")
def ner_build(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
from ners.ner import build as _build
cfg = _load_config(config, env)
_build(cfg)
@ner_app.command("train")
def ner_train(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
from ners.ner import train as _train
cfg = _load_config(config, env)
_train(cfg)
@ner_app.command("run")
def ner_run(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
reset: bool = typer.Option(
False, help="Reset intermediate outputs and rerun all steps"
),
) -> None:
from ners.ner import run_pipeline as _ner_pipeline
cfg = _load_config(config, env)
code = _ner_pipeline(cfg, reset)
raise typer.Exit(code)
# -------------------------
# Research commands
# -------------------------
research_app = typer.Typer(help="Research experiments and training")
app.add_typer(research_app, name="research")
@research_app.command("train")
def research_train(
name: str = typer.Option(..., "--name", help="Model name to train"),
exp_type: str = typer.Option(..., "--type", help="Experiment type"),
templates: str = typer.Option(
"research_templates.yaml", help="Templates file path"
),
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
from ners.research.experiment.experiment_builder import ExperimentBuilder
from ners.research.model_trainer import ModelTrainer
cfg = _load_config(config, env)
exp_builder = ExperimentBuilder(cfg)
tmpl = exp_builder.load_templates(templates)
exp_cfg = exp_builder.find_template(tmpl, name, exp_type)
trainer = ModelTrainer(cfg)
trainer.train_single_model(
model_name=exp_cfg.get("name"),
model_type=exp_cfg.get("model_type"),
features=exp_cfg.get("features"),
model_params=exp_cfg.get("model_params", {}),
tags=exp_cfg.get("tags", []),
)
# -------------------------
# Monitor commands
# -------------------------
monitor_app = typer.Typer(help="Monitor pipeline checkpoints")
app.add_typer(monitor_app, name="monitor")
@monitor_app.command("status")
def monitor_status(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
detailed: bool = typer.Option(
False, help="Show detailed status (failed batch IDs)"
),
) -> None:
_ = _load_config(config, env)
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
PipelineMonitor().print_status(detailed=detailed)
@monitor_app.command("clean")
def monitor_clean(
step: Optional[str] = typer.Option(None, help="Step to clean; default all"),
keep_last: int = typer.Option(1, help="Number of latest checkpoint files to keep"),
force: bool = typer.Option(False, help="Do not ask for confirmation"),
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
_ = _load_config(config, env)
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
mon = PipelineMonitor()
if not force:
typer.confirm("Clean checkpoints?", abort=True)
if step:
mon.clean_step_checkpoints(step, keep_last)
else:
for s in mon.steps:
mon.clean_step_checkpoints(s, keep_last)
@monitor_app.command("reset")
def monitor_reset(
step: Optional[str] = typer.Option(None, help="Step to reset; default all"),
force: bool = typer.Option(False, help="Do not ask for confirmation"),
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
_ = _load_config(config, env)
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
mon = PipelineMonitor()
if not force:
msg = f"Reset {step or 'all steps'}? This deletes checkpoints."
typer.confirm(msg, abort=True)
if step:
mon.reset_step(step)
else:
for s in mon.steps:
mon.reset_step(s)
# -------------------------
# Web commands
# -------------------------
web_app = typer.Typer(help="Web UI wrapper")
app.add_typer(web_app, name="web")
@web_app.command("run")
def web_run(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
"""Launch the Streamlit web app via subprocess."""
app_path = Path(__file__).parent / "web" / "app.py"
cmd = [
sys.executable,
"-m",
"streamlit",
"run",
str(app_path),
]
# Pass configuration via environment variables to avoid argparse in Streamlit
env_vars = os.environ.copy()
if config is not None:
env_vars["NERS_CONFIG"] = str(config)
env_vars["NERS_ENV"] = env
raise typer.Exit(subprocess.call(cmd, env=env_vars))
if __name__ == "__main__": # pragma: no cover
app()
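For a quick smoke test of the Typer wiring above, the app can be invoked in-process with Typer's test runner. A minimal sketch, assuming the module above is importable as ners.cli (hypothetical path):

from typer.testing import CliRunner

from ners.cli import app  # assumed import path for the module above

runner = CliRunner()
# Equivalent to running: ners monitor status --detailed
result = runner.invoke(app, ["monitor", "status", "--detailed"])
print(result.exit_code)
print(result.output)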
+95
@@ -0,0 +1,95 @@
import logging
from pathlib import Path
from typing import Optional, Union
from ners.core.utils import ensure_directories
from ners.core.config.config_manager import ConfigManager
from ners.core.config.logging_config import LoggingConfig
from ners.core.config.pipeline_config import PipelineConfig
config_manager = ConfigManager()
def get_config() -> PipelineConfig:
"""Get the global configuration instance"""
return config_manager.get_config()
def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig:
"""Load configuration from specified path"""
if config_path:
return config_manager.load_config(Path(config_path))
return config_manager.get_config()
def setup_config(
config_path: Optional[Path] = None, env: str = "development"
) -> PipelineConfig:
"""
Unified configuration loading and logging setup for all entrypoint scripts.
Args:
config_path: Direct path to config file (takes precedence over env)
env: Environment name (defaults to "development")
Returns:
Loaded configuration object
"""
# Determine config path
if config_path is None:
config_path = Path("config") / f"pipeline.{env}.yaml"
# Load configuration
config = ConfigManager(config_path).load_config()
# Setup logging
setup_logging(config)
# Ensure required directories exist
ensure_directories(config)
logging.info(f"Loaded configuration: {config.name} v{config.version}")
logging.info(f"Environment: {config.environment}")
logging.info(f"Config file: {config_path}")
return config
def setup_logging(config: PipelineConfig):
"""Setup logging based on configuration"""
# Create logs directory
log_dir = config.paths.logs_dir
log_dir.mkdir(parents=True, exist_ok=True)
# Setup logging configuration
log_level = getattr(logging, config.logging.level.upper(), logging.INFO)
# Create formatter
formatter = logging.Formatter(config.logging.format)
# Setup root logger
root_logger = logging.getLogger()
root_logger.setLevel(log_level)
# Clear existing handlers
root_logger.handlers.clear()
# Console handler
if config.logging.console_logging:
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
root_logger.addHandler(console_handler)
# File handler
if config.logging.file_logging:
from logging.handlers import RotatingFileHandler
log_file_path = log_dir / config.logging.log_file
file_handler = RotatingFileHandler(
log_file_path,
maxBytes=config.logging.max_log_size,
backupCount=config.logging.backup_count,
)
file_handler.setFormatter(formatter)
root_logger.addHandler(file_handler)
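The intended entrypoint pattern is a single setup_config call that resolves the config file, configures logging, and creates the required directories. A minimal sketch, assuming a config/pipeline.development.yaml exists relative to the working directory:

from pathlib import Path

from ners.core.config import setup_config

# Resolves config/pipeline.development.yaml from the env name
cfg = setup_config(env="development")

# An explicit path takes precedence over the env-based lookup
cfg = setup_config(config_path=Path("config/pipeline.production.yaml"), env="production")
print(cfg.name, cfg.version, cfg.environment)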
+30
@@ -0,0 +1,30 @@
from pydantic import BaseModel, ConfigDict
class NERConfig(BaseModel):
"""NER annotation configuration"""
model_name: str = "drc_names_ner"
retry_attempts: int = 3
class LLMConfig(BaseModel):
"""LLM annotation configuration"""
model_name: str = "mistral:7b"
requests_per_minute: int = 60
requests_per_second: int = 2
retry_attempts: int = 3
timeout_seconds: int = 30
max_concurrent_requests: int = 2
enable_rate_limiting: bool = False
class AnnotationConfig(BaseModel):
"""Base class for annotation configurations"""
llm: LLMConfig = LLMConfig()
ner: NERConfig = NERConfig()
model_config = ConfigDict(arbitrary_types_allowed=True)
+151
@@ -0,0 +1,151 @@
import json
import logging
from pathlib import Path
from typing import Optional, Union, Dict, Any
import yaml
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.config.project_paths import ProjectPaths
class ConfigManager:
"""Centralized configuration management"""
def __init__(self, config_path: Optional[Union[str, Path]] = None):
# Coerce to Path so .exists() and .suffix work even when a str is passed
self.config_path = Path(config_path) if config_path else self._find_config_file()
self._config: Optional[PipelineConfig] = None
self._setup_default_paths()
@classmethod
def _find_config_file(cls) -> Path:
"""Find configuration file in standard locations"""
possible_paths = [
Path.cwd() / "config" / "pipeline.yaml",
Path.cwd() / "config" / "pipeline.yml",
Path.cwd() / "pipeline.yaml",
Path(__file__).parent.parent.parent / "config" / "pipeline.yaml",
]
for path in possible_paths:
if path.exists():
return path
# Return default path if none found
return Path.cwd() / "config" / "pipeline.yaml"
def _setup_default_paths(self):
"""Setup default project paths"""
root_dir = Path(__file__).parent.parent.parent.parent.parent
self.default_paths = ProjectPaths(
root_dir=root_dir,
configs_dir=root_dir / "config",
data_dir=root_dir / "data" / "dataset",
models_dir=root_dir / "data" / "models",
outputs_dir=root_dir / "data" / "outputs",
logs_dir=root_dir / "data" / "logs",
checkpoints_dir=root_dir / "data" / "checkpoints",
)
def load_config(self, config_path: Optional[Path] = None) -> PipelineConfig:
"""Load configuration from file"""
if config_path:
self.config_path = config_path
if not self.config_path.exists():
logging.warning(
f"Config file not found: {self.config_path}. Using defaults."
)
return self._create_default_config()
try:
with open(self.config_path, "r") as f:
if self.config_path.suffix.lower() in [".yaml", ".yml"]:
config_data = yaml.safe_load(f)
else:
config_data = json.load(f)
# Ensure paths are properly set
if "paths" not in config_data:
config_data["paths"] = self.default_paths.model_dump()
self._config = PipelineConfig(**config_data)
return self._config
except Exception as e:
logging.error(f"Failed to load config from {self.config_path}: {e}")
return self._create_default_config()
def _create_default_config(self) -> PipelineConfig:
"""Create default configuration"""
return PipelineConfig(paths=self.default_paths)
def save_config(self, config: PipelineConfig, path: Optional[Path] = None):
"""Save configuration to file"""
save_path = path or self.config_path
save_path.parent.mkdir(parents=True, exist_ok=True)
config_dict = config.model_dump()
# Convert Path objects to strings for serialization
if "paths" in config_dict:
for key, value in config_dict["paths"].items():
if isinstance(value, Path):
config_dict["paths"][key] = str(value)
try:
with open(save_path, "w") as f:
if save_path.suffix.lower() in [".yaml", ".yml"]:
yaml.dump(config_dict, f, default_flow_style=False, indent=2)
else:
json.dump(config_dict, f, indent=2)
logging.info(f"Configuration saved to {save_path}")
except Exception as e:
logging.error(f"Failed to save config to {save_path}: {e}")
def get_config(self) -> PipelineConfig:
"""Get current configuration, loading if necessary"""
if self._config is None:
self._config = self.load_config()
return self._config
def update_config(self, updates: Dict[str, Any]):
"""Update configuration with new values"""
config = self.get_config()
# Deep update configuration
config_dict = config.model_dump()
self._deep_update(config_dict, updates)
self._config = PipelineConfig(**config_dict)
def _deep_update(self, base_dict: Dict, update_dict: Dict):
"""Recursively update nested dictionaries"""
for key, value in update_dict.items():
if (
key in base_dict
and isinstance(base_dict[key], dict)
and isinstance(value, dict)
):
self._deep_update(base_dict[key], value)
else:
base_dict[key] = value
def get_environment_config(self, env: str) -> PipelineConfig:
"""Load environment-specific configuration"""
env_config_path = self.config_path.parent / f"pipeline.{env}.yaml"
if env_config_path.exists():
base_config = self.load_config()
env_config = self.load_config(env_config_path)
# Merge configurations
base_dict = base_config.model_dump()
env_dict = env_config.model_dump()
self._deep_update(base_dict, env_dict)
return PipelineConfig(**base_dict)
return self.get_config()
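update_config merges nested dictionaries key by key rather than replacing whole sections. A small sketch of the effect (values are illustrative):

from ners.core.config.config_manager import ConfigManager

manager = ConfigManager()
manager.update_config({"processing": {"batch_size": 500}, "debug": False})

cfg = manager.get_config()
# batch_size changed; sibling keys such as max_workers keep their previous values
print(cfg.processing.batch_size, cfg.processing.max_workers)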
+32
@@ -0,0 +1,32 @@
from typing import Dict, Optional
from pydantic import BaseModel, Field
class DataConfig(BaseModel):
"""Data handling configuration"""
input_file: str = "names.csv"
output_files: Dict[str, str] = Field(
default_factory=lambda: {
"featured": "names_featured.csv",
"evaluation": "names_evaluation.csv",
"engineered": "names_engineered.csv",
"males": "names_males.csv",
"females": "names_females.csv",
"ner_data": "names_ner.json",
"ner_spacy": "names_ner.spacy",
}
)
selected_columns: list[str] = Field(default_factory=lambda: ["name", "sex", "region"])
split_evaluation: bool = False
split_by_province: bool = True
split_by_gender: bool = True
split_ner_data: bool = True
evaluation_fraction: float = 0.2
random_seed: int = 42
# Dataset size limiting options
max_dataset_size: Optional[int] = None
balance_by_sex: bool = False
+13
@@ -0,0 +1,13 @@
from pydantic import BaseModel
class LoggingConfig(BaseModel):
"""Logging configuration"""
level: str = "INFO"
format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
file_logging: bool = True
console_logging: bool = True
log_file: str = "pipeline.log"
max_log_size: int = 10 * 1024 * 1024 # 10MB
backup_count: int = 5
+29
@@ -0,0 +1,29 @@
from pydantic import BaseModel, ConfigDict
from ners.core.config.annotation_config import AnnotationConfig
from ners.core.config.data_config import DataConfig
from ners.core.config.logging_config import LoggingConfig
from ners.core.config.processing_config import ProcessingConfig
from ners.core.config.project_paths import ProjectPaths
class PipelineConfig(BaseModel):
"""Main pipeline configuration"""
name: str = "drc_names_pipeline"
version: str = "1.0.0"
description: str = "DRC Names NLP Processing Pipeline"
paths: ProjectPaths
stages: list[str] = []
processing: ProcessingConfig = ProcessingConfig()
annotation: AnnotationConfig = AnnotationConfig()
data: DataConfig = DataConfig()
logging: LoggingConfig = LoggingConfig()
# Environment-specific settings
environment: str = "development"
debug: bool = True
model_config = ConfigDict(arbitrary_types_allowed=True)
+17
@@ -0,0 +1,17 @@
from pydantic import BaseModel, Field
class ProcessingConfig(BaseModel):
"""Data processing pipeline configuration"""
batch_size: int = 1000
max_workers: int = 4
checkpoint_interval: int = 5
use_multiprocessing: bool = False
encoding_options: list[str] = Field(
default_factory=lambda: ["utf-8", "utf-16", "latin1"]
)
chunk_size: int = 100_000
epochs: int = 2
+26
@@ -0,0 +1,26 @@
from pathlib import Path
from pydantic import BaseModel, ConfigDict, field_validator
class ProjectPaths(BaseModel):
"""Project directory structure configuration"""
root_dir: Path
data_dir: Path
models_dir: Path
outputs_dir: Path
logs_dir: Path
configs_dir: Path
checkpoints_dir: Path
model_config = ConfigDict(arbitrary_types_allowed=True)
@field_validator("*", mode="before")
@classmethod
def convert_to_path(cls, v):
return Path(v) if not isinstance(v, Path) else v
def get_data_path(self, filename: str) -> Path:
return self.data_dir / filename
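Because the validator runs in "before" mode on every field, plain strings from YAML or JSON are accepted and coerced to Path. A small sketch:

from ners.core.config.project_paths import ProjectPaths

paths = ProjectPaths(
    root_dir=".",
    data_dir="data/dataset",
    models_dir="data/models",
    outputs_dir="data/outputs",
    logs_dir="data/logs",
    configs_dir="config",
    checkpoints_dir="data/checkpoints",
)
print(type(paths.data_dir).__name__)     # PosixPath / WindowsPath
print(paths.get_data_path("names.csv"))  # data/dataset/names.csv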
+46
@@ -0,0 +1,46 @@
import logging
from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from ners.core.config import PipelineConfig
@contextmanager
def temporary_config_override(**overrides):
"""Context manager for temporarily overriding configuration"""
from ners.core.config import get_config
config = get_config()
original_values = {}
# Store original values and apply overrides
for key, value in overrides.items():
if hasattr(config, key):
original_values[key] = getattr(config, key)
setattr(config, key, value)
try:
yield config
finally:
# Restore original values
for key, value in original_values.items():
setattr(config, key, value)
def ensure_directories(config: "PipelineConfig") -> None:
"""Ensure all required directories exist"""
directories = [
config.paths.data_dir,
config.paths.models_dir,
config.paths.outputs_dir,
config.paths.logs_dir,
config.paths.configs_dir,
config.paths.checkpoints_dir,
]
for directory in directories:
Path(directory).mkdir(parents=True, exist_ok=True)
logging.info("Ensured all required directories exist")
+174
@@ -0,0 +1,174 @@
import gc
import logging
from pathlib import Path
from typing import Optional, Union, Iterator, Dict
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
OPTIMIZED_DTYPES = {
# Numeric columns with appropriate bit-width
"year": "Int16", # Years fit in 16-bit integer
"words": "Int8", # Word counts typically < 128
"length": "Int16", # Name lengths fit in 16-bit
"annotated": "Int8", # Binary flag (0/1)
"ner_tagged": "Int8", # Binary flag (0/1)
# Categorical columns (memory efficient for repeated values)
"sex": "category",
"province": "category",
"region": "category",
"identified_category": "category",
"transformation_type": "category",
# String columns with proper string dtype
"name": "string",
"probable_native": "string",
"probable_surname": "string",
"identified_name": "string",
"identified_surname": "string",
"ner_entities": "string",
}
class DataLoader:
"""Reusable data loading utilities"""
def __init__(self, config: PipelineConfig, custom_dtypes: Optional[Dict] = None):
self.config = config
self.dtypes = {**OPTIMIZED_DTYPES, **(custom_dtypes or {})}
def load_csv_chunked(
self, filepath: Union[str, Path], chunk_size: Optional[int] = None
) -> Iterator[pd.DataFrame]:
"""Load CSV file in chunks for memory efficiency"""
chunk_size = chunk_size or self.config.processing.chunk_size
encodings = self.config.processing.encoding_options
filepath = Path(filepath)
for encoding in encodings:
try:
logging.info(f"Reading {filepath} with encoding: {encoding}")
# Read with optimal dtypes
chunk_iter = pd.read_csv(
filepath,
encoding=encoding,
chunksize=chunk_size,
on_bad_lines="skip",
dtype=self.dtypes,
)
for i, chunk in enumerate(chunk_iter):
logging.debug(f"Processing optimized chunk {i + 1}")
yield chunk
logging.info(f"Successfully read {filepath} with encoding: {encoding}")
return
except Exception as e:
logging.warning(f"Failed with encoding {encoding}: {e}")
continue
raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")
def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
"""Load complete CSV with memory optimization"""
chunks = []
for chunk in self.load_csv_chunked(filepath):
chunks.append(chunk)
if not chunks:
return pd.DataFrame()
logging.info(f"Concatenating {len(chunks)} optimized chunks")
df = pd.concat(chunks, ignore_index=True, copy=False)
# Cleanup chunks from memory
del chunks
gc.collect()
# Apply dataset size limiting if configured
if self.config.data.max_dataset_size is not None:
df = self._limit_dataset_size(df)
return df
def _limit_dataset_size(self, df: pd.DataFrame) -> pd.DataFrame:
"""Limit dataset size with optional sex balancing"""
max_size = self.config.data.max_dataset_size
if max_size is None or len(df) <= max_size:
return df
if self.config.data.balance_by_sex and "sex" in df.columns:
return self._balanced_sample(df, max_size)
else:
# Simple random sampling
return df.sample(n=max_size, random_state=self.config.data.random_seed)
def _balanced_sample(self, df: pd.DataFrame, max_size: int) -> pd.DataFrame:
"""Sample data with balanced sex distribution"""
# Get unique sex values
sex_values = df["sex"].dropna().unique()
if len(sex_values) == 0:
logging.warning(
"No valid values found in 'sex' column, using random sampling"
)
return df.sample(n=max_size, random_state=self.config.data.random_seed)
# Calculate samples per sex category
samples_per_sex = max_size // len(sex_values)
remaining_samples = max_size % len(sex_values)
balanced_samples = []
for i, sex in enumerate(sex_values):
# Use boolean indexing instead of creating temporary DataFrames
sex_mask = df["sex"] == sex
sex_indices = df[sex_mask].index
# Distribute remaining samples to first categories
current_samples = samples_per_sex + (1 if i < remaining_samples else 0)
current_samples = min(current_samples, len(sex_indices))
if current_samples > 0:
# Sample indices instead of DataFrame
sampled_indices = pd.Series(sex_indices).sample(
n=current_samples, random_state=self.config.data.random_seed + i
)
balanced_samples.extend(sampled_indices.tolist())
logging.info(f"Sampled {current_samples} records for sex '{sex}'")
if not balanced_samples:
logging.warning(
"No balanced samples could be created, using random sampling"
)
return df.sample(n=max_size, random_state=self.config.data.random_seed)
# Create result using iloc with indices (no copying until final step)
result = df.iloc[balanced_samples].copy()
# Shuffle the final result
result = result.sample(
frac=1, random_state=self.config.data.random_seed
).reset_index(drop=True)
logging.info(
f"Created balanced dataset with {len(result)} records from {len(df)} total"
)
return result
@classmethod
def save_csv(
cls, df: pd.DataFrame, filepath: Union[str, Path], create_dirs: bool = True
) -> None:
"""Save DataFrame to CSV with proper handling"""
filepath = Path(filepath)
if create_dirs:
filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath, index=False, encoding="utf-8", sep=",", quoting=1)
logging.info(f"Saved {len(df)} rows to {filepath}")
+24
@@ -0,0 +1,24 @@
from ners.core.config.pipeline_config import PipelineConfig
class PromptManager:
"""Manage prompts for LLM operations"""
def __init__(self, config: PipelineConfig):
self.config = config
self.prompts_dir = self.config.paths.configs_dir / "prompts"
def load_prompt(self, prompt_name: str = "default") -> str:
"""Load a prompt template"""
prompt_file = self.prompts_dir / f"{prompt_name}.txt"
if not prompt_file.exists():
# Fallback to root directory
fallback_file = self.config.paths.root_dir / "prompt.txt"
if fallback_file.exists():
prompt_file = fallback_file
else:
raise FileNotFoundError(f"Prompt file not found: {prompt_file}")
with open(prompt_file, "r", encoding="utf-8") as f:
return f.read().strip()
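A small sketch of prompt resolution, assuming config/prompts/default.txt exists (with <root>/prompt.txt as the fallback):

from ners.core.config import setup_config
from ners.core.utils.prompt_manager import PromptManager  # assumed module path

cfg = setup_config()
prompt = PromptManager(cfg).load_prompt("default")
print(prompt[:80])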
+56
@@ -0,0 +1,56 @@
import threading
import time
from dataclasses import dataclass
from queue import Queue
@dataclass
class RateLimitConfig:
"""Configuration for rate limiting LLM requests"""
requests_per_minute: int = 60
requests_per_second: int = 2
burst_limit: int = 5
class RateLimiter:
"""Thread-safe rate limiter for LLM requests"""
def __init__(self, config: RateLimitConfig):
self.config = config
self.request_times = Queue()
self.lock = threading.Lock()
self.last_request_time = 0
def wait_if_needed(self):
"""Wait if necessary to respect rate limits"""
with self.lock:
current_time = time.time()
# Check requests per second limit
time_since_last = current_time - self.last_request_time
min_interval = 1.0 / self.config.requests_per_second
if time_since_last < min_interval:
sleep_time = min_interval - time_since_last
time.sleep(sleep_time)
current_time = time.time()
# Clean old request times (older than 1 minute)
while not self.request_times.empty():
if current_time - self.request_times.queue[0] > 60:
self.request_times.get()
else:
break
# Check requests per minute limit
if self.request_times.qsize() >= self.config.requests_per_minute:
oldest_request = self.request_times.queue[0]
wait_time = 60 - (current_time - oldest_request)
if wait_time > 0:
time.sleep(wait_time)
current_time = time.time()
# Record this request
self.request_times.put(current_time)
self.last_request_time = current_time
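A small sketch of wrapping request dispatch with the limiter; each call to wait_if_needed blocks the calling thread just long enough to honor both the per-second and per-minute budgets:

from ners.core.utils.rate_limiter import RateLimitConfig, RateLimiter  # assumed module path

limiter = RateLimiter(RateLimitConfig(requests_per_minute=60, requests_per_second=2))

for prompt in ["name one", "name two", "name three"]:
    limiter.wait_if_needed()       # sleeps if we are ahead of the budget
    print("dispatching:", prompt)  # the real LLM request would go here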
+174
@@ -0,0 +1,174 @@
import unicodedata
from typing import Optional, Dict, Tuple
import pandas as pd
class RegionMapper:
"""Reusable region mapping utilities"""
def __init__(self, mapping: Optional[Dict[str, Tuple[str, str]]] = None):
self.mapping = mapping or REGION_MAPPING
# Normalize to lower-cased keys mapping to the legacy province (second tuple element)
self.mapping = {k.lower(): v[1].upper() for k, v in self.mapping.items()}
def map(self, series: pd.Series) -> pd.Series:
return series.str.lower().map(self.mapping).fillna("AUTRES")
@staticmethod
def clean_province(series: pd.Series) -> pd.Series:
return (
series.str.upper()
.str.strip()
.apply(
lambda x: (
unicodedata.normalize("NFKD", x)
.encode("ascii", errors="ignore")
.decode("utf-8")
if isinstance(x, str)
else x
)
)
)
@staticmethod
def get_provinces():
return [
"kinshasa",
"bas-congo",
"bandundu",
"katanga",
"equateur",
"orientale",
"maniema",
"nord-kivu",
"sud-kivu",
"kasai-occidental",
"kasai-oriental",
"autres",
]
# DRC Region to Province Mapping
REGION_MAPPING: Dict[str, Tuple[str, str]] = {
"bandundu": ("BANDUNDU", "BANDUNDU"),
"bandundu-1": ("BANDUNDU", "BANDUNDU"),
"bandundu-2": ("BANDUNDU", "BANDUNDU"),
"bandundu-3": ("BANDUNDU", "BANDUNDU"),
"bas-congo": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-congo-1": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-congo-2": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-fleuve": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-uele": ("BAS-UELE", "ORIENTALE"),
"bas-uele-1": ("BAS-UELE", "ORIENTALE"),
"bas-uele-2": ("BAS-UELE", "ORIENTALE"),
"cataractes": ("KONGO-CENTRAL", "BAS-CONGO"),
"equateur": ("EQUATEUR", "EQUATEUR"),
"equateur-1": ("EQUATEUR", "EQUATEUR"),
"equateur-2": ("EQUATEUR", "EQUATEUR"),
"equateur-3": ("EQUATEUR", "EQUATEUR"),
"equateur-4": ("EQUATEUR", "EQUATEUR"),
"equateur-5": ("EQUATEUR", "EQUATEUR"),
"haut-katanga": ("HAUT-KATANGA", "KATANGA"),
"haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
"haut-katanga-2": ("HAUT-KATANGA", "KATANGA"),
"haut-lomami": ("HAUT-LOMAMI", "KATANGA"),
"haut-lomami-1": ("HAUT-LOMAMI", "KATANGA"),
"haut-lomami-2": ("HAUT-LOMAMI", "KATANGA"),
"haut-uele": ("HAUT-UELE", "ORIENTALE"),
"haut-uele-1": ("HAUT-UELE", "ORIENTALE"),
"haut-uele-2": ("HAUT-UELE", "ORIENTALE"),
"ituri": ("ITURI", "ORIENTALE"),
"ituri-1": ("ITURI", "ORIENTALE"),
"ituri-2": ("ITURI", "ORIENTALE"),
"ituri-3": ("ITURI", "ORIENTALE"),
"kasai": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-1": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-2": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-ce": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-central": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-central-1": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-central-2": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-occidental": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai-occidental-1": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai-occidental-2": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai-oriental": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-1": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-2": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-3": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-orientale": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"katanga": ("KATANGA", "KATANGA"),
"katanga-1": ("KATANGA", "KATANGA"),
"katanga-2": ("KATANGA", "KATANGA"),
"katanga-3": ("KATANGA", "KATANGA"),
"katanga-4": ("KATANGA", "KATANGA"),
"kinshasa": ("KINSHASA", "KINSHASA"),
"kinshasa-centre": ("KINSHASA", "KINSHASA"),
"kinshasa-est": ("KINSHASA", "KINSHASA"),
"kinshasa-funa": ("KINSHASA", "KINSHASA"),
"kinshasa-global": ("KINSHASA", "KINSHASA"),
"kinshasa-lukunga": ("KINSHASA", "KINSHASA"),
"kinshasa-mont-amba": ("KINSHASA", "KINSHASA"),
"kinshasa-ouest": ("KINSHASA", "KINSHASA"),
"kinshasa-plateau": ("KINSHASA", "KINSHASA"),
"kinshasa-tshangu": ("KINSHASA", "KINSHASA"),
"kongo-central": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-1": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-2": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-3": ("KONGO-CENTRAL", "BAS-CONGO"),
"kwango": ("KWANGO", "BANDUNDU"),
"kwango-1": ("KWANGO", "BANDUNDU"),
"kwango-2": ("KWANGO", "BANDUNDU"),
"kwilu": ("KWILU", "BANDUNDU"),
"kwilu-1": ("KWILU", "BANDUNDU"),
"kwilu-2": ("KWILU", "BANDUNDU"),
"kwilu-3": ("KWILU", "BANDUNDU"),
"lomami": ("LOMAMI", "KASAI-ORIENTAL"),
"lomami-1": ("LOMAMI", "KASAI-ORIENTAL"),
"lomami-2": ("LOMAMI", "KASAI-ORIENTAL"),
"lualaba": ("LUALABA", "KATANGA"),
"lualaba-1": ("LUALABA", "KATANGA"),
"lualaba-2": ("LUALABA", "KATANGA"),
"lualaba-74-corrige-922a": ("LUALABA", "KATANGA"),
"lukaya": ("KONGO-CENTRAL", "BAS-CONGO"),
"mai-ndombe": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-1": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
"maniema": ("MANIEMA", "MANIEMA"),
"maniema-1": ("MANIEMA", "MANIEMA"),
"maniema-2": ("MANIEMA", "MANIEMA"),
"mongala": ("MONGALA", "EQUATEUR"),
"mongala-1": ("MONGALA", "EQUATEUR"),
"mongala-2": ("MONGALA", "EQUATEUR"),
"nord-kivu": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-1": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-2": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-3": ("NORD-KIVU", "NORD-KIVU"),
"nord-ubangi": ("NORD-UBANGI", "EQUATEUR"),
"nord-ubangi-1": ("NORD-UBANGI", "EQUATEUR"),
"nord-ubangi-2": ("NORD-UBANGI", "EQUATEUR"),
"province-orientale": ("ORIENTALE", "ORIENTALE"),
"province-orientale-1": ("ORIENTALE", "ORIENTALE"),
"province-orientale-2": ("ORIENTALE", "ORIENTALE"),
"province-orientale-3": ("ORIENTALE", "ORIENTALE"),
"province-orientale-4": ("ORIENTALE", "ORIENTALE"),
"sankuru": ("SANKURU", "KASAI-ORIENTAL"),
"sankuru-1": ("SANKURU", "KASAI-ORIENTAL"),
"sankuru-2": ("SANKURU", "KASAI-ORIENTAL"),
"sud-kivu": ("SUD-KIVU", "SUD-KIVU"),
"sud-kivu-1": ("SUD-KIVU", "SUD-KIVU"),
"sud-kivu-2": ("SUD-KIVU", "SUD-KIVU"),
"sud-kivu-3": ("SUD-KIVU", "SUD-KIVU"),
"sud-ubangi": ("SUD-UBANGI", "EQUATEUR"),
"sud-ubangi-1": ("SUD-UBANGI", "EQUATEUR"),
"sud-ubangi-2": ("SUD-UBANGI", "EQUATEUR"),
"tanganyika": ("TANGANYIKA", "KATANGA"),
"tanganyika-1": ("TANGANYIKA", "KATANGA"),
"tanganyika-2": ("TANGANYIKA", "KATANGA"),
"tshopo": ("TSHOPO", "ORIENTALE"),
"tshopo-1": ("TSHOPO", "ORIENTALE"),
"tshopo-2": ("TSHOPO", "ORIENTALE"),
"tshuapa": ("TSHUAPA", "EQUATEUR"),
"tshuapa-1": ("TSHUAPA", "EQUATEUR"),
"tshuapa-2": ("TSHUAPA", "EQUATEUR"),
}
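A small sketch of normalizing raw labels to legacy provinces; keys are matched case-insensitively and unknown values fall back to AUTRES:

import pandas as pd

from ners.core.utils.region_mapper import RegionMapper  # assumed module path

mapper = RegionMapper()
raw = pd.Series(["kinshasa-est", "Tshopo-1", "somewhere-else"])
print(mapper.map(raw).tolist())  # ['KINSHASA', 'ORIENTALE', 'AUTRES']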
+41
@@ -0,0 +1,41 @@
import json
import logging
from typing import Dict, Any
from ners.core.config.pipeline_config import PipelineConfig
class StateManager:
"""Manage pipeline state and checkpoints"""
def __init__(self, config: PipelineConfig):
self.config = config
self.checkpoints_dir = self.config.paths.checkpoints_dir
def save_state(self, state: Dict[str, Any], state_name: str) -> None:
"""Save pipeline state"""
self.checkpoints_dir.mkdir(parents=True, exist_ok=True)
state_file = self.checkpoints_dir / f"{state_name}.json"
with open(state_file, "w") as f:
json.dump(state, f, indent=2, default=str)
logging.debug(f"Saved state to {state_file}")
def load_state(self, state_name: str) -> Dict[str, Any]:
"""Load pipeline state"""
state_file = self.checkpoints_dir / f"{state_name}.json"
if not state_file.exists():
return {}
with open(state_file, "r") as f:
return json.load(f)
def clear_state(self, state_name: str) -> None:
"""Clear pipeline state"""
state_file = self.checkpoints_dir / f"{state_name}.json"
if state_file.exists():
state_file.unlink()
logging.info(f"Cleared state: {state_name}")
+37
@@ -0,0 +1,37 @@
from typing import Optional, Dict
import pandas as pd
class TextCleaner:
"""Reusable text cleaning utilities"""
def __init__(self, patterns: Optional[Dict[str, str]] = None):
self.patterns = patterns or {
"null_bytes": "\x00",
"non_breaking_spaces": "\u00a0",
"multiple_spaces": r" +",
"extra_whitespace": r"\s+",
}
def clean_text_series(self, series: pd.Series) -> pd.Series:
"""Clean a pandas Series of text data"""
cleaned = series.astype(str)
# Apply cleaning patterns; the whitespace patterns are regexes, the rest are literals
for pattern_name, pattern in self.patterns.items():
is_regex = pattern_name in ("multiple_spaces", "extra_whitespace")
cleaned = cleaned.str.replace(pattern, " ", regex=is_regex)
return cleaned.str.strip().str.lower()
def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
"""Clean all text columns in a DataFrame"""
df = df.copy()
columns = df.select_dtypes(include=["object", "string"]).columns
for col in columns:
df[col] = self.clean_text_series(df[col])
return df
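A small sketch of the cleaning pass: null bytes and non-breaking spaces become spaces, whitespace runs collapse, and the result is lower-cased and stripped:

import pandas as pd

from ners.core.utils.text_cleaner import TextCleaner  # assumed module path

cleaner = TextCleaner()
s = pd.Series(["  KASONGO\u00a0WA   KANEMA ", "MBUYI\x00TSHIALA"])
print(cleaner.clean_text_series(s).tolist())
# ['kasongo wa kanema', 'mbuyi tshiala']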
+75
@@ -0,0 +1,75 @@
#!/usr/bin/env python3
import logging
from ners.core.utils.data_loader import DataLoader
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.pipeline import Pipeline
from ners.processing.steps.data_cleaning_step import DataCleaningStep
from ners.processing.steps.data_selection_step import DataSelectionStep
from ners.processing.steps.data_splitting_step import DataSplittingStep
from ners.processing.steps.llm_annotation_step import LLMAnnotationStep
from ners.processing.steps.ner_annotation_step import NERAnnotationStep
from ners.processing.steps.feature_extraction_step import FeatureExtractionStep
def create_pipeline(config) -> Pipeline:
batch_config = BatchConfig(
batch_size=config.processing.batch_size,
max_workers=config.processing.max_workers,
checkpoint_interval=config.processing.checkpoint_interval,
use_multiprocessing=config.processing.use_multiprocessing,
)
pipeline = Pipeline(batch_config)
steps = [
DataCleaningStep(config),
FeatureExtractionStep(config),
DataSelectionStep(config),
NERAnnotationStep(config),
LLMAnnotationStep(config),
]
for stage in config.stages:
for step in steps:
if step.name == stage:
pipeline.add_step(step)
return pipeline
def run_pipeline(config) -> int:
try:
logging.info(f"Starting pipeline: {config.name} v{config.version}")
# Load input data
input_file_path = config.paths.get_data_path(config.data.input_file)
if not input_file_path.exists():
logging.error(f"Input file not found: {input_file_path}")
return 1
data_loader = DataLoader(config)
data_splitter = DataSplittingStep(config)
logging.info(f"Loading data from {input_file_path}")
df = data_loader.load_csv_complete(input_file_path)
logging.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
# Create and run pipeline
pipeline = create_pipeline(config)
data_splitter.split(pipeline.run(df))
# Show completion statistics
progress = pipeline.get_progress()
logging.info("=== Pipeline Completion Summary ===")
for step_name, stats in progress.items():
logging.info(
f"{step_name}: {stats['completion_percentage']:.1f}% "
f"({stats['processed_batches']}/{stats['total_batches']} batches)"
)
if stats["failed_batches"] > 0:
logging.warning(f" {stats['failed_batches']} failed batches")
logging.info("Pipeline completed successfully")
return 0
except Exception as e:
logging.error(f"Pipeline failed: {e}", exc_info=True)
return 1
+14
@@ -0,0 +1,14 @@
#!/usr/bin/env python3
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
def status(*, detailed: bool = False) -> None:
PipelineMonitor().print_status(detailed=detailed)
def clean_step(step: str, *, keep_last: int = 1) -> None:
PipelineMonitor().clean_step_checkpoints(step, keep_last)
def reset_step(step: str) -> None:
PipelineMonitor().reset_step(step)
+80
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
import logging
import os
from pathlib import Path
from ners.core.config import PipelineConfig
from ners.processing.ner.name_builder import NameBuilder
from ners.processing.ner.name_engineering import NameEngineering
from ners.processing.ner.name_model import NameModel
def feature(config: PipelineConfig):
NameEngineering(config).compute()
def build(config: PipelineConfig):
NameBuilder(config).build()
def train(config: PipelineConfig):
name_model = NameModel(config)
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
if not data_path.exists():
logging.info("NER data not found. Building dataset first...")
build(config)
name_model.create_blank_model("fr")
data = name_model.load_data(str(data_path))
split_idx = int(len(data) * 0.9)
train_data, eval_data = data[:split_idx], data[split_idx:]
logging.info(
f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}"
)
name_model.train(
data=train_data,
epochs=config.processing.epochs,
batch_size=config.processing.batch_size,
dropout_rate=0.3,
)
evaluation_results = name_model.evaluate(eval_data)
model_path = name_model.save()
logging.info(f"Model saved to: {model_path}")
print(f"Evaluation results: {evaluation_results}")
def run_pipeline(config: PipelineConfig, reset: bool = False):
if not reset and os.path.exists(
config.paths.get_data_path(config.data.output_files["engineered"])
):
logging.info("Step 1: Feature engineering already done.")
else:
logging.info("Step 1: Running feature engineering")
feature(config)
if not reset and os.path.exists(
config.paths.get_data_path(config.data.output_files["ner_data"])
):
logging.info("Step 2: NER dataset already built.")
else:
logging.info("Step 2: Building NER dataset")
build(config)
logging.info("Step 3: Training NER Model")
train(config)
return 0
def main() -> int:
logging.error("This module is no longer a CLI. Use 'ners ner ...' instead.")
return 1
+13
@@ -0,0 +1,13 @@
from dataclasses import dataclass
@dataclass
class BatchConfig:
"""Configuration for batch processing"""
batch_size: int = 1000
max_workers: int = 4
checkpoint_interval: int = 5 # Save checkpoint every N batches
use_multiprocessing: bool = False  # Use ProcessPoolExecutor instead of ThreadPoolExecutor
@@ -0,0 +1,173 @@
import logging
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from typing import Iterator
import pandas as pd
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.batch.memory_monitor import MemoryMonitor
from ners.processing.steps import PipelineStep
class BatchProcessor:
"""Handles batch processing with concurrency and checkpointing"""
def __init__(self, config: BatchConfig):
self.config = config
self.memory_monitor = MemoryMonitor()
def create_batches(self, df: pd.DataFrame) -> Iterator[tuple[pd.DataFrame, int]]:
"""Create batches from DataFrame without unnecessary copies"""
total_rows = len(df)
batch_size = self.config.batch_size
for i in range(0, total_rows, batch_size):
batch = df.iloc[i : i + batch_size]
batch_id = i // batch_size
yield batch, batch_id
def process_sequential(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Memory-optimized sequential processing"""
results = []
memory_threshold_mb = 1000 # Clean memory when usage exceeds 1 GB
for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
if step.batch_exists(batch_id):
logging.info(
f"Batch {batch_id} already processed, loading from checkpoint"
)
processed_batch = step.load_batch(batch_id)
else:
try:
# Only copy if the processing step requires mutation
if step.requires_batch_mutation:
batch_copy = batch.copy()
processed_batch = step.process_batch(batch_copy, batch_id)
else:
processed_batch = step.process_batch(batch, batch_id)
step.save_batch(processed_batch, batch_id)
step.state.processed_batches += 1
except Exception as e:
logging.error(f"Failed to process batch {batch_id}: {e}")
step.state.failed_batches.append(batch_id)
continue
results.append(processed_batch)
# Memory management
if batch_num % self.config.checkpoint_interval == 0:
current_memory = self.memory_monitor.get_memory_usage_mb()
if current_memory > memory_threshold_mb:
logging.info(f"Memory cleanup triggered at {current_memory:.1f} MB")
self.memory_monitor.cleanup_memory()
# Save state periodically
if batch_id % self.config.checkpoint_interval == 0:
step.save_state()
# Final memory cleanup before concatenation
self.memory_monitor.cleanup_memory()
self.memory_monitor.log_memory_usage("before_concat")
result = self._safe_concat(results) if results else pd.DataFrame()
# Final cleanup
del results
self.memory_monitor.cleanup_memory()
self.memory_monitor.log_memory_usage("sequential_complete")
return result
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Memory-optimized concurrent processing"""
executor_class = (
ProcessPoolExecutor
if self.config.use_multiprocessing
else ThreadPoolExecutor
)
results = {}
with executor_class(max_workers=self.config.max_workers) as executor:
# Submit all batches
future_to_batch = {}
for batch, batch_id in self.create_batches(df):
if step.batch_exists(batch_id):
logging.info(
f"Batch {batch_id} already processed, loading from checkpoint"
)
results[batch_id] = step.load_batch(batch_id)
else:
# Only copy if necessary for concurrent processing
batch_copy = batch.copy() if step.requires_batch_mutation else batch
future = executor.submit(step.process_batch, batch_copy, batch_id)
future_to_batch[future] = (batch_id, batch)
# Collect results as they complete
for future in as_completed(future_to_batch):
batch_id, batch = future_to_batch[future]
try:
processed_batch = future.result()
step.save_batch(processed_batch, batch_id)
results[batch_id] = processed_batch
step.state.processed_batches += 1
logging.info(f"Completed batch {batch_id}")
except Exception as e:
logging.error(f"Failed to process batch {batch_id}: {e}")
step.state.failed_batches.append(batch_id)
# Memory-efficient reassembly
ordered_results = []
for batch_id in sorted(results.keys()):
ordered_results.append(results[batch_id])
step.save_state()
# Cleanup before concat
del results
self.memory_monitor.cleanup_memory()
result = (
self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
)
# Final cleanup
del ordered_results
self.memory_monitor.cleanup_memory()
return result
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Process data using the configured strategy"""
step.state.total_batches = (
len(df) + self.config.batch_size - 1
) // self.config.batch_size
step.load_state()
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
self.memory_monitor.log_memory_usage("process_start")
if self.config.max_workers == 1:
result = self.process_sequential(step, df)
else:
result = self.process_concurrent(step, df)
self.memory_monitor.log_memory_usage("process_complete")
return result
def _safe_concat(self, dfs: list) -> pd.DataFrame:
"""Memory-safe concatenation with monitoring"""
if not dfs:
return pd.DataFrame()
memory = self.memory_monitor.get_memory_usage_mb()
logging.info(f"Starting concat of {len(dfs)} DataFrames at {memory:.1f} MB")
# Use copy=False to avoid unnecessary copying during concat
result = pd.concat(dfs, ignore_index=True, copy=False)
# Monitor memory after concat
memory = self.memory_monitor.get_memory_usage_mb()
logging.info(f"Concat complete. Memory: {memory:.1f} MB")
return result
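A minimal sketch of driving a single step through the processor; max_workers == 1 selects the sequential path, anything higher the concurrent one, and already-checkpointed batches are loaded instead of recomputed. The step wiring here is illustrative:

import pandas as pd

from ners.core.config import setup_config
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.batch.batch_processor import BatchProcessor  # assumed module path
from ners.processing.steps.data_cleaning_step import DataCleaningStep

cfg = setup_config()
processor = BatchProcessor(BatchConfig(batch_size=1000, max_workers=1))
df = pd.DataFrame({"name": ["KASONGO WA KANEMA"], "sex": ["M"], "region": ["kinshasa"]})
result = processor.process(DataCleaningStep(cfg), df)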
@@ -0,0 +1,25 @@
import gc
import logging
import psutil
class MemoryMonitor:
"""Monitor and manage memory usage during batch processing"""
@staticmethod
def get_memory_usage_mb() -> float:
"""Get current memory usage in MB"""
process = psutil.Process()
return process.memory_info().rss / 1024 / 1024
@staticmethod
def cleanup_memory():
"""Force garbage collection"""
gc.collect()
@staticmethod
def log_memory_usage(step_name: str):
"""Log current memory usage"""
memory_mb = MemoryMonitor.get_memory_usage_mb()
logging.info(f"Memory usage after {step_name}: {memory_mb:.1f} MB")
@@ -0,0 +1,196 @@
import json
import logging
import shutil
from datetime import datetime
from typing import Optional, Dict
from ners.core.config.config_manager import ConfigManager
from ners.core.config.project_paths import ProjectPaths
class PipelineMonitor:
"""Monitor and manage pipeline execution"""
def __init__(self, paths: Optional[ProjectPaths] = None):
if paths is None:
# Use default configuration if none provided
config_manager = ConfigManager()
paths = config_manager.default_paths
self.paths = paths
self.checkpoint_dir = paths.checkpoints_dir
self.steps = [
"data_cleaning",
"data_selection",
"feature_extraction",
"ner_annotation",
"llm_annotation",
"data_splitting",
]
def get_step_status(self, step_name: str) -> Dict:
"""Get status of a specific pipeline step"""
step_dir = self.checkpoint_dir / step_name
state_file = step_dir / "pipeline_state.json"
if not state_file.exists():
return {
"step": step_name,
"status": "not_started",
"processed_batches": 0,
"total_batches": 0,
"failed_batches": 0,
"completion_percentage": 0.0,
}
try:
with open(state_file, "r") as f:
state = json.load(f)
processed = state.get("processed_batches", 0)
total = state.get("total_batches", 0)
failed = len(state.get("failed_batches", []))
if total == 0:
completion = 0.0
status = "not_started"
elif processed >= total:
completion = 100.0
status = "completed" if failed == 0 else "completed_with_errors"
else:
completion = (processed / total) * 100
status = "in_progress"
return {
"step": step_name,
"status": status,
"processed_batches": processed,
"total_batches": total,
"failed_batches": failed,
"completion_percentage": completion,
"last_checkpoint": state.get("last_checkpoint"),
"failed_batch_ids": state.get("failed_batches", []),
}
except Exception as e:
logging.error(f"Error reading state for {step_name}: {e}")
return {"step": step_name, "status": "error", "error": str(e)}
def get_pipeline_status(self) -> Dict:
"""Get overall pipeline status"""
step_statuses = {}
overall_status = "not_started"
total_completion = 0.0
for step in self.steps:
status = self.get_step_status(step)
step_statuses[step] = status
if status["status"] == "error":
overall_status = "error"
elif status["status"] in ["in_progress"]:
overall_status = "in_progress"
elif status["status"] == "completed_with_errors":
overall_status = "completed_with_errors"
total_completion += status.get("completion_percentage", 0)
avg_completion = total_completion / len(self.steps)
if avg_completion >= 100 and overall_status not in [
"error",
"completed_with_errors",
]:
overall_status = "completed"
return {
"overall_status": overall_status,
"overall_completion": avg_completion,
"steps": step_statuses,
"timestamp": datetime.now().isoformat(),
}
def print_status(self, detailed: bool = False):
"""Print pipeline status in a human-readable format"""
status = self.get_pipeline_status()
print("\n=== Pipeline Status ===")
print(f"Overall Status: {status['overall_status'].upper()}")
print(f"Overall Completion: {status['overall_completion']:.1f}%")
print(f"Last Updated: {status['timestamp']}")
print()
for step_name, step_status in status["steps"].items():
print(f"{step_name.replace('_', ' ').title()}:")
print(f" Status: {step_status['status']}")
print(f" Progress: {step_status['completion_percentage']:.1f}%")
print(
f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}"
)
if step_status["failed_batches"] > 0:
print(f" Failed Batches: {step_status['failed_batches']}")
if detailed and "failed_batch_ids" in step_status:
print(f" Failed Batch IDs: {step_status['failed_batch_ids']}")
print()
def count_checkpoint_files(self) -> Dict:
"""Count checkpoint files for each step"""
counts = {}
total_size = 0
for step in self.steps:
step_dir = self.checkpoint_dir / step
if step_dir.exists():
csv_files = list(step_dir.glob("*.csv"))
step_size = sum(f.stat().st_size for f in csv_files)
counts[step] = {
"files": len(csv_files),
"size_mb": step_size / (1024 * 1024),
}
total_size += step_size
else:
counts[step] = {"files": 0, "size_mb": 0}
counts["total_size_mb"] = total_size / (1024 * 1024)
return counts
def clean_step_checkpoints(self, step_name: str, keep_last: int = 1):
"""Clean checkpoint files for a specific step"""
step_dir = self.checkpoint_dir / step_name
if not step_dir.exists():
logging.info(f"No checkpoints found for {step_name}")
return
csv_files = sorted(step_dir.glob("batch_*.csv"))
if len(csv_files) <= keep_last:
logging.info(
f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all"
)
return
files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files
for file_path in files_to_delete:
try:
file_path.unlink()
logging.info(f"Deleted {file_path}")
except Exception as e:
logging.error(f"Failed to delete {file_path}: {e}")
def reset_step(self, step_name: str):
"""Reset a pipeline step by removing its checkpoints and state"""
step_dir = self.checkpoint_dir / step_name
if step_dir.exists():
try:
shutil.rmtree(step_dir)
logging.info(f"Reset step: {step_name}")
except Exception as e:
logging.error(f"Failed to reset {step_name}: {e}")
else:
logging.info(f"Step {step_name} has no checkpoints to reset")
@@ -0,0 +1,94 @@
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple
import pandas as pd
from ners.processing.steps.feature_extraction_step import NameCategory
class BaseNameFormatter(ABC):
"""
Base class for name formatting transformations.
Contains common logic for NER tagging and attribute computation.
"""
def __init__(
self, connectors: Optional[List[str]] = None, additional_surnames: Optional[List[str]] = None
):
self.connectors = connectors or ["wa", "ya", "ka", "ba"]
self.additional_surnames = additional_surnames or [
"jean",
"paul",
"marie",
"joseph",
"pierre",
"claude",
"andre",
"michel",
"robert",
]
@classmethod
def parse_native_components(cls, native_str: str) -> List[str]:
"""Parse native name string into individual components"""
if pd.isna(native_str) or not native_str:
return []
return native_str.strip().split()
def create_ner_tags(
self, text: str, native_parts: List[str], surname: str
) -> List[Tuple[int, int, str]]:
"""Create NER entity tags for transformed text"""
entities = []
current_pos = 0
words = text.split()
for word in words:
start_pos = current_pos
end_pos = current_pos + len(word)
# Determine tag based on word content
if word in native_parts or any(
connector in word for connector in self.connectors
):
tag = "NATIVE"
elif word == surname or word in self.additional_surnames:
tag = "SURNAME"
else:
# Check if it's a compound native word or new surname
if any(part in word for part in native_parts):
tag = "NATIVE"
else:
tag = "SURNAME"
entities.append((start_pos, end_pos, tag))
current_pos = end_pos + 1 # +1 for space
return entities
@classmethod
def compute_numeric_features(cls, name: str) -> Dict:
"""Compute all derived attributes for the transformed name"""
words_count = len(name.split()) if name else 0
length = len(name) if name else 0
return {
"words": words_count,
"length": length,
"identified_category": (
NameCategory.SIMPLE.value
if words_count == 3
else NameCategory.COMPOSE.value
),
}
@abstractmethod
def transform(self, row: pd.Series) -> Dict:
"""Transform a row according to the specific format rules"""
pass
@property
@abstractmethod
def transformation_type(self) -> str:
"""Return the transformation type identifier"""
pass
@@ -0,0 +1,38 @@
import random
from typing import Dict
import pandas as pd
from ners.processing.ner.formats import BaseNameFormatter
class ConnectorFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
connector = random.choice(self.connectors)
# Connect native parts with a random connector
if len(native_parts) > 1:
connected_native = f" {connector} ".join(native_parts)
full_name = f"{connected_native} {surname}".strip()
else:
connected_native = (
f"{row['probable_native']} {connector} {row['probable_native']}".strip()
)
full_name = f"{connected_native} {surname}".strip()
return {
"name": full_name,
"probable_native": connected_native,
"identified_name": connected_native,
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return "connector_added"
@@ -0,0 +1,36 @@
import random
from typing import Dict
import pandas as pd
from ners.processing.ner.formats import BaseNameFormatter
class ExtendedSurnameFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
original_surname = (
row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
)
# Add random additional surname
additional_surname = random.choice(self.additional_surnames)
combined_surname = f"{additional_surname} {original_surname}".strip()
full_name = f"{row['probable_native']} {combined_surname}".strip()
return {
"name": full_name,
"probable_native": row["probable_native"],
"identified_name": row["probable_native"],
"probable_surname": combined_surname,
"identified_surname": combined_surname,
"ner_entities": str(
self.create_ner_tags(full_name, native_parts, combined_surname)
),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return "extended_surname"
@@ -0,0 +1,28 @@
from typing import Dict
import pandas as pd
from ners.processing.ner.formats import BaseNameFormatter
class NativeOnlyFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
# Only native components
full_name = row["probable_native"]
return {
"name": full_name,
"probable_native": row["probable_native"],
"identified_name": row["probable_native"],
"probable_surname": "",
"identified_surname": "",
"ner_entities": str(self.create_ner_tags(full_name, native_parts, "")),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return "native_only"
@@ -0,0 +1,29 @@
from typing import Dict
import pandas as pd
from ners.processing.ner.formats import BaseNameFormatter
class OriginalFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
# Keep original order: native components + surname
full_name = f"{row['probable_native']} {surname}".strip()
return {
"name": full_name,
"probable_native": row["probable_native"],
"identified_name": row["probable_native"],
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return "original"
@@ -0,0 +1,29 @@
from typing import Dict
import pandas as pd
from ners.processing.ner.formats import BaseNameFormatter
class PositionFlippedFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
# Flip order: surname + native components
full_name = f"{surname} {row['probable_native']}".strip()
return {
"name": full_name,
"probable_native": row["probable_native"],
"identified_name": row["probable_native"],
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return "position_flipped"
@@ -0,0 +1,34 @@
from typing import Dict
import pandas as pd
from ners.processing.ner.formats import BaseNameFormatter
class ReducedNativeFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
# Keep only first native component + surname
reduced_native = (
native_parts[0] if len(native_parts) > 1 else row["probable_native"]
)
full_name = f"{reduced_native} {surname}".strip()
return {
"name": full_name,
"probable_native": reduced_native,
"identified_name": reduced_native,
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(
self.create_ner_tags(full_name, [reduced_native], surname)
),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return "reduced_native"
@@ -0,0 +1,87 @@
import json
import logging
import spacy
from spacy.tokens import DocBin
from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from .name_tagger import NameTagger
class NameBuilder:
def __init__(self, config: PipelineConfig):
config = config.model_copy(deep=True)
config.data.max_dataset_size = 1_000_000
config.data.balance_by_sex = True
self.config = config
self.data_loader = DataLoader(config)
self.tagger = NameTagger()
def build(self) -> int:
filepath = self.config.paths.get_data_path(
self.config.data.output_files["engineered"]
)
df = self.data_loader.load_csv_complete(filepath)
df = df[["name", "ner_tagged", "ner_entities"]]
# Filter early
ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
if ner_df.empty:
logging.error("No NER tagged data found")
return 1
total_rows = len(df)
del df # No need to keep in memory
logging.info(f"Found {len(ner_df)} NER tagged entries")
nlp = spacy.blank("fr")
        # Use NameTagger for parsing and validation
parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
validated_entities = self.tagger.validate_entities(
ner_df["name"], parsed_entities
)
# Drop rows with no valid entities
mask = validated_entities.map(bool)
ner_df = ner_df.loc[mask]
validated_entities = validated_entities.loc[mask]
if ner_df.empty:
logging.error("No valid training examples after validation")
return 1
# Prepare training data
training_data = list(
zip(
ner_df["name"].tolist(),
[{"entities": ents} for ents in validated_entities],
)
)
        # Use NameTagger to create spaCy DocBin
docs = self.tagger.create_docs(
nlp, ner_df["name"].tolist(), validated_entities.tolist()
)
doc_bin = DocBin(docs=docs)
# Save
json_path = self.config.paths.get_data_path(
self.config.data.output_files["ner_data"]
)
spacy_path = self.config.paths.get_data_path(
self.config.data.output_files["ner_spacy"]
)
with open(json_path, "w", encoding="utf-8") as f:
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
doc_bin.to_disk(spacy_path)
        logging.info(
            f"Processed {len(training_data)} of {total_rows} rows "
            f"({total_rows - len(training_data)} filtered or invalid)"
        )
logging.info(f"Saved NER JSON to {json_path}")
logging.info(f"Saved NER spacy to {spacy_path}")
return 0
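# Illustrative sketch of reading back the artifacts written by build(); the
# filename below is an assumption standing in for the config's output_files entry.
if __name__ == "__main__":
    nlp = spacy.blank("fr")
    doc_bin = DocBin().from_disk("ner_training.spacy")  # assumed filename
    docs = list(doc_bin.get_docs(nlp.vocab))
    logging.info(f"Loaded {len(docs)} docs; first ents: {[(e.text, e.label_) for e in docs[0].ents]}")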
@@ -0,0 +1,142 @@
import gc
import random
import logging
import numpy as np
import pandas as pd
from tqdm import tqdm
from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.ner.formats.connectors_format import ConnectorFormatter
from ners.processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
from ners.processing.ner.formats.native_only_format import NativeOnlyFormatter
from ners.processing.ner.formats.original_format import OriginalFormatter
from ners.processing.ner.formats.position_flipped_format import PositionFlippedFormatter
from ners.processing.ner.formats.reduced_native_format import ReducedNativeFormatter
class NameEngineering:
"""
Feature engineering for NER dataset to prevent position-based learning
and encourage sequence characteristic learning.
"""
def __init__(self, config: PipelineConfig):
self.config = config
self.data_loader = DataLoader(config)
self.connectors = ["wa", "ya", "ka", "ba", "la"]
self.additional_surnames = [
"jean",
"paul",
"marie",
"joseph",
"pierre",
"claude",
"andre",
"michel",
"robert",
]
random.seed(self.config.data.random_seed)
np.random.seed(self.config.data.random_seed)
# Initialize format classes
self.formatters = {
"original": OriginalFormatter(self.connectors, self.additional_surnames),
"native_only": NativeOnlyFormatter(
self.connectors, self.additional_surnames
),
"position_flipped": PositionFlippedFormatter(
self.connectors, self.additional_surnames
),
"reduced_native": ReducedNativeFormatter(
self.connectors, self.additional_surnames
),
"connector_added": ConnectorFormatter(
self.connectors, self.additional_surnames
),
"extended_surname": ExtendedSurnameFormatter(
self.connectors, self.additional_surnames
),
}
def load_data(self) -> pd.DataFrame:
"""Load and filter NER-tagged data from CSV file"""
filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
df = self.data_loader.load_csv_complete(filepath)
# Filter only NER-tagged rows
ner_data = df[df["ner_tagged"] == 1].copy()
logging.info(
f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records"
)
return ner_data
def compute(self) -> None:
logging.info("Applying feature engineering transformations...")
input_filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
output_filepath = self.config.paths.get_data_path(
self.config.data.output_files["engineered"]
)
df = self.data_loader.load_csv_complete(input_filepath)
ner_df = df[df["ner_tagged"] == 1].copy()
logging.info(
f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records"
)
del df # No need to keep in memory
gc.collect()
ner_df = ner_df.sample(
frac=1, random_state=self.config.data.random_seed
).reset_index(drop=True)
total_rows = len(ner_df)
# Calculate split points
split_25_1 = int(total_rows * 0.25)
split_25_2 = int(total_rows * 0.50)
split_25_3 = int(total_rows * 0.75)
split_10_1 = int(total_rows * 0.85)
split_10_2 = int(total_rows * 0.95)
# Define transformation groups
groups = [
(0, split_25_1, "original"), # First 25%: original format
(split_25_1, split_25_2, "native_only"), # Second 25%: remove surname
(split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions
(
split_25_3,
split_10_1,
"reduced_native",
), # Fourth 10%: reduce native components
(split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors
(split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames
]
for start, end, trans_type in groups:
logging.info(f"Group {trans_type}: {start} to {end} ({end - start} rows)")
# Process each group
rows = []
for start, end, formatter_key in groups:
formatter = self.formatters[formatter_key]
for idx in tqdm(range(start, end), desc=f"Processing {formatter_key}"):
row = ner_df.iloc[idx]
transformed = formatter.transform(row)
# Keep original columns and add transformed ones
new_row = row.to_dict()
new_row.update(transformed)
rows.append(new_row)
self.data_loader.save_csv(pd.DataFrame(rows), output_filepath)
logging.info(f"Engineered dataset saved to {output_filepath}")
@@ -0,0 +1,430 @@
import ast
import json
import logging
import os
import random
from pathlib import Path
from typing import Dict, Any, List, Tuple
import spacy
from spacy.training import Example
from spacy.util import minibatch
from tqdm import tqdm
from ners.core.config.pipeline_config import PipelineConfig
class NameModel:
"""NER model trainer using spaCy for DRC names entity recognition"""
def __init__(self, config: PipelineConfig):
self.config = config
self.nlp = None
self.ner = None
self.model_path = None
self.training_stats = {}
self.evaluation_stats = {}
def create_blank_model(self, language: str = "fr") -> None:
"""Create a blank spaCy model with NER pipeline"""
logging.info(f"Creating blank {language} model for NER training")
# Prefer GPU for spaCy if available (falls back to CPU automatically)
try:
if spacy.prefer_gpu():
logging.info("spaCy GPU enabled (cupy) for NER training")
else:
logging.info("spaCy running on CPU")
except Exception as e:
logging.debug(f"spaCy GPU selection skipped: {e}")
# Create blank model - French tokenizer works well for DRC names
self.nlp = spacy.blank(language)
# Add NER pipeline component
if "ner" not in self.nlp.pipe_names:
self.ner = self.nlp.add_pipe("ner")
else:
self.ner = self.nlp.get_pipe("ner")
# Add our custom labels
self.ner.add_label("NATIVE")
self.ner.add_label("SURNAME")
logging.info("Blank model created with NATIVE and SURNAME labels")
@classmethod
def load_data(cls, data_path: str) -> List[Tuple[str, Dict]]:
"""Load training data from JSON file - compatible with NERNameTagger output format"""
if not os.path.exists(data_path):
raise FileNotFoundError(f"Training data not found at {data_path}")
logging.info(f"Loading training data from {data_path}")
with open(data_path, "r", encoding="utf-8") as f:
raw_data = json.load(f)
# Validate and clean training data
valid_data = []
skipped_count = 0
for i, item in enumerate(raw_data):
try:
if not isinstance(item, (list, tuple)) or len(item) != 2:
logging.warning(
f"Skipping invalid training example format at index {i}: {item}"
)
skipped_count += 1
continue
text, annotations = item
# Validate text
if not isinstance(text, str) or not text.strip():
logging.warning(f"Skipping invalid text at index {i}: {repr(text)}")
skipped_count += 1
continue
                # Handle different annotation formats from NameTagger
if not isinstance(annotations, dict) or "entities" not in annotations:
logging.warning(
f"Skipping invalid annotations at index {i}: {annotations}"
)
skipped_count += 1
continue
entities_raw = annotations["entities"]
# Parse entities - handle both string and list formats from tagger
if isinstance(entities_raw, str):
# String format from tagger: "[(0, 6, 'NATIVE'), ...]"
try:
entities = ast.literal_eval(entities_raw)
if not isinstance(entities, list):
logging.warning(
f"Parsed entities is not a list at index {i}: {entities}"
)
skipped_count += 1
continue
except (ValueError, SyntaxError) as e:
logging.warning(
f"Failed to parse entity string at index {i}: {entities_raw} ({e})"
)
skipped_count += 1
continue
elif isinstance(entities_raw, list):
# Already in list format
entities = entities_raw
else:
logging.warning(
f"Skipping invalid entities format at index {i}: {entities_raw}"
)
skipped_count += 1
continue
# Validate each entity
valid_entities = []
for entity in entities:
if not isinstance(entity, (list, tuple)) or len(entity) != 3:
logging.warning(
f"Skipping invalid entity format in '{text}': {entity}"
)
continue
start, end, label = entity
# Validate entity components
if (
not isinstance(start, int)
or not isinstance(end, int)
or not isinstance(label, str)
or start >= end
or start < 0
or end > len(text)
):
logging.warning(
f"Skipping invalid entity bounds in '{text}': {entity}"
)
continue
# Check for overlaps with already validated entities
has_overlap = any(
start < v_end and end > v_start
for v_start, v_end, _ in valid_entities
)
if has_overlap:
logging.warning(
f"Skipping overlapping entity in '{text}': {entity}"
)
continue
# Validate that the span doesn't contain spaces (matching tagger validation)
span_text = text[start:end]
if (
not span_text
or span_text != span_text.strip()
or " " in span_text
):
logging.warning(
f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
)
continue
valid_entities.append((start, end, label))
if not valid_entities:
logging.warning(
f"Skipping training example with no valid entities: '{text}'"
)
skipped_count += 1
continue
# Sort entities by start position
valid_entities.sort(key=lambda x: x[0])
valid_data.append((text.strip(), {"entities": valid_entities}))
except Exception as e:
logging.error(f"Error processing training example at index {i}: {e}")
skipped_count += 1
continue
logging.info(
f"Loaded {len(valid_data)} valid training examples, skipped {skipped_count} invalid ones"
)
if not valid_data:
raise ValueError("No valid training examples found in the data")
return valid_data
def train(
self,
data: List[Tuple[str, Dict]],
epochs: int = 1,
batch_size: int = 10_000,
dropout_rate: float = 0.3,
) -> None:
"""Train the NER model"""
logging.info(f"Starting NER training with {len(data)} examples")
logging.info(
f"Training parameters: epochs={epochs}, batch_size={batch_size}, dropout={dropout_rate}"
)
if self.nlp is None:
raise ValueError("Model not initialized. Call create_blank_model() first.")
# Initialize the model
self.nlp.initialize()
optimizer = self.nlp.resume_training()
losses_history = []
for epoch in range(epochs):
losses = {}
examples = []
for text, annotations in tqdm(data, desc="Create training examples"):
doc = self.nlp.make_doc(text)
examples.append(Example.from_dict(doc, annotations))
# Shuffle examples each epoch (important!)
random.shuffle(examples)
# Train in batches
batches = minibatch(examples, size=batch_size)
for batch in batches:
batch_losses = {}
self.nlp.update(
batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer
)
logging.info(
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
)
# Accumulate into total losses dict
for k, v in batch_losses.items():
losses[k] = losses.get(k, 0.0) + v
del batches # free memory
losses_history.append(losses.get("ner", 0))
logging.info(f"Epoch {epoch + 1}/{epochs}, Total Loss: {losses['ner']:.4f}")
# Store training statistics
self.training_stats = {
"epochs": epochs,
"final_loss": losses_history[-1] if losses_history else 0,
"training_examples": len(data),
"loss_history": losses_history,
"batch_size": batch_size,
"dropout_rate": dropout_rate,
}
logging.info(
f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}"
)
def evaluate(self, test_data: List[Tuple[str, Dict]]) -> Dict[str, Any]:
"""Evaluate the trained model on test data"""
if self.nlp is None:
raise ValueError("Model not trained. Call train_model() first.")
logging.info(f"Evaluating model on {len(test_data)} test examples")
total_examples = len(test_data)
correct_entities = 0
predicted_entities = 0
actual_entities = 0
entity_stats = {
"NATIVE": {"tp": 0, "fp": 0, "fn": 0},
"SURNAME": {"tp": 0, "fp": 0, "fn": 0},
}
for text, annotations in test_data:
# Get actual entities
actual_ents = set()
for start, end, label in annotations.get("entities", []):
actual_ents.add((start, end, label))
actual_entities += 1
# Get predicted entities
doc = self.nlp(text)
predicted_ents = set()
for ent in doc.ents:
predicted_ents.add((ent.start_char, ent.end_char, ent.label_))
predicted_entities += 1
# Calculate matches
matches = actual_ents.intersection(predicted_ents)
correct_entities += len(matches)
# Update per-label statistics
for start, end, label in actual_ents:
if (start, end, label) in predicted_ents:
entity_stats[label]["tp"] += 1
else:
entity_stats[label]["fn"] += 1
for start, end, label in predicted_ents:
if (start, end, label) not in actual_ents:
entity_stats[label]["fp"] += 1
# Calculate overall metrics
precision = (
correct_entities / predicted_entities if predicted_entities > 0 else 0
)
recall = correct_entities / actual_entities if actual_entities > 0 else 0
f1_score = (
2 * (precision * recall) / (precision + recall)
if (precision + recall) > 0
else 0
)
# Calculate per-label metrics
label_metrics = {}
for label, stats in entity_stats.items():
tp, fp, fn = stats["tp"], stats["fp"], stats["fn"]
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
label_f1 = (
(
2
* (label_precision * label_recall)
/ (label_precision + label_recall)
)
if (label_precision + label_recall) > 0
else 0
)
label_metrics[label] = {
"precision": label_precision,
"recall": label_recall,
"f1_score": label_f1,
"support": tp + fn,
}
self.evaluation_stats = {
"overall": {
"precision": precision,
"recall": recall,
"f1_score": f1_score,
"total_examples": total_examples,
"correct_entities": correct_entities,
"predicted_entities": predicted_entities,
"actual_entities": actual_entities,
},
"by_label": label_metrics,
}
return self.evaluation_stats
def save(self, model_name: str = "drc_ner_model") -> str:
"""Save the trained model"""
if self.nlp is None:
raise ValueError("No model to save. Train a model first.")
# Create model directory
model_dir = self.config.paths.models_dir / model_name
model_dir.mkdir(parents=True, exist_ok=True)
# Save the model
self.nlp.to_disk(model_dir)
self.model_path = str(model_dir)
# Save training and evaluation statistics
training_stats_path = model_dir / "training_stats.json"
with open(training_stats_path, "w", encoding="utf-8") as f:
json.dump(self.training_stats, f, indent=2)
evaluation_stats_path = model_dir / "evaluation_stats.json"
with open(evaluation_stats_path, "w", encoding="utf-8") as f:
json.dump(self.evaluation_stats, f, indent=2)
logging.info(f"NER Model saved to {model_dir}")
return self.model_path
def load(self, model_path: str) -> None:
"""Load a trained model"""
if not os.path.exists(model_path):
raise FileNotFoundError(f"Model not found at {model_path}")
logging.info(f"Loading model from {model_path}")
self.nlp = spacy.load(model_path)
self.ner = self.nlp.get_pipe("ner")
self.model_path = model_path
# Load training statistics if available
training_stats_path = Path(model_path) / "training_stats.json"
if training_stats_path.exists():
with open(training_stats_path, "r", encoding="utf-8") as f:
self.training_stats = json.load(f)
evaluation_stats_path = Path(model_path) / "evaluation_stats.json"
if evaluation_stats_path.exists():
with open(evaluation_stats_path, "r", encoding="utf-8") as f:
self.evaluation_stats = json.load(f)
logging.info("NER Model loaded successfully")
def predict(self, text: str) -> Dict[str, Any]:
"""Make predictions on a single text"""
if self.nlp is None:
raise ValueError("No model loaded. Load or train a model first.")
doc = self.nlp(text)
entities = []
for ent in doc.ents:
entities.append(
{
"text": ent.text,
"label": ent.label_,
"start": ent.start_char,
"end": ent.end_char,
"confidence": getattr(
ent, "score", None
), # If confidence scores are available
}
)
return {"text": text, "entities": entities}
@@ -0,0 +1,290 @@
from typing import Union, Dict, Any, List
import ast
import json
import logging
import pandas as pd
from spacy.util import filter_spans
class NameTagger:
def tag_name(
self, name: str, probable_native: str, probable_surname: str
) -> Union[Dict[str, Any], None]:
"""Create a single NER training example using probable_native and probable_surname"""
if not name or not probable_native or not probable_surname:
return None
name = name.strip()
probable_native = probable_native.strip()
probable_surname = probable_surname.strip()
entities = []
used_spans = [] # Track used character spans to prevent overlaps
# Helper function to check if a span overlaps with any existing span
def has_overlap(start, end):
for used_start, used_end in used_spans:
if not (end <= used_start or start >= used_end):
return True
return False
# Find positions of native names in the full name
native_words = probable_native.split()
name_lower = name.lower() # Use lowercase for consistent searching
processed_native_words = set()
for native_word in native_words:
native_word = native_word.strip()
if len(native_word) < 2: # Skip very short words
continue
native_word_lower = native_word.lower()
# Skip if we've already processed this exact word
if native_word_lower in processed_native_words:
continue
processed_native_words.add(native_word_lower)
# Find the first occurrence of this native word that doesn't overlap
start_pos = 0
while True:
pos = name_lower.find(
native_word_lower, start_pos
) # Case-insensitive search
if pos == -1:
break
# Calculate end position - make sure we only include the word itself
end_pos = pos + len(native_word_lower)
# Double-check that the extracted span matches exactly what we expect
extracted_text = name[pos:end_pos] # Get original case text
if extracted_text.lower() != native_word_lower:
start_pos = pos + 1
continue
# Check if this is a word boundary match and doesn't overlap
if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
pos, end_pos
):
entities.append((pos, end_pos, "NATIVE"))
used_spans.append((pos, end_pos))
break # Only take the first non-overlapping occurrence
start_pos = pos + 1
# Find position of surname in the full name
if probable_surname and len(probable_surname.strip()) >= 2:
surname_lower = probable_surname.lower()
# Find the first occurrence that doesn't overlap
start_pos = 0
while True:
pos = name_lower.find(
surname_lower, start_pos
) # Case-insensitive search
if pos == -1:
break
# Calculate end position correctly - exact match only
end_pos = pos + len(surname_lower)
# Double-check that the extracted span matches exactly what we expect
extracted_text = name[pos:end_pos] # Get original case text
if extracted_text.lower() != surname_lower:
start_pos = pos + 1
continue
if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
pos, end_pos
):
entities.append((pos, end_pos, "SURNAME"))
used_spans.append((pos, end_pos))
break
start_pos = pos + 1
if not entities:
logging.warning(
f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
)
return None
# Sort entities by position and validate
entities.sort(key=lambda x: x[0])
# Final validation - ensure no overlaps and valid spans
validated_entities = []
for start, end, label in entities:
# Check bounds
if not (0 <= start < end <= len(name)):
logging.warning(
f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
)
continue
# Check for overlaps with already validated entities
if any(
start < v_end and end > v_start
for v_start, v_end, _ in validated_entities
):
logging.warning(
f"Overlapping span ({start}, {end}, '{label}') in '{name}'"
)
continue
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
span_text = name[start:end]
if not span_text or span_text != span_text.strip() or " " in span_text:
logging.warning(
f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
)
continue
validated_entities.append((start, end, label))
if not validated_entities:
logging.warning(f"No valid entities after validation for: '{name}'")
return None
# Convert to string format that matches the dataset
entities_str = str(validated_entities)
return {
"entities": entities_str,
"spans": validated_entities, # Keep the original tuples for internal use
}
@classmethod
def _is_word_boundary_match(cls, text: str, start: int, end: int) -> bool:
"""Check if the match is at word boundaries"""
# Check character before start position
if start > 0:
prev_char = text[start - 1]
if prev_char.isalnum():
return False
# Check character after end position
if end < len(text):
next_char = text[end]
if next_char.isalnum():
return False
return True
@classmethod
def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
"""Extract the actual text for each entity type"""
result = {"NATIVE": [], "SURNAME": []}
try:
entities = ast.literal_eval(entities_str)
for start, end, label in entities:
if 0 <= start < end <= len(name):
span_text = name[start:end]
if label in result:
result[label].append(span_text)
except (ValueError, SyntaxError, TypeError):
pass
return result
@classmethod
def parse(cls, entities_str: str) -> List[tuple]:
"""Parse entity strings from various formats.
Supports formats:
- [(start, end, label), ...]
- [[start, end, label], ...]
- [{"start": start, "end": end, "label": label}, ...]
"""
if not entities_str or entities_str in ["[]", "", "nan"]:
return []
entities_str = str(entities_str).strip()
try:
if entities_str.startswith("[(") and entities_str.endswith(")]"):
return ast.literal_eval(entities_str)
elif entities_str.startswith("[[") and entities_str.endswith("]]"):
return [tuple(e) for e in ast.literal_eval(entities_str)]
elif entities_str.startswith("[{") and entities_str.endswith("}]"):
return [
(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)
]
else:
parsed = ast.literal_eval(entities_str)
return [
tuple(e)
for e in parsed
if isinstance(e, (list, tuple)) and len(e) == 3
]
except (ValueError, SyntaxError, json.JSONDecodeError):
return []
def parse_entities(self, series: pd.Series) -> pd.Series:
"""Vectorized parse of entity strings."""
return series.map(self.parse)
@classmethod
def validate(cls, text: str, entities: List[tuple]) -> List[tuple]:
"""Advanced entity validation with overlap removal.
This is more comprehensive than the basic validate_entities method.
"""
if not entities or not text:
return []
text = str(text).strip()
valid = []
for ent in entities:
if not isinstance(ent, (list, tuple)) or len(ent) != 3:
continue
start, end, label = ent
try:
start, end = int(start), int(end)
except (ValueError, TypeError):
continue
if not isinstance(label, str):
continue
if not (0 <= start < end <= len(text)):
continue
if not text[start:end].strip():
continue
valid.append((start, end, label))
if not valid:
return []
valid.sort(key=lambda x: (x[0], x[1]))
# Remove overlaps
filtered, last_end = [], -1
        for start, end, label in valid:
            if start >= last_end:
                filtered.append((start, end, label))
                last_end = end
return filtered
def validate_entities(
self, texts: pd.Series, entities_series: pd.Series
) -> pd.Series:
"""Vectorized entity validation."""
return pd.Series(map(self.validate, texts, entities_series), index=texts.index)
@classmethod
def create_docs(cls, nlp, texts: List[str], entities: List[List[tuple]]) -> List:
"""Batch create spaCy Docs from texts and entities."""
docs = []
for text, ents in zip(texts, entities):
doc = nlp(text)
spans = []
for start, end, label in ents:
span = doc.char_span(
start, end, label=label, alignment_mode="contract"
) or doc.char_span(start, end, label=label, alignment_mode="strict")
if span:
spans.append(span)
doc.ents = filter_spans(spans)
docs.append(doc)
return docs
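# Illustrative sketch of the span output produced by tag_name(); the name is
# made up and the entity offsets refer to it.
if __name__ == "__main__":
    tagger = NameTagger()
    tagged = tagger.tag_name("kabila mwamba joseph", "kabila mwamba", "joseph")
    if tagged:
        print(tagged["spans"])  # [(0, 6, 'NATIVE'), (7, 13, 'NATIVE'), (14, 20, 'SURNAME')]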
@@ -0,0 +1,57 @@
import logging
import time
from typing import Dict, Any
import pandas as pd
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.batch.batch_processor import BatchProcessor
from ners.processing.steps import PipelineStep
class Pipeline:
"""Main pipeline orchestrator"""
def __init__(self, config: BatchConfig):
self.config = config
self.processor = BatchProcessor(config)
self.steps = []
def add_step(self, step: PipelineStep):
"""Add a processing step to the pipeline"""
self.steps.append(step)
def run(self, input_data: pd.DataFrame) -> pd.DataFrame:
"""Run the complete pipeline"""
current_data = input_data.copy()
for step in self.steps:
logging.info(f"Running pipeline step: {step.name}")
start_time = time.time()
current_data = self.processor.process(step, current_data)
elapsed_time = time.time() - start_time
logging.info(f"Completed {step.name} in {elapsed_time:.2f} seconds")
if step.state.failed_batches:
logging.warning(
f"Step {step.name} had {len(step.state.failed_batches)} failed batches"
)
return current_data
def get_progress(self) -> Dict[str, Any]:
"""Get progress information for all steps"""
progress = {}
for step in self.steps:
progress[step.name] = {
"processed_batches": step.state.processed_batches,
"total_batches": step.state.total_batches,
"failed_batches": len(step.state.failed_batches),
"completion_percentage": (
step.state.processed_batches / max(1, step.state.total_batches)
)
* 100,
}
return progress
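# Illustrative wiring sketch; the step class is added elsewhere in this
# commit and its import path is assumed, as is the setup_config call.
if __name__ == "__main__":
    from ners.core.config import setup_config  # assumed import, mirrors the CLI
    from ners.processing.steps.data_cleaning_step import DataCleaningStep  # assumed path

    cfg = setup_config(config_path=None, env="development")
    pipeline = Pipeline(
        BatchConfig(batch_size=1_000, max_workers=1, checkpoint_interval=10, use_multiprocessing=False)
    )
    pipeline.add_step(DataCleaningStep(cfg))
    result = pipeline.run(pd.DataFrame({"name": ["Kabila Mwamba Joseph"], "sex": ["m"], "region": ["kinshasa"]}))
    print(pipeline.get_progress())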
@@ -0,0 +1,129 @@
import json
import logging
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional
import pandas as pd
from pydantic import BaseModel
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.batch.batch_config import BatchConfig
@dataclass
class PipelineState:
"""Tracks the state of pipeline execution"""
processed_batches: int = 0
total_batches: int = 0
    failed_batches: Optional[List[int]] = None
last_checkpoint: Optional[str] = None
def __post_init__(self):
if self.failed_batches is None:
self.failed_batches = []
class NameAnnotation(BaseModel):
"""Model for name annotation results"""
identified_name: Optional[str]
identified_surname: Optional[str]
class PipelineStep(ABC):
"""Abstract base class for pipeline steps"""
def __init__(
self,
name: str,
pipeline_config: PipelineConfig,
batch_config: Optional[BatchConfig] = None,
):
self.name = name
self.pipeline_config = pipeline_config
self.data_loader = DataLoader(pipeline_config)
# Use provided batch_config or create default from pipeline config
if batch_config is None:
batch_config = BatchConfig(
batch_size=pipeline_config.processing.batch_size,
max_workers=pipeline_config.processing.max_workers,
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
)
self.batch_config = batch_config
self.state = PipelineState()
@property
def requires_batch_mutation(self) -> bool:
"""Indicates if this step modifies the batch data"""
return False
@abstractmethod
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process a single batch of data"""
pass
def get_checkpoint_path(self, batch_id: int) -> str:
"""Get the checkpoint file path for a batch"""
checkpoint_dir = self.pipeline_config.paths.checkpoints_dir / self.name
checkpoint_dir.mkdir(parents=True, exist_ok=True)
return str(checkpoint_dir / f"batch_{batch_id:06d}.csv")
def get_state_path(self) -> str:
"""Get the state file path"""
state_dir = self.pipeline_config.paths.checkpoints_dir / self.name
state_dir.mkdir(parents=True, exist_ok=True)
return str(state_dir / "pipeline_state.json")
def save_state(self):
"""Save pipeline state to disk"""
state_file = self.get_state_path()
with open(state_file, "w") as f:
json.dump(
{
"processed_batches": self.state.processed_batches,
"total_batches": self.state.total_batches,
"failed_batches": self.state.failed_batches,
"last_checkpoint": self.state.last_checkpoint,
},
f,
)
def load_state(self) -> bool:
"""Load pipeline state from disk. Returns True if state was loaded."""
state_file = self.get_state_path()
if os.path.exists(state_file):
try:
with open(state_file, "r") as f:
state_data = json.load(f)
self.state.processed_batches = state_data.get("processed_batches", 0)
self.state.total_batches = state_data.get("total_batches", 0)
self.state.failed_batches = state_data.get("failed_batches", [])
self.state.last_checkpoint = state_data.get("last_checkpoint")
return True
except Exception as e:
logging.warning(f"Failed to load state: {e}")
return False
def batch_exists(self, batch_id: int) -> bool:
"""Check if a batch has already been processed (idempotency)"""
checkpoint_path = self.get_checkpoint_path(batch_id)
return os.path.exists(checkpoint_path)
def save_batch(self, batch: pd.DataFrame, batch_id: int):
"""Save processed batch to checkpoint"""
checkpoint_path = self.get_checkpoint_path(batch_id)
self.data_loader.save_csv(batch, checkpoint_path)
logging.info(f"Saved batch {batch_id} to {checkpoint_path}")
def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
"""Load processed batch from checkpoint"""
checkpoint_path = self.get_checkpoint_path(batch_id)
if os.path.exists(checkpoint_path):
return self.data_loader.load_csv_complete(checkpoint_path)
return None
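# Minimal concrete subclass sketch showing what the ABC above requires of a
# step; the lowercasing behaviour is illustrative, not part of this commit.
class LowercaseNamesStep(PipelineStep):
    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("lowercase_names", pipeline_config)

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        batch = batch.copy()
        batch["name"] = batch["name"].str.lower()
        return batch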
@@ -0,0 +1,31 @@
import logging
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.text_cleaner import TextCleaner
from ners.processing.steps import PipelineStep
class DataCleaningStep(PipelineStep):
"""Configuration-driven data cleaning step"""
def __init__(self, pipeline_config: PipelineConfig):
super().__init__("data_cleaning", pipeline_config)
self.text_cleaner = TextCleaner()
self.required_columns = ["name", "sex", "region"]
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process a single batch for data cleaning"""
logging.info(f"Cleaning batch {batch_id} with {len(batch)} rows")
# Drop rows with essential missing values
batch = batch.dropna(subset=self.required_columns)
# Apply text cleaning
batch = self.text_cleaner.clean_dataframe_text_columns(batch)
# Remove duplicates
batch = batch.drop_duplicates(subset=self.required_columns)
return batch
@@ -0,0 +1,60 @@
import logging
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
from ners.processing.steps import PipelineStep
class DataSelectionStep(PipelineStep):
"""Configuration-driven data selection step to keep only specified columns"""
def __init__(self, pipeline_config: PipelineConfig):
super().__init__("data_selection", pipeline_config)
self.selected_columns = pipeline_config.data.selected_columns
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process a single batch for data selection"""
logging.info(f"Selecting columns for batch {batch_id} with {len(batch)} rows")
# Remove rows where region == "global" only for specific years
if "region" in batch.columns and "year" in batch.columns:
target_years = {2015, 2021, 2022}
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(
target_years
)
removed = int(mask_remove.sum())
if removed:
batch = batch[~mask_remove]
logging.info(
f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}"
)
# Check which columns exist in the batch
available_columns = [
col for col in self.selected_columns if col in batch.columns
]
missing_columns = [
col for col in self.selected_columns if col not in batch.columns
]
if missing_columns:
logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}")
if not available_columns:
logging.error(f"No required columns found in batch {batch_id}")
return pd.DataFrame() # Return empty DataFrame if no required columns exist
# Select only the available required columns
selected_batch = batch[available_columns].copy()
logging.info(
f"Selected {len(available_columns)} columns for batch {batch_id}: {available_columns}"
)
return selected_batch
@property
def requires_batch_mutation(self) -> bool:
"""This step modifies the batch data by selecting columns"""
return True
@@ -0,0 +1,69 @@
import numpy as np
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.region_mapper import RegionMapper
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.steps import PipelineStep
from ners.processing.steps.feature_extraction_step import Gender
class DataSplittingStep(PipelineStep):
"""Configuration-driven data splitting step"""
def __init__(self, pipeline_config: PipelineConfig):
batch_config = BatchConfig(
batch_size=pipeline_config.processing.batch_size,
max_workers=1, # No need for parallelism in splitting
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
use_multiprocessing=False,
)
super().__init__("data_splitting", pipeline_config, batch_config)
self.eval_indices = None
def determine_eval_indices(self, total_size: int) -> set:
"""Determine evaluation indices consistently across batches"""
if self.eval_indices is None:
np.random.seed(self.pipeline_config.data.random_seed)
eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
self.eval_indices = set(
np.random.choice(total_size, size=eval_size, replace=False)
)
return self.eval_indices
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process batch for data splitting - no modification needed"""
return batch
def split(self, df: pd.DataFrame) -> None:
"""Save the split datasets based on configuration"""
output_files = self.pipeline_config.data.output_files
data_dir = self.pipeline_config.paths.data_dir
if self.pipeline_config.data.split_evaluation:
eval_indices = self.determine_eval_indices(len(df))
eval_mask = df.index.isin(eval_indices)
df_evaluation = df[eval_mask]
df_featured = df[~eval_mask]
self.data_loader.save_csv(
df_evaluation, data_dir / output_files["evaluation"]
)
self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
else:
self.data_loader.save_csv(df, data_dir / output_files["featured"])
if self.pipeline_config.data.split_by_province:
for province in RegionMapper.get_provinces():
df_region = df[df.province == province]
self.data_loader.save_csv(
df_region, data_dir / "provinces" / f"{province}.csv"
)
if self.pipeline_config.data.split_by_gender:
df_males = df[df.sex == Gender.MALE.value]
df_females = df[df.sex == Gender.FEMALE.value]
self.data_loader.save_csv(df_males, data_dir / output_files["males"])
self.data_loader.save_csv(df_females, data_dir / output_files["females"])
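# Illustrative check of the deterministic evaluation split: indices are drawn
# once with the configured seed and cached, so repeated calls agree. The
# setup_config call mirrors the CLI and is an assumption here.
if __name__ == "__main__":
    from ners.core.config import setup_config  # assumed import

    step = DataSplittingStep(setup_config(config_path=None, env="development"))
    first = step.determine_eval_indices(1_000)
    assert first == step.determine_eval_indices(1_000)  # cached after the first call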
@@ -0,0 +1,196 @@
import gc
import logging
from enum import Enum
from typing import Dict, Any
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.region_mapper import RegionMapper
from ners.processing.ner.name_tagger import NameTagger
from ners.processing.steps import PipelineStep
class Gender(Enum):
MALE = "m"
FEMALE = "f"
class NameCategory(Enum):
SIMPLE = "simple"
COMPOSE = "compose"
class FeatureExtractionStep(PipelineStep):
"""Configuration-driven feature extraction step"""
def __init__(self, pipeline_config: PipelineConfig):
super().__init__("feature_extraction", pipeline_config)
self.region_mapper = RegionMapper()
self.name_tagger = NameTagger()
    @property
    def requires_batch_mutation(self) -> bool:
        """This step creates new columns, so mutation is required"""
        return True
@classmethod
def validate_gender(cls, gender: str) -> Gender:
"""Validate and normalize gender value"""
gender_lower = str(gender).lower().strip()
if gender_lower in ["m", "male", "homme", "masculin"]:
return Gender.MALE
elif gender_lower in ["f", "female", "femme", "féminin"]:
return Gender.FEMALE
else:
raise ValueError(f"Unknown gender: {gender}")
@classmethod
def get_name_category(cls, word_count: int) -> NameCategory:
"""Determine name category based on word count"""
return NameCategory.SIMPLE if word_count == 3 else NameCategory.COMPOSE
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Extract features from names in batch"""
logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")
result = batch.copy()
numeric_features = self._compute_numeric_features(result["name"])
result = result.assign(**numeric_features)
# Initialize features columns with optimal dtypes
features_columns = self._initialize_features_columns(len(result))
result = result.assign(**features_columns)
self._assign_probable_names(result)
self._process_simple_names(result)
result["identified_category"] = self._assign_identified_category(
result["words"]
)
if "year" in result.columns:
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype(
"Int16"
)
if "region" in result.columns:
result["province"] = self.region_mapper.map(result["region"]).str.lower()
result["province"] = result["province"].astype("category")
if "sex" in result.columns:
result["sex"] = self._normalize_gender(result["sex"])
# Apply final dtype optimizations
result = self._optimize_dtypes(result)
# Cleanup
del numeric_features, features_columns
if batch_id % 10 == 0: # Periodic cleanup
gc.collect()
return result
@classmethod
def _compute_numeric_features(cls, series: pd.Series) -> Dict[str, pd.Series]:
"""Calculate basic features in vectorized manner"""
return {
"words": (series.str.count(" ") + 1).astype("Int8"),
"length": series.str.len().astype("Int16"),
}
@classmethod
def _initialize_features_columns(cls, size: int) -> Dict[str, Any]:
"""Initialize new columns with optimal dtypes"""
return {
"probable_native": pd.Series([None] * size, dtype="string"),
"probable_surname": pd.Series([None] * size, dtype="string"),
"identified_name": pd.Series([None] * size, dtype="string"),
"identified_surname": pd.Series([None] * size, dtype="string"),
"ner_entities": pd.Series([None] * size, dtype="string"),
"ner_tagged": pd.Series([0] * size, dtype="Int8"),
"annotated": pd.Series([0] * size, dtype="Int8"),
}
@classmethod
def _assign_probable_names(cls, df: pd.DataFrame) -> None:
"""Assign probable native and surname names efficiently"""
name_splits = df["name"].str.split()
mask = name_splits.str.len() >= 2
df.loc[mask, "probable_native"] = name_splits[mask].apply(
lambda x: " ".join(x[:-1]) if isinstance(x, list) else None
)
df.loc[mask, "probable_surname"] = name_splits[mask].apply(
lambda x: x[-1] if isinstance(x, list) else None
)
def _assign_identified_category(self, series: pd.Series) -> pd.Series:
"""Assign identified category based on word count"""
return series.map(lambda x: self.get_name_category(x).value).astype("category")
def _process_simple_names(self, df: pd.DataFrame) -> None:
"""Process 3-word names efficiently with vectorized operations"""
mask = pd.Series(df["words"] == 3)
if not mask.any():
return
df.loc[mask, "identified_name"] = df.loc[mask, "probable_native"]
df.loc[mask, "identified_surname"] = df.loc[mask, "probable_surname"]
df.loc[mask, "annotated"] = 1
# NER tagging for 3-word names
three_word_rows = df[mask]
for idx, row in three_word_rows.iterrows():
try:
entity = self.name_tagger.tag_name(
row["name"], row["identified_name"], row["identified_surname"]
)
if entity:
df.at[idx, "ner_entities"] = str(entity["entities"])
df.at[idx, "ner_tagged"] = 1
except Exception as e:
logging.warning(f"NER tagging failed for row {idx}: {e}")
@classmethod
def _normalize_gender(cls, series: pd.Series) -> pd.Series:
gender_mapping = {
"m": "m",
"male": "m",
"homme": "m",
"masculin": "m",
"f": "f",
"female": "f",
"femme": "f",
"féminin": "f",
}
# Apply mapping with error handling
normalized = series.astype(str).str.lower().str.strip().map(gender_mapping)
return normalized.astype("category")
@classmethod
def _optimize_dtypes(cls, df: pd.DataFrame) -> pd.DataFrame:
categories = ["province", "identified_category", "sex"]
for col in categories:
if col in df.columns and df[col].dtype != "category":
df[col] = df[col].astype("category")
# Ensure string columns are proper string dtype
string_cols = [
"name",
"probable_native",
"probable_surname",
"identified_name",
"identified_surname",
"ner_entities",
]
for col in string_cols:
if col in df.columns and df[col].dtype == "object":
df[col] = df[col].astype("string")
return df
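# Illustrative sketch of the probable-name convention used by
# _assign_probable_names(): every token but the last is treated as native,
# the last token as the surname. The sample name is made up.
if __name__ == "__main__":
    parts = "kabila mwamba joseph".split()
    print(" ".join(parts[:-1]), "|", parts[-1])  # kabila mwamba | joseph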
@@ -0,0 +1,169 @@
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict
import ollama
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.prompt_manager import PromptManager
from ners.core.utils.rate_limiter import RateLimitConfig, RateLimiter
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.steps import PipelineStep, NameAnnotation
class LLMAnnotationStep(PipelineStep):
"""Configuration-driven LLM annotation step"""
def __init__(self, pipeline_config: PipelineConfig):
# Create custom batch config for LLM processing
self.llm_config = pipeline_config.annotation.llm
batch_config = BatchConfig(
batch_size=pipeline_config.processing.batch_size,
max_workers=min(
self.llm_config.max_concurrent_requests,
pipeline_config.processing.max_workers,
),
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
)
super().__init__("llm_annotation", pipeline_config, batch_config)
self.prompt = PromptManager(pipeline_config).load_prompt()
self.rate_limiter = (
self._create_rate_limiter()
if self.llm_config.enable_rate_limiting
else None
)
# Statistics
self.successful_requests = 0
self.failed_requests = 0
self.total_retry_attempts = 0
# Setup logging
logging.getLogger("httpx").setLevel(logging.WARNING)
def _create_rate_limiter(self):
"""Create rate limiter based on configuration"""
rate_config = RateLimitConfig(
requests_per_minute=self.llm_config.requests_per_minute,
requests_per_second=self.llm_config.requests_per_second,
)
return RateLimiter(rate_config)
def analyze_name(self, client: ollama.Client, name: str) -> Dict:
"""Analyze a name with retry logic and rate limiting"""
for attempt in range(self.llm_config.retry_attempts):
try:
# Apply rate limiting if enabled
if self.rate_limiter:
self.rate_limiter.wait_if_needed()
start_time = time.time()
response = client.chat(
model=self.llm_config.model_name,
messages=[
{"role": "system", "content": self.prompt},
{"role": "user", "content": name},
],
format=NameAnnotation.model_json_schema(),
)
elapsed_time = time.time() - start_time
if elapsed_time > self.llm_config.timeout_seconds:
raise TimeoutError(
f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout"
)
annotation = NameAnnotation.model_validate_json(
response.message.content
)
result = {
**annotation.model_dump(),
"annotated": 1,
"processing_time": elapsed_time,
"attempts": attempt + 1,
}
self.successful_requests += 1
if attempt > 0:
self.total_retry_attempts += attempt
return result
            except Exception as e:  # covers pydantic ValidationError, TimeoutError, etc.
logging.warning(
f"Error analyzing '{name}' (attempt {attempt + 1}/{self.llm_config.retry_attempts}): {e}"
)
# Exponential backoff with jitter
if attempt < self.llm_config.retry_attempts - 1:
wait_time = (2**attempt) + (time.time() % 1)
time.sleep(min(wait_time, 10))
self.failed_requests += 1
return {
"identified_name": None,
"identified_surname": None,
"annotated": 0,
"processing_time": 0,
"attempts": self.llm_config.retry_attempts,
"failed": True,
}
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process batch with LLM annotation"""
        unannotated_mask = batch.get("annotated", pd.Series(0, index=batch.index)) == 0
unannotated_entries = batch[unannotated_mask]
if unannotated_entries.empty:
logging.info(f"Batch {batch_id}: No entries to annotate")
return batch
logging.info(
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM"
)
batch = batch.copy()
client = ollama.Client()
# Process with controlled concurrency
max_workers = self.llm_config.max_concurrent_requests
if len(unannotated_entries) == 1 or max_workers == 1:
# Sequential processing
for idx, row in unannotated_entries.iterrows():
result = self.analyze_name(client, row["name"])
for field, value in result.items():
if field not in ["failed"]:
batch.loc[idx, field] = value
else:
# Concurrent processing
with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_idx = {}
for idx, row in unannotated_entries.iterrows():
future = executor.submit(self.analyze_name, client, row["name"])
future_to_idx[future] = idx
for future in as_completed(future_to_idx):
idx = future_to_idx[future]
try:
result = future.result()
for field, value in result.items():
if field not in ["failed"]:
batch.loc[idx, field] = value
except Exception as e:
logging.error(f"Failed to process row {idx}: {e}")
batch.loc[idx, "annotated"] = 0
# Ensure proper data types
batch["annotated"] = (
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
)
return batch
@@ -0,0 +1,172 @@
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
from ners.processing.ner.name_model import NameModel
from ners.processing.steps import PipelineStep, NameAnnotation
class NERAnnotationStep(PipelineStep):
"""NER annotation step using trained spaCy model for entity recognition"""
def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("ner_annotation", pipeline_config)
self.model_name = "drc_ner_model"
self.model_path = pipeline_config.paths.models_dir / "drc_ner_model"
self.name_model = NameModel(pipeline_config)
self.ner_config = pipeline_config.annotation.ner
# Statistics
self.successful_requests = 0
self.failed_requests = 0
self.total_retry_attempts = 0
# Load the model
self._load_ner_model()
def _load_ner_model(self) -> None:
"""Load the trained NER model"""
try:
if self.model_path.exists():
logging.info(f"Loading NER model from {self.model_path}")
self.name_model.load(str(self.model_path))
logging.info("NER model loaded successfully")
else:
logging.warning(f"NER model not found at {self.model_path}")
logging.warning(
"NER annotation will be skipped. Train the model first."
)
self.name_model.nlp = None
except Exception as e:
logging.error(f"Failed to load NER model: {e}")
self.name_model.nlp = None
def analyze_name(self, name: str) -> Dict:
"""Analyze a name with retry logic"""
if self.name_model.nlp is None:
return {
"identified_name": None,
"identified_surname": None,
"annotated": 0,
"processing_time": 0,
"attempts": 0,
"failed": True,
}
for attempt in range(self.ner_config.retry_attempts):
try:
start_time = time.time()
# Get NER predictions
prediction = self.name_model.predict(name.lower())
entities = prediction.get("entities", [])
elapsed_time = time.time() - start_time
# Extract native names and surnames from entities
native_parts = []
surname_parts = []
for entity in entities:
if entity["label"] == "NATIVE":
native_parts.append(entity["text"])
elif entity["label"] == "SURNAME":
surname_parts.append(entity["text"])
# Create annotation result in same format as LLM step
annotation = NameAnnotation(
identified_name=" ".join(native_parts) if native_parts else None,
identified_surname=" ".join(surname_parts)
if surname_parts
else None,
)
result = {
**annotation.model_dump(),
"annotated": 1,
"processing_time": elapsed_time,
"attempts": attempt + 1,
}
self.successful_requests += 1
if attempt > 0:
self.total_retry_attempts += attempt
return result
except Exception as e:
logging.warning(
f"Error analyzing '{name}' with NER (attempt {attempt + 1}/{self.ner_config.retry_attempts}): {e}"
)
# Small delay between retries
if attempt < self.ner_config.retry_attempts - 1:
time.sleep(0.1)
self.failed_requests += 1
return {
"identified_name": None,
"identified_surname": None,
"annotated": 0,
"processing_time": 0,
"attempts": self.ner_config.retry_attempts,
"failed": True,
}
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process batch with NER annotation using same logic as LLM step"""
        unannotated_mask = batch.get("annotated", pd.Series(0, index=batch.index)) == 0
unannotated_entries = batch[unannotated_mask]
if unannotated_entries.empty:
logging.info(f"Batch {batch_id}: No entries to annotate")
return batch
logging.info(
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER"
)
batch = batch.copy()
# Process with controlled concurrency
max_workers = self.batch_config.max_workers
if len(unannotated_entries) == 1 or max_workers == 1:
# Sequential processing
for idx, row in unannotated_entries.iterrows():
result = self.analyze_name(row["name"])
for field, value in result.items():
if field not in ["failed"]:
batch.loc[idx, field] = value
else:
# Concurrent processing
with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_idx = {}
for idx, row in unannotated_entries.iterrows():
future = executor.submit(self.analyze_name, row["name"])
future_to_idx[future] = idx
for future in as_completed(future_to_idx):
idx = future_to_idx[future]
try:
result = future.result()
for field, value in result.items():
if field not in ["failed"]:
batch.loc[idx, field] = value
except Exception as e:
logging.error(f"Failed to process row {idx}: {e}")
batch.loc[idx, "annotated"] = 0
# Ensure proper data types
batch["annotated"] = (
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
)
return batch
@@ -0,0 +1,261 @@
import logging
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, List
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ners.research.experiment import ExperimentConfig
class BaseModel(ABC):
"""Abstract base class for all models"""
def __init__(self, config: ExperimentConfig):
self.config = config
self.model = None
self.feature_extractor = None
self.label_encoder = None
self.tokenizer = None # For neural models
self.is_fitted = False
self.training_history = {} # Store training history for learning curves
self.learning_curve_data = {} # Store learning curve experiment data
@property
@abstractmethod
def architecture(self) -> str:
"""Return the architecture type: 'neural_network', 'traditional', or 'ensemble'"""
pass
@abstractmethod
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
"""Prepare features for training/prediction"""
pass
@abstractmethod
def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
"""Fit the model - implemented differently for each architecture"""
pass
@abstractmethod
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> Dict[str, float]:
"""Perform cross-validation and return average scores"""
pass
@abstractmethod
def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
pass
def predict(self, X: pd.DataFrame) -> np.ndarray:
"""Make predictions"""
if not self.is_fitted:
raise ValueError("Model must be fitted before making predictions")
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
predictions = self.model.predict(X_prepared)
# Handle different prediction formats
if hasattr(predictions, "shape") and len(predictions.shape) > 1:
# Neural network outputs (probabilities)
predictions = predictions.argmax(axis=1)
return self.label_encoder.inverse_transform(predictions)
def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
"""Get prediction probabilities if supported"""
if not self.is_fitted:
raise ValueError("Model must be fitted before making predictions")
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
if hasattr(self.model, "predict_proba"):
return self.model.predict_proba(X_prepared)
elif hasattr(self.model, "predict"):
# For neural networks that return probabilities directly
probabilities = self.model.predict(X_prepared)
if len(probabilities.shape) == 2 and probabilities.shape[1] > 1:
return probabilities
raise NotImplementedError("Model does not support probability predictions")
def get_feature_importance(self) -> Optional[Dict[str, float]]:
"""Get feature importance if supported by the model"""
if hasattr(self.model, "feature_importances_"):
# For tree-based models
importances = self.model.feature_importances_
feature_names = self._get_feature_names()
return dict(zip(feature_names, importances))
elif hasattr(self.model, "coef_"):
# For linear models
coefficients = np.abs(self.model.coef_[0])
feature_names = self._get_feature_names()
return dict(zip(feature_names, coefficients))
elif (
hasattr(self.model, "named_steps")
and "classifier" in self.model.named_steps
):
# For sklearn pipelines (like LogisticRegression with vectorizer)
classifier = self.model.named_steps["classifier"]
if hasattr(classifier, "coef_"):
coefficients = np.abs(classifier.coef_[0])
if hasattr(
self.model.named_steps["vectorizer"], "get_feature_names_out"
):
feature_names = self.model.named_steps[
"vectorizer"
].get_feature_names_out()
# Take top features to avoid too many n-grams
top_indices = np.argsort(coefficients)[-20:]
return dict(
zip(feature_names[top_indices], coefficients[top_indices])
)
return None
def _get_feature_names(self) -> List[str]:
"""Get feature names (override in subclasses if needed)"""
if hasattr(self.model, "feature_names_in_"):
return list(self.model.feature_names_in_)
return [f"feature_{i}" for i in range(100)] # Default fallback
def save(self, path: str):
"""Save the complete model with training history"""
model_data = {
"model": self.model,
"feature_extractor": self.feature_extractor,
"label_encoder": self.label_encoder,
"tokenizer": self.tokenizer,
"config": self.config.to_dict(),
"is_fitted": self.is_fitted,
"training_history": self.training_history,
"learning_curve_data": self.learning_curve_data,
}
joblib.dump(model_data, path)
@classmethod
def load(cls, path: str) -> "BaseModel":
"""Load a saved model with training history"""
model_data = joblib.load(path)
# Recreate the model instance
from ners.research.experiment import ExperimentConfig
config = ExperimentConfig.from_dict(model_data["config"])
instance = cls(config)
# Restore state
instance.model = model_data["model"]
instance.feature_extractor = model_data["feature_extractor"]
instance.label_encoder = model_data["label_encoder"]
instance.tokenizer = model_data.get("tokenizer")
instance.is_fitted = model_data["is_fitted"]
instance.training_history = model_data.get("training_history", {})
instance.learning_curve_data = model_data.get("learning_curve_data", {})
return instance
def plot_learning_curve(self, save_path: Optional[str] = None) -> str:
"""Plot and save learning curve"""
if not self.learning_curve_data:
logging.warning("No learning curve data available")
return ""
plt.figure(figsize=(10, 6))
data = self.learning_curve_data
train_sizes = data["train_sizes"]
train_scores = data["train_scores"]
val_scores = data["val_scores"]
train_std = data.get("train_scores_std", [0] * len(train_sizes))
val_std = data.get("val_scores_std", [0] * len(train_sizes))
# Plot learning curves
plt.plot(train_sizes, train_scores, "o-", color="blue", label="Training Score")
plt.fill_between(
train_sizes,
np.array(train_scores) - np.array(train_std),
np.array(train_scores) + np.array(train_std),
alpha=0.1,
color="blue",
)
plt.plot(train_sizes, val_scores, "o-", color="red", label="Validation Score")
plt.fill_between(
train_sizes,
np.array(val_scores) - np.array(val_std),
np.array(val_scores) + np.array(val_std),
alpha=0.1,
color="red",
)
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.title(f"Learning Curve - {self.__class__.__name__}")
plt.legend(loc="best")
plt.grid(True, alpha=0.3)
plt.tight_layout()
if save_path:
plt.savefig(save_path, dpi=300, bbox_inches="tight")
plt.close()
return save_path
else:
plt.show()
return ""
def plot_training_history(self, save_path: Optional[str] = None) -> str:
"""Plot training history for neural networks"""
if not self.training_history:
logging.warning("No training history available")
return ""
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
# Plot accuracy
if "accuracy" in self.training_history:
axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
if "val_accuracy" in self.training_history:
axes[0].plot(
self.training_history["val_accuracy"], label="Validation Accuracy"
)
axes[0].set_title("Model Accuracy")
axes[0].set_xlabel("Epoch")
axes[0].set_ylabel("Accuracy")
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Plot loss
if "loss" in self.training_history:
axes[1].plot(self.training_history["loss"], label="Training Loss")
if "val_loss" in self.training_history:
axes[1].plot(self.training_history["val_loss"], label="Validation Loss")
axes[1].set_title("Model Loss")
axes[1].set_xlabel("Epoch")
axes[1].set_ylabel("Loss")
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
if save_path:
plt.savefig(save_path, dpi=300, bbox_inches="tight")
plt.close()
return save_path
else:
plt.show()
return ""
+97
View File
@@ -0,0 +1,97 @@
from dataclasses import dataclass, field, asdict
from enum import Enum
from typing import List, Dict, Any, Optional
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from .feature_extractor import FeatureType
@dataclass
class ExperimentConfig:
"""Configuration for a single experiment"""
# Experiment metadata
name: str
description: str = ""
tags: List[str] = field(default_factory=list)
# Model configuration
model_type: str = (
"logistic_regression" # logistic_regression, lstm, transformer, etc.
)
model_params: Dict[str, Any] = field(default_factory=dict)
# Feature configuration
features: List[FeatureType] = field(default_factory=lambda: [FeatureType.FULL_NAME])
feature_params: Dict[str, Any] = field(default_factory=dict)
# Data configuration
train_data_filter: Optional[Dict[str, Any]] = (
None # Filter criteria for training data
)
test_data_filter: Optional[Dict[str, Any]] = None
target_column: str = "sex"
# Training configuration
test_size: float = 0.2
random_seed: int = 42
cross_validation_folds: int = 5
# Evaluation configuration
metrics: List[str] = field(
default_factory=lambda: ["accuracy", "precision", "recall", "f1"]
)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization"""
result = asdict(self)
# Convert enums to strings
result["features"] = [f.value for f in self.features]
return result
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ExperimentConfig":
"""Create from dictionary"""
if "features" in data:
data["features"] = [FeatureType(f) for f in data["features"]]
return cls(**data)
class ExperimentStatus(Enum):
"""Experiment execution status"""
PENDING = "pending"
RUNNING = "running"
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
def calculate_metrics(
    y_true: np.ndarray, y_pred: np.ndarray, metrics: Optional[List[str]] = None
) -> Dict[str, float]:
"""Calculate specified metrics"""
if metrics is None:
metrics = ["accuracy", "precision", "recall", "f1"]
results = {}
if "accuracy" in metrics:
results["accuracy"] = accuracy_score(y_true, y_pred)
if any(m in metrics for m in ["precision", "recall", "f1"]):
precision, recall, f1, _ = precision_recall_fscore_support(
y_true, y_pred, average="weighted"
)
if "precision" in metrics:
results["precision"] = precision
if "recall" in metrics:
results["recall"] = recall
if "f1" in metrics:
results["f1"] = f1
return results
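# Quick sanity check of calculate_metrics on toy labels; the values are
# illustrative, not project data.
import numpy as np

y_true = np.array(["f", "m", "f", "m"])
y_pred = np.array(["f", "m", "m", "m"])
print(calculate_metrics(y_true, y_pred, ["accuracy", "f1"]))
# accuracy == 0.75; weighted f1 is roughly 0.733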
@@ -0,0 +1,58 @@
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional, Dict, List, Any
from ners.research.experiment import ExperimentConfig, ExperimentStatus
@dataclass
class ExperimentResult:
"""Results from an experiment execution"""
experiment_id: str
config: ExperimentConfig
# Execution metadata
start_time: datetime
end_time: Optional[datetime] = None
status: ExperimentStatus = ExperimentStatus.PENDING
error_message: Optional[str] = None
# Model artifacts
model_path: Optional[str] = None
feature_extractor_path: Optional[str] = None
# Metrics
train_metrics: Dict[str, float] = field(default_factory=dict)
test_metrics: Dict[str, float] = field(default_factory=dict)
cv_metrics: Dict[str, float] = field(default_factory=dict)
# Additional results
confusion_matrix: Optional[List[List[int]]] = None
feature_importance: Optional[Dict[str, float]] = None
prediction_examples: Optional[List[Dict]] = None
# Data statistics
train_size: int = 0
test_size: int = 0
class_distribution: Dict[str, int] = field(default_factory=dict)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization"""
result = asdict(self)
result["config"] = self.config.to_dict()
result["start_time"] = self.start_time.isoformat()
result["end_time"] = self.end_time.isoformat() if self.end_time else None
result["status"] = self.status.value
return result
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ExperimentResult":
"""Create from dictionary"""
data["config"] = ExperimentConfig.from_dict(data["config"])
data["start_time"] = datetime.fromisoformat(data["start_time"])
data["end_time"] = (
datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
)
data["status"] = ExperimentStatus(data["status"])
return cls(**data)
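# Serialization round-trip sketch for ExperimentResult; enum and datetime
# fields survive the to_dict/from_dict cycle. All values below are made up.
from datetime import datetime

result = ExperimentResult(
    experiment_id="demo_0001",
    config=ExperimentConfig(name="demo"),
    start_time=datetime.now(),
)
restored = ExperimentResult.from_dict(result.to_dict())
assert restored.status is ExperimentStatus.PENDING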
@@ -0,0 +1,112 @@
import logging
from typing import List, Dict
import yaml
from ners.core.config.pipeline_config import PipelineConfig
from ners.research.experiment import ExperimentConfig
from ners.research.experiment.feature_extractor import FeatureType
class ExperimentBuilder:
"""Helper class to build experiment configurations"""
def __init__(self, config: PipelineConfig):
self.config = config
def load_templates(self, templates: str = "research_templates.yaml") -> dict:
"""Load research templates from YAML file"""
try:
with open(self.config.paths.configs_dir / templates, "r") as file:
return yaml.safe_load(file)
except FileNotFoundError:
logging.error(f"Templates file not found: {templates}")
raise
except yaml.YAMLError as e:
logging.error(f"Error parsing templates file: {e}")
raise
@classmethod
def find_template(
cls, templates: dict, name: str, experiment_type: str = "baseline"
) -> dict:
"""Find experiment configuration by name and type"""
# Map type to section in templates
type_mapping = {
"baseline": "baseline_experiments",
"advanced": "advanced_experiments",
"feature_study": "feature_studies",
"tuning": "hyperparameter_tuning",
}
section_name = type_mapping.get(experiment_type)
if not section_name:
available_types = list(type_mapping.keys())
raise ValueError(
f"Unknown experiment type '{experiment_type}'. Available types: {available_types}"
)
if section_name not in templates:
raise ValueError(f"Section '{section_name}' not found in templates")
experiments = templates[section_name]
# Search for experiment by model name
for experiment in experiments:
# Check if this is the experiment we're looking for
# Look for experiments that match the model type or contain the name
if (
experiment.get("model_type") == name
or name.lower() in experiment.get("name", "").lower()
or experiment.get("name") == name
or f"baseline_{name}" == experiment.get("name")
or f"advanced_{name}" == experiment.get("name")
):
return experiment
# If not found, list available experiments
available_experiments = [
exp.get("name", exp.get("model_type", "unknown")) for exp in experiments
]
raise ValueError(
f"Experiment '{name}' not found in '{experiment_type}' section. "
f"Available experiments: {available_experiments}"
)
def get_templates(
self, templates_path: str = "research_templates.yaml"
) -> Dict[str, List[Dict]]:
"""Get all available experiments from templates organized by type"""
templates = self.load_templates(templates_path)
return {
"baseline": templates.get("baseline_experiments", []),
"advanced": templates.get("advanced_experiments", []),
"feature_study": templates.get("feature_studies", []),
"tuning": templates.get("hyperparameter_tuning", []),
}
@classmethod
def from_template(cls, template_config: dict) -> ExperimentConfig:
"""Create an ExperimentConfig from a template configuration"""
# Convert feature strings to FeatureType objects
features = []
for feature_str in template_config.get("features", []):
try:
features.append(FeatureType(feature_str))
except ValueError:
logging.warning(f"Unknown feature type: {feature_str}")
continue
return ExperimentConfig(
name=template_config.get("name"),
            description=template_config.get("description", ""),
model_type=template_config.get("model_type"),
features=features,
model_params=template_config.get("model_params", {}),
tags=template_config.get("tags", []),
test_size=template_config.get("test_size", 0.2),
cross_validation_folds=template_config.get("cross_validation_folds", 5),
train_data_filter=template_config.get("train_data_filter"),
)
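# Illustrative template dict in the shape from_template expects; the field
# names mirror research_templates.yaml, but the values here are invented.
template = {
    "name": "baseline_logistic_regression",
    "description": "Char n-gram logistic regression",
    "model_type": "logistic_regression",
    "features": ["full_name", "name_endings"],
    "model_params": {"max_features": 5000},
}
exp_cfg = ExperimentBuilder.from_template(template)
assert exp_cfg.features[0] is FeatureType.FULL_NAME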
@@ -0,0 +1,285 @@
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.research.base_model import BaseModel
from ners.research.experiment import (
ExperimentConfig,
ExperimentStatus,
calculate_metrics,
)
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.model_registry import create_model
class ExperimentRunner:
"""Runs and manages experiments"""
def __init__(self, config: PipelineConfig):
self.config = config
self.tracker = ExperimentTracker(self.config)
self.data_loader = DataLoader(self.config)
def run_experiment(self, experiment_config: ExperimentConfig) -> str:
"""Run a single experiment and return experiment ID"""
# Create experiment
experiment_id = self.tracker.create_experiment(experiment_config)
try:
logging.info(f"Starting experiment: {experiment_id}")
self.tracker.update_experiment(
experiment_id, status=ExperimentStatus.RUNNING
)
# Load data
filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
df = self.data_loader.load_csv_complete(filepath)
# Apply data filters if specified
df = self._apply_data_filters(df, experiment_config)
# Prepare target variable
y = df[experiment_config.target_column]
X = df
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
test_size=experiment_config.test_size,
random_state=experiment_config.random_seed,
stratify=y,
)
# Create and train model
model = create_model(experiment_config)
model.fit(X_train, y_train)
# Make predictions
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
# Calculate metrics
train_metrics = calculate_metrics(
y_train, train_pred, experiment_config.metrics
)
test_metrics = calculate_metrics(
y_test, test_pred, experiment_config.metrics
)
# Cross-validation if requested
cv_metrics = {}
if experiment_config.cross_validation_folds > 1:
cv_metrics = model.cross_validate(
X_train, y_train, experiment_config.cross_validation_folds
)
# Additional analysis
conf_matrix = confusion_matrix(y_test, test_pred).tolist()
feature_importance = model.get_feature_importance()
# Create prediction examples
prediction_examples = self._create_prediction_examples(
X_test, y_test, test_pred, model, n_examples=10
)
# Calculate class distribution
class_distribution = y.value_counts().to_dict()
# Save model
model_path = self._save_model(model, experiment_id)
# Update experiment with results
self.tracker.update_experiment(
experiment_id,
status=ExperimentStatus.COMPLETED,
end_time=datetime.now(),
model_path=str(model_path),
train_metrics=train_metrics,
test_metrics=test_metrics,
cv_metrics=cv_metrics,
confusion_matrix=conf_matrix,
feature_importance=feature_importance,
prediction_examples=prediction_examples,
train_size=len(X_train),
test_size=len(X_test),
class_distribution=class_distribution,
)
logging.info(f"Experiment {experiment_id} completed successfully")
logging.info(f"Test accuracy: {test_metrics.get('accuracy', 'N/A'):.4f}")
return experiment_id
except Exception as e:
logging.error(f"Experiment {experiment_id} failed: {str(e)}")
self.tracker.update_experiment(
experiment_id,
status=ExperimentStatus.FAILED,
end_time=datetime.now(),
error_message=str(e),
)
raise
def run_experiment_batch(self, experiments: List[ExperimentConfig]) -> List[str]:
"""Run multiple experiments"""
experiment_ids = []
for i, config in enumerate(experiments):
logging.info(
f"Running experiment {i + 1}/{len(experiments)}: {config.name}"
)
try:
exp_id = self.run_experiment(config)
experiment_ids.append(exp_id)
except Exception as e:
logging.error(f"Failed to run experiment {config.name}: {e}")
continue
return experiment_ids
@classmethod
def _apply_data_filters(
cls, df: pd.DataFrame, config: ExperimentConfig
) -> pd.DataFrame:
"""Apply data filters specified in experiment config"""
filtered_df = df.copy()
# Apply training data filters
if config.train_data_filter:
for column, criteria in config.train_data_filter.items():
if column in filtered_df.columns:
if isinstance(criteria, list):
filtered_df = filtered_df[filtered_df[column].isin(criteria)]
elif isinstance(criteria, dict):
if "min" in criteria:
filtered_df = filtered_df[
filtered_df[column] >= criteria["min"]
]
if "max" in criteria:
filtered_df = filtered_df[
filtered_df[column] <= criteria["max"]
]
else:
filtered_df = filtered_df[filtered_df[column] == criteria]
return filtered_df
@classmethod
def _create_prediction_examples(
cls,
X_test: pd.DataFrame,
y_test: pd.Series,
predictions: np.ndarray,
model: BaseModel,
n_examples: int = 10,
) -> List[Dict]:
"""Create prediction examples for analysis"""
examples = []
# Get both correct and incorrect predictions
correct_mask = y_test == predictions
incorrect_indices = X_test[~correct_mask].index[: n_examples // 2]
correct_indices = X_test[correct_mask].index[: n_examples // 2]
sample_indices = list(incorrect_indices) + list(correct_indices)
for idx in sample_indices[:n_examples]:
example = {
"name": X_test.loc[idx, "name"] if "name" in X_test.columns else "N/A",
"true_label": y_test.loc[idx],
"predicted_label": predictions[X_test.index.get_loc(idx)],
"correct": y_test.loc[idx] == predictions[X_test.index.get_loc(idx)],
}
# Add probability if available
if model.architecture == "traditional":
proba = model.predict_proba(X_test.loc[[idx]])
example["prediction_confidence"] = float(proba.max())
examples.append(example)
return examples
def _save_model(self, model: BaseModel, experiment_id: str) -> Path:
"""Save trained model"""
model_dir = self.config.paths.models_dir / "experiments" / experiment_id
model_dir.mkdir(parents=True, exist_ok=True)
model_path = model_dir / "model.joblib"
model.save(str(model_path))
return model_path
def load_experiment_model(self, experiment_id: str) -> Optional[BaseModel]:
"""Load a model from a completed experiment"""
experiment = self.tracker.get_experiment(experiment_id)
if experiment and experiment.model_path:
try:
                # Load the saved model data and recreate the model instance
                # using the saved config
model_data = joblib.load(experiment.model_path)
config = ExperimentConfig.from_dict(model_data["config"])
model = create_model(config)
# Restore the saved state
model.model = model_data["model"]
model.feature_extractor = model_data["feature_extractor"]
model.label_encoder = model_data["label_encoder"]
model.tokenizer = model_data.get("tokenizer")
model.is_fitted = model_data["is_fitted"]
model.training_history = model_data.get("training_history", {})
model.learning_curve_data = model_data.get("learning_curve_data", {})
# Restore vectorizers and encoders for models that use them (like XGBoost)
if "vectorizers" in model_data and hasattr(model, "vectorizers"):
model.vectorizers = model_data["vectorizers"]
if "label_encoders" in model_data and hasattr(model, "label_encoders"):
model.label_encoders = model_data["label_encoders"]
return model
except Exception as e:
logging.error(
f"Failed to load model for experiment {experiment_id}: {e}"
)
return None
return None
def compare_experiments(
self, experiment_ids: List[str], metric: str = "accuracy"
) -> pd.DataFrame:
"""Compare experiments and return analysis"""
comparison_df = self.tracker.compare_experiments(experiment_ids)
if f"test_{metric}" in comparison_df.columns:
comparison_df = comparison_df.sort_values(f"test_{metric}", ascending=False)
return comparison_df
def get_feature_analysis(self, experiment_id: str) -> Optional[pd.DataFrame]:
"""Get feature importance analysis for an experiment"""
experiment = self.tracker.get_experiment(experiment_id)
if experiment and experiment.feature_importance:
importance_df = pd.DataFrame(
[
{"feature": feature, "importance": importance}
for feature, importance in experiment.feature_importance.items()
]
)
return importance_df.sort_values("importance", ascending=False)
return None
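# End-to-end sketch of one experiment run. Assumes a configured environment
# and the featured dataset on disk; setup_config is used as in the CLI.
from ners.core.config import setup_config

cfg = setup_config(env="development")
runner = ExperimentRunner(cfg)
exp_id = runner.run_experiment(ExperimentConfig(name="demo_lr"))
print(runner.compare_experiments([exp_id], metric="accuracy"))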
@@ -0,0 +1,200 @@
import hashlib
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, List
import pandas as pd
from ners.core.config import PipelineConfig, get_config
from ners.research.experiment import ExperimentConfig, ExperimentStatus
from ners.research.experiment.experiement_result import ExperimentResult
class ExperimentTracker:
"""Tracks and manages experiments"""
def __init__(self, config: Optional[PipelineConfig] = None):
self.config = config or get_config()
self.experiments_dir = self.config.paths.outputs_dir / "experiments"
self.experiments_dir.mkdir(parents=True, exist_ok=True)
self.results_db_path = self.experiments_dir / "experiments.json"
self._results: Dict[str, ExperimentResult] = {}
self._load_results()
def _load_results(self):
"""Load existing experiment results"""
if self.results_db_path.exists():
try:
with open(self.results_db_path, "r") as f:
data = json.load(f)
for exp_id, exp_data in data.items():
self._results[exp_id] = ExperimentResult.from_dict(exp_data)
except Exception as e:
print(f"Warning: Failed to load experiment results: {e}")
def _save_results(self):
"""Save experiment results to disk"""
data = {exp_id: result.to_dict() for exp_id, result in self._results.items()}
with open(self.results_db_path, "w") as f:
json.dump(data, f, indent=2, default=str)
def create_experiment(self, config: ExperimentConfig) -> str:
"""Create a new experiment and return its ID"""
# Generate experiment ID
config_hash = hashlib.md5(
json.dumps(config.to_dict(), sort_keys=True).encode()
).hexdigest()[:8]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
experiment_id = f"{config.name}_{timestamp}_{config_hash}"
# Create result object
result = ExperimentResult(
experiment_id=experiment_id, config=config, start_time=datetime.now()
)
self._results[experiment_id] = result
self._save_results()
return experiment_id
def update_experiment(self, experiment_id: str, **updates):
"""Update an experiment's results"""
if experiment_id in self._results:
result = self._results[experiment_id]
for key, value in updates.items():
if hasattr(result, key):
setattr(result, key, value)
self._save_results()
def get_experiment(self, experiment_id: str) -> Optional[ExperimentResult]:
"""Get experiment by ID"""
return self._results.get(experiment_id)
def list_experiments(
self,
status: Optional[ExperimentStatus] = None,
tags: Optional[List[str]] = None,
model_type: Optional[str] = None,
) -> List[ExperimentResult]:
"""List experiments with optional filtering"""
results = list(self._results.values())
if status:
results = [r for r in results if r.status == status]
if tags:
results = [r for r in results if any(tag in r.config.tags for tag in tags)]
if model_type:
results = [r for r in results if r.config.model_type == model_type]
return sorted(results, key=lambda x: x.start_time, reverse=True)
def get_best_experiment(
self,
metric: str = "accuracy",
dataset: str = "test",
filters: Optional[Dict] = None,
) -> Optional[ExperimentResult]:
"""Get the best experiment based on a metric"""
experiments = self.list_experiments()
if filters:
# Apply additional filters
if "model_type" in filters:
experiments = [
e
for e in experiments
if e.config.model_type == filters["model_type"]
]
if "features" in filters:
experiments = [
e
for e in experiments
if any(f in e.config.features for f in filters["features"])
]
valid_experiments = []
for exp in experiments:
if exp.status == ExperimentStatus.COMPLETED:
metrics_dict = (
exp.test_metrics if dataset == "test" else exp.train_metrics
)
if metric in metrics_dict:
valid_experiments.append((exp, metrics_dict[metric]))
if not valid_experiments:
return None
return max(valid_experiments, key=lambda x: x[1])[0]
def compare_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
"""Compare multiple experiments in a DataFrame"""
rows = []
for exp_id in experiment_ids:
exp = self.get_experiment(exp_id)
if exp:
row = {
"experiment_id": exp_id,
"name": exp.config.name,
"model_type": exp.config.model_type,
"features": ",".join([f.value for f in exp.config.features]),
"status": exp.status.value,
"train_size": exp.train_size,
"test_size": exp.test_size,
}
# Add metrics
for metric, value in exp.test_metrics.items():
row[f"test_{metric}"] = value
for metric, value in exp.cv_metrics.items():
row[f"cv_{metric}"] = value
rows.append(row)
return pd.DataFrame(rows)
def export_results(self, output_path: Optional[Path] = None) -> Path:
"""Export all results to CSV"""
if output_path is None:
output_path = (
self.experiments_dir
/ f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
)
rows = []
for exp in self._results.values():
row = {
"experiment_id": exp.experiment_id,
"name": exp.config.name,
"description": exp.config.description,
"model_type": exp.config.model_type,
"features": ",".join([f.value for f in exp.config.features]),
"status": exp.status.value,
"start_time": exp.start_time.isoformat(),
"end_time": exp.end_time.isoformat() if exp.end_time else None,
"train_size": exp.train_size,
"test_size": exp.test_size,
}
# Add all metrics
for metric, value in exp.test_metrics.items():
row[f"test_{metric}"] = value
for metric, value in exp.cv_metrics.items():
row[f"cv_{metric}"] = value
rows.append(row)
df = pd.DataFrame(rows)
df.to_csv(output_path, index=False)
return output_path
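# Query sketch: best completed run by test F1, restricted to one model type.
# Returns None when nothing matches; "xgboost" is just an example filter.
tracker = ExperimentTracker()
best = tracker.get_best_experiment(
    metric="f1", dataset="test", filters={"model_type": "xgboost"}
)
if best is not None:
    print(best.experiment_id, best.test_metrics.get("f1"))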
@@ -0,0 +1,92 @@
from enum import Enum
from typing import List, Dict, Any, Union
import pandas as pd
class FeatureType(Enum):
"""Types of features that can be extracted from names"""
FULL_NAME = "full_name"
NATIVE_NAME = "native_name"
SURNAME = "surname"
FIRST_WORD = "first_word"
LAST_WORD = "last_word"
NAME_LENGTH = "name_length"
WORD_COUNT = "word_count"
PROVINCE = "province"
CHAR_NGRAMS = "char_ngrams"
WORD_NGRAMS = "word_ngrams"
NAME_ENDINGS = "name_endings"
NAME_BEGINNINGS = "name_beginnings"
class FeatureExtractor:
"""Extract different types of features from name data"""
def __init__(
self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None
):
self.feature_types = feature_types
self.feature_params = feature_params or {}
def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Extract all configured features"""
features_df = pd.DataFrame(index=df.index)
for feature_type in self.feature_types:
feature_data = self._extract_single_feature(df, feature_type)
if isinstance(feature_data, pd.DataFrame):
features_df = pd.concat([features_df, feature_data], axis=1)
else:
features_df[feature_type.value] = feature_data
return features_df
def _extract_single_feature(
self, df: pd.DataFrame, feature_type: FeatureType
) -> Union[pd.Series, pd.DataFrame]:
"""Extract a single type of feature"""
if feature_type == FeatureType.FULL_NAME:
return df["name"].fillna("")
elif feature_type == FeatureType.NATIVE_NAME:
return df["identified_name"].fillna(df["probable_native"]).fillna("")
elif feature_type == FeatureType.SURNAME:
return df["identified_surname"].fillna(df["probable_surname"]).fillna("")
elif feature_type == FeatureType.FIRST_WORD:
return df["name"].str.split().str[0].fillna("")
elif feature_type == FeatureType.LAST_WORD:
return df["name"].str.split().str[-1].fillna("")
elif feature_type == FeatureType.NAME_LENGTH:
return df["name"].str.len().fillna(0)
elif feature_type == FeatureType.WORD_COUNT:
return df["words"].fillna(1)
elif feature_type == FeatureType.PROVINCE:
return df["province"].fillna("unknown")
elif feature_type == FeatureType.NAME_ENDINGS:
n = self.feature_params.get("ending_length", 3)
return df["name"].str[-n:].fillna("")
elif feature_type == FeatureType.NAME_BEGINNINGS:
n = self.feature_params.get("beginning_length", 3)
return df["name"].str[:n].fillna("")
elif feature_type == FeatureType.CHAR_NGRAMS:
# This will be handled by the model's vectorizer
return df["name"].fillna("")
elif feature_type == FeatureType.WORD_NGRAMS:
# This will be handled by the model's vectorizer
return df["name"].fillna("")
else:
raise ValueError(f"Unknown feature type: {feature_type}")
+44
View File
@@ -0,0 +1,44 @@
from typing import List
from ners.research.base_model import BaseModel
from ners.research.experiment import ExperimentConfig
from ners.research.models.bigru_model import BiGRUModel
from ners.research.models.cnn_model import CNNModel
from ners.research.models.ensemble_model import EnsembleModel
from ners.research.models.lightgbm_model import LightGBMModel
from ners.research.models.logistic_regression_model import LogisticRegressionModel
from ners.research.models.lstm_model import LSTMModel
from ners.research.models.naive_bayes_model import NaiveBayesModel
from ners.research.models.random_forest_model import RandomForestModel
from ners.research.models.svm_model import SVMModel
from ners.research.models.transformer_model import TransformerModel
from ners.research.models.xgboost_model import XGBoostModel
MODEL_REGISTRY = {
"bigru": BiGRUModel,
"cnn": CNNModel,
"ensemble": EnsembleModel,
"lightgbm": LightGBMModel,
"logistic_regression": LogisticRegressionModel,
"lstm": LSTMModel,
"naive_bayes": NaiveBayesModel,
"random_forest": RandomForestModel,
"svm": SVMModel,
"transformer": TransformerModel,
"xgboost": XGBoostModel,
}
def create_model(config: ExperimentConfig) -> BaseModel:
"""Factory function to create models"""
model_class = MODEL_REGISTRY.get(config.model_type)
if model_class is None:
raise ValueError(f"Unknown model type: {config.model_type}")
return model_class(config)
def list_available_models() -> List[str]:
"""List all available model types"""
return list(MODEL_REGISTRY.keys())
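# Factory usage sketch: any registry key is a valid model_type; unknown keys
# raise ValueError. The config below is minimal and illustrative.
config = ExperimentConfig(name="demo", model_type="naive_bayes")
model = create_model(config)        # -> NaiveBayesModel instance
print(list_available_models())      # ["bigru", "cnn", ..., "xgboost"]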
+301
View File
@@ -0,0 +1,301 @@
import json
import logging
from datetime import datetime
from typing import List, Dict, Any
import pandas as pd
from ners.core.config import get_config
from ners.core.utils.data_loader import DataLoader
from ners.research.experiment import FeatureType, ExperimentConfig
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.model_registry import MODEL_REGISTRY
class ModelTrainer:
"""Comprehensive model training and artifact management"""
def __init__(self, config=None):
self.config = config or get_config()
self.data_loader = DataLoader(self.config)
self.experiment_runner = ExperimentRunner(self.config)
self.experiment_tracker = ExperimentTracker(self.config)
# Setup model artifacts directory
self.models_dir = self.config.paths.models_dir
self.models_dir.mkdir(parents=True, exist_ok=True)
def train_single_model(
self,
model_name: str,
model_type: str = "logistic_regression",
features: List[str] = None,
model_params: Dict[str, Any] = None,
tags: List[str] = None,
save_artifacts: bool = True,
) -> str:
"""
Train a single model and save its artifacts.
Returns the experiment ID.
"""
logging.info(f"Training {model_type} model: {model_name}")
if features is None:
features = ["full_name"]
feature_types = [FeatureType(f) for f in features]
# Prepare tags - combine default tags with template tags
default_tags = ["training", model_type]
experiment_tags = default_tags + (tags or [])
# Create experiment configuration
config = ExperimentConfig(
name=model_name,
description=f"Training {model_type} model with features: {', '.join(features)}",
model_type=model_type,
features=feature_types,
model_params=model_params or {},
tags=experiment_tags,
)
# Run experiment
experiment_id = self.experiment_runner.run_experiment(config)
experiment = self.experiment_tracker.get_experiment(experiment_id)
if experiment and experiment.test_metrics:
logging.info("Training completed successfully!")
logging.info(f"Experiment ID: {experiment_id}")
logging.info(
f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}"
)
logging.info(f"Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")
if save_artifacts:
self.save_model_artifacts(experiment_id)
return experiment_id
def train_multiple_models(
self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
) -> List[str]:
"""
Train multiple models with different configurations.
"""
logging.info(f"Training {len(model_configs)} models...")
experiment_ids = []
for i, config in enumerate(model_configs):
model_name = f"{base_name}_{config['model_type']}_{i + 1}"
try:
exp_id = self.train_single_model(
model_name=model_name,
model_type=config["model_type"],
features=config.get("features", ["full_name"]),
model_params=config.get("model_params", {}),
save_artifacts=save_all,
)
experiment_ids.append(exp_id)
except Exception as e:
logging.error(f"Failed to train {model_name}: {e}")
continue
logging.info(f"Completed training {len(experiment_ids)} models successfully")
return experiment_ids
def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]:
"""
Save model artifacts in a structured way for easy loading.
Returns paths to saved artifacts.
"""
experiment = self.experiment_tracker.get_experiment(experiment_id)
if not experiment:
raise ValueError(f"Experiment {experiment_id} not found")
# Create model-specific directory
model_dir = self.models_dir / experiment_id
model_dir.mkdir(parents=True, exist_ok=True)
# Load the trained model
trained_model = self.experiment_runner.load_experiment_model(experiment_id)
if not trained_model:
raise ValueError(f"Could not load model for experiment {experiment_id}")
# Save complete model with joblib
model_path = model_dir / "complete_model.joblib"
trained_model.save(str(model_path))
# Save model configuration
config_path = model_dir / "model_config.json"
with open(config_path, "w") as f:
json.dump(experiment.config.to_dict(), f, indent=2)
# Save experiment results
results_path = model_dir / "experiment_results.json"
with open(results_path, "w") as f:
json.dump(experiment.to_dict(), f, indent=2, default=str)
# Generate and save learning curves
learning_curve_path = None
training_history_path = None
try:
# Load data for learning curve generation
data_path = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
if data_path.exists():
df = self.data_loader.load_csv_complete(data_path)
# Generate learning curve
logging.info("Generating learning curve...")
trained_model.generate_learning_curve(
df, df[experiment.config.target_column]
)
# Plot and save learning curve
learning_curve_path = model_dir / "learning_curve.png"
trained_model.plot_learning_curve(str(learning_curve_path))
# Plot and save training history (for neural networks)
if trained_model.training_history:
training_history_path = model_dir / "training_history.png"
trained_model.plot_training_history(str(training_history_path))
# Save learning curve data as JSON
learning_data_path = model_dir / "learning_curve_data.json"
with open(learning_data_path, "w") as f:
json.dump(trained_model.learning_curve_data, f, indent=2)
# Save training history data as JSON
if trained_model.training_history:
history_data_path = model_dir / "training_history_data.json"
with open(history_data_path, "w") as f:
json.dump(trained_model.training_history, f, indent=2)
except Exception as e:
logging.warning(f"Could not generate learning curves: {e}")
# Save artifacts metadata
metadata = {
"experiment_id": experiment_id,
"model_name": experiment.config.name,
"model_type": experiment.config.model_type,
"features": [f.value for f in experiment.config.features],
"training_date": datetime.now().isoformat(),
"test_accuracy": experiment.test_metrics.get("accuracy", 0),
"test_f1": experiment.test_metrics.get("f1", 0),
"model_path": str(model_path),
"config_path": str(config_path),
"results_path": str(results_path),
"learning_curve_plot": str(learning_curve_path)
if learning_curve_path
else None,
"training_history_plot": str(training_history_path)
if training_history_path
else None,
"has_learning_curve": bool(trained_model.learning_curve_data),
"has_training_history": bool(trained_model.training_history),
}
metadata_path = model_dir / "metadata.json"
with open(metadata_path, "w") as f:
json.dump(metadata, f, indent=2)
logging.info(f"Model artifacts saved to: {model_dir}")
logging.info(f" - Complete model: {model_path.name}")
logging.info(f" - Configuration: {config_path.name}")
logging.info(f" - Results: {results_path.name}")
logging.info(f" - Metadata: {metadata_path.name}")
if learning_curve_path and learning_curve_path.exists():
logging.info(f" - Learning curve: {learning_curve_path.name}")
if training_history_path and training_history_path.exists():
logging.info(f" - Training history: {training_history_path.name}")
return {
"model_dir": str(model_dir),
"model_path": str(model_path),
"config_path": str(config_path),
"results_path": str(results_path),
"metadata_path": str(metadata_path),
"learning_curve_plot": str(learning_curve_path)
if learning_curve_path
else None,
"training_history_plot": str(training_history_path)
if training_history_path
else None,
}
def load_trained_model(self, experiment_id: str):
"""
Load a previously trained model from artifacts.
"""
model_dir = self.models_dir / experiment_id
model_path = model_dir / "complete_model.joblib"
if not model_path.exists():
raise FileNotFoundError(
f"Model artifacts not found for experiment {experiment_id}"
)
# Load the model class dynamically
metadata_path = model_dir / "metadata.json"
with open(metadata_path, "r") as f:
metadata = json.load(f)
model_type = metadata["model_type"]
model_class = MODEL_REGISTRY[model_type]
# Load the complete model
loaded_model = model_class.load(str(model_path))
logging.info(f"Loaded model: {metadata['model_name']}")
logging.info(f" Type: {model_type}")
logging.info(f" Accuracy: {metadata['test_accuracy']:.4f}")
return loaded_model
def list_saved_models(self) -> pd.DataFrame:
"""
List all saved model artifacts.
"""
models_data = []
for model_dir in self.models_dir.iterdir():
if model_dir.is_dir():
metadata_path = model_dir / "metadata.json"
if metadata_path.exists():
try:
with open(metadata_path, "r") as f:
metadata = json.load(f)
models_data.append(metadata)
except Exception as e:
logging.warning(
f"Could not read metadata for {model_dir.name}: {e}"
)
if not models_data:
logging.info("No saved models found.")
return pd.DataFrame()
df = pd.DataFrame(models_data)
# Format the display
display_columns = [
"model_name",
"model_type",
"features",
"test_accuracy",
"test_f1",
"training_date",
]
available_columns = [col for col in display_columns if col in df.columns]
return df[available_columns].sort_values("training_date", ascending=False)
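# Training sketch, assuming a configured environment and featured data on
# disk; the model name and parameters below are illustrative.
trainer = ModelTrainer()
exp_id = trainer.train_single_model(
    model_name="demo_lr",
    model_type="logistic_regression",
    features=["full_name", "name_endings"],
    model_params={"max_features": 20000},
)
loaded = trainer.load_trained_model(exp_id)
print(trainer.list_saved_models().head())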
+72
View File
@@ -0,0 +1,72 @@
from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from ners.research.neural_network_model import NeuralNetworkModel
class BiGRUModel(NeuralNetworkModel):
"""Bidirectional GRU model for name classification"""
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
params = kwargs
model = Sequential(
[
                # Mask padding tokens so the recurrent layers ignore padded
                # timesteps throughout the stack.
Embedding(
input_dim=vocab_size,
output_dim=params.get("embedding_dim", 64),
mask_zero=True,
),
# First recurrent block returns full sequences to allow stacking.
# Moderate dropout + optional recurrent_dropout to reduce overfitting
# on short names while retaining temporal signal.
Bidirectional(
GRU(
params.get("gru_units", 32),
return_sequences=True,
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Second GRU summarizes to the last hidden state (no return_sequences),
# capturing bidirectional context efficiently for classification.
Bidirectional(
GRU(
params.get("gru_units", 32),
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Small dense head; ReLU + dropout for capacity and regularization.
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary gender classification.
Dense(2, activation="softmax", dtype="float32"),
]
)
model.compile(
loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
)
return model
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = self._collect_text_corpus(X)
if self.tokenizer is None:
self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
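# What prepare_features yields in isolation: word-level indices padded "post"
# to max_len. The strings are toy values; index 1 is reserved for <OOV>.
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

tok = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
tok.fit_on_texts(["ilunga kalala", "ngalula"])
seqs = tok.texts_to_sequences(["ilunga kalala", "ngalula"])
print(pad_sequences(seqs, maxlen=6, padding="post"))
# [[2 3 0 0 0 0]
#  [4 0 0 0 0 0]]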
+86
View File
@@ -0,0 +1,86 @@
from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import (
Embedding,
Conv1D,
MaxPooling1D,
GlobalMaxPooling1D,
Dense,
Dropout,
SpatialDropout1D,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from ners.research.neural_network_model import NeuralNetworkModel
class CNNModel(NeuralNetworkModel):
"""1D Convolutional Neural Network for character patterns"""
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
"""Build CNN model with known vocabulary size"""
params = kwargs
model = Sequential(
[
# Learn char/subword embeddings; spatial dropout regularizes across channels
# to make the model robust to noisy characters and transliteration.
Embedding(
input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)
),
SpatialDropout1D(rate=params.get("embedding_dropout", 0.1)),
# Small kernels capture short n-gram like patterns; padding='same' keeps
# sequence length stable for simpler pooling behavior.
Conv1D(
filters=params.get("filters", 64),
kernel_size=params.get("kernel_size", 3),
activation="relu",
padding="same",
),
# Downsample to gain some position invariance and reduce computation.
MaxPooling1D(pool_size=2),
# Second conv layer to compose higher-level motifs (e.g., suffix+vowel).
Conv1D(
filters=params.get("filters", 64),
kernel_size=params.get("kernel_size", 3),
activation="relu",
padding="same",
),
# Global max pooling picks strongest motif evidence anywhere in the name.
GlobalMaxPooling1D(),
# Compact dense head with dropout to control overfitting.
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary classification.
Dense(2, activation="softmax", dtype="float32"),
]
)
model.compile(
loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
)
return model
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
"""Prepare sequences for CNN using extracted features"""
# X here contains the features already extracted by FeatureExtractor
# Get text data from extracted features - use character level for CNN
text_data = self._collect_text_corpus(X)
# Initialize character-level tokenizer
if self.tokenizer is None:
self.tokenizer = Tokenizer(char_level=True, lower=True, oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get(
"max_len", 20
) # Longer for character level
return pad_sequences(sequences, maxlen=max_len, padding="post")
+110
View File
@@ -0,0 +1,110 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from ners.research.experiment import ExperimentConfig
from ners.research.traditional_model import TraditionalModel
class EnsembleModel(TraditionalModel):
"""Ensemble model combining multiple base models"""
@property
def architecture(self) -> str:
"""Return the architecture type"""
return "ensemble"
def __init__(self, config: ExperimentConfig):
super().__init__(config)
self.base_models = []
self.model_weights = None
def build_model(self) -> BaseEstimator:
params = self.config.model_params
base_model_types = params.get(
"base_models", ["logistic_regression", "random_forest", "naive_bayes"]
)
# Create base models with simplified configs; diverse vectorizers/classifiers
# encourage complementary errors that voting can average out.
estimators = []
for model_type in base_model_types:
if model_type == "logistic_regression":
model = Pipeline(
[
(
"vectorizer",
CountVectorizer(
analyzer="char", ngram_range=(2, 4), max_features=5000
),
),
(
"classifier",
LogisticRegression(
max_iter=1000, random_state=self.config.random_seed
),
),
]
)
estimators.append(("logistic_regression", model))
elif model_type == "random_forest":
model = Pipeline(
[
(
"vectorizer",
TfidfVectorizer(
analyzer="char", ngram_range=(2, 3), max_features=3000
),
),
(
"classifier",
RandomForestClassifier(
n_estimators=50, random_state=self.config.random_seed
),
),
]
)
estimators.append(("rf", model))
elif model_type == "naive_bayes":
model = Pipeline(
[
(
"vectorizer",
CountVectorizer(
analyzer="char", ngram_range=(1, 3), max_features=4000
),
),
("classifier", MultinomialNB()),
]
)
estimators.append(("nb", model))
# Soft voting averages probabilities (preferred when members are calibrated);
# hard voting uses majority class. Parallelize member predictions.
voting_type = params.get("voting", "soft") # 'hard' or 'soft'
return VotingClassifier(
estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1)
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_features.append(X[feature_type.value].astype(str))
if len(text_features) == 1:
return text_features[0].values
else:
combined = text_features[0].astype(str)
for feature in text_features[1:]:
combined = combined + " " + feature.astype(str)
return combined.values
+115
View File
@@ -0,0 +1,115 @@
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from ners.research.traditional_model import TraditionalModel
class LightGBMModel(TraditionalModel):
"""LightGBM with engineered features"""
def __init__(self, config):
super().__init__(config)
# Store vectorizers and encoders to ensure consistent feature space
self.vectorizers = {}
self.label_encoders = {}
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Optional GPU acceleration
use_gpu = bool(params.get("use_gpu", False))
device = params.get("device", "gpu" if use_gpu else "cpu")
gpu_platform_id = params.get("gpu_platform_id", None)
gpu_device_id = params.get("gpu_device_id", None)
# Leaf-wise boosted trees excel on sparse/categorical mixes; binary objective
# and parallelism improve training speed for this task.
return lgb.LGBMClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", -1),
learning_rate=params.get("learning_rate", 0.1),
num_leaves=params.get("num_leaves", 31),
subsample=params.get("subsample", 0.8),
colsample_bytree=params.get("colsample_bytree", 0.8),
random_state=self.config.random_seed,
objective=params.get("objective", "binary"),
n_jobs=params.get("n_jobs", -1),
verbose=2,
device=device,
gpu_platform_id=gpu_platform_id,
gpu_device_id=gpu_device_id,
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
column = X[feature_type.value]
if feature_type.value in ["name_length", "word_count"]:
# Numerical features
features.append(column.fillna(0).values.reshape(-1, 1))
elif feature_type.value in ["full_name", "native_name", "surname"]:
# Character-level features for names
feature_key = f"vectorizer_{feature_type.value}"
if feature_key not in self.vectorizers:
# First time - create and fit vectorizer
self.vectorizers[feature_key] = CountVectorizer(
analyzer="char", ngram_range=(2, 3), max_features=50
)
char_features = (
self.vectorizers[feature_key]
.fit_transform(column.fillna("").astype(str))
.toarray()
)
else:
# Subsequent times - use existing vectorizer
char_features = (
self.vectorizers[feature_key]
.transform(column.fillna("").astype(str))
.toarray()
)
features.append(char_features)
else:
# Categorical features
feature_key = f"encoder_{feature_type.value}"
if feature_key not in self.label_encoders:
# First time - create and fit encoder
self.label_encoders[feature_key] = LabelEncoder()
encoded = self.label_encoders[feature_key].fit_transform(
column.fillna("unknown").astype(str)
)
else:
# Subsequent times - use existing encoder
# Handle unseen labels by mapping them to a default value
column_clean = column.fillna("unknown").astype(str)
# Get the classes the encoder was trained on
known_classes = set(self.label_encoders[feature_key].classes_)
# Map unseen values to "unknown" if it exists, otherwise to the first class
if "unknown" in known_classes:
default_class = "unknown"
else:
default_class = self.label_encoders[feature_key].classes_[0]
# Replace unseen values with default
column_mapped = column_clean.apply(
lambda x: x if x in known_classes else default_class
)
encoded = self.label_encoders[feature_key].transform(
column_mapped
)
features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
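# The unseen-category fallback above, shown in isolation with toy values:
# anything outside the fitted classes maps to "unknown" before transform.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder().fit(["kinshasa", "unknown"])
col = pd.Series(["kinshasa", "haut-katanga"])   # second value unseen at fit time
known = set(enc.classes_)
mapped = col.apply(lambda v: v if v in known else "unknown")
print(enc.transform(mapped))                    # [0 1]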
@@ -0,0 +1,53 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from ners.research.traditional_model import TraditionalModel
class LogisticRegressionModel(TraditionalModel):
"""Logistic Regression with character n-grams"""
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Character n-grams are strong signals for names; (2,5) balances
# capturing prefixes/suffixes with tractable feature size.
vectorizer = CountVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 5)),
max_features=params.get("max_features", 10000),
)
        # liblinear handles sparse, small-to-medium problems well; note that it
        # ignores n_jobs (OvR parallelism applies only to other solvers).
        # class_weight can mitigate class imbalance.
classifier = LogisticRegression(
max_iter=params.get("max_iter", 1000),
random_state=self.config.random_seed,
verbose=2,
solver=params.get("solver", "liblinear"),
n_jobs=params.get("n_jobs", -1),
class_weight=params.get("class_weight", None),
)
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
# Collect text-based features from the extracted features DataFrame
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_features.append(X[feature_type.value].astype(str))
# Combine text features
if len(text_features) == 1:
return text_features[0].values
else:
# Concatenate multiple text features with separator
combined = text_features[0].astype(str)
for feature in text_features[1:]:
combined = combined + " " + feature.astype(str)
return combined.values
+71
View File
@@ -0,0 +1,71 @@
from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from ners.research.neural_network_model import NeuralNetworkModel
class LSTMModel(NeuralNetworkModel):
"""LSTM model for sequence learning"""
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
params = kwargs
model = Sequential(
[
# Mask padding tokens; required for LSTM to ignore padded timesteps.
Embedding(
input_dim=vocab_size,
output_dim=params.get("embedding_dim", 64),
mask_zero=True,
),
# Stacked bidirectional LSTMs: first returns sequences to feed the next.
# Dropout/recurrent_dropout mitigate overfitting on short sequences.
Bidirectional(
LSTM(
params.get("lstm_units", 32),
return_sequences=True,
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Second LSTM condenses sequence to a fixed vector for classification.
Bidirectional(
LSTM(
params.get("lstm_units", 32),
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Compact dense head with dropout; sufficient capacity for name signals.
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary classification.
Dense(2, activation="softmax", dtype="float32"),
]
)
model.compile(
loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
)
return model
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = self._collect_text_corpus(X)
# Initialize tokenizer if needed
if self.tokenizer is None:
self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
# Convert to sequences
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
@@ -0,0 +1,42 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from ners.research.traditional_model import TraditionalModel
class NaiveBayesModel(TraditionalModel):
"""Multinomial Naive Bayes with character n-grams"""
def build_model(self) -> BaseEstimator:
params = self.config.model_params
        # Bag-of-character-ngrams aligns with Multinomial NB assumptions; the
        # (2, 5) default captures prefix/suffix cues with a manageable vocabulary.
vectorizer = CountVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 5)),
max_features=params.get("max_features", 8000),
)
# Laplace smoothing (alpha) counters zero counts for rare n-grams.
classifier = MultinomialNB(alpha=params.get("alpha", 1.0))
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_features.append(X[feature_type.value].astype(str))
if len(text_features) == 1:
return text_features[0].values
else:
combined = text_features[0].astype(str)
for feature in text_features[1:]:
combined = combined + " " + feature.astype(str)
return combined.values
@@ -0,0 +1,71 @@
from typing import Dict
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from ners.research.traditional_model import TraditionalModel
class RandomForestModel(TraditionalModel):
"""Random Forest with engineered features"""
def __init__(self, config):
super().__init__(config)
# Persist encoders so categorical mappings stay consistent.
self.label_encoders: Dict[str, LabelEncoder] = {}
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Tree ensemble is robust to mixed numeric/categorical encodings; parallelize
# across trees for speed. Keep depth moderate for generalisation.
return RandomForestClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", None),
random_state=self.config.random_seed,
verbose=2,
n_jobs=params.get("n_jobs", -1),
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
column = X[feature_type.value]
# Handle different feature types
if feature_type.value in ["name_length", "word_count"]:
# Numerical features
features.append(column.fillna(0).values.reshape(-1, 1))
else:
# Categorical features (encode them persistently)
feature_key = f"encoder_{feature_type.value}"
if feature_key not in self.label_encoders:
self.label_encoders[feature_key] = LabelEncoder()
encoded = self.label_encoders[feature_key].fit_transform(
column.fillna("unknown").astype(str)
)
else:
encoder = self.label_encoders[feature_key]
column_clean = column.fillna("unknown").astype(str)
known_classes = set(encoder.classes_)
default_class = (
"unknown"
if "unknown" in known_classes
else encoder.classes_[0]
)
column_mapped = column_clean.apply(
lambda value: value
if value in known_classes
else default_class
)
encoded = encoder.transform(column_mapped)
features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
+52
View File
@@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from ners.research.traditional_model import TraditionalModel
class SVMModel(TraditionalModel):
"""Support Vector Machine with character n-grams and RBF kernel"""
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# TF-IDF downweights very common patterns; char n-grams (2,4) are effective
# for distinguishing name morphology under RBF kernels.
vectorizer = TfidfVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 4)),
max_features=params.get("max_features", 5000),
)
# RBF kernel captures non-linear interactions between n-grams; probability=True
# adds calibration at some cost. Larger cache helps speed kernel computations.
classifier = SVC(
kernel=params.get("kernel", "rbf"),
C=params.get("C", 1.0),
gamma=params.get("gamma", "scale"),
probability=True, # Enable probability prediction
class_weight=params.get("class_weight", None),
cache_size=params.get("cache_size", 1000),
random_state=self.config.random_seed,
verbose=2,
)
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_features.append(X[feature_type.value].astype(str))
if len(text_features) == 1:
return text_features[0].values
else:
combined = text_features[0].astype(str)
for feature in text_features[1:]:
combined = combined + " " + feature.astype(str)
return combined.values
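# --- Hedged sketch (illustration only): what the char (2, 4) TF-IDF built in
# build_model sees for two toy names; the names here are hypothetical.
if __name__ == "__main__":
    vec = TfidfVectorizer(analyzer="char", ngram_range=(2, 4))
    X_toy = vec.fit_transform(["mukendi", "kabamba"])
    print(X_toy.shape)  # (2, number of distinct 2-4 character n-grams)
    print(sorted(vec.get_feature_names_out())[:8])  # e.g. 'ab', 'aba', 'abam', ...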
@@ -0,0 +1,90 @@
from typing import Any
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import (
Input,
Embedding,
Dense,
GlobalAveragePooling1D,
MultiHeadAttention,
Dropout,
LayerNormalization,
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from ners.research.neural_network_model import NeuralNetworkModel
class TransformerModel(NeuralNetworkModel):
"""Transformer-based model"""
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
        params = kwargs
        # Build Transformer model. Resolve shared sizes once so every layer
        # agrees; the max_len default must match prepare_features, which pads
        # sequences to max_len (default 6).
        max_len = params.get("max_len", 6)
        embedding_dim = params.get("embedding_dim", 64)
        inputs = Input(shape=(max_len,))
        x = Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=max_len,
            mask_zero=True,
        )(inputs)
        # Add positional encoding
        positions = tf.range(start=0, limit=max_len, delta=1)
        pos_embedding = Embedding(
            input_dim=max_len,
            output_dim=embedding_dim,
        )(positions)
        x = x + pos_embedding
x = self._transformer_encoder(x, params)
x = GlobalAveragePooling1D()(x)
x = Dense(32, activation="relu")(x)
x = Dropout(params.get("dropout", 0.1))(x)
outputs = Dense(2, activation="softmax", dtype="float32")(x)
model = Model(inputs, outputs)
model.compile(
optimizer="adam",
loss="sparse_categorical_crossentropy",
metrics=["accuracy"],
)
return model
@classmethod
def _transformer_encoder(cls, x, cfg_params):
"""Transformer encoder block"""
attn = MultiHeadAttention(
num_heads=cfg_params.get("transformer_num_heads", 2),
key_dim=cfg_params.get("transformer_head_size", 64),
dropout=cfg_params.get("attn_dropout", 0.1),
)(x, x)
x = LayerNormalization(epsilon=1e-6)(
x + Dropout(cfg_params.get("dropout", 0.1))(attn)
)
ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
ff = Dense(x.shape[-1])(ff)
return LayerNormalization(epsilon=1e-6)(
x + Dropout(cfg_params.get("dropout", 0.1))(ff)
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = self._collect_text_corpus(X)
# Initialize tokenizer if needed
if self.tokenizer is None:
self.tokenizer = Tokenizer(oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
# Convert to sequences
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
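# --- Hedged sketch (illustration only): the tokenizer/padding round trip that
# feeds the model above, on hypothetical names; "kasongo" falls back to <OOV>.
if __name__ == "__main__":
    tok = Tokenizer(oov_token="<OOV>")
    tok.fit_on_texts(["jean baptiste", "marie kabamba"])
    seqs = tok.texts_to_sequences(["jean kasongo"])
    print(pad_sequences(seqs, maxlen=6, padding="post"))  # e.g. [[2 1 0 0 0 0]]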
+115
View File
@@ -0,0 +1,115 @@
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from ners.research.traditional_model import TraditionalModel
class XGBoostModel(TraditionalModel):
"""XGBoost with engineered features and character embeddings"""
def __init__(self, config):
super().__init__(config)
# Store vectorizers and encoders to ensure consistent feature space
self.vectorizers = {}
self.label_encoders = {}
def build_model(self) -> BaseEstimator:
params = self.config.model_params
        # Optional GPU acceleration. Note: "gpu_hist" and "predictor" follow
        # the XGBoost 1.x API; XGBoost 2.x replaces them with device="cuda".
use_gpu = bool(params.get("use_gpu", False))
default_tree_method = "gpu_hist" if use_gpu else "hist"
tree_method = params.get("tree_method", default_tree_method)
predictor = params.get(
"predictor", "gpu_predictor" if tree_method.startswith("gpu") else "auto"
)
# Histogram-based trees and parallelism provide fast training; default
# logloss metric suits binary classification of gender.
return xgb.XGBClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", 6),
learning_rate=params.get("learning_rate", 0.1),
subsample=params.get("subsample", 0.8),
colsample_bytree=params.get("colsample_bytree", 0.8),
random_state=self.config.random_seed,
eval_metric="logloss",
n_jobs=params.get("n_jobs", -1),
tree_method=tree_method,
predictor=predictor,
verbosity=2,
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
column = X[feature_type.value]
if feature_type.value in ["name_length", "word_count"]:
# Numerical features
features.append(column.fillna(0).values.reshape(-1, 1))
elif feature_type.value in ["full_name", "native_name", "surname"]:
# Character-level features for names
feature_key = f"vectorizer_{feature_type.value}"
if feature_key not in self.vectorizers:
# First time - create and fit vectorizer
self.vectorizers[feature_key] = CountVectorizer(
analyzer="char", ngram_range=(2, 3), max_features=100
)
char_features = (
self.vectorizers[feature_key]
.fit_transform(column.fillna("").astype(str))
.toarray()
)
else:
# Subsequent times - use existing vectorizer
char_features = (
self.vectorizers[feature_key]
.transform(column.fillna("").astype(str))
.toarray()
)
features.append(char_features)
else:
# Categorical features
feature_key = f"encoder_{feature_type.value}"
if feature_key not in self.label_encoders:
# First time - create and fit encoder
self.label_encoders[feature_key] = LabelEncoder()
encoded = self.label_encoders[feature_key].fit_transform(
column.fillna("unknown").astype(str)
)
else:
# Subsequent times - use existing encoder
# Handle unseen labels by mapping them to a default value
column_clean = column.fillna("unknown").astype(str)
# Get the classes the encoder was trained on
known_classes = set(self.label_encoders[feature_key].classes_)
# Map unseen values to "unknown" if it exists, otherwise to the first class
if "unknown" in known_classes:
default_class = "unknown"
else:
default_class = self.label_encoders[feature_key].classes_[0]
# Replace unseen values with default
column_mapped = column_clean.apply(
lambda x: x if x in known_classes else default_class
)
encoded = self.label_encoders[feature_key].transform(
column_mapped
)
features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
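# --- Hedged sketch (illustration only): the char-level CountVectorizer used
# for name columns above, shown standalone on hypothetical names.
if __name__ == "__main__":
    cv = CountVectorizer(analyzer="char", ngram_range=(2, 3), max_features=100)
    M = cv.fit_transform(["mukendi", "mbuyi"]).toarray()
    print(cv.get_feature_names_out())  # retained 2-3 character n-grams
    print(M)  # one row per name, one count column per n-gram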
+377
View File
@@ -0,0 +1,377 @@
import logging
from abc import abstractmethod
from typing import Any, Dict, List, Optional
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from ners.research.base_model import BaseModel
from ners.research.experiment.feature_extractor import FeatureExtractor
class NeuralNetworkModel(BaseModel):
"""Base class for neural network models (TensorFlow/Keras)"""
@property
def architecture(self) -> str:
return "neural_network"
@abstractmethod
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
"""Build neural network model with known vocabulary size"""
pass
def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
"""Fit the neural network model with deferred building"""
logging.info(f"Training {self.__class__.__name__}")
# Best-effort GPU configuration for TensorFlow when available
# - Enables memory growth to avoid pre-allocating all VRAM
# - Optionally enables mixed precision if requested via model params
try:
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
gpus = tf.config.list_physical_devices("GPU")
if gpus:
for gpu in gpus:
try:
tf.config.experimental.set_memory_growth(gpu, True)
except Exception:
pass
if enable_mixed:
try:
tf.keras.mixed_precision.set_global_policy("mixed_float16")
logging.info("Enabled TensorFlow mixed precision (float16)")
except Exception as e:
logging.warning(f"Could not enable mixed precision: {e}")
else:
if requested_gpu:
logging.warning(
"Requested GPU but no TensorFlow GPU device is available."
)
except Exception as e:
# Keep silent in non-TF environments / non-NN workflows
logging.debug(f"TensorFlow GPU setup skipped: {e}")
# Setup feature extraction
if self.feature_extractor is None:
self.feature_extractor = FeatureExtractor(
self.config.features, self.config.feature_params
)
# Extract and prepare features (this will also initialize tokenizer)
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
# Sanitize any out-of-range indices to avoid embedding scatter errors
X_prepared = self._sanitize_sequences(X_prepared)
# Encode labels
if self.label_encoder is None:
self.label_encoder = LabelEncoder()
y_encoded = self.label_encoder.fit_transform(y)
else:
y_encoded = self.label_encoder.transform(y)
# Now we can build the model with known vocab size
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
logging.info(f"Vocabulary size: {vocab_size}")
# Get additional model parameters
self.model = self.build_model_with_vocab(
vocab_size=vocab_size, **self.config.model_params
)
# Train the neural network
logging.info(
f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
)
        logging.info(f"Sample prepared row: {X_prepared[0]}")
logging.info(f"Model parameters: {self.config.model_params}")
history = self.model.fit(
X_prepared,
y_encoded,
epochs=self.config.model_params.get("epochs", 10),
batch_size=self.config.model_params.get("batch_size", 64),
validation_split=self.config.model_params.get("validation_split", 0.1),
verbose=2,
)
# Store training history
self.training_history = {
"accuracy": history.history["accuracy"],
"loss": history.history["loss"],
"val_accuracy": history.history.get("val_accuracy", []),
"val_loss": history.history.get("val_loss", []),
}
self.is_fitted = True
return self
def _sanitize_sequences(self, sequences: np.ndarray) -> np.ndarray:
"""Clamp invalid token indices to OOV and ensure int32 dtype.
This prevents rare cases where malformed inputs or dtype issues introduce
large or negative indices which can trigger TensorScatterUpdate errors
during embedding updates on GPU.
"""
try:
if sequences is None:
return sequences
arr = np.asarray(sequences)
# Ensure integer dtype for embedding lookups
if not np.issubdtype(arr.dtype, np.integer):
arr = arr.astype(np.int64, copy=False)
if self.tokenizer is not None and hasattr(self.tokenizer, "word_index"):
# Use the actual max index present in the tokenizer mapping
if self.tokenizer.word_index:
max_idx = max(self.tokenizer.word_index.values())
else:
max_idx = 0
# OOV token index if available, else fall back to 1
oov_index = self.tokenizer.word_index.get(
getattr(self.tokenizer, "oov_token", "<OOV>"), 1
)
# Keep zeros (padding) untouched; clamp negatives and > max_idx to OOV
invalid_mask = (arr < 0) | (arr > max_idx)
# Avoid turning zeros into OOV
invalid_mask &= arr != 0
if invalid_mask.any():
arr[invalid_mask] = oov_index
# Use int32 for TF embedding ops compatibility
return arr.astype(np.int32, copy=False)
except Exception as e:
logging.debug(f"Sequence sanitization skipped due to: {e}")
return sequences
def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
"""Combine configured textual features into one string per record."""
column_names = [
feature.value
for feature in self.config.features
if feature.value in X.columns
]
if not column_names:
raise ValueError(
"No configured text features found in the provided DataFrame."
)
text_frame = X[column_names].fillna("").astype(str)
if len(column_names) == 1:
return text_frame.iloc[:, 0].tolist()
combined_rows = []
for row in text_frame.itertuples(index=False):
tokens = [value for value in row if value]
combined_rows.append(" ".join(tokens))
return combined_rows
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> dict[str, np.floating[Any]]:
# Ensure TF GPU/mixed-precision config also applies to CV runs
try:
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
gpus = tf.config.list_physical_devices("GPU")
if gpus:
for gpu in gpus:
try:
tf.config.experimental.set_memory_growth(gpu, True)
except Exception:
pass
if enable_mixed:
try:
tf.keras.mixed_precision.set_global_policy("mixed_float16")
except Exception:
pass
else:
if requested_gpu:
logging.warning("Requested GPU for CV but none is available.")
except Exception:
pass
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
X_prepared = self._sanitize_sequences(X_prepared)
y_encoded = self.label_encoder.transform(y)
cv = StratifiedKFold(
n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
)
accuracies = []
precisions = []
recalls = []
f1_scores = []
# Get vocabulary size and model parameters
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
        max_len = self.config.model_params.get("max_len", 6)
        # Merge so "max_len" is not passed twice when it already appears in
        # model_params (a duplicate keyword argument would raise TypeError).
        fold_params = {**self.config.model_params, "max_len": max_len}
        for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)):
            # Create a fresh model for each fold using build_model_with_vocab
            fold_model = self.build_model_with_vocab(vocab_size=vocab_size, **fold_params)
# Train on fold
if hasattr(fold_model, "fit"):
fold_model.fit(
X_prepared[train_idx],
y_encoded[train_idx],
epochs=self.config.model_params.get("epochs", 10),
batch_size=self.config.model_params.get("batch_size", 32),
verbose=0,
)
# Predict on validation
y_pred = fold_model.predict(X_prepared[val_idx])
if len(y_pred.shape) > 1:
y_pred = y_pred.argmax(axis=1)
# Calculate metrics
acc = accuracy_score(y_encoded[val_idx], y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
y_encoded[val_idx], y_pred, average="weighted"
)
accuracies.append(acc)
precisions.append(prec)
recalls.append(rec)
f1_scores.append(f1)
return {
"accuracy": np.mean(accuracies),
"precision": np.mean(precisions),
"recall": np.mean(recalls),
"f1": np.mean(f1_scores),
}
def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}")
# Ensure TF GPU/mixed-precision config also applies here
try:
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
gpus = tf.config.list_physical_devices("GPU")
if gpus:
for gpu in gpus:
try:
tf.config.experimental.set_memory_growth(gpu, True)
except Exception:
pass
if enable_mixed:
try:
tf.keras.mixed_precision.set_global_policy("mixed_float16")
except Exception:
pass
else:
if requested_gpu:
logging.warning(
"Requested GPU for learning curve but none is available."
)
except Exception:
pass
if train_sizes is None:
train_sizes = [0.1, 0.3, 0.5, 0.7, 1.0]
learning_curve_data = {
"train_sizes": [],
"train_scores": [],
"val_scores": [],
"train_scores_std": [],
"val_scores_std": [],
}
# Prepare features and get vocabulary size
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
X_prepared = self._sanitize_sequences(X_prepared)
y_encoded = self.label_encoder.transform(y)
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
max_len = self.config.model_params.get("max_len", 6)
# Split data once for validation
X_train_full, X_val, y_train_full, y_val = train_test_split(
X_prepared,
y_encoded,
test_size=0.2,
random_state=self.config.random_seed,
stratify=y_encoded,
)
for size in train_sizes:
train_size = int(len(X_train_full) * size)
if train_size < 10: # Minimum training size
continue
# Sample training data
indices = np.random.choice(len(X_train_full), train_size, replace=False)
X_train_subset = X_train_full[indices]
y_train_subset = y_train_full[indices]
# Train multiple models for variance estimation
train_scores = []
val_scores = []
for seed in range(3): # 3 runs for variance
                # Build a fresh model; merge params so "max_len" is not passed
                # twice when it already appears in model_params.
                model = self.build_model_with_vocab(
                    vocab_size=vocab_size,
                    **{**self.config.model_params, "max_len": max_len},
                )
# Train model
if hasattr(model, "fit"):
model.fit(
X_train_subset,
y_train_subset,
epochs=self.config.model_params.get("epochs", 10),
batch_size=self.config.model_params.get("batch_size", 32),
validation_data=(X_val, y_val),
verbose=0,
)
# Evaluate
train_pred = model.predict(X_train_subset)
val_pred = model.predict(X_val)
train_acc = accuracy_score(y_train_subset, train_pred.argmax(axis=1))
val_acc = accuracy_score(y_val, val_pred.argmax(axis=1))
train_scores.append(train_acc)
val_scores.append(val_acc)
learning_curve_data["train_sizes"].append(train_size)
learning_curve_data["train_scores"].append(np.mean(train_scores))
learning_curve_data["val_scores"].append(np.mean(val_scores))
learning_curve_data["train_scores_std"].append(np.std(train_scores))
learning_curve_data["val_scores_std"].append(np.std(val_scores))
self.learning_curve_data = learning_curve_data
return learning_curve_data
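# --- Hedged sketch (illustration only): the clamping _sanitize_sequences
# performs, on a raw array; padding zeros are kept, while negatives and
# indices above max_idx map to the OOV index (assumed to be 1 here).
if __name__ == "__main__":
    arr = np.array([[0, 3, 99, -2]])
    max_idx, oov_index = 5, 1
    invalid = ((arr < 0) | (arr > max_idx)) & (arr != 0)
    arr[invalid] = oov_index
    print(arr.astype(np.int32))  # [[0 3 1 1]]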
+1
View File
@@ -0,0 +1 @@
LETTERS = "abcdefghijklmnopqrstuvwxyz"
+54
View File
@@ -0,0 +1,54 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ners.research.statistics.utils import LETTERS, build_letter_frequencies
def plot_transition_matrix(ax, df_probs, title=""):
hm = sns.heatmap(
df_probs.loc[list(LETTERS), list(LETTERS)],
cmap="Reds",
annot=False,
cbar=False,
ax=ax,
)
ax.set_title(title, fontsize=12)
return hm
def plot_letter_frequencies(males, females, sort_values=False, title="names"):
# Compute frequencies
L_m = build_letter_frequencies(males["name"]).set_index("letter")["freq"]
L_f = build_letter_frequencies(females["name"]).set_index("letter")["freq"]
# Combine into one DataFrame
df_plot = pd.DataFrame({"Male": L_m, "Female": L_f}).fillna(0).reset_index()
df_plot.to_csv(f"../assets/{title}_letter_frequencies.csv", index=False)
# Optional sorting
if sort_values:
df_plot = df_plot.sort_values("Male", ascending=False)
# Plot side-by-side bars
x = np.arange(len(df_plot))
w = 0.4
fig, ax = plt.subplots(figsize=(16, 6))
ax.bar(
x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8
)
ax.bar(
x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8
)
ax.set_xticks(x)
ax.set_xticklabels(df_plot["letter"])
ax.set_ylabel("Frequency")
ax.set_xlabel("Letter")
ax.set_title(f"{title} - Letter Frequencies")
ax.legend()
ax.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()
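# --- Hedged sketch (illustration only): a minimal call with toy frames.
# Assumes a "name" column and an existing ../assets directory, since the
# function writes a CSV there before plotting.
if __name__ == "__main__":
    males = pd.DataFrame({"name": ["Jean", "Joseph"]})
    females = pd.DataFrame({"name": ["Marie", "Grace"]})
    plot_letter_frequencies(males, females, sort_values=True, title="toy")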
+276
View File
@@ -0,0 +1,276 @@
import re
import unicodedata
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean
from scipy.stats import entropy
from collections import Counter
from typing import Any, Dict, Literal
LETTERS = "abcdefghijklmnopqrstuvwxyz"
START_TOKEN = "^"
END_TOKEN = "$"
def normalize_letters(s):
"""Normalize accents -> ascii, lowercase, keep only a-z."""
s = str(s)
s = unicodedata.normalize("NFKD", s)
s = s.encode("ascii", errors="ignore").decode("utf-8")
s = s.lower()
s = re.sub(r"[^a-z]", "", s)
return s
def build_category_distribution(df: pd.DataFrame) -> pd.DataFrame:
return (
df.groupby("province")["identified_category"]
        .value_counts(normalize=True)  # proportions within each province
        .unstack(fill_value=0)  # reshape into one column per category
)
def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
# Normalize + split once (vectorized)
s = df[source].fillna("").astype(str)
s = s.str.lower().str.replace(r"[^\w'\-]+", " ", regex=True).str.strip().str.split()
# Explode the token list into rows under `target`
out = df.assign(**{target: s}).explode(target, ignore_index=True)
# Drop NA/empty tokens and strip whitespace
out[target] = out[target].astype(str).str.strip()
out = out[out[target].ne("")].dropna(subset=[target]).reset_index(drop=True)
return out
def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
# Normalize: lowercase, remove non-letters, concatenate all into one string
s = (
series.astype(str)
.str.lower()
.str.replace(r"[^a-z]", "", regex=True)
.str.cat(sep="")
)
# Convert string into Series of characters
chars = pd.Series(list(s))
# Count letters and ensure all letters are present
out = (
chars.value_counts(normalize=False)
.reindex(list(LETTERS), fill_value=0)
.rename_axis("letter")
.reset_index(name="count")
)
# Relative frequency
total = out["count"].sum()
out["freq"] = out["count"] / (total if total > 0 else 1)
return out
def build_transition_probabilities(names: pd.Series, alpha: float = 0.0) -> dict:
# 1) Normalize
names = names.astype(str).str.lower().str.replace(rf"[^{LETTERS}]", "", regex=True)
names = names[names.str.len() > 0]
# 2) Prepare sequences
sequences = (START_TOKEN + names + END_TOKEN).tolist()
# 3) Tokens and indices
tokens = [START_TOKEN] + list(LETTERS) + [END_TOKEN]
index = {t: i for i, t in enumerate(tokens)}
V = len(tokens)
# 4) ASCII lookup table (O(1) char -> idx); others -> -1
lut = np.full(128, -1, dtype=np.int32)
for ch, i in index.items():
lut[ord(ch)] = i
    # 5) Concatenate with a separator that's not in the vocab to kill cross-boundary pairs
concat = (" ".join(sequences)).encode("ascii", errors="ignore")
# 6) Map bytes to indices
arr = np.frombuffer(concat, dtype=np.uint8)
idx = lut[arr]
# 7) Build bigram pairs; drop invalid ones (separator & OOV)
a = idx[:-1]
b = idx[1:]
mask = (a >= 0) & (b >= 0)
a, b = a[mask], b[mask]
# 8) Count with a single bincount
lin = a * V + b
counts = np.bincount(lin, minlength=V * V).reshape(V, V)
# 9) Optional Laplace smoothing
if alpha and alpha > 0:
counts = counts + alpha
# 10) Row-normalize to probabilities
row_sums = counts.sum(axis=1, keepdims=True)
    # Avoid division by zero for rows that never occur
    probs = counts / np.where(row_sums == 0, 1.0, row_sums)
# 11) DataFrames
df_counts = pd.DataFrame(counts, index=tokens, columns=tokens)
df_probs = pd.DataFrame(probs, index=tokens, columns=tokens)
return {
"tokens": tokens,
"index": index,
"counts": counts,
"df_counts": df_counts,
"probs": probs,
"df_probs": df_probs,
}
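# --- Hedged sketch (illustration only): bigram transitions for two toy names.
# Both "ana" and "ann" start with 'a', so P(a | ^) comes out as 1.0.
if __name__ == "__main__":
    res = build_transition_probabilities(pd.Series(["ana", "ann"]))
    print(res["df_probs"].loc[START_TOKEN, "a"])  # 1.0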
def build_transition_comparisons(
names_transitions: Dict[str, Any],
surnames_transitions: Dict[str, Any],
n_permutations: int = 1000,
) -> pd.DataFrame:
"""
Compares letter transition probability matrices for names and surnames using
various distance metrics and a permutation test for statistical significance.
"""
# Helper function to flatten and smooth matrices
def prepare_data(data):
return {"m": data["m"]["probs"].flatten(), "f": data["f"]["probs"].flatten()}
prepared_names = prepare_data(names_transitions)
prepared_surnames = prepare_data(surnames_transitions)
# Distance Metrics
names_l2 = euclidean(prepared_names["m"], prepared_names["f"])
surnames_l2 = euclidean(prepared_surnames["m"], prepared_surnames["f"])
kl_names_mf = entropy(prepared_names["m"] + 1e-12, prepared_names["f"] + 1e-12)
kl_names_fm = entropy(prepared_names["f"] + 1e-12, prepared_names["m"] + 1e-12)
kl_surnames_mf = entropy(
prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12
)
kl_surnames_fm = entropy(
prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12
)
jsd_names = 0.5 * (kl_names_mf + kl_names_fm)
jsd_surnames = 0.5 * (kl_surnames_mf + kl_surnames_fm)
# Permutation Test
def run_permutation_test(transitions):
# Flattened probabilities for male and female
P_m = transitions["m"]["probs"].flatten()
P_f = transitions["f"]["probs"].flatten()
# Calculate the observed JSD (our test statistic)
observed_jsd = 0.5 * (
entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12)
)
# Concatenate male and female counts
counts_m = transitions["m"]["counts"]
counts_f = transitions["f"]["counts"]
all_counts = np.concatenate((counts_m, counts_f), axis=1)
total_counts = counts_m.shape[1] + counts_f.shape[1]
permuted_jsds = []
for _ in range(n_permutations):
# Shuffle the columns (names) and split back into two groups
shuffled_indices = np.random.permutation(total_counts)
# Note: This is a simplified approach, assuming counts are
# structured per name. A more robust implementation would
# shuffle the actual names themselves.
permuted_counts_m = all_counts[:, shuffled_indices[: counts_m.shape[1]]]
permuted_counts_f = all_counts[:, shuffled_indices[counts_m.shape[1] :]]
# Re-calculate probabilities and JSD for the permuted groups
# Add a small epsilon to the denominator to prevent division by zero
epsilon = 1e-12
permuted_probs_m = permuted_counts_m / (
permuted_counts_m.sum(axis=0, keepdims=True) + epsilon
)
permuted_probs_f = permuted_counts_f / (
permuted_counts_f.sum(axis=0, keepdims=True) + epsilon
)
permuted_jsd = 0.5 * (
entropy(
permuted_probs_m.mean(axis=1) + 1e-12,
permuted_probs_f.mean(axis=1) + 1e-12,
)
+ entropy(
permuted_probs_f.mean(axis=1) + 1e-12,
permuted_probs_m.mean(axis=1) + 1e-12,
)
)
permuted_jsds.append(permuted_jsd)
# Calculate the p-value
p_value = np.mean(np.array(permuted_jsds) >= observed_jsd)
return p_value
names_p_value = run_permutation_test(names_transitions)
surnames_p_value = run_permutation_test(surnames_transitions)
out = pd.DataFrame(
{
"l2": [names_l2, surnames_l2],
"kl_mf": [kl_names_mf, kl_surnames_mf],
"kl_fm": [kl_names_fm, kl_surnames_fm],
"jsd": [jsd_names, jsd_surnames],
"permutation_p_value": [names_p_value, surnames_p_value],
},
index=["names", "surnames"],
)
return out
def build_ngrams_count(
df: pd.DataFrame,
n: int,
where: Literal["any", "prefix", "suffix"] = "any",
) -> pd.DataFrame:
    # Normalize and clean to a-z
names = df["name"].astype(str).str.lower().str.replace(r"[^a-z]", "", regex=True)
ngrams = []
if where == "any":
for s in names:
L = len(s)
if L >= n:
ngrams.extend(s[i : i + n] for i in range(L - n + 1))
elif where == "prefix":
for s in names:
if len(s) >= n:
ngrams.append(s[:n])
elif where == "suffix":
for s in names:
if len(s) >= n:
ngrams.append(s[-n:])
else:
raise ValueError("where must be one of: 'any', 'prefix', 'suffix'")
counter = Counter(ngrams)
out = (
pd.DataFrame(counter.items(), columns=[f"{n}-gram", "count"])
.sort_values("count", ascending=False, kind="mergesort")
.reset_index(drop=True)
)
total = out["count"].sum()
out["freq"] = out["count"] / (total if total > 0 else 1)
return out
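# --- Hedged sketch (illustration only): suffix bigram counts on a toy frame.
if __name__ == "__main__":
    toy = pd.DataFrame({"name": ["Mukendi", "Kabamba", "Mbuyi"]})
    print(build_ngrams_count(toy, n=2, where="suffix"))
    # 'di', 'ba' and 'yi' appear once each; the freq column sums to 1.0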
+163
View File
@@ -0,0 +1,163 @@
import logging
from abc import abstractmethod
from typing import Any, Dict, List, Optional
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import LabelEncoder
from ners.research.base_model import BaseModel
from ners.research.experiment.feature_extractor import FeatureExtractor
class TraditionalModel(BaseModel):
"""Base class for traditional ML models (scikit-learn compatible)"""
@property
def architecture(self) -> str:
return "traditional"
@abstractmethod
def build_model(self) -> BaseEstimator:
"""Build and return the sklearn model instance"""
pass
def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
"""Fit the traditional ML model"""
logging.info(f"Training {self.__class__.__name__}")
# Build model if not already built
if self.model is None:
self.model = self.build_model()
# Setup feature extraction
if self.feature_extractor is None:
self.feature_extractor = FeatureExtractor(
self.config.features, self.config.feature_params
)
# Extract and prepare features
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
# Encode labels
if self.label_encoder is None:
self.label_encoder = LabelEncoder()
y_encoded = self.label_encoder.fit_transform(y)
else:
y_encoded = self.label_encoder.transform(y)
# Train model
if len(X_prepared.shape) == 1:
# For text-based features (like LogisticRegression with vectorization)
logging.info(
f"Fitting model with {X_prepared.shape[0]} samples (text features)"
)
else:
# For numerical features
logging.info(
f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
)
        logging.info(f"Sample prepared row: {X_prepared[0]}")
        logging.info(f"Model parameters: {self.config.model_params}")
        # sklearn estimators return self from fit() and expose no Keras-style
        # history object, so there is no per-epoch curve to record here.
        self.model.fit(X_prepared, y_encoded)
        self.is_fitted = True
        self.training_history = {}
return self
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> Dict[str, float]:
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
y_encoded = self.label_encoder.transform(y)
cv = StratifiedKFold(
n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
)
# Calculate different metrics
results = {}
# Accuracy
accuracy_scores = cross_val_score(
self.model, X_prepared, y_encoded, cv=cv, scoring="accuracy"
)
results["accuracy"] = accuracy_scores.mean()
results["accuracy_std"] = accuracy_scores.std()
# Precision, Recall, F1
for metric in ["precision", "recall", "f1"]:
if metric in self.config.metrics:
scores = cross_val_score(
self.model,
X_prepared,
y_encoded,
cv=cv,
scoring=f"{metric}_weighted",
)
results[metric] = scores.mean()
results[f"{metric}_std"] = scores.std()
return results
def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}")
if train_sizes is None:
train_sizes = [0.1, 0.25, 0.5, 0.75, 1.0]
# Prepare features
if self.feature_extractor is None:
self.feature_extractor = FeatureExtractor(
self.config.features, self.config.feature_params
)
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
# Encode labels
if self.label_encoder is None:
self.label_encoder = LabelEncoder()
y_encoded = self.label_encoder.fit_transform(y)
else:
y_encoded = self.label_encoder.transform(y)
try:
train_sizes_abs, train_scores, val_scores = learning_curve(
self.build_model(),
X_prepared,
y_encoded,
train_sizes=train_sizes,
cv=3, # Use 3-fold CV for speed
scoring="accuracy",
random_state=self.config.random_seed,
)
learning_curve_data = {
"train_sizes": train_sizes_abs.tolist(),
"train_scores": train_scores.mean(axis=1).tolist(),
"val_scores": val_scores.mean(axis=1).tolist(),
"train_scores_std": train_scores.std(axis=1).tolist(),
"val_scores_std": val_scores.std(axis=1).tolist(),
}
except Exception as e:
logging.warning(f"Could not generate learning curve: {e}")
return {}
self.learning_curve_data = learning_curve_data
return learning_curve_data
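# --- Hedged sketch (illustration only): the cross-validation pattern used in
# cross_validate above, run on synthetic data so it needs no project config.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    X_toy, y_toy = make_classification(n_samples=200, random_state=0)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = cross_val_score(
        LogisticRegression(max_iter=500), X_toy, y_toy, cv=cv, scoring="accuracy"
    )
    print(scores.mean(), scores.std())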
+46
View File
@@ -0,0 +1,46 @@
#!.venv/bin/python3
import logging
import traceback
from ners.core.config import setup_config
from ners.research.experiment.experiment_builder import ExperimentBuilder
from ners.research.model_trainer import ModelTrainer
def train_from_template(
name: str,
type: str,
*,
templates: str = "research_templates.yaml",
config: str | None = None,
env: str = "development",
) -> int:
try:
cfg = setup_config(config_path=config, env=env)
experiment_builder = ExperimentBuilder(cfg)
logging.info(f"Loading research templates from: {templates}")
tmpl = experiment_builder.load_templates(templates)
logging.info(f"Looking for experiment: name='{name}', type='{type}'")
experiment_config = experiment_builder.find_template(tmpl, name, type)
logging.info(f"Found experiment: {experiment_config.get('name')}")
logging.info(f"Description: {experiment_config.get('description')}")
logging.info(f"Features: {experiment_config.get('features')}")
trainer = ModelTrainer(cfg)
trainer.train_single_model(
model_name=experiment_config.get("name"),
model_type=experiment_config.get("model_type"),
features=experiment_config.get("features"),
model_params=experiment_config.get("model_params", {}),
tags=experiment_config.get("tags", []),
)
logging.info("Training completed successfully!")
return 0
except Exception as e:
logging.error(f"Training failed: {e}")
traceback.print_exc()
return 1
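# --- Hedged sketch (illustration only): direct invocation of the helper
# above; the template and experiment names here are hypothetical.
if __name__ == "__main__":
    raise SystemExit(train_from_template(name="baseline_logistic", type="baseline"))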
+1
View File
@@ -0,0 +1 @@
+67
View File
@@ -0,0 +1,67 @@
#!.venv/bin/python3
import os
import streamlit as st
from ners.core.config import setup_config, PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
# Page configuration
st.set_page_config(
page_title="DRC NERS Platform",
page_icon="🇨🇩",
layout="wide",
initial_sidebar_state="expanded",
)
def initialize_session_state(config: PipelineConfig):
"""Initialize session state variables"""
if "config" not in st.session_state:
st.session_state.config = config
if "data_loader" not in st.session_state:
st.session_state.data_loader = DataLoader(config)
if "experiment_tracker" not in st.session_state:
st.session_state.experiment_tracker = ExperimentTracker(config)
if "experiment_runner" not in st.session_state:
st.session_state.experiment_runner = ExperimentRunner(config)
if "pipeline_monitor" not in st.session_state:
st.session_state.pipeline_monitor = PipelineMonitor()
if "current_experiment" not in st.session_state:
st.session_state.current_experiment = None
if "experiment_results" not in st.session_state:
st.session_state.experiment_results = {}
class StreamlitApp:
def __init__(self, config: PipelineConfig):
self.config = config
initialize_session_state(config)
@classmethod
def run(cls):
st.title("🇨🇩 DRC NERS Platform")
st.markdown(
"A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
)
st.markdown(
"""
## Overview
Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often
underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training
data.
This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5
million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
"""
)
# Initialize app using environment variables when launched via Typer
_config_path = os.environ.get("NERS_CONFIG")
_env = os.environ.get("NERS_ENV", "development")
_cfg = setup_config(_config_path, env=_env)
_app = StreamlitApp(_cfg)
_app.run()
+1
View File
@@ -0,0 +1 @@
from .ner_testing import NERTesting
+10
View File
@@ -0,0 +1,10 @@
import streamlit as st
class Configuration:
def __init__(self, config):
self.config = config
def index(self):
st.title("Configuration")
st.json(self.config.model_dump())
+90
View File
@@ -0,0 +1,90 @@
import pandas as pd
import streamlit as st
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class Dashboard:
def __init__(self, config, experiment_tracker, experiment_runner):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
def index(self):
st.title("Dashboard")
col1, col2, col3, col4, col5 = st.columns(5)
# Load basic statistics
try:
data_path = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
if data_path.exists():
df = load_dataset(str(data_path))
with col1:
st.metric("Total Names", f"{len(df):,}")
with col2:
                        # Guard against a missing "annotated" column: df.get
                        # would return a plain int and break the comparison.
                        annotated = (
                            int((df["annotated"] == 1).sum())
                            if "annotated" in df.columns
                            else 0
                        )
st.metric("Annotated Names", f"{annotated:,}")
with col3:
provinces = (
df["province"].nunique() if "province" in df.columns else 0
)
st.metric("Provinces", provinces)
with col4:
if "sex" in df.columns:
gender_dist = df["sex"].value_counts()
ratio = gender_dist.get("f", 0) / max(
gender_dist.get("m", 1), 1
)
st.metric("F/M Rate", f"{ratio:.2%}")
with col5:
if "annotated" in df.columns:
annotated = (df.get("annotated", 0) == 1).sum()
ratio = annotated / len(df) if len(df) > 0 else 0
st.metric("Annotation Rate", f"{ratio:.2%}")
else:
st.warning("No processed data found. Please run data processing first.")
except Exception as e:
st.error(f"Error loading dashboard data: {e}")
# Recent experiments
st.subheader("Recent Experiments")
experiments = self.experiment_tracker.list_experiments()[:5]
if experiments:
exp_data = []
for exp in experiments:
exp_data.append(
{
"Name": exp.config.name,
"Model": exp.config.model_type,
"Status": exp.status.value,
"Accuracy": (
f"{exp.test_metrics.get('accuracy', 0):.3f}"
if exp.test_metrics
else "N/A"
),
"Date": exp.start_time.strftime("%Y-%m-%d %H:%M"),
}
)
st.dataframe(pd.DataFrame(exp_data), use_container_width=True)
else:
st.info(
"No experiments found. Create your first experiment in the Experiments tab!"
)
+52
View File
@@ -0,0 +1,52 @@
from datetime import datetime
import pandas as pd
import streamlit as st
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class DataOverview:
def __init__(self, config):
self.config = config
def index(self):
st.title("Data Overview")
data_files = {
"Names": self.config.data.input_file,
"Featured Dataset": self.config.data.output_files["featured"],
"Evaluation Dataset": self.config.data.output_files["evaluation"],
"Male Names": self.config.data.output_files["males"],
"Female Names": self.config.data.output_files["females"],
}
st.write("Available Data Files:")
for name, rel_path in data_files.items():
file_path = self.config.paths.get_data_path(rel_path)
exists = file_path.exists()
size = file_path.stat().st_size if exists else 0
stats = (
f"Size: {size / (1024 * 1024):.1f} MB, Last Modified: {datetime.fromtimestamp(file_path.stat().st_mtime)}"
if exists
else "Not found"
)
st.write(f"- {name}: {file_path} ({stats})")
# Preview featured dataset if available
data_path = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
if data_path.exists():
df = load_dataset(str(data_path))
st.subheader("Featured Dataset Preview")
st.dataframe(df.head(), use_container_width=True)
st.write(f"Rows: {len(df):,}")
+141
View File
@@ -0,0 +1,141 @@
import pandas as pd
import plotly.express as px
import streamlit as st
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
from ners.web.interfaces.log_reader import LogReader
@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class DataProcessing:
def __init__(self, config, pipeline_monitor):
self.config = config
self.pipeline_monitor = pipeline_monitor
def index(self):
st.title("Data Processing")
status = self.pipeline_monitor.get_pipeline_status()
# Overall progress
overall_progress = status["overall_completion"] / 100
st.progress(overall_progress)
st.write(f"Overall Progress: {status['overall_completion']:.1f}%")
# Step details
for step_name, step_status in status["steps"].items():
with st.expander(
f"{step_name.replace('_', ' ').title()} - {step_status['status']}"
):
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Processed Batches", step_status["processed_batches"])
with col2:
st.metric("Total Batches", step_status["total_batches"])
with col3:
st.metric("Failed Batches", step_status["failed_batches"])
if step_status["completion_percentage"] > 0:
st.progress(step_status["completion_percentage"] / 100)
# Read actual log entries from the log file
st.subheader("Recent Processing Logs")
try:
log_file_path = self.config.paths.logs_dir / "pipeline.development.log"
log_reader = LogReader(log_file_path)
# Options for filtering logs
col1, col2 = st.columns(2)
with col1:
log_level_filter = st.selectbox(
"Filter by Level",
["All", "INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"],
key="log_level_filter",
)
with col2:
num_entries = st.number_input(
"Number of entries",
min_value=5,
max_value=50,
value=10,
key="num_log_entries",
)
# Get log entries based on filter
if log_level_filter == "All":
log_entries = log_reader.read_last_entries(num_entries)
else:
log_entries = log_reader.read_entries_by_level(
log_level_filter, num_entries
)
if log_entries:
for entry in log_entries:
if entry.level == "ERROR":
st.error(
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
)
elif entry.level == "WARNING":
st.warning(
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
)
elif entry.level == "INFO":
st.info(
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
)
else:
st.text(
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
)
# Show log statistics
st.subheader("Log Statistics")
log_stats = log_reader.get_log_stats()
if log_stats:
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Lines", log_stats.get("total_lines", 0))
with col2:
st.metric("INFO", log_stats.get("INFO", 0))
with col3:
st.metric("WARNING", log_stats.get("WARNING", 0))
with col4:
st.metric("ERROR", log_stats.get("ERROR", 0))
# Log level distribution chart
levels = ["INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"]
counts = [log_stats.get(level, 0) for level in levels]
if sum(counts) > 0:
fig = px.bar(
x=levels,
y=counts,
title="Log Entries by Level",
color=levels,
color_discrete_map={
"INFO": "blue",
"WARNING": "orange",
"ERROR": "red",
"DEBUG": "gray",
"CRITICAL": "darkred",
},
)
st.plotly_chart(fig, use_container_width=True)
else:
st.info("No log entries found or log file is empty.")
except Exception as e:
st.error(f"Error reading log file: {e}")
+434
View File
@@ -0,0 +1,434 @@
from typing import List, Dict
import streamlit as st
from ners.core.config.pipeline_config import PipelineConfig
from ners.research.experiment import ExperimentConfig, ExperimentStatus
from ners.research.experiment.experiment_builder import ExperimentBuilder
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.experiment.feature_extractor import FeatureType
from ners.research.model_registry import list_available_models
class Experiments:
def __init__(
self,
config: PipelineConfig,
experiment_tracker: ExperimentTracker,
experiment_runner: ExperimentRunner,
):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
self.experiment_builder = ExperimentBuilder(config)
def index(self):
st.title("Experiments")
tab1, tab2, tab3 = st.tabs(["Templates", "Experiments", "Batch Experiments"])
with tab1:
self.show_template_experiments()
with tab2:
self.show_experiment_list()
with tab3:
self.show_batch_experiments()
def show_template_experiments(self):
"""Show interface for running predefined template experiments"""
st.subheader("Template Experiments")
st.write("Run predefined experiments based on research templates.")
try:
available_experiments = self.experiment_builder.get_templates()
# Create tabs for different experiment types
exp_tabs = st.tabs(
["Baseline", "Advanced", "Feature Studies", "Hyperparameter Tuning"]
)
with exp_tabs[0]:
self._show_experiments_by_type(
available_experiments["baseline"], "baseline"
)
with exp_tabs[1]:
self._show_experiments_by_type(
available_experiments["advanced"], "advanced"
)
with exp_tabs[2]:
self._show_experiments_by_type(
available_experiments["feature_study"], "feature_study"
)
with exp_tabs[3]:
self._show_experiments_by_type(
available_experiments["tuning"], "tuning"
)
except Exception as e:
st.error(f"Error loading experiment templates: {e}")
st.info(
"Make sure the research templates file exists at `config/research_templates.yaml`"
)
def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str):
"""Show experiments for a specific type"""
if not experiments:
st.info(f"No {experiment_type} experiments available in templates.")
return
st.write(f"**{experiment_type.title()} Experiments**")
# Show available experiments
for i, exp_template in enumerate(experiments):
exp_name = exp_template.get("name", f"Experiment {i + 1}")
exp_description = exp_template.get(
"description", "No description available"
)
with st.expander(f"📊 {exp_name} - {exp_description}"):
col1, col2 = st.columns([2, 1])
with col1:
st.json(exp_template)
with col2:
if st.button("🚀 Run Experiment", key=f"run_{experiment_type}_{i}"):
self._run_template_experiment(exp_template)
def _run_template_experiment(self, exp_template: Dict):
"""Run a template experiment"""
try:
with st.spinner(f"Running {exp_template.get('name')}..."):
# Create experiment config from template
experiment_config = self.experiment_builder.from_template(exp_template)
# Run the experiment
experiment_id = self.experiment_runner.run_experiment(experiment_config)
st.success(
f"Experiment '{experiment_config.name}' completed successfully!"
)
st.info(f"Experiment ID: `{experiment_id}`")
# Show results
experiment = self.experiment_tracker.get_experiment(experiment_id)
if experiment and experiment.test_metrics:
st.write("**Results:**")
col1, col2, col3 = st.columns(3)
metrics = list(experiment.test_metrics.items())
for i, (metric, value) in enumerate(metrics):
with [col1, col2, col3][i % 3]:
st.metric(metric.title(), f"{value:.4f}")
except Exception as e:
st.error(f"Error running experiment: {e}")
def show_experiment_list(self):
"""Show list of all experiments with filtering"""
st.subheader("All Experiments")
# Filters
col1, col2, col3 = st.columns(3)
with col1:
status_filter = st.selectbox(
"Filter by Status", ["All", "completed", "running", "failed", "pending"]
)
with col2:
model_filter = st.selectbox(
"Filter by Model", ["All"] + list_available_models()
)
with col3:
tag_filter = st.text_input("Filter by Tags (comma-separated)")
# Get and filter experiments
experiments = self._get_filtered_experiments(
status_filter, model_filter, tag_filter
)
if not experiments:
st.info("No experiments found matching the filters.")
return
# Display experiments
for i, exp in enumerate(experiments):
with st.expander(
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
):
self._display_experiment_details(exp, i)
def _get_filtered_experiments(
self, status_filter: str, model_filter: str, tag_filter: str
):
"""Get experiments with applied filters"""
experiments = self.experiment_tracker.list_experiments()
# Apply filters
if status_filter != "All":
experiments = [
e for e in experiments if e.status == ExperimentStatus(status_filter)
]
if model_filter != "All":
experiments = [
e for e in experiments if e.config.model_type == model_filter
]
if tag_filter:
tags = [tag.strip() for tag in tag_filter.split(",")]
experiments = [
e for e in experiments if any(tag in e.config.tags for tag in tags)
]
return experiments
@classmethod
def _display_experiment_details(cls, exp, index: int):
"""Display details for a single experiment"""
col1, col2, col3 = st.columns(3)
with col1:
st.write(f"**Model:** {exp.config.model_type}")
st.write(
f"**Features:** {', '.join([f.value for f in exp.config.features])}"
)
st.write(f"**Tags:** {', '.join(exp.config.tags)}")
with col2:
if exp.test_metrics:
for metric, value in exp.test_metrics.items():
st.metric(metric.title(), f"{value:.4f}")
with col3:
st.write(f"**Train Size:** {exp.train_size:,}")
st.write(f"**Test Size:** {exp.test_size:,}")
if st.button("View Details", key=f"details_{index}"):
st.session_state.selected_experiment = exp.experiment_id
st.rerun()
if exp.config.description:
st.write(f"**Description:** {exp.config.description}")
def show_batch_experiments(self):
"""Show interface for running batch experiments"""
st.subheader("Batch Experiments")
st.write("Run multiple experiments with different parameter combinations.")
# Add option to run template batch experiments
batch_type = st.radio(
"Batch Type", ["Template Batch", "Custom Parameter Sweep"]
)
if batch_type == "Template Batch":
self._show_template_batch_experiments()
else:
self._show_custom_batch_experiments()
def _show_template_batch_experiments(self):
"""Show interface for running batch experiments from templates"""
st.write("**Run Multiple Template Experiments**")
try:
available_experiments = self.experiment_builder.get_templates()
# Select experiment types to run
experiment_types = st.multiselect(
"Select Experiment Types",
["baseline", "advanced", "feature_study", "tuning"],
default=["baseline"],
)
if experiment_types:
selected_experiments = []
for exp_type in experiment_types:
experiments = available_experiments.get(exp_type, [])
if experiments:
st.write(f"**{exp_type.title()} Experiments:**")
exp_names = [
exp.get("name", f"Exp {i}")
for i, exp in enumerate(experiments)
]
selected_names = st.multiselect(
f"Select {exp_type} experiments",
exp_names,
key=f"select_{exp_type}",
)
for name in selected_names:
for exp in experiments:
if exp.get("name") == name:
selected_experiments.append(exp)
if st.button("🚀 Run Selected Template Experiments"):
self._run_template_batch_experiments(selected_experiments)
except Exception as e:
st.error(f"Error loading templates for batch experiments: {e}")
def _run_template_batch_experiments(self, selected_experiments: List[Dict]):
"""Run batch experiments from templates"""
if not selected_experiments:
st.warning("No experiments selected")
return
with st.spinner(f"Running {len(selected_experiments)} template experiments..."):
try:
experiment_configs = []
for exp_template in selected_experiments:
config = self.experiment_builder.from_template(exp_template)
experiment_configs.append(config)
# Run batch experiments
experiment_ids = self.experiment_runner.run_experiment_batch(
experiment_configs
)
st.success(f"Completed {len(experiment_ids)} template experiments!")
# Show summary
if experiment_ids:
comparison = self.experiment_runner.compare_experiments(
experiment_ids
)
st.write("**Template Batch Results:**")
st.dataframe(
comparison[["name", "model_type", "test_accuracy"]],
use_container_width=True,
)
except Exception as e:
st.error(f"Error running template batch experiments: {e}")
def _show_custom_batch_experiments(self):
"""Show interface for custom parameter sweep experiments"""
# Parameter sweep configuration
with st.form("batch_experiments"):
st.write("**Parameter Sweep Configuration**")
col1, col2 = st.columns(2)
with col1:
base_name = st.text_input("Base Experiment Name", "parameter_sweep")
model_types = st.multiselect(
"Model Types",
list_available_models(),
default=["logistic_regression"],
)
# N-gram ranges for logistic regression
st.write("**Logistic Regression Parameters**")
ngram_ranges = st.text_area(
"N-gram Ranges (one per line, format: min,max)", "2,4\n2,5\n3,6"
)
with col2:
feature_combinations = st.multiselect(
"Feature Combinations",
[f.value for f in FeatureType],
default=["full_name", "native_name", "surname"],
)
test_sizes = st.text_input(
"Test Sizes (comma-separated)", "0.15,0.2,0.25"
)
tags = st.text_input("Common Tags", "parameter_sweep,batch")
if st.form_submit_button("🚀 Run Parameter Sweep"):
self.run_batch_experiments(
base_name,
model_types,
ngram_ranges,
feature_combinations,
test_sizes,
tags,
)
def run_batch_experiments(
self,
base_name: str,
model_types: List[str],
ngram_ranges: str,
feature_combinations: List[str],
test_sizes: str,
tags: str,
):
"""Run batch experiments with parameter combinations"""
with st.spinner("Running batch experiments..."):
try:
experiments = []
# Parse parameters
ngram_list = []
for line in ngram_ranges.strip().split("\n"):
if "," in line:
min_val, max_val = map(int, line.split(","))
ngram_list.append([min_val, max_val])
test_size_list = [float(x.strip()) for x in test_sizes.split(",")]
tag_list = [tag.strip() for tag in tags.split(",") if tag.strip()]
# Generate experiment combinations
exp_count = 0
for model_type in model_types:
for feature_combo in feature_combinations:
for test_size in test_size_list:
if model_type == "logistic_regression":
for ngram_range in ngram_list:
exp_name = f"{base_name}_{model_type}_{feature_combo}_{ngram_range[0]}_{ngram_range[1]}_{test_size}"
config = ExperimentConfig(
name=exp_name,
description=f"Batch experiment: {model_type} with {feature_combo}",
model_type=model_type,
features=[FeatureType(feature_combo)],
model_params={"ngram_range": ngram_range},
test_size=test_size,
tags=tag_list,
)
experiments.append(config)
exp_count += 1
else:
exp_name = f"{base_name}_{model_type}_{feature_combo}_{test_size}"
config = ExperimentConfig(
name=exp_name,
description=f"Batch experiment: {model_type} with {feature_combo}",
model_type=model_type,
features=[FeatureType(feature_combo)],
test_size=test_size,
tags=tag_list,
)
experiments.append(config)
exp_count += 1
# Run experiments
experiment_ids = self.experiment_runner.run_experiment_batch(
experiments
)
st.success(f"Completed {len(experiment_ids)} batch experiments")
# Show summary
if experiment_ids:
comparison = self.experiment_runner.compare_experiments(
experiment_ids
)
st.write("**Batch Results Summary:**")
st.dataframe(
comparison[["name", "model_type", "test_accuracy"]],
use_container_width=True,
)
except Exception as e:
st.error(f"Error running batch experiments: {e}")
+80
View File
@@ -0,0 +1,80 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List
@dataclass
class LogEntry:
timestamp: datetime
level: str
message: str
class LogReader:
def __init__(self, log_file_path: Path):
self.log_file_path = Path(log_file_path)
def read_last_entries(self, num_entries: int = 20) -> List[LogEntry]:
entries = []
if not self.log_file_path.exists():
return entries
with open(self.log_file_path, "r") as f:
lines = f.readlines()[-num_entries:]
for line in lines:
entry = self._parse_log_line(line)
if entry:
entries.append(entry)
return entries
def read_entries_by_level(
self, level: str, num_entries: int = 20
) -> List[LogEntry]:
entries = []
if not self.log_file_path.exists():
return entries
with open(self.log_file_path, "r") as f:
for line in reversed(f.readlines()):
entry = self._parse_log_line(line)
if entry and entry.level == level:
entries.append(entry)
if len(entries) >= num_entries:
break
return list(reversed(entries))
def get_log_stats(self) -> dict:
if not self.log_file_path.exists():
return {}
stats = {"total_lines": 0}
with open(self.log_file_path, "r") as f:
for line in f:
stats["total_lines"] += 1
entry = self._parse_log_line(line)
if entry:
stats[entry.level] = stats.get(entry.level, 0) + 1
return stats
@staticmethod
def _parse_log_line(line: str) -> LogEntry | None:
try:
# Expected format from logging config: [timestamp] - LEVEL - message
parts = line.strip().split(" - ")
if len(parts) >= 3:
timestamp_str = parts[0].strip("[]")
timestamp = datetime.fromisoformat(timestamp_str)
level = parts[1].strip()
message = " - ".join(parts[2:])
return LogEntry(timestamp, level, message)
except Exception:
return None
return None
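# --- Hedged sketch (illustration only): the line format _parse_log_line
# expects, parsed standalone; the timestamp and message are hypothetical.
if __name__ == "__main__":
    line = "[2025-10-05T18:14:15] - INFO - pipeline step completed"
    print(LogReader._parse_log_line(line))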
+170
View File
@@ -0,0 +1,170 @@
import streamlit as st
from spacy import displacy
from ners.core.config import PipelineConfig
from ners.processing.ner.name_model import NameModel
class NERTesting:
def __init__(self, config: PipelineConfig):
self.config = config
self.model_path = config.paths.models_dir / "drc_ner_model"
self.ner_model = None
self.training_stats = None
self.evaluation_stats = None
def load_ner_model(self) -> bool:
"""Load the trained NER model"""
try:
if self.ner_model is None:
self.ner_model = NameModel(self.config)
self.ner_model.load(str(self.model_path))
self.training_stats = self.ner_model.training_stats
self.evaluation_stats = {}
return True
except Exception as e:
st.error(f"Error loading NER model: {e}")
return False
def index(self):
st.title("Named Entity Recognition")
# Load model
if not self.load_ner_model():
st.warning(
"NER model could not be loaded. Please ensure the model is trained and available."
)
return
# Display model information
self.show_model_training_info()
self.show_model_evaluation_info()
st.markdown("---")
st.subheader("Test the NER Model")
input_method = st.radio("Input Method", ["Single Name", "Multiple Names"])
if input_method == "Single Name":
self.test_single_name()
elif input_method == "Multiple Names":
self.test_multiple_names()
def show_model_training_info(self):
if self.training_stats:
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric(
"Training Examples",
f"{self.training_stats.get('training_examples', 0):,}",
)
with col2:
st.metric("Epochs", self.training_stats.get("epochs", 0))
with col3:
st.metric(
"Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}"
)
with col4:
st.metric("Batch Size", f"{self.training_stats.get('batch_size', 0):,}")
def show_model_evaluation_info(self):
if self.evaluation_stats:
            col1, col2, col3 = st.columns(3)
overall = self.evaluation_stats.get("overall", {})
with col1:
st.metric("Overall Precision", f"{overall['precision']:.2f}")
with col2:
st.metric("Overall Recall", f"{overall['recall']:.2f}")
with col3:
st.metric("Overall F1 Score", f"{overall['f1_score']:.2f}")
st.json(self.evaluation_stats.get("by_label", {}))
def test_single_name(self):
name_input = st.text_input(
"Name:",
placeholder="e.g., Jean Baptiste Mukendi, Marie Kabamba Tshiala, Joseph Kasongo",
help="Enter a full name or multiple names separated by spaces",
)
if name_input.strip():
if st.button("Analyze Name", type="primary"):
self.analyze_and_display(name_input)
def test_multiple_names(self):
names_input = st.text_area(
"Names:",
placeholder="Jean Baptiste Mukendi\nMarie Kabamba Tshiala\nJoseph Kasongo\nGrace Mbuyi Kalala",
height=150,
help="Enter each name on a new line",
)
if names_input.strip():
if st.button("Analyze All Names", type="primary"):
names = [
name.strip() for name in names_input.split("\n") if name.strip()
]
for i, name in enumerate(names):
st.markdown(f"**Name {i + 1}: {name}**")
self.analyze_and_display(name)
if i < len(names) - 1:
st.markdown("---")
def analyze_and_display(self, text: str):
try:
result = self.ner_model.predict(text)
st.subheader("Analysis Results")
entities = result.get("entities", [])
if entities:
self.show_visual_entities(text, entities)
native_count = sum(1 for e in entities if e["label"] == "NATIVE")
surname_count = sum(1 for e in entities if e["label"] == "SURNAME")
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Total Entities", len(entities))
with col2:
st.metric("Native Names", native_count)
with col3:
st.metric("Surnames", surname_count)
else:
st.warning("No entities detected in the input text.")
st.info(
"Try using traditional Congolese names or ensure the spelling is correct."
)
except Exception as e:
st.error(f"Error analyzing text: {e}")
    @staticmethod
    def show_visual_entities(text: str, entities: list):
try:
# Convert our entities format to spaCy format for displacy
ents = []
for entity in entities:
ents.append(
{
"start": entity["start"],
"end": entity["end"],
"label": entity["label"],
}
)
# Create doc-like structure for displacy
doc_data = {"text": text, "ents": ents, "title": None}
            # Custom colors for our labels
            colors = {
                "NATIVE": "#74C0FC",   # light blue
                "SURNAME": "#69DB7C",  # light green
            }
options = {"colors": colors, "distance": 90}
# Generate HTML visualization
html = displacy.render(doc_data, style="ent", manual=True, options=options)
st.markdown(html, unsafe_allow_html=True)
except Exception as e:
st.warning(f"Could not generate visual representation: {e}")
+215
View File
@@ -0,0 +1,215 @@
from typing import Optional
import numpy as np
import pandas as pd
import streamlit as st
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
class Predictions:
def __init__(
self,
config,
experiment_tracker: ExperimentTracker,
experiment_runner: ExperimentRunner,
):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
def index(self):
st.title("Predictions")
# Load available models
experiments = self.experiment_tracker.list_experiments()
completed_experiments = [
e for e in experiments if e.status.value == "completed" and e.model_path
]
if not completed_experiments:
st.warning(
"No trained models available. Please run some experiments first."
)
return
# Model selection
model_options = {
f"{exp.config.name} (Acc: {exp.test_metrics.get('accuracy', 0):.3f})": exp
for exp in completed_experiments
if exp.test_metrics
}
selected_model_name = st.selectbox("Select Model", list(model_options.keys()))
if not selected_model_name:
return
selected_experiment = model_options[selected_model_name]
# Prediction modes
prediction_mode = st.radio(
"Prediction Mode", ["Single Name", "Batch Upload", "Dataset Prediction"]
)
if prediction_mode == "Single Name":
self.show_single_prediction(selected_experiment)
elif prediction_mode == "Batch Upload":
self.show_batch_prediction(selected_experiment)
elif prediction_mode == "Dataset Prediction":
self.show_dataset_prediction(selected_experiment)
def show_single_prediction(self, experiment):
"""Show single name prediction interface"""
name_input = st.text_input(
"Enter a name:", placeholder="e.g., Jean Baptiste Mukendi"
)
if name_input and st.button("Predict Gender"):
try:
# Load the model
model = self.experiment_runner.load_experiment_model(
experiment.experiment_id
)
if model is None:
st.error("Failed to load model")
return
# Create a DataFrame with the input
input_df = self._prepare_single_input(name_input)
# Make prediction
prediction = model.predict(input_df)[0]
# Get prediction probability if available
confidence = self._get_prediction_confidence(model, input_df)
# Display results
self._display_single_prediction_results(
prediction, confidence, experiment, name_input
)
except Exception as e:
st.error(f"Error making prediction: {e}")
def _prepare_single_input(self, name_input: str) -> pd.DataFrame:
"""Prepare single name input for prediction"""
return pd.DataFrame(
{
"name": [name_input],
"words": [len(name_input.split())],
"length": [len(name_input.replace(" ", ""))],
"province": ["unknown"], # Default values
"identified_name": [None],
"identified_surname": [None],
"probable_native": [None],
"probable_surname": [None],
}
)
def _get_prediction_confidence(
self, model, input_df: pd.DataFrame
) -> Optional[float]:
"""Get prediction confidence if available"""
try:
probabilities = model.predict_proba(input_df)[0]
return max(probabilities)
except:
return None
def _display_single_prediction_results(
self, prediction: str, confidence: Optional[float], experiment, name_input: str
):
"""Display single prediction results"""
col1, col2 = st.columns(2)
with col1:
gender_label = "Female" if prediction == "f" else "Male"
st.success(f"**Predicted Gender:** {gender_label}")
        with col2:
            if confidence is not None:
                st.metric("Confidence", f"{confidence:.2%}")
# Additional info
st.info(f"Model used: {experiment.config.name}")
st.info(
f"Features used: {', '.join([f.value for f in experiment.config.features])}"
)
def show_batch_prediction(self, experiment):
uploaded_file = st.file_uploader("Upload CSV file with names", type="csv")
if uploaded_file is not None:
try:
df = pd.read_csv(uploaded_file, dtype=OPTIMIZED_DTYPES)
st.write("**Uploaded Data Preview:**")
st.dataframe(df.head(), use_container_width=True)
# Column selection
df = self._prepare_batch_data(df)
if st.button("Run Batch Prediction"):
self._run_batch_prediction(df, experiment)
except Exception as e:
st.error(f"Error processing file: {e}")
def _prepare_batch_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""Prepare batch data for prediction"""
# Column selection
if "name" not in df.columns:
name_column = st.selectbox("Select the name column:", df.columns)
df = df.rename(columns={name_column: "name"})
# Add missing columns with defaults
required_columns = [
"words",
"length",
"province",
"identified_name",
"identified_surname",
"probable_native",
"probable_surname",
]
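        # words/length can be derived from the name itself; the rest default
        # to None, mirroring _prepare_single_input above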
for col in required_columns:
if col not in df.columns:
if col == "words":
df[col] = df["name"].str.split().str.len()
elif col == "length":
df[col] = df["name"].str.replace(" ", "").str.len()
else:
df[col] = None
return df
def _run_batch_prediction(self, df: pd.DataFrame, experiment):
"""Run batch prediction and display results"""
with st.spinner("Making predictions..."):
# Load model
model = self.experiment_runner.load_experiment_model(
experiment.experiment_id
)
if model is None:
st.error("Failed to load model")
return
# Make predictions
predictions = model.predict(df)
df["predicted_gender"] = predictions
df["gender_label"] = df["predicted_gender"].map(
{"f": "Female", "m": "Male"}
)
            # Try to get probabilities
            try:
                probabilities = model.predict_proba(df)
                df["confidence"] = np.max(probabilities, axis=1)
            except Exception:
                # not every estimator exposes predict_proba
                df["confidence"] = None
st.success("Predictions completed!")
+283
View File
@@ -0,0 +1,283 @@
from typing import List
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
class ResultsAnalysis:
def __init__(
self,
config,
experiment_tracker: ExperimentTracker,
experiment_runner: ExperimentRunner,
):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
def index(self):
st.title("Results & Analysis")
tab1, tab2, tab3 = st.tabs(
["Experiment Comparison", "Performance Analysis", "Model Analysis"]
)
with tab1:
self.show_experiment_comparison()
with tab2:
self.show_performance_analysis()
with tab3:
self.show_model_analysis()
def show_experiment_comparison(self):
"""Show experiment comparison interface"""
st.subheader("Compare Experiments")
experiments = self.experiment_tracker.list_experiments()
completed_experiments = [
e for e in experiments if e.status.value == "completed"
]
if not completed_experiments:
st.warning("No completed experiments found.")
return
# Experiment selection
exp_options = {
f"{exp.config.name} ({exp.experiment_id[:8]})": exp.experiment_id
for exp in completed_experiments
}
selected_exp_names = st.multiselect(
"Select Experiments to Compare",
list(exp_options.keys()),
default=list(exp_options.keys())[: min(5, len(exp_options))],
)
if not selected_exp_names:
st.info("Please select experiments to compare.")
return
selected_exp_ids = [exp_options[name] for name in selected_exp_names]
# Generate comparison
comparison_df = self.experiment_runner.compare_experiments(selected_exp_ids)
if comparison_df.empty:
st.error("No data available for comparison.")
return
self._display_comparison_table(comparison_df)
self._display_comparison_charts(comparison_df)
def _display_comparison_table(self, comparison_df: pd.DataFrame):
"""Display comparison table"""
st.write("**Experiment Comparison Table**")
# Select columns to display
metric_columns = [
col
for col in comparison_df.columns
if col.startswith("test_") or col.startswith("cv_")
]
display_columns = ["name", "model_type", "features"] + metric_columns
available_columns = [
col for col in display_columns if col in comparison_df.columns
]
st.dataframe(comparison_df[available_columns], use_container_width=True)
def _display_comparison_charts(self, comparison_df: pd.DataFrame):
"""Display comparison charts"""
st.write("**Performance Comparison**")
if "test_accuracy" in comparison_df.columns:
fig = px.bar(
comparison_df,
x="name",
y="test_accuracy",
color="model_type",
title="Test Accuracy Comparison",
)
fig.update_layout(xaxis_tickangle=-45)
st.plotly_chart(fig, use_container_width=True)
# Metric comparison across multiple metrics
metric_columns = [
col
for col in comparison_df.columns
if col.startswith("test_") or col.startswith("cv_")
]
if len(metric_columns) > 1:
metric_to_plot = st.selectbox(
"Select Metric for Detailed Comparison", metric_columns
)
if metric_to_plot in comparison_df.columns:
fig = px.bar(
comparison_df,
x="name",
y=metric_to_plot,
color="model_type",
title=f"{metric_to_plot.replace('_', ' ').title()} Comparison",
)
fig.update_layout(xaxis_tickangle=-45)
st.plotly_chart(fig, use_container_width=True)
def show_performance_analysis(self):
"""Show performance analysis across experiments"""
st.subheader("Performance Analysis")
experiments = self.experiment_tracker.list_experiments()
completed_experiments = [
e for e in experiments if e.status.value == "completed" and e.test_metrics
]
if not completed_experiments:
st.warning("No completed experiments with metrics found.")
return
# Prepare data for analysis
analysis_data = self._prepare_analysis_data(completed_experiments)
analysis_df = pd.DataFrame(analysis_data)
self._display_performance_trends(analysis_df)
self._display_model_comparison(analysis_df)
self._display_top_experiments(analysis_df)
def _prepare_analysis_data(self, completed_experiments: List) -> List[dict]:
"""Prepare data for performance analysis"""
analysis_data = []
for exp in completed_experiments:
row = {
"experiment_id": exp.experiment_id,
"name": exp.config.name,
"model_type": exp.config.model_type,
"feature_count": len(exp.config.features),
"features": ", ".join([f.value for f in exp.config.features]),
"train_size": exp.train_size,
"test_size": exp.test_size,
**exp.test_metrics,
}
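            # **exp.test_metrics flattens metric keys (accuracy, f1, ...) into
            # the row, which is what the downstream charts select on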
analysis_data.append(row)
return analysis_data
def _display_performance_trends(self, analysis_df: pd.DataFrame):
"""Display performance trend charts"""
col1, col2 = st.columns(2)
with col1:
# Accuracy vs Training Size
if (
"accuracy" in analysis_df.columns
and "train_size" in analysis_df.columns
):
fig = px.scatter(
analysis_df,
x="train_size",
y="accuracy",
color="model_type",
hover_data=["name"],
title="Accuracy vs Training Size",
)
st.plotly_chart(fig, use_container_width=True)
with col2:
# Feature Count vs Performance
if (
"accuracy" in analysis_df.columns
and "feature_count" in analysis_df.columns
):
fig = px.scatter(
analysis_df,
x="feature_count",
y="accuracy",
color="model_type",
hover_data=["name"],
title="Accuracy vs Number of Features",
)
st.plotly_chart(fig, use_container_width=True)
def _display_model_comparison(self, analysis_df: pd.DataFrame):
"""Display model type comparison"""
if "accuracy" in analysis_df.columns:
model_performance = (
analysis_df.groupby("model_type")["accuracy"]
.agg(["mean", "std", "count"])
.reset_index()
)
            fig = go.Figure()
            fig.add_trace(
                go.Bar(
                    x=model_performance["model_type"],
                    y=model_performance["mean"],
                    error_y=dict(type="data", array=model_performance["std"].fillna(0)),
                    name="Accuracy",
                )
            )
            fig.update_layout(title="Mean Accuracy by Model Type")
            st.plotly_chart(fig, use_container_width=True)
def _display_top_experiments(self, analysis_df: pd.DataFrame):
"""Display top-performing experiments"""
if "accuracy" in analysis_df.columns:
top_n = st.slider("Select Top N Experiments", 3, 20, 5)
top_experiments = analysis_df.nlargest(top_n, "accuracy")
st.write("**Top Performing Experiments:**")
st.dataframe(
top_experiments[
[
"name",
"model_type",
"features",
"train_size",
"test_size",
"accuracy",
]
],
use_container_width=True,
)
def show_model_analysis(self):
"""Show detailed model analysis interface"""
st.subheader("Model Analysis")
experiments = self.experiment_tracker.list_experiments()
completed_experiments = [
e for e in experiments if e.status.value == "completed"
]
if not completed_experiments:
st.warning("No completed experiments found for analysis.")
return
# Model selection
exp_options = {
f"{exp.config.name} ({exp.experiment_id[:8]})": exp.experiment_id
for exp in completed_experiments
}
selected_exp_name = st.selectbox(
"Select Model for Analysis", list(exp_options.keys())
)
if not selected_exp_name:
return
exp_id = exp_options[selected_exp_name]
experiment = self.experiment_tracker.get_experiment(exp_id)
if not experiment or not experiment.test_metrics:
st.warning("Selected experiment has no evaluation metrics.")
return
# Display detailed metrics
st.write("**Detailed Metrics:**")
st.json(experiment.test_metrics)
+16
View File
@@ -0,0 +1,16 @@
import streamlit as st
from ners.web.interfaces.dashboard import Dashboard
st.set_page_config(page_title="Dashboard", page_icon="📊", layout="wide")
if "config" in st.session_state:
dashboard = Dashboard(
st.session_state.config,
st.session_state.experiment_tracker,
st.session_state.experiment_runner,
)
dashboard.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+12
View File
@@ -0,0 +1,12 @@
import streamlit as st
from ners.web.interfaces.data_overview import DataOverview
st.set_page_config(page_title="Data Overview", page_icon="📋", layout="wide")
if "config" in st.session_state:
data_overview = DataOverview(st.session_state.config)
data_overview.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+14
View File
@@ -0,0 +1,14 @@
import streamlit as st
from ners.web.interfaces.data_processing import DataProcessing
st.set_page_config(page_title="Data Processing", page_icon="⚙️", layout="wide")
if "config" in st.session_state:
data_processing = DataProcessing(
st.session_state.config, st.session_state.pipeline_monitor
)
data_processing.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+16
View File
@@ -0,0 +1,16 @@
import streamlit as st
from ners.web.interfaces.experiments import Experiments
st.set_page_config(page_title="Experiments", page_icon="🧪", layout="wide")
if "config" in st.session_state:
experiments = Experiments(
st.session_state.config,
st.session_state.experiment_tracker,
st.session_state.experiment_runner,
)
experiments.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+16
View File
@@ -0,0 +1,16 @@
import streamlit as st
from ners.web.interfaces.results_analysis import ResultsAnalysis
st.set_page_config(page_title="Results & Analysis", page_icon="📈", layout="wide")
if "config" in st.session_state:
results_analysis = ResultsAnalysis(
st.session_state.config,
st.session_state.experiment_tracker,
st.session_state.experiment_runner,
)
results_analysis.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+16
View File
@@ -0,0 +1,16 @@
import streamlit as st
from ners.web.interfaces.predictions import Predictions
st.set_page_config(page_title="Predictions", page_icon="🔮", layout="wide")
if "config" in st.session_state:
predictions = Predictions(
st.session_state.config,
st.session_state.experiment_tracker,
st.session_state.experiment_runner,
)
predictions.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+12
View File
@@ -0,0 +1,12 @@
import streamlit as st
from ners.web.interfaces.configuration import Configuration
st.set_page_config(page_title="Configuration", page_icon="⚙️", layout="wide")
if "config" in st.session_state:
configuration = Configuration(st.session_state.config)
configuration.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+12
View File
@@ -0,0 +1,12 @@
import streamlit as st
from ners.web.interfaces.ner_testing import NERTesting
st.set_page_config(page_title="NER Testing", page_icon="🏷️", layout="wide")
if "config" in st.session_state:
ner_testing = NERTesting(st.session_state.config)
ner_testing.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long