refactoring: uv
@@ -0,0 +1,3 @@
"""DRC NERS NLP package."""

__all__: list[str] = []
+226
@@ -0,0 +1,226 @@
from __future__ import annotations

import os
import subprocess
import sys
from pathlib import Path
from typing import Optional

import typer

from ners.core.config import setup_config, PipelineConfig

app = typer.Typer(help="DRC NERS command-line interface", no_args_is_help=True)


# -------------------------
# Pipeline commands
# -------------------------
pipeline_app = typer.Typer(help="Data processing pipeline")
app.add_typer(pipeline_app, name="pipeline")


@pipeline_app.command("run")
def pipeline_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Run the full processing pipeline."""
    from ners.main import run_pipeline as _run_pipeline

    cfg = setup_config(config_path=config, env=env)
    code = _run_pipeline(cfg)
    raise typer.Exit(code)


# -------------------------
# NER commands
# -------------------------
ner_app = typer.Typer(help="NER dataset and model")
app.add_typer(ner_app, name="ner")


def _load_config(config: Optional[Path], env: str) -> PipelineConfig:
    return setup_config(config_path=config, env=env)


@ner_app.command("feature")
def ner_feature(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    from ners.ner import feature as _feature

    cfg = _load_config(config, env)
    _feature(cfg)


@ner_app.command("build")
def ner_build(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    from ners.ner import build as _build

    cfg = _load_config(config, env)
    _build(cfg)


@ner_app.command("train")
def ner_train(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    from ners.ner import train as _train

    cfg = _load_config(config, env)
    _train(cfg)


@ner_app.command("run")
def ner_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
    reset: bool = typer.Option(
        False, help="Reset intermediate outputs and rerun all steps"
    ),
) -> None:
    from ners.ner import run_pipeline as _ner_pipeline

    cfg = _load_config(config, env)
    code = _ner_pipeline(cfg, reset)
    raise typer.Exit(code)


# -------------------------
# Research commands
# -------------------------
research_app = typer.Typer(help="Research experiments and training")
app.add_typer(research_app, name="research")


@research_app.command("train")
def research_train(
    name: str = typer.Option(..., "--name", help="Model name to train"),
    type: str = typer.Option(..., "--type", help="Experiment type"),
    templates: str = typer.Option(
        "research_templates.yaml", help="Templates file path"
    ),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    from ners.research.experiment.experiment_builder import ExperimentBuilder
    from ners.research.model_trainer import ModelTrainer

    cfg = _load_config(config, env)
    exp_builder = ExperimentBuilder(cfg)
    tmpl = exp_builder.load_templates(templates)
    exp_cfg = exp_builder.find_template(tmpl, name, type)

    trainer = ModelTrainer(cfg)
    trainer.train_single_model(
        model_name=exp_cfg.get("name"),
        model_type=exp_cfg.get("model_type"),
        features=exp_cfg.get("features"),
        model_params=exp_cfg.get("model_params", {}),
        tags=exp_cfg.get("tags", []),
    )


# -------------------------
# Monitor commands
# -------------------------
monitor_app = typer.Typer(help="Monitor pipeline checkpoints")
app.add_typer(monitor_app, name="monitor")


@monitor_app.command("status")
def monitor_status(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
    detailed: bool = typer.Option(
        False, help="Show detailed status (failed batch IDs)"
    ),
) -> None:
    _ = _load_config(config, env)
    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    PipelineMonitor().print_status(detailed=detailed)


@monitor_app.command("clean")
def monitor_clean(
    step: Optional[str] = typer.Option(None, help="Step to clean; default all"),
    keep_last: int = typer.Option(1, help="Number of latest checkpoint files to keep"),
    force: bool = typer.Option(False, help="Do not ask for confirmation"),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    _ = _load_config(config, env)
    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    mon = PipelineMonitor()
    if not force:
        typer.confirm("Clean checkpoints?", abort=True)

    if step:
        mon.clean_step_checkpoints(step, keep_last)
    else:
        for s in mon.steps:
            mon.clean_step_checkpoints(s, keep_last)


@monitor_app.command("reset")
def monitor_reset(
    step: Optional[str] = typer.Option(None, help="Step to reset; default all"),
    force: bool = typer.Option(False, help="Do not ask for confirmation"),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    _ = _load_config(config, env)
    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    mon = PipelineMonitor()
    if not force:
        msg = f"Reset {step or 'all steps'}? This deletes checkpoints."
        typer.confirm(msg, abort=True)

    if step:
        mon.reset_step(step)
    else:
        for s in mon.steps:
            mon.reset_step(s)


# -------------------------
# Web commands
# -------------------------
web_app = typer.Typer(help="Web UI wrapper")
app.add_typer(web_app, name="web")


@web_app.command("run")
def web_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Launch the Streamlit web app via subprocess."""
    app_path = Path(__file__).parent / "web" / "app.py"
    cmd = [
        sys.executable,
        "-m",
        "streamlit",
        "run",
        str(app_path),
    ]
    # Pass configuration via environment variables to avoid argparse in Streamlit
    env_vars = os.environ.copy()
    if config is not None:
        env_vars["NERS_CONFIG"] = str(config)
    env_vars["NERS_ENV"] = env

    raise typer.Exit(subprocess.call(cmd, env=env_vars))


if __name__ == "__main__":  # pragma: no cover
    app()
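As a quick sanity check, a Typer app like this can be driven in-process with Typer's test runner. A minimal sketch, assuming the module above is importable as ners.cli (the module path is not shown in the diff):

    from typer.testing import CliRunner

    from ners.cli import app  # assumed module path

    runner = CliRunner()
    # Invokes the "monitor status" subcommand without spawning a shell
    result = runner.invoke(app, ["monitor", "status"])
    assert result.exit_code == 0
    print(result.stdout)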
@@ -0,0 +1,95 @@
import logging
from pathlib import Path
from typing import Optional, Union

from ners.core.utils import ensure_directories
from ners.core.config.config_manager import ConfigManager
from ners.core.config.logging_config import LoggingConfig
from ners.core.config.pipeline_config import PipelineConfig

config_manager = ConfigManager()


def get_config() -> PipelineConfig:
    """Get the global configuration instance"""
    return config_manager.get_config()


def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig:
    """Load configuration from specified path"""
    if config_path:
        return config_manager.load_config(Path(config_path))
    return config_manager.get_config()


def setup_config(
    config_path: Optional[Path] = None, env: str = "development"
) -> PipelineConfig:
    """
    Unified configuration loading and logging setup for all entrypoint scripts.

    Args:
        config_path: Direct path to config file (takes precedence over env)
        env: Environment name (defaults to "development")

    Returns:
        Loaded configuration object
    """
    # Determine config path
    if config_path is None:
        config_path = Path("config") / f"pipeline.{env}.yaml"

    # Load configuration
    config = ConfigManager(config_path).load_config()

    # Setup logging
    setup_logging(config)

    # Ensure required directories exist
    ensure_directories(config)

    logging.info(f"Loaded configuration: {config.name} v{config.version}")
    logging.info(f"Environment: {config.environment}")
    logging.info(f"Config file: {config_path}")

    return config


def setup_logging(config: PipelineConfig):
    """Setup logging based on configuration"""

    # Create logs directory
    log_dir = config.paths.logs_dir
    log_dir.mkdir(parents=True, exist_ok=True)

    # Setup logging configuration
    log_level = getattr(logging, config.logging.level.upper(), logging.INFO)

    # Create formatter
    formatter = logging.Formatter(config.logging.format)

    # Setup root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)

    # Clear existing handlers
    root_logger.handlers.clear()

    # Console handler
    if config.logging.console_logging:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        root_logger.addHandler(console_handler)

    # File handler
    if config.logging.file_logging:
        from logging.handlers import RotatingFileHandler

        log_file_path = log_dir / config.logging.log_file
        file_handler = RotatingFileHandler(
            log_file_path,
            maxBytes=config.logging.max_log_size,
            backupCount=config.logging.backup_count,
        )
        file_handler.setFormatter(formatter)
        root_logger.addHandler(file_handler)
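A typical entrypoint bootstraps everything through setup_config. A sketch, assuming config/pipeline.development.yaml exists relative to the working directory (a missing file falls back to defaults with a warning):

    from ners.core.config import setup_config

    # Resolves config/pipeline.development.yaml, configures root logging,
    # and creates the directories declared under cfg.paths.
    cfg = setup_config(env="development")
    print(cfg.name, cfg.version, cfg.environment)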
@@ -0,0 +1,30 @@
from pydantic import BaseModel


class NERConfig(BaseModel):
    """NER annotation configuration"""

    model_name: str = "drc_names_ner"
    retry_attempts: int = 3


class LLMConfig(BaseModel):
    """LLM annotation configuration"""

    model_name: str = "mistral:7b"
    requests_per_minute: int = 60
    requests_per_second: int = 2
    retry_attempts: int = 3
    timeout_seconds: int = 30
    max_concurrent_requests: int = 2
    enable_rate_limiting: bool = False


class AnnotationConfig(BaseModel):
    """Base class for annotation configurations"""

    llm: LLMConfig = LLMConfig()
    ner: NERConfig = NERConfig()

    class Config:
        arbitrary_types_allowed = True
@@ -0,0 +1,151 @@
import json
import logging
from pathlib import Path
from typing import Optional, Union, Dict, Any

import yaml

from ners.core.config.pipeline_config import PipelineConfig
from ners.core.config.project_paths import ProjectPaths


class ConfigManager:
    """Centralized configuration management"""

    def __init__(self, config_path: Optional[Union[str, Path]] = None):
        self.config_path = config_path or self._find_config_file()
        self._config: Optional[PipelineConfig] = None
        self._setup_default_paths()

    @classmethod
    def _find_config_file(cls) -> Path:
        """Find configuration file in standard locations"""
        possible_paths = [
            Path.cwd() / "config" / "pipeline.yaml",
            Path.cwd() / "config" / "pipeline.yml",
            Path.cwd() / "pipeline.yaml",
            Path(__file__).parent.parent.parent / "config" / "pipeline.yaml",
        ]

        for path in possible_paths:
            if path.exists():
                return path

        # Return default path if none found
        return Path.cwd() / "config" / "pipeline.yaml"

    def _setup_default_paths(self):
        """Setup default project paths"""
        root_dir = Path(__file__).parent.parent.parent.parent.parent
        self.default_paths = ProjectPaths(
            root_dir=root_dir,
            configs_dir=root_dir / "config",
            data_dir=root_dir / "data" / "dataset",
            models_dir=root_dir / "data" / "models",
            outputs_dir=root_dir / "data" / "outputs",
            logs_dir=root_dir / "data" / "logs",
            checkpoints_dir=root_dir / "data" / "checkpoints",
        )

    def load_config(self, config_path: Optional[Path] = None) -> PipelineConfig:
        """Load configuration from file"""
        if config_path:
            self.config_path = config_path

        if not self.config_path.exists():
            logging.warning(
                f"Config file not found: {self.config_path}. Using defaults."
            )
            return self._create_default_config()

        try:
            with open(self.config_path, "r") as f:
                if self.config_path.suffix.lower() in [".yaml", ".yml"]:
                    config_data = yaml.safe_load(f)
                else:
                    config_data = json.load(f)

            # Ensure paths are properly set
            if "paths" not in config_data:
                config_data["paths"] = self.default_paths.model_dump()

            self._config = PipelineConfig(**config_data)
            return self._config

        except Exception as e:
            logging.error(f"Failed to load config from {self.config_path}: {e}")
            return self._create_default_config()

    def _create_default_config(self) -> PipelineConfig:
        """Create default configuration"""
        return PipelineConfig(paths=self.default_paths)

    def save_config(self, config: PipelineConfig, path: Optional[Path] = None):
        """Save configuration to file"""
        save_path = path or self.config_path
        save_path.parent.mkdir(parents=True, exist_ok=True)

        config_dict = config.model_dump()

        # Convert Path objects to strings for serialization
        if "paths" in config_dict:
            for key, value in config_dict["paths"].items():
                if isinstance(value, Path):
                    config_dict["paths"][key] = str(value)

        try:
            with open(save_path, "w") as f:
                if save_path.suffix.lower() in [".yaml", ".yml"]:
                    yaml.dump(config_dict, f, default_flow_style=False, indent=2)
                else:
                    json.dump(config_dict, f, indent=2)

            logging.info(f"Configuration saved to {save_path}")

        except Exception as e:
            logging.error(f"Failed to save config to {save_path}: {e}")

    def get_config(self) -> PipelineConfig:
        """Get current configuration, loading if necessary"""
        if self._config is None:
            self._config = self.load_config()
        return self._config

    def update_config(self, updates: Dict[str, Any]):
        """Update configuration with new values"""
        config = self.get_config()

        # Deep update configuration
        config_dict = config.model_dump()
        self._deep_update(config_dict, updates)

        self._config = PipelineConfig(**config_dict)

    def _deep_update(self, base_dict: Dict, update_dict: Dict):
        """Recursively update nested dictionaries"""
        for key, value in update_dict.items():
            if (
                key in base_dict
                and isinstance(base_dict[key], dict)
                and isinstance(value, dict)
            ):
                self._deep_update(base_dict[key], value)
            else:
                base_dict[key] = value

    def get_environment_config(self, env: str) -> PipelineConfig:
        """Load environment-specific configuration"""
        env_config_path = self.config_path.parent / f"pipeline.{env}.yaml"

        if env_config_path.exists():
            base_config = self.load_config()
            env_config = self.load_config(env_config_path)

            # Merge configurations (model_dump is the pydantic v2 API,
            # consistent with the rest of this class)
            base_dict = base_config.model_dump()
            env_dict = env_config.model_dump()
            self._deep_update(base_dict, env_dict)

            return PipelineConfig(**base_dict)

        return self.get_config()
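The deep-merge behavior of update_config means nested keys are overridden individually rather than replacing whole sections. A usage sketch:

    from ners.core.config.config_manager import ConfigManager

    mgr = ConfigManager()
    mgr.update_config({"processing": {"batch_size": 500}})

    cfg = mgr.get_config()
    print(cfg.processing.batch_size)   # 500
    print(cfg.processing.max_workers)  # 4 -- sibling defaults are preserved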
@@ -0,0 +1,32 @@
from typing import Dict, Optional

from pydantic import BaseModel, Field


class DataConfig(BaseModel):
    """Data handling configuration"""

    input_file: str = "names.csv"
    output_files: Dict[str, str] = Field(
        default_factory=lambda: {
            "featured": "names_featured.csv",
            "evaluation": "names_evaluation.csv",
            "engineered": "names_engineered.csv",
            "males": "names_males.csv",
            "females": "names_females.csv",
            "ner_data": "names_ner.json",
            "ner_spacy": "names_ner.spacy",
        }
    )
    selected_columns: list[str] = Field(
        default_factory=lambda: ["name", "sex", "region"]
    )
    split_evaluation: bool = False
    split_by_province: bool = True
    split_by_gender: bool = True
    split_ner_data: bool = True
    evaluation_fraction: float = 0.2
    random_seed: int = 42

    # Dataset size limiting options
    max_dataset_size: Optional[int] = None
    balance_by_sex: bool = False
@@ -0,0 +1,13 @@
from pydantic import BaseModel


class LoggingConfig(BaseModel):
    """Logging configuration"""

    level: str = "INFO"
    format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    file_logging: bool = True
    console_logging: bool = True
    log_file: str = "pipeline.log"
    max_log_size: int = 10 * 1024 * 1024  # 10MB
    backup_count: int = 5
@@ -0,0 +1,29 @@
from pydantic import BaseModel

from ners.core.config.annotation_config import AnnotationConfig
from ners.core.config.data_config import DataConfig
from ners.core.config.logging_config import LoggingConfig
from ners.core.config.processing_config import ProcessingConfig
from ners.core.config.project_paths import ProjectPaths


class PipelineConfig(BaseModel):
    """Main pipeline configuration"""

    name: str = "drc_names_pipeline"
    version: str = "1.0.0"
    description: str = "DRC Names NLP Processing Pipeline"

    paths: ProjectPaths
    stages: list[str] = []
    processing: ProcessingConfig = ProcessingConfig()
    annotation: AnnotationConfig = AnnotationConfig()
    data: DataConfig = DataConfig()
    logging: LoggingConfig = LoggingConfig()

    # Environment-specific settings
    environment: str = "development"
    debug: bool = True

    class Config:
        arbitrary_types_allowed = True
@@ -0,0 +1,17 @@
from pydantic import BaseModel, Field


class ProcessingConfig(BaseModel):
    """Data processing pipeline configuration"""

    batch_size: int = 1000
    max_workers: int = 4
    checkpoint_interval: int = 5
    use_multiprocessing: bool = False
    encoding_options: list = Field(
        default_factory=lambda: ["utf-8", "utf-16", "latin1"]
    )
    chunk_size: int = 100_000
    epochs: int = 2
@@ -0,0 +1,26 @@
from pathlib import Path

from pydantic import BaseModel, field_validator


class ProjectPaths(BaseModel):
    """Project directory structure configuration"""

    root_dir: Path
    data_dir: Path
    models_dir: Path
    outputs_dir: Path
    logs_dir: Path
    configs_dir: Path
    checkpoints_dir: Path

    class Config:
        arbitrary_types_allowed = True

    @field_validator("*", mode="before")
    @classmethod
    def convert_to_path(cls, v):
        return Path(v) if not isinstance(v, Path) else v

    def get_data_path(self, filename: str) -> Path:
        return self.data_dir / filename
@@ -0,0 +1,46 @@
import logging
from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from ners.core.config import PipelineConfig


@contextmanager
def temporary_config_override(**overrides):
    """Context manager for temporarily overriding configuration"""
    from ners.core.config import get_config

    config = get_config()
    original_values = {}

    # Store original values and apply overrides
    for key, value in overrides.items():
        if hasattr(config, key):
            original_values[key] = getattr(config, key)
            setattr(config, key, value)

    try:
        yield config
    finally:
        # Restore original values
        for key, value in original_values.items():
            setattr(config, key, value)


def ensure_directories(config: "PipelineConfig") -> None:
    """Ensure all required directories exist"""
    directories = [
        config.paths.data_dir,
        config.paths.models_dir,
        config.paths.outputs_dir,
        config.paths.logs_dir,
        config.paths.configs_dir,
        config.paths.checkpoints_dir,
    ]

    for directory in directories:
        Path(directory).mkdir(parents=True, exist_ok=True)

    logging.info("Ensured all required directories exist")
@@ -0,0 +1,174 @@
import gc
import logging
from pathlib import Path
from typing import Optional, Union, Iterator, Dict

import pandas as pd

from ners.core.config.pipeline_config import PipelineConfig

OPTIMIZED_DTYPES = {
    # Numeric columns with appropriate bit-width
    "year": "Int16",  # Years fit in 16-bit integer
    "words": "Int8",  # Word counts typically < 128
    "length": "Int16",  # Name lengths fit in 16-bit
    "annotated": "Int8",  # Binary flag (0/1)
    "ner_tagged": "Int8",  # Binary flag (0/1)
    # Categorical columns (memory efficient for repeated values)
    "sex": "category",
    "province": "category",
    "region": "category",
    "identified_category": "category",
    "transformation_type": "category",
    # String columns with proper string dtype
    "name": "string",
    "probable_native": "string",
    "probable_surname": "string",
    "identified_name": "string",
    "identified_surname": "string",
    "ner_entities": "string",
}


class DataLoader:
    """Reusable data loading utilities"""

    def __init__(self, config: PipelineConfig, custom_dtypes: Optional[Dict] = None):
        self.config = config
        self.dtypes = {**OPTIMIZED_DTYPES, **(custom_dtypes or {})}

    def load_csv_chunked(
        self, filepath: Union[str, Path], chunk_size: Optional[int] = None
    ) -> Iterator[pd.DataFrame]:
        """Load CSV file in chunks for memory efficiency"""
        chunk_size = chunk_size or self.config.processing.chunk_size
        encodings = self.config.processing.encoding_options
        filepath = Path(filepath)

        for encoding in encodings:
            try:
                logging.info(f"Reading {filepath} with encoding: {encoding}")

                # Read with optimal dtypes
                chunk_iter = pd.read_csv(
                    filepath,
                    encoding=encoding,
                    chunksize=chunk_size,
                    on_bad_lines="skip",
                    dtype=self.dtypes,
                )

                for i, chunk in enumerate(chunk_iter):
                    logging.debug(f"Processing optimized chunk {i + 1}")
                    yield chunk

                logging.info(f"Successfully read {filepath} with encoding: {encoding}")
                return

            except Exception as e:
                logging.warning(f"Failed with encoding {encoding}: {e}")
                continue

        raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")

    def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
        """Load complete CSV with memory optimization"""
        chunks = []
        for chunk in self.load_csv_chunked(filepath):
            chunks.append(chunk)

        if not chunks:
            return pd.DataFrame()

        logging.info(f"Concatenating {len(chunks)} optimized chunks")
        df = pd.concat(chunks, ignore_index=True, copy=False)

        # Cleanup chunks from memory
        del chunks
        gc.collect()

        # Apply dataset size limiting if configured
        if self.config.data.max_dataset_size is not None:
            df = self._limit_dataset_size(df)

        return df

    def _limit_dataset_size(self, df: pd.DataFrame) -> pd.DataFrame:
        """Limit dataset size with optional sex balancing"""
        max_size = self.config.data.max_dataset_size

        if max_size is None or len(df) <= max_size:
            return df

        if self.config.data.balance_by_sex and "sex" in df.columns:
            return self._balanced_sample(df, max_size)
        else:
            # Simple random sampling
            return df.sample(n=max_size, random_state=self.config.data.random_seed)

    def _balanced_sample(self, df: pd.DataFrame, max_size: int) -> pd.DataFrame:
        """Sample data with balanced sex distribution"""

        # Get unique sex values
        sex_values = df["sex"].dropna().unique()

        if len(sex_values) == 0:
            logging.warning(
                "No valid values found in the 'sex' column, using random sampling"
            )
            return df.sample(n=max_size, random_state=self.config.data.random_seed)

        # Calculate samples per sex category
        samples_per_sex = max_size // len(sex_values)
        remaining_samples = max_size % len(sex_values)

        balanced_samples = []

        for i, sex in enumerate(sex_values):
            # Use boolean indexing instead of creating temporary DataFrames
            sex_mask = df["sex"] == sex
            sex_indices = df[sex_mask].index

            # Distribute remaining samples to first categories
            current_samples = samples_per_sex + (1 if i < remaining_samples else 0)
            current_samples = min(current_samples, len(sex_indices))

            if current_samples > 0:
                # Sample index labels instead of DataFrame rows
                sampled_indices = pd.Series(sex_indices).sample(
                    n=current_samples, random_state=self.config.data.random_seed + i
                )
                balanced_samples.extend(sampled_indices.tolist())
                logging.info(f"Sampled {current_samples} records for sex '{sex}'")

        if not balanced_samples:
            logging.warning(
                "No balanced samples could be created, using random sampling"
            )
            return df.sample(n=max_size, random_state=self.config.data.random_seed)

        # Select rows by index label (balanced_samples holds labels, so .loc
        # is the correct accessor; no copying happens until this final step)
        result = df.loc[balanced_samples].copy()

        # Shuffle the final result
        result = result.sample(
            frac=1, random_state=self.config.data.random_seed
        ).reset_index(drop=True)

        logging.info(
            f"Created balanced dataset with {len(result)} records from {len(df)} total"
        )
        return result

    @classmethod
    def save_csv(
        cls, df: pd.DataFrame, filepath: Union[str, Path], create_dirs: bool = True
    ) -> None:
        """Save DataFrame to CSV with proper handling"""
        filepath = Path(filepath)

        if create_dirs:
            filepath.parent.mkdir(parents=True, exist_ok=True)

        df.to_csv(filepath, index=False, encoding="utf-8", sep=",", quoting=1)
        logging.info(f"Saved {len(df)} rows to {filepath}")
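Because load_csv_chunked is a generator, callers can stream arbitrarily large files without materializing them. A usage sketch built on the module's own helpers:

    from ners.core.config import setup_config
    from ners.core.utils.data_loader import DataLoader

    cfg = setup_config(env="development")
    loader = DataLoader(cfg)

    total_rows = 0
    for chunk in loader.load_csv_chunked(cfg.paths.get_data_path(cfg.data.input_file)):
        total_rows += len(chunk)  # each chunk is an independent DataFrame
    print(f"Streamed {total_rows} rows")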
@@ -0,0 +1,24 @@
from ners.core.config.pipeline_config import PipelineConfig


class PromptManager:
    """Manage prompts for LLM operations"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.prompts_dir = self.config.paths.configs_dir / "prompts"

    def load_prompt(self, prompt_name: str = "default") -> str:
        """Load a prompt template"""
        prompt_file = self.prompts_dir / f"{prompt_name}.txt"

        if not prompt_file.exists():
            # Fallback to root directory
            fallback_file = self.config.paths.root_dir / "prompt.txt"
            if fallback_file.exists():
                prompt_file = fallback_file
            else:
                raise FileNotFoundError(f"Prompt file not found: {prompt_file}")

        with open(prompt_file, "r", encoding="utf-8") as f:
            return f.read().strip()
@@ -0,0 +1,56 @@
import threading
import time
from dataclasses import dataclass
from queue import Queue


@dataclass
class RateLimitConfig:
    """Configuration for rate limiting LLM requests"""

    requests_per_minute: int = 60
    requests_per_second: int = 2
    burst_limit: int = 5


class RateLimiter:
    """Thread-safe rate limiter for LLM requests"""

    def __init__(self, config: RateLimitConfig):
        self.config = config
        self.request_times = Queue()
        self.lock = threading.Lock()
        self.last_request_time = 0

    def wait_if_needed(self):
        """Wait if necessary to respect rate limits"""
        with self.lock:
            current_time = time.time()

            # Check requests per second limit
            time_since_last = current_time - self.last_request_time
            min_interval = 1.0 / self.config.requests_per_second

            if time_since_last < min_interval:
                sleep_time = min_interval - time_since_last
                time.sleep(sleep_time)
                current_time = time.time()

            # Clean old request times (older than 1 minute)
            while not self.request_times.empty():
                if current_time - self.request_times.queue[0] > 60:
                    self.request_times.get()
                else:
                    break

            # Check requests per minute limit
            if self.request_times.qsize() >= self.config.requests_per_minute:
                oldest_request = self.request_times.queue[0]
                wait_time = 60 - (current_time - oldest_request)
                if wait_time > 0:
                    time.sleep(wait_time)
                    current_time = time.time()

            # Record this request
            self.request_times.put(current_time)
            self.last_request_time = current_time
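A sketch of how an annotation step might gate outbound requests with the limiter; annotate_name is a placeholder standing in for a real LLM client call, and the module path is an assumption:

    import time

    from ners.core.utils.rate_limiter import RateLimitConfig, RateLimiter  # assumed path

    limiter = RateLimiter(RateLimitConfig(requests_per_minute=60, requests_per_second=2))

    def annotate_name(name: str) -> str:
        return name.upper()  # placeholder for the real client call

    for name in ["kabila", "tshisekedi", "mukendi"]:
        limiter.wait_if_needed()  # blocks until a request slot is free
        print(annotate_name(name), f"at {time.time():.2f}")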
@@ -0,0 +1,174 @@
import unicodedata
from typing import Optional, Dict, Tuple

import pandas as pd


class RegionMapper:
    """Reusable region mapping utilities"""

    def __init__(self, mapping: Optional[Dict] = None):
        self.mapping = mapping or REGION_MAPPING
        self.mapping = {k.lower(): v[1].upper() for k, v in self.mapping.items()}

    def map(self, series: pd.Series) -> pd.Series:
        return series.str.lower().map(self.mapping).fillna("AUTRES")

    @staticmethod
    def clean_province(series: pd.Series) -> pd.Series:
        return (
            series.str.upper()
            .str.strip()
            .apply(
                lambda x: (
                    unicodedata.normalize("NFKD", x)
                    .encode("ascii", errors="ignore")
                    .decode("utf-8")
                    if isinstance(x, str)
                    else x
                )
            )
        )

    @staticmethod
    def get_provinces():
        return [
            "kinshasa",
            "bas-congo",
            "bandundu",
            "katanga",
            "equateur",
            "orientale",
            "maniema",
            "nord-kivu",
            "sud-kivu",
            "kasai-occidental",
            "kasai-oriental",
            "autres",
        ]


# DRC Region to Province Mapping
REGION_MAPPING: Dict[str, Tuple[str, str]] = {
    "bandundu": ("BANDUNDU", "BANDUNDU"),
    "bandundu-1": ("BANDUNDU", "BANDUNDU"),
    "bandundu-2": ("BANDUNDU", "BANDUNDU"),
    "bandundu-3": ("BANDUNDU", "BANDUNDU"),
    "bas-congo": ("KONGO-CENTRAL", "BAS-CONGO"),
    "bas-congo-1": ("KONGO-CENTRAL", "BAS-CONGO"),
    "bas-congo-2": ("KONGO-CENTRAL", "BAS-CONGO"),
    "bas-fleuve": ("KONGO-CENTRAL", "BAS-CONGO"),
    "bas-uele": ("BAS-UELE", "ORIENTALE"),
    "bas-uele-1": ("BAS-UELE", "ORIENTALE"),
    "bas-uele-2": ("BAS-UELE", "ORIENTALE"),
    "cataractes": ("KONGO-CENTRAL", "BAS-CONGO"),
    "equateur": ("EQUATEUR", "EQUATEUR"),
    "equateur-1": ("EQUATEUR", "EQUATEUR"),
    "equateur-2": ("EQUATEUR", "EQUATEUR"),
    "equateur-3": ("EQUATEUR", "EQUATEUR"),
    "equateur-4": ("EQUATEUR", "EQUATEUR"),
    "equateur-5": ("EQUATEUR", "EQUATEUR"),
    "haut-katanga": ("HAUT-KATANGA", "KATANGA"),
    "haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
    "haut-katanga-2": ("HAUT-KATANGA", "KATANGA"),
    "haut-lomami": ("HAUT-LOMAMI", "KATANGA"),
    "haut-lomami-1": ("HAUT-LOMAMI", "KATANGA"),
    "haut-lomami-2": ("HAUT-LOMAMI", "KATANGA"),
    "haut-uele": ("HAUT-UELE", "ORIENTALE"),
    "haut-uele-1": ("HAUT-UELE", "ORIENTALE"),
    "haut-uele-2": ("HAUT-UELE", "ORIENTALE"),
    "ituri": ("ITURI", "ORIENTALE"),
    "ituri-1": ("ITURI", "ORIENTALE"),
    "ituri-2": ("ITURI", "ORIENTALE"),
    "ituri-3": ("ITURI", "ORIENTALE"),
    "kasai": ("KASAI", "KASAI-OCCIDENTAL"),
    "kasai-1": ("KASAI", "KASAI-OCCIDENTAL"),
    "kasai-2": ("KASAI", "KASAI-OCCIDENTAL"),
    "kasai-ce": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    "kasai-central": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    "kasai-central-1": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    "kasai-central-2": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    "kasai-occidental": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
    "kasai-occidental-1": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
    "kasai-occidental-2": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
    "kasai-oriental": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "kasai-oriental-1": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "kasai-oriental-2": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "kasai-oriental-3": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "kasai-orientale": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "katanga": ("KATANGA", "KATANGA"),
    "katanga-1": ("KATANGA", "KATANGA"),
    "katanga-2": ("KATANGA", "KATANGA"),
    "katanga-3": ("KATANGA", "KATANGA"),
    "katanga-4": ("KATANGA", "KATANGA"),
    "kinshasa": ("KINSHASA", "KINSHASA"),
    "kinshasa-centre": ("KINSHASA", "KINSHASA"),
    "kinshasa-est": ("KINSHASA", "KINSHASA"),
    "kinshasa-funa": ("KINSHASA", "KINSHASA"),
    "kinshasa-global": ("KINSHASA", "KINSHASA"),
    "kinshasa-lukunga": ("KINSHASA", "KINSHASA"),
    "kinshasa-mont-amba": ("KINSHASA", "KINSHASA"),
    "kinshasa-ouest": ("KINSHASA", "KINSHASA"),
    "kinshasa-plateau": ("KINSHASA", "KINSHASA"),
    "kinshasa-tshangu": ("KINSHASA", "KINSHASA"),
    "kongo-central": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central-1": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central-2": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central-3": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kwango": ("KWANGO", "BANDUNDU"),
    "kwango-1": ("KWANGO", "BANDUNDU"),
    "kwango-2": ("KWANGO", "BANDUNDU"),
    "kwilu": ("KWILU", "BANDUNDU"),
    "kwilu-1": ("KWILU", "BANDUNDU"),
    "kwilu-2": ("KWILU", "BANDUNDU"),
    "kwilu-3": ("KWILU", "BANDUNDU"),
    "lomami": ("LOMAMI", "KASAI-ORIENTAL"),
    "lomami-1": ("LOMAMI", "KASAI-ORIENTAL"),
    "lomami-2": ("LOMAMI", "KASAI-ORIENTAL"),
    "lualaba": ("LUALABA", "KATANGA"),
    "lualaba-1": ("LUALABA", "KATANGA"),
    "lualaba-2": ("LUALABA", "KATANGA"),
    "lualaba-74-corrige-922a": ("LUALABA", "KATANGA"),
    "lukaya": ("KONGO-CENTRAL", "BAS-CONGO"),
    "mai-ndombe": ("MAI-NDOMBE", "BANDUNDU"),
    "mai-ndombe-1": ("MAI-NDOMBE", "BANDUNDU"),
    "mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
    "mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
    "maniema": ("MANIEMA", "MANIEMA"),
    "maniema-1": ("MANIEMA", "MANIEMA"),
    "maniema-2": ("MANIEMA", "MANIEMA"),
    "mongala": ("MONGALA", "EQUATEUR"),
    "mongala-1": ("MONGALA", "EQUATEUR"),
    "mongala-2": ("MONGALA", "EQUATEUR"),
    "nord-kivu": ("NORD-KIVU", "NORD-KIVU"),
    "nord-kivu-1": ("NORD-KIVU", "NORD-KIVU"),
    "nord-kivu-2": ("NORD-KIVU", "NORD-KIVU"),
    "nord-kivu-3": ("NORD-KIVU", "NORD-KIVU"),
    "nord-ubangi": ("NORD-UBANGI", "EQUATEUR"),
    "nord-ubangi-1": ("NORD-UBANGI", "EQUATEUR"),
    "nord-ubangi-2": ("NORD-UBANGI", "EQUATEUR"),
    "province-orientale": ("ORIENTALE", "ORIENTALE"),
    "province-orientale-1": ("ORIENTALE", "ORIENTALE"),
    "province-orientale-2": ("ORIENTALE", "ORIENTALE"),
    "province-orientale-3": ("ORIENTALE", "ORIENTALE"),
    "province-orientale-4": ("ORIENTALE", "ORIENTALE"),
    "sankuru": ("SANKURU", "KASAI-ORIENTAL"),
    "sankuru-1": ("SANKURU", "KASAI-ORIENTAL"),
    "sankuru-2": ("SANKURU", "KASAI-ORIENTAL"),
    "sud-kivu": ("SUD-KIVU", "SUD-KIVU"),
    "sud-kivu-1": ("SUD-KIVU", "SUD-KIVU"),
    "sud-kivu-2": ("SUD-KIVU", "SUD-KIVU"),
    "sud-kivu-3": ("SUD-KIVU", "SUD-KIVU"),
    "sud-ubangi": ("SUD-UBANGI", "EQUATEUR"),
    "sud-ubangi-1": ("SUD-UBANGI", "EQUATEUR"),
    "sud-ubangi-2": ("SUD-UBANGI", "EQUATEUR"),
    "tanganyika": ("TANGANYIKA", "KATANGA"),
    "tanganyika-1": ("TANGANYIKA", "KATANGA"),
    "tanganyika-2": ("TANGANYIKA", "KATANGA"),
    "tshopo": ("TSHOPO", "ORIENTALE"),
    "tshopo-1": ("TSHOPO", "ORIENTALE"),
    "tshopo-2": ("TSHOPO", "ORIENTALE"),
    "tshuapa": ("TSHUAPA", "EQUATEUR"),
    "tshuapa-1": ("TSHUAPA", "EQUATEUR"),
    "tshuapa-2": ("TSHUAPA", "EQUATEUR"),
}
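The mapper collapses district-level keys to the second tuple element (the historical province) and falls back to "AUTRES" for unknown values. A sketch, assuming the module path ners.core.utils.region_mapper:

    import pandas as pd

    from ners.core.utils.region_mapper import RegionMapper  # assumed module path

    mapper = RegionMapper()
    raw = pd.Series(["kinshasa-est", "bas-uele-2", "somewhere-else"])
    print(mapper.map(raw).tolist())  # ['KINSHASA', 'ORIENTALE', 'AUTRES']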
@@ -0,0 +1,41 @@
import json
import logging
from typing import Dict, Any

from ners.core.config.pipeline_config import PipelineConfig


class StateManager:
    """Manage pipeline state and checkpoints"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.checkpoints_dir = self.config.paths.checkpoints_dir

    def save_state(self, state: Dict[str, Any], state_name: str) -> None:
        """Save pipeline state"""
        self.checkpoints_dir.mkdir(parents=True, exist_ok=True)
        state_file = self.checkpoints_dir / f"{state_name}.json"

        with open(state_file, "w") as f:
            json.dump(state, f, indent=2, default=str)

        logging.debug(f"Saved state to {state_file}")

    def load_state(self, state_name: str) -> Dict[str, Any]:
        """Load pipeline state"""
        state_file = self.checkpoints_dir / f"{state_name}.json"

        if not state_file.exists():
            return {}

        with open(state_file, "r") as f:
            return json.load(f)

    def clear_state(self, state_name: str) -> None:
        """Clear pipeline state"""
        state_file = self.checkpoints_dir / f"{state_name}.json"

        if state_file.exists():
            state_file.unlink()
            logging.info(f"Cleared state: {state_name}")
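A checkpoint round-trip sketch, assuming the module path ners.core.utils.state_manager:

    from ners.core.config import setup_config
    from ners.core.utils.state_manager import StateManager  # assumed module path

    sm = StateManager(setup_config(env="development"))
    sm.save_state({"processed_batches": 12, "failed_batches": []}, "ner_annotation")
    print(sm.load_state("ner_annotation")["processed_batches"])  # 12
    sm.clear_state("ner_annotation")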
@@ -0,0 +1,37 @@
from typing import Optional, Dict

import pandas as pd


class TextCleaner:
    """Reusable text cleaning utilities"""

    def __init__(self, patterns: Optional[Dict[str, str]] = None):
        self.patterns = patterns or {
            "null_bytes": "\x00",
            "non_breaking_spaces": "\u00a0",
            "multiple_spaces": r" +",
            "extra_whitespace": r"\s+",
        }

    def clean_text_series(self, series: pd.Series) -> pd.Series:
        """Clean a pandas Series of text data"""
        cleaned = series.astype(str)

        # Apply cleaning patterns: the whitespace patterns are regular
        # expressions, the character patterns are literal replacements
        for pattern_name, pattern in self.patterns.items():
            is_regex = pattern_name in ("multiple_spaces", "extra_whitespace")
            cleaned = cleaned.str.replace(pattern, " ", regex=is_regex)

        return cleaned.str.strip().str.lower()

    def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean all text columns in a DataFrame"""
        df = df.copy()
        columns = df.select_dtypes(include=["object", "string"]).columns
        for col in columns:
            df[col] = self.clean_text_series(df[col])

        return df
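The cleaner normalizes control characters and whitespace, then lower-cases. A sketch, assuming the module path ners.core.utils.text_cleaner:

    import pandas as pd

    from ners.core.utils.text_cleaner import TextCleaner  # assumed module path

    raw = pd.Series(["  KASONGO\u00a0 Ilunga  ", "MBUYI\x00 Tshiala"])
    print(TextCleaner().clean_text_series(raw).tolist())
    # ['kasongo ilunga', 'mbuyi tshiala']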
Executable
+75
@@ -0,0 +1,75 @@
#!.venv/bin/python3
import logging

from ners.core.utils.data_loader import DataLoader
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.pipeline import Pipeline
from ners.processing.steps.data_cleaning_step import DataCleaningStep
from ners.processing.steps.data_selection_step import DataSelectionStep
from ners.processing.steps.data_splitting_step import DataSplittingStep
from ners.processing.steps.llm_annotation_step import LLMAnnotationStep
from ners.processing.steps.ner_annotation_step import NERAnnotationStep
from ners.processing.steps.feature_extraction_step import FeatureExtractionStep


def create_pipeline(config) -> Pipeline:
    batch_config = BatchConfig(
        batch_size=config.processing.batch_size,
        max_workers=config.processing.max_workers,
        checkpoint_interval=config.processing.checkpoint_interval,
        use_multiprocessing=config.processing.use_multiprocessing,
    )

    pipeline = Pipeline(batch_config)
    steps = [
        DataCleaningStep(config),
        FeatureExtractionStep(config),
        DataSelectionStep(config),
        NERAnnotationStep(config),
        LLMAnnotationStep(config),
    ]

    for stage in config.stages:
        for step in steps:
            if step.name == stage:
                pipeline.add_step(step)

    return pipeline


def run_pipeline(config) -> int:
    try:
        logging.info(f"Starting pipeline: {config.name} v{config.version}")

        # Load input data
        input_file_path = config.paths.get_data_path(config.data.input_file)
        if not input_file_path.exists():
            logging.error(f"Input file not found: {input_file_path}")
            return 1

        data_loader = DataLoader(config)
        data_splitter = DataSplittingStep(config)
        logging.info(f"Loading data from {input_file_path}")
        df = data_loader.load_csv_complete(input_file_path)
        logging.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")

        # Create and run pipeline
        pipeline = create_pipeline(config)
        data_splitter.split(pipeline.run(df))

        # Show completion statistics
        progress = pipeline.get_progress()
        logging.info("=== Pipeline Completion Summary ===")
        for step_name, stats in progress.items():
            logging.info(
                f"{step_name}: {stats['completion_percentage']:.1f}% "
                f"({stats['processed_batches']}/{stats['total_batches']} batches)"
            )
            if stats["failed_batches"] > 0:
                logging.warning(f"  {stats['failed_batches']} failed batches")

        logging.info("Pipeline completed successfully")
        return 0

    except Exception as e:
        logging.error(f"Pipeline failed: {e}", exc_info=True)
        return 1
Executable
+14
@@ -0,0 +1,14 @@
#!.venv/bin/python3
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor


def status(*, detailed: bool = False) -> None:
    PipelineMonitor().print_status(detailed=detailed)


def clean_step(step: str, *, keep_last: int = 1) -> None:
    PipelineMonitor().clean_step_checkpoints(step, keep_last)


def reset_step(step: str) -> None:
    PipelineMonitor().reset_step(step)
Executable
+80
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
import logging
import os
import traceback
from pathlib import Path

from ners.core.config import PipelineConfig
from ners.processing.ner.name_builder import NameBuilder
from ners.processing.ner.name_engineering import NameEngineering
from ners.processing.ner.name_model import NameModel


def feature(config: PipelineConfig):
    NameEngineering(config).compute()


def build(config: PipelineConfig):
    NameBuilder(config).build()


def train(config: PipelineConfig):
    name_model = NameModel(config)

    data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
    if not data_path.exists():
        logging.info("NER data not found. Building dataset first...")
        build(config)

    name_model.create_blank_model("fr")
    data = name_model.load_data(str(data_path))

    split_idx = int(len(data) * 0.9)
    train_data, eval_data = data[:split_idx], data[split_idx:]

    logging.info(
        f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}"
    )
    name_model.train(
        data=train_data,
        epochs=config.processing.epochs,
        batch_size=config.processing.batch_size,
        dropout_rate=0.3,
    )
    evaluation_results = name_model.evaluate(eval_data)

    model_path = name_model.save()
    logging.info(f"Model saved to: {model_path}")
    print(f"Evaluation results: {evaluation_results}")


def run_pipeline(config: PipelineConfig, reset: bool = False):
    if not reset and os.path.exists(
        config.paths.get_data_path(config.data.output_files["engineered"])
    ):
        logging.info("Step 1: Feature engineering already done.")
    else:
        logging.info("Step 1: Running feature engineering")
        feature(config)

    if not reset and os.path.exists(
        config.paths.get_data_path(config.data.output_files["ner_data"])
    ):
        logging.info("Step 2: NER dataset already built.")
    else:
        logging.info("Step 2: Building NER dataset")
        build(config)

    logging.info("Step 3: Training NER Model")
    train(config)

    return 0


def main():
    try:
        logging.error("This module is no longer a CLI. Use 'ners ner ...' instead.")
        return 1
    except Exception:
        traceback.print_exc()
        return 1
@@ -0,0 +1,13 @@
from dataclasses import dataclass


@dataclass
class BatchConfig:
    """Configuration for batch processing"""

    batch_size: int = 1000
    max_workers: int = 4
    checkpoint_interval: int = 5  # Save checkpoint every N batches
    use_multiprocessing: bool = (
        False  # Use ProcessPoolExecutor instead of ThreadPoolExecutor
    )
@@ -0,0 +1,173 @@
import logging
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from typing import Iterator

import pandas as pd

from ners.processing.batch.batch_config import BatchConfig
from ners.processing.batch.memory_monitor import MemoryMonitor
from ners.processing.steps import PipelineStep


class BatchProcessor:
    """Handles batch processing with concurrency and checkpointing"""

    def __init__(self, config: BatchConfig):
        self.config = config
        self.memory_monitor = MemoryMonitor()

    def create_batches(self, df: pd.DataFrame) -> Iterator[tuple[pd.DataFrame, int]]:
        """Create batches from DataFrame without unnecessary copies"""
        total_rows = len(df)
        batch_size = self.config.batch_size

        for i in range(0, total_rows, batch_size):
            batch = df.iloc[i : i + batch_size]
            batch_id = i // batch_size
            yield batch, batch_id

    def process_sequential(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
        """Memory-optimized sequential processing"""
        results = []
        memory_threshold_mb = 1000  # Clean memory when usage exceeds 1 GB

        for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
            if step.batch_exists(batch_id):
                logging.info(
                    f"Batch {batch_id} already processed, loading from checkpoint"
                )
                processed_batch = step.load_batch(batch_id)
            else:
                try:
                    # Only copy if the processing step requires mutation
                    if step.requires_batch_mutation:
                        batch_copy = batch.copy()
                        processed_batch = step.process_batch(batch_copy, batch_id)
                    else:
                        processed_batch = step.process_batch(batch, batch_id)

                    step.save_batch(processed_batch, batch_id)
                    step.state.processed_batches += 1
                except Exception as e:
                    logging.error(f"Failed to process batch {batch_id}: {e}")
                    step.state.failed_batches.append(batch_id)
                    continue

            results.append(processed_batch)

            # Memory management
            if batch_num % self.config.checkpoint_interval == 0:
                current_memory = self.memory_monitor.get_memory_usage_mb()
                if current_memory > memory_threshold_mb:
                    logging.info(f"Memory cleanup triggered at {current_memory:.1f} MB")
                    self.memory_monitor.cleanup_memory()

            # Save state periodically
            if batch_id % self.config.checkpoint_interval == 0:
                step.save_state()

        # Final memory cleanup before concatenation
        self.memory_monitor.cleanup_memory()
        self.memory_monitor.log_memory_usage("before_concat")

        result = self._safe_concat(results) if results else pd.DataFrame()

        # Final cleanup
        del results
        self.memory_monitor.cleanup_memory()
        self.memory_monitor.log_memory_usage("sequential_complete")

        return result

    def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
        """Memory-optimized concurrent processing"""
        executor_class = (
            ProcessPoolExecutor
            if self.config.use_multiprocessing
            else ThreadPoolExecutor
        )
        results = {}

        with executor_class(max_workers=self.config.max_workers) as executor:
            # Submit all batches
            future_to_batch = {}
            for batch, batch_id in self.create_batches(df):
                if step.batch_exists(batch_id):
                    logging.info(
                        f"Batch {batch_id} already processed, loading from checkpoint"
                    )
                    results[batch_id] = step.load_batch(batch_id)
                else:
                    # Only copy if necessary for concurrent processing
                    batch_copy = batch.copy() if step.requires_batch_mutation else batch
                    future = executor.submit(step.process_batch, batch_copy, batch_id)
                    future_to_batch[future] = (batch_id, batch)

            # Collect results as they complete
            for future in as_completed(future_to_batch):
                batch_id, batch = future_to_batch[future]
                try:
                    processed_batch = future.result()
                    step.save_batch(processed_batch, batch_id)
                    results[batch_id] = processed_batch
                    step.state.processed_batches += 1
                    logging.info(f"Completed batch {batch_id}")
                except Exception as e:
                    logging.error(f"Failed to process batch {batch_id}: {e}")
                    step.state.failed_batches.append(batch_id)

        # Memory-efficient reassembly
        ordered_results = []
        for batch_id in sorted(results.keys()):
            ordered_results.append(results[batch_id])

        step.save_state()

        # Cleanup before concat
        del results
        self.memory_monitor.cleanup_memory()

        result = (
            self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
        )

        # Final cleanup
        del ordered_results
        self.memory_monitor.cleanup_memory()

        return result

    def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
        """Process data using the configured strategy"""
        step.state.total_batches = (
            len(df) + self.config.batch_size - 1
        ) // self.config.batch_size
        step.load_state()

        logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
        self.memory_monitor.log_memory_usage("process_start")

        if self.config.max_workers == 1:
            result = self.process_sequential(step, df)
        else:
            result = self.process_concurrent(step, df)

        self.memory_monitor.log_memory_usage("process_complete")
        return result

    def _safe_concat(self, dfs: list) -> pd.DataFrame:
        """Memory-safe concatenation with monitoring"""
        if not dfs:
            return pd.DataFrame()

        memory = self.memory_monitor.get_memory_usage_mb()
        logging.info(f"Starting concat of {len(dfs)} DataFrames at {memory:.1f} MB")

        # Use copy=False to avoid unnecessary copying during concat
        result = pd.concat(dfs, ignore_index=True, copy=False)

        # Monitor memory after concat
        memory = self.memory_monitor.get_memory_usage_mb()
        logging.info(f"Concat complete. Memory: {memory:.1f} MB")

        return result
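The batching math uses ceiling division, so the last batch may be smaller. A sketch of create_batches in isolation, assuming the module path ners.processing.batch.batch_processor:

    import pandas as pd

    from ners.processing.batch.batch_config import BatchConfig
    from ners.processing.batch.batch_processor import BatchProcessor  # assumed path

    bp = BatchProcessor(BatchConfig(batch_size=1000, max_workers=1))
    df = pd.DataFrame({"name": [f"n{i}" for i in range(2500)]})

    for batch, batch_id in bp.create_batches(df):
        print(batch_id, len(batch))  # 0:1000, 1:1000, 2:500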
@@ -0,0 +1,25 @@
import gc
import logging

import psutil


class MemoryMonitor:
    """Monitor and manage memory usage during batch processing"""

    @staticmethod
    def get_memory_usage_mb() -> float:
        """Get current memory usage in MB"""
        process = psutil.Process()
        return process.memory_info().rss / 1024 / 1024

    @staticmethod
    def cleanup_memory():
        """Force garbage collection"""
        gc.collect()

    @staticmethod
    def log_memory_usage(step_name: str):
        """Log current memory usage"""
        memory_mb = MemoryMonitor.get_memory_usage_mb()
        logging.info(f"Memory usage after {step_name}: {memory_mb:.1f} MB")
@@ -0,0 +1,196 @@
import json
import logging
import shutil
from datetime import datetime
from typing import Optional, Dict

from ners.core.config.config_manager import ConfigManager
from ners.core.config.project_paths import ProjectPaths


class PipelineMonitor:
    """Monitor and manage pipeline execution"""

    def __init__(self, paths: Optional[ProjectPaths] = None):
        if paths is None:
            # Use default configuration if none provided
            config_manager = ConfigManager()
            paths = config_manager.default_paths

        self.paths = paths
        self.checkpoint_dir = paths.checkpoints_dir
        self.steps = [
            "data_cleaning",
            "data_selection",
            "feature_extraction",
            "ner_annotation",
            "llm_annotation",
            "data_splitting",
        ]

    def get_step_status(self, step_name: str) -> Dict:
        """Get status of a specific pipeline step"""
        step_dir = self.checkpoint_dir / step_name
        state_file = step_dir / "pipeline_state.json"

        if not state_file.exists():
            return {
                "step": step_name,
                "status": "not_started",
                "processed_batches": 0,
                "total_batches": 0,
                "failed_batches": 0,
                "completion_percentage": 0.0,
            }

        try:
            with open(state_file, "r") as f:
                state = json.load(f)

            processed = state.get("processed_batches", 0)
            total = state.get("total_batches", 0)
            failed = len(state.get("failed_batches", []))

            if total == 0:
                completion = 0.0
                status = "not_started"
            elif processed >= total:
                completion = 100.0
                status = "completed" if failed == 0 else "completed_with_errors"
            else:
                completion = (processed / total) * 100
                status = "in_progress"

            return {
                "step": step_name,
                "status": status,
                "processed_batches": processed,
                "total_batches": total,
                "failed_batches": failed,
                "completion_percentage": completion,
                "last_checkpoint": state.get("last_checkpoint"),
                "failed_batch_ids": state.get("failed_batches", []),
            }

        except Exception as e:
            logging.error(f"Error reading state for {step_name}: {e}")
            return {"step": step_name, "status": "error", "error": str(e)}

    def get_pipeline_status(self) -> Dict:
        """Get overall pipeline status"""
        step_statuses = {}
        overall_status = "not_started"
        total_completion = 0.0

        for step in self.steps:
            status = self.get_step_status(step)
            step_statuses[step] = status

            if status["status"] == "error":
                overall_status = "error"
            elif status["status"] == "in_progress":
                overall_status = "in_progress"
            elif status["status"] == "completed_with_errors":
                overall_status = "completed_with_errors"

            total_completion += status.get("completion_percentage", 0)

        avg_completion = total_completion / len(self.steps)

        if avg_completion >= 100 and overall_status not in [
            "error",
            "completed_with_errors",
        ]:
            overall_status = "completed"

        return {
            "overall_status": overall_status,
            "overall_completion": avg_completion,
            "steps": step_statuses,
            "timestamp": datetime.now().isoformat(),
        }

    def print_status(self, detailed: bool = False):
        """Print pipeline status in a human-readable format"""
        status = self.get_pipeline_status()

        print("\n=== Pipeline Status ===")
        print(f"Overall Status: {status['overall_status'].upper()}")
        print(f"Overall Completion: {status['overall_completion']:.1f}%")
        print(f"Last Updated: {status['timestamp']}")
        print()

        for step_name, step_status in status["steps"].items():
            print(f"{step_name.replace('_', ' ').title()}:")
            print(f"  Status: {step_status['status']}")
            print(f"  Progress: {step_status['completion_percentage']:.1f}%")
            print(
                f"  Batches: {step_status['processed_batches']}/{step_status['total_batches']}"
            )

            if step_status["failed_batches"] > 0:
                print(f"  Failed Batches: {step_status['failed_batches']}")

            if detailed and "failed_batch_ids" in step_status:
                print(f"  Failed Batch IDs: {step_status['failed_batch_ids']}")

            print()

    def count_checkpoint_files(self) -> Dict:
        """Count checkpoint files for each step"""
        counts = {}
        total_size = 0

        for step in self.steps:
            step_dir = self.checkpoint_dir / step
            if step_dir.exists():
                csv_files = list(step_dir.glob("*.csv"))
                step_size = sum(f.stat().st_size for f in csv_files)
                counts[step] = {
                    "files": len(csv_files),
                    "size_mb": step_size / (1024 * 1024),
                }
                total_size += step_size
            else:
                counts[step] = {"files": 0, "size_mb": 0}

        counts["total_size_mb"] = total_size / (1024 * 1024)
        return counts

    def clean_step_checkpoints(self, step_name: str, keep_last: int = 1):
        """Clean checkpoint files for a specific step"""
        step_dir = self.checkpoint_dir / step_name

        if not step_dir.exists():
            logging.info(f"No checkpoints found for {step_name}")
            return

        csv_files = sorted(step_dir.glob("batch_*.csv"))

        if len(csv_files) <= keep_last:
            logging.info(
                f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all"
            )
            return

        files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files

        for file_path in files_to_delete:
            try:
                file_path.unlink()
                logging.info(f"Deleted {file_path}")
            except Exception as e:
                logging.error(f"Failed to delete {file_path}: {e}")

    def reset_step(self, step_name: str):
        """Reset a pipeline step by removing its checkpoints and state"""
        step_dir = self.checkpoint_dir / step_name

        if step_dir.exists():
            try:
                shutil.rmtree(step_dir)
                logging.info(f"Reset step: {step_name}")
            except Exception as e:
                logging.error(f"Failed to reset {step_name}: {e}")
        else:
            logging.info(f"Step {step_name} has no checkpoints to reset")
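A quick status check could look like this sketch (paths resolved from the default config; output keys as defined above):

# Print a human-readable progress report and inspect checkpoint disk usage.
monitor = PipelineMonitor()
monitor.print_status(detailed=True)
sizes = monitor.count_checkpoint_files()
print(f"Checkpoints on disk: {sizes['total_size_mb']:.1f} MB")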
@@ -0,0 +1,94 @@
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict

import pandas as pd

from ners.processing.steps.feature_extraction_step import NameCategory


class BaseNameFormatter(ABC):
    """
    Base class for name formatting transformations.
    Contains common logic for NER tagging and attribute computation.
    """

    def __init__(
        self, connectors: List[str] = None, additional_surnames: List[str] = None
    ):
        self.connectors = connectors or ["wa", "ya", "ka", "ba"]
        self.additional_surnames = additional_surnames or [
            "jean",
            "paul",
            "marie",
            "joseph",
            "pierre",
            "claude",
            "andre",
            "michel",
            "robert",
        ]

    @classmethod
    def parse_native_components(cls, native_str: str) -> List[str]:
        """Parse native name string into individual components"""
        if pd.isna(native_str) or not native_str:
            return []
        return native_str.strip().split()

    def create_ner_tags(
        self, text: str, native_parts: List[str], surname: str
    ) -> List[Tuple[int, int, str]]:
        """Create NER entity tags for transformed text"""
        entities = []
        current_pos = 0
        words = text.split()

        for word in words:
            start_pos = current_pos
            end_pos = current_pos + len(word)

            # Determine tag based on word content
            if word in native_parts or any(
                connector in word for connector in self.connectors
            ):
                tag = "NATIVE"
            elif word == surname or word in self.additional_surnames:
                tag = "SURNAME"
            else:
                # Check if it's a compound native word or a new surname
                if any(part in word for part in native_parts):
                    tag = "NATIVE"
                else:
                    tag = "SURNAME"

            entities.append((start_pos, end_pos, tag))
            current_pos = end_pos + 1  # +1 for space

        return entities

    @classmethod
    def compute_numeric_features(cls, name: str) -> Dict:
        """Compute all derived attributes for the transformed name"""
        words_count = len(name.split()) if name else 0
        length = len(name) if name else 0

        return {
            "words": words_count,
            "length": length,
            "identified_category": (
                NameCategory.SIMPLE.value
                if words_count == 3
                else NameCategory.COMPOSE.value
            ),
        }

    @abstractmethod
    def transform(self, row: pd.Series) -> Dict:
        """Transform a row according to the specific format rules"""
        pass

    @property
    @abstractmethod
    def transformation_type(self) -> str:
        """Return the transformation type identifier"""
        pass
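To illustrate the offset arithmetic in create_ner_tags, a made-up name (values are illustrative, not drawn from the dataset; default connectors assumed):

# Each word is tagged by membership tests; offsets are character spans.
# "Mukendi wa Ilunga Kabongo" with native_parts ["Mukendi", "Ilunga"]
# and surname "Kabongo" yields ("wa" matches a connector substring):
# [(0, 7, "NATIVE"), (8, 10, "NATIVE"), (11, 17, "NATIVE"), (18, 25, "SURNAME")]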
@@ -0,0 +1,38 @@
import random
from typing import Dict

import pandas as pd

from ners.processing.ner.formats import BaseNameFormatter


class ConnectorFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
        native_parts = self.parse_native_components(row["probable_native"])
        surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
        connector = random.choice(self.connectors)

        # Connect native parts with a random connector
        if len(native_parts) > 1:
            connected_native = f" {connector} ".join(native_parts)
            full_name = f"{connected_native} {surname}".strip()
        else:
            connected_native = (
                f"{row['probable_native']} {connector} {row['probable_native']}".strip()
            )
            full_name = f"{connected_native} {surname}".strip()

        return {
            "name": full_name,
            "probable_native": connected_native,
            "identified_name": connected_native,
            "probable_surname": surname,
            "identified_surname": surname,
            "ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
            "transformation_type": self.transformation_type,
            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
        return "connector_added"
@@ -0,0 +1,36 @@
import random
from typing import Dict

import pandas as pd

from ners.processing.ner.formats import BaseNameFormatter


class ExtendedSurnameFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
        native_parts = self.parse_native_components(row["probable_native"])
        original_surname = (
            row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
        )

        # Add a random additional surname
        additional_surname = random.choice(self.additional_surnames)
        combined_surname = f"{additional_surname} {original_surname}".strip()
        full_name = f"{row['probable_native']} {combined_surname}".strip()

        return {
            "name": full_name,
            "probable_native": row["probable_native"],
            "identified_name": row["probable_native"],
            "probable_surname": combined_surname,
            "identified_surname": combined_surname,
            "ner_entities": str(
                self.create_ner_tags(full_name, native_parts, combined_surname)
            ),
            "transformation_type": self.transformation_type,
            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
        return "extended_surname"
@@ -0,0 +1,28 @@
from typing import Dict

import pandas as pd

from ners.processing.ner.formats import BaseNameFormatter


class NativeOnlyFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
        native_parts = self.parse_native_components(row["probable_native"])

        # Only native components
        full_name = row["probable_native"]

        return {
            "name": full_name,
            "probable_native": row["probable_native"],
            "identified_name": row["probable_native"],
            "probable_surname": "",
            "identified_surname": "",
            "ner_entities": str(self.create_ner_tags(full_name, native_parts, "")),
            "transformation_type": self.transformation_type,
            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
        return "native_only"
@@ -0,0 +1,29 @@
from typing import Dict

import pandas as pd

from ners.processing.ner.formats import BaseNameFormatter


class OriginalFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
        native_parts = self.parse_native_components(row["probable_native"])
        surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""

        # Keep original order: native components + surname
        full_name = f"{row['probable_native']} {surname}".strip()

        return {
            "name": full_name,
            "probable_native": row["probable_native"],
            "identified_name": row["probable_native"],
            "probable_surname": surname,
            "identified_surname": surname,
            "ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
            "transformation_type": self.transformation_type,
            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
        return "original"
@@ -0,0 +1,29 @@
from typing import Dict

import pandas as pd

from ners.processing.ner.formats import BaseNameFormatter


class PositionFlippedFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
        native_parts = self.parse_native_components(row["probable_native"])
        surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""

        # Flip order: surname + native components
        full_name = f"{surname} {row['probable_native']}".strip()

        return {
            "name": full_name,
            "probable_native": row["probable_native"],
            "identified_name": row["probable_native"],
            "probable_surname": surname,
            "identified_surname": surname,
            "ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
            "transformation_type": self.transformation_type,
            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
        return "position_flipped"
@@ -0,0 +1,34 @@
from typing import Dict

import pandas as pd

from ners.processing.ner.formats import BaseNameFormatter


class ReducedNativeFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
        native_parts = self.parse_native_components(row["probable_native"])
        surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""

        # Keep only the first native component + surname
        reduced_native = (
            native_parts[0] if len(native_parts) > 1 else row["probable_native"]
        )
        full_name = f"{reduced_native} {surname}".strip()

        return {
            "name": full_name,
            "probable_native": reduced_native,
            "identified_name": reduced_native,
            "probable_surname": surname,
            "identified_surname": surname,
            "ner_entities": str(
                self.create_ner_tags(full_name, [reduced_native], surname)
            ),
            "transformation_type": self.transformation_type,
            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
        return "reduced_native"
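All six formatters share one calling convention; a hedged sketch with made-up column values (pandas imported as in the modules above):

# Apply one formatter to a single record (illustrative values only).
row = pd.Series({"probable_native": "Mukendi Ilunga", "probable_surname": "Kabongo"})
out = PositionFlippedFormatter().transform(row)
# out["name"] == "Kabongo Mukendi Ilunga"
# out["transformation_type"] == "position_flipped"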
@@ -0,0 +1,87 @@
import json
import logging

import spacy
from spacy.tokens import DocBin

from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from .name_tagger import NameTagger


class NameBuilder:
    def __init__(self, config: PipelineConfig):
        config = config.model_copy(deep=True)
        config.data.max_dataset_size = 1_000_000
        config.data.balance_by_sex = True

        self.config = config
        self.data_loader = DataLoader(config)
        self.tagger = NameTagger()

    def build(self) -> int:
        filepath = self.config.paths.get_data_path(
            self.config.data.output_files["engineered"]
        )
        df = self.data_loader.load_csv_complete(filepath)
        df = df[["name", "ner_tagged", "ner_entities"]]

        # Filter early
        ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
        if ner_df.empty:
            logging.error("No NER tagged data found")
            return 1

        total_rows = len(df)
        del df  # No need to keep in memory

        logging.info(f"Found {len(ner_df)} NER tagged entries")
        nlp = spacy.blank("fr")

        # Use NameTagger for parsing and validation
        parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
        validated_entities = self.tagger.validate_entities(
            ner_df["name"], parsed_entities
        )

        # Drop rows with no valid entities
        mask = validated_entities.map(bool)
        ner_df = ner_df.loc[mask]
        validated_entities = validated_entities.loc[mask]

        if ner_df.empty:
            logging.error("No valid training examples after validation")
            return 1

        # Prepare training data
        training_data = list(
            zip(
                ner_df["name"].tolist(),
                [{"entities": ents} for ents in validated_entities],
            )
        )

        # Use NameTagger to create the spaCy DocBin
        docs = self.tagger.create_docs(
            nlp, ner_df["name"].tolist(), validated_entities.tolist()
        )
        doc_bin = DocBin(docs=docs)

        # Save
        json_path = self.config.paths.get_data_path(
            self.config.data.output_files["ner_data"]
        )
        spacy_path = self.config.paths.get_data_path(
            self.config.data.output_files["ner_spacy"]
        )

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
        doc_bin.to_disk(spacy_path)

        logging.info(
            f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}"
        )
        logging.info(f"Saved NER JSON to {json_path}")
        logging.info(f"Saved NER spaCy to {spacy_path}")
        return 0
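Invocation is a single call; the return value mirrors a process exit code (a sketch, with cfg assumed to be a loaded PipelineConfig):

# Build the JSON + DocBin training artifacts from the engineered CSV.
exit_code = NameBuilder(cfg).build()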
@@ -0,0 +1,142 @@
import gc
import logging
import random

import numpy as np
import pandas as pd
from tqdm import tqdm

from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.ner.formats.connectors_format import ConnectorFormatter
from ners.processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
from ners.processing.ner.formats.native_only_format import NativeOnlyFormatter
from ners.processing.ner.formats.original_format import OriginalFormatter
from ners.processing.ner.formats.position_flipped_format import PositionFlippedFormatter
from ners.processing.ner.formats.reduced_native_format import ReducedNativeFormatter


class NameEngineering:
    """
    Feature engineering for the NER dataset to prevent position-based learning
    and encourage learning of sequence characteristics.
    """

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.data_loader = DataLoader(config)
        self.connectors = ["wa", "ya", "ka", "ba", "la"]
        self.additional_surnames = [
            "jean",
            "paul",
            "marie",
            "joseph",
            "pierre",
            "claude",
            "andre",
            "michel",
            "robert",
        ]

        random.seed(self.config.data.random_seed)
        np.random.seed(self.config.data.random_seed)

        # Initialize format classes
        self.formatters = {
            "original": OriginalFormatter(self.connectors, self.additional_surnames),
            "native_only": NativeOnlyFormatter(
                self.connectors, self.additional_surnames
            ),
            "position_flipped": PositionFlippedFormatter(
                self.connectors, self.additional_surnames
            ),
            "reduced_native": ReducedNativeFormatter(
                self.connectors, self.additional_surnames
            ),
            "connector_added": ConnectorFormatter(
                self.connectors, self.additional_surnames
            ),
            "extended_surname": ExtendedSurnameFormatter(
                self.connectors, self.additional_surnames
            ),
        }

    def load_data(self) -> pd.DataFrame:
        """Load and filter NER-tagged data from the CSV file"""
        filepath = self.config.paths.get_data_path(
            self.config.data.output_files["featured"]
        )
        df = self.data_loader.load_csv_complete(filepath)

        # Filter only NER-tagged rows
        ner_data = df[df["ner_tagged"] == 1].copy()
        logging.info(
            f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records"
        )

        return ner_data

    def compute(self) -> None:
        logging.info("Applying feature engineering transformations...")
        input_filepath = self.config.paths.get_data_path(
            self.config.data.output_files["featured"]
        )
        output_filepath = self.config.paths.get_data_path(
            self.config.data.output_files["engineered"]
        )

        df = self.data_loader.load_csv_complete(input_filepath)
        ner_df = df[df["ner_tagged"] == 1].copy()
        logging.info(
            f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records"
        )

        del df  # No need to keep in memory
        gc.collect()

        ner_df = ner_df.sample(
            frac=1, random_state=self.config.data.random_seed
        ).reset_index(drop=True)
        total_rows = len(ner_df)

        # Calculate split points
        split_25_1 = int(total_rows * 0.25)
        split_25_2 = int(total_rows * 0.50)
        split_25_3 = int(total_rows * 0.75)
        split_10_1 = int(total_rows * 0.85)
        split_10_2 = int(total_rows * 0.95)

        # Define transformation groups
        groups = [
            (0, split_25_1, "original"),  # First 25%: original format
            (split_25_1, split_25_2, "native_only"),  # Second 25%: remove surname
            (split_25_2, split_25_3, "position_flipped"),  # Third 25%: flip positions
            (
                split_25_3,
                split_10_1,
                "reduced_native",
            ),  # Next 10%: reduce native components
            (split_10_1, split_10_2, "connector_added"),  # Next 10%: add connectors
            (split_10_2, total_rows, "extended_surname"),  # Last 5%: extend surnames
        ]

        for start, end, trans_type in groups:
            logging.info(f"Group {trans_type}: {start} to {end} ({end - start} rows)")

        # Process each group
        rows = []
        for start, end, formatter_key in groups:
            formatter = self.formatters[formatter_key]

            for idx in tqdm(range(start, end), desc=f"Processing {formatter_key}"):
                row = ner_df.iloc[idx]
                transformed = formatter.transform(row)

                # Keep original columns and add transformed ones
                new_row = row.to_dict()
                new_row.update(transformed)
                rows.append(new_row)

        self.data_loader.save_csv(pd.DataFrame(rows), output_filepath)
        logging.info(f"Engineered dataset saved to {output_filepath}")
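To make the split arithmetic concrete, with 1,000 shuffled rows the split points land at 250 / 500 / 750 / 850 / 950, so the groups cover 25% + 25% + 25% + 10% + 10% + 5% of the data (an illustrative count, not a dataset figure):

# Illustrative group boundaries for total_rows = 1_000.
# original:          rows    0-249   (250 rows, 25%)
# native_only:       rows  250-499   (250 rows, 25%)
# position_flipped:  rows  500-749   (250 rows, 25%)
# reduced_native:    rows  750-849   (100 rows, 10%)
# connector_added:   rows  850-949   (100 rows, 10%)
# extended_surname:  rows  950-999   ( 50 rows,  5%)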
@@ -0,0 +1,430 @@
import ast
import json
import logging
import os
import random
from pathlib import Path
from typing import Dict, Any, List, Tuple

import spacy
from spacy.training import Example
from spacy.util import minibatch
from tqdm import tqdm

from ners.core.config.pipeline_config import PipelineConfig


class NameModel:
    """NER model trainer using spaCy for DRC name entity recognition"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.nlp = None
        self.ner = None
        self.model_path = None
        self.training_stats = {}
        self.evaluation_stats = {}

    def create_blank_model(self, language: str = "fr") -> None:
        """Create a blank spaCy model with an NER pipeline"""
        logging.info(f"Creating blank {language} model for NER training")

        # Prefer GPU for spaCy if available (falls back to CPU automatically)
        try:
            if spacy.prefer_gpu():
                logging.info("spaCy GPU enabled (cupy) for NER training")
            else:
                logging.info("spaCy running on CPU")
        except Exception as e:
            logging.debug(f"spaCy GPU selection skipped: {e}")

        # Create blank model - the French tokenizer works well for DRC names
        self.nlp = spacy.blank(language)

        # Add NER pipeline component
        if "ner" not in self.nlp.pipe_names:
            self.ner = self.nlp.add_pipe("ner")
        else:
            self.ner = self.nlp.get_pipe("ner")

        # Add our custom labels
        self.ner.add_label("NATIVE")
        self.ner.add_label("SURNAME")

        logging.info("Blank model created with NATIVE and SURNAME labels")

    @classmethod
    def load_data(cls, data_path: str) -> List[Tuple[str, Dict]]:
        """Load training data from a JSON file - compatible with the NameTagger output format"""
        if not os.path.exists(data_path):
            raise FileNotFoundError(f"Training data not found at {data_path}")

        logging.info(f"Loading training data from {data_path}")

        with open(data_path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        # Validate and clean training data
        valid_data = []
        skipped_count = 0

        for i, item in enumerate(raw_data):
            try:
                if not isinstance(item, (list, tuple)) or len(item) != 2:
                    logging.warning(
                        f"Skipping invalid training example format at index {i}: {item}"
                    )
                    skipped_count += 1
                    continue

                text, annotations = item

                # Validate text
                if not isinstance(text, str) or not text.strip():
                    logging.warning(f"Skipping invalid text at index {i}: {repr(text)}")
                    skipped_count += 1
                    continue

                # Handle different annotation formats from the NameTagger
                if not isinstance(annotations, dict) or "entities" not in annotations:
                    logging.warning(
                        f"Skipping invalid annotations at index {i}: {annotations}"
                    )
                    skipped_count += 1
                    continue

                entities_raw = annotations["entities"]

                # Parse entities - handle both string and list formats from the tagger
                if isinstance(entities_raw, str):
                    # String format from the tagger: "[(0, 6, 'NATIVE'), ...]"
                    try:
                        entities = ast.literal_eval(entities_raw)
                        if not isinstance(entities, list):
                            logging.warning(
                                f"Parsed entities is not a list at index {i}: {entities}"
                            )
                            skipped_count += 1
                            continue
                    except (ValueError, SyntaxError) as e:
                        logging.warning(
                            f"Failed to parse entity string at index {i}: {entities_raw} ({e})"
                        )
                        skipped_count += 1
                        continue
                elif isinstance(entities_raw, list):
                    # Already in list format
                    entities = entities_raw
                else:
                    logging.warning(
                        f"Skipping invalid entities format at index {i}: {entities_raw}"
                    )
                    skipped_count += 1
                    continue

                # Validate each entity
                valid_entities = []
                for entity in entities:
                    if not isinstance(entity, (list, tuple)) or len(entity) != 3:
                        logging.warning(
                            f"Skipping invalid entity format in '{text}': {entity}"
                        )
                        continue

                    start, end, label = entity

                    # Validate entity components
                    if (
                        not isinstance(start, int)
                        or not isinstance(end, int)
                        or not isinstance(label, str)
                        or start >= end
                        or start < 0
                        or end > len(text)
                    ):
                        logging.warning(
                            f"Skipping invalid entity bounds in '{text}': {entity}"
                        )
                        continue

                    # Check for overlaps with already validated entities
                    has_overlap = any(
                        start < v_end and end > v_start
                        for v_start, v_end, _ in valid_entities
                    )

                    if has_overlap:
                        logging.warning(
                            f"Skipping overlapping entity in '{text}': {entity}"
                        )
                        continue

                    # Validate that the span doesn't contain spaces (matching tagger validation)
                    span_text = text[start:end]
                    if (
                        not span_text
                        or span_text != span_text.strip()
                        or " " in span_text
                    ):
                        logging.warning(
                            f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
                        )
                        continue

                    valid_entities.append((start, end, label))

                if not valid_entities:
                    logging.warning(
                        f"Skipping training example with no valid entities: '{text}'"
                    )
                    skipped_count += 1
                    continue

                # Sort entities by start position
                valid_entities.sort(key=lambda x: x[0])
                valid_data.append((text.strip(), {"entities": valid_entities}))

            except Exception as e:
                logging.error(f"Error processing training example at index {i}: {e}")
                skipped_count += 1
                continue

        logging.info(
            f"Loaded {len(valid_data)} valid training examples, skipped {skipped_count} invalid ones"
        )

        if not valid_data:
            raise ValueError("No valid training examples found in the data")

        return valid_data

    def train(
        self,
        data: List[Tuple[str, Dict]],
        epochs: int = 1,
        batch_size: int = 10_000,
        dropout_rate: float = 0.3,
    ) -> None:
        """Train the NER model"""
        logging.info(f"Starting NER training with {len(data)} examples")
        logging.info(
            f"Training parameters: epochs={epochs}, batch_size={batch_size}, dropout={dropout_rate}"
        )

        if self.nlp is None:
            raise ValueError("Model not initialized. Call create_blank_model() first.")

        # Initialize the model
        self.nlp.initialize()
        optimizer = self.nlp.resume_training()
        losses_history = []

        for epoch in range(epochs):
            losses = {}
            examples = []

            for text, annotations in tqdm(data, desc="Create training examples"):
                doc = self.nlp.make_doc(text)
                examples.append(Example.from_dict(doc, annotations))

            # Shuffle examples each epoch (important!)
            random.shuffle(examples)

            # Train in batches
            batches = minibatch(examples, size=batch_size)
            for batch in batches:
                batch_losses = {}
                self.nlp.update(
                    batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer
                )
                logging.info(
                    f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
                )

                # Accumulate into the total losses dict
                for k, v in batch_losses.items():
                    losses[k] = losses.get(k, 0.0) + v

            del batches  # free memory
            losses_history.append(losses.get("ner", 0))
            logging.info(
                f"Epoch {epoch + 1}/{epochs}, Total Loss: {losses.get('ner', 0.0):.4f}"
            )

        # Store training statistics
        self.training_stats = {
            "epochs": epochs,
            "final_loss": losses_history[-1] if losses_history else 0,
            "training_examples": len(data),
            "loss_history": losses_history,
            "batch_size": batch_size,
            "dropout_rate": dropout_rate,
        }

        logging.info(
            f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}"
        )

    def evaluate(self, test_data: List[Tuple[str, Dict]]) -> Dict[str, Any]:
        """Evaluate the trained model on test data"""
        if self.nlp is None:
            raise ValueError("Model not trained. Call train() first.")

        logging.info(f"Evaluating model on {len(test_data)} test examples")

        total_examples = len(test_data)
        correct_entities = 0
        predicted_entities = 0
        actual_entities = 0

        entity_stats = {
            "NATIVE": {"tp": 0, "fp": 0, "fn": 0},
            "SURNAME": {"tp": 0, "fp": 0, "fn": 0},
        }

        for text, annotations in test_data:
            # Get actual entities
            actual_ents = set()
            for start, end, label in annotations.get("entities", []):
                actual_ents.add((start, end, label))
                actual_entities += 1

            # Get predicted entities
            doc = self.nlp(text)
            predicted_ents = set()
            for ent in doc.ents:
                predicted_ents.add((ent.start_char, ent.end_char, ent.label_))
                predicted_entities += 1

            # Calculate matches
            matches = actual_ents.intersection(predicted_ents)
            correct_entities += len(matches)

            # Update per-label statistics
            for start, end, label in actual_ents:
                if (start, end, label) in predicted_ents:
                    entity_stats[label]["tp"] += 1
                else:
                    entity_stats[label]["fn"] += 1

            for start, end, label in predicted_ents:
                if (start, end, label) not in actual_ents:
                    entity_stats[label]["fp"] += 1

        # Calculate overall metrics
        precision = (
            correct_entities / predicted_entities if predicted_entities > 0 else 0
        )
        recall = correct_entities / actual_entities if actual_entities > 0 else 0
        f1_score = (
            2 * (precision * recall) / (precision + recall)
            if (precision + recall) > 0
            else 0
        )

        # Calculate per-label metrics
        label_metrics = {}
        for label, stats in entity_stats.items():
            tp, fp, fn = stats["tp"], stats["fp"], stats["fn"]
            label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            label_f1 = (
                (
                    2
                    * (label_precision * label_recall)
                    / (label_precision + label_recall)
                )
                if (label_precision + label_recall) > 0
                else 0
            )

            label_metrics[label] = {
                "precision": label_precision,
                "recall": label_recall,
                "f1_score": label_f1,
                "support": tp + fn,
            }

        self.evaluation_stats = {
            "overall": {
                "precision": precision,
                "recall": recall,
                "f1_score": f1_score,
                "total_examples": total_examples,
                "correct_entities": correct_entities,
                "predicted_entities": predicted_entities,
                "actual_entities": actual_entities,
            },
            "by_label": label_metrics,
        }

        return self.evaluation_stats

    def save(self, model_name: str = "drc_ner_model") -> str:
        """Save the trained model"""
        if self.nlp is None:
            raise ValueError("No model to save. Train a model first.")

        # Create model directory
        model_dir = self.config.paths.models_dir / model_name
        model_dir.mkdir(parents=True, exist_ok=True)

        # Save the model
        self.nlp.to_disk(model_dir)
        self.model_path = str(model_dir)

        # Save training and evaluation statistics
        training_stats_path = model_dir / "training_stats.json"
        with open(training_stats_path, "w", encoding="utf-8") as f:
            json.dump(self.training_stats, f, indent=2)

        evaluation_stats_path = model_dir / "evaluation_stats.json"
        with open(evaluation_stats_path, "w", encoding="utf-8") as f:
            json.dump(self.evaluation_stats, f, indent=2)

        logging.info(f"NER model saved to {model_dir}")
        return self.model_path

    def load(self, model_path: str) -> None:
        """Load a trained model"""
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model not found at {model_path}")

        logging.info(f"Loading model from {model_path}")
        self.nlp = spacy.load(model_path)
        self.ner = self.nlp.get_pipe("ner")
        self.model_path = model_path

        # Load training statistics if available
        training_stats_path = Path(model_path) / "training_stats.json"
        if training_stats_path.exists():
            with open(training_stats_path, "r", encoding="utf-8") as f:
                self.training_stats = json.load(f)

        evaluation_stats_path = Path(model_path) / "evaluation_stats.json"
        if evaluation_stats_path.exists():
            with open(evaluation_stats_path, "r", encoding="utf-8") as f:
                self.evaluation_stats = json.load(f)

        logging.info("NER model loaded successfully")

    def predict(self, text: str) -> Dict[str, Any]:
        """Make predictions on a single text"""
        if self.nlp is None:
            raise ValueError("No model loaded. Load or train a model first.")

        doc = self.nlp(text)
        entities = []

        for ent in doc.ents:
            entities.append(
                {
                    "text": ent.text,
                    "label": ent.label_,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "confidence": getattr(
                        ent, "score", None
                    ),  # None unless a component attaches confidence scores
                }
            )

        return {"text": text, "entities": entities}
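A worked micro-example of the overall metrics (invented counts): with 8 predicted entities, 10 actual entities, and 6 exact matches, precision = 6/8 = 0.75, recall = 6/10 = 0.60, and F1 = 2·(0.75·0.60)/(0.75+0.60) ≈ 0.667. A minimal end-to-end sketch, assuming the ner_data JSON already exists at the configured path and cfg is a loaded PipelineConfig:

# Train, evaluate, and persist the model (paths come from the pipeline config).
model = NameModel(cfg)
model.create_blank_model("fr")
data = NameModel.load_data(
    str(cfg.paths.get_data_path(cfg.data.output_files["ner_data"]))
)
model.train(data, epochs=3, batch_size=1_000)
print(model.evaluate(data)["overall"])  # evaluating on held-out data is preferable
model.save("drc_ner_model")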
@@ -0,0 +1,290 @@
import ast
import json
import logging
from typing import Union, Dict, Any, List

import pandas as pd
from spacy.util import filter_spans


class NameTagger:
    def tag_name(
        self, name: str, probable_native: str, probable_surname: str
    ) -> Union[Dict[str, Any], None]:
        """Create a single NER training example using probable_native and probable_surname"""
        if not name or not probable_native or not probable_surname:
            return None

        name = name.strip()
        probable_native = probable_native.strip()
        probable_surname = probable_surname.strip()

        entities = []
        used_spans = []  # Track used character spans to prevent overlaps

        # Helper function to check if a span overlaps with any existing span
        def has_overlap(start, end):
            for used_start, used_end in used_spans:
                if not (end <= used_start or start >= used_end):
                    return True
            return False

        # Find positions of native names in the full name
        native_words = probable_native.split()
        name_lower = name.lower()  # Use lowercase for consistent searching
        processed_native_words = set()

        for native_word in native_words:
            native_word = native_word.strip()
            if len(native_word) < 2:  # Skip very short words
                continue

            native_word_lower = native_word.lower()

            # Skip if we've already processed this exact word
            if native_word_lower in processed_native_words:
                continue
            processed_native_words.add(native_word_lower)

            # Find the first occurrence of this native word that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(
                    native_word_lower, start_pos
                )  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position - make sure we only include the word itself
                end_pos = pos + len(native_word_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original-case text
                if extracted_text.lower() != native_word_lower:
                    start_pos = pos + 1
                    continue

                # Check that this is a word-boundary match and doesn't overlap
                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "NATIVE"))
                    used_spans.append((pos, end_pos))
                    break  # Only take the first non-overlapping occurrence

                start_pos = pos + 1

        # Find the position of the surname in the full name
        if probable_surname and len(probable_surname.strip()) >= 2:
            surname_lower = probable_surname.lower()

            # Find the first occurrence that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(
                    surname_lower, start_pos
                )  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position correctly - exact match only
                end_pos = pos + len(surname_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original-case text
                if extracted_text.lower() != surname_lower:
                    start_pos = pos + 1
                    continue

                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "SURNAME"))
                    used_spans.append((pos, end_pos))
                    break

                start_pos = pos + 1

        if not entities:
            logging.warning(
                f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
            )
            return None

        # Sort entities by position and validate
        entities.sort(key=lambda x: x[0])

        # Final validation - ensure no overlaps and valid spans
        validated_entities = []
        for start, end, label in entities:
            # Check bounds
            if not (0 <= start < end <= len(name)):
                logging.warning(
                    f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
                )
                continue

            # Check for overlaps with already validated entities
            if any(
                start < v_end and end > v_start
                for v_start, v_end, _ in validated_entities
            ):
                logging.warning(
                    f"Overlapping span ({start}, {end}, '{label}') in '{name}'"
                )
                continue

            # CRITICAL VALIDATION: the span must contain only the expected word (no spaces)
            span_text = name[start:end]
            if not span_text or span_text != span_text.strip() or " " in span_text:
                logging.warning(
                    f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
                )
                continue

            validated_entities.append((start, end, label))

        if not validated_entities:
            logging.warning(f"No valid entities after validation for: '{name}'")
            return None

        # Convert to the string format that matches the dataset
        entities_str = str(validated_entities)

        return {
            "entities": entities_str,
            "spans": validated_entities,  # Keep the original tuples for internal use
        }

    @classmethod
    def _is_word_boundary_match(cls, text: str, start: int, end: int) -> bool:
        """Check if the match is at word boundaries"""
        # Check the character before the start position
        if start > 0:
            prev_char = text[start - 1]
            if prev_char.isalnum():
                return False

        # Check the character after the end position
        if end < len(text):
            next_char = text[end]
            if next_char.isalnum():
                return False

        return True

    @classmethod
    def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
        """Extract the actual text for each entity type"""
        result = {"NATIVE": [], "SURNAME": []}

        try:
            entities = ast.literal_eval(entities_str)

            for start, end, label in entities:
                if 0 <= start < end <= len(name):
                    span_text = name[start:end]
                    if label in result:
                        result[label].append(span_text)

        except (ValueError, SyntaxError, TypeError):
            pass

        return result

    @classmethod
    def parse(cls, entities_str: str) -> List[tuple]:
        """Parse entity strings from various formats.

        Supported formats:
        - [(start, end, label), ...]
        - [[start, end, label], ...]
        - [{"start": start, "end": end, "label": label}, ...]
        """
        if not entities_str or entities_str in ["[]", "", "nan"]:
            return []
        entities_str = str(entities_str).strip()
        try:
            if entities_str.startswith("[(") and entities_str.endswith(")]"):
                return ast.literal_eval(entities_str)
            elif entities_str.startswith("[[") and entities_str.endswith("]]"):
                return [tuple(e) for e in ast.literal_eval(entities_str)]
            elif entities_str.startswith("[{") and entities_str.endswith("}]"):
                return [
                    (e["start"], e["end"], e["label"]) for e in json.loads(entities_str)
                ]
            else:
                parsed = ast.literal_eval(entities_str)
                return [
                    tuple(e)
                    for e in parsed
                    if isinstance(e, (list, tuple)) and len(e) == 3
                ]
        except (ValueError, SyntaxError, json.JSONDecodeError):
            return []

    def parse_entities(self, series: pd.Series) -> pd.Series:
        """Vectorized parse of entity strings."""
        return series.map(self.parse)

    @classmethod
    def validate(cls, text: str, entities: List[tuple]) -> List[tuple]:
        """Advanced entity validation with overlap removal.

        This does the per-row work behind the vectorized validate_entities wrapper.
        """
        if not entities or not text:
            return []
        text = str(text).strip()
        valid = []

        for ent in entities:
            if not isinstance(ent, (list, tuple)) or len(ent) != 3:
                continue
            start, end, label = ent
            try:
                start, end = int(start), int(end)
            except (ValueError, TypeError):
                continue
            if not isinstance(label, str):
                continue
            if not (0 <= start < end <= len(text)):
                continue
            if not text[start:end].strip():
                continue
            valid.append((start, end, label))

        if not valid:
            return []

        valid.sort(key=lambda x: (x[0], x[1]))

        # Remove overlaps
        filtered, last_end = [], -1
        for s, e, l in valid:
            if s >= last_end:
                filtered.append((s, e, l))
                last_end = e
        return filtered

    def validate_entities(
        self, texts: pd.Series, entities_series: pd.Series
    ) -> pd.Series:
        """Vectorized entity validation."""
        return pd.Series(map(self.validate, texts, entities_series), index=texts.index)

    @classmethod
    def create_docs(cls, nlp, texts: List[str], entities: List[List[tuple]]) -> List:
        """Batch-create spaCy Docs from texts and entities."""
        docs = []
        for text, ents in zip(texts, entities):
            doc = nlp(text)
            spans = []
            for start, end, label in ents:
                span = doc.char_span(
                    start, end, label=label, alignment_mode="contract"
                ) or doc.char_span(start, end, label=label, alignment_mode="strict")
                if span:
                    spans.append(span)
            doc.ents = filter_spans(spans)
            docs.append(doc)
        return docs
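A hedged example of the tagger output (illustrative name, not from the data):

# tag_name finds word-boundary matches and returns dataset-format strings.
tagger = NameTagger()
result = tagger.tag_name("Ilunga Kabongo", "Ilunga", "Kabongo")
# result["entities"] == "[(0, 6, 'NATIVE'), (7, 14, 'SURNAME')]"
# result["spans"]    == [(0, 6, 'NATIVE'), (7, 14, 'SURNAME')]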
@@ -0,0 +1,57 @@
import logging
import time
from typing import Dict, Any

import pandas as pd

from ners.processing.batch.batch_config import BatchConfig
from ners.processing.batch.batch_processor import BatchProcessor
from ners.processing.steps import PipelineStep


class Pipeline:
    """Main pipeline orchestrator"""

    def __init__(self, config: BatchConfig):
        self.config = config
        self.processor = BatchProcessor(config)
        self.steps = []

    def add_step(self, step: PipelineStep):
        """Add a processing step to the pipeline"""
        self.steps.append(step)

    def run(self, input_data: pd.DataFrame) -> pd.DataFrame:
        """Run the complete pipeline"""
        current_data = input_data.copy()

        for step in self.steps:
            logging.info(f"Running pipeline step: {step.name}")
            start_time = time.time()

            current_data = self.processor.process(step, current_data)

            elapsed_time = time.time() - start_time
            logging.info(f"Completed {step.name} in {elapsed_time:.2f} seconds")

            if step.state.failed_batches:
                logging.warning(
                    f"Step {step.name} had {len(step.state.failed_batches)} failed batches"
                )

        return current_data

    def get_progress(self) -> Dict[str, Any]:
        """Get progress information for all steps"""
        progress = {}
        for step in self.steps:
            progress[step.name] = {
                "processed_batches": step.state.processed_batches,
                "total_batches": step.state.total_batches,
                "failed_batches": len(step.state.failed_batches),
                "completion_percentage": (
                    step.state.processed_batches / max(1, step.state.total_batches)
                )
                * 100,
            }
        return progress
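Wiring the orchestrator together might look like the sketch below; cfg, raw_df, and the BatchConfig keyword arguments are assumptions based on the fields used elsewhere in this commit:

# Chain two concrete steps and run them over the input frame.
pipeline = Pipeline(BatchConfig(batch_size=10_000, max_workers=1))
pipeline.add_step(DataCleaningStep(cfg))
pipeline.add_step(DataSelectionStep(cfg))
cleaned = pipeline.run(raw_df)
print(pipeline.get_progress())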
@@ -0,0 +1,129 @@
import json
import logging
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional

import pandas as pd
from pydantic import BaseModel

from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.batch.batch_config import BatchConfig


@dataclass
class PipelineState:
    """Tracks the state of pipeline execution"""

    processed_batches: int = 0
    total_batches: int = 0
    failed_batches: Optional[List[int]] = None
    last_checkpoint: Optional[str] = None

    def __post_init__(self):
        if self.failed_batches is None:
            self.failed_batches = []


class NameAnnotation(BaseModel):
    """Model for name annotation results"""

    identified_name: Optional[str]
    identified_surname: Optional[str]


class PipelineStep(ABC):
    """Abstract base class for pipeline steps"""

    def __init__(
        self,
        name: str,
        pipeline_config: PipelineConfig,
        batch_config: Optional[BatchConfig] = None,
    ):
        self.name = name
        self.pipeline_config = pipeline_config
        self.data_loader = DataLoader(pipeline_config)

        # Use the provided batch_config or create a default from the pipeline config
        if batch_config is None:
            batch_config = BatchConfig(
                batch_size=pipeline_config.processing.batch_size,
                max_workers=pipeline_config.processing.max_workers,
                checkpoint_interval=pipeline_config.processing.checkpoint_interval,
                use_multiprocessing=pipeline_config.processing.use_multiprocessing,
            )
        self.batch_config = batch_config
        self.state = PipelineState()

    @property
    def requires_batch_mutation(self) -> bool:
        """Indicates if this step modifies the batch data"""
        return False

    @abstractmethod
    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process a single batch of data"""
        pass

    def get_checkpoint_path(self, batch_id: int) -> str:
        """Get the checkpoint file path for a batch"""
        checkpoint_dir = self.pipeline_config.paths.checkpoints_dir / self.name
        checkpoint_dir.mkdir(parents=True, exist_ok=True)
        return str(checkpoint_dir / f"batch_{batch_id:06d}.csv")

    def get_state_path(self) -> str:
        """Get the state file path"""
        state_dir = self.pipeline_config.paths.checkpoints_dir / self.name
        state_dir.mkdir(parents=True, exist_ok=True)
        return str(state_dir / "pipeline_state.json")

    def save_state(self):
        """Save pipeline state to disk"""
        state_file = self.get_state_path()
        with open(state_file, "w") as f:
            json.dump(
                {
                    "processed_batches": self.state.processed_batches,
                    "total_batches": self.state.total_batches,
                    "failed_batches": self.state.failed_batches,
                    "last_checkpoint": self.state.last_checkpoint,
                },
                f,
            )

    def load_state(self) -> bool:
        """Load pipeline state from disk. Returns True if state was loaded."""
        state_file = self.get_state_path()
        if os.path.exists(state_file):
            try:
                with open(state_file, "r") as f:
                    state_data = json.load(f)
                self.state.processed_batches = state_data.get("processed_batches", 0)
                self.state.total_batches = state_data.get("total_batches", 0)
                self.state.failed_batches = state_data.get("failed_batches", [])
                self.state.last_checkpoint = state_data.get("last_checkpoint")
                return True
            except Exception as e:
                logging.warning(f"Failed to load state: {e}")
        return False

    def batch_exists(self, batch_id: int) -> bool:
        """Check if a batch has already been processed (idempotency)"""
        checkpoint_path = self.get_checkpoint_path(batch_id)
        return os.path.exists(checkpoint_path)

    def save_batch(self, batch: pd.DataFrame, batch_id: int):
        """Save processed batch to checkpoint"""
        checkpoint_path = self.get_checkpoint_path(batch_id)
        self.data_loader.save_csv(batch, checkpoint_path)
        logging.info(f"Saved batch {batch_id} to {checkpoint_path}")

    def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
        """Load processed batch from checkpoint"""
        checkpoint_path = self.get_checkpoint_path(batch_id)
        if os.path.exists(checkpoint_path):
            return self.data_loader.load_csv_complete(checkpoint_path)
        return None
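Checkpoint files are zero-padded so lexical sort matches batch order; for a step named "data_cleaning", batch 3 lands under the configured checkpoints_dir (root path depends on the project config):

# e.g. <checkpoints_dir>/data_cleaning/batch_000003.csv
step.save_batch(batch_df, 3)
assert step.batch_exists(3)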
@@ -0,0 +1,31 @@
import logging

import pandas as pd

from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.text_cleaner import TextCleaner
from ners.processing.steps import PipelineStep


class DataCleaningStep(PipelineStep):
    """Configuration-driven data cleaning step"""

    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("data_cleaning", pipeline_config)
        self.text_cleaner = TextCleaner()
        self.required_columns = ["name", "sex", "region"]

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process a single batch for data cleaning"""
        logging.info(f"Cleaning batch {batch_id} with {len(batch)} rows")

        # Drop rows with essential missing values
        batch = batch.dropna(subset=self.required_columns)

        # Apply text cleaning
        batch = self.text_cleaner.clean_dataframe_text_columns(batch)

        # Remove duplicates
        batch = batch.drop_duplicates(subset=self.required_columns)

        return batch
@@ -0,0 +1,60 @@
import logging

import pandas as pd

from ners.core.config.pipeline_config import PipelineConfig
from ners.processing.steps import PipelineStep


class DataSelectionStep(PipelineStep):
    """Configuration-driven data selection step to keep only specified columns"""

    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("data_selection", pipeline_config)
        self.selected_columns = pipeline_config.data.selected_columns

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process a single batch for data selection"""
        logging.info(f"Selecting columns for batch {batch_id} with {len(batch)} rows")

        # Remove rows where region == "global" only for specific years
        if "region" in batch.columns and "year" in batch.columns:
            target_years = {2015, 2021, 2022}
            mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(
                target_years
            )
            removed = int(mask_remove.sum())
            if removed:
                batch = batch[~mask_remove]
                logging.info(
                    f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}"
                )

        # Check which columns exist in the batch
        available_columns = [
            col for col in self.selected_columns if col in batch.columns
        ]
        missing_columns = [
            col for col in self.selected_columns if col not in batch.columns
        ]

        if missing_columns:
            logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}")

        if not available_columns:
            logging.error(f"No required columns found in batch {batch_id}")
            return pd.DataFrame()  # Return empty DataFrame if no required columns exist

        # Select only the available required columns
        selected_batch = batch[available_columns].copy()

        logging.info(
            f"Selected {len(available_columns)} columns for batch {batch_id}: {available_columns}"
        )

        return selected_batch

    @property
    def requires_batch_mutation(self) -> bool:
        """This step modifies the batch data by selecting columns"""
        return True
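A standalone sketch (not part of the commit) of the year-restricted "global" filter above on a toy frame, showing that a "global" row outside the target years survives.

import pandas as pd

batch = pd.DataFrame(
    {
        "region": ["Global", "katanga", "global"],
        "year": [2015, 2015, 2019],
        "name": ["a", "b", "c"],
    }
)
target_years = {2015, 2021, 2022}
# Case-insensitive match on "global", restricted to the target years.
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(target_years)
print(batch[~mask_remove]["name"].tolist())  # ['b', 'c']: the 2019 "global" row is kept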
@@ -0,0 +1,69 @@
import numpy as np
import pandas as pd

from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.region_mapper import RegionMapper
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.steps import PipelineStep
from ners.processing.steps.feature_extraction_step import Gender


class DataSplittingStep(PipelineStep):
    """Configuration-driven data splitting step"""

    def __init__(self, pipeline_config: PipelineConfig):
        batch_config = BatchConfig(
            batch_size=pipeline_config.processing.batch_size,
            max_workers=1,  # No need for parallelism in splitting
            checkpoint_interval=pipeline_config.processing.checkpoint_interval,
            use_multiprocessing=False,
        )
        super().__init__("data_splitting", pipeline_config, batch_config)
        self.eval_indices = None

    def determine_eval_indices(self, total_size: int) -> set:
        """Determine evaluation indices consistently across batches"""
        if self.eval_indices is None:
            np.random.seed(self.pipeline_config.data.random_seed)
            eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
            self.eval_indices = set(
                np.random.choice(total_size, size=eval_size, replace=False)
            )
        return self.eval_indices

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process batch for data splitting - no modification needed"""
        return batch

    def split(self, df: pd.DataFrame) -> None:
        """Save the split datasets based on configuration"""
        output_files = self.pipeline_config.data.output_files
        data_dir = self.pipeline_config.paths.data_dir

        if self.pipeline_config.data.split_evaluation:
            eval_indices = self.determine_eval_indices(len(df))
            eval_mask = df.index.isin(eval_indices)

            df_evaluation = df[eval_mask]
            df_featured = df[~eval_mask]

            self.data_loader.save_csv(
                df_evaluation, data_dir / output_files["evaluation"]
            )
            self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
        else:
            self.data_loader.save_csv(df, data_dir / output_files["featured"])

        if self.pipeline_config.data.split_by_province:
            for province in RegionMapper.get_provinces():
                df_region = df[df.province == province]
                self.data_loader.save_csv(
                    df_region, data_dir / "provinces" / f"{province}.csv"
                )

        if self.pipeline_config.data.split_by_gender:
            df_males = df[df.sex == Gender.MALE.value]
            df_females = df[df.sex == Gender.FEMALE.value]

            self.data_loader.save_csv(df_males, data_dir / output_files["males"])
            self.data_loader.save_csv(df_females, data_dir / output_files["females"])
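A small sketch (not part of the commit) of why the seeded selection in determine_eval_indices is reproducible: the same seed and total size always yield the same evaluation split.

import numpy as np

def eval_indices(total_size: int, fraction: float, seed: int) -> set:
    # Same seed + same total size => identical evaluation indices on every run.
    np.random.seed(seed)
    eval_size = int(total_size * fraction)
    return set(np.random.choice(total_size, size=eval_size, replace=False))

assert eval_indices(100, 0.2, 42) == eval_indices(100, 0.2, 42)
print(len(eval_indices(100, 0.2, 42)))  # 20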
@@ -0,0 +1,196 @@
import gc
import logging
from enum import Enum
from typing import Dict, Any

import pandas as pd

from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.region_mapper import RegionMapper
from ners.processing.ner.name_tagger import NameTagger
from ners.processing.steps import PipelineStep


class Gender(Enum):
    MALE = "m"
    FEMALE = "f"


class NameCategory(Enum):
    SIMPLE = "simple"
    COMPOSE = "compose"


class FeatureExtractionStep(PipelineStep):
    """Configuration-driven feature extraction step"""

    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("feature_extraction", pipeline_config)
        self.region_mapper = RegionMapper()
        self.name_tagger = NameTagger()

    @classmethod
    def requires_batch_mutation(cls) -> bool:
        """This step creates new columns, so mutation is required"""
        return True

    @classmethod
    def validate_gender(cls, gender: str) -> Gender:
        """Validate and normalize gender value"""
        gender_lower = str(gender).lower().strip()
        if gender_lower in ["m", "male", "homme", "masculin"]:
            return Gender.MALE
        elif gender_lower in ["f", "female", "femme", "féminin"]:
            return Gender.FEMALE
        else:
            raise ValueError(f"Unknown gender: {gender}")

    @classmethod
    def get_name_category(cls, word_count: int) -> NameCategory:
        """Determine name category based on word count"""
        return NameCategory.SIMPLE if word_count == 3 else NameCategory.COMPOSE

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Extract features from names in batch"""
        logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")

        result = batch.copy()
        numeric_features = self._compute_numeric_features(result["name"])
        result = result.assign(**numeric_features)

        # Initialize features columns with optimal dtypes
        features_columns = self._initialize_features_columns(len(result))
        result = result.assign(**features_columns)

        self._assign_probable_names(result)
        self._process_simple_names(result)
        result["identified_category"] = self._assign_identified_category(
            result["words"]
        )

        if "year" in result.columns:
            result["year"] = pd.to_numeric(result["year"], errors="coerce").astype(
                "Int16"
            )

        if "region" in result.columns:
            result["province"] = self.region_mapper.map(result["region"]).str.lower()
            result["province"] = result["province"].astype("category")

        if "sex" in result.columns:
            result["sex"] = self._normalize_gender(result["sex"])

        # Apply final dtype optimizations
        result = self._optimize_dtypes(result)

        # Cleanup
        del numeric_features, features_columns
        if batch_id % 10 == 0:  # Periodic cleanup
            gc.collect()

        return result

    @classmethod
    def _compute_numeric_features(cls, series: pd.Series) -> Dict[str, pd.Series]:
        """Calculate basic features in vectorized manner"""
        return {
            "words": (series.str.count(" ") + 1).astype("Int8"),
            "length": series.str.len().astype("Int16"),
        }

    @classmethod
    def _initialize_features_columns(cls, size: int) -> Dict[str, Any]:
        """Initialize new columns with optimal dtypes"""
        return {
            "probable_native": pd.Series([None] * size, dtype="string"),
            "probable_surname": pd.Series([None] * size, dtype="string"),
            "identified_name": pd.Series([None] * size, dtype="string"),
            "identified_surname": pd.Series([None] * size, dtype="string"),
            "ner_entities": pd.Series([None] * size, dtype="string"),
            "ner_tagged": pd.Series([0] * size, dtype="Int8"),
            "annotated": pd.Series([0] * size, dtype="Int8"),
        }

    @classmethod
    def _assign_probable_names(cls, df: pd.DataFrame) -> None:
        """Assign probable native and surname names efficiently"""

        name_splits = df["name"].str.split()
        mask = name_splits.str.len() >= 2

        df.loc[mask, "probable_native"] = name_splits[mask].apply(
            lambda x: " ".join(x[:-1]) if isinstance(x, list) else None
        )
        df.loc[mask, "probable_surname"] = name_splits[mask].apply(
            lambda x: x[-1] if isinstance(x, list) else None
        )

    def _assign_identified_category(self, series: pd.Series) -> pd.Series:
        """Assign identified category based on word count"""
        return series.map(lambda x: self.get_name_category(x).value).astype("category")

    def _process_simple_names(self, df: pd.DataFrame) -> None:
        """Process 3-word names efficiently with vectorized operations"""
        mask = df["words"] == 3

        if not mask.any():
            return

        df.loc[mask, "identified_name"] = df.loc[mask, "probable_native"]
        df.loc[mask, "identified_surname"] = df.loc[mask, "probable_surname"]
        df.loc[mask, "annotated"] = 1

        # NER tagging for 3-word names
        three_word_rows = df[mask]
        for idx, row in three_word_rows.iterrows():
            try:
                entity = self.name_tagger.tag_name(
                    row["name"], row["identified_name"], row["identified_surname"]
                )

                if entity:
                    df.at[idx, "ner_entities"] = str(entity["entities"])
                    df.at[idx, "ner_tagged"] = 1
            except Exception as e:
                logging.warning(f"NER tagging failed for row {idx}: {e}")

    @classmethod
    def _normalize_gender(cls, series: pd.Series) -> pd.Series:
        gender_mapping = {
            "m": "m",
            "male": "m",
            "homme": "m",
            "masculin": "m",
            "f": "f",
            "female": "f",
            "femme": "f",
            "féminin": "f",
        }

        # Apply mapping with error handling
        normalized = series.astype(str).str.lower().str.strip().map(gender_mapping)
        return normalized.astype("category")

    @classmethod
    def _optimize_dtypes(cls, df: pd.DataFrame) -> pd.DataFrame:
        categories = ["province", "identified_category", "sex"]

        for col in categories:
            if col in df.columns and df[col].dtype != "category":
                df[col] = df[col].astype("category")

        # Ensure string columns are proper string dtype
        string_cols = [
            "name",
            "probable_native",
            "probable_surname",
            "identified_name",
            "identified_surname",
            "ner_entities",
        ]

        for col in string_cols:
            if col in df.columns and df[col].dtype == "object":
                df[col] = df[col].astype("string")

        return df
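A toy sketch (not part of the commit) of the vectorized word/length features and the three-word "simple" versus "compose" rule used above; the names are invented.

import pandas as pd

names = pd.Series(["ilunga kabongo jean", "mbuyi tshala"])
words = (names.str.count(" ") + 1).astype("Int8")  # 3, 2
length = names.str.len().astype("Int16")           # 19, 12
# Exactly three words => "simple"; anything else => "compose".
category = words.map(lambda w: "simple" if w == 3 else "compose")
print(list(zip(words, length, category)))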
@@ -0,0 +1,169 @@
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict

import ollama
import pandas as pd

from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.prompt_manager import PromptManager
from ners.core.utils.rate_limiter import RateLimitConfig, RateLimiter
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.steps import PipelineStep, NameAnnotation


class LLMAnnotationStep(PipelineStep):
    """Configuration-driven LLM annotation step"""

    def __init__(self, pipeline_config: PipelineConfig):
        # Create custom batch config for LLM processing
        self.llm_config = pipeline_config.annotation.llm
        batch_config = BatchConfig(
            batch_size=pipeline_config.processing.batch_size,
            max_workers=min(
                self.llm_config.max_concurrent_requests,
                pipeline_config.processing.max_workers,
            ),
            checkpoint_interval=pipeline_config.processing.checkpoint_interval,
            use_multiprocessing=pipeline_config.processing.use_multiprocessing,
        )
        super().__init__("llm_annotation", pipeline_config, batch_config)

        self.prompt = PromptManager(pipeline_config).load_prompt()
        self.rate_limiter = (
            self._create_rate_limiter()
            if self.llm_config.enable_rate_limiting
            else None
        )

        # Statistics
        self.successful_requests = 0
        self.failed_requests = 0
        self.total_retry_attempts = 0

        # Setup logging
        logging.getLogger("httpx").setLevel(logging.WARNING)

    def _create_rate_limiter(self):
        """Create rate limiter based on configuration"""
        rate_config = RateLimitConfig(
            requests_per_minute=self.llm_config.requests_per_minute,
            requests_per_second=self.llm_config.requests_per_second,
        )
        return RateLimiter(rate_config)

    def analyze_name(self, client: ollama.Client, name: str) -> Dict:
        """Analyze a name with retry logic and rate limiting"""
        for attempt in range(self.llm_config.retry_attempts):
            try:
                # Apply rate limiting if enabled
                if self.rate_limiter:
                    self.rate_limiter.wait_if_needed()

                start_time = time.time()
                response = client.chat(
                    model=self.llm_config.model_name,
                    messages=[
                        {"role": "system", "content": self.prompt},
                        {"role": "user", "content": name},
                    ],
                    format=NameAnnotation.model_json_schema(),
                )
                elapsed_time = time.time() - start_time

                if elapsed_time > self.llm_config.timeout_seconds:
                    raise TimeoutError(
                        f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout"
                    )

                annotation = NameAnnotation.model_validate_json(
                    response.message.content
                )
                result = {
                    **annotation.model_dump(),
                    "annotated": 1,
                    "processing_time": elapsed_time,
                    "attempts": attempt + 1,
                }

                self.successful_requests += 1
                if attempt > 0:
                    self.total_retry_attempts += attempt

                return result

            except Exception as e:  # covers pydantic ValidationError and TimeoutError
                logging.warning(
                    f"Error analyzing '{name}' (attempt {attempt + 1}/{self.llm_config.retry_attempts}): {e}"
                )

                # Exponential backoff with jitter
                if attempt < self.llm_config.retry_attempts - 1:
                    wait_time = (2**attempt) + (time.time() % 1)
                    time.sleep(min(wait_time, 10))

        self.failed_requests += 1
        return {
            "identified_name": None,
            "identified_surname": None,
            "annotated": 0,
            "processing_time": 0,
            "attempts": self.llm_config.retry_attempts,
            "failed": True,
        }

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process batch with LLM annotation"""
        unannotated_mask = batch.get("annotated", 0) == 0
        unannotated_entries = batch[unannotated_mask]

        if unannotated_entries.empty:
            logging.info(f"Batch {batch_id}: No entries to annotate")
            return batch

        logging.info(
            f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM"
        )

        batch = batch.copy()
        client = ollama.Client()

        # Process with controlled concurrency
        max_workers = self.llm_config.max_concurrent_requests

        if len(unannotated_entries) == 1 or max_workers == 1:
            # Sequential processing
            for idx, row in unannotated_entries.iterrows():
                result = self.analyze_name(client, row["name"])
                for field, value in result.items():
                    if field not in ["failed"]:
                        batch.loc[idx, field] = value
        else:
            # Concurrent processing
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                future_to_idx = {}

                for idx, row in unannotated_entries.iterrows():
                    future = executor.submit(self.analyze_name, client, row["name"])
                    future_to_idx[future] = idx

                for future in as_completed(future_to_idx):
                    idx = future_to_idx[future]
                    try:
                        result = future.result()
                        for field, value in result.items():
                            if field not in ["failed"]:
                                batch.loc[idx, field] = value
                    except Exception as e:
                        logging.error(f"Failed to process row {idx}: {e}")
                        batch.loc[idx, "annotated"] = 0

        # Ensure proper data types
        batch["annotated"] = (
            pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
        )

        return batch
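A standalone sketch (not part of the commit) of the backoff schedule used in analyze_name: the delay doubles per attempt, gains sub-second jitter from the clock, and is capped at 10 seconds.

import time

def backoff_delay(attempt: int, cap: float = 10.0) -> float:
    # 1s, 2s, 4s, ... doubling per attempt, plus sub-second jitter, capped.
    return min((2 ** attempt) + (time.time() % 1), cap)

for attempt in range(4):
    print(f"attempt {attempt}: sleep ~{backoff_delay(attempt):.2f}s")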
@@ -0,0 +1,172 @@
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict

import pandas as pd

from ners.core.config.pipeline_config import PipelineConfig
from ners.processing.ner.name_model import NameModel
from ners.processing.steps import PipelineStep, NameAnnotation


class NERAnnotationStep(PipelineStep):
    """NER annotation step using trained spaCy model for entity recognition"""

    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("ner_annotation", pipeline_config)

        self.model_name = "drc_ner_model"
        self.model_path = pipeline_config.paths.models_dir / "drc_ner_model"
        self.name_model = NameModel(pipeline_config)
        self.ner_config = pipeline_config.annotation.ner

        # Statistics
        self.successful_requests = 0
        self.failed_requests = 0
        self.total_retry_attempts = 0

        # Load the model
        self._load_ner_model()

    def _load_ner_model(self) -> None:
        """Load the trained NER model"""
        try:
            if self.model_path.exists():
                logging.info(f"Loading NER model from {self.model_path}")
                self.name_model.load(str(self.model_path))
                logging.info("NER model loaded successfully")
            else:
                logging.warning(f"NER model not found at {self.model_path}")
                logging.warning(
                    "NER annotation will be skipped. Train the model first."
                )
                self.name_model.nlp = None
        except Exception as e:
            logging.error(f"Failed to load NER model: {e}")
            self.name_model.nlp = None

    def analyze_name(self, name: str) -> Dict:
        """Analyze a name with retry logic"""
        if self.name_model.nlp is None:
            return {
                "identified_name": None,
                "identified_surname": None,
                "annotated": 0,
                "processing_time": 0,
                "attempts": 0,
                "failed": True,
            }

        for attempt in range(self.ner_config.retry_attempts):
            try:
                start_time = time.time()

                # Get NER predictions
                prediction = self.name_model.predict(name.lower())
                entities = prediction.get("entities", [])

                elapsed_time = time.time() - start_time

                # Extract native names and surnames from entities
                native_parts = []
                surname_parts = []

                for entity in entities:
                    if entity["label"] == "NATIVE":
                        native_parts.append(entity["text"])
                    elif entity["label"] == "SURNAME":
                        surname_parts.append(entity["text"])

                # Create annotation result in same format as LLM step
                annotation = NameAnnotation(
                    identified_name=" ".join(native_parts) if native_parts else None,
                    identified_surname=" ".join(surname_parts)
                    if surname_parts
                    else None,
                )

                result = {
                    **annotation.model_dump(),
                    "annotated": 1,
                    "processing_time": elapsed_time,
                    "attempts": attempt + 1,
                }

                self.successful_requests += 1
                if attempt > 0:
                    self.total_retry_attempts += attempt

                return result

            except Exception as e:
                logging.warning(
                    f"Error analyzing '{name}' with NER (attempt {attempt + 1}/{self.ner_config.retry_attempts}): {e}"
                )

                # Small delay between retries
                if attempt < self.ner_config.retry_attempts - 1:
                    time.sleep(0.1)

        self.failed_requests += 1
        return {
            "identified_name": None,
            "identified_surname": None,
            "annotated": 0,
            "processing_time": 0,
            "attempts": self.ner_config.retry_attempts,
            "failed": True,
        }

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process batch with NER annotation using same logic as LLM step"""
        unannotated_mask = batch.get("annotated", 0) == 0
        unannotated_entries = batch[unannotated_mask]

        if unannotated_entries.empty:
            logging.info(f"Batch {batch_id}: No entries to annotate")
            return batch

        logging.info(
            f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER"
        )

        batch = batch.copy()

        # Process with controlled concurrency
        max_workers = self.batch_config.max_workers

        if len(unannotated_entries) == 1 or max_workers == 1:
            # Sequential processing
            for idx, row in unannotated_entries.iterrows():
                result = self.analyze_name(row["name"])
                for field, value in result.items():
                    if field not in ["failed"]:
                        batch.loc[idx, field] = value
        else:
            # Concurrent processing
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                future_to_idx = {}

                for idx, row in unannotated_entries.iterrows():
                    future = executor.submit(self.analyze_name, row["name"])
                    future_to_idx[future] = idx

                for future in as_completed(future_to_idx):
                    idx = future_to_idx[future]
                    try:
                        result = future.result()
                        for field, value in result.items():
                            if field not in ["failed"]:
                                batch.loc[idx, field] = value
                    except Exception as e:
                        logging.error(f"Failed to process row {idx}: {e}")
                        batch.loc[idx, "annotated"] = 0

        # Ensure proper data types
        batch["annotated"] = (
            pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
        )

        return batch
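A self-contained sketch (not part of the commit) of how the NATIVE/SURNAME entity parts above are assembled into the annotation dict; the entity list is a made-up stand-in for the spaCy output.

# Toy spaCy-style entity list; labels follow the NATIVE/SURNAME scheme above.
entities = [
    {"label": "NATIVE", "text": "ilunga"},
    {"label": "NATIVE", "text": "kabongo"},
    {"label": "SURNAME", "text": "jean"},
]
native_parts = [e["text"] for e in entities if e["label"] == "NATIVE"]
surname_parts = [e["text"] for e in entities if e["label"] == "SURNAME"]
annotation = {
    "identified_name": " ".join(native_parts) if native_parts else None,
    "identified_surname": " ".join(surname_parts) if surname_parts else None,
}
print(annotation)  # {'identified_name': 'ilunga kabongo', 'identified_surname': 'jean'}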
@@ -0,0 +1,261 @@
import logging
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, List

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from ners.research.experiment import ExperimentConfig


class BaseModel(ABC):
    """Abstract base class for all models"""

    def __init__(self, config: ExperimentConfig):
        self.config = config
        self.model = None
        self.feature_extractor = None
        self.label_encoder = None
        self.tokenizer = None  # For neural models
        self.is_fitted = False
        self.training_history = {}  # Store training history for learning curves
        self.learning_curve_data = {}  # Store learning curve experiment data

    @property
    @abstractmethod
    def architecture(self) -> str:
        """Return the architecture type: 'neural_network', 'traditional', or 'ensemble'"""
        pass

    @abstractmethod
    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Prepare features for training/prediction"""
        pass

    @abstractmethod
    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the model - implemented differently for each architecture"""
        pass

    @abstractmethod
    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> Dict[str, float]:
        """Perform cross-validation and return average scores"""
        pass

    @abstractmethod
    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        pass

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Make predictions"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        predictions = self.model.predict(X_prepared)

        # Handle different prediction formats
        if hasattr(predictions, "shape") and len(predictions.shape) > 1:
            # Neural network outputs (probabilities)
            predictions = predictions.argmax(axis=1)

        return self.label_encoder.inverse_transform(predictions)

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """Get prediction probabilities if supported"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        if hasattr(self.model, "predict_proba"):
            return self.model.predict_proba(X_prepared)
        elif hasattr(self.model, "predict"):
            # For neural networks that return probabilities directly
            probabilities = self.model.predict(X_prepared)
            if len(probabilities.shape) == 2 and probabilities.shape[1] > 1:
                return probabilities

        raise NotImplementedError("Model does not support probability predictions")

    def get_feature_importance(self) -> Optional[Dict[str, float]]:
        """Get feature importance if supported by the model"""

        if hasattr(self.model, "feature_importances_"):
            # For tree-based models
            importances = self.model.feature_importances_
            feature_names = self._get_feature_names()
            return dict(zip(feature_names, importances))

        elif hasattr(self.model, "coef_"):
            # For linear models
            coefficients = np.abs(self.model.coef_[0])
            feature_names = self._get_feature_names()
            return dict(zip(feature_names, coefficients))

        elif (
            hasattr(self.model, "named_steps")
            and "classifier" in self.model.named_steps
        ):
            # For sklearn pipelines (like LogisticRegression with vectorizer)
            classifier = self.model.named_steps["classifier"]
            if hasattr(classifier, "coef_"):
                coefficients = np.abs(classifier.coef_[0])
                if hasattr(
                    self.model.named_steps["vectorizer"], "get_feature_names_out"
                ):
                    feature_names = self.model.named_steps[
                        "vectorizer"
                    ].get_feature_names_out()
                    # Take top features to avoid too many n-grams
                    top_indices = np.argsort(coefficients)[-20:]
                    return dict(
                        zip(feature_names[top_indices], coefficients[top_indices])
                    )

        return None

    def _get_feature_names(self) -> List[str]:
        """Get feature names (override in subclasses if needed)"""
        if hasattr(self.model, "feature_names_in_"):
            return list(self.model.feature_names_in_)
        return [f"feature_{i}" for i in range(100)]  # Default fallback

    def save(self, path: str):
        """Save the complete model with training history"""

        model_data = {
            "model": self.model,
            "feature_extractor": self.feature_extractor,
            "label_encoder": self.label_encoder,
            "tokenizer": self.tokenizer,
            "config": self.config.to_dict(),
            "is_fitted": self.is_fitted,
            "training_history": self.training_history,
            "learning_curve_data": self.learning_curve_data,
        }
        joblib.dump(model_data, path)

    @classmethod
    def load(cls, path: str) -> "BaseModel":
        """Load a saved model with training history"""
        model_data = joblib.load(path)

        # Recreate the model instance
        from ners.research.experiment import ExperimentConfig

        config = ExperimentConfig.from_dict(model_data["config"])
        instance = cls(config)

        # Restore state
        instance.model = model_data["model"]
        instance.feature_extractor = model_data["feature_extractor"]
        instance.label_encoder = model_data["label_encoder"]
        instance.tokenizer = model_data.get("tokenizer")
        instance.is_fitted = model_data["is_fitted"]
        instance.training_history = model_data.get("training_history", {})
        instance.learning_curve_data = model_data.get("learning_curve_data", {})

        return instance

    def plot_learning_curve(self, save_path: Optional[str] = None) -> str:
        """Plot and save learning curve"""

        if not self.learning_curve_data:
            logging.warning("No learning curve data available")
            return ""

        plt.figure(figsize=(10, 6))

        data = self.learning_curve_data
        train_sizes = data["train_sizes"]
        train_scores = data["train_scores"]
        val_scores = data["val_scores"]
        train_std = data.get("train_scores_std", [0] * len(train_sizes))
        val_std = data.get("val_scores_std", [0] * len(train_sizes))

        # Plot learning curves
        plt.plot(train_sizes, train_scores, "o-", color="blue", label="Training Score")
        plt.fill_between(
            train_sizes,
            np.array(train_scores) - np.array(train_std),
            np.array(train_scores) + np.array(train_std),
            alpha=0.1,
            color="blue",
        )

        plt.plot(train_sizes, val_scores, "o-", color="red", label="Validation Score")
        plt.fill_between(
            train_sizes,
            np.array(val_scores) - np.array(val_std),
            np.array(val_scores) + np.array(val_std),
            alpha=0.1,
            color="red",
        )

        plt.xlabel("Training Set Size")
        plt.ylabel("Accuracy Score")
        plt.title(f"Learning Curve - {self.__class__.__name__}")
        plt.legend(loc="best")
        plt.grid(True, alpha=0.3)
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close()
            return save_path
        else:
            plt.show()
            return ""

    def plot_training_history(self, save_path: Optional[str] = None) -> str:
        """Plot training history for neural networks"""
        if not self.training_history:
            logging.warning("No training history available")
            return ""

        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # Plot accuracy
        if "accuracy" in self.training_history:
            axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
            if "val_accuracy" in self.training_history:
                axes[0].plot(
                    self.training_history["val_accuracy"], label="Validation Accuracy"
                )
            axes[0].set_title("Model Accuracy")
            axes[0].set_xlabel("Epoch")
            axes[0].set_ylabel("Accuracy")
            axes[0].legend()
            axes[0].grid(True, alpha=0.3)

        # Plot loss
        if "loss" in self.training_history:
            axes[1].plot(self.training_history["loss"], label="Training Loss")
            if "val_loss" in self.training_history:
                axes[1].plot(self.training_history["val_loss"], label="Validation Loss")
            axes[1].set_title("Model Loss")
            axes[1].set_xlabel("Epoch")
            axes[1].set_ylabel("Loss")
            axes[1].legend()
            axes[1].grid(True, alpha=0.3)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close()
            return save_path
        else:
            plt.show()
            return ""
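A minimal sketch (not part of the commit) of the joblib round-trip pattern that save/load above rely on: all model state is bundled into a single dict, dumped, and restored; the state contents here are invented.

import os
import tempfile

import joblib

# Bundle model state into one dict, dump it, and restore it from disk.
state = {"model": {"weights": [0.1, 0.2]}, "is_fitted": True, "training_history": {}}
with tempfile.TemporaryDirectory() as d:
    path = os.path.join(d, "model.joblib")
    joblib.dump(state, path)
    restored = joblib.load(path)
print(restored["is_fitted"])  # True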
@@ -0,0 +1,97 @@
from dataclasses import dataclass, field, asdict
from enum import Enum
from typing import List, Dict, Any, Optional

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from .feature_extractor import FeatureType


@dataclass
class ExperimentConfig:
    """Configuration for a single experiment"""

    # Experiment metadata
    name: str
    description: str = ""
    tags: List[str] = field(default_factory=list)

    # Model configuration
    model_type: str = (
        "logistic_regression"  # logistic_regression, lstm, transformer, etc.
    )
    model_params: Dict[str, Any] = field(default_factory=dict)

    # Feature configuration
    features: List[FeatureType] = field(default_factory=lambda: [FeatureType.FULL_NAME])
    feature_params: Dict[str, Any] = field(default_factory=dict)

    # Data configuration
    train_data_filter: Optional[Dict[str, Any]] = (
        None  # Filter criteria for training data
    )
    test_data_filter: Optional[Dict[str, Any]] = None
    target_column: str = "sex"

    # Training configuration
    test_size: float = 0.2
    random_seed: int = 42
    cross_validation_folds: int = 5

    # Evaluation configuration
    metrics: List[str] = field(
        default_factory=lambda: ["accuracy", "precision", "recall", "f1"]
    )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization"""
        result = asdict(self)
        # Convert enums to strings
        result["features"] = [f.value for f in self.features]
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentConfig":
        """Create from dictionary"""
        if "features" in data:
            data["features"] = [FeatureType(f) for f in data["features"]]
        return cls(**data)


class ExperimentStatus(Enum):
    """Experiment execution status"""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"


def calculate_metrics(
    y_true: np.ndarray, y_pred: np.ndarray, metrics: Optional[List[str]] = None
) -> Dict[str, float]:
    """Calculate specified metrics"""

    if metrics is None:
        metrics = ["accuracy", "precision", "recall", "f1"]

    results = {}

    if "accuracy" in metrics:
        results["accuracy"] = accuracy_score(y_true, y_pred)

    if any(m in metrics for m in ["precision", "recall", "f1"]):
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="weighted"
        )

        if "precision" in metrics:
            results["precision"] = precision
        if "recall" in metrics:
            results["recall"] = recall
        if "f1" in metrics:
            results["f1"] = f1

    return results
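A standalone sketch (not part of the commit) of the sklearn calls calculate_metrics wraps, on invented labels, to show what the weighted averaging produces.

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

y_true = np.array(["m", "f", "m", "f"])
y_pred = np.array(["m", "f", "f", "f"])
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="weighted"
)
print({"accuracy": accuracy_score(y_true, y_pred), "f1": round(float(f1), 3)})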
@@ -0,0 +1,58 @@
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional, Dict, List, Any

from ners.research.experiment import ExperimentConfig, ExperimentStatus


@dataclass
class ExperimentResult:
    """Results from an experiment execution"""

    experiment_id: str
    config: ExperimentConfig

    # Execution metadata
    start_time: datetime
    end_time: Optional[datetime] = None
    status: ExperimentStatus = ExperimentStatus.PENDING
    error_message: Optional[str] = None

    # Model artifacts
    model_path: Optional[str] = None
    feature_extractor_path: Optional[str] = None

    # Metrics
    train_metrics: Dict[str, float] = field(default_factory=dict)
    test_metrics: Dict[str, float] = field(default_factory=dict)
    cv_metrics: Dict[str, float] = field(default_factory=dict)

    # Additional results
    confusion_matrix: Optional[List[List[int]]] = None
    feature_importance: Optional[Dict[str, float]] = None
    prediction_examples: Optional[List[Dict]] = None

    # Data statistics
    train_size: int = 0
    test_size: int = 0
    class_distribution: Dict[str, int] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization"""
        result = asdict(self)
        result["config"] = self.config.to_dict()
        result["start_time"] = self.start_time.isoformat()
        result["end_time"] = self.end_time.isoformat() if self.end_time else None
        result["status"] = self.status.value
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentResult":
        """Create from dictionary"""
        data["config"] = ExperimentConfig.from_dict(data["config"])
        data["start_time"] = datetime.fromisoformat(data["start_time"])
        data["end_time"] = (
            datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
        )
        data["status"] = ExperimentStatus(data["status"])
        return cls(**data)
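A tiny sketch (not part of the commit) of the datetime round-trip that to_dict/from_dict above rely on: isoformat produces a JSON-safe string, and fromisoformat recovers the exact value.

from datetime import datetime

start = datetime.now()
encoded = start.isoformat()                # JSON-safe string
decoded = datetime.fromisoformat(encoded)  # exact round-trip
assert decoded == start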
@@ -0,0 +1,112 @@
import logging
from typing import List, Dict

import yaml

from ners.core.config.pipeline_config import PipelineConfig
from ners.research.experiment import ExperimentConfig
from ners.research.experiment.feature_extractor import FeatureType


class ExperimentBuilder:
    """Helper class to build experiment configurations"""

    def __init__(self, config: PipelineConfig):
        self.config = config

    def load_templates(self, templates: str = "research_templates.yaml") -> dict:
        """Load research templates from YAML file"""
        try:
            with open(self.config.paths.configs_dir / templates, "r") as file:
                return yaml.safe_load(file)
        except FileNotFoundError:
            logging.error(f"Templates file not found: {templates}")
            raise
        except yaml.YAMLError as e:
            logging.error(f"Error parsing templates file: {e}")
            raise

    @classmethod
    def find_template(
        cls, templates: dict, name: str, experiment_type: str = "baseline"
    ) -> dict:
        """Find experiment configuration by name and type"""

        # Map type to section in templates
        type_mapping = {
            "baseline": "baseline_experiments",
            "advanced": "advanced_experiments",
            "feature_study": "feature_studies",
            "tuning": "hyperparameter_tuning",
        }

        section_name = type_mapping.get(experiment_type)
        if not section_name:
            available_types = list(type_mapping.keys())
            raise ValueError(
                f"Unknown experiment type '{experiment_type}'. Available types: {available_types}"
            )

        if section_name not in templates:
            raise ValueError(f"Section '{section_name}' not found in templates")

        experiments = templates[section_name]

        # Search for experiment by model name
        for experiment in experiments:
            # Check if this is the experiment we're looking for:
            # match on model type, name substring, exact name, or prefixed name
            if (
                experiment.get("model_type") == name
                or name.lower() in experiment.get("name", "").lower()
                or experiment.get("name") == name
                or f"baseline_{name}" == experiment.get("name")
                or f"advanced_{name}" == experiment.get("name")
            ):
                return experiment

        # If not found, list available experiments
        available_experiments = [
            exp.get("name", exp.get("model_type", "unknown")) for exp in experiments
        ]
        raise ValueError(
            f"Experiment '{name}' not found in '{experiment_type}' section. "
            f"Available experiments: {available_experiments}"
        )

    def get_templates(
        self, templates_path: str = "research_templates.yaml"
    ) -> Dict[str, List[Dict]]:
        """Get all available experiments from templates organized by type"""
        templates = self.load_templates(templates_path)

        return {
            "baseline": templates.get("baseline_experiments", []),
            "advanced": templates.get("advanced_experiments", []),
            "feature_study": templates.get("feature_studies", []),
            "tuning": templates.get("hyperparameter_tuning", []),
        }

    @classmethod
    def from_template(cls, template_config: dict) -> ExperimentConfig:
        """Create an ExperimentConfig from a template configuration"""
        # Convert feature strings to FeatureType objects
        features = []
        for feature_str in template_config.get("features", []):
            try:
                features.append(FeatureType(feature_str))
            except ValueError:
                logging.warning(f"Unknown feature type: {feature_str}")
                continue

        return ExperimentConfig(
            name=template_config.get("name"),
            description=template_config.get("description"),
            model_type=template_config.get("model_type"),
            features=features,
            model_params=template_config.get("model_params", {}),
            tags=template_config.get("tags", []),
            test_size=template_config.get("test_size", 0.2),
            cross_validation_folds=template_config.get("cross_validation_folds", 5),
            train_data_filter=template_config.get("train_data_filter"),
        )
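A self-contained sketch (not part of the commit) of the section-then-name lookup find_template performs; the templates dict mirrors the YAML structure the builder expects, and the section and experiment names here are illustrative.

# Toy templates dict; keys mirror the sections in research_templates.yaml.
templates = {
    "baseline_experiments": [
        {"name": "baseline_logreg", "model_type": "logistic_regression"},
    ]
}
type_mapping = {"baseline": "baseline_experiments"}
section = templates[type_mapping["baseline"]]
match = next(e for e in section if e["model_type"] == "logistic_regression")
print(match["name"])  # baseline_logreg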
@@ -0,0 +1,285 @@
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.research.base_model import BaseModel
from ners.research.experiment import (
    ExperimentConfig,
    ExperimentStatus,
    calculate_metrics,
)
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.model_registry import create_model


class ExperimentRunner:
    """Runs and manages experiments"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.tracker = ExperimentTracker(self.config)
        self.data_loader = DataLoader(self.config)

    def run_experiment(self, experiment_config: ExperimentConfig) -> str:
        """Run a single experiment and return experiment ID"""
        # Create experiment
        experiment_id = self.tracker.create_experiment(experiment_config)

        try:
            logging.info(f"Starting experiment: {experiment_id}")
            self.tracker.update_experiment(
                experiment_id, status=ExperimentStatus.RUNNING
            )

            # Load data
            filepath = self.config.paths.get_data_path(
                self.config.data.output_files["featured"]
            )
            df = self.data_loader.load_csv_complete(filepath)

            # Apply data filters if specified
            df = self._apply_data_filters(df, experiment_config)

            # Prepare target variable
            y = df[experiment_config.target_column]
            X = df

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=experiment_config.test_size,
                random_state=experiment_config.random_seed,
                stratify=y,
            )

            # Create and train model
            model = create_model(experiment_config)
            model.fit(X_train, y_train)

            # Make predictions
            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)

            # Calculate metrics
            train_metrics = calculate_metrics(
                y_train, train_pred, experiment_config.metrics
            )
            test_metrics = calculate_metrics(
                y_test, test_pred, experiment_config.metrics
            )

            # Cross-validation if requested
            cv_metrics = {}
            if experiment_config.cross_validation_folds > 1:
                cv_metrics = model.cross_validate(
                    X_train, y_train, experiment_config.cross_validation_folds
                )

            # Additional analysis
            conf_matrix = confusion_matrix(y_test, test_pred).tolist()
            feature_importance = model.get_feature_importance()

            # Create prediction examples
            prediction_examples = self._create_prediction_examples(
                X_test, y_test, test_pred, model, n_examples=10
            )

            # Calculate class distribution
            class_distribution = y.value_counts().to_dict()

            # Save model
            model_path = self._save_model(model, experiment_id)

            # Update experiment with results
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.COMPLETED,
                end_time=datetime.now(),
                model_path=str(model_path),
                train_metrics=train_metrics,
                test_metrics=test_metrics,
                cv_metrics=cv_metrics,
                confusion_matrix=conf_matrix,
                feature_importance=feature_importance,
                prediction_examples=prediction_examples,
                train_size=len(X_train),
                test_size=len(X_test),
                class_distribution=class_distribution,
            )

            logging.info(f"Experiment {experiment_id} completed successfully")
            test_accuracy = test_metrics.get("accuracy")
            if test_accuracy is not None:
                logging.info(f"Test accuracy: {test_accuracy:.4f}")

            return experiment_id

        except Exception as e:
            logging.error(f"Experiment {experiment_id} failed: {str(e)}")
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.FAILED,
                end_time=datetime.now(),
                error_message=str(e),
            )
            raise

    def run_experiment_batch(self, experiments: List[ExperimentConfig]) -> List[str]:
        """Run multiple experiments"""
        experiment_ids = []

        for i, config in enumerate(experiments):
            logging.info(
                f"Running experiment {i + 1}/{len(experiments)}: {config.name}"
            )
            try:
                exp_id = self.run_experiment(config)
                experiment_ids.append(exp_id)
            except Exception as e:
                logging.error(f"Failed to run experiment {config.name}: {e}")
                continue

        return experiment_ids

    @classmethod
    def _apply_data_filters(
        cls, df: pd.DataFrame, config: ExperimentConfig
    ) -> pd.DataFrame:
        """Apply data filters specified in experiment config"""
        filtered_df = df.copy()

        # Apply training data filters
        if config.train_data_filter:
            for column, criteria in config.train_data_filter.items():
                if column in filtered_df.columns:
                    if isinstance(criteria, list):
                        filtered_df = filtered_df[filtered_df[column].isin(criteria)]
                    elif isinstance(criteria, dict):
                        if "min" in criteria:
                            filtered_df = filtered_df[
                                filtered_df[column] >= criteria["min"]
                            ]
                        if "max" in criteria:
                            filtered_df = filtered_df[
                                filtered_df[column] <= criteria["max"]
                            ]
                    else:
                        filtered_df = filtered_df[filtered_df[column] == criteria]

        return filtered_df

    @classmethod
    def _create_prediction_examples(
        cls,
        X_test: pd.DataFrame,
        y_test: pd.Series,
        predictions: np.ndarray,
        model: BaseModel,
        n_examples: int = 10,
    ) -> List[Dict]:
        """Create prediction examples for analysis"""
        examples = []

        # Get both correct and incorrect predictions
        correct_mask = y_test == predictions
        incorrect_indices = X_test[~correct_mask].index[: n_examples // 2]
        correct_indices = X_test[correct_mask].index[: n_examples // 2]

        sample_indices = list(incorrect_indices) + list(correct_indices)

        for idx in sample_indices[:n_examples]:
            example = {
                "name": X_test.loc[idx, "name"] if "name" in X_test.columns else "N/A",
                "true_label": y_test.loc[idx],
                "predicted_label": predictions[X_test.index.get_loc(idx)],
                "correct": y_test.loc[idx] == predictions[X_test.index.get_loc(idx)],
            }

            # Add probability if available
            if model.architecture == "traditional":
                proba = model.predict_proba(X_test.loc[[idx]])
                example["prediction_confidence"] = float(proba.max())

            examples.append(example)

        return examples

    def _save_model(self, model: BaseModel, experiment_id: str) -> Path:
        """Save trained model"""
        model_dir = self.config.paths.models_dir / "experiments" / experiment_id
        model_dir.mkdir(parents=True, exist_ok=True)

        model_path = model_dir / "model.joblib"
        model.save(str(model_path))

        return model_path

    def load_experiment_model(self, experiment_id: str) -> Optional[BaseModel]:
        """Load a model from a completed experiment"""
        experiment = self.tracker.get_experiment(experiment_id)

        if experiment and experiment.model_path:
            try:
                # Load the saved model data and recreate the model instance
                # using the saved config
                model_data = joblib.load(experiment.model_path)
                config = ExperimentConfig.from_dict(model_data["config"])
                model = create_model(config)

                # Restore the saved state
                model.model = model_data["model"]
                model.feature_extractor = model_data["feature_extractor"]
                model.label_encoder = model_data["label_encoder"]
                model.tokenizer = model_data.get("tokenizer")
                model.is_fitted = model_data["is_fitted"]
                model.training_history = model_data.get("training_history", {})
                model.learning_curve_data = model_data.get("learning_curve_data", {})

                # Restore vectorizers and encoders for models that use them (like XGBoost)
                if "vectorizers" in model_data and hasattr(model, "vectorizers"):
                    model.vectorizers = model_data["vectorizers"]
                if "label_encoders" in model_data and hasattr(model, "label_encoders"):
                    model.label_encoders = model_data["label_encoders"]

                return model

            except Exception as e:
                logging.error(
                    f"Failed to load model for experiment {experiment_id}: {e}"
                )
                return None

        return None

    def compare_experiments(
        self, experiment_ids: List[str], metric: str = "accuracy"
    ) -> pd.DataFrame:
        """Compare experiments and return analysis"""
        comparison_df = self.tracker.compare_experiments(experiment_ids)

        if f"test_{metric}" in comparison_df.columns:
            comparison_df = comparison_df.sort_values(f"test_{metric}", ascending=False)

        return comparison_df

    def get_feature_analysis(self, experiment_id: str) -> Optional[pd.DataFrame]:
        """Get feature importance analysis for an experiment"""
        experiment = self.tracker.get_experiment(experiment_id)

        if experiment and experiment.feature_importance:
            importance_df = pd.DataFrame(
                [
                    {"feature": feature, "importance": importance}
                    for feature, importance in experiment.feature_importance.items()
                ]
            )
            return importance_df.sort_values("importance", ascending=False)

        return None
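A standalone sketch (not part of the commit) of the three filter shapes _apply_data_filters accepts, list membership, min/max range, and exact match, on an invented frame.

import pandas as pd

df = pd.DataFrame({"province": ["katanga", "kinshasa", "kivu"], "year": [2014, 2016, 2022]})
filters = {"province": ["katanga", "kivu"], "year": {"min": 2015, "max": 2022}}
for column, criteria in filters.items():
    if isinstance(criteria, list):      # membership filter
        df = df[df[column].isin(criteria)]
    elif isinstance(criteria, dict):    # range filter
        if "min" in criteria:
            df = df[df[column] >= criteria["min"]]
        if "max" in criteria:
            df = df[df[column] <= criteria["max"]]
    else:                               # exact match
        df = df[df[column] == criteria]
print(df["province"].tolist())  # ['kivu']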
@@ -0,0 +1,200 @@
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from ners.core.config import PipelineConfig, get_config
|
||||
from ners.research.experiment import ExperimentConfig, ExperimentStatus
|
||||
from ners.research.experiment.experiement_result import ExperimentResult
|
||||
|
||||
|
||||
class ExperimentTracker:
|
||||
"""Tracks and manages experiments"""
|
||||
|
||||
def __init__(self, config: Optional[PipelineConfig] = None):
|
||||
self.config = config or get_config()
|
||||
self.experiments_dir = self.config.paths.outputs_dir / "experiments"
|
||||
self.experiments_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
self.results_db_path = self.experiments_dir / "experiments.json"
|
||||
self._results: Dict[str, ExperimentResult] = {}
|
||||
self._load_results()
|
||||
|
||||
def _load_results(self):
|
||||
"""Load existing experiment results"""
|
||||
if self.results_db_path.exists():
|
||||
try:
|
||||
with open(self.results_db_path, "r") as f:
|
||||
data = json.load(f)
|
||||
|
||||
for exp_id, exp_data in data.items():
|
||||
self._results[exp_id] = ExperimentResult.from_dict(exp_data)
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to load experiment results: {e}")
|
||||
|
||||
def _save_results(self):
|
||||
"""Save experiment results to disk"""
|
||||
data = {exp_id: result.to_dict() for exp_id, result in self._results.items()}
|
||||
|
||||
with open(self.results_db_path, "w") as f:
|
||||
json.dump(data, f, indent=2, default=str)
|
||||
|
||||
def create_experiment(self, config: ExperimentConfig) -> str:
|
||||
"""Create a new experiment and return its ID"""
|
||||
# Generate experiment ID
|
||||
config_hash = hashlib.md5(
|
||||
json.dumps(config.to_dict(), sort_keys=True).encode()
|
||||
).hexdigest()[:8]
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
experiment_id = f"{config.name}_{timestamp}_{config_hash}"
|
||||
|
||||
# Create result object
|
||||
result = ExperimentResult(
|
||||
experiment_id=experiment_id, config=config, start_time=datetime.now()
|
||||
)
|
||||
|
||||
self._results[experiment_id] = result
|
||||
        self._save_results()

        return experiment_id

    def update_experiment(self, experiment_id: str, **updates):
        """Update an experiment's results"""
        if experiment_id in self._results:
            result = self._results[experiment_id]

            for key, value in updates.items():
                if hasattr(result, key):
                    setattr(result, key, value)

            self._save_results()

    def get_experiment(self, experiment_id: str) -> Optional[ExperimentResult]:
        """Get experiment by ID"""
        return self._results.get(experiment_id)

    def list_experiments(
        self,
        status: Optional[ExperimentStatus] = None,
        tags: Optional[List[str]] = None,
        model_type: Optional[str] = None,
    ) -> List[ExperimentResult]:
        """List experiments with optional filtering"""
        results = list(self._results.values())

        if status:
            results = [r for r in results if r.status == status]

        if tags:
            results = [r for r in results if any(tag in r.config.tags for tag in tags)]

        if model_type:
            results = [r for r in results if r.config.model_type == model_type]

        return sorted(results, key=lambda x: x.start_time, reverse=True)

    def get_best_experiment(
        self,
        metric: str = "accuracy",
        dataset: str = "test",
        filters: Optional[Dict] = None,
    ) -> Optional[ExperimentResult]:
        """Get the best experiment based on a metric"""
        experiments = self.list_experiments()

        if filters:
            # Apply additional filters
            if "model_type" in filters:
                experiments = [
                    e
                    for e in experiments
                    if e.config.model_type == filters["model_type"]
                ]
            if "features" in filters:
                experiments = [
                    e
                    for e in experiments
                    if any(f in e.config.features for f in filters["features"])
                ]

        valid_experiments = []
        for exp in experiments:
            if exp.status == ExperimentStatus.COMPLETED:
                metrics_dict = (
                    exp.test_metrics if dataset == "test" else exp.train_metrics
                )
                if metric in metrics_dict:
                    valid_experiments.append((exp, metrics_dict[metric]))

        if not valid_experiments:
            return None

        return max(valid_experiments, key=lambda x: x[1])[0]

    def compare_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
        """Compare multiple experiments in a DataFrame"""
        rows = []

        for exp_id in experiment_ids:
            exp = self.get_experiment(exp_id)
            if exp:
                row = {
                    "experiment_id": exp_id,
                    "name": exp.config.name,
                    "model_type": exp.config.model_type,
                    "features": ",".join([f.value for f in exp.config.features]),
                    "status": exp.status.value,
                    "train_size": exp.train_size,
                    "test_size": exp.test_size,
                }

                # Add metrics
                for metric, value in exp.test_metrics.items():
                    row[f"test_{metric}"] = value

                for metric, value in exp.cv_metrics.items():
                    row[f"cv_{metric}"] = value

                rows.append(row)

        return pd.DataFrame(rows)

    def export_results(self, output_path: Optional[Path] = None) -> Path:
        """Export all results to CSV"""
        if output_path is None:
            output_path = (
                self.experiments_dir
                / f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            )

        rows = []
        for exp in self._results.values():
            row = {
                "experiment_id": exp.experiment_id,
                "name": exp.config.name,
                "description": exp.config.description,
                "model_type": exp.config.model_type,
                "features": ",".join([f.value for f in exp.config.features]),
                "status": exp.status.value,
                "start_time": exp.start_time.isoformat(),
                "end_time": exp.end_time.isoformat() if exp.end_time else None,
                "train_size": exp.train_size,
                "test_size": exp.test_size,
            }

            # Add all metrics
            for metric, value in exp.test_metrics.items():
                row[f"test_{metric}"] = value

            for metric, value in exp.cv_metrics.items():
                row[f"cv_{metric}"] = value

            rows.append(row)

        df = pd.DataFrame(rows)
        df.to_csv(output_path, index=False)

        return output_path
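
For reference, a minimal query sketch against a populated tracker (a usage illustration, not part of the committed module; it assumes experiments have already been logged through the runner):

from ners.core.config import get_config
from ners.research.experiment.experiment_tracker import ExperimentTracker

tracker = ExperimentTracker(get_config())
best = tracker.get_best_experiment(metric="f1", dataset="test")
if best:
    # One-row comparison frame for the winning experiment
    print(tracker.compare_experiments([best.experiment_id]))
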
@@ -0,0 +1,92 @@
from enum import Enum
from typing import Any, Dict, List, Optional, Union

import pandas as pd


class FeatureType(Enum):
    """Types of features that can be extracted from names"""

    FULL_NAME = "full_name"
    NATIVE_NAME = "native_name"
    SURNAME = "surname"
    FIRST_WORD = "first_word"
    LAST_WORD = "last_word"
    NAME_LENGTH = "name_length"
    WORD_COUNT = "word_count"
    PROVINCE = "province"
    CHAR_NGRAMS = "char_ngrams"
    WORD_NGRAMS = "word_ngrams"
    NAME_ENDINGS = "name_endings"
    NAME_BEGINNINGS = "name_beginnings"


class FeatureExtractor:
    """Extract different types of features from name data"""

    def __init__(
        self,
        feature_types: List[FeatureType],
        feature_params: Optional[Dict[str, Any]] = None,
    ):
        self.feature_types = feature_types
        self.feature_params = feature_params or {}

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract all configured features"""
        features_df = pd.DataFrame(index=df.index)

        for feature_type in self.feature_types:
            feature_data = self._extract_single_feature(df, feature_type)

            if isinstance(feature_data, pd.DataFrame):
                features_df = pd.concat([features_df, feature_data], axis=1)
            else:
                features_df[feature_type.value] = feature_data

        return features_df

    def _extract_single_feature(
        self, df: pd.DataFrame, feature_type: FeatureType
    ) -> Union[pd.Series, pd.DataFrame]:
        """Extract a single type of feature"""
        if feature_type == FeatureType.FULL_NAME:
            return df["name"].fillna("")

        elif feature_type == FeatureType.NATIVE_NAME:
            return df["identified_name"].fillna(df["probable_native"]).fillna("")

        elif feature_type == FeatureType.SURNAME:
            return df["identified_surname"].fillna(df["probable_surname"]).fillna("")

        elif feature_type == FeatureType.FIRST_WORD:
            return df["name"].str.split().str[0].fillna("")

        elif feature_type == FeatureType.LAST_WORD:
            return df["name"].str.split().str[-1].fillna("")

        elif feature_type == FeatureType.NAME_LENGTH:
            return df["name"].str.len().fillna(0)

        elif feature_type == FeatureType.WORD_COUNT:
            return df["words"].fillna(1)

        elif feature_type == FeatureType.PROVINCE:
            return df["province"].fillna("unknown")

        elif feature_type == FeatureType.NAME_ENDINGS:
            n = self.feature_params.get("ending_length", 3)
            return df["name"].str[-n:].fillna("")

        elif feature_type == FeatureType.NAME_BEGINNINGS:
            n = self.feature_params.get("beginning_length", 3)
            return df["name"].str[:n].fillna("")

        elif feature_type == FeatureType.CHAR_NGRAMS:
            # N-gram expansion itself is handled by the model's vectorizer
            return df["name"].fillna("")

        elif feature_type == FeatureType.WORD_NGRAMS:
            # N-gram expansion itself is handled by the model's vectorizer
            return df["name"].fillna("")

        else:
            raise ValueError(f"Unknown feature type: {feature_type}")
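
A quick sketch of the extractor on a toy frame (the values are made up; only the columns used by the selected feature types need to exist):

import pandas as pd

toy = pd.DataFrame({"name": ["Kabila Ngoy", "Ilunga", None]})
extractor = FeatureExtractor(
    [FeatureType.FULL_NAME, FeatureType.LAST_WORD, FeatureType.NAME_LENGTH]
)
print(extractor.extract_features(toy))
# -> columns full_name, last_word, name_length; the missing name becomes "" / 0
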
@@ -0,0 +1,44 @@
from typing import List

from ners.research.base_model import BaseModel
from ners.research.experiment import ExperimentConfig
from ners.research.models.bigru_model import BiGRUModel
from ners.research.models.cnn_model import CNNModel
from ners.research.models.ensemble_model import EnsembleModel
from ners.research.models.lightgbm_model import LightGBMModel
from ners.research.models.logistic_regression_model import LogisticRegressionModel
from ners.research.models.lstm_model import LSTMModel
from ners.research.models.naive_bayes_model import NaiveBayesModel
from ners.research.models.random_forest_model import RandomForestModel
from ners.research.models.svm_model import SVMModel
from ners.research.models.transformer_model import TransformerModel
from ners.research.models.xgboost_model import XGBoostModel

MODEL_REGISTRY = {
    "bigru": BiGRUModel,
    "cnn": CNNModel,
    "ensemble": EnsembleModel,
    "lightgbm": LightGBMModel,
    "logistic_regression": LogisticRegressionModel,
    "lstm": LSTMModel,
    "naive_bayes": NaiveBayesModel,
    "random_forest": RandomForestModel,
    "svm": SVMModel,
    "transformer": TransformerModel,
    "xgboost": XGBoostModel,
}


def create_model(config: ExperimentConfig) -> BaseModel:
    """Factory function to create models"""
    model_class = MODEL_REGISTRY.get(config.model_type)

    if model_class is None:
        raise ValueError(f"Unknown model type: {config.model_type}")

    return model_class(config)


def list_available_models() -> List[str]:
    """List all available model types"""
    return list(MODEL_REGISTRY.keys())
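
A hypothetical factory call (the ExperimentConfig fields follow their use elsewhere in this commit; it is assumed the remaining fields have defaults):

from ners.research.experiment import ExperimentConfig, FeatureType

config = ExperimentConfig(
    name="baseline_nb",
    description="demo",
    model_type="naive_bayes",
    features=[FeatureType.FULL_NAME],
)
model = create_model(config)  # -> NaiveBayesModel instance
print(list_available_models())  # all keys of MODEL_REGISTRY
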
@@ -0,0 +1,301 @@
import json
import logging
from datetime import datetime
from typing import Any, Dict, List, Optional

import pandas as pd

from ners.core.config import get_config
from ners.core.utils.data_loader import DataLoader
from ners.research.experiment import FeatureType, ExperimentConfig
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.model_registry import MODEL_REGISTRY


class ModelTrainer:
    """Comprehensive model training and artifact management"""

    def __init__(self, config=None):
        self.config = config or get_config()
        self.data_loader = DataLoader(self.config)
        self.experiment_runner = ExperimentRunner(self.config)
        self.experiment_tracker = ExperimentTracker(self.config)

        # Set up the model artifacts directory
        self.models_dir = self.config.paths.models_dir
        self.models_dir.mkdir(parents=True, exist_ok=True)

    def train_single_model(
        self,
        model_name: str,
        model_type: str = "logistic_regression",
        features: Optional[List[str]] = None,
        model_params: Optional[Dict[str, Any]] = None,
        tags: Optional[List[str]] = None,
        save_artifacts: bool = True,
    ) -> str:
        """
        Train a single model and save its artifacts.
        Returns the experiment ID.
        """
        logging.info(f"Training {model_type} model: {model_name}")

        if features is None:
            features = ["full_name"]
        feature_types = [FeatureType(f) for f in features]

        # Prepare tags: combine default tags with caller-provided tags
        default_tags = ["training", model_type]
        experiment_tags = default_tags + (tags or [])

        # Create the experiment configuration
        config = ExperimentConfig(
            name=model_name,
            description=f"Training {model_type} model with features: {', '.join(features)}",
            model_type=model_type,
            features=feature_types,
            model_params=model_params or {},
            tags=experiment_tags,
        )

        # Run the experiment
        experiment_id = self.experiment_runner.run_experiment(config)
        experiment = self.experiment_tracker.get_experiment(experiment_id)

        if experiment and experiment.test_metrics:
            logging.info("Training completed successfully!")
            logging.info(f"Experiment ID: {experiment_id}")
            logging.info(
                f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}"
            )
            logging.info(f"Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")

        if save_artifacts:
            self.save_model_artifacts(experiment_id)

        return experiment_id

    def train_multiple_models(
        self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
    ) -> List[str]:
        """
        Train multiple models with different configurations.
        """
        logging.info(f"Training {len(model_configs)} models...")

        experiment_ids = []

        for i, config in enumerate(model_configs):
            model_name = f"{base_name}_{config['model_type']}_{i + 1}"

            try:
                exp_id = self.train_single_model(
                    model_name=model_name,
                    model_type=config["model_type"],
                    features=config.get("features", ["full_name"]),
                    model_params=config.get("model_params", {}),
                    save_artifacts=save_all,
                )
                experiment_ids.append(exp_id)

            except Exception as e:
                logging.error(f"Failed to train {model_name}: {e}")
                continue

        logging.info(f"Completed training {len(experiment_ids)} models successfully")
        return experiment_ids

    def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]:
        """
        Save model artifacts in a structured way for easy loading.
        Returns paths to saved artifacts.
        """
        experiment = self.experiment_tracker.get_experiment(experiment_id)
        if not experiment:
            raise ValueError(f"Experiment {experiment_id} not found")

        # Create a model-specific directory
        model_dir = self.models_dir / experiment_id
        model_dir.mkdir(parents=True, exist_ok=True)

        # Load the trained model
        trained_model = self.experiment_runner.load_experiment_model(experiment_id)
        if not trained_model:
            raise ValueError(f"Could not load model for experiment {experiment_id}")

        # Save the complete model with joblib
        model_path = model_dir / "complete_model.joblib"
        trained_model.save(str(model_path))

        # Save the model configuration
        config_path = model_dir / "model_config.json"
        with open(config_path, "w") as f:
            json.dump(experiment.config.to_dict(), f, indent=2)

        # Save the experiment results
        results_path = model_dir / "experiment_results.json"
        with open(results_path, "w") as f:
            json.dump(experiment.to_dict(), f, indent=2, default=str)

        # Generate and save learning curves
        learning_curve_path = None
        training_history_path = None

        try:
            # Load data for learning-curve generation
            data_path = self.config.paths.get_data_path(
                self.config.data.output_files["featured"]
            )
            if data_path.exists():
                df = self.data_loader.load_csv_complete(data_path)

                # Generate the learning curve
                logging.info("Generating learning curve...")
                trained_model.generate_learning_curve(
                    df, df[experiment.config.target_column]
                )

                # Plot and save the learning curve
                learning_curve_path = model_dir / "learning_curve.png"
                trained_model.plot_learning_curve(str(learning_curve_path))

                # Plot and save the training history (for neural networks)
                if trained_model.training_history:
                    training_history_path = model_dir / "training_history.png"
                    trained_model.plot_training_history(str(training_history_path))

                # Save learning curve data as JSON
                learning_data_path = model_dir / "learning_curve_data.json"
                with open(learning_data_path, "w") as f:
                    json.dump(trained_model.learning_curve_data, f, indent=2)

                # Save training history data as JSON
                if trained_model.training_history:
                    history_data_path = model_dir / "training_history_data.json"
                    with open(history_data_path, "w") as f:
                        json.dump(trained_model.training_history, f, indent=2)

        except Exception as e:
            logging.warning(f"Could not generate learning curves: {e}")

        # Save artifacts metadata
        metadata = {
            "experiment_id": experiment_id,
            "model_name": experiment.config.name,
            "model_type": experiment.config.model_type,
            "features": [f.value for f in experiment.config.features],
            "training_date": datetime.now().isoformat(),
            "test_accuracy": experiment.test_metrics.get("accuracy", 0),
            "test_f1": experiment.test_metrics.get("f1", 0),
            "model_path": str(model_path),
            "config_path": str(config_path),
            "results_path": str(results_path),
            "learning_curve_plot": str(learning_curve_path)
            if learning_curve_path
            else None,
            "training_history_plot": str(training_history_path)
            if training_history_path
            else None,
            "has_learning_curve": bool(trained_model.learning_curve_data),
            "has_training_history": bool(trained_model.training_history),
        }

        metadata_path = model_dir / "metadata.json"
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=2)

        logging.info(f"Model artifacts saved to: {model_dir}")
        logging.info(f"  - Complete model: {model_path.name}")
        logging.info(f"  - Configuration: {config_path.name}")
        logging.info(f"  - Results: {results_path.name}")
        logging.info(f"  - Metadata: {metadata_path.name}")

        if learning_curve_path and learning_curve_path.exists():
            logging.info(f"  - Learning curve: {learning_curve_path.name}")

        if training_history_path and training_history_path.exists():
            logging.info(f"  - Training history: {training_history_path.name}")

        return {
            "model_dir": str(model_dir),
            "model_path": str(model_path),
            "config_path": str(config_path),
            "results_path": str(results_path),
            "metadata_path": str(metadata_path),
            "learning_curve_plot": str(learning_curve_path)
            if learning_curve_path
            else None,
            "training_history_plot": str(training_history_path)
            if training_history_path
            else None,
        }

    def load_trained_model(self, experiment_id: str):
        """
        Load a previously trained model from artifacts.
        """
        model_dir = self.models_dir / experiment_id
        model_path = model_dir / "complete_model.joblib"

        if not model_path.exists():
            raise FileNotFoundError(
                f"Model artifacts not found for experiment {experiment_id}"
            )

        # Read the metadata to resolve the model class dynamically
        metadata_path = model_dir / "metadata.json"
        with open(metadata_path, "r") as f:
            metadata = json.load(f)

        model_type = metadata["model_type"]
        model_class = MODEL_REGISTRY[model_type]

        # Load the complete model
        loaded_model = model_class.load(str(model_path))

        logging.info(f"Loaded model: {metadata['model_name']}")
        logging.info(f"  Type: {model_type}")
        logging.info(f"  Accuracy: {metadata['test_accuracy']:.4f}")

        return loaded_model

    def list_saved_models(self) -> pd.DataFrame:
        """
        List all saved model artifacts.
        """
        models_data = []

        for model_dir in self.models_dir.iterdir():
            if model_dir.is_dir():
                metadata_path = model_dir / "metadata.json"
                if metadata_path.exists():
                    try:
                        with open(metadata_path, "r") as f:
                            metadata = json.load(f)
                        models_data.append(metadata)
                    except Exception as e:
                        logging.warning(
                            f"Could not read metadata for {model_dir.name}: {e}"
                        )

        if not models_data:
            logging.info("No saved models found.")
            return pd.DataFrame()

        df = pd.DataFrame(models_data)

        # Format the display
        display_columns = [
            "model_name",
            "model_type",
            "features",
            "test_accuracy",
            "test_f1",
            "training_date",
        ]
        available_columns = [col for col in display_columns if col in df.columns]

        return df[available_columns].sort_values("training_date", ascending=False)
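
The intended end-to-end flow, as a sketch (the model name, features, and parameters below are illustrative):

trainer = ModelTrainer()
exp_id = trainer.train_single_model(
    "baseline_lr",
    model_type="logistic_regression",
    features=["full_name"],
    model_params={"max_features": 5000},
    tags=["demo"],
)
model = trainer.load_trained_model(exp_id)
print(trainer.list_saved_models())
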
@@ -0,0 +1,72 @@
from typing import Any

import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from ners.research.neural_network_model import NeuralNetworkModel


class BiGRUModel(NeuralNetworkModel):
    """Bidirectional GRU model for name classification"""

    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
        params = kwargs
        model = Sequential(
            [
                # Mask padding tokens so the recurrent layers ignore them.
                Embedding(
                    input_dim=vocab_size,
                    output_dim=params.get("embedding_dim", 64),
                    mask_zero=True,
                ),
                # First recurrent block returns full sequences to allow stacking.
                # Moderate dropout + optional recurrent_dropout to reduce overfitting
                # on short names while retaining temporal signal.
                Bidirectional(
                    GRU(
                        params.get("gru_units", 32),
                        return_sequences=True,
                        dropout=params.get("dropout", 0.2),
                        recurrent_dropout=params.get("recurrent_dropout", 0.0),
                    )
                ),
                # Second GRU summarizes to the last hidden state (no return_sequences),
                # capturing bidirectional context efficiently for classification.
                Bidirectional(
                    GRU(
                        params.get("gru_units", 32),
                        dropout=params.get("dropout", 0.2),
                        recurrent_dropout=params.get("recurrent_dropout", 0.0),
                    )
                ),
                # Small dense head; ReLU + dropout for capacity and regularization.
                Dense(64, activation="relu"),
                Dropout(params.get("dropout", 0.5)),
                # Two-way softmax for binary gender classification.
                Dense(2, activation="softmax", dtype="float32"),
            ]
        )

        model.compile(
            loss="sparse_categorical_crossentropy",
            optimizer="adam",
            metrics=["accuracy"],
        )
        return model

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_data = self._collect_text_corpus(X)

        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(text_data)

        sequences = self.tokenizer.texts_to_sequences(text_data)
        max_len = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=max_len, padding="post")
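
To make the word-level preparation above concrete, a standalone sketch of tokenization and padding (the names are invented):

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

tok = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
tok.fit_on_texts(["Kabila Ngoy", "Ilunga Mbuyi Kalala"])
seqs = tok.texts_to_sequences(["Ilunga Kabila", "Unseen Name"])
print(pad_sequences(seqs, maxlen=6, padding="post"))
# Known words map to learned indices, unknown words to the <OOV> index (1),
# and "post" padding appends zeros, which the Embedding masks via mask_zero.
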
@@ -0,0 +1,86 @@
from typing import Any

import numpy as np
import pandas as pd
from tensorflow.keras.layers import (
    Embedding,
    Conv1D,
    MaxPooling1D,
    GlobalMaxPooling1D,
    Dense,
    Dropout,
    SpatialDropout1D,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from ners.research.neural_network_model import NeuralNetworkModel


class CNNModel(NeuralNetworkModel):
    """1D Convolutional Neural Network for character patterns"""

    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
        """Build CNN model with known vocabulary size"""
        params = kwargs
        model = Sequential(
            [
                # Learn char/subword embeddings; spatial dropout regularizes across
                # channels to make the model robust to noisy characters and
                # transliteration.
                Embedding(
                    input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)
                ),
                SpatialDropout1D(rate=params.get("embedding_dropout", 0.1)),
                # Small kernels capture short n-gram-like patterns; padding='same'
                # keeps sequence length stable for simpler pooling behavior.
                Conv1D(
                    filters=params.get("filters", 64),
                    kernel_size=params.get("kernel_size", 3),
                    activation="relu",
                    padding="same",
                ),
                # Downsample to gain some position invariance and reduce computation.
                MaxPooling1D(pool_size=2),
                # Second conv layer to compose higher-level motifs (e.g., suffix+vowel).
                Conv1D(
                    filters=params.get("filters", 64),
                    kernel_size=params.get("kernel_size", 3),
                    activation="relu",
                    padding="same",
                ),
                # Global max pooling picks the strongest motif evidence anywhere in
                # the name.
                GlobalMaxPooling1D(),
                # Compact dense head with dropout to control overfitting.
                Dense(64, activation="relu"),
                Dropout(params.get("dropout", 0.5)),
                # Two-way softmax for binary classification.
                Dense(2, activation="softmax", dtype="float32"),
            ]
        )

        model.compile(
            loss="sparse_categorical_crossentropy",
            optimizer="adam",
            metrics=["accuracy"],
        )
        return model

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Prepare sequences for the CNN using extracted features"""
        # X already contains the features extracted by FeatureExtractor;
        # use character-level text for the CNN.
        text_data = self._collect_text_corpus(X)

        # Initialize a character-level tokenizer on first use
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=True, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(text_data)

        sequences = self.tokenizer.texts_to_sequences(text_data)
        # Longer than the word-level models because sequences are per character
        max_len = self.config.model_params.get("max_len", 20)

        return pad_sequences(sequences, maxlen=max_len, padding="post")
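
The character-level tokenizer behaves differently from the word-level one used by the recurrent models; a standalone check (toy input):

from tensorflow.keras.preprocessing.text import Tokenizer

tok = Tokenizer(char_level=True, lower=True, oov_token="<OOV>")
tok.fit_on_texts(["Mbuyi"])
print(tok.texts_to_sequences(["Mbu"]))  # one index per character
# With max_len 20, short names are mostly zero padding, so the global max
# pooling above is what keeps the classifier insensitive to name length.
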
@@ -0,0 +1,110 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from ners.research.experiment import ExperimentConfig
from ners.research.traditional_model import TraditionalModel


class EnsembleModel(TraditionalModel):
    """Ensemble model combining multiple base models"""

    @property
    def architecture(self) -> str:
        """Return the architecture type"""
        return "ensemble"

    def __init__(self, config: ExperimentConfig):
        super().__init__(config)
        self.base_models = []
        self.model_weights = None

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params
        base_model_types = params.get(
            "base_models", ["logistic_regression", "random_forest", "naive_bayes"]
        )

        # Create base models with simplified configs; diverse vectorizers/classifiers
        # encourage complementary errors that voting can average out.
        estimators = []
        for model_type in base_model_types:
            if model_type == "logistic_regression":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            CountVectorizer(
                                analyzer="char", ngram_range=(2, 4), max_features=5000
                            ),
                        ),
                        (
                            "classifier",
                            LogisticRegression(
                                max_iter=1000, random_state=self.config.random_seed
                            ),
                        ),
                    ]
                )
                estimators.append(("logistic_regression", model))

            elif model_type == "random_forest":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            TfidfVectorizer(
                                analyzer="char", ngram_range=(2, 3), max_features=3000
                            ),
                        ),
                        (
                            "classifier",
                            RandomForestClassifier(
                                n_estimators=50, random_state=self.config.random_seed
                            ),
                        ),
                    ]
                )
                estimators.append(("rf", model))

            elif model_type == "naive_bayes":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            CountVectorizer(
                                analyzer="char", ngram_range=(1, 3), max_features=4000
                            ),
                        ),
                        ("classifier", MultinomialNB()),
                    ]
                )
                estimators.append(("nb", model))

        # Soft voting averages probabilities (preferred when members are calibrated);
        # hard voting uses the majority class. Parallelize member predictions.
        voting_type = params.get("voting", "soft")  # 'hard' or 'soft'
        return VotingClassifier(
            estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1)
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_features = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_features.append(X[feature_type.value].astype(str))

        if len(text_features) == 1:
            return text_features[0].values
        else:
            combined = text_features[0].astype(str)
            for feature in text_features[1:]:
                combined = combined + " " + feature.astype(str)
            return combined.values
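
The difference between the two voting rules, as plain arithmetic (the probabilities are made up):

import numpy as np

member_probs = np.array([
    [0.9, 0.1],  # logistic regression
    [0.4, 0.6],  # random forest
    [0.8, 0.2],  # naive bayes
])
print(member_probs.mean(axis=0).argmax())  # -> 0: averaged evidence wins
# Hard voting would also pick class 0 here (two votes to one), but because
# soft voting weighs confidence, the two rules can disagree in general.
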
@@ -0,0 +1,115 @@
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

from ners.research.traditional_model import TraditionalModel


class LightGBMModel(TraditionalModel):
    """LightGBM with engineered features"""

    def __init__(self, config):
        super().__init__(config)
        # Store vectorizers and encoders to ensure a consistent feature space
        # between fit-time and predict-time calls
        self.vectorizers = {}
        self.label_encoders = {}

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params

        # Optional GPU acceleration
        use_gpu = bool(params.get("use_gpu", False))
        device = params.get("device", "gpu" if use_gpu else "cpu")
        gpu_platform_id = params.get("gpu_platform_id", None)
        gpu_device_id = params.get("gpu_device_id", None)

        # Leaf-wise boosted trees excel on sparse/categorical mixes; the binary
        # objective and parallelism improve training speed for this task.
        return lgb.LGBMClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", -1),
            learning_rate=params.get("learning_rate", 0.1),
            num_leaves=params.get("num_leaves", 31),
            subsample=params.get("subsample", 0.8),
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
            objective=params.get("objective", "binary"),
            n_jobs=params.get("n_jobs", -1),
            verbose=2,
            device=device,
            gpu_platform_id=gpu_platform_id,
            gpu_device_id=gpu_device_id,
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        features = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                column = X[feature_type.value]

                if feature_type.value in ["name_length", "word_count"]:
                    # Numerical features pass through unchanged
                    features.append(column.fillna(0).values.reshape(-1, 1))
                elif feature_type.value in ["full_name", "native_name", "surname"]:
                    # Character-level features for names
                    feature_key = f"vectorizer_{feature_type.value}"

                    if feature_key not in self.vectorizers:
                        # First call: create and fit the vectorizer
                        self.vectorizers[feature_key] = CountVectorizer(
                            analyzer="char", ngram_range=(2, 3), max_features=50
                        )
                        char_features = (
                            self.vectorizers[feature_key]
                            .fit_transform(column.fillna("").astype(str))
                            .toarray()
                        )
                    else:
                        # Subsequent calls: reuse the fitted vectorizer
                        char_features = (
                            self.vectorizers[feature_key]
                            .transform(column.fillna("").astype(str))
                            .toarray()
                        )

                    features.append(char_features)
                else:
                    # Categorical features
                    feature_key = f"encoder_{feature_type.value}"

                    if feature_key not in self.label_encoders:
                        # First call: create and fit the encoder
                        self.label_encoders[feature_key] = LabelEncoder()
                        encoded = self.label_encoders[feature_key].fit_transform(
                            column.fillna("unknown").astype(str)
                        )
                    else:
                        # Subsequent calls: reuse the fitted encoder and map
                        # unseen labels to a default value
                        column_clean = column.fillna("unknown").astype(str)

                        # Classes the encoder was trained on
                        known_classes = set(self.label_encoders[feature_key].classes_)

                        # Map unseen values to "unknown" if present, else to the
                        # first known class
                        if "unknown" in known_classes:
                            default_class = "unknown"
                        else:
                            default_class = self.label_encoders[feature_key].classes_[0]

                        column_mapped = column_clean.apply(
                            lambda x: x if x in known_classes else default_class
                        )

                        encoded = self.label_encoders[feature_key].transform(
                            column_mapped
                        )

                    features.append(encoded.reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
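
The unseen-label fallback in prepare_features, shown standalone with toy values:

from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder().fit(["kinshasa", "unknown"])
known = set(enc.classes_)
values = ["kinshasa", "katanga"]                    # "katanga" was never seen
mapped = [v if v in known else "unknown" for v in values]
print(enc.transform(mapped))                        # [0 1], no ValueError raised
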
@@ -0,0 +1,53 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from ners.research.traditional_model import TraditionalModel


class LogisticRegressionModel(TraditionalModel):
    """Logistic Regression with character n-grams"""

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params
        # Character n-grams are strong signals for names; (2, 5) balances
        # capturing prefixes/suffixes with tractable feature size.
        vectorizer = CountVectorizer(
            analyzer="char",
            ngram_range=params.get("ngram_range", (2, 5)),
            max_features=params.get("max_features", 10000),
        )

        # liblinear handles sparse, small-to-medium problems well; n_jobs
        # parallelizes OvR across classes (no effect for binary).
        # class_weight can mitigate imbalance.
        classifier = LogisticRegression(
            max_iter=params.get("max_iter", 1000),
            random_state=self.config.random_seed,
            verbose=2,
            solver=params.get("solver", "liblinear"),
            n_jobs=params.get("n_jobs", -1),
            class_weight=params.get("class_weight", None),
        )

        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_features = []

        # Collect text-based features from the extracted-features DataFrame
        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_features.append(X[feature_type.value].astype(str))

        # Combine text features
        if len(text_features) == 1:
            return text_features[0].values
        else:
            # Concatenate multiple text features with a separator
            combined = text_features[0].astype(str)
            for feature in text_features[1:]:
                combined = combined + " " + feature.astype(str)
            return combined.values
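
What character n-grams look like for a single name (using a shorter (2, 3) range than the model default, for brevity):

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(analyzer="char", ngram_range=(2, 3))
vec.fit(["ngoy"])
print(sorted(vec.get_feature_names_out()))
# ['go', 'goy', 'ng', 'ngo', 'oy'] -- prefixes and suffixes become explicit
# features, which is how these models pick up gendered name endings.
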
@@ -0,0 +1,71 @@
from typing import Any

import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from ners.research.neural_network_model import NeuralNetworkModel


class LSTMModel(NeuralNetworkModel):
    """LSTM model for sequence learning"""

    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
        params = kwargs
        model = Sequential(
            [
                # Mask padding tokens; required for the LSTM to ignore padded
                # timesteps.
                Embedding(
                    input_dim=vocab_size,
                    output_dim=params.get("embedding_dim", 64),
                    mask_zero=True,
                ),
                # Stacked bidirectional LSTMs: the first returns sequences to feed
                # the next. Dropout/recurrent_dropout mitigate overfitting on short
                # sequences.
                Bidirectional(
                    LSTM(
                        params.get("lstm_units", 32),
                        return_sequences=True,
                        dropout=params.get("dropout", 0.2),
                        recurrent_dropout=params.get("recurrent_dropout", 0.0),
                    )
                ),
                # Second LSTM condenses the sequence to a fixed vector for
                # classification.
                Bidirectional(
                    LSTM(
                        params.get("lstm_units", 32),
                        dropout=params.get("dropout", 0.2),
                        recurrent_dropout=params.get("recurrent_dropout", 0.0),
                    )
                ),
                # Compact dense head with dropout; sufficient capacity for name
                # signals.
                Dense(64, activation="relu"),
                Dropout(params.get("dropout", 0.5)),
                # Two-way softmax for binary classification.
                Dense(2, activation="softmax", dtype="float32"),
            ]
        )

        model.compile(
            loss="sparse_categorical_crossentropy",
            optimizer="adam",
            metrics=["accuracy"],
        )
        return model

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_data = self._collect_text_corpus(X)

        # Initialize the tokenizer on first use
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(text_data)

        # Convert to padded index sequences
        sequences = self.tokenizer.texts_to_sequences(text_data)
        max_len = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=max_len, padding="post")
@@ -0,0 +1,42 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from ners.research.traditional_model import TraditionalModel


class NaiveBayesModel(TraditionalModel):
    """Multinomial Naive Bayes with character n-grams"""

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params
        # Bag-of-character-ngrams aligns with Multinomial NB assumptions; the
        # default (2, 5) range captures suffix/prefix cues at tractable size.
        vectorizer = CountVectorizer(
            analyzer="char",
            ngram_range=params.get("ngram_range", (2, 5)),
            max_features=params.get("max_features", 8000),
        )

        # Laplace smoothing (alpha) counters zero counts for rare n-grams.
        classifier = MultinomialNB(alpha=params.get("alpha", 1.0))

        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_features = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_features.append(X[feature_type.value].astype(str))

        if len(text_features) == 1:
            return text_features[0].values
        else:
            combined = text_features[0].astype(str)
            for feature in text_features[1:]:
                combined = combined + " " + feature.astype(str)
            return combined.values
@@ -0,0 +1,71 @@
from typing import Dict

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

from ners.research.traditional_model import TraditionalModel


class RandomForestModel(TraditionalModel):
    """Random Forest with engineered features"""

    def __init__(self, config):
        super().__init__(config)
        # Persist encoders so categorical mappings stay consistent.
        self.label_encoders: Dict[str, LabelEncoder] = {}

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params

        # A tree ensemble is robust to mixed numeric/categorical encodings;
        # parallelize across trees for speed. Keep depth moderate for
        # generalization.
        return RandomForestClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", None),
            random_state=self.config.random_seed,
            verbose=2,
            n_jobs=params.get("n_jobs", -1),
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        features = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                column = X[feature_type.value]

                # Handle different feature types
                if feature_type.value in ["name_length", "word_count"]:
                    # Numerical features
                    features.append(column.fillna(0).values.reshape(-1, 1))
                else:
                    # Categorical features (encode them persistently)
                    feature_key = f"encoder_{feature_type.value}"

                    if feature_key not in self.label_encoders:
                        self.label_encoders[feature_key] = LabelEncoder()
                        encoded = self.label_encoders[feature_key].fit_transform(
                            column.fillna("unknown").astype(str)
                        )
                    else:
                        encoder = self.label_encoders[feature_key]
                        column_clean = column.fillna("unknown").astype(str)
                        known_classes = set(encoder.classes_)
                        default_class = (
                            "unknown"
                            if "unknown" in known_classes
                            else encoder.classes_[0]
                        )
                        column_mapped = column_clean.apply(
                            lambda value: value
                            if value in known_classes
                            else default_class
                        )
                        encoded = encoder.transform(column_mapped)

                    features.append(encoded.reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
@@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from ners.research.traditional_model import TraditionalModel


class SVMModel(TraditionalModel):
    """Support Vector Machine with character n-grams and RBF kernel"""

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params
        # TF-IDF downweights very common patterns; char n-grams (2, 4) are
        # effective for distinguishing name morphology under RBF kernels.
        vectorizer = TfidfVectorizer(
            analyzer="char",
            ngram_range=params.get("ngram_range", (2, 4)),
            max_features=params.get("max_features", 5000),
        )

        # The RBF kernel captures non-linear interactions between n-grams;
        # probability=True adds calibration at some cost. A larger cache helps
        # speed up kernel computations.
        classifier = SVC(
            kernel=params.get("kernel", "rbf"),
            C=params.get("C", 1.0),
            gamma=params.get("gamma", "scale"),
            probability=True,  # Enable probability prediction
            class_weight=params.get("class_weight", None),
            cache_size=params.get("cache_size", 1000),
            random_state=self.config.random_seed,
            verbose=2,
        )

        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_features = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_features.append(X[feature_type.value].astype(str))

        if len(text_features) == 1:
            return text_features[0].values
        else:
            combined = text_features[0].astype(str)
            for feature in text_features[1:]:
                combined = combined + " " + feature.astype(str)
            return combined.values
@@ -0,0 +1,90 @@
from typing import Any

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import (
    Input,
    Embedding,
    Dense,
    GlobalAveragePooling1D,
    MultiHeadAttention,
    Dropout,
    LayerNormalization,
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from ners.research.neural_network_model import NeuralNetworkModel


class TransformerModel(NeuralNetworkModel):
    """Transformer-based model"""

    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
        params = kwargs

        # Build the Transformer model
        inputs = Input(shape=(params.get("max_len", 8),))
        x = Embedding(
            input_dim=vocab_size,
            output_dim=params.get("embedding_dim", 64),
            input_length=params.get("max_len", 8),
            mask_zero=True,
        )(inputs)

        # Add positional encoding: a learned embedding per position, broadcast
        # across the batch
        positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
        pos_embedding = Embedding(
            input_dim=params.get("max_len", 8),
            output_dim=params.get("embedding_dim", 64),
        )(positions)
        x = x + pos_embedding

        x = self._transformer_encoder(x, params)
        x = GlobalAveragePooling1D()(x)
        x = Dense(32, activation="relu")(x)
        x = Dropout(params.get("dropout", 0.1))(x)
        outputs = Dense(2, activation="softmax", dtype="float32")(x)

        model = Model(inputs, outputs)
        model.compile(
            optimizer="adam",
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"],
        )
        return model

    @classmethod
    def _transformer_encoder(cls, x, cfg_params):
        """Transformer encoder block: self-attention and feed-forward sublayers,
        each wrapped in a residual connection followed by layer normalization."""
        attn = MultiHeadAttention(
            num_heads=cfg_params.get("transformer_num_heads", 2),
            key_dim=cfg_params.get("transformer_head_size", 64),
            dropout=cfg_params.get("attn_dropout", 0.1),
        )(x, x)
        x = LayerNormalization(epsilon=1e-6)(
            x + Dropout(cfg_params.get("dropout", 0.1))(attn)
        )

        ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
        ff = Dense(x.shape[-1])(ff)
        return LayerNormalization(epsilon=1e-6)(
            x + Dropout(cfg_params.get("dropout", 0.1))(ff)
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_data = self._collect_text_corpus(X)

        # Initialize the tokenizer on first use
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(oov_token="<OOV>")
            self.tokenizer.fit_on_texts(text_data)

        # Convert to padded index sequences
        sequences = self.tokenizer.texts_to_sequences(text_data)
        max_len = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=max_len, padding="post")
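
A standalone shape check of the residual encoder pattern used above (a sketch; the dimensions are arbitrary, and layers are called directly on an eager tensor):

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, LayerNormalization, MultiHeadAttention

x = tf.random.normal((2, 8, 64))                    # (batch, max_len, embedding_dim)
attn = MultiHeadAttention(num_heads=2, key_dim=64)(x, x)
x = LayerNormalization(epsilon=1e-6)(x + Dropout(0.1)(attn))
ff = Dense(128, activation="relu")(x)
ff = Dense(x.shape[-1])(ff)
out = LayerNormalization(epsilon=1e-6)(x + Dropout(0.1)(ff))
print(out.shape)                                    # (2, 8, 64): length preserved
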
@@ -0,0 +1,115 @@
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

from ners.research.traditional_model import TraditionalModel


class XGBoostModel(TraditionalModel):
    """XGBoost with engineered features and character embeddings"""

    def __init__(self, config):
        super().__init__(config)
        # Store vectorizers and encoders to ensure a consistent feature space
        self.vectorizers = {}
        self.label_encoders = {}

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params

        # Optional GPU acceleration
        use_gpu = bool(params.get("use_gpu", False))
        default_tree_method = "gpu_hist" if use_gpu else "hist"
        tree_method = params.get("tree_method", default_tree_method)
        predictor = params.get(
            "predictor", "gpu_predictor" if tree_method.startswith("gpu") else "auto"
        )

        # Histogram-based trees and parallelism provide fast training; the
        # default logloss metric suits binary gender classification.
        return xgb.XGBClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", 6),
            learning_rate=params.get("learning_rate", 0.1),
            subsample=params.get("subsample", 0.8),
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
            eval_metric="logloss",
            n_jobs=params.get("n_jobs", -1),
            tree_method=tree_method,
            predictor=predictor,
            verbosity=2,
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        features = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                column = X[feature_type.value]

                if feature_type.value in ["name_length", "word_count"]:
                    # Numerical features
                    features.append(column.fillna(0).values.reshape(-1, 1))
                elif feature_type.value in ["full_name", "native_name", "surname"]:
                    # Character-level features for names
                    feature_key = f"vectorizer_{feature_type.value}"

                    if feature_key not in self.vectorizers:
                        # First call: create and fit the vectorizer
                        self.vectorizers[feature_key] = CountVectorizer(
                            analyzer="char", ngram_range=(2, 3), max_features=100
                        )
                        char_features = (
                            self.vectorizers[feature_key]
                            .fit_transform(column.fillna("").astype(str))
                            .toarray()
                        )
                    else:
                        # Subsequent calls: reuse the fitted vectorizer
                        char_features = (
                            self.vectorizers[feature_key]
                            .transform(column.fillna("").astype(str))
                            .toarray()
                        )

                    features.append(char_features)
                else:
                    # Categorical features
                    feature_key = f"encoder_{feature_type.value}"

                    if feature_key not in self.label_encoders:
                        # First call: create and fit the encoder
                        self.label_encoders[feature_key] = LabelEncoder()
                        encoded = self.label_encoders[feature_key].fit_transform(
                            column.fillna("unknown").astype(str)
                        )
                    else:
                        # Subsequent calls: reuse the fitted encoder and map
                        # unseen labels to a default value
                        column_clean = column.fillna("unknown").astype(str)

                        # Classes the encoder was trained on
                        known_classes = set(self.label_encoders[feature_key].classes_)

                        # Map unseen values to "unknown" if present, else to the
                        # first known class
                        if "unknown" in known_classes:
                            default_class = "unknown"
                        else:
                            default_class = self.label_encoders[feature_key].classes_[0]

                        column_mapped = column_clean.apply(
                            lambda x: x if x in known_classes else default_class
                        )

                        encoded = self.label_encoders[feature_key].transform(
                            column_mapped
                        )

                    features.append(encoded.reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
@@ -0,0 +1,377 @@
import logging
from abc import abstractmethod
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf

from ners.research.base_model import BaseModel
from ners.research.experiment.feature_extractor import FeatureExtractor


class NeuralNetworkModel(BaseModel):
    """Base class for neural network models (TensorFlow/Keras)"""

    @property
    def architecture(self) -> str:
        return "neural_network"

    @abstractmethod
    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
        """Build a neural network model with a known vocabulary size"""
        pass

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the neural network model with deferred building"""
        logging.info(f"Training {self.__class__.__name__}")

        # Best-effort GPU configuration for TensorFlow when available:
        # - enables memory growth to avoid pre-allocating all VRAM
        # - optionally enables mixed precision if requested via model params
        try:
            requested_gpu = bool(self.config.model_params.get("use_gpu", False))
            enable_mixed = bool(self.config.model_params.get("mixed_precision", False))

            gpus = tf.config.list_physical_devices("GPU")
            if gpus:
                for gpu in gpus:
                    try:
                        tf.config.experimental.set_memory_growth(gpu, True)
                    except Exception:
                        pass

                if enable_mixed:
                    try:
                        tf.keras.mixed_precision.set_global_policy("mixed_float16")
                        logging.info("Enabled TensorFlow mixed precision (float16)")
                    except Exception as e:
                        logging.warning(f"Could not enable mixed precision: {e}")
            else:
                if requested_gpu:
                    logging.warning(
                        "Requested GPU but no TensorFlow GPU device is available."
                    )
        except Exception as e:
            # Keep silent in non-TF environments / non-NN workflows
            logging.debug(f"TensorFlow GPU setup skipped: {e}")

        # Set up feature extraction
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )

        # Extract and prepare features (this also initializes the tokenizer)
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
        # Sanitize any out-of-range indices to avoid embedding scatter errors
        X_prepared = self._sanitize_sequences(X_prepared)

        # Encode labels
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)

        # Now the model can be built with a known vocab size
        vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
        logging.info(f"Vocabulary size: {vocab_size}")

        # Pass through additional model parameters
        self.model = self.build_model_with_vocab(
            vocab_size=vocab_size, **self.config.model_params
        )

        # Train the neural network
        logging.info(
            f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
        )
        logging.info(f"Sample encoded input: {X_prepared[0]}")
        logging.info(f"Model parameters: {self.config.model_params}")

        history = self.model.fit(
            X_prepared,
            y_encoded,
            epochs=self.config.model_params.get("epochs", 10),
            batch_size=self.config.model_params.get("batch_size", 64),
            validation_split=self.config.model_params.get("validation_split", 0.1),
            verbose=2,
        )

        # Store the training history
        self.training_history = {
            "accuracy": history.history["accuracy"],
            "loss": history.history["loss"],
            "val_accuracy": history.history.get("val_accuracy", []),
            "val_loss": history.history.get("val_loss", []),
        }

        self.is_fitted = True
        return self

    def _sanitize_sequences(self, sequences: np.ndarray) -> np.ndarray:
        """Clamp invalid token indices to OOV and ensure int32 dtype.

        This prevents rare cases where malformed inputs or dtype issues introduce
        large or negative indices, which can trigger TensorScatterUpdate errors
        during embedding updates on GPU.
        """
        try:
            if sequences is None:
                return sequences
            arr = np.asarray(sequences)
            # Ensure integer dtype for embedding lookups
            if not np.issubdtype(arr.dtype, np.integer):
                arr = arr.astype(np.int64, copy=False)

            if self.tokenizer is not None and hasattr(self.tokenizer, "word_index"):
                # Use the actual max index present in the tokenizer mapping
                if self.tokenizer.word_index:
                    max_idx = max(self.tokenizer.word_index.values())
                else:
                    max_idx = 0
                # OOV token index if available, else fall back to 1
                oov_index = self.tokenizer.word_index.get(
                    getattr(self.tokenizer, "oov_token", "<OOV>"), 1
                )
                # Keep zeros (padding) untouched; clamp negatives and > max_idx to OOV
                invalid_mask = (arr < 0) | (arr > max_idx)
                # Avoid turning zeros into OOV
                invalid_mask &= arr != 0
                if invalid_mask.any():
                    arr[invalid_mask] = oov_index

            # Use int32 for TF embedding-op compatibility
            return arr.astype(np.int32, copy=False)
        except Exception as e:
            logging.debug(f"Sequence sanitization skipped due to: {e}")
            return sequences
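
    # Worked example of the clamping rule (hypothetical values): with
    # word_index = {"<OOV>": 1, "ana": 2, "kab": 3}, max_idx is 3 and a row
    # [2, 3, 9, -1, 0] becomes [2, 3, 1, 1, 0] -- out-of-range indices map to
    # the OOV index while the padding zeros are preserved.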

    def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
        """Combine configured textual features into one string per record."""
        column_names = [
            feature.value
            for feature in self.config.features
            if feature.value in X.columns
        ]
        if not column_names:
            raise ValueError(
                "No configured text features found in the provided DataFrame."
            )

        text_frame = X[column_names].fillna("").astype(str)

        if len(column_names) == 1:
            return text_frame.iloc[:, 0].tolist()

        combined_rows = []
        for row in text_frame.itertuples(index=False):
            tokens = [value for value in row if value]
            combined_rows.append(" ".join(tokens))

        return combined_rows

    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> dict[str, np.floating[Any]]:
        # Ensure the TF GPU/mixed-precision config also applies to CV runs
        try:
            requested_gpu = bool(self.config.model_params.get("use_gpu", False))
            enable_mixed = bool(self.config.model_params.get("mixed_precision", False))

            gpus = tf.config.list_physical_devices("GPU")
            if gpus:
                for gpu in gpus:
                    try:
                        tf.config.experimental.set_memory_growth(gpu, True)
                    except Exception:
                        pass
                if enable_mixed:
                    try:
                        tf.keras.mixed_precision.set_global_policy("mixed_float16")
                    except Exception:
                        pass
            else:
                if requested_gpu:
                    logging.warning("Requested GPU for CV but none is available.")
        except Exception:
            pass

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
        X_prepared = self._sanitize_sequences(X_prepared)
        y_encoded = self.label_encoder.transform(y)

        cv = StratifiedKFold(
            n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
        )

        accuracies = []
        precisions = []
        recalls = []
        f1_scores = []

        # Vocabulary size and model parameters for the per-fold model builds
        vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
        max_len = self.config.model_params.get("max_len", 6)

        for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)):
            # Create a fresh model for each fold using build_model_with_vocab
            fold_model = self.build_model_with_vocab(
                vocab_size=vocab_size, max_len=max_len, **self.config.model_params
            )

            # Train on the fold
            if hasattr(fold_model, "fit"):
                fold_model.fit(
                    X_prepared[train_idx],
                    y_encoded[train_idx],
                    epochs=self.config.model_params.get("epochs", 10),
                    batch_size=self.config.model_params.get("batch_size", 32),
                    verbose=0,
                )

            # Predict on the validation split
            y_pred = fold_model.predict(X_prepared[val_idx])
            if len(y_pred.shape) > 1:
                y_pred = y_pred.argmax(axis=1)

            # Calculate metrics
            acc = accuracy_score(y_encoded[val_idx], y_pred)
            prec, rec, f1, _ = precision_recall_fscore_support(
                y_encoded[val_idx], y_pred, average="weighted"
            )

            accuracies.append(acc)
            precisions.append(prec)
            recalls.append(rec)
            f1_scores.append(f1)

        return {
            "accuracy": np.mean(accuracies),
            "precision": np.mean(precisions),
            "recall": np.mean(recalls),
            "f1": np.mean(f1_scores),
        }

    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        logging.info(f"Generating learning curve for {self.__class__.__name__}")

        # Ensure the TF GPU/mixed-precision config also applies here
        try:
            requested_gpu = bool(self.config.model_params.get("use_gpu", False))
            enable_mixed = bool(self.config.model_params.get("mixed_precision", False))

            gpus = tf.config.list_physical_devices("GPU")
            if gpus:
                for gpu in gpus:
                    try:
                        tf.config.experimental.set_memory_growth(gpu, True)
                    except Exception:
                        pass
                if enable_mixed:
                    try:
                        tf.keras.mixed_precision.set_global_policy("mixed_float16")
                    except Exception:
                        pass
            else:
                if requested_gpu:
                    logging.warning(
                        "Requested GPU for learning curve but none is available."
                    )
        except Exception:
            pass

        if train_sizes is None:
            train_sizes = [0.1, 0.3, 0.5, 0.7, 1.0]

        learning_curve_data = {
            "train_sizes": [],
            "train_scores": [],
            "val_scores": [],
            "train_scores_std": [],
            "val_scores_std": [],
        }

        # Prepare features and get the vocabulary size
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
|
||||
X_prepared = self._sanitize_sequences(X_prepared)
|
||||
y_encoded = self.label_encoder.transform(y)
|
||||
|
||||
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
|
||||
max_len = self.config.model_params.get("max_len", 6)
|
||||
|
||||
# Split data once for validation
|
||||
X_train_full, X_val, y_train_full, y_val = train_test_split(
|
||||
X_prepared,
|
||||
y_encoded,
|
||||
test_size=0.2,
|
||||
random_state=self.config.random_seed,
|
||||
stratify=y_encoded,
|
||||
)
|
||||
|
||||
for size in train_sizes:
|
||||
train_size = int(len(X_train_full) * size)
|
||||
if train_size < 10: # Minimum training size
|
||||
continue
|
||||
|
||||
# Sample training data
|
||||
indices = np.random.choice(len(X_train_full), train_size, replace=False)
|
||||
X_train_subset = X_train_full[indices]
|
||||
y_train_subset = y_train_full[indices]
|
||||
|
||||
# Train multiple models for variance estimation
|
||||
train_scores = []
|
||||
val_scores = []
|
||||
|
||||
for seed in range(3): # 3 runs for variance
|
||||
# Build fresh model using build_model_with_vocab
|
||||
model = self.build_model_with_vocab(
|
||||
vocab_size=vocab_size, max_len=max_len, **self.config.model_params
|
||||
)
|
||||
|
||||
# Train model
|
||||
if hasattr(model, "fit"):
|
||||
model.fit(
|
||||
X_train_subset,
|
||||
y_train_subset,
|
||||
epochs=self.config.model_params.get("epochs", 10),
|
||||
batch_size=self.config.model_params.get("batch_size", 32),
|
||||
validation_data=(X_val, y_val),
|
||||
verbose=0,
|
||||
)
|
||||
|
||||
# Evaluate
|
||||
train_pred = model.predict(X_train_subset)
|
||||
val_pred = model.predict(X_val)
|
||||
|
||||
train_acc = accuracy_score(y_train_subset, train_pred.argmax(axis=1))
|
||||
val_acc = accuracy_score(y_val, val_pred.argmax(axis=1))
|
||||
|
||||
train_scores.append(train_acc)
|
||||
val_scores.append(val_acc)
|
||||
|
||||
learning_curve_data["train_sizes"].append(train_size)
|
||||
learning_curve_data["train_scores"].append(np.mean(train_scores))
|
||||
learning_curve_data["val_scores"].append(np.mean(val_scores))
|
||||
learning_curve_data["train_scores_std"].append(np.std(train_scores))
|
||||
learning_curve_data["val_scores_std"].append(np.std(val_scores))
|
||||
|
||||
self.learning_curve_data = learning_curve_data
|
||||
return learning_curve_data
|
||||
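
# Illustrative sketch (not part of the module): the clamping rule used by
# _sanitize_sequences, shown standalone on a toy array. The vocabulary bound
# and OOV index below are made-up values for demonstration only.
#
#     import numpy as np
#
#     seqs = np.array([[0, 3, 999, -7], [0, 1, 2, 3]])
#     max_idx, oov_index = 4, 1           # hypothetical tokenizer bounds
#     invalid = ((seqs < 0) | (seqs > max_idx)) & (seqs != 0)
#     seqs[invalid] = oov_index           # -> [[0, 3, 1, 1], [0, 1, 2, 3]]
#     seqs = seqs.astype(np.int32)        # int32 for TF embedding lookups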
@@ -0,0 +1 @@
LETTERS = "abcdefghijklmnopqrstuvwxyz"
@@ -0,0 +1,54 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from ners.research.statistics.utils import LETTERS, build_letter_frequencies


def plot_transition_matrix(ax, df_probs, title=""):
    hm = sns.heatmap(
        df_probs.loc[list(LETTERS), list(LETTERS)],
        cmap="Reds",
        annot=False,
        cbar=False,
        ax=ax,
    )
    ax.set_title(title, fontsize=12)
    return hm


def plot_letter_frequencies(males, females, sort_values=False, title=None):
    # Compute frequencies
    L_m = build_letter_frequencies(males["name"]).set_index("letter")["freq"]
    L_f = build_letter_frequencies(females["name"]).set_index("letter")["freq"]

    # Combine into one DataFrame
    df_plot = pd.DataFrame({"Male": L_m, "Female": L_f}).fillna(0).reset_index()
    df_plot.to_csv(f"../assets/{title}_letter_frequencies.csv", index=False)

    # Optional sorting
    if sort_values:
        df_plot = df_plot.sort_values("Male", ascending=False)

    # Plot side-by-side bars
    x = np.arange(len(df_plot))
    w = 0.4
    fig, ax = plt.subplots(figsize=(16, 6))
    ax.bar(
        x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8
    )
    ax.bar(
        x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8
    )

    ax.set_xticks(x)
    ax.set_xticklabels(df_plot["letter"])
    ax.set_ylabel("Frequency")
    ax.set_xlabel("Letter")
    ax.set_title(f"{title} - Letter Frequencies")
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

    plt.tight_layout()
    plt.show()
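
# Illustrative usage (not part of the module), assuming two DataFrames with a
# "name" column split by recorded sex; the sample names and variable names are
# placeholders for the sketch, not the project's actual loading code.
#
#     import pandas as pd
#
#     males = pd.DataFrame({"name": ["Mukendi", "Kasongo"]})
#     females = pd.DataFrame({"name": ["Kabamba", "Mbuyi"]})
#     plot_letter_frequencies(males, females, sort_values=True, title="Demo")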
@@ -0,0 +1,276 @@
import re
import unicodedata
from collections import Counter
from typing import Any, Dict, Literal

import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean
from scipy.stats import entropy

LETTERS = "abcdefghijklmnopqrstuvwxyz"
START_TOKEN = "^"
END_TOKEN = "$"


def normalize_letters(s):
    """Normalize accents -> ascii, lowercase, keep only a-z."""
    s = str(s)
    s = unicodedata.normalize("NFKD", s)
    s = s.encode("ascii", errors="ignore").decode("utf-8")
    s = s.lower()
    s = re.sub(r"[^a-z]", "", s)
    return s


def build_category_distribution(df: pd.DataFrame) -> pd.DataFrame:
    return (
        df.groupby("province")["identified_category"]
        .value_counts(normalize=True)  # get proportions
        .unstack(fill_value=0)  # reshape into one column per category
    )


def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
    # Normalize + split once (vectorized)
    s = df[source].fillna("").astype(str)
    s = s.str.lower().str.replace(r"[^\w'\-]+", " ", regex=True).str.strip().str.split()

    # Explode the token list into rows under `target`
    out = df.assign(**{target: s}).explode(target, ignore_index=True)

    # Drop NA/empty tokens and strip whitespace
    out[target] = out[target].astype(str).str.strip()
    out = out[out[target].ne("")].dropna(subset=[target]).reset_index(drop=True)

    return out


def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
    # Normalize: lowercase, remove non-letters, concatenate all into one string
    s = (
        series.astype(str)
        .str.lower()
        .str.replace(r"[^a-z]", "", regex=True)
        .str.cat(sep="")
    )

    # Convert string into Series of characters
    chars = pd.Series(list(s))

    # Count letters and ensure all letters are present
    out = (
        chars.value_counts(normalize=False)
        .reindex(list(LETTERS), fill_value=0)
        .rename_axis("letter")
        .reset_index(name="count")
    )

    # Relative frequency
    total = out["count"].sum()
    out["freq"] = out["count"] / (total if total > 0 else 1)
    return out


def build_transition_probabilities(names: pd.Series, alpha: float = 0.0) -> dict:
    # 1) Normalize
    names = names.astype(str).str.lower().str.replace(rf"[^{LETTERS}]", "", regex=True)
    names = names[names.str.len() > 0]

    # 2) Prepare sequences
    sequences = (START_TOKEN + names + END_TOKEN).tolist()

    # 3) Tokens and indices
    tokens = [START_TOKEN] + list(LETTERS) + [END_TOKEN]
    index = {t: i for i, t in enumerate(tokens)}
    V = len(tokens)

    # 4) ASCII lookup table (O(1) char -> idx); others -> -1
    lut = np.full(128, -1, dtype=np.int32)
    for ch, i in index.items():
        lut[ord(ch)] = i

    # 5) Concatenate with a separator that is not in the vocab to kill cross-boundary pairs
    concat = (" ".join(sequences)).encode("ascii", errors="ignore")

    # 6) Map bytes to indices
    arr = np.frombuffer(concat, dtype=np.uint8)
    idx = lut[arr]

    # 7) Build bigram pairs; drop invalid ones (separator & OOV)
    a = idx[:-1]
    b = idx[1:]
    mask = (a >= 0) & (b >= 0)
    a, b = a[mask], b[mask]

    # 8) Count with a single bincount
    lin = a * V + b
    counts = np.bincount(lin, minlength=V * V).reshape(V, V)

    # 9) Optional Laplace smoothing
    if alpha and alpha > 0:
        counts = counts + alpha

    # 10) Row-normalize to probabilities, avoiding division by zero
    row_sums = counts.sum(axis=1, keepdims=True)
    probs = counts / np.where(row_sums == 0, 1.0, row_sums)

    # 11) DataFrames
    df_counts = pd.DataFrame(counts, index=tokens, columns=tokens)
    df_probs = pd.DataFrame(probs, index=tokens, columns=tokens)

    return {
        "tokens": tokens,
        "index": index,
        "counts": counts,
        "df_counts": df_counts,
        "probs": probs,
        "df_probs": df_probs,
    }


def build_transition_comparisons(
    names_transitions: Dict[str, Any],
    surnames_transitions: Dict[str, Any],
    n_permutations: int = 1000,
) -> pd.DataFrame:
    """
    Compares letter transition probability matrices for names and surnames using
    various distance metrics and a permutation test for statistical significance.
    """

    # Helper function to flatten the probability matrices
    def prepare_data(data):
        return {"m": data["m"]["probs"].flatten(), "f": data["f"]["probs"].flatten()}

    prepared_names = prepare_data(names_transitions)
    prepared_surnames = prepare_data(surnames_transitions)

    # Distance Metrics
    names_l2 = euclidean(prepared_names["m"], prepared_names["f"])
    surnames_l2 = euclidean(prepared_surnames["m"], prepared_surnames["f"])

    kl_names_mf = entropy(prepared_names["m"] + 1e-12, prepared_names["f"] + 1e-12)
    kl_names_fm = entropy(prepared_names["f"] + 1e-12, prepared_names["m"] + 1e-12)

    kl_surnames_mf = entropy(
        prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12
    )
    kl_surnames_fm = entropy(
        prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12
    )

    jsd_names = 0.5 * (kl_names_mf + kl_names_fm)
    jsd_surnames = 0.5 * (kl_surnames_mf + kl_surnames_fm)

    # Permutation Test
    def run_permutation_test(transitions):
        # Flattened probabilities for male and female
        P_m = transitions["m"]["probs"].flatten()
        P_f = transitions["f"]["probs"].flatten()

        # Calculate the observed JSD (our test statistic)
        observed_jsd = 0.5 * (
            entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12)
        )

        # Concatenate male and female counts
        counts_m = transitions["m"]["counts"]
        counts_f = transitions["f"]["counts"]
        all_counts = np.concatenate((counts_m, counts_f), axis=1)
        total_counts = counts_m.shape[1] + counts_f.shape[1]

        permuted_jsds = []
        for _ in range(n_permutations):
            # Shuffle the columns and split back into two groups
            shuffled_indices = np.random.permutation(total_counts)

            # Note: this is a simplified approach that permutes count-matrix
            # columns; a more robust implementation would shuffle the actual
            # names between the two groups before recounting.
            permuted_counts_m = all_counts[:, shuffled_indices[: counts_m.shape[1]]]
            permuted_counts_f = all_counts[:, shuffled_indices[counts_m.shape[1] :]]

            # Re-calculate probabilities and JSD for the permuted groups.
            # Add a small epsilon to the denominator to prevent division by zero.
            epsilon = 1e-12
            permuted_probs_m = permuted_counts_m / (
                permuted_counts_m.sum(axis=0, keepdims=True) + epsilon
            )
            permuted_probs_f = permuted_counts_f / (
                permuted_counts_f.sum(axis=0, keepdims=True) + epsilon
            )

            permuted_jsd = 0.5 * (
                entropy(
                    permuted_probs_m.mean(axis=1) + 1e-12,
                    permuted_probs_f.mean(axis=1) + 1e-12,
                )
                + entropy(
                    permuted_probs_f.mean(axis=1) + 1e-12,
                    permuted_probs_m.mean(axis=1) + 1e-12,
                )
            )
            permuted_jsds.append(permuted_jsd)

        # Calculate the p-value
        p_value = np.mean(np.array(permuted_jsds) >= observed_jsd)
        return p_value

    names_p_value = run_permutation_test(names_transitions)
    surnames_p_value = run_permutation_test(surnames_transitions)

    out = pd.DataFrame(
        {
            "l2": [names_l2, surnames_l2],
            "kl_mf": [kl_names_mf, kl_surnames_mf],
            "kl_fm": [kl_names_fm, kl_surnames_fm],
            "jsd": [jsd_names, jsd_surnames],
            "permutation_p_value": [names_p_value, surnames_p_value],
        },
        index=["names", "surnames"],
    )

    return out


def build_ngrams_count(
    df: pd.DataFrame,
    n: int,
    where: Literal["any", "prefix", "suffix"] = "any",
) -> pd.DataFrame:
    # Normalize and clean to a-z
    names = df["name"].astype(str).str.lower().str.replace(r"[^a-z]", "", regex=True)

    ngrams = []
    if where == "any":
        for s in names:
            L = len(s)
            if L >= n:
                ngrams.extend(s[i : i + n] for i in range(L - n + 1))
    elif where == "prefix":
        for s in names:
            if len(s) >= n:
                ngrams.append(s[:n])
    elif where == "suffix":
        for s in names:
            if len(s) >= n:
                ngrams.append(s[-n:])
    else:
        raise ValueError("where must be one of: 'any', 'prefix', 'suffix'")

    counter = Counter(ngrams)

    out = (
        pd.DataFrame(counter.items(), columns=[f"{n}-gram", "count"])
        .sort_values("count", ascending=False, kind="mergesort")
        .reset_index(drop=True)
    )
    total = out["count"].sum()
    out["freq"] = out["count"] / (total if total > 0 else 1)
    return out
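
# Illustrative usage (not part of the module): building bigram transition
# probabilities and suffix 2-gram counts from a toy Series. The sample names
# are placeholders chosen only to show the shape of the outputs.
#
#     import pandas as pd
#
#     names = pd.Series(["Mukendi", "Kabamba", "Kasongo"])
#     trans = build_transition_probabilities(names, alpha=0.5)
#     trans["df_probs"].loc["^"]                    # P(first letter | start token)
#
#     df = pd.DataFrame({"name": names})
#     build_ngrams_count(df, n=2, where="suffix")   # counts of final bigrams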
@@ -0,0 +1,163 @@
import logging
from abc import abstractmethod
from typing import Dict, Any, List

import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import LabelEncoder

from ners.research.base_model import BaseModel
from ners.research.experiment.feature_extractor import FeatureExtractor


class TraditionalModel(BaseModel):
    """Base class for traditional ML models (scikit-learn compatible)"""

    @property
    def architecture(self) -> str:
        return "traditional"

    @abstractmethod
    def build_model(self) -> BaseEstimator:
        """Build and return the sklearn model instance"""
        pass

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the traditional ML model"""
        logging.info(f"Training {self.__class__.__name__}")

        # Build model if not already built
        if self.model is None:
            self.model = self.build_model()

        # Setup feature extraction
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )

        # Extract and prepare features
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        # Encode labels
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)

        # Train model
        if len(X_prepared.shape) == 1:
            # For text-based features (like LogisticRegression with vectorization)
            logging.info(
                f"Fitting model with {X_prepared.shape[0]} samples (text features)"
            )
        else:
            # For numerical features
            logging.info(
                f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
            )

        logging.info(f"Sample input: {X_prepared[0]}")
        logging.info(f"Model parameters: {self.config.model_params}")

        # sklearn's fit returns the estimator itself, not a Keras-style history
        # object, so no per-epoch training curves are recorded here.
        self.model.fit(X_prepared, y_encoded)
        self.is_fitted = True
        self.training_history = {}

        return self

    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> Dict[str, float]:
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
        y_encoded = self.label_encoder.transform(y)

        cv = StratifiedKFold(
            n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
        )

        # Calculate different metrics
        results = {}

        # Accuracy
        accuracy_scores = cross_val_score(
            self.model, X_prepared, y_encoded, cv=cv, scoring="accuracy"
        )
        results["accuracy"] = accuracy_scores.mean()
        results["accuracy_std"] = accuracy_scores.std()

        # Precision, Recall, F1
        for metric in ["precision", "recall", "f1"]:
            if metric in self.config.metrics:
                scores = cross_val_score(
                    self.model,
                    X_prepared,
                    y_encoded,
                    cv=cv,
                    scoring=f"{metric}_weighted",
                )
                results[metric] = scores.mean()
                results[f"{metric}_std"] = scores.std()

        return results

    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] | None = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        logging.info(f"Generating learning curve for {self.__class__.__name__}")

        if train_sizes is None:
            train_sizes = [0.1, 0.25, 0.5, 0.75, 1.0]

        # Prepare features
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        # Encode labels
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)

        try:
            train_sizes_abs, train_scores, val_scores = learning_curve(
                self.build_model(),
                X_prepared,
                y_encoded,
                train_sizes=train_sizes,
                cv=3,  # Use 3-fold CV for speed
                scoring="accuracy",
                random_state=self.config.random_seed,
            )

            learning_curve_data = {
                "train_sizes": train_sizes_abs.tolist(),
                "train_scores": train_scores.mean(axis=1).tolist(),
                "val_scores": val_scores.mean(axis=1).tolist(),
                "train_scores_std": train_scores.std(axis=1).tolist(),
                "val_scores_std": val_scores.std(axis=1).tolist(),
            }
        except Exception as e:
            logging.warning(f"Could not generate learning curve: {e}")
            return {}

        self.learning_curve_data = learning_curve_data
        return learning_curve_data
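
# Illustrative sketch (not part of the module): a minimal concrete subclass,
# assuming prepare_features is inherited from BaseModel. LogisticRegression is
# just an example estimator here, not necessarily the project's registry entry.
#
#     from sklearn.linear_model import LogisticRegression
#
#     class LogisticRegressionModel(TraditionalModel):
#         def build_model(self) -> BaseEstimator:
#             return LogisticRegression(max_iter=1000)
#
#     model = LogisticRegressionModel(config)  # hypothetical config object
#     model.fit(X_train, y_train).cross_validate(X_train, y_train, cv_folds=5)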
Executable
+46
@@ -0,0 +1,46 @@
#!.venv/bin/python3
import logging
import traceback

from ners.core.config import setup_config
from ners.research.experiment.experiment_builder import ExperimentBuilder
from ners.research.model_trainer import ModelTrainer


def train_from_template(
    name: str,
    type: str,
    *,
    templates: str = "research_templates.yaml",
    config: str | None = None,
    env: str = "development",
) -> int:
    try:
        cfg = setup_config(config_path=config, env=env)
        experiment_builder = ExperimentBuilder(cfg)

        logging.info(f"Loading research templates from: {templates}")
        tmpl = experiment_builder.load_templates(templates)

        logging.info(f"Looking for experiment: name='{name}', type='{type}'")
        experiment_config = experiment_builder.find_template(tmpl, name, type)

        logging.info(f"Found experiment: {experiment_config.get('name')}")
        logging.info(f"Description: {experiment_config.get('description')}")
        logging.info(f"Features: {experiment_config.get('features')}")

        trainer = ModelTrainer(cfg)
        trainer.train_single_model(
            model_name=experiment_config.get("name"),
            model_type=experiment_config.get("model_type"),
            features=experiment_config.get("features"),
            model_params=experiment_config.get("model_params", {}),
            tags=experiment_config.get("tags", []),
        )

        logging.info("Training completed successfully!")
        return 0
    except Exception as e:
        logging.error(f"Training failed: {e}")
        traceback.print_exc()
        return 1
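
# Illustrative usage (not part of the module); the template name and type are
# placeholders that would need to exist in research_templates.yaml.
#
#     exit_code = train_from_template(
#         "baseline_logistic_regression",  # hypothetical template name
#         "baseline",
#         env="development",
#     )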
@@ -0,0 +1 @@

@@ -0,0 +1,67 @@
#!.venv/bin/python3
import os

import streamlit as st

from ners.core.config import setup_config, PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker

# Page configuration
st.set_page_config(
    page_title="DRC NERS Platform",
    page_icon="🇨🇩",
    layout="wide",
    initial_sidebar_state="expanded",
)


def initialize_session_state(config: PipelineConfig):
    """Initialize session state variables"""
    if "config" not in st.session_state:
        st.session_state.config = config
    if "data_loader" not in st.session_state:
        st.session_state.data_loader = DataLoader(config)
    if "experiment_tracker" not in st.session_state:
        st.session_state.experiment_tracker = ExperimentTracker(config)
    if "experiment_runner" not in st.session_state:
        st.session_state.experiment_runner = ExperimentRunner(config)
    if "pipeline_monitor" not in st.session_state:
        st.session_state.pipeline_monitor = PipelineMonitor()
    if "current_experiment" not in st.session_state:
        st.session_state.current_experiment = None
    if "experiment_results" not in st.session_state:
        st.session_state.experiment_results = {}


class StreamlitApp:
    def __init__(self, config: PipelineConfig):
        self.config = config
        initialize_session_state(config)

    @classmethod
    def run(cls):
        st.title("🇨🇩 DRC NERS Platform")
        st.markdown(
            "A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
        )
        st.markdown(
            """
            ## Overview
            Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often
            underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training
            data.
            This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5
            million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
            """
        )


# Initialize app using environment variables when launched via Typer
_config_path = os.environ.get("NERS_CONFIG")
_env = os.environ.get("NERS_ENV", "development")
_cfg = setup_config(_config_path, env=_env)
_app = StreamlitApp(_cfg)
_app.run()
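
# Illustrative launch command (not part of the module), assuming the Typer CLI
# exports NERS_CONFIG/NERS_ENV before delegating to Streamlit; both paths below
# are placeholders, not confirmed repository locations.
#
#     NERS_CONFIG=config/pipeline.yaml NERS_ENV=development \
#         streamlit run src/ners/web/app.py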
@@ -0,0 +1 @@
from .ner_testing import NERTesting
@@ -0,0 +1,10 @@
import streamlit as st


class Configuration:
    def __init__(self, config):
        self.config = config

    def index(self):
        st.title("Configuration")
        st.json(self.config.model_dump())
@@ -0,0 +1,90 @@
import pandas as pd
import streamlit as st

from ners.core.utils.data_loader import OPTIMIZED_DTYPES


@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
    try:
        return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame()


class Dashboard:
    def __init__(self, config, experiment_tracker, experiment_runner):
        self.config = config
        self.experiment_tracker = experiment_tracker
        self.experiment_runner = experiment_runner

    def index(self):
        st.title("Dashboard")
        col1, col2, col3, col4, col5 = st.columns(5)

        # Load basic statistics
        try:
            data_path = self.config.paths.get_data_path(
                self.config.data.output_files["featured"]
            )
            if data_path.exists():
                df = load_dataset(str(data_path))

                with col1:
                    st.metric("Total Names", f"{len(df):,}")

                with col2:
                    annotated = (
                        (df["annotated"] == 1).sum()
                        if "annotated" in df.columns
                        else 0
                    )
                    st.metric("Annotated Names", f"{annotated:,}")

                with col3:
                    provinces = (
                        df["province"].nunique() if "province" in df.columns else 0
                    )
                    st.metric("Provinces", provinces)

                with col4:
                    if "sex" in df.columns:
                        gender_dist = df["sex"].value_counts()
                        ratio = gender_dist.get("f", 0) / max(
                            gender_dist.get("m", 1), 1
                        )
                        st.metric("F/M Rate", f"{ratio:.2%}")
                with col5:
                    if "annotated" in df.columns:
                        annotated = (df["annotated"] == 1).sum()
                        ratio = annotated / len(df) if len(df) > 0 else 0
                        st.metric("Annotation Rate", f"{ratio:.2%}")
            else:
                st.warning("No processed data found. Please run data processing first.")

        except Exception as e:
            st.error(f"Error loading dashboard data: {e}")

        # Recent experiments
        st.subheader("Recent Experiments")
        experiments = self.experiment_tracker.list_experiments()[:5]

        if experiments:
            exp_data = []
            for exp in experiments:
                exp_data.append(
                    {
                        "Name": exp.config.name,
                        "Model": exp.config.model_type,
                        "Status": exp.status.value,
                        "Accuracy": (
                            f"{exp.test_metrics.get('accuracy', 0):.3f}"
                            if exp.test_metrics
                            else "N/A"
                        ),
                        "Date": exp.start_time.strftime("%Y-%m-%d %H:%M"),
                    }
                )

            st.dataframe(pd.DataFrame(exp_data), use_container_width=True)
        else:
            st.info(
                "No experiments found. Create your first experiment in the Experiments tab!"
            )
@@ -0,0 +1,52 @@
from datetime import datetime

import pandas as pd
import streamlit as st

from ners.core.utils.data_loader import OPTIMIZED_DTYPES


@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
    try:
        return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame()


class DataOverview:
    def __init__(self, config):
        self.config = config

    def index(self):
        st.title("Data Overview")
        data_files = {
            "Names": self.config.data.input_file,
            "Featured Dataset": self.config.data.output_files["featured"],
            "Evaluation Dataset": self.config.data.output_files["evaluation"],
            "Male Names": self.config.data.output_files["males"],
            "Female Names": self.config.data.output_files["females"],
        }

        st.write("Available Data Files:")
        for name, rel_path in data_files.items():
            file_path = self.config.paths.get_data_path(rel_path)
            exists = file_path.exists()
            size = file_path.stat().st_size if exists else 0
            stats = (
                f"Size: {size / (1024 * 1024):.1f} MB, Last Modified: {datetime.fromtimestamp(file_path.stat().st_mtime)}"
                if exists
                else "Not found"
            )
            st.write(f"- {name}: {file_path} ({stats})")

        # Preview featured dataset if available
        data_path = self.config.paths.get_data_path(
            self.config.data.output_files["featured"]
        )
        if data_path.exists():
            df = load_dataset(str(data_path))
            st.subheader("Featured Dataset Preview")
            st.dataframe(df.head(), use_container_width=True)
            st.write(f"Rows: {len(df):,}")
@@ -0,0 +1,141 @@
import pandas as pd
import plotly.express as px
import streamlit as st

from ners.core.utils.data_loader import OPTIMIZED_DTYPES
from ners.web.interfaces.log_reader import LogReader


@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
    try:
        return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame()


class DataProcessing:
    def __init__(self, config, pipeline_monitor):
        self.config = config
        self.pipeline_monitor = pipeline_monitor

    def index(self):
        st.title("Data Processing")
        status = self.pipeline_monitor.get_pipeline_status()

        # Overall progress
        overall_progress = status["overall_completion"] / 100
        st.progress(overall_progress)
        st.write(f"Overall Progress: {status['overall_completion']:.1f}%")

        # Step details
        for step_name, step_status in status["steps"].items():
            with st.expander(
                f"{step_name.replace('_', ' ').title()} - {step_status['status']}"
            ):
                col1, col2, col3 = st.columns(3)

                with col1:
                    st.metric("Processed Batches", step_status["processed_batches"])

                with col2:
                    st.metric("Total Batches", step_status["total_batches"])

                with col3:
                    st.metric("Failed Batches", step_status["failed_batches"])

                if step_status["completion_percentage"] > 0:
                    st.progress(step_status["completion_percentage"] / 100)

        # Read actual log entries from the log file
        st.subheader("Recent Processing Logs")
        try:
            log_file_path = self.config.paths.logs_dir / "pipeline.development.log"
            log_reader = LogReader(log_file_path)

            # Options for filtering logs
            col1, col2 = st.columns(2)
            with col1:
                log_level_filter = st.selectbox(
                    "Filter by Level",
                    ["All", "INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"],
                    key="log_level_filter",
                )

            with col2:
                num_entries = st.number_input(
                    "Number of entries",
                    min_value=5,
                    max_value=50,
                    value=10,
                    key="num_log_entries",
                )

            # Get log entries based on filter
            if log_level_filter == "All":
                log_entries = log_reader.read_last_entries(num_entries)
            else:
                log_entries = log_reader.read_entries_by_level(
                    log_level_filter, num_entries
                )

            if log_entries:
                for entry in log_entries:
                    if entry.level == "ERROR":
                        st.error(
                            f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
                        )
                    elif entry.level == "WARNING":
                        st.warning(
                            f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
                        )
                    elif entry.level == "INFO":
                        st.info(
                            f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
                        )
                    else:
                        st.text(
                            f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
                        )

                # Show log statistics
                st.subheader("Log Statistics")
                log_stats = log_reader.get_log_stats()

                if log_stats:
                    col1, col2, col3, col4 = st.columns(4)

                    with col1:
                        st.metric("Total Lines", log_stats.get("total_lines", 0))
                    with col2:
                        st.metric("INFO", log_stats.get("INFO", 0))
                    with col3:
                        st.metric("WARNING", log_stats.get("WARNING", 0))
                    with col4:
                        st.metric("ERROR", log_stats.get("ERROR", 0))

                    # Log level distribution chart
                    levels = ["INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"]
                    counts = [log_stats.get(level, 0) for level in levels]

                    if sum(counts) > 0:
                        fig = px.bar(
                            x=levels,
                            y=counts,
                            title="Log Entries by Level",
                            color=levels,
                            color_discrete_map={
                                "INFO": "blue",
                                "WARNING": "orange",
                                "ERROR": "red",
                                "DEBUG": "gray",
                                "CRITICAL": "darkred",
                            },
                        )
                        st.plotly_chart(fig, use_container_width=True)
            else:
                st.info("No log entries found or log file is empty.")

        except Exception as e:
            st.error(f"Error reading log file: {e}")
@@ -0,0 +1,434 @@
from typing import List, Dict

import streamlit as st

from ners.core.config.pipeline_config import PipelineConfig
from ners.research.experiment import ExperimentConfig, ExperimentStatus
from ners.research.experiment.experiment_builder import ExperimentBuilder
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.experiment.feature_extractor import FeatureType
from ners.research.model_registry import list_available_models


class Experiments:
    def __init__(
        self,
        config: PipelineConfig,
        experiment_tracker: ExperimentTracker,
        experiment_runner: ExperimentRunner,
    ):
        self.config = config
        self.experiment_tracker = experiment_tracker
        self.experiment_runner = experiment_runner
        self.experiment_builder = ExperimentBuilder(config)

    def index(self):
        st.title("Experiments")

        tab1, tab2, tab3 = st.tabs(["Templates", "Experiments", "Batch Experiments"])

        with tab1:
            self.show_template_experiments()

        with tab2:
            self.show_experiment_list()

        with tab3:
            self.show_batch_experiments()

    def show_template_experiments(self):
        """Show interface for running predefined template experiments"""
        st.subheader("Template Experiments")
        st.write("Run predefined experiments based on research templates.")

        try:
            available_experiments = self.experiment_builder.get_templates()

            # Create tabs for different experiment types
            exp_tabs = st.tabs(
                ["Baseline", "Advanced", "Feature Studies", "Hyperparameter Tuning"]
            )

            with exp_tabs[0]:
                self._show_experiments_by_type(
                    available_experiments["baseline"], "baseline"
                )

            with exp_tabs[1]:
                self._show_experiments_by_type(
                    available_experiments["advanced"], "advanced"
                )

            with exp_tabs[2]:
                self._show_experiments_by_type(
                    available_experiments["feature_study"], "feature_study"
                )

            with exp_tabs[3]:
                self._show_experiments_by_type(
                    available_experiments["tuning"], "tuning"
                )

        except Exception as e:
            st.error(f"Error loading experiment templates: {e}")
            st.info(
                "Make sure the research templates file exists at `config/research_templates.yaml`"
            )

    def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str):
        """Show experiments for a specific type"""
        if not experiments:
            st.info(f"No {experiment_type} experiments available in templates.")
            return

        st.write(f"**{experiment_type.title()} Experiments**")

        # Show available experiments
        for i, exp_template in enumerate(experiments):
            exp_name = exp_template.get("name", f"Experiment {i + 1}")
            exp_description = exp_template.get(
                "description", "No description available"
            )

            with st.expander(f"📊 {exp_name} - {exp_description}"):
                col1, col2 = st.columns([2, 1])

                with col1:
                    st.json(exp_template)

                with col2:
                    if st.button("🚀 Run Experiment", key=f"run_{experiment_type}_{i}"):
                        self._run_template_experiment(exp_template)

    def _run_template_experiment(self, exp_template: Dict):
        """Run a template experiment"""
        try:
            with st.spinner(f"Running {exp_template.get('name')}..."):
                # Create experiment config from template
                experiment_config = self.experiment_builder.from_template(exp_template)

                # Run the experiment
                experiment_id = self.experiment_runner.run_experiment(experiment_config)
                st.success(
                    f"Experiment '{experiment_config.name}' completed successfully!"
                )
                st.info(f"Experiment ID: `{experiment_id}`")

                # Show results
                experiment = self.experiment_tracker.get_experiment(experiment_id)
                if experiment and experiment.test_metrics:
                    st.write("**Results:**")
                    col1, col2, col3 = st.columns(3)

                    metrics = list(experiment.test_metrics.items())
                    for i, (metric, value) in enumerate(metrics):
                        with [col1, col2, col3][i % 3]:
                            st.metric(metric.title(), f"{value:.4f}")

        except Exception as e:
            st.error(f"Error running experiment: {e}")

    def show_experiment_list(self):
        """Show list of all experiments with filtering"""
        st.subheader("All Experiments")

        # Filters
        col1, col2, col3 = st.columns(3)

        with col1:
            status_filter = st.selectbox(
                "Filter by Status", ["All", "completed", "running", "failed", "pending"]
            )

        with col2:
            model_filter = st.selectbox(
                "Filter by Model", ["All"] + list_available_models()
            )

        with col3:
            tag_filter = st.text_input("Filter by Tags (comma-separated)")

        # Get and filter experiments
        experiments = self._get_filtered_experiments(
            status_filter, model_filter, tag_filter
        )

        if not experiments:
            st.info("No experiments found matching the filters.")
            return

        # Display experiments
        for i, exp in enumerate(experiments):
            with st.expander(
                f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
            ):
                self._display_experiment_details(exp, i)

    def _get_filtered_experiments(
        self, status_filter: str, model_filter: str, tag_filter: str
    ):
        """Get experiments with applied filters"""
        experiments = self.experiment_tracker.list_experiments()

        # Apply filters
        if status_filter != "All":
            experiments = [
                e for e in experiments if e.status == ExperimentStatus(status_filter)
            ]

        if model_filter != "All":
            experiments = [
                e for e in experiments if e.config.model_type == model_filter
            ]

        if tag_filter:
            tags = [tag.strip() for tag in tag_filter.split(",")]
            experiments = [
                e for e in experiments if any(tag in e.config.tags for tag in tags)
            ]

        return experiments

    @classmethod
    def _display_experiment_details(cls, exp, index: int):
        """Display details for a single experiment"""
        col1, col2, col3 = st.columns(3)

        with col1:
            st.write(f"**Model:** {exp.config.model_type}")
            st.write(
                f"**Features:** {', '.join([f.value for f in exp.config.features])}"
            )
            st.write(f"**Tags:** {', '.join(exp.config.tags)}")

        with col2:
            if exp.test_metrics:
                for metric, value in exp.test_metrics.items():
                    st.metric(metric.title(), f"{value:.4f}")

        with col3:
            st.write(f"**Train Size:** {exp.train_size:,}")
            st.write(f"**Test Size:** {exp.test_size:,}")

        if st.button("View Details", key=f"details_{index}"):
            st.session_state.selected_experiment = exp.experiment_id
            st.rerun()

        if exp.config.description:
            st.write(f"**Description:** {exp.config.description}")

    def show_batch_experiments(self):
        """Show interface for running batch experiments"""
        st.subheader("Batch Experiments")
        st.write("Run multiple experiments with different parameter combinations.")

        # Add option to run template batch experiments
        batch_type = st.radio(
            "Batch Type", ["Template Batch", "Custom Parameter Sweep"]
        )

        if batch_type == "Template Batch":
            self._show_template_batch_experiments()
        else:
            self._show_custom_batch_experiments()

    def _show_template_batch_experiments(self):
        """Show interface for running batch experiments from templates"""
        st.write("**Run Multiple Template Experiments**")

        try:
            available_experiments = self.experiment_builder.get_templates()

            # Select experiment types to run
            experiment_types = st.multiselect(
                "Select Experiment Types",
                ["baseline", "advanced", "feature_study", "tuning"],
                default=["baseline"],
            )

            if experiment_types:
                selected_experiments = []

                for exp_type in experiment_types:
                    experiments = available_experiments.get(exp_type, [])
                    if experiments:
                        st.write(f"**{exp_type.title()} Experiments:**")
                        exp_names = [
                            exp.get("name", f"Exp {i}")
                            for i, exp in enumerate(experiments)
                        ]
                        selected_names = st.multiselect(
                            f"Select {exp_type} experiments",
                            exp_names,
                            key=f"select_{exp_type}",
                        )

                        for name in selected_names:
                            for exp in experiments:
                                if exp.get("name") == name:
                                    selected_experiments.append(exp)

                if st.button("🚀 Run Selected Template Experiments"):
                    self._run_template_batch_experiments(selected_experiments)

        except Exception as e:
            st.error(f"Error loading templates for batch experiments: {e}")

    def _run_template_batch_experiments(self, selected_experiments: List[Dict]):
        """Run batch experiments from templates"""
        if not selected_experiments:
            st.warning("No experiments selected")
            return

        with st.spinner(f"Running {len(selected_experiments)} template experiments..."):
            try:
                experiment_configs = []
                for exp_template in selected_experiments:
                    config = self.experiment_builder.from_template(exp_template)
                    experiment_configs.append(config)

                # Run batch experiments
                experiment_ids = self.experiment_runner.run_experiment_batch(
                    experiment_configs
                )

                st.success(f"Completed {len(experiment_ids)} template experiments!")

                # Show summary
                if experiment_ids:
                    comparison = self.experiment_runner.compare_experiments(
                        experiment_ids
                    )
                    st.write("**Template Batch Results:**")
                    st.dataframe(
                        comparison[["name", "model_type", "test_accuracy"]],
                        use_container_width=True,
                    )

            except Exception as e:
                st.error(f"Error running template batch experiments: {e}")

    def _show_custom_batch_experiments(self):
        """Show interface for custom parameter sweep experiments"""
        # Parameter sweep configuration
        with st.form("batch_experiments"):
            st.write("**Parameter Sweep Configuration**")

            col1, col2 = st.columns(2)

            with col1:
                base_name = st.text_input("Base Experiment Name", "parameter_sweep")
                model_types = st.multiselect(
                    "Model Types",
                    list_available_models(),
                    default=["logistic_regression"],
                )

                # N-gram ranges for logistic regression
                st.write("**Logistic Regression Parameters**")
                ngram_ranges = st.text_area(
                    "N-gram Ranges (one per line, format: min,max)", "2,4\n2,5\n3,6"
                )

            with col2:
                feature_combinations = st.multiselect(
                    "Feature Combinations",
                    [f.value for f in FeatureType],
                    default=["full_name", "native_name", "surname"],
                )

                test_sizes = st.text_input(
                    "Test Sizes (comma-separated)", "0.15,0.2,0.25"
                )

                tags = st.text_input("Common Tags", "parameter_sweep,batch")

            if st.form_submit_button("🚀 Run Parameter Sweep"):
                self.run_batch_experiments(
                    base_name,
                    model_types,
                    ngram_ranges,
                    feature_combinations,
                    test_sizes,
                    tags,
                )

    def run_batch_experiments(
        self,
        base_name: str,
        model_types: List[str],
        ngram_ranges: str,
        feature_combinations: List[str],
        test_sizes: str,
        tags: str,
    ):
        """Run batch experiments with parameter combinations"""
        with st.spinner("Running batch experiments..."):
            try:
                experiments = []

                # Parse parameters
                ngram_list = []
                for line in ngram_ranges.strip().split("\n"):
                    if "," in line:
                        min_val, max_val = map(int, line.split(","))
                        ngram_list.append([min_val, max_val])

                test_size_list = [float(x.strip()) for x in test_sizes.split(",")]
                tag_list = [tag.strip() for tag in tags.split(",") if tag.strip()]

                # Generate experiment combinations
                exp_count = 0
                for model_type in model_types:
                    for feature_combo in feature_combinations:
                        for test_size in test_size_list:
                            if model_type == "logistic_regression":
                                for ngram_range in ngram_list:
                                    exp_name = f"{base_name}_{model_type}_{feature_combo}_{ngram_range[0]}_{ngram_range[1]}_{test_size}"

                                    config = ExperimentConfig(
                                        name=exp_name,
                                        description=f"Batch experiment: {model_type} with {feature_combo}",
                                        model_type=model_type,
                                        features=[FeatureType(feature_combo)],
                                        model_params={"ngram_range": ngram_range},
                                        test_size=test_size,
                                        tags=tag_list,
                                    )
                                    experiments.append(config)
                                    exp_count += 1
                            else:
                                exp_name = f"{base_name}_{model_type}_{feature_combo}_{test_size}"

                                config = ExperimentConfig(
                                    name=exp_name,
                                    description=f"Batch experiment: {model_type} with {feature_combo}",
                                    model_type=model_type,
                                    features=[FeatureType(feature_combo)],
                                    test_size=test_size,
                                    tags=tag_list,
                                )
                                experiments.append(config)
                                exp_count += 1

                # Run experiments
                experiment_ids = self.experiment_runner.run_experiment_batch(
                    experiments
                )

                st.success(f"Completed {len(experiment_ids)} batch experiments")

                # Show summary
                if experiment_ids:
                    comparison = self.experiment_runner.compare_experiments(
                        experiment_ids
                    )
                    st.write("**Batch Results Summary:**")
                    st.dataframe(
                        comparison[["name", "model_type", "test_accuracy"]],
                        use_container_width=True,
                    )

            except Exception as e:
                st.error(f"Error running batch experiments: {e}")
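
# Illustrative sketch (not part of the module): the parameter grid that
# run_batch_experiments expands, shown standalone with plain dicts. The values
# mirror the form defaults above; everything else is illustrative only.
#
#     ngram_list = [[2, 4], [2, 5], [3, 6]]
#     combos = [
#         {"model": m, "features": f, "test_size": t, "ngram_range": ng}
#         for m in ["logistic_regression"]
#         for f in ["full_name", "surname"]
#         for t in [0.15, 0.2, 0.25]
#         for ng in ngram_list
#     ]
#     len(combos)  # 1 * 2 * 3 * 3 = 18 experiment configs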
@@ -0,0 +1,80 @@
from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List


@dataclass
class LogEntry:
    timestamp: datetime
    level: str
    message: str


class LogReader:
    def __init__(self, log_file_path: Path):
        self.log_file_path = Path(log_file_path)

    def read_last_entries(self, num_entries: int = 20) -> List[LogEntry]:
        entries = []
        if not self.log_file_path.exists():
            return entries

        with open(self.log_file_path, "r") as f:
            lines = f.readlines()[-num_entries:]

        for line in lines:
            entry = self._parse_log_line(line)
            if entry:
                entries.append(entry)

        return entries

    def read_entries_by_level(
        self, level: str, num_entries: int = 20
    ) -> List[LogEntry]:
        entries = []
        if not self.log_file_path.exists():
            return entries

        with open(self.log_file_path, "r") as f:
            for line in reversed(f.readlines()):
                entry = self._parse_log_line(line)
                if entry and entry.level == level:
                    entries.append(entry)
                    if len(entries) >= num_entries:
                        break

        return list(reversed(entries))

    def get_log_stats(self) -> dict:
        if not self.log_file_path.exists():
            return {}

        stats = {"total_lines": 0}
        with open(self.log_file_path, "r") as f:
            for line in f:
                stats["total_lines"] += 1
                entry = self._parse_log_line(line)
                if entry:
                    stats[entry.level] = stats.get(entry.level, 0) + 1

        return stats

    @staticmethod
    def _parse_log_line(line: str) -> LogEntry | None:
        try:
            # Expected format from logging config: [timestamp] - LEVEL - message
            parts = line.strip().split(" - ")
            if len(parts) >= 3:
                timestamp_str = parts[0].strip("[]")
                timestamp = datetime.fromisoformat(timestamp_str)
                level = parts[1].strip()
                message = " - ".join(parts[2:])
                return LogEntry(timestamp, level, message)
        except Exception:
            return None

        return None
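
# Illustrative usage (not part of the module); the log path is a placeholder
# and the line format follows the parser above ([ISO timestamp] - LEVEL - message).
#
#     reader = LogReader("logs/pipeline.development.log")
#     for entry in reader.read_entries_by_level("ERROR", num_entries=5):
#         print(entry.timestamp, entry.message)
#
#     # A line such as "[2025-01-01T10:00:00] - INFO - batch 3 done"
#     # parses into LogEntry(level="INFO", message="batch 3 done").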
@@ -0,0 +1,170 @@
|
||||
import streamlit as st
|
||||
from spacy import displacy
|
||||
|
||||
from ners.core.config import PipelineConfig
|
||||
from ners.processing.ner.name_model import NameModel
|
||||
|
||||
|
||||
class NERTesting:
|
||||
def __init__(self, config: PipelineConfig):
|
||||
self.config = config
|
||||
self.model_path = config.paths.models_dir / "drc_ner_model"
|
||||
self.ner_model = None
|
||||
self.training_stats = None
|
||||
self.evaluation_stats = None
|
||||
|
||||
def load_ner_model(self) -> bool:
|
||||
"""Load the trained NER model"""
|
||||
try:
|
||||
if self.ner_model is None:
|
||||
self.ner_model = NameModel(self.config)
|
||||
self.ner_model.load(str(self.model_path))
|
||||
self.training_stats = self.ner_model.training_stats
|
||||
self.evaluation_stats = {}
|
||||
return True
|
||||
except Exception as e:
|
||||
st.error(f"Error loading NER model: {e}")
|
||||
return False
|
||||
|
||||
def index(self):
|
||||
st.title("Named Entity Recognition")
|
||||
|
||||
# Load model
|
||||
if not self.load_ner_model():
|
||||
st.warning(
|
||||
"NER model could not be loaded. Please ensure the model is trained and available."
|
||||
)
|
||||
return
|
||||
|
||||
# Display model information
|
||||
self.show_model_training_info()
|
||||
self.show_model_evaluation_info()
|
||||
|
||||
st.markdown("---")
|
||||
st.subheader("Test the NER Model")
|
||||
input_method = st.radio("Input Method", ["Single Name", "Multiple Names"])
|
||||
if input_method == "Single Name":
|
||||
self.test_single_name()
|
||||
elif input_method == "Multiple Names":
|
||||
self.test_multiple_names()
|
||||
|
||||
def show_model_training_info(self):
|
||||
if self.training_stats:
|
||||
col1, col2, col3, col4 = st.columns(4)
|
||||
|
||||
with col1:
|
||||
st.metric(
|
||||
"Training Examples",
|
||||
f"{self.training_stats.get('training_examples', 0):,}",
|
||||
)
|
||||
with col2:
|
||||
st.metric("Epochs", self.training_stats.get("epochs", 0))
|
||||
with col3:
|
||||
st.metric(
|
||||
"Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}"
|
||||
)
|
||||
with col4:
|
||||
st.metric("Batch Size", f"{self.training_stats.get('batch_size', 0):,}")
|
||||
|
||||
def show_model_evaluation_info(self):
|
||||
if self.evaluation_stats:
|
||||
col1, col2, col3 = st.columns(4)
|
||||
overall = self.evaluation_stats.get("overall", {})
|
||||
|
||||
with col1:
|
||||
st.metric("Overall Precision", f"{overall['precision']:.2f}")
|
||||
with col2:
|
||||
st.metric("Overall Recall", f"{overall['recall']:.2f}")
|
||||
with col3:
|
||||
st.metric("Overall F1 Score", f"{overall['f1_score']:.2f}")
|
||||
|
||||
st.json(self.evaluation_stats.get("by_label", {}))
|
||||
|
||||
    def test_single_name(self):
        name_input = st.text_input(
            "Name:",
            placeholder="e.g., Jean Baptiste Mukendi, Marie Kabamba Tshiala, Joseph Kasongo",
            help="Enter a full name or multiple names separated by spaces",
        )
        if name_input.strip():
            if st.button("Analyze Name", type="primary"):
                self.analyze_and_display(name_input)

    def test_multiple_names(self):
        names_input = st.text_area(
            "Names:",
            placeholder="Jean Baptiste Mukendi\nMarie Kabamba Tshiala\nJoseph Kasongo\nGrace Mbuyi Kalala",
            height=150,
            help="Enter each name on a new line",
        )

        if names_input.strip():
            if st.button("Analyze All Names", type="primary"):
                names = [
                    name.strip() for name in names_input.split("\n") if name.strip()
                ]
                for i, name in enumerate(names):
                    st.markdown(f"**Name {i + 1}: {name}**")
                    self.analyze_and_display(name)
                    if i < len(names) - 1:
                        st.markdown("---")

    def analyze_and_display(self, text: str):
        try:
            result = self.ner_model.predict(text)
            st.subheader("Analysis Results")
            entities = result.get("entities", [])

            if entities:
                self.show_visual_entities(text, entities)
                native_count = sum(1 for e in entities if e["label"] == "NATIVE")
                surname_count = sum(1 for e in entities if e["label"] == "SURNAME")

                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Total Entities", len(entities))
                with col2:
                    st.metric("Native Names", native_count)
                with col3:
                    st.metric("Surnames", surname_count)

            else:
                st.warning("No entities detected in the input text.")
                st.info(
                    "Try using traditional Congolese names or ensure the spelling is correct."
                )

        except Exception as e:
            st.error(f"Error analyzing text: {e}")
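
    # Illustrative predict() output assumed by analyze_and_display above
    # (hypothetical offsets; only "start", "end", and "label" are read here
    # and in show_visual_entities below):
    #
    #     {"entities": [{"start": 14, "end": 21, "label": "NATIVE"}]}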
    @classmethod
    def show_visual_entities(cls, text: str, entities: list):
        try:
            # Convert our entities format to spaCy format for displacy
            ents = []
            for entity in entities:
                ents.append(
                    {
                        "start": entity["start"],
                        "end": entity["end"],
                        "label": entity["label"],
                    }
                )

            # Create doc-like structure for displacy
            doc_data = {"text": text, "ents": ents, "title": None}

            # Custom colors for our labels
            colors = {
                "NATIVE": "#74C0FC",  # Light blue
                "SURNAME": "#69DB7C",  # Light green
            }

            options = {"colors": colors, "distance": 90}

            # Generate HTML visualization
            html = displacy.render(doc_data, style="ent", manual=True, options=options)
            st.markdown(html, unsafe_allow_html=True)

        except Exception as e:
            st.warning(f"Could not generate visual representation: {e}")
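
    # Standalone sketch of the manual displacy payload built above, using
    # spaCy's documented manual=True entity format (offsets select "Mukendi"):
    #
    #     doc_data = {
    #         "text": "Jean Baptiste Mukendi",
    #         "ents": [{"start": 14, "end": 21, "label": "NATIVE"}],
    #         "title": None,
    #     }
    #     displacy.render(doc_data, style="ent", manual=True)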
@@ -0,0 +1,215 @@
from typing import Optional

import numpy as np
import pandas as pd
import streamlit as st

from ners.core.utils.data_loader import OPTIMIZED_DTYPES
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker


class Predictions:
    def __init__(
        self,
        config,
        experiment_tracker: ExperimentTracker,
        experiment_runner: ExperimentRunner,
    ):
        self.config = config
        self.experiment_tracker = experiment_tracker
        self.experiment_runner = experiment_runner

    def index(self):
        st.title("Predictions")

        # Load available models
        experiments = self.experiment_tracker.list_experiments()
        completed_experiments = [
            e for e in experiments if e.status.value == "completed" and e.model_path
        ]

        if not completed_experiments:
            st.warning(
                "No trained models available. Please run some experiments first."
            )
            return

        # Model selection
        model_options = {
            f"{exp.config.name} (Acc: {exp.test_metrics.get('accuracy', 0):.3f})": exp
            for exp in completed_experiments
            if exp.test_metrics
        }

        selected_model_name = st.selectbox("Select Model", list(model_options.keys()))

        if not selected_model_name:
            return

        selected_experiment = model_options[selected_model_name]

        # Prediction modes
        prediction_mode = st.radio(
            "Prediction Mode", ["Single Name", "Batch Upload", "Dataset Prediction"]
        )

        if prediction_mode == "Single Name":
            self.show_single_prediction(selected_experiment)
        elif prediction_mode == "Batch Upload":
            self.show_batch_prediction(selected_experiment)
        elif prediction_mode == "Dataset Prediction":
            self.show_dataset_prediction(selected_experiment)

    def show_single_prediction(self, experiment):
        """Show single name prediction interface"""
        name_input = st.text_input(
            "Enter a name:", placeholder="e.g., Jean Baptiste Mukendi"
        )
        if name_input and st.button("Predict Gender"):
            try:
                # Load the model
                model = self.experiment_runner.load_experiment_model(
                    experiment.experiment_id
                )

                if model is None:
                    st.error("Failed to load model")
                    return

                # Create a DataFrame with the input
                input_df = self._prepare_single_input(name_input)

                # Make prediction
                prediction = model.predict(input_df)[0]

                # Get prediction probability if available
                confidence = self._get_prediction_confidence(model, input_df)

                # Display results
                self._display_single_prediction_results(
                    prediction, confidence, experiment, name_input
                )

            except Exception as e:
                st.error(f"Error making prediction: {e}")
    def _prepare_single_input(self, name_input: str) -> pd.DataFrame:
        """Prepare single name input for prediction"""
        return pd.DataFrame(
            {
                "name": [name_input],
                "words": [len(name_input.split())],
                "length": [len(name_input.replace(" ", ""))],
                "province": ["unknown"],  # Default values
                "identified_name": [None],
                "identified_surname": [None],
                "probable_native": [None],
                "probable_surname": [None],
            }
        )
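
    # For example, "Jean Baptiste Mukendi" yields words=3 and length=19
    # (characters counted with the spaces removed); the None/"unknown"
    # columns are placeholders so the frame matches the training schema.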
    def _get_prediction_confidence(
        self, model, input_df: pd.DataFrame
    ) -> Optional[float]:
        """Get prediction confidence if available"""
        try:
            probabilities = model.predict_proba(input_df)[0]
            return max(probabilities)
        except Exception:
            # Not all models expose predict_proba
            return None
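
    # predict_proba returns one probability per class, e.g. [0.23, 0.77]
    # for ("f", "m") (illustrative values); max() reports the winning
    # class's probability, so values near 0.5 flag uncertain predictions.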
    def _display_single_prediction_results(
        self, prediction: str, confidence: Optional[float], experiment, name_input: str
    ):
        """Display single prediction results"""
        col1, col2 = st.columns(2)

        with col1:
            gender_label = "Female" if prediction == "f" else "Male"
            st.success(f"**Predicted Gender:** {gender_label}")

        with col2:
            if confidence:
                st.metric("Confidence", f"{confidence:.2%}")

        # Additional info
        st.info(f"Model used: {experiment.config.name}")
        st.info(
            f"Features used: {', '.join([f.value for f in experiment.config.features])}"
        )

    def show_batch_prediction(self, experiment):
        uploaded_file = st.file_uploader("Upload CSV file with names", type="csv")
        if uploaded_file is not None:
            try:
                df = pd.read_csv(uploaded_file, dtype=OPTIMIZED_DTYPES)

                st.write("**Uploaded Data Preview:**")
                st.dataframe(df.head(), use_container_width=True)

                # Column selection
                df = self._prepare_batch_data(df)

                if st.button("Run Batch Prediction"):
                    self._run_batch_prediction(df, experiment)

            except Exception as e:
                st.error(f"Error processing file: {e}")

    def _prepare_batch_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prepare batch data for prediction"""
        # Column selection
        if "name" not in df.columns:
            name_column = st.selectbox("Select the name column:", df.columns)
            df = df.rename(columns={name_column: "name"})

        # Add missing columns with defaults
        required_columns = [
            "words",
            "length",
            "province",
            "identified_name",
            "identified_surname",
            "probable_native",
            "probable_surname",
        ]

        for col in required_columns:
            if col not in df.columns:
                if col == "words":
                    df[col] = df["name"].str.split().str.len()
                elif col == "length":
                    df[col] = df["name"].str.replace(" ", "").str.len()
                else:
                    df[col] = None

        return df

    def _run_batch_prediction(self, df: pd.DataFrame, experiment):
        """Run batch prediction and display results"""
        with st.spinner("Making predictions..."):
            # Load model
            model = self.experiment_runner.load_experiment_model(
                experiment.experiment_id
            )

            if model is None:
                st.error("Failed to load model")
                return

            # Make predictions
            predictions = model.predict(df)
            df["predicted_gender"] = predictions
            df["gender_label"] = df["predicted_gender"].map(
                {"f": "Female", "m": "Male"}
            )

            # Try to get probabilities
            try:
                probabilities = model.predict_proba(df)
                df["confidence"] = np.max(probabilities, axis=1)
            except Exception:
                df["confidence"] = None

            st.success("Predictions completed!")
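
    # Illustrative CSV accepted by the batch uploader above: only a name
    # column is required ("words" and "length" are derived, the rest default
    # to None; any other header triggers the column selector):
    #
    #     name
    #     Jean Baptiste Mukendi
    #     Marie Kabamba Tshiala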
@@ -0,0 +1,283 @@
from typing import List

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st

from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker


class ResultsAnalysis:
    def __init__(
        self,
        config,
        experiment_tracker: ExperimentTracker,
        experiment_runner: ExperimentRunner,
    ):
        self.config = config
        self.experiment_tracker = experiment_tracker
        self.experiment_runner = experiment_runner

    def index(self):
        st.title("Results & Analysis")
        tab1, tab2, tab3 = st.tabs(
            ["Experiment Comparison", "Performance Analysis", "Model Analysis"]
        )

        with tab1:
            self.show_experiment_comparison()

        with tab2:
            self.show_performance_analysis()

        with tab3:
            self.show_model_analysis()

    def show_experiment_comparison(self):
        """Show experiment comparison interface"""
        st.subheader("Compare Experiments")

        experiments = self.experiment_tracker.list_experiments()
        completed_experiments = [
            e for e in experiments if e.status.value == "completed"
        ]

        if not completed_experiments:
            st.warning("No completed experiments found.")
            return

        # Experiment selection
        exp_options = {
            f"{exp.config.name} ({exp.experiment_id[:8]})": exp.experiment_id
            for exp in completed_experiments
        }

        selected_exp_names = st.multiselect(
            "Select Experiments to Compare",
            list(exp_options.keys()),
            default=list(exp_options.keys())[: min(5, len(exp_options))],
        )

        if not selected_exp_names:
            st.info("Please select experiments to compare.")
            return

        selected_exp_ids = [exp_options[name] for name in selected_exp_names]

        # Generate comparison
        comparison_df = self.experiment_runner.compare_experiments(selected_exp_ids)

        if comparison_df.empty:
            st.error("No data available for comparison.")
            return

        self._display_comparison_table(comparison_df)
        self._display_comparison_charts(comparison_df)
    def _display_comparison_table(self, comparison_df: pd.DataFrame):
        """Display comparison table"""
        st.write("**Experiment Comparison Table**")

        # Select columns to display
        metric_columns = [
            col
            for col in comparison_df.columns
            if col.startswith("test_") or col.startswith("cv_")
        ]
        display_columns = ["name", "model_type", "features"] + metric_columns
        available_columns = [
            col for col in display_columns if col in comparison_df.columns
        ]

        st.dataframe(comparison_df[available_columns], use_container_width=True)

    def _display_comparison_charts(self, comparison_df: pd.DataFrame):
        """Display comparison charts"""
        st.write("**Performance Comparison**")

        if "test_accuracy" in comparison_df.columns:
            fig = px.bar(
                comparison_df,
                x="name",
                y="test_accuracy",
                color="model_type",
                title="Test Accuracy Comparison",
            )
            fig.update_layout(xaxis_tickangle=-45)
            st.plotly_chart(fig, use_container_width=True)

        # Metric comparison across multiple metrics
        metric_columns = [
            col
            for col in comparison_df.columns
            if col.startswith("test_") or col.startswith("cv_")
        ]

        if len(metric_columns) > 1:
            metric_to_plot = st.selectbox(
                "Select Metric for Detailed Comparison", metric_columns
            )

            if metric_to_plot in comparison_df.columns:
                fig = px.bar(
                    comparison_df,
                    x="name",
                    y=metric_to_plot,
                    color="model_type",
                    title=f"{metric_to_plot.replace('_', ' ').title()} Comparison",
                )
                fig.update_layout(xaxis_tickangle=-45)
                st.plotly_chart(fig, use_container_width=True)
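
    # Columns the table and charts above expect in comparison_df
    # (illustrative names; anything prefixed test_ or cv_ is treated
    # as a plottable metric):
    #
    #     name, model_type, features, test_accuracy, test_f1, cv_accuracy, ...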
    def show_performance_analysis(self):
        """Show performance analysis across experiments"""
        st.subheader("Performance Analysis")

        experiments = self.experiment_tracker.list_experiments()
        completed_experiments = [
            e for e in experiments if e.status.value == "completed" and e.test_metrics
        ]

        if not completed_experiments:
            st.warning("No completed experiments with metrics found.")
            return

        # Prepare data for analysis
        analysis_data = self._prepare_analysis_data(completed_experiments)
        analysis_df = pd.DataFrame(analysis_data)

        self._display_performance_trends(analysis_df)
        self._display_model_comparison(analysis_df)
        self._display_top_experiments(analysis_df)

    def _prepare_analysis_data(self, completed_experiments: List) -> List[dict]:
        """Prepare data for performance analysis"""
        analysis_data = []
        for exp in completed_experiments:
            row = {
                "experiment_id": exp.experiment_id,
                "name": exp.config.name,
                "model_type": exp.config.model_type,
                "feature_count": len(exp.config.features),
                "features": ", ".join([f.value for f in exp.config.features]),
                "train_size": exp.train_size,
                "test_size": exp.test_size,
                **exp.test_metrics,
            }
            analysis_data.append(row)
        return analysis_data

    def _display_performance_trends(self, analysis_df: pd.DataFrame):
        """Display performance trend charts"""
        col1, col2 = st.columns(2)

        with col1:
            # Accuracy vs Training Size
            if (
                "accuracy" in analysis_df.columns
                and "train_size" in analysis_df.columns
            ):
                fig = px.scatter(
                    analysis_df,
                    x="train_size",
                    y="accuracy",
                    color="model_type",
                    hover_data=["name"],
                    title="Accuracy vs Training Size",
                )
                st.plotly_chart(fig, use_container_width=True)

        with col2:
            # Feature Count vs Performance
            if (
                "accuracy" in analysis_df.columns
                and "feature_count" in analysis_df.columns
            ):
                fig = px.scatter(
                    analysis_df,
                    x="feature_count",
                    y="accuracy",
                    color="model_type",
                    hover_data=["name"],
                    title="Accuracy vs Number of Features",
                )
                st.plotly_chart(fig, use_container_width=True)

    def _display_model_comparison(self, analysis_df: pd.DataFrame):
        """Display model type comparison"""
        if "accuracy" in analysis_df.columns:
            model_performance = (
                analysis_df.groupby("model_type")["accuracy"]
                .agg(["mean", "std", "count"])
                .reset_index()
            )

            fig = go.Figure()
            fig.add_trace(
                go.Bar(
                    x=model_performance["model_type"],
                    y=model_performance["mean"],
                    error_y=dict(type="data", array=model_performance["std"].fillna(0)),
                    name="Accuracy",
                )
            )

            st.plotly_chart(fig, use_container_width=True)

    def _display_top_experiments(self, analysis_df: pd.DataFrame):
        """Display top-performing experiments"""
        if "accuracy" in analysis_df.columns:
            top_n = st.slider("Select Top N Experiments", 3, 20, 5)
            top_experiments = analysis_df.nlargest(top_n, "accuracy")

            st.write("**Top Performing Experiments:**")
            st.dataframe(
                top_experiments[
                    [
                        "name",
                        "model_type",
                        "features",
                        "train_size",
                        "test_size",
                        "accuracy",
                    ]
                ],
                use_container_width=True,
            )

    def show_model_analysis(self):
        """Show detailed model analysis interface"""
        st.subheader("Model Analysis")

        experiments = self.experiment_tracker.list_experiments()
        completed_experiments = [
            e for e in experiments if e.status.value == "completed"
        ]

        if not completed_experiments:
            st.warning("No completed experiments found for analysis.")
            return

        # Model selection
        exp_options = {
            f"{exp.config.name} ({exp.experiment_id[:8]})": exp.experiment_id
            for exp in completed_experiments
        }
        selected_exp_name = st.selectbox(
            "Select Model for Analysis", list(exp_options.keys())
        )
        if not selected_exp_name:
            return

        exp_id = exp_options[selected_exp_name]
        experiment = self.experiment_tracker.get_experiment(exp_id)

        if not experiment or not experiment.test_metrics:
            st.warning("Selected experiment has no evaluation metrics.")
            return

        # Display detailed metrics
        st.write("**Detailed Metrics:**")
        st.json(experiment.test_metrics)
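
    # One analysis row produced by _prepare_analysis_data (illustrative
    # values; metric keys are splatted in from exp.test_metrics):
    #
    #     {"experiment_id": "ab12cd34...", "name": "baseline",
    #      "model_type": "logistic_regression", "feature_count": 3,
    #      "features": "words, length, province",
    #      "train_size": 80000, "test_size": 20000, "accuracy": 0.86}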
@@ -0,0 +1,16 @@
import streamlit as st

from ners.web.interfaces.dashboard import Dashboard

st.set_page_config(page_title="Dashboard", page_icon="📊", layout="wide")

if "config" in st.session_state:
    dashboard = Dashboard(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    dashboard.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
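
# This page script and those that follow all share one contract: a main app
# seeds st.session_state before any page runs. A hypothetical initializer,
# sketched for illustration only (names assumed, not part of the commit):
#
#     if "config" not in st.session_state:
#         st.session_state.config = setup_config(env="development")
#         st.session_state.experiment_tracker = ExperimentTracker(st.session_state.config)
#         st.session_state.experiment_runner = ExperimentRunner(st.session_state.config)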
@@ -0,0 +1,12 @@
import streamlit as st

from ners.web.interfaces.data_overview import DataOverview

st.set_page_config(page_title="Data Overview", page_icon="📋", layout="wide")

if "config" in st.session_state:
    data_overview = DataOverview(st.session_state.config)
    data_overview.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,14 @@
import streamlit as st

from ners.web.interfaces.data_processing import DataProcessing

st.set_page_config(page_title="Data Processing", page_icon="⚙️", layout="wide")

if "config" in st.session_state:
    data_processing = DataProcessing(
        st.session_state.config, st.session_state.pipeline_monitor
    )
    data_processing.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,16 @@
import streamlit as st

from ners.web.interfaces.experiments import Experiments

st.set_page_config(page_title="Experiments", page_icon="🧪", layout="wide")

if "config" in st.session_state:
    experiments = Experiments(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    experiments.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,16 @@
import streamlit as st

from ners.web.interfaces.results_analysis import ResultsAnalysis

st.set_page_config(page_title="Results & Analysis", page_icon="📈", layout="wide")

if "config" in st.session_state:
    results_analysis = ResultsAnalysis(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    results_analysis.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,16 @@
import streamlit as st

from ners.web.interfaces.predictions import Predictions

st.set_page_config(page_title="Predictions", page_icon="🔮", layout="wide")

if "config" in st.session_state:
    predictions = Predictions(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    predictions.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,12 @@
import streamlit as st

from ners.web.interfaces.configuration import Configuration

st.set_page_config(page_title="Configuration", page_icon="⚙️", layout="wide")

if "config" in st.session_state:
    configuration = Configuration(st.session_state.config)
    configuration.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,12 @@
import streamlit as st

from ners.web.interfaces.ner_testing import NERTesting

st.set_page_config(page_title="NER Testing", page_icon="🏷️", layout="wide")

if "config" in st.session_state:
    ner_testing = NERTesting(st.session_state.config)
    ner_testing.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long