refactoring: uv

2025-10-05 18:14:15 +02:00
parent f3b06fbd07
commit 9dd4f759b3
120 changed files with 5525 additions and 3366 deletions
+3
@@ -0,0 +1,3 @@
"""DRC NERS NLP package."""
__all__: list[str] = []
+226
@@ -0,0 +1,226 @@
from __future__ import annotations
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional
import typer
from ners.core.config import setup_config, PipelineConfig
app = typer.Typer(help="DRC NERS command-line interface", no_args_is_help=True)
# -------------------------
# Pipeline commands
# -------------------------
pipeline_app = typer.Typer(help="Data processing pipeline")
app.add_typer(pipeline_app, name="pipeline")
@pipeline_app.command("run")
def pipeline_run(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
"""Run the full processing pipeline."""
from ners.main import run_pipeline as _run_pipeline
cfg = setup_config(config_path=config, env=env)
code = _run_pipeline(cfg)
raise typer.Exit(code)
# -------------------------
# NER commands
# -------------------------
ner_app = typer.Typer(help="NER dataset and model")
app.add_typer(ner_app, name="ner")
def _load_config(config: Optional[Path], env: str) -> PipelineConfig:
return setup_config(config_path=config, env=env)
@ner_app.command("feature")
def ner_feature(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
from ners.ner import feature as _feature
cfg = _load_config(config, env)
_feature(cfg)
@ner_app.command("build")
def ner_build(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
from ners.ner import build as _build
cfg = _load_config(config, env)
_build(cfg)
@ner_app.command("train")
def ner_train(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
from ners.ner import train as _train
cfg = _load_config(config, env)
_train(cfg)
@ner_app.command("run")
def ner_run(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
reset: bool = typer.Option(
False, help="Reset intermediate outputs and rerun all steps"
),
) -> None:
from ners.ner import run_pipeline as _ner_pipeline
cfg = _load_config(config, env)
code = _ner_pipeline(cfg, reset)
raise typer.Exit(code)
# -------------------------
# Research commands
# -------------------------
research_app = typer.Typer(help="Research experiments and training")
app.add_typer(research_app, name="research")
@research_app.command("train")
def research_train(
name: str = typer.Option(..., "--name", help="Model name to train"),
exp_type: str = typer.Option(..., "--type", help="Experiment type"),
templates: str = typer.Option(
"research_templates.yaml", help="Templates file path"
),
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
from ners.research.experiment.experiment_builder import ExperimentBuilder
from ners.research.model_trainer import ModelTrainer
cfg = _load_config(config, env)
exp_builder = ExperimentBuilder(cfg)
tmpl = exp_builder.load_templates(templates)
exp_cfg = exp_builder.find_template(tmpl, name, exp_type)
trainer = ModelTrainer(cfg)
trainer.train_single_model(
model_name=exp_cfg.get("name"),
model_type=exp_cfg.get("model_type"),
features=exp_cfg.get("features"),
model_params=exp_cfg.get("model_params", {}),
tags=exp_cfg.get("tags", []),
)
# -------------------------
# Monitor commands
# -------------------------
monitor_app = typer.Typer(help="Monitor pipeline checkpoints")
app.add_typer(monitor_app, name="monitor")
@monitor_app.command("status")
def monitor_status(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
detailed: bool = typer.Option(
False, help="Show detailed status (failed batch IDs)"
),
) -> None:
_ = _load_config(config, env)
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
PipelineMonitor().print_status(detailed=detailed)
@monitor_app.command("clean")
def monitor_clean(
step: Optional[str] = typer.Option(None, help="Step to clean; default all"),
keep_last: int = typer.Option(1, help="Number of latest checkpoint files to keep"),
force: bool = typer.Option(False, help="Do not ask for confirmation"),
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
_ = _load_config(config, env)
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
mon = PipelineMonitor()
if not force:
typer.confirm("Clean checkpoints?", abort=True)
if step:
mon.clean_step_checkpoints(step, keep_last)
else:
for s in mon.steps:
mon.clean_step_checkpoints(s, keep_last)
@monitor_app.command("reset")
def monitor_reset(
step: Optional[str] = typer.Option(None, help="Step to reset; default all"),
force: bool = typer.Option(False, help="Do not ask for confirmation"),
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
_ = _load_config(config, env)
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
mon = PipelineMonitor()
if not force:
msg = f"Reset {step or 'all steps'}? This deletes checkpoints."
typer.confirm(msg, abort=True)
if step:
mon.reset_step(step)
else:
for s in mon.steps:
mon.reset_step(s)
# -------------------------
# Web commands
# -------------------------
web_app = typer.Typer(help="Web UI wrapper")
app.add_typer(web_app, name="web")
@web_app.command("run")
def web_run(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
"""Launch the Streamlit web app via subprocess."""
app_path = Path(__file__).parent / "web" / "app.py"
cmd = [
sys.executable,
"-m",
"streamlit",
"run",
str(app_path),
]
# Pass configuration via environment variables to avoid argparse in Streamlit
env_vars = os.environ.copy()
if config is not None:
env_vars["NERS_CONFIG"] = str(config)
env_vars["NERS_ENV"] = env
raise typer.Exit(subprocess.call(cmd, env=env_vars))
if __name__ == "__main__": # pragma: no cover
app()
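For a quick smoke test of the Typer wiring above, the app can be invoked in-process with Typer's test runner. A minimal sketch, assuming the module above is importable as ners.cli (hypothetical path):

from typer.testing import CliRunner

from ners.cli import app  # assumed import path for the module above

runner = CliRunner()
# Equivalent to running: ners monitor status --detailed
result = runner.invoke(app, ["monitor", "status", "--detailed"])
print(result.exit_code)
print(result.output)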
+95
@@ -0,0 +1,95 @@
import logging
from pathlib import Path
from typing import Optional, Union
from ners.core.utils import ensure_directories
from ners.core.config.config_manager import ConfigManager
from ners.core.config.logging_config import LoggingConfig
from ners.core.config.pipeline_config import PipelineConfig
config_manager = ConfigManager()
def get_config() -> PipelineConfig:
"""Get the global configuration instance"""
return config_manager.get_config()
def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig:
"""Load configuration from specified path"""
if config_path:
return config_manager.load_config(Path(config_path))
return config_manager.get_config()
def setup_config(
config_path: Optional[Path] = None, env: str = "development"
) -> PipelineConfig:
"""
Unified configuration loading and logging setup for all entrypoint scripts.
Args:
config_path: Direct path to config file (takes precedence over env)
env: Environment name (defaults to "development")
Returns:
Loaded configuration object
"""
# Determine config path
if config_path is None:
config_path = Path("config") / f"pipeline.{env}.yaml"
# Load configuration
config = ConfigManager(config_path).load_config()
# Setup logging
setup_logging(config)
# Ensure required directories exist
ensure_directories(config)
logging.info(f"Loaded configuration: {config.name} v{config.version}")
logging.info(f"Environment: {config.environment}")
logging.info(f"Config file: {config_path}")
return config
def setup_logging(config: PipelineConfig):
"""Setup logging based on configuration"""
# Create logs directory
log_dir = config.paths.logs_dir
log_dir.mkdir(parents=True, exist_ok=True)
# Setup logging configuration
log_level = getattr(logging, config.logging.level.upper(), logging.INFO)
# Create formatter
formatter = logging.Formatter(config.logging.format)
# Setup root logger
root_logger = logging.getLogger()
root_logger.setLevel(log_level)
# Clear existing handlers
root_logger.handlers.clear()
# Console handler
if config.logging.console_logging:
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
root_logger.addHandler(console_handler)
# File handler
if config.logging.file_logging:
from logging.handlers import RotatingFileHandler
log_file_path = log_dir / config.logging.log_file
file_handler = RotatingFileHandler(
log_file_path,
maxBytes=config.logging.max_log_size,
backupCount=config.logging.backup_count,
)
file_handler.setFormatter(formatter)
root_logger.addHandler(file_handler)
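The intended entrypoint pattern is a single setup_config call that resolves the config file, configures logging, and creates the required directories. A minimal sketch, assuming a config/pipeline.development.yaml exists relative to the working directory:

from pathlib import Path

from ners.core.config import setup_config

# Resolves config/pipeline.development.yaml from the env name
cfg = setup_config(env="development")

# An explicit path takes precedence over the env-based lookup
cfg = setup_config(config_path=Path("config/pipeline.production.yaml"), env="production")
print(cfg.name, cfg.version, cfg.environment)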
+30
@@ -0,0 +1,30 @@
from pydantic import BaseModel, ConfigDict
class NERConfig(BaseModel):
"""NER annotation configuration"""
model_name: str = "drc_names_ner"
retry_attempts: int = 3
class LLMConfig(BaseModel):
"""LLM annotation configuration"""
model_name: str = "mistral:7b"
requests_per_minute: int = 60
requests_per_second: int = 2
retry_attempts: int = 3
timeout_seconds: int = 30
max_concurrent_requests: int = 2
enable_rate_limiting: bool = False
class AnnotationConfig(BaseModel):
"""Base class for annotation configurations"""
llm: LLMConfig = LLMConfig()
ner: NERConfig = NERConfig()
model_config = ConfigDict(arbitrary_types_allowed=True)
+151
@@ -0,0 +1,151 @@
import json
import logging
from pathlib import Path
from typing import Optional, Union, Dict, Any
import yaml
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.config.project_paths import ProjectPaths
class ConfigManager:
"""Centralized configuration management"""
def __init__(self, config_path: Optional[Union[str, Path]] = None):
# Coerce to Path so .exists() and .suffix work even when a str is passed
self.config_path = Path(config_path) if config_path else self._find_config_file()
self._config: Optional[PipelineConfig] = None
self._setup_default_paths()
@classmethod
def _find_config_file(cls) -> Path:
"""Find configuration file in standard locations"""
possible_paths = [
Path.cwd() / "config" / "pipeline.yaml",
Path.cwd() / "config" / "pipeline.yml",
Path.cwd() / "pipeline.yaml",
Path(__file__).parent.parent.parent / "config" / "pipeline.yaml",
]
for path in possible_paths:
if path.exists():
return path
# Return default path if none found
return Path.cwd() / "config" / "pipeline.yaml"
def _setup_default_paths(self):
"""Setup default project paths"""
root_dir = Path(__file__).parent.parent.parent.parent.parent
self.default_paths = ProjectPaths(
root_dir=root_dir,
configs_dir=root_dir / "config",
data_dir=root_dir / "data" / "dataset",
models_dir=root_dir / "data" / "models",
outputs_dir=root_dir / "data" / "outputs",
logs_dir=root_dir / "data" / "logs",
checkpoints_dir=root_dir / "data" / "checkpoints",
)
def load_config(self, config_path: Optional[Path] = None) -> PipelineConfig:
"""Load configuration from file"""
if config_path:
self.config_path = config_path
if not self.config_path.exists():
logging.warning(
f"Config file not found: {self.config_path}. Using defaults."
)
return self._create_default_config()
try:
with open(self.config_path, "r") as f:
if self.config_path.suffix.lower() in [".yaml", ".yml"]:
config_data = yaml.safe_load(f)
else:
config_data = json.load(f)
# Ensure paths are properly set
if "paths" not in config_data:
config_data["paths"] = self.default_paths.model_dump()
self._config = PipelineConfig(**config_data)
return self._config
except Exception as e:
logging.error(f"Failed to load config from {self.config_path}: {e}")
return self._create_default_config()
def _create_default_config(self) -> PipelineConfig:
"""Create default configuration"""
return PipelineConfig(paths=self.default_paths)
def save_config(self, config: PipelineConfig, path: Optional[Path] = None):
"""Save configuration to file"""
save_path = path or self.config_path
save_path.parent.mkdir(parents=True, exist_ok=True)
config_dict = config.model_dump()
# Convert Path objects to strings for serialization
if "paths" in config_dict:
for key, value in config_dict["paths"].items():
if isinstance(value, Path):
config_dict["paths"][key] = str(value)
try:
with open(save_path, "w") as f:
if save_path.suffix.lower() in [".yaml", ".yml"]:
yaml.dump(config_dict, f, default_flow_style=False, indent=2)
else:
json.dump(config_dict, f, indent=2)
logging.info(f"Configuration saved to {save_path}")
except Exception as e:
logging.error(f"Failed to save config to {save_path}: {e}")
def get_config(self) -> PipelineConfig:
"""Get current configuration, loading if necessary"""
if self._config is None:
self._config = self.load_config()
return self._config
def update_config(self, updates: Dict[str, Any]):
"""Update configuration with new values"""
config = self.get_config()
# Deep update configuration
config_dict = config.model_dump()
self._deep_update(config_dict, updates)
self._config = PipelineConfig(**config_dict)
def _deep_update(self, base_dict: Dict, update_dict: Dict):
"""Recursively update nested dictionaries"""
for key, value in update_dict.items():
if (
key in base_dict
and isinstance(base_dict[key], dict)
and isinstance(value, dict)
):
self._deep_update(base_dict[key], value)
else:
base_dict[key] = value
def get_environment_config(self, env: str) -> PipelineConfig:
"""Load environment-specific configuration"""
env_config_path = self.config_path.parent / f"pipeline.{env}.yaml"
if env_config_path.exists():
base_config = self.load_config()
env_config = self.load_config(env_config_path)
# Merge configurations
base_dict = base_config.model_dump()
env_dict = env_config.model_dump()
self._deep_update(base_dict, env_dict)
return PipelineConfig(**base_dict)
return self.get_config()
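update_config merges nested dictionaries key by key rather than replacing whole sections. A small sketch of the effect (values are illustrative):

from ners.core.config.config_manager import ConfigManager

manager = ConfigManager()
manager.update_config({"processing": {"batch_size": 500}, "debug": False})

cfg = manager.get_config()
# batch_size changed; sibling keys such as max_workers keep their previous values
print(cfg.processing.batch_size, cfg.processing.max_workers)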
+32
@@ -0,0 +1,32 @@
from typing import Dict, Optional
from pydantic import BaseModel, Field
class DataConfig(BaseModel):
"""Data handling configuration"""
input_file: str = "names.csv"
output_files: Dict[str, str] = Field(
default_factory=lambda: {
"featured": "names_featured.csv",
"evaluation": "names_evaluation.csv",
"engineered": "names_engineered.csv",
"males": "names_males.csv",
"females": "names_females.csv",
"ner_data": "names_ner.json",
"ner_spacy": "names_ner.spacy",
}
)
selected_columns: list[str] = Field(default_factory=lambda: ["name", "sex", "region"])
split_evaluation: bool = False
split_by_province: bool = True
split_by_gender: bool = True
split_ner_data: bool = True
evaluation_fraction: float = 0.2
random_seed: int = 42
# Dataset size limiting options
max_dataset_size: Optional[int] = None
balance_by_sex: bool = False
+13
@@ -0,0 +1,13 @@
from pydantic import BaseModel
class LoggingConfig(BaseModel):
"""Logging configuration"""
level: str = "INFO"
format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
file_logging: bool = True
console_logging: bool = True
log_file: str = "pipeline.log"
max_log_size: int = 10 * 1024 * 1024 # 10MB
backup_count: int = 5
+29
@@ -0,0 +1,29 @@
from pydantic import BaseModel, ConfigDict
from ners.core.config.annotation_config import AnnotationConfig
from ners.core.config.data_config import DataConfig
from ners.core.config.logging_config import LoggingConfig
from ners.core.config.processing_config import ProcessingConfig
from ners.core.config.project_paths import ProjectPaths
class PipelineConfig(BaseModel):
"""Main pipeline configuration"""
name: str = "drc_names_pipeline"
version: str = "1.0.0"
description: str = "DRC Names NLP Processing Pipeline"
paths: ProjectPaths
stages: list[str] = []
processing: ProcessingConfig = ProcessingConfig()
annotation: AnnotationConfig = AnnotationConfig()
data: DataConfig = DataConfig()
logging: LoggingConfig = LoggingConfig()
# Environment-specific settings
environment: str = "development"
debug: bool = True
model_config = ConfigDict(arbitrary_types_allowed=True)
+17
@@ -0,0 +1,17 @@
from pydantic import BaseModel, Field
class ProcessingConfig(BaseModel):
"""Data processing pipeline configuration"""
batch_size: int = 1000
max_workers: int = 4
checkpoint_interval: int = 5
use_multiprocessing: bool = False
encoding_options: list[str] = Field(
default_factory=lambda: ["utf-8", "utf-16", "latin1"]
)
chunk_size: int = 100_000
epochs: int = 2
+26
@@ -0,0 +1,26 @@
from pathlib import Path
from pydantic import BaseModel, ConfigDict, field_validator
class ProjectPaths(BaseModel):
"""Project directory structure configuration"""
root_dir: Path
data_dir: Path
models_dir: Path
outputs_dir: Path
logs_dir: Path
configs_dir: Path
checkpoints_dir: Path
model_config = ConfigDict(arbitrary_types_allowed=True)
@field_validator("*", mode="before")
@classmethod
def convert_to_path(cls, v):
return Path(v) if not isinstance(v, Path) else v
def get_data_path(self, filename: str) -> Path:
return self.data_dir / filename
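Because the validator runs in "before" mode on every field, plain strings from YAML or JSON are accepted and coerced to Path. A small sketch:

from ners.core.config.project_paths import ProjectPaths

paths = ProjectPaths(
    root_dir=".",
    data_dir="data/dataset",
    models_dir="data/models",
    outputs_dir="data/outputs",
    logs_dir="data/logs",
    configs_dir="config",
    checkpoints_dir="data/checkpoints",
)
print(type(paths.data_dir).__name__)     # PosixPath / WindowsPath
print(paths.get_data_path("names.csv"))  # data/dataset/names.csv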
+46
@@ -0,0 +1,46 @@
import logging
from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from ners.core.config import PipelineConfig
@contextmanager
def temporary_config_override(**overrides):
"""Context manager for temporarily overriding configuration"""
from ners.core.config import get_config
config = get_config()
original_values = {}
# Store original values and apply overrides
for key, value in overrides.items():
if hasattr(config, key):
original_values[key] = getattr(config, key)
setattr(config, key, value)
try:
yield config
finally:
# Restore original values
for key, value in original_values.items():
setattr(config, key, value)
def ensure_directories(config: "PipelineConfig") -> None:
"""Ensure all required directories exist"""
directories = [
config.paths.data_dir,
config.paths.models_dir,
config.paths.outputs_dir,
config.paths.logs_dir,
config.paths.configs_dir,
config.paths.checkpoints_dir,
]
for directory in directories:
Path(directory).mkdir(parents=True, exist_ok=True)
logging.info("Ensured all required directories exist")
+174
@@ -0,0 +1,174 @@
import gc
import logging
from pathlib import Path
from typing import Optional, Union, Iterator, Dict
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
OPTIMIZED_DTYPES = {
# Numeric columns with appropriate bit-width
"year": "Int16", # Years fit in 16-bit integer
"words": "Int8", # Word counts typically < 128
"length": "Int16", # Name lengths fit in 16-bit
"annotated": "Int8", # Binary flag (0/1)
"ner_tagged": "Int8", # Binary flag (0/1)
# Categorical columns (memory efficient for repeated values)
"sex": "category",
"province": "category",
"region": "category",
"identified_category": "category",
"transformation_type": "category",
# String columns with proper string dtype
"name": "string",
"probable_native": "string",
"probable_surname": "string",
"identified_name": "string",
"identified_surname": "string",
"ner_entities": "string",
}
class DataLoader:
"""Reusable data loading utilities"""
def __init__(self, config: PipelineConfig, custom_dtypes: Optional[Dict] = None):
self.config = config
self.dtypes = {**OPTIMIZED_DTYPES, **(custom_dtypes or {})}
def load_csv_chunked(
self, filepath: Union[str, Path], chunk_size: Optional[int] = None
) -> Iterator[pd.DataFrame]:
"""Load CSV file in chunks for memory efficiency"""
chunk_size = chunk_size or self.config.processing.chunk_size
encodings = self.config.processing.encoding_options
filepath = Path(filepath)
for encoding in encodings:
try:
logging.info(f"Reading {filepath} with encoding: {encoding}")
# Read with optimal dtypes
chunk_iter = pd.read_csv(
filepath,
encoding=encoding,
chunksize=chunk_size,
on_bad_lines="skip",
dtype=self.dtypes,
)
for i, chunk in enumerate(chunk_iter):
logging.debug(f"Processing optimized chunk {i + 1}")
yield chunk
logging.info(f"Successfully read {filepath} with encoding: {encoding}")
return
except Exception as e:
logging.warning(f"Failed with encoding {encoding}: {e}")
continue
raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")
def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
"""Load complete CSV with memory optimization"""
chunks = []
for chunk in self.load_csv_chunked(filepath):
chunks.append(chunk)
if not chunks:
return pd.DataFrame()
logging.info(f"Concatenating {len(chunks)} optimized chunks")
df = pd.concat(chunks, ignore_index=True, copy=False)
# Cleanup chunks from memory
del chunks
gc.collect()
# Apply dataset size limiting if configured
if self.config.data.max_dataset_size is not None:
df = self._limit_dataset_size(df)
return df
def _limit_dataset_size(self, df: pd.DataFrame) -> pd.DataFrame:
"""Limit dataset size with optional sex balancing"""
max_size = self.config.data.max_dataset_size
if max_size is None or len(df) <= max_size:
return df
if self.config.data.balance_by_sex and "sex" in df.columns:
return self._balanced_sample(df, max_size)
else:
# Simple random sampling
return df.sample(n=max_size, random_state=self.config.data.random_seed)
def _balanced_sample(self, df: pd.DataFrame, max_size: int) -> pd.DataFrame:
"""Sample data with balanced sex distribution"""
# Get unique sex values
sex_values = df["sex"].dropna().unique()
if len(sex_values) == 0:
logging.warning(
"No valid values found in 'sex' column, using random sampling"
)
return df.sample(n=max_size, random_state=self.config.data.random_seed)
# Calculate samples per sex category
samples_per_sex = max_size // len(sex_values)
remaining_samples = max_size % len(sex_values)
balanced_samples = []
for i, sex in enumerate(sex_values):
# Use boolean indexing instead of creating temporary DataFrames
sex_mask = df["sex"] == sex
sex_indices = df[sex_mask].index
# Distribute remaining samples to first categories
current_samples = samples_per_sex + (1 if i < remaining_samples else 0)
current_samples = min(current_samples, len(sex_indices))
if current_samples > 0:
# Sample indices instead of DataFrame
sampled_indices = pd.Series(sex_indices).sample(
n=current_samples, random_state=self.config.data.random_seed + i
)
balanced_samples.extend(sampled_indices.tolist())
logging.info(f"Sampled {current_samples} records for sex '{sex}'")
if not balanced_samples:
logging.warning(
"No balanced samples could be created, using random sampling"
)
return df.sample(n=max_size, random_state=self.config.data.random_seed)
# Create result using iloc with indices (no copying until final step)
result = df.iloc[balanced_samples].copy()
# Shuffle the final result
result = result.sample(
frac=1, random_state=self.config.data.random_seed
).reset_index(drop=True)
logging.info(
f"Created balanced dataset with {len(result)} records from {len(df)} total"
)
return result
@classmethod
def save_csv(
cls, df: pd.DataFrame, filepath: Union[str, Path], create_dirs: bool = True
) -> None:
"""Save DataFrame to CSV with proper handling"""
filepath = Path(filepath)
if create_dirs:
filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath, index=False, encoding="utf-8", sep=",", quoting=1)
logging.info(f"Saved {len(df)} rows to {filepath}")
+24
@@ -0,0 +1,24 @@
from ners.core.config.pipeline_config import PipelineConfig
class PromptManager:
"""Manage prompts for LLM operations"""
def __init__(self, config: PipelineConfig):
self.config = config
self.prompts_dir = self.config.paths.configs_dir / "prompts"
def load_prompt(self, prompt_name: str = "default") -> str:
"""Load a prompt template"""
prompt_file = self.prompts_dir / f"{prompt_name}.txt"
if not prompt_file.exists():
# Fallback to root directory
fallback_file = self.config.paths.root_dir / "prompt.txt"
if fallback_file.exists():
prompt_file = fallback_file
else:
raise FileNotFoundError(f"Prompt file not found: {prompt_file}")
with open(prompt_file, "r", encoding="utf-8") as f:
return f.read().strip()
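A small sketch of prompt resolution, assuming config/prompts/default.txt exists (with <root>/prompt.txt as the fallback):

from ners.core.config import setup_config
from ners.core.utils.prompt_manager import PromptManager  # assumed module path

cfg = setup_config()
prompt = PromptManager(cfg).load_prompt("default")
print(prompt[:80])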
+56
@@ -0,0 +1,56 @@
import threading
import time
from dataclasses import dataclass
from queue import Queue
@dataclass
class RateLimitConfig:
"""Configuration for rate limiting LLM requests"""
requests_per_minute: int = 60
requests_per_second: int = 2
burst_limit: int = 5
class RateLimiter:
"""Thread-safe rate limiter for LLM requests"""
def __init__(self, config: RateLimitConfig):
self.config = config
self.request_times = Queue()
self.lock = threading.Lock()
self.last_request_time = 0
def wait_if_needed(self):
"""Wait if necessary to respect rate limits"""
with self.lock:
current_time = time.time()
# Check requests per second limit
time_since_last = current_time - self.last_request_time
min_interval = 1.0 / self.config.requests_per_second
if time_since_last < min_interval:
sleep_time = min_interval - time_since_last
time.sleep(sleep_time)
current_time = time.time()
# Clean old request times (older than 1 minute)
while not self.request_times.empty():
if current_time - self.request_times.queue[0] > 60:
self.request_times.get()
else:
break
# Check requests per minute limit
if self.request_times.qsize() >= self.config.requests_per_minute:
oldest_request = self.request_times.queue[0]
wait_time = 60 - (current_time - oldest_request)
if wait_time > 0:
time.sleep(wait_time)
current_time = time.time()
# Record this request
self.request_times.put(current_time)
self.last_request_time = current_time
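A small sketch of wrapping request dispatch with the limiter; each call to wait_if_needed blocks the calling thread just long enough to honor both the per-second and per-minute budgets:

from ners.core.utils.rate_limiter import RateLimitConfig, RateLimiter  # assumed module path

limiter = RateLimiter(RateLimitConfig(requests_per_minute=60, requests_per_second=2))

for prompt in ["name one", "name two", "name three"]:
    limiter.wait_if_needed()       # sleeps if we are ahead of the budget
    print("dispatching:", prompt)  # the real LLM request would go here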
+174
@@ -0,0 +1,174 @@
import unicodedata
from typing import Optional, Dict, Tuple
import pandas as pd
class RegionMapper:
"""Reusable region mapping utilities"""
def __init__(self, mapping: Optional[Dict[str, Tuple[str, str]]] = None):
self.mapping = mapping or REGION_MAPPING
# Normalize to lower-cased keys mapping to the legacy province (second tuple element)
self.mapping = {k.lower(): v[1].upper() for k, v in self.mapping.items()}
def map(self, series: pd.Series) -> pd.Series:
return series.str.lower().map(self.mapping).fillna("AUTRES")
@staticmethod
def clean_province(series: pd.Series) -> pd.Series:
return (
series.str.upper()
.str.strip()
.apply(
lambda x: (
unicodedata.normalize("NFKD", x)
.encode("ascii", errors="ignore")
.decode("utf-8")
if isinstance(x, str)
else x
)
)
)
@staticmethod
def get_provinces():
return [
"kinshasa",
"bas-congo",
"bandundu",
"katanga",
"equateur",
"orientale",
"maniema",
"nord-kivu",
"sud-kivu",
"kasai-occidental",
"kasai-oriental",
"autres",
]
# DRC Region to Province Mapping
REGION_MAPPING: Dict[str, Tuple[str, str]] = {
"bandundu": ("BANDUNDU", "BANDUNDU"),
"bandundu-1": ("BANDUNDU", "BANDUNDU"),
"bandundu-2": ("BANDUNDU", "BANDUNDU"),
"bandundu-3": ("BANDUNDU", "BANDUNDU"),
"bas-congo": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-congo-1": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-congo-2": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-fleuve": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-uele": ("BAS-UELE", "ORIENTALE"),
"bas-uele-1": ("BAS-UELE", "ORIENTALE"),
"bas-uele-2": ("BAS-UELE", "ORIENTALE"),
"cataractes": ("KONGO-CENTRAL", "BAS-CONGO"),
"equateur": ("EQUATEUR", "EQUATEUR"),
"equateur-1": ("EQUATEUR", "EQUATEUR"),
"equateur-2": ("EQUATEUR", "EQUATEUR"),
"equateur-3": ("EQUATEUR", "EQUATEUR"),
"equateur-4": ("EQUATEUR", "EQUATEUR"),
"equateur-5": ("EQUATEUR", "EQUATEUR"),
"haut-katanga": ("HAUT-KATANGA", "KATANGA"),
"haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
"haut-katanga-2": ("HAUT-KATANGA", "KATANGA"),
"haut-lomami": ("HAUT-LOMAMI", "KATANGA"),
"haut-lomami-1": ("HAUT-LOMAMI", "KATANGA"),
"haut-lomami-2": ("HAUT-LOMAMI", "KATANGA"),
"haut-uele": ("HAUT-UELE", "ORIENTALE"),
"haut-uele-1": ("HAUT-UELE", "ORIENTALE"),
"haut-uele-2": ("HAUT-UELE", "ORIENTALE"),
"ituri": ("ITURI", "ORIENTALE"),
"ituri-1": ("ITURI", "ORIENTALE"),
"ituri-2": ("ITURI", "ORIENTALE"),
"ituri-3": ("ITURI", "ORIENTALE"),
"kasai": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-1": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-2": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-ce": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-central": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-central-1": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-central-2": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-occidental": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai-occidental-1": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai-occidental-2": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai-oriental": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-1": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-2": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-3": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-orientale": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"katanga": ("KATANGA", "KATANGA"),
"katanga-1": ("KATANGA", "KATANGA"),
"katanga-2": ("KATANGA", "KATANGA"),
"katanga-3": ("KATANGA", "KATANGA"),
"katanga-4": ("KATANGA", "KATANGA"),
"kinshasa": ("KINSHASA", "KINSHASA"),
"kinshasa-centre": ("KINSHASA", "KINSHASA"),
"kinshasa-est": ("KINSHASA", "KINSHASA"),
"kinshasa-funa": ("KINSHASA", "KINSHASA"),
"kinshasa-global": ("KINSHASA", "KINSHASA"),
"kinshasa-lukunga": ("KINSHASA", "KINSHASA"),
"kinshasa-mont-amba": ("KINSHASA", "KINSHASA"),
"kinshasa-ouest": ("KINSHASA", "KINSHASA"),
"kinshasa-plateau": ("KINSHASA", "KINSHASA"),
"kinshasa-tshangu": ("KINSHASA", "KINSHASA"),
"kongo-central": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-1": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-2": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-3": ("KONGO-CENTRAL", "BAS-CONGO"),
"kwango": ("KWANGO", "BANDUNDU"),
"kwango-1": ("KWANGO", "BANDUNDU"),
"kwango-2": ("KWANGO", "BANDUNDU"),
"kwilu": ("KWILU", "BANDUNDU"),
"kwilu-1": ("KWILU", "BANDUNDU"),
"kwilu-2": ("KWILU", "BANDUNDU"),
"kwilu-3": ("KWILU", "BANDUNDU"),
"lomami": ("LOMAMI", "KASAI-ORIENTAL"),
"lomami-1": ("LOMAMI", "KASAI-ORIENTAL"),
"lomami-2": ("LOMAMI", "KASAI-ORIENTAL"),
"lualaba": ("LUALABA", "KATANGA"),
"lualaba-1": ("LUALABA", "KATANGA"),
"lualaba-2": ("LUALABA", "KATANGA"),
"lualaba-74-corrige-922a": ("LUALABA", "KATANGA"),
"lukaya": ("KONGO-CENTRAL", "BAS-CONGO"),
"mai-ndombe": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-1": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
"maniema": ("MANIEMA", "MANIEMA"),
"maniema-1": ("MANIEMA", "MANIEMA"),
"maniema-2": ("MANIEMA", "MANIEMA"),
"mongala": ("MONGALA", "EQUATEUR"),
"mongala-1": ("MONGALA", "EQUATEUR"),
"mongala-2": ("MONGALA", "EQUATEUR"),
"nord-kivu": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-1": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-2": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-3": ("NORD-KIVU", "NORD-KIVU"),
"nord-ubangi": ("NORD-UBANGI", "EQUATEUR"),
"nord-ubangi-1": ("NORD-UBANGI", "EQUATEUR"),
"nord-ubangi-2": ("NORD-UBANGI", "EQUATEUR"),
"province-orientale": ("ORIENTALE", "ORIENTALE"),
"province-orientale-1": ("ORIENTALE", "ORIENTALE"),
"province-orientale-2": ("ORIENTALE", "ORIENTALE"),
"province-orientale-3": ("ORIENTALE", "ORIENTALE"),
"province-orientale-4": ("ORIENTALE", "ORIENTALE"),
"sankuru": ("SANKURU", "KASAI-ORIENTAL"),
"sankuru-1": ("SANKURU", "KASAI-ORIENTAL"),
"sankuru-2": ("SANKURU", "KASAI-ORIENTAL"),
"sud-kivu": ("SUD-KIVU", "SUD-KIVU"),
"sud-kivu-1": ("SUD-KIVU", "SUD-KIVU"),
"sud-kivu-2": ("SUD-KIVU", "SUD-KIVU"),
"sud-kivu-3": ("SUD-KIVU", "SUD-KIVU"),
"sud-ubangi": ("SUD-UBANGI", "EQUATEUR"),
"sud-ubangi-1": ("SUD-UBANGI", "EQUATEUR"),
"sud-ubangi-2": ("SUD-UBANGI", "EQUATEUR"),
"tanganyika": ("TANGANYIKA", "KATANGA"),
"tanganyika-1": ("TANGANYIKA", "KATANGA"),
"tanganyika-2": ("TANGANYIKA", "KATANGA"),
"tshopo": ("TSHOPO", "ORIENTALE"),
"tshopo-1": ("TSHOPO", "ORIENTALE"),
"tshopo-2": ("TSHOPO", "ORIENTALE"),
"tshuapa": ("TSHUAPA", "EQUATEUR"),
"tshuapa-1": ("TSHUAPA", "EQUATEUR"),
"tshuapa-2": ("TSHUAPA", "EQUATEUR"),
}
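A small sketch of normalizing raw labels to legacy provinces; keys are matched case-insensitively and unknown values fall back to AUTRES:

import pandas as pd

from ners.core.utils.region_mapper import RegionMapper  # assumed module path

mapper = RegionMapper()
raw = pd.Series(["kinshasa-est", "Tshopo-1", "somewhere-else"])
print(mapper.map(raw).tolist())  # ['KINSHASA', 'ORIENTALE', 'AUTRES']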
+41
@@ -0,0 +1,41 @@
import json
import logging
from typing import Dict, Any
from ners.core.config.pipeline_config import PipelineConfig
class StateManager:
"""Manage pipeline state and checkpoints"""
def __init__(self, config: PipelineConfig):
self.config = config
self.checkpoints_dir = self.config.paths.checkpoints_dir
def save_state(self, state: Dict[str, Any], state_name: str) -> None:
"""Save pipeline state"""
self.checkpoints_dir.mkdir(parents=True, exist_ok=True)
state_file = self.checkpoints_dir / f"{state_name}.json"
with open(state_file, "w") as f:
json.dump(state, f, indent=2, default=str)
logging.debug(f"Saved state to {state_file}")
def load_state(self, state_name: str) -> Dict[str, Any]:
"""Load pipeline state"""
state_file = self.checkpoints_dir / f"{state_name}.json"
if not state_file.exists():
return {}
with open(state_file, "r") as f:
return json.load(f)
def clear_state(self, state_name: str) -> None:
"""Clear pipeline state"""
state_file = self.checkpoints_dir / f"{state_name}.json"
if state_file.exists():
state_file.unlink()
logging.info(f"Cleared state: {state_name}")
+37
@@ -0,0 +1,37 @@
from typing import Optional, Dict
import pandas as pd
class TextCleaner:
"""Reusable text cleaning utilities"""
def __init__(self, patterns: Optional[Dict[str, str]] = None):
self.patterns = patterns or {
"null_bytes": "\x00",
"non_breaking_spaces": "\u00a0",
"multiple_spaces": r" +",
"extra_whitespace": r"\s+",
}
def clean_text_series(self, series: pd.Series) -> pd.Series:
"""Clean a pandas Series of text data"""
cleaned = series.astype(str)
# Apply cleaning patterns; the whitespace patterns are regexes, the rest are literals
for pattern_name, pattern in self.patterns.items():
is_regex = pattern_name in ("multiple_spaces", "extra_whitespace")
cleaned = cleaned.str.replace(pattern, " ", regex=is_regex)
return cleaned.str.strip().str.lower()
def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
"""Clean all text columns in a DataFrame"""
df = df.copy()
columns = df.select_dtypes(include=["object", "string"]).columns
for col in columns:
df[col] = self.clean_text_series(df[col])
return df
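A small sketch of the cleaning pass: null bytes and non-breaking spaces become spaces, whitespace runs collapse, and the result is lower-cased and stripped:

import pandas as pd

from ners.core.utils.text_cleaner import TextCleaner  # assumed module path

cleaner = TextCleaner()
s = pd.Series(["  KASONGO\u00a0WA   KANEMA ", "MBUYI\x00TSHIALA"])
print(cleaner.clean_text_series(s).tolist())
# ['kasongo wa kanema', 'mbuyi tshiala']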
+75
@@ -0,0 +1,75 @@
#!/usr/bin/env python3
import logging
from ners.core.utils.data_loader import DataLoader
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.pipeline import Pipeline
from ners.processing.steps.data_cleaning_step import DataCleaningStep
from ners.processing.steps.data_selection_step import DataSelectionStep
from ners.processing.steps.data_splitting_step import DataSplittingStep
from ners.processing.steps.llm_annotation_step import LLMAnnotationStep
from ners.processing.steps.ner_annotation_step import NERAnnotationStep
from ners.processing.steps.feature_extraction_step import FeatureExtractionStep
def create_pipeline(config) -> Pipeline:
batch_config = BatchConfig(
batch_size=config.processing.batch_size,
max_workers=config.processing.max_workers,
checkpoint_interval=config.processing.checkpoint_interval,
use_multiprocessing=config.processing.use_multiprocessing,
)
pipeline = Pipeline(batch_config)
steps = [
DataCleaningStep(config),
FeatureExtractionStep(config),
DataSelectionStep(config),
NERAnnotationStep(config),
LLMAnnotationStep(config),
]
for stage in config.stages:
for step in steps:
if step.name == stage:
pipeline.add_step(step)
return pipeline
def run_pipeline(config) -> int:
try:
logging.info(f"Starting pipeline: {config.name} v{config.version}")
# Load input data
input_file_path = config.paths.get_data_path(config.data.input_file)
if not input_file_path.exists():
logging.error(f"Input file not found: {input_file_path}")
return 1
data_loader = DataLoader(config)
data_splitter = DataSplittingStep(config)
logging.info(f"Loading data from {input_file_path}")
df = data_loader.load_csv_complete(input_file_path)
logging.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
# Create and run pipeline
pipeline = create_pipeline(config)
data_splitter.split(pipeline.run(df))
# Show completion statistics
progress = pipeline.get_progress()
logging.info("=== Pipeline Completion Summary ===")
for step_name, stats in progress.items():
logging.info(
f"{step_name}: {stats['completion_percentage']:.1f}% "
f"({stats['processed_batches']}/{stats['total_batches']} batches)"
)
if stats["failed_batches"] > 0:
logging.warning(f" {stats['failed_batches']} failed batches")
logging.info("Pipeline completed successfully")
return 0
except Exception as e:
logging.error(f"Pipeline failed: {e}", exc_info=True)
return 1
+14
@@ -0,0 +1,14 @@
#!/usr/bin/env python3
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
def status(*, detailed: bool = False) -> None:
PipelineMonitor().print_status(detailed=detailed)
def clean_step(step: str, *, keep_last: int = 1) -> None:
PipelineMonitor().clean_step_checkpoints(step, keep_last)
def reset_step(step: str) -> None:
PipelineMonitor().reset_step(step)
+80
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
import logging
import os
from pathlib import Path
from ners.core.config import PipelineConfig
from ners.processing.ner.name_builder import NameBuilder
from ners.processing.ner.name_engineering import NameEngineering
from ners.processing.ner.name_model import NameModel
def feature(config: PipelineConfig):
NameEngineering(config).compute()
def build(config: PipelineConfig):
NameBuilder(config).build()
def train(config: PipelineConfig):
name_model = NameModel(config)
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
if not data_path.exists():
logging.info("NER data not found. Building dataset first...")
build(config)
name_model.create_blank_model("fr")
data = name_model.load_data(str(data_path))
split_idx = int(len(data) * 0.9)
train_data, eval_data = data[:split_idx], data[split_idx:]
logging.info(
f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}"
)
name_model.train(
data=train_data,
epochs=config.processing.epochs,
batch_size=config.processing.batch_size,
dropout_rate=0.3,
)
evaluation_results = name_model.evaluate(eval_data)
model_path = name_model.save()
logging.info(f"Model saved to: {model_path}")
print(f"Evaluation results: {evaluation_results}")
def run_pipeline(config: PipelineConfig, reset: bool = False):
if not reset and os.path.exists(
config.paths.get_data_path(config.data.output_files["engineered"])
):
logging.info("Step 1: Feature engineering already done.")
else:
logging.info("Step 1: Running feature engineering")
feature(config)
if not reset and os.path.exists(
config.paths.get_data_path(config.data.output_files["ner_data"])
):
logging.info("Step 2: NER dataset already built.")
else:
logging.info("Step 2: Building NER dataset")
build(config)
logging.info("Step 3: Training NER Model")
train(config)
return 0
def main() -> int:
logging.error("This module is no longer a CLI. Use 'ners ner ...' instead.")
return 1
+13
@@ -0,0 +1,13 @@
from dataclasses import dataclass
@dataclass
class BatchConfig:
"""Configuration for batch processing"""
batch_size: int = 1000
max_workers: int = 4
checkpoint_interval: int = 5 # Save checkpoint every N batches
use_multiprocessing: bool = False  # Use ProcessPoolExecutor instead of ThreadPoolExecutor
@@ -0,0 +1,173 @@
import logging
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from typing import Iterator
import pandas as pd
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.batch.memory_monitor import MemoryMonitor
from ners.processing.steps import PipelineStep
class BatchProcessor:
"""Handles batch processing with concurrency and checkpointing"""
def __init__(self, config: BatchConfig):
self.config = config
self.memory_monitor = MemoryMonitor()
def create_batches(self, df: pd.DataFrame) -> Iterator[tuple[pd.DataFrame, int]]:
"""Create batches from DataFrame without unnecessary copies"""
total_rows = len(df)
batch_size = self.config.batch_size
for i in range(0, total_rows, batch_size):
batch = df.iloc[i : i + batch_size]
batch_id = i // batch_size
yield batch, batch_id
def process_sequential(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Memory-optimized sequential processing"""
results = []
memory_threshold_mb = 1000 # Clean memory when usage exceeds 1 GB
for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
if step.batch_exists(batch_id):
logging.info(
f"Batch {batch_id} already processed, loading from checkpoint"
)
processed_batch = step.load_batch(batch_id)
else:
try:
# Only copy if the processing step requires mutation
if step.requires_batch_mutation:
batch_copy = batch.copy()
processed_batch = step.process_batch(batch_copy, batch_id)
else:
processed_batch = step.process_batch(batch, batch_id)
step.save_batch(processed_batch, batch_id)
step.state.processed_batches += 1
except Exception as e:
logging.error(f"Failed to process batch {batch_id}: {e}")
step.state.failed_batches.append(batch_id)
continue
results.append(processed_batch)
# Memory management
if batch_num % self.config.checkpoint_interval == 0:
current_memory = self.memory_monitor.get_memory_usage_mb()
if current_memory > memory_threshold_mb:
logging.info(f"Memory cleanup triggered at {current_memory:.1f} MB")
self.memory_monitor.cleanup_memory()
# Save state periodically
if batch_id % self.config.checkpoint_interval == 0:
step.save_state()
# Final memory cleanup before concatenation
self.memory_monitor.cleanup_memory()
self.memory_monitor.log_memory_usage("before_concat")
result = self._safe_concat(results) if results else pd.DataFrame()
# Final cleanup
del results
self.memory_monitor.cleanup_memory()
self.memory_monitor.log_memory_usage("sequential_complete")
return result
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Memory-optimized concurrent processing"""
executor_class = (
ProcessPoolExecutor
if self.config.use_multiprocessing
else ThreadPoolExecutor
)
results = {}
with executor_class(max_workers=self.config.max_workers) as executor:
# Submit all batches
future_to_batch = {}
for batch, batch_id in self.create_batches(df):
if step.batch_exists(batch_id):
logging.info(
f"Batch {batch_id} already processed, loading from checkpoint"
)
results[batch_id] = step.load_batch(batch_id)
else:
# Only copy if necessary for concurrent processing
batch_copy = batch.copy() if step.requires_batch_mutation else batch
future = executor.submit(step.process_batch, batch_copy, batch_id)
future_to_batch[future] = (batch_id, batch)
# Collect results as they complete
for future in as_completed(future_to_batch):
batch_id, batch = future_to_batch[future]
try:
processed_batch = future.result()
step.save_batch(processed_batch, batch_id)
results[batch_id] = processed_batch
step.state.processed_batches += 1
logging.info(f"Completed batch {batch_id}")
except Exception as e:
logging.error(f"Failed to process batch {batch_id}: {e}")
step.state.failed_batches.append(batch_id)
# Memory-efficient reassembly
ordered_results = []
for batch_id in sorted(results.keys()):
ordered_results.append(results[batch_id])
step.save_state()
# Cleanup before concat
del results
self.memory_monitor.cleanup_memory()
result = (
self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
)
# Final cleanup
del ordered_results
self.memory_monitor.cleanup_memory()
return result
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Process data using the configured strategy"""
step.state.total_batches = (
len(df) + self.config.batch_size - 1
) // self.config.batch_size
step.load_state()
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
self.memory_monitor.log_memory_usage("process_start")
if self.config.max_workers == 1:
result = self.process_sequential(step, df)
else:
result = self.process_concurrent(step, df)
self.memory_monitor.log_memory_usage("process_complete")
return result
def _safe_concat(self, dfs: list) -> pd.DataFrame:
"""Memory-safe concatenation with monitoring"""
if not dfs:
return pd.DataFrame()
memory = self.memory_monitor.get_memory_usage_mb()
logging.info(f"Starting concat of {len(dfs)} DataFrames at {memory:.1f} MB")
# Use copy=False to avoid unnecessary copying during concat
result = pd.concat(dfs, ignore_index=True, copy=False)
# Monitor memory after concat
memory = self.memory_monitor.get_memory_usage_mb()
logging.info(f"Concat complete. Memory: {memory:.1f} MB")
return result
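A minimal sketch of driving a single step through the processor; max_workers == 1 selects the sequential path, anything higher the concurrent one, and already-checkpointed batches are loaded instead of recomputed. The step wiring here is illustrative:

import pandas as pd

from ners.core.config import setup_config
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.batch.batch_processor import BatchProcessor  # assumed module path
from ners.processing.steps.data_cleaning_step import DataCleaningStep

cfg = setup_config()
processor = BatchProcessor(BatchConfig(batch_size=1000, max_workers=1))
df = pd.DataFrame({"name": ["KASONGO WA KANEMA"], "sex": ["M"], "region": ["kinshasa"]})
result = processor.process(DataCleaningStep(cfg), df)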
@@ -0,0 +1,25 @@
import gc
import logging
import psutil
class MemoryMonitor:
"""Monitor and manage memory usage during batch processing"""
@staticmethod
def get_memory_usage_mb() -> float:
"""Get current memory usage in MB"""
process = psutil.Process()
return process.memory_info().rss / 1024 / 1024
@staticmethod
def cleanup_memory():
"""Force garbage collection"""
gc.collect()
@staticmethod
def log_memory_usage(step_name: str):
"""Log current memory usage"""
memory_mb = MemoryMonitor.get_memory_usage_mb()
logging.info(f"Memory usage after {step_name}: {memory_mb:.1f} MB")
@@ -0,0 +1,196 @@
import json
import logging
import shutil
from datetime import datetime
from typing import Optional, Dict
from ners.core.config.config_manager import ConfigManager
from ners.core.config.project_paths import ProjectPaths
class PipelineMonitor:
"""Monitor and manage pipeline execution"""
def __init__(self, paths: Optional[ProjectPaths] = None):
if paths is None:
# Use default configuration if none provided
config_manager = ConfigManager()
paths = config_manager.default_paths
self.paths = paths
self.checkpoint_dir = paths.checkpoints_dir
self.steps = [
"data_cleaning",
"data_selection",
"feature_extraction",
"ner_annotation",
"llm_annotation",
"data_splitting",
]
def get_step_status(self, step_name: str) -> Dict:
"""Get status of a specific pipeline step"""
step_dir = self.checkpoint_dir / step_name
state_file = step_dir / "pipeline_state.json"
if not state_file.exists():
return {
"step": step_name,
"status": "not_started",
"processed_batches": 0,
"total_batches": 0,
"failed_batches": 0,
"completion_percentage": 0.0,
}
try:
with open(state_file, "r") as f:
state = json.load(f)
processed = state.get("processed_batches", 0)
total = state.get("total_batches", 0)
failed = len(state.get("failed_batches", []))
if total == 0:
completion = 0.0
status = "not_started"
elif processed >= total:
completion = 100.0
status = "completed" if failed == 0 else "completed_with_errors"
else:
completion = (processed / total) * 100
status = "in_progress"
return {
"step": step_name,
"status": status,
"processed_batches": processed,
"total_batches": total,
"failed_batches": failed,
"completion_percentage": completion,
"last_checkpoint": state.get("last_checkpoint"),
"failed_batch_ids": state.get("failed_batches", []),
}
except Exception as e:
logging.error(f"Error reading state for {step_name}: {e}")
return {"step": step_name, "status": "error", "error": str(e)}
def get_pipeline_status(self) -> Dict:
"""Get overall pipeline status"""
step_statuses = {}
overall_status = "not_started"
total_completion = 0.0
for step in self.steps:
status = self.get_step_status(step)
step_statuses[step] = status
if status["status"] == "error":
overall_status = "error"
elif status["status"] in ["in_progress"]:
overall_status = "in_progress"
elif status["status"] == "completed_with_errors":
overall_status = "completed_with_errors"
total_completion += status.get("completion_percentage", 0)
avg_completion = total_completion / len(self.steps)
if avg_completion >= 100 and overall_status not in [
"error",
"completed_with_errors",
]:
overall_status = "completed"
return {
"overall_status": overall_status,
"overall_completion": avg_completion,
"steps": step_statuses,
"timestamp": datetime.now().isoformat(),
}
def print_status(self, detailed: bool = False):
"""Print pipeline status in a human-readable format"""
status = self.get_pipeline_status()
print("\n=== Pipeline Status ===")
print(f"Overall Status: {status['overall_status'].upper()}")
print(f"Overall Completion: {status['overall_completion']:.1f}%")
print(f"Last Updated: {status['timestamp']}")
print()
for step_name, step_status in status["steps"].items():
print(f"{step_name.replace('_', ' ').title()}:")
print(f" Status: {step_status['status']}")
print(f" Progress: {step_status['completion_percentage']:.1f}%")
print(
f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}"
)
if step_status["failed_batches"] > 0:
print(f" Failed Batches: {step_status['failed_batches']}")
if detailed and "failed_batch_ids" in step_status:
print(f" Failed Batch IDs: {step_status['failed_batch_ids']}")
print()
def count_checkpoint_files(self) -> Dict:
"""Count checkpoint files for each step"""
counts = {}
total_size = 0
for step in self.steps:
step_dir = self.checkpoint_dir / step
if step_dir.exists():
csv_files = list(step_dir.glob("*.csv"))
step_size = sum(f.stat().st_size for f in csv_files)
counts[step] = {
"files": len(csv_files),
"size_mb": step_size / (1024 * 1024),
}
total_size += step_size
else:
counts[step] = {"files": 0, "size_mb": 0}
counts["total_size_mb"] = total_size / (1024 * 1024)
return counts
def clean_step_checkpoints(self, step_name: str, keep_last: int = 1):
"""Clean checkpoint files for a specific step"""
step_dir = self.checkpoint_dir / step_name
if not step_dir.exists():
logging.info(f"No checkpoints found for {step_name}")
return
csv_files = sorted(step_dir.glob("batch_*.csv"))
if len(csv_files) <= keep_last:
logging.info(
f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all"
)
return
files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files
for file_path in files_to_delete:
try:
file_path.unlink()
logging.info(f"Deleted {file_path}")
except Exception as e:
logging.error(f"Failed to delete {file_path}: {e}")
def reset_step(self, step_name: str):
"""Reset a pipeline step by removing its checkpoints and state"""
step_dir = self.checkpoint_dir / step_name
if step_dir.exists():
try:
shutil.rmtree(step_dir)
logging.info(f"Reset step: {step_name}")
except Exception as e:
logging.error(f"Failed to reset {step_name}: {e}")
else:
logging.info(f"Step {step_name} has no checkpoints to reset")
@@ -0,0 +1,94 @@
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple
import pandas as pd
from ners.processing.steps.feature_extraction_step import NameCategory
class BaseNameFormatter(ABC):
"""
Base class for name formatting transformations.
Contains common logic for NER tagging and attribute computation.
"""
def __init__(
self, connectors: Optional[List[str]] = None, additional_surnames: Optional[List[str]] = None
):
self.connectors = connectors or ["wa", "ya", "ka", "ba"]
self.additional_surnames = additional_surnames or [
"jean",
"paul",
"marie",
"joseph",
"pierre",
"claude",
"andre",
"michel",
"robert",
]
@classmethod
def parse_native_components(cls, native_str: str) -> List[str]:
"""Parse native name string into individual components"""
if pd.isna(native_str) or not native_str:
return []
return native_str.strip().split()
def create_ner_tags(
self, text: str, native_parts: List[str], surname: str
) -> List[Tuple[int, int, str]]:
"""Create NER entity tags for transformed text"""
entities = []
current_pos = 0
words = text.split()
for word in words:
start_pos = current_pos
end_pos = current_pos + len(word)
# Determine tag based on word content
if word in native_parts or any(
connector in word for connector in self.connectors
):
tag = "NATIVE"
elif word == surname or word in self.additional_surnames:
tag = "SURNAME"
else:
# Check if it's a compound native word or new surname
if any(part in word for part in native_parts):
tag = "NATIVE"
else:
tag = "SURNAME"
entities.append((start_pos, end_pos, tag))
current_pos = end_pos + 1 # +1 for space
return entities
@classmethod
def compute_numeric_features(cls, name: str) -> Dict:
"""Compute all derived attributes for the transformed name"""
words_count = len(name.split()) if name else 0
length = len(name) if name else 0
return {
"words": words_count,
"length": length,
"identified_category": (
NameCategory.SIMPLE.value
if words_count == 3
else NameCategory.COMPOSE.value
),
}
@abstractmethod
def transform(self, row: pd.Series) -> Dict:
"""Transform a row according to the specific format rules"""
pass
@property
@abstractmethod
def transformation_type(self) -> str:
"""Return the transformation type identifier"""
pass
@@ -0,0 +1,38 @@
import random
from typing import Dict
import pandas as pd
from ners.processing.ner.formats import BaseNameFormatter
class ConnectorFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
connector = random.choice(self.connectors)
# Connect native parts with a random connector
if len(native_parts) > 1:
connected_native = f" {connector} ".join(native_parts)
full_name = f"{connected_native} {surname}".strip()
else:
connected_native = (
f"{row['probable_native']} {connector} {row['probable_native']}".strip()
)
full_name = f"{connected_native} {surname}".strip()
return {
"name": full_name,
"probable_native": connected_native,
"identified_name": connected_native,
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return "connector_added"
@@ -0,0 +1,36 @@
import random
from typing import Dict
import pandas as pd
from ners.processing.ner.formats import BaseNameFormatter
class ExtendedSurnameFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
original_surname = (
row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
)
# Add random additional surname
additional_surname = random.choice(self.additional_surnames)
combined_surname = f"{additional_surname} {original_surname}".strip()
full_name = f"{row['probable_native']} {combined_surname}".strip()
return {
"name": full_name,
"probable_native": row["probable_native"],
"identified_name": row["probable_native"],
"probable_surname": combined_surname,
"identified_surname": combined_surname,
"ner_entities": str(
self.create_ner_tags(full_name, native_parts, combined_surname)
),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return "extended_surname"
@@ -0,0 +1,28 @@
from typing import Dict
import pandas as pd
from ners.processing.ner.formats import BaseNameFormatter
class NativeOnlyFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
# Only native components
full_name = row["probable_native"]
return {
"name": full_name,
"probable_native": row["probable_native"],
"identified_name": row["probable_native"],
"probable_surname": "",
"identified_surname": "",
"ner_entities": str(self.create_ner_tags(full_name, native_parts, "")),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return "native_only"
@@ -0,0 +1,29 @@
from typing import Dict
import pandas as pd
from ners.processing.ner.formats import BaseNameFormatter
class OriginalFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
# Keep original order: native components + surname
full_name = f"{row['probable_native']} {surname}".strip()
return {
"name": full_name,
"probable_native": row["probable_native"],
"identified_name": row["probable_native"],
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return "original"
@@ -0,0 +1,29 @@
from typing import Dict
import pandas as pd
from ners.processing.ner.formats import BaseNameFormatter
class PositionFlippedFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
# Flip order: surname + native components
full_name = f"{surname} {row['probable_native']}".strip()
return {
"name": full_name,
"probable_native": row["probable_native"],
"identified_name": row["probable_native"],
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return "position_flipped"
@@ -0,0 +1,34 @@
from typing import Dict
import pandas as pd
from ners.processing.ner.formats import BaseNameFormatter
class ReducedNativeFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
# Keep only first native component + surname
reduced_native = (
native_parts[0] if len(native_parts) > 1 else row["probable_native"]
)
full_name = f"{reduced_native} {surname}".strip()
return {
"name": full_name,
"probable_native": reduced_native,
"identified_name": reduced_native,
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(
self.create_ner_tags(full_name, [reduced_native], surname)
),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return "reduced_native"
@@ -0,0 +1,87 @@
import json
import logging
import spacy
from spacy.tokens import DocBin
from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from .name_tagger import NameTagger
class NameBuilder:
def __init__(self, config: PipelineConfig):
config = config.model_copy(deep=True)
config.data.max_dataset_size = 1_000_000
config.data.balance_by_sex = True
self.config = config
self.data_loader = DataLoader(config)
self.tagger = NameTagger()
def build(self) -> int:
filepath = self.config.paths.get_data_path(
self.config.data.output_files["engineered"]
)
df = self.data_loader.load_csv_complete(filepath)
df = df[["name", "ner_tagged", "ner_entities"]]
# Filter early
ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
if ner_df.empty:
logging.error("No NER tagged data found")
return 1
total_rows = len(df)
del df # No need to keep in memory
logging.info(f"Found {len(ner_df)} NER tagged entries")
nlp = spacy.blank("fr")
        # Use NameTagger for parsing and validation
parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
validated_entities = self.tagger.validate_entities(
ner_df["name"], parsed_entities
)
# Drop rows with no valid entities
mask = validated_entities.map(bool)
ner_df = ner_df.loc[mask]
validated_entities = validated_entities.loc[mask]
if ner_df.empty:
logging.error("No valid training examples after validation")
return 1
# Prepare training data
training_data = list(
zip(
ner_df["name"].tolist(),
[{"entities": ents} for ents in validated_entities],
)
)
        # Use NameTagger to create spaCy DocBin
docs = self.tagger.create_docs(
nlp, ner_df["name"].tolist(), validated_entities.tolist()
)
doc_bin = DocBin(docs=docs)
# Save
json_path = self.config.paths.get_data_path(
self.config.data.output_files["ner_data"]
)
spacy_path = self.config.paths.get_data_path(
self.config.data.output_files["ner_spacy"]
)
with open(json_path, "w", encoding="utf-8") as f:
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
doc_bin.to_disk(spacy_path)
        logging.info(
            f"Processed {len(training_data)} of {total_rows} rows "
            f"({total_rows - len(training_data)} filtered or invalid)"
        )
logging.info(f"Saved NER JSON to {json_path}")
logging.info(f"Saved NER spacy to {spacy_path}")
return 0
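# Illustrative sketch of reading back the artifacts written by build(); the
# filename below is an assumption standing in for the config's output_files entry.
if __name__ == "__main__":
    nlp = spacy.blank("fr")
    doc_bin = DocBin().from_disk("ner_training.spacy")  # assumed filename
    docs = list(doc_bin.get_docs(nlp.vocab))
    logging.info(f"Loaded {len(docs)} docs; first ents: {[(e.text, e.label_) for e in docs[0].ents]}")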
@@ -0,0 +1,142 @@
import gc
import random
import logging
import numpy as np
import pandas as pd
from tqdm import tqdm
from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.ner.formats.connectors_format import ConnectorFormatter
from ners.processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
from ners.processing.ner.formats.native_only_format import NativeOnlyFormatter
from ners.processing.ner.formats.original_format import OriginalFormatter
from ners.processing.ner.formats.position_flipped_format import PositionFlippedFormatter
from ners.processing.ner.formats.reduced_native_format import ReducedNativeFormatter
class NameEngineering:
"""
Feature engineering for NER dataset to prevent position-based learning
and encourage sequence characteristic learning.
"""
def __init__(self, config: PipelineConfig):
self.config = config
self.data_loader = DataLoader(config)
self.connectors = ["wa", "ya", "ka", "ba", "la"]
self.additional_surnames = [
"jean",
"paul",
"marie",
"joseph",
"pierre",
"claude",
"andre",
"michel",
"robert",
]
random.seed(self.config.data.random_seed)
np.random.seed(self.config.data.random_seed)
# Initialize format classes
self.formatters = {
"original": OriginalFormatter(self.connectors, self.additional_surnames),
"native_only": NativeOnlyFormatter(
self.connectors, self.additional_surnames
),
"position_flipped": PositionFlippedFormatter(
self.connectors, self.additional_surnames
),
"reduced_native": ReducedNativeFormatter(
self.connectors, self.additional_surnames
),
"connector_added": ConnectorFormatter(
self.connectors, self.additional_surnames
),
"extended_surname": ExtendedSurnameFormatter(
self.connectors, self.additional_surnames
),
}
def load_data(self) -> pd.DataFrame:
"""Load and filter NER-tagged data from CSV file"""
filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
df = self.data_loader.load_csv_complete(filepath)
# Filter only NER-tagged rows
ner_data = df[df["ner_tagged"] == 1].copy()
logging.info(
f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records"
)
return ner_data
def compute(self) -> None:
logging.info("Applying feature engineering transformations...")
input_filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
output_filepath = self.config.paths.get_data_path(
self.config.data.output_files["engineered"]
)
df = self.data_loader.load_csv_complete(input_filepath)
ner_df = df[df["ner_tagged"] == 1].copy()
logging.info(
f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records"
)
del df # No need to keep in memory
gc.collect()
ner_df = ner_df.sample(
frac=1, random_state=self.config.data.random_seed
).reset_index(drop=True)
total_rows = len(ner_df)
# Calculate split points
split_25_1 = int(total_rows * 0.25)
split_25_2 = int(total_rows * 0.50)
split_25_3 = int(total_rows * 0.75)
split_10_1 = int(total_rows * 0.85)
split_10_2 = int(total_rows * 0.95)
# Define transformation groups
groups = [
(0, split_25_1, "original"), # First 25%: original format
(split_25_1, split_25_2, "native_only"), # Second 25%: remove surname
(split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions
(
split_25_3,
split_10_1,
"reduced_native",
), # Fourth 10%: reduce native components
(split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors
(split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames
]
for start, end, trans_type in groups:
logging.info(f"Group {trans_type}: {start} to {end} ({end - start} rows)")
# Process each group
rows = []
for start, end, formatter_key in groups:
formatter = self.formatters[formatter_key]
for idx in tqdm(range(start, end), desc=f"Processing {formatter_key}"):
row = ner_df.iloc[idx]
transformed = formatter.transform(row)
# Keep original columns and add transformed ones
new_row = row.to_dict()
new_row.update(transformed)
rows.append(new_row)
self.data_loader.save_csv(pd.DataFrame(rows), output_filepath)
logging.info(f"Engineered dataset saved to {output_filepath}")
@@ -0,0 +1,430 @@
import ast
import json
import logging
import os
import random
from pathlib import Path
from typing import Dict, Any, List, Tuple
import spacy
from spacy.training import Example
from spacy.util import minibatch
from tqdm import tqdm
from ners.core.config.pipeline_config import PipelineConfig
class NameModel:
"""NER model trainer using spaCy for DRC names entity recognition"""
def __init__(self, config: PipelineConfig):
self.config = config
self.nlp = None
self.ner = None
self.model_path = None
self.training_stats = {}
self.evaluation_stats = {}
def create_blank_model(self, language: str = "fr") -> None:
"""Create a blank spaCy model with NER pipeline"""
logging.info(f"Creating blank {language} model for NER training")
# Prefer GPU for spaCy if available (falls back to CPU automatically)
try:
if spacy.prefer_gpu():
logging.info("spaCy GPU enabled (cupy) for NER training")
else:
logging.info("spaCy running on CPU")
except Exception as e:
logging.debug(f"spaCy GPU selection skipped: {e}")
# Create blank model - French tokenizer works well for DRC names
self.nlp = spacy.blank(language)
# Add NER pipeline component
if "ner" not in self.nlp.pipe_names:
self.ner = self.nlp.add_pipe("ner")
else:
self.ner = self.nlp.get_pipe("ner")
# Add our custom labels
self.ner.add_label("NATIVE")
self.ner.add_label("SURNAME")
logging.info("Blank model created with NATIVE and SURNAME labels")
@classmethod
def load_data(cls, data_path: str) -> List[Tuple[str, Dict]]:
"""Load training data from JSON file - compatible with NERNameTagger output format"""
if not os.path.exists(data_path):
raise FileNotFoundError(f"Training data not found at {data_path}")
logging.info(f"Loading training data from {data_path}")
with open(data_path, "r", encoding="utf-8") as f:
raw_data = json.load(f)
# Validate and clean training data
valid_data = []
skipped_count = 0
for i, item in enumerate(raw_data):
try:
if not isinstance(item, (list, tuple)) or len(item) != 2:
logging.warning(
f"Skipping invalid training example format at index {i}: {item}"
)
skipped_count += 1
continue
text, annotations = item
# Validate text
if not isinstance(text, str) or not text.strip():
logging.warning(f"Skipping invalid text at index {i}: {repr(text)}")
skipped_count += 1
continue
                # Handle different annotation formats from NameTagger
if not isinstance(annotations, dict) or "entities" not in annotations:
logging.warning(
f"Skipping invalid annotations at index {i}: {annotations}"
)
skipped_count += 1
continue
entities_raw = annotations["entities"]
# Parse entities - handle both string and list formats from tagger
if isinstance(entities_raw, str):
# String format from tagger: "[(0, 6, 'NATIVE'), ...]"
try:
entities = ast.literal_eval(entities_raw)
if not isinstance(entities, list):
logging.warning(
f"Parsed entities is not a list at index {i}: {entities}"
)
skipped_count += 1
continue
except (ValueError, SyntaxError) as e:
logging.warning(
f"Failed to parse entity string at index {i}: {entities_raw} ({e})"
)
skipped_count += 1
continue
elif isinstance(entities_raw, list):
# Already in list format
entities = entities_raw
else:
logging.warning(
f"Skipping invalid entities format at index {i}: {entities_raw}"
)
skipped_count += 1
continue
# Validate each entity
valid_entities = []
for entity in entities:
if not isinstance(entity, (list, tuple)) or len(entity) != 3:
logging.warning(
f"Skipping invalid entity format in '{text}': {entity}"
)
continue
start, end, label = entity
# Validate entity components
if (
not isinstance(start, int)
or not isinstance(end, int)
or not isinstance(label, str)
or start >= end
or start < 0
or end > len(text)
):
logging.warning(
f"Skipping invalid entity bounds in '{text}': {entity}"
)
continue
# Check for overlaps with already validated entities
has_overlap = any(
start < v_end and end > v_start
for v_start, v_end, _ in valid_entities
)
if has_overlap:
logging.warning(
f"Skipping overlapping entity in '{text}': {entity}"
)
continue
# Validate that the span doesn't contain spaces (matching tagger validation)
span_text = text[start:end]
if (
not span_text
or span_text != span_text.strip()
or " " in span_text
):
logging.warning(
f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
)
continue
valid_entities.append((start, end, label))
if not valid_entities:
logging.warning(
f"Skipping training example with no valid entities: '{text}'"
)
skipped_count += 1
continue
# Sort entities by start position
valid_entities.sort(key=lambda x: x[0])
valid_data.append((text.strip(), {"entities": valid_entities}))
except Exception as e:
logging.error(f"Error processing training example at index {i}: {e}")
skipped_count += 1
continue
logging.info(
f"Loaded {len(valid_data)} valid training examples, skipped {skipped_count} invalid ones"
)
if not valid_data:
raise ValueError("No valid training examples found in the data")
return valid_data
def train(
self,
data: List[Tuple[str, Dict]],
epochs: int = 1,
batch_size: int = 10_000,
dropout_rate: float = 0.3,
) -> None:
"""Train the NER model"""
logging.info(f"Starting NER training with {len(data)} examples")
logging.info(
f"Training parameters: epochs={epochs}, batch_size={batch_size}, dropout={dropout_rate}"
)
if self.nlp is None:
raise ValueError("Model not initialized. Call create_blank_model() first.")
# Initialize the model
self.nlp.initialize()
optimizer = self.nlp.resume_training()
losses_history = []
for epoch in range(epochs):
losses = {}
examples = []
for text, annotations in tqdm(data, desc="Create training examples"):
doc = self.nlp.make_doc(text)
examples.append(Example.from_dict(doc, annotations))
# Shuffle examples each epoch (important!)
random.shuffle(examples)
# Train in batches
batches = minibatch(examples, size=batch_size)
for batch in batches:
batch_losses = {}
self.nlp.update(
batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer
)
logging.info(
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
)
# Accumulate into total losses dict
for k, v in batch_losses.items():
losses[k] = losses.get(k, 0.0) + v
del batches # free memory
losses_history.append(losses.get("ner", 0))
logging.info(f"Epoch {epoch + 1}/{epochs}, Total Loss: {losses['ner']:.4f}")
# Store training statistics
self.training_stats = {
"epochs": epochs,
"final_loss": losses_history[-1] if losses_history else 0,
"training_examples": len(data),
"loss_history": losses_history,
"batch_size": batch_size,
"dropout_rate": dropout_rate,
}
logging.info(
f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}"
)
def evaluate(self, test_data: List[Tuple[str, Dict]]) -> Dict[str, Any]:
"""Evaluate the trained model on test data"""
if self.nlp is None:
raise ValueError("Model not trained. Call train_model() first.")
logging.info(f"Evaluating model on {len(test_data)} test examples")
total_examples = len(test_data)
correct_entities = 0
predicted_entities = 0
actual_entities = 0
entity_stats = {
"NATIVE": {"tp": 0, "fp": 0, "fn": 0},
"SURNAME": {"tp": 0, "fp": 0, "fn": 0},
}
for text, annotations in test_data:
# Get actual entities
actual_ents = set()
for start, end, label in annotations.get("entities", []):
actual_ents.add((start, end, label))
actual_entities += 1
# Get predicted entities
doc = self.nlp(text)
predicted_ents = set()
for ent in doc.ents:
predicted_ents.add((ent.start_char, ent.end_char, ent.label_))
predicted_entities += 1
# Calculate matches
matches = actual_ents.intersection(predicted_ents)
correct_entities += len(matches)
# Update per-label statistics
for start, end, label in actual_ents:
if (start, end, label) in predicted_ents:
entity_stats[label]["tp"] += 1
else:
entity_stats[label]["fn"] += 1
for start, end, label in predicted_ents:
if (start, end, label) not in actual_ents:
entity_stats[label]["fp"] += 1
# Calculate overall metrics
precision = (
correct_entities / predicted_entities if predicted_entities > 0 else 0
)
recall = correct_entities / actual_entities if actual_entities > 0 else 0
f1_score = (
2 * (precision * recall) / (precision + recall)
if (precision + recall) > 0
else 0
)
# Calculate per-label metrics
label_metrics = {}
for label, stats in entity_stats.items():
tp, fp, fn = stats["tp"], stats["fp"], stats["fn"]
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
label_f1 = (
(
2
* (label_precision * label_recall)
/ (label_precision + label_recall)
)
if (label_precision + label_recall) > 0
else 0
)
label_metrics[label] = {
"precision": label_precision,
"recall": label_recall,
"f1_score": label_f1,
"support": tp + fn,
}
self.evaluation_stats = {
"overall": {
"precision": precision,
"recall": recall,
"f1_score": f1_score,
"total_examples": total_examples,
"correct_entities": correct_entities,
"predicted_entities": predicted_entities,
"actual_entities": actual_entities,
},
"by_label": label_metrics,
}
return self.evaluation_stats
def save(self, model_name: str = "drc_ner_model") -> str:
"""Save the trained model"""
if self.nlp is None:
raise ValueError("No model to save. Train a model first.")
# Create model directory
model_dir = self.config.paths.models_dir / model_name
model_dir.mkdir(parents=True, exist_ok=True)
# Save the model
self.nlp.to_disk(model_dir)
self.model_path = str(model_dir)
# Save training and evaluation statistics
training_stats_path = model_dir / "training_stats.json"
with open(training_stats_path, "w", encoding="utf-8") as f:
json.dump(self.training_stats, f, indent=2)
evaluation_stats_path = model_dir / "evaluation_stats.json"
with open(evaluation_stats_path, "w", encoding="utf-8") as f:
json.dump(self.evaluation_stats, f, indent=2)
logging.info(f"NER Model saved to {model_dir}")
return self.model_path
def load(self, model_path: str) -> None:
"""Load a trained model"""
if not os.path.exists(model_path):
raise FileNotFoundError(f"Model not found at {model_path}")
logging.info(f"Loading model from {model_path}")
self.nlp = spacy.load(model_path)
self.ner = self.nlp.get_pipe("ner")
self.model_path = model_path
# Load training statistics if available
training_stats_path = Path(model_path) / "training_stats.json"
if training_stats_path.exists():
with open(training_stats_path, "r", encoding="utf-8") as f:
self.training_stats = json.load(f)
evaluation_stats_path = Path(model_path) / "evaluation_stats.json"
if evaluation_stats_path.exists():
with open(evaluation_stats_path, "r", encoding="utf-8") as f:
self.evaluation_stats = json.load(f)
logging.info("NER Model loaded successfully")
def predict(self, text: str) -> Dict[str, Any]:
"""Make predictions on a single text"""
if self.nlp is None:
raise ValueError("No model loaded. Load or train a model first.")
doc = self.nlp(text)
entities = []
for ent in doc.ents:
entities.append(
{
"text": ent.text,
"label": ent.label_,
"start": ent.start_char,
"end": ent.end_char,
"confidence": getattr(
ent, "score", None
), # If confidence scores are available
}
)
return {"text": text, "entities": entities}
@@ -0,0 +1,290 @@
from typing import Union, Dict, Any, List
import ast
import json
import logging
import pandas as pd
from spacy.util import filter_spans
class NameTagger:
def tag_name(
self, name: str, probable_native: str, probable_surname: str
) -> Union[Dict[str, Any], None]:
"""Create a single NER training example using probable_native and probable_surname"""
if not name or not probable_native or not probable_surname:
return None
name = name.strip()
probable_native = probable_native.strip()
probable_surname = probable_surname.strip()
entities = []
used_spans = [] # Track used character spans to prevent overlaps
# Helper function to check if a span overlaps with any existing span
def has_overlap(start, end):
for used_start, used_end in used_spans:
if not (end <= used_start or start >= used_end):
return True
return False
# Find positions of native names in the full name
native_words = probable_native.split()
name_lower = name.lower() # Use lowercase for consistent searching
processed_native_words = set()
for native_word in native_words:
native_word = native_word.strip()
if len(native_word) < 2: # Skip very short words
continue
native_word_lower = native_word.lower()
# Skip if we've already processed this exact word
if native_word_lower in processed_native_words:
continue
processed_native_words.add(native_word_lower)
# Find the first occurrence of this native word that doesn't overlap
start_pos = 0
while True:
pos = name_lower.find(
native_word_lower, start_pos
) # Case-insensitive search
if pos == -1:
break
# Calculate end position - make sure we only include the word itself
end_pos = pos + len(native_word_lower)
# Double-check that the extracted span matches exactly what we expect
extracted_text = name[pos:end_pos] # Get original case text
if extracted_text.lower() != native_word_lower:
start_pos = pos + 1
continue
# Check if this is a word boundary match and doesn't overlap
if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
pos, end_pos
):
entities.append((pos, end_pos, "NATIVE"))
used_spans.append((pos, end_pos))
break # Only take the first non-overlapping occurrence
start_pos = pos + 1
# Find position of surname in the full name
if probable_surname and len(probable_surname.strip()) >= 2:
surname_lower = probable_surname.lower()
# Find the first occurrence that doesn't overlap
start_pos = 0
while True:
pos = name_lower.find(
surname_lower, start_pos
) # Case-insensitive search
if pos == -1:
break
# Calculate end position correctly - exact match only
end_pos = pos + len(surname_lower)
# Double-check that the extracted span matches exactly what we expect
extracted_text = name[pos:end_pos] # Get original case text
if extracted_text.lower() != surname_lower:
start_pos = pos + 1
continue
if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
pos, end_pos
):
entities.append((pos, end_pos, "SURNAME"))
used_spans.append((pos, end_pos))
break
start_pos = pos + 1
if not entities:
logging.warning(
f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
)
return None
# Sort entities by position and validate
entities.sort(key=lambda x: x[0])
# Final validation - ensure no overlaps and valid spans
validated_entities = []
for start, end, label in entities:
# Check bounds
if not (0 <= start < end <= len(name)):
logging.warning(
f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
)
continue
# Check for overlaps with already validated entities
if any(
start < v_end and end > v_start
for v_start, v_end, _ in validated_entities
):
logging.warning(
f"Overlapping span ({start}, {end}, '{label}') in '{name}'"
)
continue
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
span_text = name[start:end]
if not span_text or span_text != span_text.strip() or " " in span_text:
logging.warning(
f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
)
continue
validated_entities.append((start, end, label))
if not validated_entities:
logging.warning(f"No valid entities after validation for: '{name}'")
return None
# Convert to string format that matches the dataset
entities_str = str(validated_entities)
return {
"entities": entities_str,
"spans": validated_entities, # Keep the original tuples for internal use
}
@classmethod
def _is_word_boundary_match(cls, text: str, start: int, end: int) -> bool:
"""Check if the match is at word boundaries"""
# Check character before start position
if start > 0:
prev_char = text[start - 1]
if prev_char.isalnum():
return False
# Check character after end position
if end < len(text):
next_char = text[end]
if next_char.isalnum():
return False
return True
@classmethod
def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
"""Extract the actual text for each entity type"""
result = {"NATIVE": [], "SURNAME": []}
try:
entities = ast.literal_eval(entities_str)
for start, end, label in entities:
if 0 <= start < end <= len(name):
span_text = name[start:end]
if label in result:
result[label].append(span_text)
except (ValueError, SyntaxError, TypeError):
pass
return result
@classmethod
def parse(cls, entities_str: str) -> List[tuple]:
"""Parse entity strings from various formats.
Supports formats:
- [(start, end, label), ...]
- [[start, end, label], ...]
- [{"start": start, "end": end, "label": label}, ...]
"""
if not entities_str or entities_str in ["[]", "", "nan"]:
return []
entities_str = str(entities_str).strip()
try:
if entities_str.startswith("[(") and entities_str.endswith(")]"):
return ast.literal_eval(entities_str)
elif entities_str.startswith("[[") and entities_str.endswith("]]"):
return [tuple(e) for e in ast.literal_eval(entities_str)]
elif entities_str.startswith("[{") and entities_str.endswith("}]"):
return [
(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)
]
else:
parsed = ast.literal_eval(entities_str)
return [
tuple(e)
for e in parsed
if isinstance(e, (list, tuple)) and len(e) == 3
]
except (ValueError, SyntaxError, json.JSONDecodeError):
return []
def parse_entities(self, series: pd.Series) -> pd.Series:
"""Vectorized parse of entity strings."""
return series.map(self.parse)
@classmethod
def validate(cls, text: str, entities: List[tuple]) -> List[tuple]:
"""Advanced entity validation with overlap removal.
This is more comprehensive than the basic validate_entities method.
"""
if not entities or not text:
return []
text = str(text).strip()
valid = []
for ent in entities:
if not isinstance(ent, (list, tuple)) or len(ent) != 3:
continue
start, end, label = ent
try:
start, end = int(start), int(end)
except (ValueError, TypeError):
continue
if not isinstance(label, str):
continue
if not (0 <= start < end <= len(text)):
continue
if not text[start:end].strip():
continue
valid.append((start, end, label))
if not valid:
return []
valid.sort(key=lambda x: (x[0], x[1]))
# Remove overlaps
filtered, last_end = [], -1
        for start, end, label in valid:
            if start >= last_end:
                filtered.append((start, end, label))
                last_end = end
return filtered
def validate_entities(
self, texts: pd.Series, entities_series: pd.Series
) -> pd.Series:
"""Vectorized entity validation."""
return pd.Series(map(self.validate, texts, entities_series), index=texts.index)
@classmethod
def create_docs(cls, nlp, texts: List[str], entities: List[List[tuple]]) -> List:
"""Batch create spaCy Docs from texts and entities."""
docs = []
for text, ents in zip(texts, entities):
doc = nlp(text)
spans = []
for start, end, label in ents:
span = doc.char_span(
start, end, label=label, alignment_mode="contract"
) or doc.char_span(start, end, label=label, alignment_mode="strict")
if span:
spans.append(span)
doc.ents = filter_spans(spans)
docs.append(doc)
return docs
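# Illustrative sketch of the span output produced by tag_name(); the name is
# made up and the entity offsets refer to it.
if __name__ == "__main__":
    tagger = NameTagger()
    tagged = tagger.tag_name("kabila mwamba joseph", "kabila mwamba", "joseph")
    if tagged:
        print(tagged["spans"])  # [(0, 6, 'NATIVE'), (7, 13, 'NATIVE'), (14, 20, 'SURNAME')]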
@@ -0,0 +1,57 @@
import logging
import time
from typing import Dict, Any
import pandas as pd
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.batch.batch_processor import BatchProcessor
from ners.processing.steps import PipelineStep
class Pipeline:
"""Main pipeline orchestrator"""
def __init__(self, config: BatchConfig):
self.config = config
self.processor = BatchProcessor(config)
self.steps = []
def add_step(self, step: PipelineStep):
"""Add a processing step to the pipeline"""
self.steps.append(step)
def run(self, input_data: pd.DataFrame) -> pd.DataFrame:
"""Run the complete pipeline"""
current_data = input_data.copy()
for step in self.steps:
logging.info(f"Running pipeline step: {step.name}")
start_time = time.time()
current_data = self.processor.process(step, current_data)
elapsed_time = time.time() - start_time
logging.info(f"Completed {step.name} in {elapsed_time:.2f} seconds")
if step.state.failed_batches:
logging.warning(
f"Step {step.name} had {len(step.state.failed_batches)} failed batches"
)
return current_data
def get_progress(self) -> Dict[str, Any]:
"""Get progress information for all steps"""
progress = {}
for step in self.steps:
progress[step.name] = {
"processed_batches": step.state.processed_batches,
"total_batches": step.state.total_batches,
"failed_batches": len(step.state.failed_batches),
"completion_percentage": (
step.state.processed_batches / max(1, step.state.total_batches)
)
* 100,
}
return progress
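# Illustrative wiring sketch; the step class is added elsewhere in this
# commit and its import path is assumed, as is the setup_config call.
if __name__ == "__main__":
    from ners.core.config import setup_config  # assumed import, mirrors the CLI
    from ners.processing.steps.data_cleaning_step import DataCleaningStep  # assumed path

    cfg = setup_config(config_path=None, env="development")
    pipeline = Pipeline(
        BatchConfig(batch_size=1_000, max_workers=1, checkpoint_interval=10, use_multiprocessing=False)
    )
    pipeline.add_step(DataCleaningStep(cfg))
    result = pipeline.run(pd.DataFrame({"name": ["Kabila Mwamba Joseph"], "sex": ["m"], "region": ["kinshasa"]}))
    print(pipeline.get_progress())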
@@ -0,0 +1,129 @@
import json
import logging
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional
import pandas as pd
from pydantic import BaseModel
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.batch.batch_config import BatchConfig
@dataclass
class PipelineState:
"""Tracks the state of pipeline execution"""
processed_batches: int = 0
total_batches: int = 0
    failed_batches: Optional[List[int]] = None
last_checkpoint: Optional[str] = None
def __post_init__(self):
if self.failed_batches is None:
self.failed_batches = []
class NameAnnotation(BaseModel):
"""Model for name annotation results"""
identified_name: Optional[str]
identified_surname: Optional[str]
class PipelineStep(ABC):
"""Abstract base class for pipeline steps"""
def __init__(
self,
name: str,
pipeline_config: PipelineConfig,
batch_config: Optional[BatchConfig] = None,
):
self.name = name
self.pipeline_config = pipeline_config
self.data_loader = DataLoader(pipeline_config)
# Use provided batch_config or create default from pipeline config
if batch_config is None:
batch_config = BatchConfig(
batch_size=pipeline_config.processing.batch_size,
max_workers=pipeline_config.processing.max_workers,
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
)
self.batch_config = batch_config
self.state = PipelineState()
@property
def requires_batch_mutation(self) -> bool:
"""Indicates if this step modifies the batch data"""
return False
@abstractmethod
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process a single batch of data"""
pass
def get_checkpoint_path(self, batch_id: int) -> str:
"""Get the checkpoint file path for a batch"""
checkpoint_dir = self.pipeline_config.paths.checkpoints_dir / self.name
checkpoint_dir.mkdir(parents=True, exist_ok=True)
return str(checkpoint_dir / f"batch_{batch_id:06d}.csv")
def get_state_path(self) -> str:
"""Get the state file path"""
state_dir = self.pipeline_config.paths.checkpoints_dir / self.name
state_dir.mkdir(parents=True, exist_ok=True)
return str(state_dir / "pipeline_state.json")
def save_state(self):
"""Save pipeline state to disk"""
state_file = self.get_state_path()
with open(state_file, "w") as f:
json.dump(
{
"processed_batches": self.state.processed_batches,
"total_batches": self.state.total_batches,
"failed_batches": self.state.failed_batches,
"last_checkpoint": self.state.last_checkpoint,
},
f,
)
def load_state(self) -> bool:
"""Load pipeline state from disk. Returns True if state was loaded."""
state_file = self.get_state_path()
if os.path.exists(state_file):
try:
with open(state_file, "r") as f:
state_data = json.load(f)
self.state.processed_batches = state_data.get("processed_batches", 0)
self.state.total_batches = state_data.get("total_batches", 0)
self.state.failed_batches = state_data.get("failed_batches", [])
self.state.last_checkpoint = state_data.get("last_checkpoint")
return True
except Exception as e:
logging.warning(f"Failed to load state: {e}")
return False
def batch_exists(self, batch_id: int) -> bool:
"""Check if a batch has already been processed (idempotency)"""
checkpoint_path = self.get_checkpoint_path(batch_id)
return os.path.exists(checkpoint_path)
def save_batch(self, batch: pd.DataFrame, batch_id: int):
"""Save processed batch to checkpoint"""
checkpoint_path = self.get_checkpoint_path(batch_id)
self.data_loader.save_csv(batch, checkpoint_path)
logging.info(f"Saved batch {batch_id} to {checkpoint_path}")
def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
"""Load processed batch from checkpoint"""
checkpoint_path = self.get_checkpoint_path(batch_id)
if os.path.exists(checkpoint_path):
return self.data_loader.load_csv_complete(checkpoint_path)
return None
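# Minimal concrete subclass sketch showing what the ABC above requires of a
# step; the lowercasing behaviour is illustrative, not part of this commit.
class LowercaseNamesStep(PipelineStep):
    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("lowercase_names", pipeline_config)

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        batch = batch.copy()
        batch["name"] = batch["name"].str.lower()
        return batch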
@@ -0,0 +1,31 @@
import logging
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.text_cleaner import TextCleaner
from ners.processing.steps import PipelineStep
class DataCleaningStep(PipelineStep):
"""Configuration-driven data cleaning step"""
def __init__(self, pipeline_config: PipelineConfig):
super().__init__("data_cleaning", pipeline_config)
self.text_cleaner = TextCleaner()
self.required_columns = ["name", "sex", "region"]
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process a single batch for data cleaning"""
logging.info(f"Cleaning batch {batch_id} with {len(batch)} rows")
# Drop rows with essential missing values
batch = batch.dropna(subset=self.required_columns)
# Apply text cleaning
batch = self.text_cleaner.clean_dataframe_text_columns(batch)
# Remove duplicates
batch = batch.drop_duplicates(subset=self.required_columns)
return batch
@@ -0,0 +1,60 @@
import logging
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
from ners.processing.steps import PipelineStep
class DataSelectionStep(PipelineStep):
"""Configuration-driven data selection step to keep only specified columns"""
def __init__(self, pipeline_config: PipelineConfig):
super().__init__("data_selection", pipeline_config)
self.selected_columns = pipeline_config.data.selected_columns
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process a single batch for data selection"""
logging.info(f"Selecting columns for batch {batch_id} with {len(batch)} rows")
# Remove rows where region == "global" only for specific years
if "region" in batch.columns and "year" in batch.columns:
target_years = {2015, 2021, 2022}
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(
target_years
)
removed = int(mask_remove.sum())
if removed:
batch = batch[~mask_remove]
logging.info(
f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}"
)
# Check which columns exist in the batch
available_columns = [
col for col in self.selected_columns if col in batch.columns
]
missing_columns = [
col for col in self.selected_columns if col not in batch.columns
]
if missing_columns:
logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}")
if not available_columns:
logging.error(f"No required columns found in batch {batch_id}")
return pd.DataFrame() # Return empty DataFrame if no required columns exist
# Select only the available required columns
selected_batch = batch[available_columns].copy()
logging.info(
f"Selected {len(available_columns)} columns for batch {batch_id}: {available_columns}"
)
return selected_batch
@property
def requires_batch_mutation(self) -> bool:
"""This step modifies the batch data by selecting columns"""
return True
@@ -0,0 +1,69 @@
import numpy as np
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.region_mapper import RegionMapper
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.steps import PipelineStep
from ners.processing.steps.feature_extraction_step import Gender
class DataSplittingStep(PipelineStep):
"""Configuration-driven data splitting step"""
def __init__(self, pipeline_config: PipelineConfig):
batch_config = BatchConfig(
batch_size=pipeline_config.processing.batch_size,
max_workers=1, # No need for parallelism in splitting
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
use_multiprocessing=False,
)
super().__init__("data_splitting", pipeline_config, batch_config)
self.eval_indices = None
def determine_eval_indices(self, total_size: int) -> set:
"""Determine evaluation indices consistently across batches"""
if self.eval_indices is None:
np.random.seed(self.pipeline_config.data.random_seed)
eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
self.eval_indices = set(
np.random.choice(total_size, size=eval_size, replace=False)
)
return self.eval_indices
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process batch for data splitting - no modification needed"""
return batch
def split(self, df: pd.DataFrame) -> None:
"""Save the split datasets based on configuration"""
output_files = self.pipeline_config.data.output_files
data_dir = self.pipeline_config.paths.data_dir
if self.pipeline_config.data.split_evaluation:
eval_indices = self.determine_eval_indices(len(df))
eval_mask = df.index.isin(eval_indices)
df_evaluation = df[eval_mask]
df_featured = df[~eval_mask]
self.data_loader.save_csv(
df_evaluation, data_dir / output_files["evaluation"]
)
self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
else:
self.data_loader.save_csv(df, data_dir / output_files["featured"])
if self.pipeline_config.data.split_by_province:
for province in RegionMapper.get_provinces():
df_region = df[df.province == province]
self.data_loader.save_csv(
df_region, data_dir / "provinces" / f"{province}.csv"
)
if self.pipeline_config.data.split_by_gender:
df_males = df[df.sex == Gender.MALE.value]
df_females = df[df.sex == Gender.FEMALE.value]
self.data_loader.save_csv(df_males, data_dir / output_files["males"])
self.data_loader.save_csv(df_females, data_dir / output_files["females"])
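# Illustrative check of the deterministic evaluation split: indices are drawn
# once with the configured seed and cached, so repeated calls agree. The
# setup_config call mirrors the CLI and is an assumption here.
if __name__ == "__main__":
    from ners.core.config import setup_config  # assumed import

    step = DataSplittingStep(setup_config(config_path=None, env="development"))
    first = step.determine_eval_indices(1_000)
    assert first == step.determine_eval_indices(1_000)  # cached after the first call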
@@ -0,0 +1,196 @@
import gc
import logging
from enum import Enum
from typing import Dict, Any
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.region_mapper import RegionMapper
from ners.processing.ner.name_tagger import NameTagger
from ners.processing.steps import PipelineStep
class Gender(Enum):
MALE = "m"
FEMALE = "f"
class NameCategory(Enum):
SIMPLE = "simple"
COMPOSE = "compose"
class FeatureExtractionStep(PipelineStep):
"""Configuration-driven feature extraction step"""
def __init__(self, pipeline_config: PipelineConfig):
super().__init__("feature_extraction", pipeline_config)
self.region_mapper = RegionMapper()
self.name_tagger = NameTagger()
    @property
    def requires_batch_mutation(self) -> bool:
        """This step creates new columns, so mutation is required"""
        return True
@classmethod
def validate_gender(cls, gender: str) -> Gender:
"""Validate and normalize gender value"""
gender_lower = str(gender).lower().strip()
if gender_lower in ["m", "male", "homme", "masculin"]:
return Gender.MALE
elif gender_lower in ["f", "female", "femme", "féminin"]:
return Gender.FEMALE
else:
raise ValueError(f"Unknown gender: {gender}")
@classmethod
def get_name_category(cls, word_count: int) -> NameCategory:
"""Determine name category based on word count"""
return NameCategory.SIMPLE if word_count == 3 else NameCategory.COMPOSE
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Extract features from names in batch"""
logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")
result = batch.copy()
numeric_features = self._compute_numeric_features(result["name"])
result = result.assign(**numeric_features)
# Initialize features columns with optimal dtypes
features_columns = self._initialize_features_columns(len(result))
result = result.assign(**features_columns)
self._assign_probable_names(result)
self._process_simple_names(result)
result["identified_category"] = self._assign_identified_category(
result["words"]
)
if "year" in result.columns:
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype(
"Int16"
)
if "region" in result.columns:
result["province"] = self.region_mapper.map(result["region"]).str.lower()
result["province"] = result["province"].astype("category")
if "sex" in result.columns:
result["sex"] = self._normalize_gender(result["sex"])
# Apply final dtype optimizations
result = self._optimize_dtypes(result)
# Cleanup
del numeric_features, features_columns
if batch_id % 10 == 0: # Periodic cleanup
gc.collect()
return result
@classmethod
def _compute_numeric_features(cls, series: pd.Series) -> Dict[str, pd.Series]:
"""Calculate basic features in vectorized manner"""
return {
"words": (series.str.count(" ") + 1).astype("Int8"),
"length": series.str.len().astype("Int16"),
}
@classmethod
def _initialize_features_columns(cls, size: int) -> Dict[str, Any]:
"""Initialize new columns with optimal dtypes"""
return {
"probable_native": pd.Series([None] * size, dtype="string"),
"probable_surname": pd.Series([None] * size, dtype="string"),
"identified_name": pd.Series([None] * size, dtype="string"),
"identified_surname": pd.Series([None] * size, dtype="string"),
"ner_entities": pd.Series([None] * size, dtype="string"),
"ner_tagged": pd.Series([0] * size, dtype="Int8"),
"annotated": pd.Series([0] * size, dtype="Int8"),
}
@classmethod
def _assign_probable_names(cls, df: pd.DataFrame) -> None:
"""Assign probable native and surname names efficiently"""
name_splits = df["name"].str.split()
mask = name_splits.str.len() >= 2
df.loc[mask, "probable_native"] = name_splits[mask].apply(
lambda x: " ".join(x[:-1]) if isinstance(x, list) else None
)
df.loc[mask, "probable_surname"] = name_splits[mask].apply(
lambda x: x[-1] if isinstance(x, list) else None
)
def _assign_identified_category(self, series: pd.Series) -> pd.Series:
"""Assign identified category based on word count"""
return series.map(lambda x: self.get_name_category(x).value).astype("category")
def _process_simple_names(self, df: pd.DataFrame) -> None:
"""Process 3-word names efficiently with vectorized operations"""
mask = pd.Series(df["words"] == 3)
if not mask.any():
return
df.loc[mask, "identified_name"] = df.loc[mask, "probable_native"]
df.loc[mask, "identified_surname"] = df.loc[mask, "probable_surname"]
df.loc[mask, "annotated"] = 1
# NER tagging for 3-word names
three_word_rows = df[mask]
for idx, row in three_word_rows.iterrows():
try:
entity = self.name_tagger.tag_name(
row["name"], row["identified_name"], row["identified_surname"]
)
if entity:
df.at[idx, "ner_entities"] = str(entity["entities"])
df.at[idx, "ner_tagged"] = 1
except Exception as e:
logging.warning(f"NER tagging failed for row {idx}: {e}")
@classmethod
def _normalize_gender(cls, series: pd.Series) -> pd.Series:
gender_mapping = {
"m": "m",
"male": "m",
"homme": "m",
"masculin": "m",
"f": "f",
"female": "f",
"femme": "f",
"féminin": "f",
}
# Apply mapping with error handling
normalized = series.astype(str).str.lower().str.strip().map(gender_mapping)
return normalized.astype("category")
@classmethod
def _optimize_dtypes(cls, df: pd.DataFrame) -> pd.DataFrame:
categories = ["province", "identified_category", "sex"]
for col in categories:
if col in df.columns and df[col].dtype != "category":
df[col] = df[col].astype("category")
# Ensure string columns are proper string dtype
string_cols = [
"name",
"probable_native",
"probable_surname",
"identified_name",
"identified_surname",
"ner_entities",
]
for col in string_cols:
if col in df.columns and df[col].dtype == "object":
df[col] = df[col].astype("string")
return df
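# Illustrative sketch of the probable-name convention used by
# _assign_probable_names(): every token but the last is treated as native,
# the last token as the surname. The sample name is made up.
if __name__ == "__main__":
    parts = "kabila mwamba joseph".split()
    print(" ".join(parts[:-1]), "|", parts[-1])  # kabila mwamba | joseph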
@@ -0,0 +1,169 @@
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict
import ollama
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.prompt_manager import PromptManager
from ners.core.utils.rate_limiter import RateLimitConfig, RateLimiter
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.steps import PipelineStep, NameAnnotation
class LLMAnnotationStep(PipelineStep):
"""Configuration-driven LLM annotation step"""
def __init__(self, pipeline_config: PipelineConfig):
# Create custom batch config for LLM processing
self.llm_config = pipeline_config.annotation.llm
batch_config = BatchConfig(
batch_size=pipeline_config.processing.batch_size,
max_workers=min(
self.llm_config.max_concurrent_requests,
pipeline_config.processing.max_workers,
),
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
)
super().__init__("llm_annotation", pipeline_config, batch_config)
self.prompt = PromptManager(pipeline_config).load_prompt()
self.rate_limiter = (
self._create_rate_limiter()
if self.llm_config.enable_rate_limiting
else None
)
# Statistics
self.successful_requests = 0
self.failed_requests = 0
self.total_retry_attempts = 0
# Setup logging
logging.getLogger("httpx").setLevel(logging.WARNING)
def _create_rate_limiter(self):
"""Create rate limiter based on configuration"""
rate_config = RateLimitConfig(
requests_per_minute=self.llm_config.requests_per_minute,
requests_per_second=self.llm_config.requests_per_second,
)
return RateLimiter(rate_config)
def analyze_name(self, client: ollama.Client, name: str) -> Dict:
"""Analyze a name with retry logic and rate limiting"""
for attempt in range(self.llm_config.retry_attempts):
try:
# Apply rate limiting if enabled
if self.rate_limiter:
self.rate_limiter.wait_if_needed()
start_time = time.time()
response = client.chat(
model=self.llm_config.model_name,
messages=[
{"role": "system", "content": self.prompt},
{"role": "user", "content": name},
],
format=NameAnnotation.model_json_schema(),
)
elapsed_time = time.time() - start_time
if elapsed_time > self.llm_config.timeout_seconds:
raise TimeoutError(
f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout"
)
annotation = NameAnnotation.model_validate_json(
response.message.content
)
result = {
**annotation.model_dump(),
"annotated": 1,
"processing_time": elapsed_time,
"attempts": attempt + 1,
}
self.successful_requests += 1
if attempt > 0:
self.total_retry_attempts += attempt
return result
            except Exception as e:  # covers pydantic ValidationError, TimeoutError, etc.
logging.warning(
f"Error analyzing '{name}' (attempt {attempt + 1}/{self.llm_config.retry_attempts}): {e}"
)
# Exponential backoff with jitter
if attempt < self.llm_config.retry_attempts - 1:
wait_time = (2**attempt) + (time.time() % 1)
time.sleep(min(wait_time, 10))
self.failed_requests += 1
return {
"identified_name": None,
"identified_surname": None,
"annotated": 0,
"processing_time": 0,
"attempts": self.llm_config.retry_attempts,
"failed": True,
}
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process batch with LLM annotation"""
        unannotated_mask = batch.get("annotated", pd.Series(0, index=batch.index)) == 0
unannotated_entries = batch[unannotated_mask]
if unannotated_entries.empty:
logging.info(f"Batch {batch_id}: No entries to annotate")
return batch
logging.info(
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM"
)
batch = batch.copy()
client = ollama.Client()
# Process with controlled concurrency
max_workers = self.llm_config.max_concurrent_requests
if len(unannotated_entries) == 1 or max_workers == 1:
# Sequential processing
for idx, row in unannotated_entries.iterrows():
result = self.analyze_name(client, row["name"])
for field, value in result.items():
if field not in ["failed"]:
batch.loc[idx, field] = value
else:
# Concurrent processing
with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_idx = {}
for idx, row in unannotated_entries.iterrows():
future = executor.submit(self.analyze_name, client, row["name"])
future_to_idx[future] = idx
for future in as_completed(future_to_idx):
idx = future_to_idx[future]
try:
result = future.result()
for field, value in result.items():
if field not in ["failed"]:
batch.loc[idx, field] = value
except Exception as e:
logging.error(f"Failed to process row {idx}: {e}")
batch.loc[idx, "annotated"] = 0
# Ensure proper data types
batch["annotated"] = (
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
)
return batch
@@ -0,0 +1,172 @@
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
from ners.processing.ner.name_model import NameModel
from ners.processing.steps import PipelineStep, NameAnnotation
class NERAnnotationStep(PipelineStep):
"""NER annotation step using trained spaCy model for entity recognition"""
def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("ner_annotation", pipeline_config)
self.model_name = "drc_ner_model"
self.model_path = pipeline_config.paths.models_dir / "drc_ner_model"
self.name_model = NameModel(pipeline_config)
self.ner_config = pipeline_config.annotation.ner
# Statistics
self.successful_requests = 0
self.failed_requests = 0
self.total_retry_attempts = 0
# Load the model
self._load_ner_model()
def _load_ner_model(self) -> None:
"""Load the trained NER model"""
try:
if self.model_path.exists():
logging.info(f"Loading NER model from {self.model_path}")
self.name_model.load(str(self.model_path))
logging.info("NER model loaded successfully")
else:
logging.warning(f"NER model not found at {self.model_path}")
logging.warning(
"NER annotation will be skipped. Train the model first."
)
self.name_model.nlp = None
except Exception as e:
logging.error(f"Failed to load NER model: {e}")
self.name_model.nlp = None
def analyze_name(self, name: str) -> Dict:
"""Analyze a name with retry logic"""
if self.name_model.nlp is None:
return {
"identified_name": None,
"identified_surname": None,
"annotated": 0,
"processing_time": 0,
"attempts": 0,
"failed": True,
}
for attempt in range(self.ner_config.retry_attempts):
try:
start_time = time.time()
# Get NER predictions
prediction = self.name_model.predict(name.lower())
entities = prediction.get("entities", [])
elapsed_time = time.time() - start_time
# Extract native names and surnames from entities
native_parts = []
surname_parts = []
for entity in entities:
if entity["label"] == "NATIVE":
native_parts.append(entity["text"])
elif entity["label"] == "SURNAME":
surname_parts.append(entity["text"])
# Create annotation result in same format as LLM step
annotation = NameAnnotation(
identified_name=" ".join(native_parts) if native_parts else None,
identified_surname=" ".join(surname_parts)
if surname_parts
else None,
)
result = {
**annotation.model_dump(),
"annotated": 1,
"processing_time": elapsed_time,
"attempts": attempt + 1,
}
self.successful_requests += 1
if attempt > 0:
self.total_retry_attempts += attempt
return result
except Exception as e:
logging.warning(
f"Error analyzing '{name}' with NER (attempt {attempt + 1}/{self.ner_config.retry_attempts}): {e}"
)
# Small delay between retries
if attempt < self.ner_config.retry_attempts - 1:
time.sleep(0.1)
self.failed_requests += 1
return {
"identified_name": None,
"identified_surname": None,
"annotated": 0,
"processing_time": 0,
"attempts": self.ner_config.retry_attempts,
"failed": True,
}
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process batch with NER annotation using same logic as LLM step"""
        unannotated_mask = batch.get("annotated", pd.Series(0, index=batch.index)) == 0
unannotated_entries = batch[unannotated_mask]
if unannotated_entries.empty:
logging.info(f"Batch {batch_id}: No entries to annotate")
return batch
logging.info(
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER"
)
batch = batch.copy()
# Process with controlled concurrency
max_workers = self.batch_config.max_workers
if len(unannotated_entries) == 1 or max_workers == 1:
# Sequential processing
for idx, row in unannotated_entries.iterrows():
result = self.analyze_name(row["name"])
for field, value in result.items():
if field not in ["failed"]:
batch.loc[idx, field] = value
else:
# Concurrent processing
with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_idx = {}
for idx, row in unannotated_entries.iterrows():
future = executor.submit(self.analyze_name, row["name"])
future_to_idx[future] = idx
for future in as_completed(future_to_idx):
idx = future_to_idx[future]
try:
result = future.result()
for field, value in result.items():
if field not in ["failed"]:
batch.loc[idx, field] = value
except Exception as e:
logging.error(f"Failed to process row {idx}: {e}")
batch.loc[idx, "annotated"] = 0
# Ensure proper data types
batch["annotated"] = (
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
)
return batch
@@ -0,0 +1,261 @@
import logging
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, List
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ners.research.experiment import ExperimentConfig
class BaseModel(ABC):
"""Abstract base class for all models"""
def __init__(self, config: ExperimentConfig):
self.config = config
self.model = None
self.feature_extractor = None
self.label_encoder = None
self.tokenizer = None # For neural models
self.is_fitted = False
self.training_history = {} # Store training history for learning curves
self.learning_curve_data = {} # Store learning curve experiment data
@property
@abstractmethod
def architecture(self) -> str:
"""Return the architecture type: 'neural_network', 'traditional', or 'ensemble'"""
pass
@abstractmethod
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
"""Prepare features for training/prediction"""
pass
@abstractmethod
def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
"""Fit the model - implemented differently for each architecture"""
pass
@abstractmethod
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> Dict[str, float]:
"""Perform cross-validation and return average scores"""
pass
@abstractmethod
def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
pass
def predict(self, X: pd.DataFrame) -> np.ndarray:
"""Make predictions"""
if not self.is_fitted:
raise ValueError("Model must be fitted before making predictions")
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
predictions = self.model.predict(X_prepared)
# Handle different prediction formats
if hasattr(predictions, "shape") and len(predictions.shape) > 1:
# Neural network outputs (probabilities)
predictions = predictions.argmax(axis=1)
return self.label_encoder.inverse_transform(predictions)
def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
"""Get prediction probabilities if supported"""
if not self.is_fitted:
raise ValueError("Model must be fitted before making predictions")
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
if hasattr(self.model, "predict_proba"):
return self.model.predict_proba(X_prepared)
elif hasattr(self.model, "predict"):
# For neural networks that return probabilities directly
probabilities = self.model.predict(X_prepared)
if len(probabilities.shape) == 2 and probabilities.shape[1] > 1:
return probabilities
raise NotImplementedError("Model does not support probability predictions")
def get_feature_importance(self) -> Optional[Dict[str, float]]:
"""Get feature importance if supported by the model"""
if hasattr(self.model, "feature_importances_"):
# For tree-based models
importances = self.model.feature_importances_
feature_names = self._get_feature_names()
return dict(zip(feature_names, importances))
elif hasattr(self.model, "coef_"):
# For linear models
coefficients = np.abs(self.model.coef_[0])
feature_names = self._get_feature_names()
return dict(zip(feature_names, coefficients))
elif (
hasattr(self.model, "named_steps")
and "classifier" in self.model.named_steps
):
# For sklearn pipelines (like LogisticRegression with vectorizer)
classifier = self.model.named_steps["classifier"]
if hasattr(classifier, "coef_"):
coefficients = np.abs(classifier.coef_[0])
if hasattr(
self.model.named_steps["vectorizer"], "get_feature_names_out"
):
feature_names = self.model.named_steps[
"vectorizer"
].get_feature_names_out()
# Take top features to avoid too many n-grams
top_indices = np.argsort(coefficients)[-20:]
return dict(
zip(feature_names[top_indices], coefficients[top_indices])
)
return None
def _get_feature_names(self) -> List[str]:
"""Get feature names (override in subclasses if needed)"""
if hasattr(self.model, "feature_names_in_"):
return list(self.model.feature_names_in_)
return [f"feature_{i}" for i in range(100)] # Default fallback
def save(self, path: str):
"""Save the complete model with training history"""
model_data = {
"model": self.model,
"feature_extractor": self.feature_extractor,
"label_encoder": self.label_encoder,
"tokenizer": self.tokenizer,
"config": self.config.to_dict(),
"is_fitted": self.is_fitted,
"training_history": self.training_history,
"learning_curve_data": self.learning_curve_data,
}
joblib.dump(model_data, path)
@classmethod
def load(cls, path: str) -> "BaseModel":
"""Load a saved model with training history"""
model_data = joblib.load(path)
# Recreate the model instance
from ners.research.experiment import ExperimentConfig
config = ExperimentConfig.from_dict(model_data["config"])
instance = cls(config)
# Restore state
instance.model = model_data["model"]
instance.feature_extractor = model_data["feature_extractor"]
instance.label_encoder = model_data["label_encoder"]
instance.tokenizer = model_data.get("tokenizer")
instance.is_fitted = model_data["is_fitted"]
instance.training_history = model_data.get("training_history", {})
instance.learning_curve_data = model_data.get("learning_curve_data", {})
return instance
def plot_learning_curve(self, save_path: Optional[str] = None) -> str:
"""Plot and save learning curve"""
if not self.learning_curve_data:
logging.warning("No learning curve data available")
return ""
plt.figure(figsize=(10, 6))
data = self.learning_curve_data
train_sizes = data["train_sizes"]
train_scores = data["train_scores"]
val_scores = data["val_scores"]
train_std = data.get("train_scores_std", [0] * len(train_sizes))
val_std = data.get("val_scores_std", [0] * len(train_sizes))
# Plot learning curves
plt.plot(train_sizes, train_scores, "o-", color="blue", label="Training Score")
plt.fill_between(
train_sizes,
np.array(train_scores) - np.array(train_std),
np.array(train_scores) + np.array(train_std),
alpha=0.1,
color="blue",
)
plt.plot(train_sizes, val_scores, "o-", color="red", label="Validation Score")
plt.fill_between(
train_sizes,
np.array(val_scores) - np.array(val_std),
np.array(val_scores) + np.array(val_std),
alpha=0.1,
color="red",
)
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.title(f"Learning Curve - {self.__class__.__name__}")
plt.legend(loc="best")
plt.grid(True, alpha=0.3)
plt.tight_layout()
if save_path:
plt.savefig(save_path, dpi=300, bbox_inches="tight")
plt.close()
return save_path
else:
plt.show()
return ""
def plot_training_history(self, save_path: Optional[str] = None) -> str:
"""Plot training history for neural networks"""
if not self.training_history:
logging.warning("No training history available")
return ""
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
# Plot accuracy
if "accuracy" in self.training_history:
axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
if "val_accuracy" in self.training_history:
axes[0].plot(
self.training_history["val_accuracy"], label="Validation Accuracy"
)
axes[0].set_title("Model Accuracy")
axes[0].set_xlabel("Epoch")
axes[0].set_ylabel("Accuracy")
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Plot loss
if "loss" in self.training_history:
axes[1].plot(self.training_history["loss"], label="Training Loss")
if "val_loss" in self.training_history:
axes[1].plot(self.training_history["val_loss"], label="Validation Loss")
axes[1].set_title("Model Loss")
axes[1].set_xlabel("Epoch")
axes[1].set_ylabel("Loss")
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
if save_path:
plt.savefig(save_path, dpi=300, bbox_inches="tight")
plt.close()
return save_path
else:
plt.show()
return ""
+97
View File
@@ -0,0 +1,97 @@
from dataclasses import dataclass, field, asdict
from enum import Enum
from typing import List, Dict, Any, Optional
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from .feature_extractor import FeatureType
@dataclass
class ExperimentConfig:
"""Configuration for a single experiment"""
# Experiment metadata
name: str
description: str = ""
tags: List[str] = field(default_factory=list)
# Model configuration
model_type: str = (
"logistic_regression" # logistic_regression, lstm, transformer, etc.
)
model_params: Dict[str, Any] = field(default_factory=dict)
# Feature configuration
features: List[FeatureType] = field(default_factory=lambda: [FeatureType.FULL_NAME])
feature_params: Dict[str, Any] = field(default_factory=dict)
# Data configuration
train_data_filter: Optional[Dict[str, Any]] = (
None # Filter criteria for training data
)
test_data_filter: Optional[Dict[str, Any]] = None
target_column: str = "sex"
# Training configuration
test_size: float = 0.2
random_seed: int = 42
cross_validation_folds: int = 5
# Evaluation configuration
metrics: List[str] = field(
default_factory=lambda: ["accuracy", "precision", "recall", "f1"]
)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization"""
result = asdict(self)
# Convert enums to strings
result["features"] = [f.value for f in self.features]
return result
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ExperimentConfig":
"""Create from dictionary"""
if "features" in data:
data["features"] = [FeatureType(f) for f in data["features"]]
return cls(**data)
class ExperimentStatus(Enum):
"""Experiment execution status"""
PENDING = "pending"
RUNNING = "running"
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
def calculate_metrics(
    y_true: np.ndarray, y_pred: np.ndarray, metrics: Optional[List[str]] = None
) -> Dict[str, float]:
"""Calculate specified metrics"""
if metrics is None:
metrics = ["accuracy", "precision", "recall", "f1"]
results = {}
if "accuracy" in metrics:
results["accuracy"] = accuracy_score(y_true, y_pred)
if any(m in metrics for m in ["precision", "recall", "f1"]):
precision, recall, f1, _ = precision_recall_fscore_support(
y_true, y_pred, average="weighted"
)
if "precision" in metrics:
results["precision"] = precision
if "recall" in metrics:
results["recall"] = recall
if "f1" in metrics:
results["f1"] = f1
return results
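# Quick sanity check of calculate_metrics on toy labels; the values are
# illustrative, not project data.
import numpy as np

y_true = np.array(["f", "m", "f", "m"])
y_pred = np.array(["f", "m", "m", "m"])
print(calculate_metrics(y_true, y_pred, ["accuracy", "f1"]))
# accuracy == 0.75; weighted f1 is roughly 0.733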
@@ -0,0 +1,58 @@
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional, Dict, List, Any
from ners.research.experiment import ExperimentConfig, ExperimentStatus
@dataclass
class ExperimentResult:
"""Results from an experiment execution"""
experiment_id: str
config: ExperimentConfig
# Execution metadata
start_time: datetime
end_time: Optional[datetime] = None
status: ExperimentStatus = ExperimentStatus.PENDING
error_message: Optional[str] = None
# Model artifacts
model_path: Optional[str] = None
feature_extractor_path: Optional[str] = None
# Metrics
train_metrics: Dict[str, float] = field(default_factory=dict)
test_metrics: Dict[str, float] = field(default_factory=dict)
cv_metrics: Dict[str, float] = field(default_factory=dict)
# Additional results
confusion_matrix: Optional[List[List[int]]] = None
feature_importance: Optional[Dict[str, float]] = None
prediction_examples: Optional[List[Dict]] = None
# Data statistics
train_size: int = 0
test_size: int = 0
class_distribution: Dict[str, int] = field(default_factory=dict)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization"""
result = asdict(self)
result["config"] = self.config.to_dict()
result["start_time"] = self.start_time.isoformat()
result["end_time"] = self.end_time.isoformat() if self.end_time else None
result["status"] = self.status.value
return result
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ExperimentResult":
"""Create from dictionary"""
data["config"] = ExperimentConfig.from_dict(data["config"])
data["start_time"] = datetime.fromisoformat(data["start_time"])
data["end_time"] = (
datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
)
data["status"] = ExperimentStatus(data["status"])
return cls(**data)
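# Serialization round-trip sketch for ExperimentResult; enum and datetime
# fields survive the to_dict/from_dict cycle. All values below are made up.
from datetime import datetime

result = ExperimentResult(
    experiment_id="demo_0001",
    config=ExperimentConfig(name="demo"),
    start_time=datetime.now(),
)
restored = ExperimentResult.from_dict(result.to_dict())
assert restored.status is ExperimentStatus.PENDING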
@@ -0,0 +1,112 @@
import logging
from typing import List, Dict
import yaml
from ners.core.config.pipeline_config import PipelineConfig
from ners.research.experiment import ExperimentConfig
from ners.research.experiment.feature_extractor import FeatureType
class ExperimentBuilder:
"""Helper class to build experiment configurations"""
def __init__(self, config: PipelineConfig):
self.config = config
def load_templates(self, templates: str = "research_templates.yaml") -> dict:
"""Load research templates from YAML file"""
try:
with open(self.config.paths.configs_dir / templates, "r") as file:
return yaml.safe_load(file)
except FileNotFoundError:
logging.error(f"Templates file not found: {templates}")
raise
except yaml.YAMLError as e:
logging.error(f"Error parsing templates file: {e}")
raise
@classmethod
def find_template(
cls, templates: dict, name: str, experiment_type: str = "baseline"
) -> dict:
"""Find experiment configuration by name and type"""
# Map type to section in templates
type_mapping = {
"baseline": "baseline_experiments",
"advanced": "advanced_experiments",
"feature_study": "feature_studies",
"tuning": "hyperparameter_tuning",
}
section_name = type_mapping.get(experiment_type)
if not section_name:
available_types = list(type_mapping.keys())
raise ValueError(
f"Unknown experiment type '{experiment_type}'. Available types: {available_types}"
)
if section_name not in templates:
raise ValueError(f"Section '{section_name}' not found in templates")
experiments = templates[section_name]
# Search for experiment by model name
for experiment in experiments:
# Check if this is the experiment we're looking for
# Look for experiments that match the model type or contain the name
if (
experiment.get("model_type") == name
or name.lower() in experiment.get("name", "").lower()
or experiment.get("name") == name
or f"baseline_{name}" == experiment.get("name")
or f"advanced_{name}" == experiment.get("name")
):
return experiment
# If not found, list available experiments
available_experiments = [
exp.get("name", exp.get("model_type", "unknown")) for exp in experiments
]
raise ValueError(
f"Experiment '{name}' not found in '{experiment_type}' section. "
f"Available experiments: {available_experiments}"
)
def get_templates(
self, templates_path: str = "research_templates.yaml"
) -> Dict[str, List[Dict]]:
"""Get all available experiments from templates organized by type"""
templates = self.load_templates(templates_path)
return {
"baseline": templates.get("baseline_experiments", []),
"advanced": templates.get("advanced_experiments", []),
"feature_study": templates.get("feature_studies", []),
"tuning": templates.get("hyperparameter_tuning", []),
}
@classmethod
def from_template(cls, template_config: dict) -> ExperimentConfig:
"""Create an ExperimentConfig from a template configuration"""
# Convert feature strings to FeatureType objects
features = []
for feature_str in template_config.get("features", []):
try:
features.append(FeatureType(feature_str))
except ValueError:
logging.warning(f"Unknown feature type: {feature_str}")
continue
return ExperimentConfig(
name=template_config.get("name"),
            description=template_config.get("description", ""),
model_type=template_config.get("model_type"),
features=features,
model_params=template_config.get("model_params", {}),
tags=template_config.get("tags", []),
test_size=template_config.get("test_size", 0.2),
cross_validation_folds=template_config.get("cross_validation_folds", 5),
train_data_filter=template_config.get("train_data_filter"),
)
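# Illustrative template dict in the shape from_template expects; the field
# names mirror research_templates.yaml, but the values here are invented.
template = {
    "name": "baseline_logistic_regression",
    "description": "Char n-gram logistic regression",
    "model_type": "logistic_regression",
    "features": ["full_name", "name_endings"],
    "model_params": {"max_features": 5000},
}
exp_cfg = ExperimentBuilder.from_template(template)
assert exp_cfg.features[0] is FeatureType.FULL_NAME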
@@ -0,0 +1,285 @@
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.research.base_model import BaseModel
from ners.research.experiment import (
ExperimentConfig,
ExperimentStatus,
calculate_metrics,
)
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.model_registry import create_model
class ExperimentRunner:
"""Runs and manages experiments"""
def __init__(self, config: PipelineConfig):
self.config = config
self.tracker = ExperimentTracker(self.config)
self.data_loader = DataLoader(self.config)
def run_experiment(self, experiment_config: ExperimentConfig) -> str:
"""Run a single experiment and return experiment ID"""
# Create experiment
experiment_id = self.tracker.create_experiment(experiment_config)
try:
logging.info(f"Starting experiment: {experiment_id}")
self.tracker.update_experiment(
experiment_id, status=ExperimentStatus.RUNNING
)
# Load data
filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
df = self.data_loader.load_csv_complete(filepath)
# Apply data filters if specified
df = self._apply_data_filters(df, experiment_config)
# Prepare target variable
y = df[experiment_config.target_column]
X = df
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
test_size=experiment_config.test_size,
random_state=experiment_config.random_seed,
stratify=y,
)
# Create and train model
model = create_model(experiment_config)
model.fit(X_train, y_train)
# Make predictions
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
# Calculate metrics
train_metrics = calculate_metrics(
y_train, train_pred, experiment_config.metrics
)
test_metrics = calculate_metrics(
y_test, test_pred, experiment_config.metrics
)
# Cross-validation if requested
cv_metrics = {}
if experiment_config.cross_validation_folds > 1:
cv_metrics = model.cross_validate(
X_train, y_train, experiment_config.cross_validation_folds
)
# Additional analysis
conf_matrix = confusion_matrix(y_test, test_pred).tolist()
feature_importance = model.get_feature_importance()
# Create prediction examples
prediction_examples = self._create_prediction_examples(
X_test, y_test, test_pred, model, n_examples=10
)
# Calculate class distribution
class_distribution = y.value_counts().to_dict()
# Save model
model_path = self._save_model(model, experiment_id)
# Update experiment with results
self.tracker.update_experiment(
experiment_id,
status=ExperimentStatus.COMPLETED,
end_time=datetime.now(),
model_path=str(model_path),
train_metrics=train_metrics,
test_metrics=test_metrics,
cv_metrics=cv_metrics,
confusion_matrix=conf_matrix,
feature_importance=feature_importance,
prediction_examples=prediction_examples,
train_size=len(X_train),
test_size=len(X_test),
class_distribution=class_distribution,
)
logging.info(f"Experiment {experiment_id} completed successfully")
logging.info(f"Test accuracy: {test_metrics.get('accuracy', 'N/A'):.4f}")
return experiment_id
except Exception as e:
logging.error(f"Experiment {experiment_id} failed: {str(e)}")
self.tracker.update_experiment(
experiment_id,
status=ExperimentStatus.FAILED,
end_time=datetime.now(),
error_message=str(e),
)
raise
def run_experiment_batch(self, experiments: List[ExperimentConfig]) -> List[str]:
"""Run multiple experiments"""
experiment_ids = []
for i, config in enumerate(experiments):
logging.info(
f"Running experiment {i + 1}/{len(experiments)}: {config.name}"
)
try:
exp_id = self.run_experiment(config)
experiment_ids.append(exp_id)
except Exception as e:
logging.error(f"Failed to run experiment {config.name}: {e}")
continue
return experiment_ids
@classmethod
def _apply_data_filters(
cls, df: pd.DataFrame, config: ExperimentConfig
) -> pd.DataFrame:
"""Apply data filters specified in experiment config"""
filtered_df = df.copy()
# Apply training data filters
if config.train_data_filter:
for column, criteria in config.train_data_filter.items():
if column in filtered_df.columns:
if isinstance(criteria, list):
filtered_df = filtered_df[filtered_df[column].isin(criteria)]
elif isinstance(criteria, dict):
if "min" in criteria:
filtered_df = filtered_df[
filtered_df[column] >= criteria["min"]
]
if "max" in criteria:
filtered_df = filtered_df[
filtered_df[column] <= criteria["max"]
]
else:
filtered_df = filtered_df[filtered_df[column] == criteria]
return filtered_df
@classmethod
def _create_prediction_examples(
cls,
X_test: pd.DataFrame,
y_test: pd.Series,
predictions: np.ndarray,
model: BaseModel,
n_examples: int = 10,
) -> List[Dict]:
"""Create prediction examples for analysis"""
examples = []
# Get both correct and incorrect predictions
correct_mask = y_test == predictions
incorrect_indices = X_test[~correct_mask].index[: n_examples // 2]
correct_indices = X_test[correct_mask].index[: n_examples // 2]
sample_indices = list(incorrect_indices) + list(correct_indices)
for idx in sample_indices[:n_examples]:
example = {
"name": X_test.loc[idx, "name"] if "name" in X_test.columns else "N/A",
"true_label": y_test.loc[idx],
"predicted_label": predictions[X_test.index.get_loc(idx)],
"correct": y_test.loc[idx] == predictions[X_test.index.get_loc(idx)],
}
# Add probability if available
if model.architecture == "traditional":
proba = model.predict_proba(X_test.loc[[idx]])
example["prediction_confidence"] = float(proba.max())
examples.append(example)
return examples
def _save_model(self, model: BaseModel, experiment_id: str) -> Path:
"""Save trained model"""
model_dir = self.config.paths.models_dir / "experiments" / experiment_id
model_dir.mkdir(parents=True, exist_ok=True)
model_path = model_dir / "model.joblib"
model.save(str(model_path))
return model_path
def load_experiment_model(self, experiment_id: str) -> Optional[BaseModel]:
"""Load a model from a completed experiment"""
experiment = self.tracker.get_experiment(experiment_id)
if experiment and experiment.model_path:
try:
                # Load the saved model data and recreate the model instance
                # using the saved config
model_data = joblib.load(experiment.model_path)
config = ExperimentConfig.from_dict(model_data["config"])
model = create_model(config)
# Restore the saved state
model.model = model_data["model"]
model.feature_extractor = model_data["feature_extractor"]
model.label_encoder = model_data["label_encoder"]
model.tokenizer = model_data.get("tokenizer")
model.is_fitted = model_data["is_fitted"]
model.training_history = model_data.get("training_history", {})
model.learning_curve_data = model_data.get("learning_curve_data", {})
# Restore vectorizers and encoders for models that use them (like XGBoost)
if "vectorizers" in model_data and hasattr(model, "vectorizers"):
model.vectorizers = model_data["vectorizers"]
if "label_encoders" in model_data and hasattr(model, "label_encoders"):
model.label_encoders = model_data["label_encoders"]
return model
except Exception as e:
logging.error(
f"Failed to load model for experiment {experiment_id}: {e}"
)
return None
return None
def compare_experiments(
self, experiment_ids: List[str], metric: str = "accuracy"
) -> pd.DataFrame:
"""Compare experiments and return analysis"""
comparison_df = self.tracker.compare_experiments(experiment_ids)
if f"test_{metric}" in comparison_df.columns:
comparison_df = comparison_df.sort_values(f"test_{metric}", ascending=False)
return comparison_df
def get_feature_analysis(self, experiment_id: str) -> Optional[pd.DataFrame]:
"""Get feature importance analysis for an experiment"""
experiment = self.tracker.get_experiment(experiment_id)
if experiment and experiment.feature_importance:
importance_df = pd.DataFrame(
[
{"feature": feature, "importance": importance}
for feature, importance in experiment.feature_importance.items()
]
)
return importance_df.sort_values("importance", ascending=False)
return None
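# End-to-end sketch of one experiment run. Assumes a configured environment
# and the featured dataset on disk; setup_config is used as in the CLI.
from ners.core.config import setup_config

cfg = setup_config(env="development")
runner = ExperimentRunner(cfg)
exp_id = runner.run_experiment(ExperimentConfig(name="demo_lr"))
print(runner.compare_experiments([exp_id], metric="accuracy"))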
@@ -0,0 +1,200 @@
import hashlib
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, List
import pandas as pd
from ners.core.config import PipelineConfig, get_config
from ners.research.experiment import ExperimentConfig, ExperimentStatus
from ners.research.experiment.experiement_result import ExperimentResult
class ExperimentTracker:
"""Tracks and manages experiments"""
def __init__(self, config: Optional[PipelineConfig] = None):
self.config = config or get_config()
self.experiments_dir = self.config.paths.outputs_dir / "experiments"
self.experiments_dir.mkdir(parents=True, exist_ok=True)
self.results_db_path = self.experiments_dir / "experiments.json"
self._results: Dict[str, ExperimentResult] = {}
self._load_results()
def _load_results(self):
"""Load existing experiment results"""
if self.results_db_path.exists():
try:
with open(self.results_db_path, "r") as f:
data = json.load(f)
for exp_id, exp_data in data.items():
self._results[exp_id] = ExperimentResult.from_dict(exp_data)
except Exception as e:
print(f"Warning: Failed to load experiment results: {e}")
def _save_results(self):
"""Save experiment results to disk"""
data = {exp_id: result.to_dict() for exp_id, result in self._results.items()}
with open(self.results_db_path, "w") as f:
json.dump(data, f, indent=2, default=str)
def create_experiment(self, config: ExperimentConfig) -> str:
"""Create a new experiment and return its ID"""
# Generate experiment ID
config_hash = hashlib.md5(
json.dumps(config.to_dict(), sort_keys=True).encode()
).hexdigest()[:8]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
experiment_id = f"{config.name}_{timestamp}_{config_hash}"
# Create result object
result = ExperimentResult(
experiment_id=experiment_id, config=config, start_time=datetime.now()
)
self._results[experiment_id] = result
self._save_results()
return experiment_id
def update_experiment(self, experiment_id: str, **updates):
"""Update an experiment's results"""
if experiment_id in self._results:
result = self._results[experiment_id]
for key, value in updates.items():
if hasattr(result, key):
setattr(result, key, value)
self._save_results()
def get_experiment(self, experiment_id: str) -> Optional[ExperimentResult]:
"""Get experiment by ID"""
return self._results.get(experiment_id)
def list_experiments(
self,
status: Optional[ExperimentStatus] = None,
tags: Optional[List[str]] = None,
model_type: Optional[str] = None,
) -> List[ExperimentResult]:
"""List experiments with optional filtering"""
results = list(self._results.values())
if status:
results = [r for r in results if r.status == status]
if tags:
results = [r for r in results if any(tag in r.config.tags for tag in tags)]
if model_type:
results = [r for r in results if r.config.model_type == model_type]
return sorted(results, key=lambda x: x.start_time, reverse=True)
def get_best_experiment(
self,
metric: str = "accuracy",
dataset: str = "test",
filters: Optional[Dict] = None,
) -> Optional[ExperimentResult]:
"""Get the best experiment based on a metric"""
experiments = self.list_experiments()
if filters:
# Apply additional filters
if "model_type" in filters:
experiments = [
e
for e in experiments
if e.config.model_type == filters["model_type"]
]
if "features" in filters:
experiments = [
e
for e in experiments
if any(f in e.config.features for f in filters["features"])
]
valid_experiments = []
for exp in experiments:
if exp.status == ExperimentStatus.COMPLETED:
metrics_dict = (
exp.test_metrics if dataset == "test" else exp.train_metrics
)
if metric in metrics_dict:
valid_experiments.append((exp, metrics_dict[metric]))
if not valid_experiments:
return None
return max(valid_experiments, key=lambda x: x[1])[0]
def compare_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
"""Compare multiple experiments in a DataFrame"""
rows = []
for exp_id in experiment_ids:
exp = self.get_experiment(exp_id)
if exp:
row = {
"experiment_id": exp_id,
"name": exp.config.name,
"model_type": exp.config.model_type,
"features": ",".join([f.value for f in exp.config.features]),
"status": exp.status.value,
"train_size": exp.train_size,
"test_size": exp.test_size,
}
# Add metrics
for metric, value in exp.test_metrics.items():
row[f"test_{metric}"] = value
for metric, value in exp.cv_metrics.items():
row[f"cv_{metric}"] = value
rows.append(row)
return pd.DataFrame(rows)
def export_results(self, output_path: Optional[Path] = None) -> Path:
"""Export all results to CSV"""
if output_path is None:
output_path = (
self.experiments_dir
/ f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
)
rows = []
for exp in self._results.values():
row = {
"experiment_id": exp.experiment_id,
"name": exp.config.name,
"description": exp.config.description,
"model_type": exp.config.model_type,
"features": ",".join([f.value for f in exp.config.features]),
"status": exp.status.value,
"start_time": exp.start_time.isoformat(),
"end_time": exp.end_time.isoformat() if exp.end_time else None,
"train_size": exp.train_size,
"test_size": exp.test_size,
}
# Add all metrics
for metric, value in exp.test_metrics.items():
row[f"test_{metric}"] = value
for metric, value in exp.cv_metrics.items():
row[f"cv_{metric}"] = value
rows.append(row)
df = pd.DataFrame(rows)
df.to_csv(output_path, index=False)
return output_path
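# Query sketch: best completed run by test F1, restricted to one model type.
# Returns None when nothing matches; "xgboost" is just an example filter.
tracker = ExperimentTracker()
best = tracker.get_best_experiment(
    metric="f1", dataset="test", filters={"model_type": "xgboost"}
)
if best is not None:
    print(best.experiment_id, best.test_metrics.get("f1"))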
@@ -0,0 +1,92 @@
from enum import Enum
from typing import List, Dict, Any, Union
import pandas as pd
class FeatureType(Enum):
"""Types of features that can be extracted from names"""
FULL_NAME = "full_name"
NATIVE_NAME = "native_name"
SURNAME = "surname"
FIRST_WORD = "first_word"
LAST_WORD = "last_word"
NAME_LENGTH = "name_length"
WORD_COUNT = "word_count"
PROVINCE = "province"
CHAR_NGRAMS = "char_ngrams"
WORD_NGRAMS = "word_ngrams"
NAME_ENDINGS = "name_endings"
NAME_BEGINNINGS = "name_beginnings"
class FeatureExtractor:
"""Extract different types of features from name data"""
def __init__(
self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None
):
self.feature_types = feature_types
self.feature_params = feature_params or {}
def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Extract all configured features"""
features_df = pd.DataFrame(index=df.index)
for feature_type in self.feature_types:
feature_data = self._extract_single_feature(df, feature_type)
if isinstance(feature_data, pd.DataFrame):
features_df = pd.concat([features_df, feature_data], axis=1)
else:
features_df[feature_type.value] = feature_data
return features_df
def _extract_single_feature(
self, df: pd.DataFrame, feature_type: FeatureType
) -> Union[pd.Series, pd.DataFrame]:
"""Extract a single type of feature"""
if feature_type == FeatureType.FULL_NAME:
return df["name"].fillna("")
elif feature_type == FeatureType.NATIVE_NAME:
return df["identified_name"].fillna(df["probable_native"]).fillna("")
elif feature_type == FeatureType.SURNAME:
return df["identified_surname"].fillna(df["probable_surname"]).fillna("")
elif feature_type == FeatureType.FIRST_WORD:
return df["name"].str.split().str[0].fillna("")
elif feature_type == FeatureType.LAST_WORD:
return df["name"].str.split().str[-1].fillna("")
elif feature_type == FeatureType.NAME_LENGTH:
return df["name"].str.len().fillna(0)
elif feature_type == FeatureType.WORD_COUNT:
return df["words"].fillna(1)
elif feature_type == FeatureType.PROVINCE:
return df["province"].fillna("unknown")
elif feature_type == FeatureType.NAME_ENDINGS:
n = self.feature_params.get("ending_length", 3)
return df["name"].str[-n:].fillna("")
elif feature_type == FeatureType.NAME_BEGINNINGS:
n = self.feature_params.get("beginning_length", 3)
return df["name"].str[:n].fillna("")
elif feature_type == FeatureType.CHAR_NGRAMS:
# This will be handled by the model's vectorizer
return df["name"].fillna("")
elif feature_type == FeatureType.WORD_NGRAMS:
# This will be handled by the model's vectorizer
return df["name"].fillna("")
else:
raise ValueError(f"Unknown feature type: {feature_type}")
+44
View File
@@ -0,0 +1,44 @@
from typing import List
from ners.research.base_model import BaseModel
from ners.research.experiment import ExperimentConfig
from ners.research.models.bigru_model import BiGRUModel
from ners.research.models.cnn_model import CNNModel
from ners.research.models.ensemble_model import EnsembleModel
from ners.research.models.lightgbm_model import LightGBMModel
from ners.research.models.logistic_regression_model import LogisticRegressionModel
from ners.research.models.lstm_model import LSTMModel
from ners.research.models.naive_bayes_model import NaiveBayesModel
from ners.research.models.random_forest_model import RandomForestModel
from ners.research.models.svm_model import SVMModel
from ners.research.models.transformer_model import TransformerModel
from ners.research.models.xgboost_model import XGBoostModel
MODEL_REGISTRY = {
"bigru": BiGRUModel,
"cnn": CNNModel,
"ensemble": EnsembleModel,
"lightgbm": LightGBMModel,
"logistic_regression": LogisticRegressionModel,
"lstm": LSTMModel,
"naive_bayes": NaiveBayesModel,
"random_forest": RandomForestModel,
"svm": SVMModel,
"transformer": TransformerModel,
"xgboost": XGBoostModel,
}
def create_model(config: ExperimentConfig) -> BaseModel:
"""Factory function to create models"""
model_class = MODEL_REGISTRY.get(config.model_type)
if model_class is None:
raise ValueError(f"Unknown model type: {config.model_type}")
return model_class(config)
def list_available_models() -> List[str]:
"""List all available model types"""
return list(MODEL_REGISTRY.keys())
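# Factory usage sketch: any registry key is a valid model_type; unknown keys
# raise ValueError. The config below is minimal and illustrative.
config = ExperimentConfig(name="demo", model_type="naive_bayes")
model = create_model(config)        # -> NaiveBayesModel instance
print(list_available_models())      # ["bigru", "cnn", ..., "xgboost"]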
+301
View File
@@ -0,0 +1,301 @@
import json
import logging
from datetime import datetime
from typing import List, Dict, Any
import pandas as pd
from ners.core.config import get_config
from ners.core.utils.data_loader import DataLoader
from ners.research.experiment import FeatureType, ExperimentConfig
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.model_registry import MODEL_REGISTRY
class ModelTrainer:
"""Comprehensive model training and artifact management"""
def __init__(self, config=None):
self.config = config or get_config()
self.data_loader = DataLoader(self.config)
self.experiment_runner = ExperimentRunner(self.config)
self.experiment_tracker = ExperimentTracker(self.config)
# Setup model artifacts directory
self.models_dir = self.config.paths.models_dir
self.models_dir.mkdir(parents=True, exist_ok=True)
def train_single_model(
self,
model_name: str,
model_type: str = "logistic_regression",
features: List[str] = None,
model_params: Dict[str, Any] = None,
tags: List[str] = None,
save_artifacts: bool = True,
) -> str:
"""
Train a single model and save its artifacts.
Returns the experiment ID.
"""
logging.info(f"Training {model_type} model: {model_name}")
if features is None:
features = ["full_name"]
feature_types = [FeatureType(f) for f in features]
# Prepare tags - combine default tags with template tags
default_tags = ["training", model_type]
experiment_tags = default_tags + (tags or [])
# Create experiment configuration
config = ExperimentConfig(
name=model_name,
description=f"Training {model_type} model with features: {', '.join(features)}",
model_type=model_type,
features=feature_types,
model_params=model_params or {},
tags=experiment_tags,
)
# Run experiment
experiment_id = self.experiment_runner.run_experiment(config)
experiment = self.experiment_tracker.get_experiment(experiment_id)
if experiment and experiment.test_metrics:
logging.info("Training completed successfully!")
logging.info(f"Experiment ID: {experiment_id}")
logging.info(
f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}"
)
logging.info(f"Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")
if save_artifacts:
self.save_model_artifacts(experiment_id)
return experiment_id
def train_multiple_models(
self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
) -> List[str]:
"""
Train multiple models with different configurations.
"""
logging.info(f"Training {len(model_configs)} models...")
experiment_ids = []
for i, config in enumerate(model_configs):
model_name = f"{base_name}_{config['model_type']}_{i + 1}"
try:
exp_id = self.train_single_model(
model_name=model_name,
model_type=config["model_type"],
features=config.get("features", ["full_name"]),
model_params=config.get("model_params", {}),
save_artifacts=save_all,
)
experiment_ids.append(exp_id)
except Exception as e:
logging.error(f"Failed to train {model_name}: {e}")
continue
logging.info(f"Completed training {len(experiment_ids)} models successfully")
return experiment_ids
def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]:
"""
Save model artifacts in a structured way for easy loading.
Returns paths to saved artifacts.
"""
experiment = self.experiment_tracker.get_experiment(experiment_id)
if not experiment:
raise ValueError(f"Experiment {experiment_id} not found")
# Create model-specific directory
model_dir = self.models_dir / experiment_id
model_dir.mkdir(parents=True, exist_ok=True)
# Load the trained model
trained_model = self.experiment_runner.load_experiment_model(experiment_id)
if not trained_model:
raise ValueError(f"Could not load model for experiment {experiment_id}")
# Save complete model with joblib
model_path = model_dir / "complete_model.joblib"
trained_model.save(str(model_path))
# Save model configuration
config_path = model_dir / "model_config.json"
with open(config_path, "w") as f:
json.dump(experiment.config.to_dict(), f, indent=2)
# Save experiment results
results_path = model_dir / "experiment_results.json"
with open(results_path, "w") as f:
json.dump(experiment.to_dict(), f, indent=2, default=str)
# Generate and save learning curves
learning_curve_path = None
training_history_path = None
try:
# Load data for learning curve generation
data_path = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
if data_path.exists():
df = self.data_loader.load_csv_complete(data_path)
# Generate learning curve
logging.info("Generating learning curve...")
trained_model.generate_learning_curve(
df, df[experiment.config.target_column]
)
# Plot and save learning curve
learning_curve_path = model_dir / "learning_curve.png"
trained_model.plot_learning_curve(str(learning_curve_path))
# Plot and save training history (for neural networks)
if trained_model.training_history:
training_history_path = model_dir / "training_history.png"
trained_model.plot_training_history(str(training_history_path))
# Save learning curve data as JSON
learning_data_path = model_dir / "learning_curve_data.json"
with open(learning_data_path, "w") as f:
json.dump(trained_model.learning_curve_data, f, indent=2)
# Save training history data as JSON
if trained_model.training_history:
history_data_path = model_dir / "training_history_data.json"
with open(history_data_path, "w") as f:
json.dump(trained_model.training_history, f, indent=2)
except Exception as e:
logging.warning(f"Could not generate learning curves: {e}")
# Save artifacts metadata
metadata = {
"experiment_id": experiment_id,
"model_name": experiment.config.name,
"model_type": experiment.config.model_type,
"features": [f.value for f in experiment.config.features],
"training_date": datetime.now().isoformat(),
"test_accuracy": experiment.test_metrics.get("accuracy", 0),
"test_f1": experiment.test_metrics.get("f1", 0),
"model_path": str(model_path),
"config_path": str(config_path),
"results_path": str(results_path),
"learning_curve_plot": str(learning_curve_path)
if learning_curve_path
else None,
"training_history_plot": str(training_history_path)
if training_history_path
else None,
"has_learning_curve": bool(trained_model.learning_curve_data),
"has_training_history": bool(trained_model.training_history),
}
metadata_path = model_dir / "metadata.json"
with open(metadata_path, "w") as f:
json.dump(metadata, f, indent=2)
logging.info(f"Model artifacts saved to: {model_dir}")
logging.info(f" - Complete model: {model_path.name}")
logging.info(f" - Configuration: {config_path.name}")
logging.info(f" - Results: {results_path.name}")
logging.info(f" - Metadata: {metadata_path.name}")
if learning_curve_path and learning_curve_path.exists():
logging.info(f" - Learning curve: {learning_curve_path.name}")
if training_history_path and training_history_path.exists():
logging.info(f" - Training history: {training_history_path.name}")
return {
"model_dir": str(model_dir),
"model_path": str(model_path),
"config_path": str(config_path),
"results_path": str(results_path),
"metadata_path": str(metadata_path),
"learning_curve_plot": str(learning_curve_path)
if learning_curve_path
else None,
"training_history_plot": str(training_history_path)
if training_history_path
else None,
}
def load_trained_model(self, experiment_id: str):
"""
Load a previously trained model from artifacts.
"""
model_dir = self.models_dir / experiment_id
model_path = model_dir / "complete_model.joblib"
if not model_path.exists():
raise FileNotFoundError(
f"Model artifacts not found for experiment {experiment_id}"
)
# Load the model class dynamically
metadata_path = model_dir / "metadata.json"
with open(metadata_path, "r") as f:
metadata = json.load(f)
model_type = metadata["model_type"]
model_class = MODEL_REGISTRY[model_type]
# Load the complete model
loaded_model = model_class.load(str(model_path))
logging.info(f"Loaded model: {metadata['model_name']}")
logging.info(f" Type: {model_type}")
logging.info(f" Accuracy: {metadata['test_accuracy']:.4f}")
return loaded_model
def list_saved_models(self) -> pd.DataFrame:
"""
List all saved model artifacts.
"""
models_data = []
for model_dir in self.models_dir.iterdir():
if model_dir.is_dir():
metadata_path = model_dir / "metadata.json"
if metadata_path.exists():
try:
with open(metadata_path, "r") as f:
metadata = json.load(f)
models_data.append(metadata)
except Exception as e:
logging.warning(
f"Could not read metadata for {model_dir.name}: {e}"
)
if not models_data:
logging.info("No saved models found.")
return pd.DataFrame()
df = pd.DataFrame(models_data)
# Format the display
display_columns = [
"model_name",
"model_type",
"features",
"test_accuracy",
"test_f1",
"training_date",
]
available_columns = [col for col in display_columns if col in df.columns]
return df[available_columns].sort_values("training_date", ascending=False)
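# Training sketch, assuming a configured environment and featured data on
# disk; the model name and parameters below are illustrative.
trainer = ModelTrainer()
exp_id = trainer.train_single_model(
    model_name="demo_lr",
    model_type="logistic_regression",
    features=["full_name", "name_endings"],
    model_params={"max_features": 20000},
)
loaded = trainer.load_trained_model(exp_id)
print(trainer.list_saved_models().head())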
+72
View File
@@ -0,0 +1,72 @@
from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from ners.research.neural_network_model import NeuralNetworkModel
class BiGRUModel(NeuralNetworkModel):
"""Bidirectional GRU model for name classification"""
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
params = kwargs
model = Sequential(
[
                # Mask padding tokens so the recurrent layers ignore padded
                # timesteps throughout the stack.
Embedding(
input_dim=vocab_size,
output_dim=params.get("embedding_dim", 64),
mask_zero=True,
),
# First recurrent block returns full sequences to allow stacking.
# Moderate dropout + optional recurrent_dropout to reduce overfitting
# on short names while retaining temporal signal.
Bidirectional(
GRU(
params.get("gru_units", 32),
return_sequences=True,
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Second GRU summarizes to the last hidden state (no return_sequences),
# capturing bidirectional context efficiently for classification.
Bidirectional(
GRU(
params.get("gru_units", 32),
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Small dense head; ReLU + dropout for capacity and regularization.
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary gender classification.
Dense(2, activation="softmax", dtype="float32"),
]
)
model.compile(
loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
)
return model
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = self._collect_text_corpus(X)
if self.tokenizer is None:
self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
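# What prepare_features yields in isolation: word-level indices padded "post"
# to max_len. The strings are toy values; index 1 is reserved for <OOV>.
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

tok = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
tok.fit_on_texts(["ilunga kalala", "ngalula"])
seqs = tok.texts_to_sequences(["ilunga kalala", "ngalula"])
print(pad_sequences(seqs, maxlen=6, padding="post"))
# [[2 3 0 0 0 0]
#  [4 0 0 0 0 0]]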
+86
View File
@@ -0,0 +1,86 @@
from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import (
Embedding,
Conv1D,
MaxPooling1D,
GlobalMaxPooling1D,
Dense,
Dropout,
SpatialDropout1D,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from ners.research.neural_network_model import NeuralNetworkModel
class CNNModel(NeuralNetworkModel):
"""1D Convolutional Neural Network for character patterns"""
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
"""Build CNN model with known vocabulary size"""
params = kwargs
model = Sequential(
[
# Learn char/subword embeddings; spatial dropout regularizes across channels
# to make the model robust to noisy characters and transliteration.
Embedding(
input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)
),
SpatialDropout1D(rate=params.get("embedding_dropout", 0.1)),
# Small kernels capture short n-gram like patterns; padding='same' keeps
# sequence length stable for simpler pooling behavior.
Conv1D(
filters=params.get("filters", 64),
kernel_size=params.get("kernel_size", 3),
activation="relu",
padding="same",
),
# Downsample to gain some position invariance and reduce computation.
MaxPooling1D(pool_size=2),
# Second conv layer to compose higher-level motifs (e.g., suffix+vowel).
Conv1D(
filters=params.get("filters", 64),
kernel_size=params.get("kernel_size", 3),
activation="relu",
padding="same",
),
# Global max pooling picks strongest motif evidence anywhere in the name.
GlobalMaxPooling1D(),
# Compact dense head with dropout to control overfitting.
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary classification.
Dense(2, activation="softmax", dtype="float32"),
]
)
model.compile(
loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
)
return model
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
"""Prepare sequences for CNN using extracted features"""
# X here contains the features already extracted by FeatureExtractor
# Get text data from extracted features - use character level for CNN
text_data = self._collect_text_corpus(X)
# Initialize character-level tokenizer
if self.tokenizer is None:
self.tokenizer = Tokenizer(char_level=True, lower=True, oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get(
"max_len", 20
) # Longer for character level
return pad_sequences(sequences, maxlen=max_len, padding="post")
+110
View File
@@ -0,0 +1,110 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from ners.research.experiment import ExperimentConfig
from ners.research.traditional_model import TraditionalModel
class EnsembleModel(TraditionalModel):
"""Ensemble model combining multiple base models"""
@property
def architecture(self) -> str:
"""Return the architecture type"""
return "ensemble"
def __init__(self, config: ExperimentConfig):
super().__init__(config)
self.base_models = []
self.model_weights = None
def build_model(self) -> BaseEstimator:
params = self.config.model_params
base_model_types = params.get(
"base_models", ["logistic_regression", "random_forest", "naive_bayes"]
)
# Create base models with simplified configs; diverse vectorizers/classifiers
# encourage complementary errors that voting can average out.
estimators = []
for model_type in base_model_types:
if model_type == "logistic_regression":
model = Pipeline(
[
(
"vectorizer",
CountVectorizer(
analyzer="char", ngram_range=(2, 4), max_features=5000
),
),
(
"classifier",
LogisticRegression(
max_iter=1000, random_state=self.config.random_seed
),
),
]
)
estimators.append(("logistic_regression", model))
elif model_type == "random_forest":
model = Pipeline(
[
(
"vectorizer",
TfidfVectorizer(
analyzer="char", ngram_range=(2, 3), max_features=3000
),
),
(
"classifier",
RandomForestClassifier(
n_estimators=50, random_state=self.config.random_seed
),
),
]
)
estimators.append(("rf", model))
elif model_type == "naive_bayes":
model = Pipeline(
[
(
"vectorizer",
CountVectorizer(
analyzer="char", ngram_range=(1, 3), max_features=4000
),
),
("classifier", MultinomialNB()),
]
)
estimators.append(("nb", model))
# Soft voting averages probabilities (preferred when members are calibrated);
# hard voting uses majority class. Parallelize member predictions.
voting_type = params.get("voting", "soft") # 'hard' or 'soft'
return VotingClassifier(
estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1)
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_features.append(X[feature_type.value].astype(str))
if len(text_features) == 1:
return text_features[0].values
else:
combined = text_features[0].astype(str)
for feature in text_features[1:]:
combined = combined + " " + feature.astype(str)
return combined.values
+115
View File
@@ -0,0 +1,115 @@
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from ners.research.traditional_model import TraditionalModel
class LightGBMModel(TraditionalModel):
"""LightGBM with engineered features"""
def __init__(self, config):
super().__init__(config)
# Store vectorizers and encoders to ensure consistent feature space
self.vectorizers = {}
self.label_encoders = {}
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Optional GPU acceleration
use_gpu = bool(params.get("use_gpu", False))
device = params.get("device", "gpu" if use_gpu else "cpu")
gpu_platform_id = params.get("gpu_platform_id", None)
gpu_device_id = params.get("gpu_device_id", None)
# Leaf-wise boosted trees excel on sparse/categorical mixes; binary objective
# and parallelism improve training speed for this task.
return lgb.LGBMClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", -1),
learning_rate=params.get("learning_rate", 0.1),
num_leaves=params.get("num_leaves", 31),
subsample=params.get("subsample", 0.8),
colsample_bytree=params.get("colsample_bytree", 0.8),
random_state=self.config.random_seed,
objective=params.get("objective", "binary"),
n_jobs=params.get("n_jobs", -1),
verbose=2,
device=device,
gpu_platform_id=gpu_platform_id,
gpu_device_id=gpu_device_id,
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
column = X[feature_type.value]
if feature_type.value in ["name_length", "word_count"]:
# Numerical features
features.append(column.fillna(0).values.reshape(-1, 1))
elif feature_type.value in ["full_name", "native_name", "surname"]:
# Character-level features for names
feature_key = f"vectorizer_{feature_type.value}"
if feature_key not in self.vectorizers:
# First time - create and fit vectorizer
self.vectorizers[feature_key] = CountVectorizer(
analyzer="char", ngram_range=(2, 3), max_features=50
)
char_features = (
self.vectorizers[feature_key]
.fit_transform(column.fillna("").astype(str))
.toarray()
)
else:
# Subsequent times - use existing vectorizer
char_features = (
self.vectorizers[feature_key]
.transform(column.fillna("").astype(str))
.toarray()
)
features.append(char_features)
else:
# Categorical features
feature_key = f"encoder_{feature_type.value}"
if feature_key not in self.label_encoders:
# First time - create and fit encoder
self.label_encoders[feature_key] = LabelEncoder()
encoded = self.label_encoders[feature_key].fit_transform(
column.fillna("unknown").astype(str)
)
else:
# Subsequent times - use existing encoder
# Handle unseen labels by mapping them to a default value
column_clean = column.fillna("unknown").astype(str)
# Get the classes the encoder was trained on
known_classes = set(self.label_encoders[feature_key].classes_)
# Map unseen values to "unknown" if it exists, otherwise to the first class
if "unknown" in known_classes:
default_class = "unknown"
else:
default_class = self.label_encoders[feature_key].classes_[0]
# Replace unseen values with default
column_mapped = column_clean.apply(
lambda x: x if x in known_classes else default_class
)
encoded = self.label_encoders[feature_key].transform(
column_mapped
)
features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
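# The unseen-category fallback above, shown in isolation with toy values:
# anything outside the fitted classes maps to "unknown" before transform.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder().fit(["kinshasa", "unknown"])
col = pd.Series(["kinshasa", "haut-katanga"])   # second value unseen at fit time
known = set(enc.classes_)
mapped = col.apply(lambda v: v if v in known else "unknown")
print(enc.transform(mapped))                    # [0 1]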
@@ -0,0 +1,53 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from ners.research.traditional_model import TraditionalModel
class LogisticRegressionModel(TraditionalModel):
"""Logistic Regression with character n-grams"""
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Character n-grams are strong signals for names; (2,5) balances
# capturing prefixes/suffixes with tractable feature size.
vectorizer = CountVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 5)),
max_features=params.get("max_features", 10000),
)
        # liblinear handles sparse, small-to-medium problems well; note that it
        # ignores n_jobs (OvR parallelism applies only to other solvers).
        # class_weight can mitigate class imbalance.
classifier = LogisticRegression(
max_iter=params.get("max_iter", 1000),
random_state=self.config.random_seed,
verbose=2,
solver=params.get("solver", "liblinear"),
n_jobs=params.get("n_jobs", -1),
class_weight=params.get("class_weight", None),
)
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
# Collect text-based features from the extracted features DataFrame
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_features.append(X[feature_type.value].astype(str))
# Combine text features
if len(text_features) == 1:
return text_features[0].values
else:
# Concatenate multiple text features with separator
combined = text_features[0].astype(str)
for feature in text_features[1:]:
combined = combined + " " + feature.astype(str)
return combined.values
+71
View File
@@ -0,0 +1,71 @@
from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from ners.research.neural_network_model import NeuralNetworkModel
class LSTMModel(NeuralNetworkModel):
"""LSTM model for sequence learning"""
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
params = kwargs
model = Sequential(
[
# Mask padding tokens; required for LSTM to ignore padded timesteps.
Embedding(
input_dim=vocab_size,
output_dim=params.get("embedding_dim", 64),
mask_zero=True,
),
# Stacked bidirectional LSTMs: first returns sequences to feed the next.
# Dropout/recurrent_dropout mitigate overfitting on short sequences.
Bidirectional(
LSTM(
params.get("lstm_units", 32),
return_sequences=True,
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Second LSTM condenses sequence to a fixed vector for classification.
Bidirectional(
LSTM(
params.get("lstm_units", 32),
dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0),
)
),
# Compact dense head with dropout; sufficient capacity for name signals.
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary classification.
Dense(2, activation="softmax", dtype="float32"),
]
)
model.compile(
loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
)
return model
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = self._collect_text_corpus(X)
# Initialize tokenizer if needed
if self.tokenizer is None:
self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
# Convert to sequences
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
@@ -0,0 +1,42 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from ners.research.traditional_model import TraditionalModel
class NaiveBayesModel(TraditionalModel):
"""Multinomial Naive Bayes with character n-grams"""
def build_model(self) -> BaseEstimator:
params = self.config.model_params
        # Bag-of-character-ngrams aligns with Multinomial NB assumptions; the
        # (2, 5) default captures prefix/suffix cues with a manageable vocabulary.
vectorizer = CountVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 5)),
max_features=params.get("max_features", 8000),
)
# Laplace smoothing (alpha) counters zero counts for rare n-grams.
classifier = MultinomialNB(alpha=params.get("alpha", 1.0))
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_features.append(X[feature_type.value].astype(str))
if len(text_features) == 1:
return text_features[0].values
else:
combined = text_features[0].astype(str)
for feature in text_features[1:]:
combined = combined + " " + feature.astype(str)
return combined.values
@@ -0,0 +1,71 @@
from typing import Dict
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from ners.research.traditional_model import TraditionalModel
class RandomForestModel(TraditionalModel):
"""Random Forest with engineered features"""
def __init__(self, config):
super().__init__(config)
# Persist encoders so categorical mappings stay consistent.
self.label_encoders: Dict[str, LabelEncoder] = {}
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Tree ensemble is robust to mixed numeric/categorical encodings; parallelize
# across trees for speed. Keep depth moderate for generalisation.
return RandomForestClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", None),
random_state=self.config.random_seed,
verbose=2,
n_jobs=params.get("n_jobs", -1),
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
column = X[feature_type.value]
# Handle different feature types
if feature_type.value in ["name_length", "word_count"]:
# Numerical features
features.append(column.fillna(0).values.reshape(-1, 1))
else:
# Categorical features (encode them persistently)
feature_key = f"encoder_{feature_type.value}"
if feature_key not in self.label_encoders:
self.label_encoders[feature_key] = LabelEncoder()
encoded = self.label_encoders[feature_key].fit_transform(
column.fillna("unknown").astype(str)
)
else:
encoder = self.label_encoders[feature_key]
column_clean = column.fillna("unknown").astype(str)
known_classes = set(encoder.classes_)
default_class = (
"unknown"
if "unknown" in known_classes
else encoder.classes_[0]
)
column_mapped = column_clean.apply(
lambda value: value
if value in known_classes
else default_class
)
encoded = encoder.transform(column_mapped)
features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
+52
View File
@@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from ners.research.traditional_model import TraditionalModel
class SVMModel(TraditionalModel):
"""Support Vector Machine with character n-grams and RBF kernel"""
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# TF-IDF downweights very common patterns; char n-grams (2,4) are effective
# for distinguishing name morphology under RBF kernels.
vectorizer = TfidfVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 4)),
max_features=params.get("max_features", 5000),
)
# RBF kernel captures non-linear interactions between n-grams; probability=True
# adds calibration at some cost. Larger cache helps speed kernel computations.
classifier = SVC(
kernel=params.get("kernel", "rbf"),
C=params.get("C", 1.0),
gamma=params.get("gamma", "scale"),
probability=True, # Enable probability prediction
class_weight=params.get("class_weight", None),
cache_size=params.get("cache_size", 1000),
random_state=self.config.random_seed,
verbose=2,
)
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_features.append(X[feature_type.value].astype(str))
if len(text_features) == 1:
return text_features[0].values
else:
combined = text_features[0].astype(str)
for feature in text_features[1:]:
combined = combined + " " + feature.astype(str)
return combined.values
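# --- Hedged sketch (illustration only): what the char (2, 4) TF-IDF built in
# build_model sees for two toy names; the names here are hypothetical.
if __name__ == "__main__":
    vec = TfidfVectorizer(analyzer="char", ngram_range=(2, 4))
    X_toy = vec.fit_transform(["mukendi", "kabamba"])
    print(X_toy.shape)  # (2, number of distinct 2-4 character n-grams)
    print(sorted(vec.get_feature_names_out())[:8])  # e.g. 'ab', 'aba', 'abam', ...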
@@ -0,0 +1,90 @@
from typing import Any
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import (
Input,
Embedding,
Dense,
GlobalAveragePooling1D,
MultiHeadAttention,
Dropout,
LayerNormalization,
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from ners.research.neural_network_model import NeuralNetworkModel
class TransformerModel(NeuralNetworkModel):
"""Transformer-based model"""
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
        params = kwargs
        # Build Transformer model. Resolve shared sizes once so every layer
        # agrees; the max_len default must match prepare_features, which pads
        # sequences to max_len (default 6).
        max_len = params.get("max_len", 6)
        embedding_dim = params.get("embedding_dim", 64)
        inputs = Input(shape=(max_len,))
        x = Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=max_len,
            mask_zero=True,
        )(inputs)
        # Add positional encoding
        positions = tf.range(start=0, limit=max_len, delta=1)
        pos_embedding = Embedding(
            input_dim=max_len,
            output_dim=embedding_dim,
        )(positions)
        x = x + pos_embedding
x = self._transformer_encoder(x, params)
x = GlobalAveragePooling1D()(x)
x = Dense(32, activation="relu")(x)
x = Dropout(params.get("dropout", 0.1))(x)
outputs = Dense(2, activation="softmax", dtype="float32")(x)
model = Model(inputs, outputs)
model.compile(
optimizer="adam",
loss="sparse_categorical_crossentropy",
metrics=["accuracy"],
)
return model
@classmethod
def _transformer_encoder(cls, x, cfg_params):
"""Transformer encoder block"""
attn = MultiHeadAttention(
num_heads=cfg_params.get("transformer_num_heads", 2),
key_dim=cfg_params.get("transformer_head_size", 64),
dropout=cfg_params.get("attn_dropout", 0.1),
)(x, x)
x = LayerNormalization(epsilon=1e-6)(
x + Dropout(cfg_params.get("dropout", 0.1))(attn)
)
ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
ff = Dense(x.shape[-1])(ff)
return LayerNormalization(epsilon=1e-6)(
x + Dropout(cfg_params.get("dropout", 0.1))(ff)
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = self._collect_text_corpus(X)
# Initialize tokenizer if needed
if self.tokenizer is None:
self.tokenizer = Tokenizer(oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
# Convert to sequences
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
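# --- Hedged sketch (illustration only): the tokenizer/padding round trip that
# feeds the model above, on hypothetical names; "kasongo" falls back to <OOV>.
if __name__ == "__main__":
    tok = Tokenizer(oov_token="<OOV>")
    tok.fit_on_texts(["jean baptiste", "marie kabamba"])
    seqs = tok.texts_to_sequences(["jean kasongo"])
    print(pad_sequences(seqs, maxlen=6, padding="post"))  # e.g. [[2 1 0 0 0 0]]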
+115
View File
@@ -0,0 +1,115 @@
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from ners.research.traditional_model import TraditionalModel
class XGBoostModel(TraditionalModel):
"""XGBoost with engineered features and character embeddings"""
def __init__(self, config):
super().__init__(config)
# Store vectorizers and encoders to ensure consistent feature space
self.vectorizers = {}
self.label_encoders = {}
def build_model(self) -> BaseEstimator:
params = self.config.model_params
        # Optional GPU acceleration. Note: "gpu_hist" and "predictor" follow
        # the XGBoost 1.x API; XGBoost 2.x replaces them with device="cuda".
use_gpu = bool(params.get("use_gpu", False))
default_tree_method = "gpu_hist" if use_gpu else "hist"
tree_method = params.get("tree_method", default_tree_method)
predictor = params.get(
"predictor", "gpu_predictor" if tree_method.startswith("gpu") else "auto"
)
# Histogram-based trees and parallelism provide fast training; default
# logloss metric suits binary classification of gender.
return xgb.XGBClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", 6),
learning_rate=params.get("learning_rate", 0.1),
subsample=params.get("subsample", 0.8),
colsample_bytree=params.get("colsample_bytree", 0.8),
random_state=self.config.random_seed,
eval_metric="logloss",
n_jobs=params.get("n_jobs", -1),
tree_method=tree_method,
predictor=predictor,
verbosity=2,
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
column = X[feature_type.value]
if feature_type.value in ["name_length", "word_count"]:
# Numerical features
features.append(column.fillna(0).values.reshape(-1, 1))
elif feature_type.value in ["full_name", "native_name", "surname"]:
# Character-level features for names
feature_key = f"vectorizer_{feature_type.value}"
if feature_key not in self.vectorizers:
# First time - create and fit vectorizer
self.vectorizers[feature_key] = CountVectorizer(
analyzer="char", ngram_range=(2, 3), max_features=100
)
char_features = (
self.vectorizers[feature_key]
.fit_transform(column.fillna("").astype(str))
.toarray()
)
else:
# Subsequent times - use existing vectorizer
char_features = (
self.vectorizers[feature_key]
.transform(column.fillna("").astype(str))
.toarray()
)
features.append(char_features)
else:
# Categorical features
feature_key = f"encoder_{feature_type.value}"
if feature_key not in self.label_encoders:
# First time - create and fit encoder
self.label_encoders[feature_key] = LabelEncoder()
encoded = self.label_encoders[feature_key].fit_transform(
column.fillna("unknown").astype(str)
)
else:
# Subsequent times - use existing encoder
# Handle unseen labels by mapping them to a default value
column_clean = column.fillna("unknown").astype(str)
# Get the classes the encoder was trained on
known_classes = set(self.label_encoders[feature_key].classes_)
# Map unseen values to "unknown" if it exists, otherwise to the first class
if "unknown" in known_classes:
default_class = "unknown"
else:
default_class = self.label_encoders[feature_key].classes_[0]
# Replace unseen values with default
column_mapped = column_clean.apply(
lambda x: x if x in known_classes else default_class
)
encoded = self.label_encoders[feature_key].transform(
column_mapped
)
features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
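# --- Hedged sketch (illustration only): the char-level CountVectorizer used
# for name columns above, shown standalone on hypothetical names.
if __name__ == "__main__":
    cv = CountVectorizer(analyzer="char", ngram_range=(2, 3), max_features=100)
    M = cv.fit_transform(["mukendi", "mbuyi"]).toarray()
    print(cv.get_feature_names_out())  # retained 2-3 character n-grams
    print(M)  # one row per name, one count column per n-gram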
+377
View File
@@ -0,0 +1,377 @@
import logging
from abc import abstractmethod
from typing import Any, Dict, List, Optional
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from ners.research.base_model import BaseModel
from ners.research.experiment.feature_extractor import FeatureExtractor
class NeuralNetworkModel(BaseModel):
"""Base class for neural network models (TensorFlow/Keras)"""
@property
def architecture(self) -> str:
return "neural_network"
@abstractmethod
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
"""Build neural network model with known vocabulary size"""
pass
def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
"""Fit the neural network model with deferred building"""
logging.info(f"Training {self.__class__.__name__}")
# Best-effort GPU configuration for TensorFlow when available
# - Enables memory growth to avoid pre-allocating all VRAM
# - Optionally enables mixed precision if requested via model params
try:
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
gpus = tf.config.list_physical_devices("GPU")
if gpus:
for gpu in gpus:
try:
tf.config.experimental.set_memory_growth(gpu, True)
except Exception:
pass
if enable_mixed:
try:
tf.keras.mixed_precision.set_global_policy("mixed_float16")
logging.info("Enabled TensorFlow mixed precision (float16)")
except Exception as e:
logging.warning(f"Could not enable mixed precision: {e}")
else:
if requested_gpu:
logging.warning(
"Requested GPU but no TensorFlow GPU device is available."
)
except Exception as e:
# Keep silent in non-TF environments / non-NN workflows
logging.debug(f"TensorFlow GPU setup skipped: {e}")
# Setup feature extraction
if self.feature_extractor is None:
self.feature_extractor = FeatureExtractor(
self.config.features, self.config.feature_params
)
# Extract and prepare features (this will also initialize tokenizer)
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
# Sanitize any out-of-range indices to avoid embedding scatter errors
X_prepared = self._sanitize_sequences(X_prepared)
# Encode labels
if self.label_encoder is None:
self.label_encoder = LabelEncoder()
y_encoded = self.label_encoder.fit_transform(y)
else:
y_encoded = self.label_encoder.transform(y)
# Now we can build the model with known vocab size
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
logging.info(f"Vocabulary size: {vocab_size}")
# Get additional model parameters
self.model = self.build_model_with_vocab(
vocab_size=vocab_size, **self.config.model_params
)
# Train the neural network
logging.info(
f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
)
        logging.info(f"Sample prepared row: {X_prepared[0]}")
logging.info(f"Model parameters: {self.config.model_params}")
history = self.model.fit(
X_prepared,
y_encoded,
epochs=self.config.model_params.get("epochs", 10),
batch_size=self.config.model_params.get("batch_size", 64),
validation_split=self.config.model_params.get("validation_split", 0.1),
verbose=2,
)
# Store training history
self.training_history = {
"accuracy": history.history["accuracy"],
"loss": history.history["loss"],
"val_accuracy": history.history.get("val_accuracy", []),
"val_loss": history.history.get("val_loss", []),
}
self.is_fitted = True
return self
def _sanitize_sequences(self, sequences: np.ndarray) -> np.ndarray:
"""Clamp invalid token indices to OOV and ensure int32 dtype.
This prevents rare cases where malformed inputs or dtype issues introduce
large or negative indices which can trigger TensorScatterUpdate errors
during embedding updates on GPU.
"""
try:
if sequences is None:
return sequences
arr = np.asarray(sequences)
# Ensure integer dtype for embedding lookups
if not np.issubdtype(arr.dtype, np.integer):
arr = arr.astype(np.int64, copy=False)
if self.tokenizer is not None and hasattr(self.tokenizer, "word_index"):
# Use the actual max index present in the tokenizer mapping
if self.tokenizer.word_index:
max_idx = max(self.tokenizer.word_index.values())
else:
max_idx = 0
# OOV token index if available, else fall back to 1
oov_index = self.tokenizer.word_index.get(
getattr(self.tokenizer, "oov_token", "<OOV>"), 1
)
# Keep zeros (padding) untouched; clamp negatives and > max_idx to OOV
invalid_mask = (arr < 0) | (arr > max_idx)
# Avoid turning zeros into OOV
invalid_mask &= arr != 0
if invalid_mask.any():
arr[invalid_mask] = oov_index
# Use int32 for TF embedding ops compatibility
return arr.astype(np.int32, copy=False)
except Exception as e:
logging.debug(f"Sequence sanitization skipped due to: {e}")
return sequences
def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
"""Combine configured textual features into one string per record."""
column_names = [
feature.value
for feature in self.config.features
if feature.value in X.columns
]
if not column_names:
raise ValueError(
"No configured text features found in the provided DataFrame."
)
text_frame = X[column_names].fillna("").astype(str)
if len(column_names) == 1:
return text_frame.iloc[:, 0].tolist()
combined_rows = []
for row in text_frame.itertuples(index=False):
tokens = [value for value in row if value]
combined_rows.append(" ".join(tokens))
return combined_rows
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> dict[str, np.floating[Any]]:
# Ensure TF GPU/mixed-precision config also applies to CV runs
try:
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
gpus = tf.config.list_physical_devices("GPU")
if gpus:
for gpu in gpus:
try:
tf.config.experimental.set_memory_growth(gpu, True)
except Exception:
pass
if enable_mixed:
try:
tf.keras.mixed_precision.set_global_policy("mixed_float16")
except Exception:
pass
else:
if requested_gpu:
logging.warning("Requested GPU for CV but none is available.")
except Exception:
pass
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
X_prepared = self._sanitize_sequences(X_prepared)
y_encoded = self.label_encoder.transform(y)
cv = StratifiedKFold(
n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
)
accuracies = []
precisions = []
recalls = []
f1_scores = []
# Get vocabulary size and model parameters
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
        max_len = self.config.model_params.get("max_len", 6)
        # Merge so "max_len" is not passed twice when it already appears in
        # model_params (a duplicate keyword argument would raise TypeError).
        fold_params = {**self.config.model_params, "max_len": max_len}
        for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)):
            # Create a fresh model for each fold using build_model_with_vocab
            fold_model = self.build_model_with_vocab(vocab_size=vocab_size, **fold_params)
# Train on fold
if hasattr(fold_model, "fit"):
fold_model.fit(
X_prepared[train_idx],
y_encoded[train_idx],
epochs=self.config.model_params.get("epochs", 10),
batch_size=self.config.model_params.get("batch_size", 32),
verbose=0,
)
# Predict on validation
y_pred = fold_model.predict(X_prepared[val_idx])
if len(y_pred.shape) > 1:
y_pred = y_pred.argmax(axis=1)
# Calculate metrics
acc = accuracy_score(y_encoded[val_idx], y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
y_encoded[val_idx], y_pred, average="weighted"
)
accuracies.append(acc)
precisions.append(prec)
recalls.append(rec)
f1_scores.append(f1)
return {
"accuracy": np.mean(accuracies),
"precision": np.mean(precisions),
"recall": np.mean(recalls),
"f1": np.mean(f1_scores),
}
def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}")
# Ensure TF GPU/mixed-precision config also applies here
try:
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
gpus = tf.config.list_physical_devices("GPU")
if gpus:
for gpu in gpus:
try:
tf.config.experimental.set_memory_growth(gpu, True)
except Exception:
pass
if enable_mixed:
try:
tf.keras.mixed_precision.set_global_policy("mixed_float16")
except Exception:
pass
else:
if requested_gpu:
logging.warning(
"Requested GPU for learning curve but none is available."
)
except Exception:
pass
if train_sizes is None:
train_sizes = [0.1, 0.3, 0.5, 0.7, 1.0]
learning_curve_data = {
"train_sizes": [],
"train_scores": [],
"val_scores": [],
"train_scores_std": [],
"val_scores_std": [],
}
# Prepare features and get vocabulary size
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
X_prepared = self._sanitize_sequences(X_prepared)
y_encoded = self.label_encoder.transform(y)
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
max_len = self.config.model_params.get("max_len", 6)
# Split data once for validation
X_train_full, X_val, y_train_full, y_val = train_test_split(
X_prepared,
y_encoded,
test_size=0.2,
random_state=self.config.random_seed,
stratify=y_encoded,
)
for size in train_sizes:
train_size = int(len(X_train_full) * size)
if train_size < 10: # Minimum training size
continue
# Sample training data
indices = np.random.choice(len(X_train_full), train_size, replace=False)
X_train_subset = X_train_full[indices]
y_train_subset = y_train_full[indices]
# Train multiple models for variance estimation
train_scores = []
val_scores = []
for seed in range(3): # 3 runs for variance
                # Build a fresh model; merge params so "max_len" is not passed
                # twice when it already appears in model_params.
                model = self.build_model_with_vocab(
                    vocab_size=vocab_size,
                    **{**self.config.model_params, "max_len": max_len},
                )
# Train model
if hasattr(model, "fit"):
model.fit(
X_train_subset,
y_train_subset,
epochs=self.config.model_params.get("epochs", 10),
batch_size=self.config.model_params.get("batch_size", 32),
validation_data=(X_val, y_val),
verbose=0,
)
# Evaluate
train_pred = model.predict(X_train_subset)
val_pred = model.predict(X_val)
train_acc = accuracy_score(y_train_subset, train_pred.argmax(axis=1))
val_acc = accuracy_score(y_val, val_pred.argmax(axis=1))
train_scores.append(train_acc)
val_scores.append(val_acc)
learning_curve_data["train_sizes"].append(train_size)
learning_curve_data["train_scores"].append(np.mean(train_scores))
learning_curve_data["val_scores"].append(np.mean(val_scores))
learning_curve_data["train_scores_std"].append(np.std(train_scores))
learning_curve_data["val_scores_std"].append(np.std(val_scores))
self.learning_curve_data = learning_curve_data
return learning_curve_data
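# --- Hedged sketch (illustration only): the clamping _sanitize_sequences
# performs, on a raw array; padding zeros are kept, while negatives and
# indices above max_idx map to the OOV index (assumed to be 1 here).
if __name__ == "__main__":
    arr = np.array([[0, 3, 99, -2]])
    max_idx, oov_index = 5, 1
    invalid = ((arr < 0) | (arr > max_idx)) & (arr != 0)
    arr[invalid] = oov_index
    print(arr.astype(np.int32))  # [[0 3 1 1]]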
+1
View File
@@ -0,0 +1 @@
LETTERS = "abcdefghijklmnopqrstuvwxyz"
+54
View File
@@ -0,0 +1,54 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ners.research.statistics.utils import LETTERS, build_letter_frequencies
def plot_transition_matrix(ax, df_probs, title=""):
hm = sns.heatmap(
df_probs.loc[list(LETTERS), list(LETTERS)],
cmap="Reds",
annot=False,
cbar=False,
ax=ax,
)
ax.set_title(title, fontsize=12)
return hm
def plot_letter_frequencies(males, females, sort_values=False, title="names"):
# Compute frequencies
L_m = build_letter_frequencies(males["name"]).set_index("letter")["freq"]
L_f = build_letter_frequencies(females["name"]).set_index("letter")["freq"]
# Combine into one DataFrame
df_plot = pd.DataFrame({"Male": L_m, "Female": L_f}).fillna(0).reset_index()
df_plot.to_csv(f"../assets/{title}_letter_frequencies.csv", index=False)
# Optional sorting
if sort_values:
df_plot = df_plot.sort_values("Male", ascending=False)
# Plot side-by-side bars
x = np.arange(len(df_plot))
w = 0.4
fig, ax = plt.subplots(figsize=(16, 6))
ax.bar(
x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8
)
ax.bar(
x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8
)
ax.set_xticks(x)
ax.set_xticklabels(df_plot["letter"])
ax.set_ylabel("Frequency")
ax.set_xlabel("Letter")
ax.set_title(f"{title} - Letter Frequencies")
ax.legend()
ax.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()
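# --- Hedged sketch (illustration only): a minimal call with toy frames.
# Assumes a "name" column and an existing ../assets directory, since the
# function writes a CSV there before plotting.
if __name__ == "__main__":
    males = pd.DataFrame({"name": ["Jean", "Joseph"]})
    females = pd.DataFrame({"name": ["Marie", "Grace"]})
    plot_letter_frequencies(males, females, sort_values=True, title="toy")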
+276
View File
@@ -0,0 +1,276 @@
import re
import unicodedata
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean
from scipy.stats import entropy
from collections import Counter
from typing import Any, Dict, Literal
LETTERS = "abcdefghijklmnopqrstuvwxyz"
START_TOKEN = "^"
END_TOKEN = "$"
def normalize_letters(s):
"""Normalize accents -> ascii, lowercase, keep only a-z."""
s = str(s)
s = unicodedata.normalize("NFKD", s)
s = s.encode("ascii", errors="ignore").decode("utf-8")
s = s.lower()
s = re.sub(r"[^a-z]", "", s)
return s
def build_category_distribution(df: pd.DataFrame) -> pd.DataFrame:
return (
df.groupby("province")["identified_category"]
        .value_counts(normalize=True)  # proportions within each province
        .unstack(fill_value=0)  # reshape into one column per category
)
def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
# Normalize + split once (vectorized)
s = df[source].fillna("").astype(str)
s = s.str.lower().str.replace(r"[^\w'\-]+", " ", regex=True).str.strip().str.split()
# Explode the token list into rows under `target`
out = df.assign(**{target: s}).explode(target, ignore_index=True)
# Drop NA/empty tokens and strip whitespace
out[target] = out[target].astype(str).str.strip()
out = out[out[target].ne("")].dropna(subset=[target]).reset_index(drop=True)
return out
def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
# Normalize: lowercase, remove non-letters, concatenate all into one string
s = (
series.astype(str)
.str.lower()
.str.replace(r"[^a-z]", "", regex=True)
.str.cat(sep="")
)
# Convert string into Series of characters
chars = pd.Series(list(s))
# Count letters and ensure all letters are present
out = (
chars.value_counts(normalize=False)
.reindex(list(LETTERS), fill_value=0)
.rename_axis("letter")
.reset_index(name="count")
)
# Relative frequency
total = out["count"].sum()
out["freq"] = out["count"] / (total if total > 0 else 1)
return out
def build_transition_probabilities(names: pd.Series, alpha: float = 0.0) -> dict:
# 1) Normalize
names = names.astype(str).str.lower().str.replace(rf"[^{LETTERS}]", "", regex=True)
names = names[names.str.len() > 0]
# 2) Prepare sequences
sequences = (START_TOKEN + names + END_TOKEN).tolist()
# 3) Tokens and indices
tokens = [START_TOKEN] + list(LETTERS) + [END_TOKEN]
index = {t: i for i, t in enumerate(tokens)}
V = len(tokens)
# 4) ASCII lookup table (O(1) char -> idx); others -> -1
lut = np.full(128, -1, dtype=np.int32)
for ch, i in index.items():
lut[ord(ch)] = i
    # 5) Concatenate with a separator that's not in the vocab to kill cross-boundary pairs
concat = (" ".join(sequences)).encode("ascii", errors="ignore")
# 6) Map bytes to indices
arr = np.frombuffer(concat, dtype=np.uint8)
idx = lut[arr]
# 7) Build bigram pairs; drop invalid ones (separator & OOV)
a = idx[:-1]
b = idx[1:]
mask = (a >= 0) & (b >= 0)
a, b = a[mask], b[mask]
# 8) Count with a single bincount
lin = a * V + b
counts = np.bincount(lin, minlength=V * V).reshape(V, V)
# 9) Optional Laplace smoothing
if alpha and alpha > 0:
counts = counts + alpha
# 10) Row-normalize to probabilities
row_sums = counts.sum(axis=1, keepdims=True)
    # Avoid division by zero for rows that never occur
    probs = counts / np.where(row_sums == 0, 1.0, row_sums)
# 11) DataFrames
df_counts = pd.DataFrame(counts, index=tokens, columns=tokens)
df_probs = pd.DataFrame(probs, index=tokens, columns=tokens)
return {
"tokens": tokens,
"index": index,
"counts": counts,
"df_counts": df_counts,
"probs": probs,
"df_probs": df_probs,
}
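# --- Hedged sketch (illustration only): bigram transitions for two toy names.
# Both "ana" and "ann" start with 'a', so P(a | ^) comes out as 1.0.
if __name__ == "__main__":
    res = build_transition_probabilities(pd.Series(["ana", "ann"]))
    print(res["df_probs"].loc[START_TOKEN, "a"])  # 1.0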
def build_transition_comparisons(
names_transitions: Dict[str, Any],
surnames_transitions: Dict[str, Any],
n_permutations: int = 1000,
) -> pd.DataFrame:
"""
Compares letter transition probability matrices for names and surnames using
various distance metrics and a permutation test for statistical significance.
"""
# Helper function to flatten and smooth matrices
def prepare_data(data):
return {"m": data["m"]["probs"].flatten(), "f": data["f"]["probs"].flatten()}
prepared_names = prepare_data(names_transitions)
prepared_surnames = prepare_data(surnames_transitions)
# Distance Metrics
names_l2 = euclidean(prepared_names["m"], prepared_names["f"])
surnames_l2 = euclidean(prepared_surnames["m"], prepared_surnames["f"])
kl_names_mf = entropy(prepared_names["m"] + 1e-12, prepared_names["f"] + 1e-12)
kl_names_fm = entropy(prepared_names["f"] + 1e-12, prepared_names["m"] + 1e-12)
kl_surnames_mf = entropy(
prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12
)
kl_surnames_fm = entropy(
prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12
)
jsd_names = 0.5 * (kl_names_mf + kl_names_fm)
jsd_surnames = 0.5 * (kl_surnames_mf + kl_surnames_fm)
# Permutation Test
def run_permutation_test(transitions):
# Flattened probabilities for male and female
P_m = transitions["m"]["probs"].flatten()
P_f = transitions["f"]["probs"].flatten()
# Calculate the observed JSD (our test statistic)
observed_jsd = 0.5 * (
entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12)
)
# Concatenate male and female counts
counts_m = transitions["m"]["counts"]
counts_f = transitions["f"]["counts"]
all_counts = np.concatenate((counts_m, counts_f), axis=1)
total_counts = counts_m.shape[1] + counts_f.shape[1]
permuted_jsds = []
for _ in range(n_permutations):
# Shuffle the columns (names) and split back into two groups
shuffled_indices = np.random.permutation(total_counts)
# Note: This is a simplified approach, assuming counts are
# structured per name. A more robust implementation would
# shuffle the actual names themselves.
permuted_counts_m = all_counts[:, shuffled_indices[: counts_m.shape[1]]]
permuted_counts_f = all_counts[:, shuffled_indices[counts_m.shape[1] :]]
# Re-calculate probabilities and JSD for the permuted groups
# Add a small epsilon to the denominator to prevent division by zero
epsilon = 1e-12
permuted_probs_m = permuted_counts_m / (
permuted_counts_m.sum(axis=0, keepdims=True) + epsilon
)
permuted_probs_f = permuted_counts_f / (
permuted_counts_f.sum(axis=0, keepdims=True) + epsilon
)
permuted_jsd = 0.5 * (
entropy(
permuted_probs_m.mean(axis=1) + 1e-12,
permuted_probs_f.mean(axis=1) + 1e-12,
)
+ entropy(
permuted_probs_f.mean(axis=1) + 1e-12,
permuted_probs_m.mean(axis=1) + 1e-12,
)
)
permuted_jsds.append(permuted_jsd)
# Calculate the p-value
p_value = np.mean(np.array(permuted_jsds) >= observed_jsd)
return p_value
names_p_value = run_permutation_test(names_transitions)
surnames_p_value = run_permutation_test(surnames_transitions)
out = pd.DataFrame(
{
"l2": [names_l2, surnames_l2],
"kl_mf": [kl_names_mf, kl_surnames_mf],
"kl_fm": [kl_names_fm, kl_surnames_fm],
"jsd": [jsd_names, jsd_surnames],
"permutation_p_value": [names_p_value, surnames_p_value],
},
index=["names", "surnames"],
)
return out
def build_ngrams_count(
df: pd.DataFrame,
n: int,
where: Literal["any", "prefix", "suffix"] = "any",
) -> pd.DataFrame:
    # Normalize and clean to a-z
names = df["name"].astype(str).str.lower().str.replace(r"[^a-z]", "", regex=True)
ngrams = []
if where == "any":
for s in names:
L = len(s)
if L >= n:
ngrams.extend(s[i : i + n] for i in range(L - n + 1))
elif where == "prefix":
for s in names:
if len(s) >= n:
ngrams.append(s[:n])
elif where == "suffix":
for s in names:
if len(s) >= n:
ngrams.append(s[-n:])
else:
raise ValueError("where must be one of: 'any', 'prefix', 'suffix'")
counter = Counter(ngrams)
out = (
pd.DataFrame(counter.items(), columns=[f"{n}-gram", "count"])
.sort_values("count", ascending=False, kind="mergesort")
.reset_index(drop=True)
)
total = out["count"].sum()
out["freq"] = out["count"] / (total if total > 0 else 1)
return out
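# --- Hedged sketch (illustration only): suffix bigram counts on a toy frame.
if __name__ == "__main__":
    toy = pd.DataFrame({"name": ["Mukendi", "Kabamba", "Mbuyi"]})
    print(build_ngrams_count(toy, n=2, where="suffix"))
    # 'di', 'ba' and 'yi' appear once each; the freq column sums to 1.0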
+163
View File
@@ -0,0 +1,163 @@
import logging
from abc import abstractmethod
from typing import Any, Dict, List, Optional
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import LabelEncoder
from ners.research.base_model import BaseModel
from ners.research.experiment.feature_extractor import FeatureExtractor
class TraditionalModel(BaseModel):
"""Base class for traditional ML models (scikit-learn compatible)"""
@property
def architecture(self) -> str:
return "traditional"
@abstractmethod
def build_model(self) -> BaseEstimator:
"""Build and return the sklearn model instance"""
pass
def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
"""Fit the traditional ML model"""
logging.info(f"Training {self.__class__.__name__}")
# Build model if not already built
if self.model is None:
self.model = self.build_model()
# Setup feature extraction
if self.feature_extractor is None:
self.feature_extractor = FeatureExtractor(
self.config.features, self.config.feature_params
)
# Extract and prepare features
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
# Encode labels
if self.label_encoder is None:
self.label_encoder = LabelEncoder()
y_encoded = self.label_encoder.fit_transform(y)
else:
y_encoded = self.label_encoder.transform(y)
# Train model
if len(X_prepared.shape) == 1:
# For text-based features (like LogisticRegression with vectorization)
logging.info(
f"Fitting model with {X_prepared.shape[0]} samples (text features)"
)
else:
# For numerical features
logging.info(
f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
)
        logging.info(f"Sample prepared row: {X_prepared[0]}")
        logging.info(f"Model parameters: {self.config.model_params}")
        # sklearn estimators return self from fit() and expose no Keras-style
        # history object, so there is no per-epoch curve to record here.
        self.model.fit(X_prepared, y_encoded)
        self.is_fitted = True
        self.training_history = {}
return self
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> Dict[str, float]:
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
y_encoded = self.label_encoder.transform(y)
cv = StratifiedKFold(
n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
)
# Calculate different metrics
results = {}
# Accuracy
accuracy_scores = cross_val_score(
self.model, X_prepared, y_encoded, cv=cv, scoring="accuracy"
)
results["accuracy"] = accuracy_scores.mean()
results["accuracy_std"] = accuracy_scores.std()
# Precision, Recall, F1
for metric in ["precision", "recall", "f1"]:
if metric in self.config.metrics:
scores = cross_val_score(
self.model,
X_prepared,
y_encoded,
cv=cv,
scoring=f"{metric}_weighted",
)
results[metric] = scores.mean()
results[f"{metric}_std"] = scores.std()
return results
def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}")
if train_sizes is None:
train_sizes = [0.1, 0.25, 0.5, 0.75, 1.0]
# Prepare features
if self.feature_extractor is None:
self.feature_extractor = FeatureExtractor(
self.config.features, self.config.feature_params
)
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
# Encode labels
if self.label_encoder is None:
self.label_encoder = LabelEncoder()
y_encoded = self.label_encoder.fit_transform(y)
else:
y_encoded = self.label_encoder.transform(y)
try:
train_sizes_abs, train_scores, val_scores = learning_curve(
self.build_model(),
X_prepared,
y_encoded,
train_sizes=train_sizes,
cv=3, # Use 3-fold CV for speed
scoring="accuracy",
random_state=self.config.random_seed,
)
learning_curve_data = {
"train_sizes": train_sizes_abs.tolist(),
"train_scores": train_scores.mean(axis=1).tolist(),
"val_scores": val_scores.mean(axis=1).tolist(),
"train_scores_std": train_scores.std(axis=1).tolist(),
"val_scores_std": val_scores.std(axis=1).tolist(),
}
except Exception as e:
logging.warning(f"Could not generate learning curve: {e}")
return {}
self.learning_curve_data = learning_curve_data
return learning_curve_data
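# --- Hedged sketch (illustration only): the cross-validation pattern used in
# cross_validate above, run on synthetic data so it needs no project config.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    X_toy, y_toy = make_classification(n_samples=200, random_state=0)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = cross_val_score(
        LogisticRegression(max_iter=500), X_toy, y_toy, cv=cv, scoring="accuracy"
    )
    print(scores.mean(), scores.std())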
+46
View File
@@ -0,0 +1,46 @@
#!.venv/bin/python3
import logging
import traceback
from ners.core.config import setup_config
from ners.research.experiment.experiment_builder import ExperimentBuilder
from ners.research.model_trainer import ModelTrainer
def train_from_template(
name: str,
type: str,
*,
templates: str = "research_templates.yaml",
config: str | None = None,
env: str = "development",
) -> int:
try:
cfg = setup_config(config_path=config, env=env)
experiment_builder = ExperimentBuilder(cfg)
logging.info(f"Loading research templates from: {templates}")
tmpl = experiment_builder.load_templates(templates)
logging.info(f"Looking for experiment: name='{name}', type='{type}'")
experiment_config = experiment_builder.find_template(tmpl, name, type)
logging.info(f"Found experiment: {experiment_config.get('name')}")
logging.info(f"Description: {experiment_config.get('description')}")
logging.info(f"Features: {experiment_config.get('features')}")
trainer = ModelTrainer(cfg)
trainer.train_single_model(
model_name=experiment_config.get("name"),
model_type=experiment_config.get("model_type"),
features=experiment_config.get("features"),
model_params=experiment_config.get("model_params", {}),
tags=experiment_config.get("tags", []),
)
logging.info("Training completed successfully!")
return 0
except Exception as e:
logging.error(f"Training failed: {e}")
traceback.print_exc()
return 1
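# --- Hedged sketch (illustration only): direct invocation of the helper
# above; the template and experiment names here are hypothetical.
if __name__ == "__main__":
    raise SystemExit(train_from_template(name="baseline_logistic", type="baseline"))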
+1
View File
@@ -0,0 +1 @@
+67
View File
@@ -0,0 +1,67 @@
#!.venv/bin/python3
import os
import streamlit as st
from ners.core.config import setup_config, PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
# Page configuration
st.set_page_config(
page_title="DRC NERS Platform",
page_icon="🇨🇩",
layout="wide",
initial_sidebar_state="expanded",
)
def initialize_session_state(config: PipelineConfig):
"""Initialize session state variables"""
if "config" not in st.session_state:
st.session_state.config = config
if "data_loader" not in st.session_state:
st.session_state.data_loader = DataLoader(config)
if "experiment_tracker" not in st.session_state:
st.session_state.experiment_tracker = ExperimentTracker(config)
if "experiment_runner" not in st.session_state:
st.session_state.experiment_runner = ExperimentRunner(config)
if "pipeline_monitor" not in st.session_state:
st.session_state.pipeline_monitor = PipelineMonitor()
if "current_experiment" not in st.session_state:
st.session_state.current_experiment = None
if "experiment_results" not in st.session_state:
st.session_state.experiment_results = {}
class StreamlitApp:
def __init__(self, config: PipelineConfig):
self.config = config
initialize_session_state(config)
@classmethod
def run(cls):
st.title("🇨🇩 DRC NERS Platform")
st.markdown(
"A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
)
st.markdown(
"""
## Overview
Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often
underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training
data.
This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5
million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
"""
)
# Initialize app using environment variables when launched via Typer
_config_path = os.environ.get("NERS_CONFIG")
_env = os.environ.get("NERS_ENV", "development")
_cfg = setup_config(_config_path, env=_env)
_app = StreamlitApp(_cfg)
_app.run()
+1
View File
@@ -0,0 +1 @@
from .ner_testing import NERTesting
+10
View File
@@ -0,0 +1,10 @@
import streamlit as st
class Configuration:
def __init__(self, config):
self.config = config
def index(self):
st.title("Configuration")
st.json(self.config.model_dump())
+90
View File
@@ -0,0 +1,90 @@
import pandas as pd
import streamlit as st
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class Dashboard:
def __init__(self, config, experiment_tracker, experiment_runner):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
def index(self):
st.title("Dashboard")
col1, col2, col3, col4, col5 = st.columns(5)
# Load basic statistics
try:
data_path = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
if data_path.exists():
df = load_dataset(str(data_path))
with col1:
st.metric("Total Names", f"{len(df):,}")
with col2:
                        # Guard against a missing "annotated" column: df.get
                        # would return a plain int and break the comparison.
                        annotated = (
                            int((df["annotated"] == 1).sum())
                            if "annotated" in df.columns
                            else 0
                        )
st.metric("Annotated Names", f"{annotated:,}")
with col3:
provinces = (
df["province"].nunique() if "province" in df.columns else 0
)
st.metric("Provinces", provinces)
with col4:
if "sex" in df.columns:
gender_dist = df["sex"].value_counts()
ratio = gender_dist.get("f", 0) / max(
gender_dist.get("m", 1), 1
)
st.metric("F/M Rate", f"{ratio:.2%}")
with col5:
if "annotated" in df.columns:
annotated = (df.get("annotated", 0) == 1).sum()
ratio = annotated / len(df) if len(df) > 0 else 0
st.metric("Annotation Rate", f"{ratio:.2%}")
else:
st.warning("No processed data found. Please run data processing first.")
except Exception as e:
st.error(f"Error loading dashboard data: {e}")
# Recent experiments
st.subheader("Recent Experiments")
experiments = self.experiment_tracker.list_experiments()[:5]
if experiments:
exp_data = []
for exp in experiments:
exp_data.append(
{
"Name": exp.config.name,
"Model": exp.config.model_type,
"Status": exp.status.value,
"Accuracy": (
f"{exp.test_metrics.get('accuracy', 0):.3f}"
if exp.test_metrics
else "N/A"
),
"Date": exp.start_time.strftime("%Y-%m-%d %H:%M"),
}
)
st.dataframe(pd.DataFrame(exp_data), use_container_width=True)
else:
st.info(
"No experiments found. Create your first experiment in the Experiments tab!"
)
+52
View File
@@ -0,0 +1,52 @@
from datetime import datetime
import pandas as pd
import streamlit as st
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class DataOverview:
def __init__(self, config):
self.config = config
def index(self):
st.title("Data Overview")
data_files = {
"Names": self.config.data.input_file,
"Featured Dataset": self.config.data.output_files["featured"],
"Evaluation Dataset": self.config.data.output_files["evaluation"],
"Male Names": self.config.data.output_files["males"],
"Female Names": self.config.data.output_files["females"],
}
st.write("Available Data Files:")
for name, rel_path in data_files.items():
file_path = self.config.paths.get_data_path(rel_path)
exists = file_path.exists()
size = file_path.stat().st_size if exists else 0
stats = (
f"Size: {size / (1024 * 1024):.1f} MB, Last Modified: {datetime.fromtimestamp(file_path.stat().st_mtime)}"
if exists
else "Not found"
)
st.write(f"- {name}: {file_path} ({stats})")
# Preview featured dataset if available
data_path = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
if data_path.exists():
df = load_dataset(str(data_path))
st.subheader("Featured Dataset Preview")
st.dataframe(df.head(), use_container_width=True)
st.write(f"Rows: {len(df):,}")
+141
View File
@@ -0,0 +1,141 @@
import pandas as pd
import plotly.express as px
import streamlit as st
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
from ners.web.interfaces.log_reader import LogReader
@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class DataProcessing:
def __init__(self, config, pipeline_monitor):
self.config = config
self.pipeline_monitor = pipeline_monitor
def index(self):
st.title("Data Processing")
status = self.pipeline_monitor.get_pipeline_status()
# Overall progress
overall_progress = status["overall_completion"] / 100
st.progress(overall_progress)
st.write(f"Overall Progress: {status['overall_completion']:.1f}%")
# Step details
for step_name, step_status in status["steps"].items():
with st.expander(
f"{step_name.replace('_', ' ').title()} - {step_status['status']}"
):
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Processed Batches", step_status["processed_batches"])
with col2:
st.metric("Total Batches", step_status["total_batches"])
with col3:
st.metric("Failed Batches", step_status["failed_batches"])
if step_status["completion_percentage"] > 0:
st.progress(step_status["completion_percentage"] / 100)
# Read actual log entries from the log file
st.subheader("Recent Processing Logs")
try:
log_file_path = self.config.paths.logs_dir / "pipeline.development.log"
log_reader = LogReader(log_file_path)
# Options for filtering logs
col1, col2 = st.columns(2)
with col1:
log_level_filter = st.selectbox(
"Filter by Level",
["All", "INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"],
key="log_level_filter",
)
with col2:
num_entries = st.number_input(
"Number of entries",
min_value=5,
max_value=50,
value=10,
key="num_log_entries",
)
# Get log entries based on filter
if log_level_filter == "All":
log_entries = log_reader.read_last_entries(num_entries)
else:
log_entries = log_reader.read_entries_by_level(
log_level_filter, num_entries
)
if log_entries:
for entry in log_entries:
if entry.level == "ERROR":
st.error(
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
)
elif entry.level == "WARNING":
st.warning(
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
)
elif entry.level == "INFO":
st.info(
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
)
else:
st.text(
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
)
# Show log statistics
st.subheader("Log Statistics")
log_stats = log_reader.get_log_stats()
if log_stats:
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Lines", log_stats.get("total_lines", 0))
with col2:
st.metric("INFO", log_stats.get("INFO", 0))
with col3:
st.metric("WARNING", log_stats.get("WARNING", 0))
with col4:
st.metric("ERROR", log_stats.get("ERROR", 0))
# Log level distribution chart
levels = ["INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"]
counts = [log_stats.get(level, 0) for level in levels]
if sum(counts) > 0:
fig = px.bar(
x=levels,
y=counts,
title="Log Entries by Level",
color=levels,
color_discrete_map={
"INFO": "blue",
"WARNING": "orange",
"ERROR": "red",
"DEBUG": "gray",
"CRITICAL": "darkred",
},
)
st.plotly_chart(fig, use_container_width=True)
else:
st.info("No log entries found or log file is empty.")
except Exception as e:
st.error(f"Error reading log file: {e}")
+434
View File
@@ -0,0 +1,434 @@
from typing import List, Dict
import streamlit as st
from ners.core.config.pipeline_config import PipelineConfig
from ners.research.experiment import ExperimentConfig, ExperimentStatus
from ners.research.experiment.experiment_builder import ExperimentBuilder
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.experiment.feature_extractor import FeatureType
from ners.research.model_registry import list_available_models
class Experiments:
def __init__(
self,
config: PipelineConfig,
experiment_tracker: ExperimentTracker,
experiment_runner: ExperimentRunner,
):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
self.experiment_builder = ExperimentBuilder(config)
def index(self):
st.title("Experiments")
tab1, tab2, tab3 = st.tabs(["Templates", "Experiments", "Batch Experiments"])
with tab1:
self.show_template_experiments()
with tab2:
self.show_experiment_list()
with tab3:
self.show_batch_experiments()
def show_template_experiments(self):
"""Show interface for running predefined template experiments"""
st.subheader("Template Experiments")
st.write("Run predefined experiments based on research templates.")
try:
available_experiments = self.experiment_builder.get_templates()
# Create tabs for different experiment types
exp_tabs = st.tabs(
["Baseline", "Advanced", "Feature Studies", "Hyperparameter Tuning"]
)
with exp_tabs[0]:
self._show_experiments_by_type(
available_experiments["baseline"], "baseline"
)
with exp_tabs[1]:
self._show_experiments_by_type(
available_experiments["advanced"], "advanced"
)
with exp_tabs[2]:
self._show_experiments_by_type(
available_experiments["feature_study"], "feature_study"
)
with exp_tabs[3]:
self._show_experiments_by_type(
available_experiments["tuning"], "tuning"
)
except Exception as e:
st.error(f"Error loading experiment templates: {e}")
st.info(
"Make sure the research templates file exists at `config/research_templates.yaml`"
)
def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str):
"""Show experiments for a specific type"""
if not experiments:
st.info(f"No {experiment_type} experiments available in templates.")
return
st.write(f"**{experiment_type.title()} Experiments**")
# Show available experiments
for i, exp_template in enumerate(experiments):
exp_name = exp_template.get("name", f"Experiment {i + 1}")
exp_description = exp_template.get(
"description", "No description available"
)
with st.expander(f"📊 {exp_name} - {exp_description}"):
col1, col2 = st.columns([2, 1])
with col1:
st.json(exp_template)
with col2:
if st.button("🚀 Run Experiment", key=f"run_{experiment_type}_{i}"):
self._run_template_experiment(exp_template)
def _run_template_experiment(self, exp_template: Dict):
"""Run a template experiment"""
try:
with st.spinner(f"Running {exp_template.get('name')}..."):
# Create experiment config from template
experiment_config = self.experiment_builder.from_template(exp_template)
# Run the experiment
experiment_id = self.experiment_runner.run_experiment(experiment_config)
st.success(
f"Experiment '{experiment_config.name}' completed successfully!"
)
st.info(f"Experiment ID: `{experiment_id}`")
# Show results
experiment = self.experiment_tracker.get_experiment(experiment_id)
if experiment and experiment.test_metrics:
st.write("**Results:**")
col1, col2, col3 = st.columns(3)
metrics = list(experiment.test_metrics.items())
for i, (metric, value) in enumerate(metrics):
with [col1, col2, col3][i % 3]:
st.metric(metric.title(), f"{value:.4f}")
except Exception as e:
st.error(f"Error running experiment: {e}")
def show_experiment_list(self):
"""Show list of all experiments with filtering"""
st.subheader("All Experiments")
# Filters
col1, col2, col3 = st.columns(3)
with col1:
status_filter = st.selectbox(
"Filter by Status", ["All", "completed", "running", "failed", "pending"]
)
with col2:
model_filter = st.selectbox(
"Filter by Model", ["All"] + list_available_models()
)
with col3:
tag_filter = st.text_input("Filter by Tags (comma-separated)")
# Get and filter experiments
experiments = self._get_filtered_experiments(
status_filter, model_filter, tag_filter
)
if not experiments:
st.info("No experiments found matching the filters.")
return
# Display experiments
for i, exp in enumerate(experiments):
with st.expander(
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
):
self._display_experiment_details(exp, i)
def _get_filtered_experiments(
self, status_filter: str, model_filter: str, tag_filter: str
):
"""Get experiments with applied filters"""
experiments = self.experiment_tracker.list_experiments()
# Apply filters
if status_filter != "All":
experiments = [
e for e in experiments if e.status == ExperimentStatus(status_filter)
]
if model_filter != "All":
experiments = [
e for e in experiments if e.config.model_type == model_filter
]
if tag_filter:
tags = [tag.strip() for tag in tag_filter.split(",")]
experiments = [
e for e in experiments if any(tag in e.config.tags for tag in tags)
]
return experiments
@classmethod
def _display_experiment_details(cls, exp, index: int):
"""Display details for a single experiment"""
col1, col2, col3 = st.columns(3)
with col1:
st.write(f"**Model:** {exp.config.model_type}")
st.write(
f"**Features:** {', '.join([f.value for f in exp.config.features])}"
)
st.write(f"**Tags:** {', '.join(exp.config.tags)}")
with col2:
if exp.test_metrics:
for metric, value in exp.test_metrics.items():
st.metric(metric.title(), f"{value:.4f}")
with col3:
st.write(f"**Train Size:** {exp.train_size:,}")
st.write(f"**Test Size:** {exp.test_size:,}")
if st.button("View Details", key=f"details_{index}"):
st.session_state.selected_experiment = exp.experiment_id
st.rerun()
if exp.config.description:
st.write(f"**Description:** {exp.config.description}")
def show_batch_experiments(self):
"""Show interface for running batch experiments"""
st.subheader("Batch Experiments")
st.write("Run multiple experiments with different parameter combinations.")
# Add option to run template batch experiments
batch_type = st.radio(
"Batch Type", ["Template Batch", "Custom Parameter Sweep"]
)
if batch_type == "Template Batch":
self._show_template_batch_experiments()
else:
self._show_custom_batch_experiments()
def _show_template_batch_experiments(self):
"""Show interface for running batch experiments from templates"""
st.write("**Run Multiple Template Experiments**")
try:
available_experiments = self.experiment_builder.get_templates()
# Select experiment types to run
experiment_types = st.multiselect(
"Select Experiment Types",
["baseline", "advanced", "feature_study", "tuning"],
default=["baseline"],
)
if experiment_types:
selected_experiments = []
for exp_type in experiment_types:
experiments = available_experiments.get(exp_type, [])
if experiments:
st.write(f"**{exp_type.title()} Experiments:**")
exp_names = [
exp.get("name", f"Exp {i}")
for i, exp in enumerate(experiments)
]
selected_names = st.multiselect(
f"Select {exp_type} experiments",
exp_names,
key=f"select_{exp_type}",
)
for name in selected_names:
for exp in experiments:
if exp.get("name") == name:
selected_experiments.append(exp)
if st.button("🚀 Run Selected Template Experiments"):
self._run_template_batch_experiments(selected_experiments)
except Exception as e:
st.error(f"Error loading templates for batch experiments: {e}")
def _run_template_batch_experiments(self, selected_experiments: List[Dict]):
"""Run batch experiments from templates"""
if not selected_experiments:
st.warning("No experiments selected")
return
with st.spinner(f"Running {len(selected_experiments)} template experiments..."):
try:
experiment_configs = []
for exp_template in selected_experiments:
config = self.experiment_builder.from_template(exp_template)
experiment_configs.append(config)
# Run batch experiments
experiment_ids = self.experiment_runner.run_experiment_batch(
experiment_configs
)
st.success(f"Completed {len(experiment_ids)} template experiments!")
# Show summary
if experiment_ids:
comparison = self.experiment_runner.compare_experiments(
experiment_ids
)
st.write("**Template Batch Results:**")
st.dataframe(
comparison[["name", "model_type", "test_accuracy"]],
use_container_width=True,
)
except Exception as e:
st.error(f"Error running template batch experiments: {e}")
def _show_custom_batch_experiments(self):
"""Show interface for custom parameter sweep experiments"""
# Parameter sweep configuration
with st.form("batch_experiments"):
st.write("**Parameter Sweep Configuration**")
col1, col2 = st.columns(2)
with col1:
base_name = st.text_input("Base Experiment Name", "parameter_sweep")
model_types = st.multiselect(
"Model Types",
list_available_models(),
default=["logistic_regression"],
)
# N-gram ranges for logistic regression
st.write("**Logistic Regression Parameters**")
ngram_ranges = st.text_area(
"N-gram Ranges (one per line, format: min,max)", "2,4\n2,5\n3,6"
)
with col2:
feature_combinations = st.multiselect(
"Feature Combinations",
[f.value for f in FeatureType],
default=["full_name", "native_name", "surname"],
)
test_sizes = st.text_input(
"Test Sizes (comma-separated)", "0.15,0.2,0.25"
)
tags = st.text_input("Common Tags", "parameter_sweep,batch")
if st.form_submit_button("🚀 Run Parameter Sweep"):
self.run_batch_experiments(
base_name,
model_types,
ngram_ranges,
feature_combinations,
test_sizes,
tags,
)
def run_batch_experiments(
self,
base_name: str,
model_types: List[str],
ngram_ranges: str,
feature_combinations: List[str],
test_sizes: str,
tags: str,
):
"""Run batch experiments with parameter combinations"""
with st.spinner("Running batch experiments..."):
try:
experiments = []
# Parse parameters
ngram_list = []
for line in ngram_ranges.strip().split("\n"):
if "," in line:
min_val, max_val = map(int, line.split(","))
ngram_list.append([min_val, max_val])
test_size_list = [float(x.strip()) for x in test_sizes.split(",")]
tag_list = [tag.strip() for tag in tags.split(",") if tag.strip()]
# Generate experiment combinations
exp_count = 0
for model_type in model_types:
for feature_combo in feature_combinations:
for test_size in test_size_list:
if model_type == "logistic_regression":
for ngram_range in ngram_list:
exp_name = f"{base_name}_{model_type}_{feature_combo}_{ngram_range[0]}_{ngram_range[1]}_{test_size}"
config = ExperimentConfig(
name=exp_name,
description=f"Batch experiment: {model_type} with {feature_combo}",
model_type=model_type,
features=[FeatureType(feature_combo)],
model_params={"ngram_range": ngram_range},
test_size=test_size,
tags=tag_list,
)
experiments.append(config)
exp_count += 1
else:
exp_name = f"{base_name}_{model_type}_{feature_combo}_{test_size}"
config = ExperimentConfig(
name=exp_name,
description=f"Batch experiment: {model_type} with {feature_combo}",
model_type=model_type,
features=[FeatureType(feature_combo)],
test_size=test_size,
tags=tag_list,
)
experiments.append(config)
exp_count += 1
# Run experiments
experiment_ids = self.experiment_runner.run_experiment_batch(
experiments
)
st.success(f"Completed {len(experiment_ids)} batch experiments")
# Show summary
if experiment_ids:
comparison = self.experiment_runner.compare_experiments(
experiment_ids
)
st.write("**Batch Results Summary:**")
st.dataframe(
comparison[["name", "model_type", "test_accuracy"]],
use_container_width=True,
)
except Exception as e:
st.error(f"Error running batch experiments: {e}")
+80
View File
@@ -0,0 +1,80 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List
@dataclass
class LogEntry:
timestamp: datetime
level: str
message: str
class LogReader:
def __init__(self, log_file_path: Path):
self.log_file_path = Path(log_file_path)
def read_last_entries(self, num_entries: int = 20) -> List[LogEntry]:
entries = []
if not self.log_file_path.exists():
return entries
with open(self.log_file_path, "r") as f:
lines = f.readlines()[-num_entries:]
for line in lines:
entry = self._parse_log_line(line)
if entry:
entries.append(entry)
return entries
def read_entries_by_level(
self, level: str, num_entries: int = 20
) -> List[LogEntry]:
entries = []
if not self.log_file_path.exists():
return entries
with open(self.log_file_path, "r") as f:
for line in reversed(f.readlines()):
entry = self._parse_log_line(line)
if entry and entry.level == level:
entries.append(entry)
if len(entries) >= num_entries:
break
return list(reversed(entries))
def get_log_stats(self) -> dict:
if not self.log_file_path.exists():
return {}
stats = {"total_lines": 0}
with open(self.log_file_path, "r") as f:
for line in f:
stats["total_lines"] += 1
entry = self._parse_log_line(line)
if entry:
stats[entry.level] = stats.get(entry.level, 0) + 1
return stats
@staticmethod
def _parse_log_line(line: str) -> LogEntry | None:
try:
# Expected format from logging config: [timestamp] - LEVEL - message
parts = line.strip().split(" - ")
if len(parts) >= 3:
timestamp_str = parts[0].strip("[]")
timestamp = datetime.fromisoformat(timestamp_str)
level = parts[1].strip()
message = " - ".join(parts[2:])
return LogEntry(timestamp, level, message)
except Exception:
return None
return None
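# --- Hedged sketch (illustration only): the line format _parse_log_line
# expects, parsed standalone; the timestamp and message are hypothetical.
if __name__ == "__main__":
    line = "[2025-10-05T18:14:15] - INFO - pipeline step completed"
    print(LogReader._parse_log_line(line))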
+170
View File
@@ -0,0 +1,170 @@
import streamlit as st
from spacy import displacy
from ners.core.config import PipelineConfig
from ners.processing.ner.name_model import NameModel
class NERTesting:
def __init__(self, config: PipelineConfig):
self.config = config
self.model_path = config.paths.models_dir / "drc_ner_model"
self.ner_model = None
self.training_stats = None
self.evaluation_stats = None
def load_ner_model(self) -> bool:
"""Load the trained NER model"""
try:
if self.ner_model is None:
self.ner_model = NameModel(self.config)
self.ner_model.load(str(self.model_path))
self.training_stats = self.ner_model.training_stats
self.evaluation_stats = {}
return True
except Exception as e:
st.error(f"Error loading NER model: {e}")
return False
def index(self):
st.title("Named Entity Recognition")
# Load model
if not self.load_ner_model():
st.warning(
"NER model could not be loaded. Please ensure the model is trained and available."
)
return
# Display model information
self.show_model_training_info()
self.show_model_evaluation_info()
st.markdown("---")
st.subheader("Test the NER Model")
input_method = st.radio("Input Method", ["Single Name", "Multiple Names"])
if input_method == "Single Name":
self.test_single_name()
elif input_method == "Multiple Names":
self.test_multiple_names()
def show_model_training_info(self):
if self.training_stats:
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric(
"Training Examples",
f"{self.training_stats.get('training_examples', 0):,}",
)
with col2:
st.metric("Epochs", self.training_stats.get("epochs", 0))
with col3:
st.metric(
"Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}"
)
with col4:
st.metric("Batch Size", f"{self.training_stats.get('batch_size', 0):,}")
def show_model_evaluation_info(self):
if self.evaluation_stats:
            col1, col2, col3 = st.columns(3)
overall = self.evaluation_stats.get("overall", {})
with col1:
st.metric("Overall Precision", f"{overall['precision']:.2f}")
with col2:
st.metric("Overall Recall", f"{overall['recall']:.2f}")
with col3:
st.metric("Overall F1 Score", f"{overall['f1_score']:.2f}")
st.json(self.evaluation_stats.get("by_label", {}))
def test_single_name(self):
name_input = st.text_input(
"Name:",
placeholder="e.g., Jean Baptiste Mukendi, Marie Kabamba Tshiala, Joseph Kasongo",
help="Enter a full name or multiple names separated by spaces",
)
if name_input.strip():
if st.button("Analyze Name", type="primary"):
self.analyze_and_display(name_input)
def test_multiple_names(self):
names_input = st.text_area(
"Names:",
placeholder="Jean Baptiste Mukendi\nMarie Kabamba Tshiala\nJoseph Kasongo\nGrace Mbuyi Kalala",
height=150,
help="Enter each name on a new line",
)
if names_input.strip():
if st.button("Analyze All Names", type="primary"):
names = [
name.strip() for name in names_input.split("\n") if name.strip()
]
for i, name in enumerate(names):
st.markdown(f"**Name {i + 1}: {name}**")
self.analyze_and_display(name)
if i < len(names) - 1:
st.markdown("---")
def analyze_and_display(self, text: str):
try:
result = self.ner_model.predict(text)
st.subheader("Analysis Results")
entities = result.get("entities", [])
if entities:
self.show_visual_entities(text, entities)
native_count = sum(1 for e in entities if e["label"] == "NATIVE")
surname_count = sum(1 for e in entities if e["label"] == "SURNAME")
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Total Entities", len(entities))
with col2:
st.metric("Native Names", native_count)
with col3:
st.metric("Surnames", surname_count)
else:
st.warning("No entities detected in the input text.")
st.info(
"Try using traditional Congolese names or ensure the spelling is correct."
)
except Exception as e:
st.error(f"Error analyzing text: {e}")
    @staticmethod
    def show_visual_entities(text: str, entities: list):
try:
# Convert our entities format to spaCy format for displacy
ents = []
for entity in entities:
ents.append(
{
"start": entity["start"],
"end": entity["end"],
"label": entity["label"],
}
)
# Create doc-like structure for displacy
doc_data = {"text": text, "ents": ents, "title": None}
            # Custom colors for our labels
            colors = {
                "NATIVE": "#74C0FC",   # light blue
                "SURNAME": "#69DB7C",  # light green
            }
options = {"colors": colors, "distance": 90}
# Generate HTML visualization
html = displacy.render(doc_data, style="ent", manual=True, options=options)
st.markdown(html, unsafe_allow_html=True)
except Exception as e:
st.warning(f"Could not generate visual representation: {e}")
+215
View File
@@ -0,0 +1,215 @@
from typing import Optional
import numpy as np
import pandas as pd
import streamlit as st
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
class Predictions:
def __init__(
self,
config,
experiment_tracker: ExperimentTracker,
experiment_runner: ExperimentRunner,
):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
def index(self):
st.title("Predictions")
# Load available models
experiments = self.experiment_tracker.list_experiments()
completed_experiments = [
e for e in experiments if e.status.value == "completed" and e.model_path
]
if not completed_experiments:
st.warning(
"No trained models available. Please run some experiments first."
)
return
# Model selection
model_options = {
f"{exp.config.name} (Acc: {exp.test_metrics.get('accuracy', 0):.3f})": exp
for exp in completed_experiments
if exp.test_metrics
}
selected_model_name = st.selectbox("Select Model", list(model_options.keys()))
if not selected_model_name:
return
selected_experiment = model_options[selected_model_name]
# Prediction modes
prediction_mode = st.radio(
"Prediction Mode", ["Single Name", "Batch Upload", "Dataset Prediction"]
)
if prediction_mode == "Single Name":
self.show_single_prediction(selected_experiment)
elif prediction_mode == "Batch Upload":
self.show_batch_prediction(selected_experiment)
elif prediction_mode == "Dataset Prediction":
self.show_dataset_prediction(selected_experiment)
def show_single_prediction(self, experiment):
"""Show single name prediction interface"""
name_input = st.text_input(
"Enter a name:", placeholder="e.g., Jean Baptiste Mukendi"
)
if name_input and st.button("Predict Gender"):
try:
# Load the model
model = self.experiment_runner.load_experiment_model(
experiment.experiment_id
)
if model is None:
st.error("Failed to load model")
return
# Create a DataFrame with the input
input_df = self._prepare_single_input(name_input)
# Make prediction
prediction = model.predict(input_df)[0]
# Get prediction probability if available
confidence = self._get_prediction_confidence(model, input_df)
# Display results
self._display_single_prediction_results(
prediction, confidence, experiment, name_input
)
except Exception as e:
st.error(f"Error making prediction: {e}")
def _prepare_single_input(self, name_input: str) -> pd.DataFrame:
"""Prepare single name input for prediction"""
return pd.DataFrame(
{
"name": [name_input],
"words": [len(name_input.split())],
"length": [len(name_input.replace(" ", ""))],
"province": ["unknown"], # Default values
"identified_name": [None],
"identified_surname": [None],
"probable_native": [None],
"probable_surname": [None],
}
)
def _get_prediction_confidence(
self, model, input_df: pd.DataFrame
) -> Optional[float]:
"""Get prediction confidence if available"""
try:
probabilities = model.predict_proba(input_df)[0]
return max(probabilities)
except:
return None
def _display_single_prediction_results(
self, prediction: str, confidence: Optional[float], experiment, name_input: str
):
"""Display single prediction results"""
col1, col2 = st.columns(2)
with col1:
gender_label = "Female" if prediction == "f" else "Male"
st.success(f"**Predicted Gender:** {gender_label}")
        with col2:
            if confidence is not None:
                st.metric("Confidence", f"{confidence:.2%}")
# Additional info
st.info(f"Model used: {experiment.config.name}")
st.info(
f"Features used: {', '.join([f.value for f in experiment.config.features])}"
)
def show_batch_prediction(self, experiment):
uploaded_file = st.file_uploader("Upload CSV file with names", type="csv")
if uploaded_file is not None:
try:
df = pd.read_csv(uploaded_file, dtype=OPTIMIZED_DTYPES)
st.write("**Uploaded Data Preview:**")
st.dataframe(df.head(), use_container_width=True)
# Column selection
df = self._prepare_batch_data(df)
if st.button("Run Batch Prediction"):
self._run_batch_prediction(df, experiment)
except Exception as e:
st.error(f"Error processing file: {e}")
def _prepare_batch_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""Prepare batch data for prediction"""
# Column selection
if "name" not in df.columns:
name_column = st.selectbox("Select the name column:", df.columns)
df = df.rename(columns={name_column: "name"})
# Add missing columns with defaults
required_columns = [
"words",
"length",
"province",
"identified_name",
"identified_surname",
"probable_native",
"probable_surname",
]
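        # words/length can be derived from the name itself; the rest default
        # to None, mirroring _prepare_single_input above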
for col in required_columns:
if col not in df.columns:
if col == "words":
df[col] = df["name"].str.split().str.len()
elif col == "length":
df[col] = df["name"].str.replace(" ", "").str.len()
else:
df[col] = None
return df
def _run_batch_prediction(self, df: pd.DataFrame, experiment):
"""Run batch prediction and display results"""
with st.spinner("Making predictions..."):
# Load model
model = self.experiment_runner.load_experiment_model(
experiment.experiment_id
)
if model is None:
st.error("Failed to load model")
return
# Make predictions
predictions = model.predict(df)
df["predicted_gender"] = predictions
df["gender_label"] = df["predicted_gender"].map(
{"f": "Female", "m": "Male"}
)
            # Try to get probabilities
            try:
                probabilities = model.predict_proba(df)
                df["confidence"] = np.max(probabilities, axis=1)
            except Exception:
                # not every estimator exposes predict_proba
                df["confidence"] = None
st.success("Predictions completed!")
+283
View File
@@ -0,0 +1,283 @@
from typing import List
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
class ResultsAnalysis:
def __init__(
self,
config,
experiment_tracker: ExperimentTracker,
experiment_runner: ExperimentRunner,
):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
def index(self):
st.title("Results & Analysis")
tab1, tab2, tab3 = st.tabs(
["Experiment Comparison", "Performance Analysis", "Model Analysis"]
)
with tab1:
self.show_experiment_comparison()
with tab2:
self.show_performance_analysis()
with tab3:
self.show_model_analysis()
def show_experiment_comparison(self):
"""Show experiment comparison interface"""
st.subheader("Compare Experiments")
experiments = self.experiment_tracker.list_experiments()
completed_experiments = [
e for e in experiments if e.status.value == "completed"
]
if not completed_experiments:
st.warning("No completed experiments found.")
return
# Experiment selection
exp_options = {
f"{exp.config.name} ({exp.experiment_id[:8]})": exp.experiment_id
for exp in completed_experiments
}
selected_exp_names = st.multiselect(
"Select Experiments to Compare",
list(exp_options.keys()),
default=list(exp_options.keys())[: min(5, len(exp_options))],
)
if not selected_exp_names:
st.info("Please select experiments to compare.")
return
selected_exp_ids = [exp_options[name] for name in selected_exp_names]
# Generate comparison
comparison_df = self.experiment_runner.compare_experiments(selected_exp_ids)
if comparison_df.empty:
st.error("No data available for comparison.")
return
self._display_comparison_table(comparison_df)
self._display_comparison_charts(comparison_df)
def _display_comparison_table(self, comparison_df: pd.DataFrame):
"""Display comparison table"""
st.write("**Experiment Comparison Table**")
# Select columns to display
metric_columns = [
col
for col in comparison_df.columns
if col.startswith("test_") or col.startswith("cv_")
]
display_columns = ["name", "model_type", "features"] + metric_columns
available_columns = [
col for col in display_columns if col in comparison_df.columns
]
st.dataframe(comparison_df[available_columns], use_container_width=True)
def _display_comparison_charts(self, comparison_df: pd.DataFrame):
"""Display comparison charts"""
st.write("**Performance Comparison**")
if "test_accuracy" in comparison_df.columns:
fig = px.bar(
comparison_df,
x="name",
y="test_accuracy",
color="model_type",
title="Test Accuracy Comparison",
)
fig.update_layout(xaxis_tickangle=-45)
st.plotly_chart(fig, use_container_width=True)
# Metric comparison across multiple metrics
metric_columns = [
col
for col in comparison_df.columns
if col.startswith("test_") or col.startswith("cv_")
]
if len(metric_columns) > 1:
metric_to_plot = st.selectbox(
"Select Metric for Detailed Comparison", metric_columns
)
if metric_to_plot in comparison_df.columns:
fig = px.bar(
comparison_df,
x="name",
y=metric_to_plot,
color="model_type",
title=f"{metric_to_plot.replace('_', ' ').title()} Comparison",
)
fig.update_layout(xaxis_tickangle=-45)
st.plotly_chart(fig, use_container_width=True)
def show_performance_analysis(self):
"""Show performance analysis across experiments"""
st.subheader("Performance Analysis")
experiments = self.experiment_tracker.list_experiments()
completed_experiments = [
e for e in experiments if e.status.value == "completed" and e.test_metrics
]
if not completed_experiments:
st.warning("No completed experiments with metrics found.")
return
# Prepare data for analysis
analysis_data = self._prepare_analysis_data(completed_experiments)
analysis_df = pd.DataFrame(analysis_data)
self._display_performance_trends(analysis_df)
self._display_model_comparison(analysis_df)
self._display_top_experiments(analysis_df)
def _prepare_analysis_data(self, completed_experiments: List) -> List[dict]:
"""Prepare data for performance analysis"""
analysis_data = []
for exp in completed_experiments:
row = {
"experiment_id": exp.experiment_id,
"name": exp.config.name,
"model_type": exp.config.model_type,
"feature_count": len(exp.config.features),
"features": ", ".join([f.value for f in exp.config.features]),
"train_size": exp.train_size,
"test_size": exp.test_size,
**exp.test_metrics,
}
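            # **exp.test_metrics flattens metric keys (accuracy, f1, ...) into
            # the row, which is what the downstream charts select on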
analysis_data.append(row)
return analysis_data
def _display_performance_trends(self, analysis_df: pd.DataFrame):
"""Display performance trend charts"""
col1, col2 = st.columns(2)
with col1:
# Accuracy vs Training Size
if (
"accuracy" in analysis_df.columns
and "train_size" in analysis_df.columns
):
fig = px.scatter(
analysis_df,
x="train_size",
y="accuracy",
color="model_type",
hover_data=["name"],
title="Accuracy vs Training Size",
)
st.plotly_chart(fig, use_container_width=True)
with col2:
# Feature Count vs Performance
if (
"accuracy" in analysis_df.columns
and "feature_count" in analysis_df.columns
):
fig = px.scatter(
analysis_df,
x="feature_count",
y="accuracy",
color="model_type",
hover_data=["name"],
title="Accuracy vs Number of Features",
)
st.plotly_chart(fig, use_container_width=True)
def _display_model_comparison(self, analysis_df: pd.DataFrame):
"""Display model type comparison"""
if "accuracy" in analysis_df.columns:
model_performance = (
analysis_df.groupby("model_type")["accuracy"]
.agg(["mean", "std", "count"])
.reset_index()
)
            fig = go.Figure()
            fig.add_trace(
                go.Bar(
                    x=model_performance["model_type"],
                    y=model_performance["mean"],
                    error_y=dict(type="data", array=model_performance["std"].fillna(0)),
                    name="Accuracy",
                )
            )
            fig.update_layout(title="Mean Accuracy by Model Type")
            st.plotly_chart(fig, use_container_width=True)
def _display_top_experiments(self, analysis_df: pd.DataFrame):
"""Display top-performing experiments"""
if "accuracy" in analysis_df.columns:
top_n = st.slider("Select Top N Experiments", 3, 20, 5)
top_experiments = analysis_df.nlargest(top_n, "accuracy")
st.write("**Top Performing Experiments:**")
st.dataframe(
top_experiments[
[
"name",
"model_type",
"features",
"train_size",
"test_size",
"accuracy",
]
],
use_container_width=True,
)
def show_model_analysis(self):
"""Show detailed model analysis interface"""
st.subheader("Model Analysis")
experiments = self.experiment_tracker.list_experiments()
completed_experiments = [
e for e in experiments if e.status.value == "completed"
]
if not completed_experiments:
st.warning("No completed experiments found for analysis.")
return
# Model selection
exp_options = {
f"{exp.config.name} ({exp.experiment_id[:8]})": exp.experiment_id
for exp in completed_experiments
}
selected_exp_name = st.selectbox(
"Select Model for Analysis", list(exp_options.keys())
)
if not selected_exp_name:
return
exp_id = exp_options[selected_exp_name]
experiment = self.experiment_tracker.get_experiment(exp_id)
if not experiment or not experiment.test_metrics:
st.warning("Selected experiment has no evaluation metrics.")
return
# Display detailed metrics
st.write("**Detailed Metrics:**")
st.json(experiment.test_metrics)
+16
View File
@@ -0,0 +1,16 @@
import streamlit as st
from ners.web.interfaces.dashboard import Dashboard
st.set_page_config(page_title="Dashboard", page_icon="📊", layout="wide")
if "config" in st.session_state:
dashboard = Dashboard(
st.session_state.config,
st.session_state.experiment_tracker,
st.session_state.experiment_runner,
)
dashboard.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+12
View File
@@ -0,0 +1,12 @@
import streamlit as st
from ners.web.interfaces.data_overview import DataOverview
st.set_page_config(page_title="Data Overview", page_icon="📋", layout="wide")
if "config" in st.session_state:
data_overview = DataOverview(st.session_state.config)
data_overview.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+14
View File
@@ -0,0 +1,14 @@
import streamlit as st
from ners.web.interfaces.data_processing import DataProcessing
st.set_page_config(page_title="Data Processing", page_icon="⚙️", layout="wide")
if "config" in st.session_state:
data_processing = DataProcessing(
st.session_state.config, st.session_state.pipeline_monitor
)
data_processing.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+16
View File
@@ -0,0 +1,16 @@
import streamlit as st
from ners.web.interfaces.experiments import Experiments
st.set_page_config(page_title="Experiments", page_icon="🧪", layout="wide")
if "config" in st.session_state:
experiments = Experiments(
st.session_state.config,
st.session_state.experiment_tracker,
st.session_state.experiment_runner,
)
experiments.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+16
View File
@@ -0,0 +1,16 @@
import streamlit as st
from ners.web.interfaces.results_analysis import ResultsAnalysis
st.set_page_config(page_title="Results & Analysis", page_icon="📈", layout="wide")
if "config" in st.session_state:
results_analysis = ResultsAnalysis(
st.session_state.config,
st.session_state.experiment_tracker,
st.session_state.experiment_runner,
)
results_analysis.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+16
View File
@@ -0,0 +1,16 @@
import streamlit as st
from ners.web.interfaces.predictions import Predictions
st.set_page_config(page_title="Predictions", page_icon="🔮", layout="wide")
if "config" in st.session_state:
predictions = Predictions(
st.session_state.config,
st.session_state.experiment_tracker,
st.session_state.experiment_runner,
)
predictions.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+12
View File
@@ -0,0 +1,12 @@
import streamlit as st
from ners.web.interfaces.configuration import Configuration
st.set_page_config(page_title="Configuration", page_icon="⚙️", layout="wide")
if "config" in st.session_state:
configuration = Configuration(st.session_state.config)
configuration.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
+12
View File
@@ -0,0 +1,12 @@
import streamlit as st
from ners.web.interfaces.ner_testing import NERTesting
st.set_page_config(page_title="NER Testing", page_icon="🏷️", layout="wide")
if "config" in st.session_state:
ner_testing = NERTesting(st.session_state.config)
ner_testing.index()
else:
st.error("Please run the main app first to initialize the configuration.")
st.markdown("Go back to the [main page](/) to start the application.")
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long