refactoring: uv

This commit is contained in:
2025-10-05 18:14:15 +02:00
parent f3b06fbd07
commit 9dd4f759b3
120 changed files with 5525 additions and 3366 deletions
+16
View File
@@ -0,0 +1,16 @@
.git
.gitignore
.idea
.vscode
__pycache__
.ruff_cache
.venv
*.pyc
*.pyo
*.pyd
*.swp
*.swo
*.DS_Store
dist
build
*.egg-info
+1
View File
@@ -0,0 +1 @@
3.11
+49
View File
@@ -0,0 +1,49 @@
# syntax=docker/dockerfile:1
# Minimal Linux (glibc) base; Python itself is installed by uv.
FROM debian:bookworm-slim

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    UV_INSTALL_DIR=/usr/local/bin \
    UV_LINK_MODE=copy \
    UV_PYTHON_DOWNLOADS=1 \
    UV_PROJECT_ENVIRONMENT=/app/.venv \
    PATH=/app/.venv/bin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

WORKDIR /app

# System deps for building/using the common scientific stack.
# Keep minimal; rely on wheels where possible.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates curl git \
    build-essential pkg-config \
    libssl-dev libffi-dev \
    libopenblas0 libstdc++6 \
    libfreetype6 libpng16-16 libjpeg62-turbo \
    && rm -rf /var/lib/apt/lists/*

# Install uv (static binary).
RUN curl -LsSf https://astral.sh/uv/install.sh | sh

# Copy project metadata first for layer caching.
# BUG FIX: uv.lock was not copied, so the `-f uv.lock` test below could never
# succeed and the frozen/reproducible install path was dead code. The glob
# (uv.lock*) keeps the build working when no lockfile exists yet.
COPY pyproject.toml uv.lock* README.md ./

# Install a managed Python via uv and create the project venv.
RUN uv python install 3.11 \
    && uv venv /app/.venv --python 3.11

# Resolve and install runtime deps into the project venv.
# Use the lockfile when present for reproducibility.
RUN if [ -f uv.lock ]; then uv sync --no-dev --no-install-project --frozen; else uv sync --no-dev --no-install-project; fi

# Copy source code and optional templates.
COPY src ./src

# Re-sync to ensure the local package itself is installed.
RUN uv sync --no-dev \
    && rm -rf /root/.cache

# Default command shows help; override in compose or docker run.
CMD ["ners", "--help"]
+83 -137
View File
@@ -10,37 +10,23 @@ million names from the Democratic Republic of Congo (DRC) annotated with gender
### Installation & Setup ### Installation & Setup
Instructions and command line snippets below are provided to help you set up the project environment quickly and **Unix based**
efficiently.
assuming you have Python 3.11 and Git installed and working on a Unix-like system (Linux, macOS, etc.).
**Using Makefile (Recommended)**
```bash ```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
git clone https://github.com/bernard-ng/drc-ners-nlp.git git clone https://github.com/bernard-ng/drc-ners-nlp.git
cd drc-ners-nlp cd drc-ners-nlp
# Setup environment uv sync
make setup
make activate
``` ```
**Manual Setup** **macOS & Windows**
```bash ```bash
git clone https://github.com/bernard-ng/drc-ners-nlp.git docker compose build
cd drc-ners-nlp docker compose run --rm app
docker compose run --rm app ners pipeline run --env=production
# Setup environment docker compose run --rm app ners research train --name=lightgbm --type=baseline --env=production
python -m venv .venv docker compose run --rm --service-ports app ners web run --env=production
.venv/bin/pip install --upgrade pip
.venv/bin/pip install -r requirements.txt
pip install --upgrade pip
pip install -r requirements.txt
pip install jupyter notebook ipykernel pytest black flake8 mypy
source .venv/bin/activate
``` ```
## Data Processing ## Data Processing
@@ -55,6 +41,7 @@ the `drc-ners-nlp/config/pipeline.yaml` file.
```yaml ```yaml
stages: stages:
- "data_cleaning" - "data_cleaning"
- "data_selection"
- "feature_extraction" - "feature_extraction"
- "data_splitting" - "data_splitting"
``` ```
@@ -62,37 +49,7 @@ stages:
**Running the Pipeline** **Running the Pipeline**
```bash ```bash
python main.py --env production uv run ners pipeline run --env="production"
```
## NER Processing (Optional)
This project implements a custom named entity recognition (NER) pipeline tailored for Congolese names.
Its main objective is to accurately identify and tag the different components of a Congolese name,
specifically distinguishing between the native part and the surname.
```bash
python ner.py --env production
```
Once you've built and trained the NER model you can use it to annotate **COMPOSE** names in the original dataset
**Running the Pipeline with NER Annotation**
```yaml
stages:
- "data_cleaning"
- "feature_extraction"
- "ner_annotation"
- "data_splitting"
```
**Running the Pipeline with LLM Annotation**
```yaml
stages:
- "data_cleaning"
- "feature_extraction"
- "llm_annotation"
- "data_splitting"
``` ```
## Experiments ## Experiments
@@ -105,54 +62,94 @@ you can define model features, training parameters, and evaluation metrics in th
```bash ```bash
# bigru # bigru
python train.py --name="bigru" --type="baseline" --env="production" uv run ners research train --name="bigru" --type="baseline" --env="production"
python train.py --name="bigru_native" --type="baseline" --env="production" uv run ners research train --name="bigru_native" --type="baseline" --env="production"
python train.py --name="bigru_surname" --type="baseline" --env="production" uv run ners research train --name="bigru_surname" --type="baseline" --env="production"
# cnn # cnn
python train.py --name="cnn" --type="baseline" --env="production" uv run ners research train --name="cnn" --type="baseline" --env="production"
python train.py --name="cnn_native" --type="baseline" --env="production" uv run ners research train --name="cnn_native" --type="baseline" --env="production"
python train.py --name="cnn_surname" --type="baseline" --env="production" uv run ners research train --name="cnn_surname" --type="baseline" --env="production"
# lightgbm # lightgbm
python train.py --name="lightgbm" --type="baseline" --env="production" uv run ners research train --name="lightgbm" --type="baseline" --env="production"
python train.py --name="lightgbm_native" --type="baseline" --env="production" uv run ners research train --name="lightgbm_native" --type="baseline" --env="production"
python train.py --name="lightgbm_surname" --type="baseline" --env="production" uv run ners research train --name="lightgbm_surname" --type="baseline" --env="production"
# logistic regression # logistic regression
python train.py --name="logistic_regression" --type="baseline" --env="production" uv run ners research train --name="logistic_regression" --type="baseline" --env="production"
python train.py --name="logistic_regression_native" --type="baseline" --env="production" uv run ners research train --name="logistic_regression_native" --type="baseline" --env="production"
python train.py --name="logistic_regression_surname" --type="baseline" --env="production" uv run ners research train --name="logistic_regression_surname" --type="baseline" --env="production"
# lstm # lstm
python train.py --name="lstm" --type="baseline" --env="production" uv run ners research train --name="lstm" --type="baseline" --env="production"
python train.py --name="lstm_native" --type="baseline" --env="production" uv run ners research train --name="lstm_native" --type="baseline" --env="production"
python train.py --name="lstm_surname" --type="baseline" --env="production" uv run ners research train --name="lstm_surname" --type="baseline" --env="production"
# random forest # random forest
python train.py --name="random_forest" --type="baseline" --env="production" uv run ners research train --name="random_forest" --type="baseline" --env="production"
python train.py --name="random_forest_native" --type="baseline" --env="production" uv run ners research train --name="random_forest_native" --type="baseline" --env="production"
python train.py --name="random_forest_surname" --type="baseline" --env="production" uv run ners research train --name="random_forest_surname" --type="baseline" --env="production"
# svm # svm
python train.py --name="svm" --type="baseline" --env="production" uv run ners research train --name="svm" --type="baseline" --env="production"
python train.py --name="svm_native" --type="baseline" --env="production" uv run ners research train --name="svm_native" --type="baseline" --env="production"
python train.py --name="svm_surname" --type="baseline" --env="production" uv run ners research train --name="svm_surname" --type="baseline" --env="production"
# naive bayes # naive bayes
python train.py --name="naive_bayes" --type="baseline" --env="production" uv run ners research train --name="naive_bayes" --type="baseline" --env="production"
python train.py --name="naive_bayes_native" --type="baseline" --env="production" uv run ners research train --name="naive_bayes_native" --type="baseline" --env="production"
python train.py --name="naive_bayes_surname" --type="baseline" --env="production" uv run ners research train --name="naive_bayes_surname" --type="baseline" --env="production"
# transformer # transformer
python train.py --name="transformer" --type="baseline" --env="production" uv run ners research train --name="transformer" --type="baseline" --env="production"
python train.py --name="transformer_native" --type="baseline" --env="production" uv run ners research train --name="transformer_native" --type="baseline" --env="production"
python train.py --name="transformer_surname" --type="baseline" --env="production" uv run ners research train --name="transformer_surname" --type="baseline" --env="production"
# xgboost # xgboost
python train.py --name="xgboost" --type="baseline" --env="production" uv run ners research train --name="xgboost" --type="baseline" --env="production"
python train.py --name="xgboost_native" --type="baseline" --env="production" uv run ners research train --name="xgboost_native" --type="baseline" --env="production"
python train.py --name="xgboost_surname" --type="baseline" --env="production" uv run ners research train --name="xgboost_surname" --type="baseline" --env="production"
```
## TensorFlow on macOS (Intel) with uv
TensorFlow no longer publishes wheels for macOS Intel. To keep using uv and run TF reliably, use a Linux container with TF preinstalled and install project code with minimal extras inside the container.
### One-time build
```bash
docker compose -f docker/compose.tf.yml build
If you see a message like `tensorflow/tensorflow:<tag>: not found`, update `docker/Dockerfile.tf-cpu` to a tag that exists (e.g., `2.17.0`) and rebuild:
```bash
sed -n '1,20p' docker/Dockerfile.tf-cpu # verify the FROM line
docker pull tensorflow/tensorflow:2.17.0 # quick availability check
docker compose -f docker/compose.tf.yml build
```
```
### Start a shell with uv and TF available
```bash
docker compose -f docker/compose.tf.yml run --rm tf bash
```
Inside the container:
```bash
# Install project in editable mode without pulling full deps
uv pip install -e . --no-deps
# Install only what research needs alongside TensorFlow
uv pip install typer pandas scikit-learn seaborn plotly
# Sanity check
uv run python -c "import tensorflow as tf; print(tf.__version__)"
# Run an experiment
uv run ners research train --name="lstm" --type="baseline" --env="production"
``` ```
## Web Interface ## Web Interface
@@ -163,60 +160,9 @@ experiments and make predictions without needing to understand the underlying co
### Running the Web Interface ### Running the Web Interface
```bash ```bash
streamlit run web/app.py uv run ners web run --env="production"
``` ```
## GPU Acceleration
This project can leverage GPUs for faster training when supported libraries and hardware are available.
- TensorFlow/Keras models (BiGRU, LSTM, CNN, Transformer)
- Uses GPU automatically if a TensorFlow GPU build is installed.
- The code enables safe GPU memory growth by default; optionally enable mixed precision for additional speed:
- Add `mixed_precision: true` in the experiment `model_params` (e.g., in `config/research_templates.yaml`).
- The final layer outputs are set to float32 for numerical stability under mixed precision.
- spaCy NER
- Automatically prefers GPU if available; otherwise falls back to CPU.
- Ensure a compatible CUDA-enabled spaCy/thinc stack is installed to use GPU.
- XGBoost
- Enable GPU by adding to the experiment `model_params`:
- `use_gpu: true` (sets `tree_method: gpu_hist` and `predictor: gpu_predictor`).
- LightGBM
- Enable GPU by adding to the experiment `model_params`:
- `use_gpu: true` (sets `device: gpu`). Optional: `gpu_platform_id`, `gpu_device_id`.
Example template snippet (GPU on):
```yaml
- name: "lstm_gpu"
description: "LSTM with GPU + mixed precision"
model_type: "lstm"
features: ["full_name"]
model_params:
embedding_dim: 128
lstm_units: 64
epochs: 5
batch_size: 128
use_gpu: true
mixed_precision: true
tags: ["gpu", "mixed_precision"]
- name: "xgboost_gpu"
description: "XGBoost with GPU"
model_type: "xgboost"
features: ["full_name"]
model_params:
n_estimators: 200
use_gpu: true
```
Notes:
- Install CUDA-enabled binaries for TensorFlow/spaCy/LightGBM/XGBoost to actually use GPU.
- If GPU is requested but not available, training will proceed on CPU with a warning.
## Contributors ## Contributors
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors"> <a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
+21
View File
@@ -0,0 +1,21 @@
services:
  app:
    build:
      context: .
      dockerfile: Dockerfile
    image: drc-ners:uv
    working_dir: /app
    tty: true
    stdin_open: true
    environment:
      NERS_ENV: production
      STREAMLIT_SERVER_ADDRESS: 0.0.0.0
    # Expose Streamlit so `ners web run` is reachable from the host.
    ports:
      - "8501:8501"
    # Mount data/config/assets so runs persist outside the container.
    volumes:
      - ./assets:/app/assets
      - ./config:/app/config
      - ./data:/app/data
    # Default command shows CLI help; override per run.
    command: ["ners", "--help"]
-90
View File
@@ -1,90 +0,0 @@
#!/usr/bin/env python3
"""Command-line tool for monitoring and maintaining pipeline checkpoints.

Subcommands:
    clean  -- delete old checkpoint files, keeping the N most recent per step
    reset  -- delete all checkpoints for one step (or every step)
"""
import argparse
import sys
import traceback
from pathlib import Path

from core.config import setup_config
from processing.monitoring.pipeline_monitor import PipelineMonitor


def main() -> int:
    """Run the monitoring CLI.

    Returns:
        Process exit code: 0 on success, 1 on failure or when no command
        was given (help + status are printed in that case).
    """
    choices = [
        "data_cleaning",
        "data_selection",
        "feature_extraction",
        "ner_annotation",
        "llm_annotation",
        "data_splitting",
    ]

    parser = argparse.ArgumentParser(description="DRC NERS Processing Monitoring")
    parser.add_argument("--config", type=Path, help="Path to configuration file")
    parser.add_argument("--env", type=str, default="development", help="Environment")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Clean command
    clean_parser = subparsers.add_parser("clean", help="Clean checkpoint files")
    clean_parser.add_argument("--step", type=str, choices=choices, help="default: all")
    clean_parser.add_argument("--keep-last", type=int, default=1, help="(default: 1)")
    clean_parser.add_argument("--force", action="store_true", help="Clean without confirmation")

    # Reset command
    reset_parser = subparsers.add_parser("reset", help="Reset pipeline step")
    reset_parser.add_argument("--step", type=str, choices=choices, help="(default: all)")
    reset_parser.add_argument("--all", action="store_true", help="Reset all steps")
    reset_parser.add_argument("--force", action="store_true", help="Reset without confirmation")

    args = parser.parse_args()

    try:
        setup_config(config_path=args.config, env=args.env)
        monitor = PipelineMonitor()

        if not args.command:
            parser.print_help()
            monitor.print_status(detailed=True)
            return 1

        if args.command == "clean":
            checkpoint_info = monitor.count_checkpoint_files()
            print(f"Current checkpoint storage: {checkpoint_info['total_size_mb']:.1f} MB")
            if not args.force:
                response = input("Are you sure you want to clean checkpoints? (y/N): ")
                if response.lower() != "y":
                    print("Cancelled")
                    return 0
            steps = [args.step] if args.step else list(monitor.steps)
            for step in steps:
                monitor.clean_step_checkpoints(step, args.keep_last)
            print("Checkpoint cleaning completed")
        elif args.command == "reset":
            if not args.force:
                # BUG FIX: previously the prompt printed "reset None" when no
                # --step was given even though all steps would be reset.
                target = args.step or "all steps"
                response = input(
                    f"Are you sure you want to reset {target}? This will delete all checkpoints. (y/N): "
                )
                if response.lower() != "y":
                    print("Cancelled")
                    return 0
            steps = [args.step] if args.step else list(monitor.steps)
            for step in steps:
                monitor.reset_step(step)
            print("Reset completed")
        return 0
    except Exception as e:
        print(f"Monitoring failed: {e}")
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())
-499
View File
File diff suppressed because one or more lines are too long
+41
View File
@@ -0,0 +1,41 @@
[project]
name = "ners"
version = "0.1.0"
# Filled in the auto-generated placeholder description.
description = "Gender prediction and named-entity recognition for names from the DRC"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "geopandas>=1.1.1",
    "joblib>=1.5.2",
    "lightgbm>=4.6.0",
    "matplotlib>=3.10.6",
    "numpy>=2.3.3",
    "ollama>=0.6.0",
    "pandas>=2.3.3",
    "plotly>=6.3.1",
    "psutil>=7.1.0",
    "pydantic>=2.11.10",
    "pyyaml>=6.0.3",
    "scikit-learn>=1.7.2",
    "seaborn>=0.13.2",
    "spacy>=3.8.7",
    "streamlit>=1.50.0",
    "tqdm>=4.67.1",
    "typer>=0.19.2",
    "xgboost>=3.0.5",
]

[project.scripts]
ners = "ners.cli:app"

[build-system]
requires = ["uv_build>=0.8.12,<0.9.0"]
build-backend = "uv_build"

[dependency-groups]
dev = [
    "ruff>=0.13.3",
]

[tool.uv]
# Ensure resolution also succeeds for the Linux/x86_64 Docker image.
required-environments = ["sys_platform == 'linux' and platform_machine == 'x86_64'"]
-170
View File
@@ -1,170 +0,0 @@
absl-py==2.3.0
altair==5.1.2
annotated-types==0.7.0
anyio==4.9.0
appnope==0.1.4
argon2-cffi==25.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==3.0.0
astunparse==1.6.3
async-lru==2.0.5
attrs==25.3.0
babel==2.17.0
beautifulsoup4==4.13.4
black==25.1.0
bleach==6.2.0
blinker==1.9.0
cachetools==6.1.0
certifi==2025.6.15
cffi==1.17.1
charset-normalizer==3.4.2
click==8.2.1
comm==0.2.2
contourpy==1.3.2
cycler==0.12.1
debugpy==1.8.14
decorator==5.2.1
defusedxml==0.7.1
executing==2.2.0
fastjsonschema==2.21.1
flake8==7.3.0
flatbuffers==25.2.10
fonttools==4.58.4
fqdn==1.5.1
gast==0.6.0
gitdb==4.0.12
GitPython==3.1.45
google-pasta==0.2.0
grpcio==1.73.0
h11==0.16.0
h5py==3.14.0
httpcore==1.0.9
httpx==0.28.1
idna==3.10
imbalanced-learn==0.13.0
ipykernel==6.29.5
ipython>=8.0,<9.0
ipython_pygments_lexers==1.1.1
isoduration==20.11.0
jedi==0.19.2
Jinja2==3.1.6
joblib==1.5.1
json5==0.12.0
jsonpointer==3.0.0
jsonschema==4.24.0
jsonschema-specifications==2025.4.1
jupyter-events==0.12.0
jupyter-lsp==2.2.5
jupyter_client==8.6.3
jupyter_core==5.8.1
jupyter_server==2.16.0
jupyter_server_terminals==0.5.3
jupyterlab==4.4.4
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
keras==3.10.0
kiwisolver==1.4.8
libclang==18.1.1
lightgbm~=4.6.0
Markdown==3.8.2
markdown-it-py==3.0.0
MarkupSafe==3.0.2
matplotlib==3.10.3
matplotlib-inline==0.1.7
mccabe==0.7.0
mdurl==0.1.2
mistune==3.1.3
ml-dtypes==0.3.2
mypy==1.17.0
mypy_extensions==1.1.0
namex==0.1.0
narwhals==2.0.1
nbclient==0.10.2
nbconvert==7.16.6
nbformat==5.10.4
nest-asyncio==1.6.0
nltk==3.9.1
notebook==7.4.4
notebook_shim==0.2.4
numpy==1.26.4
ollama~=0.5.1
opt_einsum==3.4.0
optree==0.16.0
overrides==7.7.0
packaging==25.0
pandas==2.3.0
pandocfilters==1.5.1
parso==0.8.4
pathspec==0.12.1
pexpect==4.9.0
pillow==11.2.1
platformdirs==4.3.8
plotly~=6.2.0
prometheus_client==0.22.1
prompt_toolkit==3.0.51
protobuf==4.25.8
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pyarrow==21.0.0
pycodestyle==2.14.0
pycparser==2.22
pydantic~=2.11.7
pydantic_core==2.33.2
pydeck==0.9.1
pyflakes==3.4.0
Pygments==2.19.1
pyparsing==3.2.3
python-dateutil==2.9.0.post0
python-json-logger==3.3.0
pytz==2025.2
PyYAML~=6.0.2
pyzmq==27.0.0
referencing==0.36.2
regex==2024.11.6
requests==2.32.4
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rich==14.0.0
rpds-py==0.26.0
scikit-learn~=1.6.1
scipy==1.15.3
seaborn==0.13.2
Send2Trash==1.8.3
six==1.17.0
sklearn-compat==0.1.3
smmap==5.0.2
sniffio==1.3.1
soupsieve==2.7
spacy~=3.8.7
stack-data==0.6.3
streamlit~=1.47.1
tenacity==9.1.2
tensorboard==2.16.2
tensorboard-data-server==0.7.2
tensorflow==2.16.2
tensorflow-io-gcs-filesystem==0.37.1
termcolor==3.1.0
terminado==0.18.1
threadpoolctl==3.6.0
tinycss2==1.4.0
toml==0.10.2
toolz==1.0.0
tornado==6.5.1
tqdm==4.67.1
traitlets==5.14.3
types-python-dateutil==2.9.0.20250516
types-PyYAML==6.0.12.20250516
typing-inspection==0.4.1
typing_extensions==4.14.0
tzdata==2025.2
uri-template==1.3.0
urllib3==2.5.0
wcwidth==0.2.13
webcolors==24.11.1
webencodings==0.5.1
websocket-client==1.8.0
Werkzeug==3.1.3
wrapt==1.17.2
xgboost~=3.0.3
+3
View File
@@ -0,0 +1,3 @@
"""DRC NERS NLP package."""
__all__: list[str] = []
+226
View File
@@ -0,0 +1,226 @@
from __future__ import annotations
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional
import typer
from ners.core.config import setup_config, PipelineConfig
app = typer.Typer(help="DRC NERS command-line interface", no_args_is_help=True)

# -------------------------
# Pipeline commands
# -------------------------
pipeline_app = typer.Typer(help="Data processing pipeline")
app.add_typer(pipeline_app, name="pipeline")


@pipeline_app.command("run")
def pipeline_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Execute the full data-processing pipeline and exit with its status code."""
    # Imported lazily so `ners --help` stays fast.
    from ners.main import run_pipeline as _run_pipeline

    pipeline_config = setup_config(config_path=config, env=env)
    exit_code = _run_pipeline(pipeline_config)
    raise typer.Exit(exit_code)
# -------------------------
# NER commands
# -------------------------
ner_app = typer.Typer(help="NER dataset and model")
app.add_typer(ner_app, name="ner")


def _load_config(config: Optional[Path], env: str) -> PipelineConfig:
    """Load the project configuration for the given file and environment."""
    return setup_config(config_path=config, env=env)


@ner_app.command("feature")
def ner_feature(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Run the NER feature-extraction step."""
    from ners.ner import feature as _feature

    _feature(_load_config(config, env))


@ner_app.command("build")
def ner_build(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Build the NER dataset."""
    from ners.ner import build as _build

    _build(_load_config(config, env))


@ner_app.command("train")
def ner_train(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Train the NER model."""
    from ners.ner import train as _train

    _train(_load_config(config, env))


@ner_app.command("run")
def ner_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
    reset: bool = typer.Option(
        False, help="Reset intermediate outputs and rerun all steps"
    ),
) -> None:
    """Run the full NER pipeline and exit with its status code."""
    from ners.ner import run_pipeline as _ner_pipeline

    status = _ner_pipeline(_load_config(config, env), reset)
    raise typer.Exit(status)
# -------------------------
# Research commands
# -------------------------
research_app = typer.Typer(help="Research experiments and training")
app.add_typer(research_app, name="research")


@research_app.command("train")
def research_train(
    name: str = typer.Option(..., "--name", help="Model name to train"),
    # Renamed from `type` to avoid shadowing the builtin; the explicit
    # "--type" flag keeps the CLI interface unchanged.
    experiment_type: str = typer.Option(..., "--type", help="Experiment type"),
    templates: str = typer.Option(
        "research_templates.yaml", help="Templates file path"
    ),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Train one model described by a research template.

    Looks up the template matching ``--name``/``--type`` in the templates
    file, then trains that single model with its declared features,
    parameters, and tags.
    """
    from ners.research.experiment.experiment_builder import ExperimentBuilder
    from ners.research.model_trainer import ModelTrainer

    cfg = _load_config(config, env)
    builder = ExperimentBuilder(cfg)
    template_set = builder.load_templates(templates)
    experiment = builder.find_template(template_set, name, experiment_type)

    ModelTrainer(cfg).train_single_model(
        model_name=experiment.get("name"),
        model_type=experiment.get("model_type"),
        features=experiment.get("features"),
        model_params=experiment.get("model_params", {}),
        tags=experiment.get("tags", []),
    )
# -------------------------
# Monitor commands
# -------------------------
monitor_app = typer.Typer(help="Monitor pipeline checkpoints")
app.add_typer(monitor_app, name="monitor")


@monitor_app.command("status")
def monitor_status(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
    detailed: bool = typer.Option(
        False, help="Show detailed status (failed batch IDs)"
    ),
) -> None:
    """Print the current checkpoint status for every pipeline step."""
    _load_config(config, env)
    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    PipelineMonitor().print_status(detailed=detailed)


@monitor_app.command("clean")
def monitor_clean(
    step: Optional[str] = typer.Option(None, help="Step to clean; default all"),
    keep_last: int = typer.Option(1, help="Number of latest checkpoint files to keep"),
    force: bool = typer.Option(False, help="Do not ask for confirmation"),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Delete old checkpoint files, keeping the most recent ones per step."""
    _load_config(config, env)
    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    monitor = PipelineMonitor()
    if not force:
        typer.confirm("Clean checkpoints?", abort=True)

    targets = [step] if step else list(monitor.steps)
    for target in targets:
        monitor.clean_step_checkpoints(target, keep_last)


@monitor_app.command("reset")
def monitor_reset(
    step: Optional[str] = typer.Option(None, help="Step to reset; default all"),
    force: bool = typer.Option(False, help="Do not ask for confirmation"),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Delete all checkpoints for one step (or for every step)."""
    _load_config(config, env)
    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    monitor = PipelineMonitor()
    if not force:
        typer.confirm(
            f"Reset {step or 'all steps'}? This deletes checkpoints.", abort=True
        )

    targets = [step] if step else list(monitor.steps)
    for target in targets:
        monitor.reset_step(target)
# -------------------------
# Web commands
# -------------------------
web_app = typer.Typer(help="Web UI wrapper")
app.add_typer(web_app, name="web")


@web_app.command("run")
def web_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Launch the Streamlit web app via subprocess."""
    streamlit_app = Path(__file__).parent / "web" / "app.py"
    command = [
        sys.executable,
        "-m",
        "streamlit",
        "run",
        str(streamlit_app),
    ]

    # Pass configuration through environment variables so Streamlit's own
    # argument parsing is left untouched.
    child_env = os.environ.copy()
    if config is not None:
        child_env["NERS_CONFIG"] = str(config)
    child_env["NERS_ENV"] = env

    raise typer.Exit(subprocess.call(command, env=child_env))


if __name__ == "__main__":  # pragma: no cover
    app()
@@ -2,10 +2,10 @@ import logging
from pathlib import Path from pathlib import Path
from typing import Optional, Union from typing import Optional, Union
from core.utils import ensure_directories from ners.core.utils import ensure_directories
from .config_manager import ConfigManager from ners.core.config.config_manager import ConfigManager
from .logging_config import LoggingConfig from ners.core.config.logging_config import LoggingConfig
from .pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
config_manager = ConfigManager() config_manager = ConfigManager()
@@ -22,7 +22,9 @@ def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfi
return config_manager.get_config() return config_manager.get_config()
def setup_config(config_path: Optional[Path] = None, env: str = "development") -> PipelineConfig: def setup_config(
config_path: Optional[Path] = None, env: str = "development"
) -> PipelineConfig:
""" """
Unified configuration loading and logging setup for all entrypoint scripts. Unified configuration loading and logging setup for all entrypoint scripts.
@@ -5,8 +5,8 @@ from typing import Optional, Union, Dict, Any
import yaml import yaml
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
from core.config.project_paths import ProjectPaths from ners.core.config.project_paths import ProjectPaths
class ConfigManager: class ConfigManager:
@@ -36,7 +36,7 @@ class ConfigManager:
def _setup_default_paths(self): def _setup_default_paths(self):
"""Setup default project paths""" """Setup default project paths"""
root_dir = Path(__file__).parent.parent.parent root_dir = Path(__file__).parent.parent.parent.parent.parent
self.default_paths = ProjectPaths( self.default_paths = ProjectPaths(
root_dir=root_dir, root_dir=root_dir,
configs_dir=root_dir / "config", configs_dir=root_dir / "config",
@@ -53,7 +53,9 @@ class ConfigManager:
self.config_path = config_path self.config_path = config_path
if not self.config_path.exists(): if not self.config_path.exists():
logging.warning(f"Config file not found: {self.config_path}. Using defaults.") logging.warning(
f"Config file not found: {self.config_path}. Using defaults."
)
return self._create_default_config() return self._create_default_config()
try: try:
@@ -122,7 +124,11 @@ class ConfigManager:
def _deep_update(self, base_dict: Dict, update_dict: Dict): def _deep_update(self, base_dict: Dict, update_dict: Dict):
"""Recursively update nested dictionaries""" """Recursively update nested dictionaries"""
for key, value in update_dict.items(): for key, value in update_dict.items():
if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict): if (
key in base_dict
and isinstance(base_dict[key], dict)
and isinstance(value, dict)
):
self._deep_update(base_dict[key], value) self._deep_update(base_dict[key], value)
else: else:
base_dict[key] = value base_dict[key] = value
@@ -1,10 +1,10 @@
from pydantic import BaseModel from pydantic import BaseModel
from core.config.annotation_config import AnnotationConfig from ners.core.config.annotation_config import AnnotationConfig
from core.config.data_config import DataConfig from ners.core.config.data_config import DataConfig
from core.config.logging_config import LoggingConfig from ners.core.config.logging_config import LoggingConfig
from core.config.processing_config import ProcessingConfig from ners.core.config.processing_config import ProcessingConfig
from core.config.project_paths import ProjectPaths from ners.core.config.project_paths import ProjectPaths
class PipelineConfig(BaseModel): class PipelineConfig(BaseModel):
@@ -10,6 +10,8 @@ class ProcessingConfig(BaseModel):
max_workers: int = 4 max_workers: int = 4
checkpoint_interval: int = 5 checkpoint_interval: int = 5
use_multiprocessing: bool = False use_multiprocessing: bool = False
encoding_options: list = field(default_factory=lambda: ["utf-8", "utf-16", "latin1"]) encoding_options: list = field(
default_factory=lambda: ["utf-8", "utf-16", "latin1"]
)
chunk_size: int = 100_000 chunk_size: int = 100_000
epochs: int = 2 epochs: int = 2
@@ -4,13 +4,13 @@ from pathlib import Path
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
if TYPE_CHECKING: if TYPE_CHECKING:
from core.config import PipelineConfig from ners.core.config import PipelineConfig
@contextmanager @contextmanager
def temporary_config_override(**overrides): def temporary_config_override(**overrides):
"""Context manager for temporarily overriding configuration""" """Context manager for temporarily overriding configuration"""
from core.config import get_config from ners.core.config import get_config
config = get_config() config = get_config()
original_values = {} original_values = {}
@@ -5,7 +5,7 @@ from typing import Optional, Union, Iterator, Dict
import pandas as pd import pandas as pd
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
OPTIMIZED_DTYPES = { OPTIMIZED_DTYPES = {
# Numeric columns with appropriate bit-width # Numeric columns with appropriate bit-width
@@ -113,7 +113,9 @@ class DataLoader:
sex_values = df["sex"].dropna().unique() sex_values = df["sex"].dropna().unique()
if len(sex_values) == 0: if len(sex_values) == 0:
logging.warning(f"No valid values found in sex column 'sex', using random sampling") logging.warning(
"No valid values found in sex column 'sex', using random sampling"
)
return df.sample(n=max_size, random_state=self.config.data.random_seed) return df.sample(n=max_size, random_state=self.config.data.random_seed)
# Calculate samples per sex category # Calculate samples per sex category
@@ -140,18 +142,22 @@ class DataLoader:
logging.info(f"Sampled {current_samples} records for sex '{sex}'") logging.info(f"Sampled {current_samples} records for sex '{sex}'")
if not balanced_samples: if not balanced_samples:
logging.warning("No balanced samples could be created, using random sampling") logging.warning(
"No balanced samples could be created, using random sampling"
)
return df.sample(n=max_size, random_state=self.config.data.random_seed) return df.sample(n=max_size, random_state=self.config.data.random_seed)
# Create result using iloc with indices (no copying until final step) # Create result using iloc with indices (no copying until final step)
result = df.iloc[balanced_samples].copy() result = df.iloc[balanced_samples].copy()
# Shuffle the final result # Shuffle the final result
result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index( result = result.sample(
drop=True frac=1, random_state=self.config.data.random_seed
) ).reset_index(drop=True)
logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total") logging.info(
f"Created balanced dataset with {len(result)} records from {len(df)} total"
)
return result return result
@classmethod @classmethod
@@ -1,4 +1,4 @@
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
class PromptManager: class PromptManager:
@@ -2,7 +2,7 @@ import json
import logging import logging
from typing import Dict, Any from typing import Dict, Any
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
class StateManager: class StateManager:
+11 -41
View File
@@ -1,21 +1,17 @@
#!.venv/bin/python3 #!.venv/bin/python3
import argparse
import logging import logging
import sys from ners.core.utils.data_loader import DataLoader
import traceback from ners.processing.batch.batch_config import BatchConfig
from ners.processing.pipeline import Pipeline
from core.config import setup_config from ners.processing.steps.data_cleaning_step import DataCleaningStep
from core.utils.data_loader import DataLoader from ners.processing.steps.data_selection_step import DataSelectionStep
from processing.batch.batch_config import BatchConfig from ners.processing.steps.data_splitting_step import DataSplittingStep
from processing.pipeline import Pipeline from ners.processing.steps.llm_annotation_step import LLMAnnotationStep
from processing.steps.data_cleaning_step import DataCleaningStep from ners.processing.steps.ner_annotation_step import NERAnnotationStep
from processing.steps.data_selection_step import DataSelectionStep from ners.processing.steps.feature_extraction_step import FeatureExtractionStep
from processing.steps.data_splitting_step import DataSplittingStep
from processing.steps.feature_extraction_step import FeatureExtractionStep
def create_pipeline(config) -> Pipeline: def create_pipeline(config) -> Pipeline:
"""Create pipeline from configuration"""
batch_config = BatchConfig( batch_config = BatchConfig(
batch_size=config.processing.batch_size, batch_size=config.processing.batch_size,
max_workers=config.processing.max_workers, max_workers=config.processing.max_workers,
@@ -23,14 +19,13 @@ def create_pipeline(config) -> Pipeline:
use_multiprocessing=config.processing.use_multiprocessing, use_multiprocessing=config.processing.use_multiprocessing,
) )
# Add steps based on configuration
pipeline = Pipeline(batch_config) pipeline = Pipeline(batch_config)
steps = [ steps = [
DataCleaningStep(config), DataCleaningStep(config),
FeatureExtractionStep(config), FeatureExtractionStep(config),
DataSelectionStep(config), DataSelectionStep(config),
# NERAnnotationStep(config), NERAnnotationStep(config),
# LLMAnnotationStep(config), LLMAnnotationStep(config),
] ]
for stage in config.stages: for stage in config.stages:
@@ -42,7 +37,6 @@ def create_pipeline(config) -> Pipeline:
def run_pipeline(config) -> int: def run_pipeline(config) -> int:
"""Run the complete pipeline"""
try: try:
logging.info(f"Starting pipeline: {config.name} v{config.version}") logging.info(f"Starting pipeline: {config.name} v{config.version}")
@@ -79,27 +73,3 @@ def run_pipeline(config) -> int:
except Exception as e: except Exception as e:
logging.error(f"Pipeline failed: {e}", exc_info=True) logging.error(f"Pipeline failed: {e}", exc_info=True)
return 1 return 1
def main():
"""Main entry point with unified configuration loading"""
parser = argparse.ArgumentParser(
description="DRC NERS Processing Pipeline",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument("--config", type=str, help="Path to configuration file")
parser.add_argument("--env", type=str, default="development", help="Environment name")
args = parser.parse_args()
try:
config = setup_config(config_path=args.config, env=args.env)
return run_pipeline(config)
except Exception as e:
print(f"Pipeline failed: {e}")
traceback.print_exc()
return 1
if __name__ == "__main__":
sys.exit(main())
+14
View File
@@ -0,0 +1,14 @@
#!.venv/bin/python3
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
def status(*, detailed: bool = False) -> None:
PipelineMonitor().print_status(detailed=detailed)
def clean_step(step: str, *, keep_last: int = 1) -> None:
PipelineMonitor().clean_step_checkpoints(step, keep_last)
def reset_step(step: str) -> None:
PipelineMonitor().reset_step(step)
+10 -25
View File
@@ -1,29 +1,24 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse
import logging import logging
import os import os
import sys
import traceback import traceback
from pathlib import Path from pathlib import Path
from core.config import setup_config, PipelineConfig from ners.core.config import PipelineConfig
from processing.ner.name_builder import NameBuilder from ners.processing.ner.name_builder import NameBuilder
from processing.ner.name_engineering import NameEngineering from ners.processing.ner.name_engineering import NameEngineering
from processing.ner.name_model import NameModel from ners.processing.ner.name_model import NameModel
def feature(config: PipelineConfig): def feature(config: PipelineConfig):
"""Apply feature engineering to create position-independent NER dataset."""
NameEngineering(config).compute() NameEngineering(config).compute()
def build(config: PipelineConfig): def build(config: PipelineConfig):
"""Build NER dataset using NERDataBuilder."""
NameBuilder(config).build() NameBuilder(config).build()
def train(config: PipelineConfig): def train(config: PipelineConfig):
"""Train the NER model."""
name_model = NameModel(config) name_model = NameModel(config)
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"] data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
@@ -37,7 +32,9 @@ def train(config: PipelineConfig):
split_idx = int(len(data) * 0.9) split_idx = int(len(data) * 0.9)
train_data, eval_data = data[:split_idx], data[split_idx:] train_data, eval_data = data[:split_idx], data[split_idx:]
logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}") logging.info(
f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}"
)
name_model.train( name_model.train(
data=train_data, data=train_data,
epochs=config.processing.epochs, epochs=config.processing.epochs,
@@ -75,21 +72,9 @@ def run_pipeline(config: PipelineConfig, reset: bool = False):
def main(): def main():
parser = argparse.ArgumentParser(description="NER model management for DRC names")
parser.add_argument("--config", type=str, help="Path to configuration file")
parser.add_argument("--env", type=str, default="development", help="Environment name")
parser.add_argument("--reset", action="store_true", help="Reset all steps")
args = parser.parse_args()
try: try:
config = setup_config(config_path=args.config, env=args.env) logging.error("This module is no longer a CLI. Use 'ners ner ...' instead.")
return run_pipeline(config, args.reset) return 1
except Exception:
except Exception as e:
print(f"Pipeline failed: {e}")
traceback.print_exc() traceback.print_exc()
return 1 return 1
if __name__ == "__main__":
sys.exit(main())
@@ -8,4 +8,6 @@ class BatchConfig:
batch_size: int = 1000 batch_size: int = 1000
max_workers: int = 4 max_workers: int = 4
checkpoint_interval: int = 5 # Save checkpoint every N batches checkpoint_interval: int = 5 # Save checkpoint every N batches
use_multiprocessing: bool = False # Use ProcessPoolExecutor instead of ThreadPoolExecutor use_multiprocessing: bool = (
False # Use ProcessPoolExecutor instead of ThreadPoolExecutor
)
@@ -4,9 +4,9 @@ from typing import Iterator
import pandas as pd import pandas as pd
from processing.batch.batch_config import BatchConfig from ners.processing.batch.batch_config import BatchConfig
from processing.batch.memory_monitor import MemoryMonitor from ners.processing.batch.memory_monitor import MemoryMonitor
from processing.steps import PipelineStep from ners.processing.steps import PipelineStep
class BatchProcessor: class BatchProcessor:
@@ -33,7 +33,9 @@ class BatchProcessor:
for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)): for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
if step.batch_exists(batch_id): if step.batch_exists(batch_id):
logging.info(f"Batch {batch_id} already processed, loading from checkpoint") logging.info(
f"Batch {batch_id} already processed, loading from checkpoint"
)
processed_batch = step.load_batch(batch_id) processed_batch = step.load_batch(batch_id)
else: else:
try: try:
@@ -80,7 +82,9 @@ class BatchProcessor:
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame: def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Memory-optimized concurrent processing""" """Memory-optimized concurrent processing"""
executor_class = ( executor_class = (
ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor ProcessPoolExecutor
if self.config.use_multiprocessing
else ThreadPoolExecutor
) )
results = {} results = {}
@@ -89,7 +93,9 @@ class BatchProcessor:
future_to_batch = {} future_to_batch = {}
for batch, batch_id in self.create_batches(df): for batch, batch_id in self.create_batches(df):
if step.batch_exists(batch_id): if step.batch_exists(batch_id):
logging.info(f"Batch {batch_id} already processed, loading from checkpoint") logging.info(
f"Batch {batch_id} already processed, loading from checkpoint"
)
results[batch_id] = step.load_batch(batch_id) results[batch_id] = step.load_batch(batch_id)
else: else:
# Only copy if necessary for concurrent processing # Only copy if necessary for concurrent processing
@@ -121,7 +127,9 @@ class BatchProcessor:
del results del results
self.memory_monitor.cleanup_memory() self.memory_monitor.cleanup_memory()
result = self._safe_concat(ordered_results) if ordered_results else pd.DataFrame() result = (
self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
)
# Final cleanup # Final cleanup
del ordered_results del ordered_results
@@ -131,7 +139,9 @@ class BatchProcessor:
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame: def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Process data using the configured strategy""" """Process data using the configured strategy"""
step.state.total_batches = (len(df) + self.config.batch_size - 1) // self.config.batch_size step.state.total_batches = (
len(df) + self.config.batch_size - 1
) // self.config.batch_size
step.load_state() step.load_state()
logging.info(f"Starting {step.name} with {step.state.total_batches} batches") logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
@@ -4,8 +4,8 @@ import shutil
from datetime import datetime from datetime import datetime
from typing import Optional, Dict from typing import Optional, Dict
from core.config.config_manager import ConfigManager from ners.core.config.config_manager import ConfigManager
from core.config.project_paths import ProjectPaths from ners.core.config.project_paths import ProjectPaths
class PipelineMonitor: class PipelineMonitor:
@@ -97,7 +97,10 @@ class PipelineMonitor:
avg_completion = total_completion / len(self.steps) avg_completion = total_completion / len(self.steps)
if avg_completion >= 100 and overall_status not in ["error", "completed_with_errors"]: if avg_completion >= 100 and overall_status not in [
"error",
"completed_with_errors",
]:
overall_status = "completed" overall_status = "completed"
return { return {
@@ -121,7 +124,9 @@ class PipelineMonitor:
print(f"{step_name.replace('_', ' ').title()}:") print(f"{step_name.replace('_', ' ').title()}:")
print(f" Status: {step_status['status']}") print(f" Status: {step_status['status']}")
print(f" Progress: {step_status['completion_percentage']:.1f}%") print(f" Progress: {step_status['completion_percentage']:.1f}%")
print(f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}") print(
f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}"
)
if step_status["failed_batches"] > 0: if step_status["failed_batches"] > 0:
print(f" Failed Batches: {step_status['failed_batches']}") print(f" Failed Batches: {step_status['failed_batches']}")
@@ -141,7 +146,10 @@ class PipelineMonitor:
if step_dir.exists(): if step_dir.exists():
csv_files = list(step_dir.glob("*.csv")) csv_files = list(step_dir.glob("*.csv"))
step_size = sum(f.stat().st_size for f in csv_files) step_size = sum(f.stat().st_size for f in csv_files)
counts[step] = {"files": len(csv_files), "size_mb": step_size / (1024 * 1024)} counts[step] = {
"files": len(csv_files),
"size_mb": step_size / (1024 * 1024),
}
total_size += step_size total_size += step_size
else: else:
counts[step] = {"files": 0, "size_mb": 0} counts[step] = {"files": 0, "size_mb": 0}
@@ -160,7 +168,9 @@ class PipelineMonitor:
csv_files = sorted(step_dir.glob("batch_*.csv")) csv_files = sorted(step_dir.glob("batch_*.csv"))
if len(csv_files) <= keep_last: if len(csv_files) <= keep_last:
logging.info(f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all") logging.info(
f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all"
)
return return
files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files
@@ -3,7 +3,7 @@ from typing import List, Tuple, Dict
import pandas as pd import pandas as pd
from processing.steps.feature_extraction_step import NameCategory from ners.processing.steps.feature_extraction_step import NameCategory
class BaseNameFormatter(ABC): class BaseNameFormatter(ABC):
@@ -12,7 +12,9 @@ class BaseNameFormatter(ABC):
Contains common logic for NER tagging and attribute computation. Contains common logic for NER tagging and attribute computation.
""" """
def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None): def __init__(
self, connectors: List[str] = None, additional_surnames: List[str] = None
):
self.connectors = connectors or ["wa", "ya", "ka", "ba"] self.connectors = connectors or ["wa", "ya", "ka", "ba"]
self.additional_surnames = additional_surnames or [ self.additional_surnames = additional_surnames or [
"jean", "jean",
@@ -46,7 +48,9 @@ class BaseNameFormatter(ABC):
end_pos = current_pos + len(word) end_pos = current_pos + len(word)
# Determine tag based on word content # Determine tag based on word content
if word in native_parts or any(connector in word for connector in self.connectors): if word in native_parts or any(
connector in word for connector in self.connectors
):
tag = "NATIVE" tag = "NATIVE"
elif word == surname or word in self.additional_surnames: elif word == surname or word in self.additional_surnames:
tag = "SURNAME" tag = "SURNAME"
@@ -72,7 +76,9 @@ class BaseNameFormatter(ABC):
"words": words_count, "words": words_count,
"length": length, "length": length,
"identified_category": ( "identified_category": (
NameCategory.SIMPLE.value if words_count == 3 else NameCategory.COMPOSE.value NameCategory.SIMPLE.value
if words_count == 3
else NameCategory.COMPOSE.value
), ),
} }
@@ -3,7 +3,7 @@ from typing import Dict
import pandas as pd import pandas as pd
from processing.ner.formats import BaseNameFormatter from ners.processing.ner.formats import BaseNameFormatter
class ConnectorFormatter(BaseNameFormatter): class ConnectorFormatter(BaseNameFormatter):
@@ -3,13 +3,15 @@ from typing import Dict
import pandas as pd import pandas as pd
from processing.ner.formats import BaseNameFormatter from ners.processing.ner.formats import BaseNameFormatter
class ExtendedSurnameFormatter(BaseNameFormatter): class ExtendedSurnameFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict: def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"]) native_parts = self.parse_native_components(row["probable_native"])
original_surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else "" original_surname = (
row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
)
# Add random additional surname # Add random additional surname
additional_surname = random.choice(self.additional_surnames) additional_surname = random.choice(self.additional_surnames)
@@ -22,7 +24,9 @@ class ExtendedSurnameFormatter(BaseNameFormatter):
"identified_name": row["probable_native"], "identified_name": row["probable_native"],
"probable_surname": combined_surname, "probable_surname": combined_surname,
"identified_surname": combined_surname, "identified_surname": combined_surname,
"ner_entities": str(self.create_ner_tags(full_name, native_parts, combined_surname)), "ner_entities": str(
self.create_ner_tags(full_name, native_parts, combined_surname)
),
"transformation_type": self.transformation_type, "transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name), **self.compute_numeric_features(full_name),
} }
@@ -2,7 +2,7 @@ from typing import Dict
import pandas as pd import pandas as pd
from processing.ner.formats import BaseNameFormatter from ners.processing.ner.formats import BaseNameFormatter
class NativeOnlyFormatter(BaseNameFormatter): class NativeOnlyFormatter(BaseNameFormatter):
@@ -2,7 +2,7 @@ from typing import Dict
import pandas as pd import pandas as pd
from processing.ner.formats import BaseNameFormatter from ners.processing.ner.formats import BaseNameFormatter
class OriginalFormatter(BaseNameFormatter): class OriginalFormatter(BaseNameFormatter):
@@ -2,7 +2,7 @@ from typing import Dict
import pandas as pd import pandas as pd
from processing.ner.formats import BaseNameFormatter from ners.processing.ner.formats import BaseNameFormatter
class PositionFlippedFormatter(BaseNameFormatter): class PositionFlippedFormatter(BaseNameFormatter):
@@ -2,7 +2,7 @@ from typing import Dict
import pandas as pd import pandas as pd
from processing.ner.formats import BaseNameFormatter from ners.processing.ner.formats import BaseNameFormatter
class ReducedNativeFormatter(BaseNameFormatter): class ReducedNativeFormatter(BaseNameFormatter):
@@ -11,7 +11,9 @@ class ReducedNativeFormatter(BaseNameFormatter):
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else "" surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
# Keep only first native component + surname # Keep only first native component + surname
reduced_native = native_parts[0] if len(native_parts) > 1 else row["probable_native"] reduced_native = (
native_parts[0] if len(native_parts) > 1 else row["probable_native"]
)
full_name = f"{reduced_native} {surname}".strip() full_name = f"{reduced_native} {surname}".strip()
return { return {
@@ -20,7 +22,9 @@ class ReducedNativeFormatter(BaseNameFormatter):
"identified_name": reduced_native, "identified_name": reduced_native,
"probable_surname": surname, "probable_surname": surname,
"identified_surname": surname, "identified_surname": surname,
"ner_entities": str(self.create_ner_tags(full_name, [reduced_native], surname)), "ner_entities": str(
self.create_ner_tags(full_name, [reduced_native], surname)
),
"transformation_type": self.transformation_type, "transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name), **self.compute_numeric_features(full_name),
} }
@@ -4,8 +4,8 @@ import logging
import spacy import spacy
from spacy.tokens import DocBin from spacy.tokens import DocBin
from core.config import PipelineConfig from ners.core.config import PipelineConfig
from core.utils.data_loader import DataLoader from ners.core.utils.data_loader import DataLoader
from .name_tagger import NameTagger from .name_tagger import NameTagger
@@ -20,7 +20,9 @@ class NameBuilder:
self.tagger = NameTagger() self.tagger = NameTagger()
def build(self) -> int: def build(self) -> int:
filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"]) filepath = self.config.paths.get_data_path(
self.config.data.output_files["engineered"]
)
df = self.data_loader.load_csv_complete(filepath) df = self.data_loader.load_csv_complete(filepath)
df = df[["name", "ner_tagged", "ner_entities"]] df = df[["name", "ner_tagged", "ner_entities"]]
@@ -38,7 +40,9 @@ class NameBuilder:
# Use NERNameTagger for parsing and validation # Use NERNameTagger for parsing and validation
parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"]) parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
validated_entities = self.tagger.validate_entities(ner_df["name"], parsed_entities) validated_entities = self.tagger.validate_entities(
ner_df["name"], parsed_entities
)
# Drop rows with no valid entities # Drop rows with no valid entities
mask = validated_entities.map(bool) mask = validated_entities.map(bool)
@@ -51,22 +55,33 @@ class NameBuilder:
# Prepare training data # Prepare training data
training_data = list( training_data = list(
zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities]) zip(
ner_df["name"].tolist(),
[{"entities": ents} for ents in validated_entities],
)
) )
# Use NERNameTagger to create spaCy DocBin # Use NERNameTagger to create spaCy DocBin
docs = self.tagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist()) docs = self.tagger.create_docs(
nlp, ner_df["name"].tolist(), validated_entities.tolist()
)
doc_bin = DocBin(docs=docs) doc_bin = DocBin(docs=docs)
# Save # Save
json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"]) json_path = self.config.paths.get_data_path(
spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"]) self.config.data.output_files["ner_data"]
)
spacy_path = self.config.paths.get_data_path(
self.config.data.output_files["ner_spacy"]
)
with open(json_path, "w", encoding="utf-8") as f: with open(json_path, "w", encoding="utf-8") as f:
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":")) json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
doc_bin.to_disk(spacy_path) doc_bin.to_disk(spacy_path)
logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}") logging.info(
f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}"
)
logging.info(f"Saved NER JSON to {json_path}") logging.info(f"Saved NER JSON to {json_path}")
logging.info(f"Saved NER spacy to {spacy_path}") logging.info(f"Saved NER spacy to {spacy_path}")
return 0 return 0
@@ -6,14 +6,14 @@ import numpy as np
import pandas as pd import pandas as pd
from tqdm import tqdm from tqdm import tqdm
from core.config import PipelineConfig from ners.core.config import PipelineConfig
from core.utils.data_loader import DataLoader from ners.core.utils.data_loader import DataLoader
from processing.ner.formats.connectors_format import ConnectorFormatter from ners.processing.ner.formats.connectors_format import ConnectorFormatter
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter from ners.processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
from processing.ner.formats.native_only_format import NativeOnlyFormatter from ners.processing.ner.formats.native_only_format import NativeOnlyFormatter
from processing.ner.formats.original_format import OriginalFormatter from ners.processing.ner.formats.original_format import OriginalFormatter
from processing.ner.formats.position_flipped_format import PositionFlippedFormatter from ners.processing.ner.formats.position_flipped_format import PositionFlippedFormatter
from processing.ner.formats.reduced_native_format import ReducedNativeFormatter from ners.processing.ner.formats.reduced_native_format import ReducedNativeFormatter
class NameEngineering: class NameEngineering:
@@ -44,42 +44,60 @@ class NameEngineering:
# Initialize format classes # Initialize format classes
self.formatters = { self.formatters = {
"original": OriginalFormatter(self.connectors, self.additional_surnames), "original": OriginalFormatter(self.connectors, self.additional_surnames),
"native_only": NativeOnlyFormatter(self.connectors, self.additional_surnames), "native_only": NativeOnlyFormatter(
"position_flipped": PositionFlippedFormatter(self.connectors, self.additional_surnames), self.connectors, self.additional_surnames
"reduced_native": ReducedNativeFormatter(self.connectors, self.additional_surnames), ),
"connector_added": ConnectorFormatter(self.connectors, self.additional_surnames), "position_flipped": PositionFlippedFormatter(
"extended_surname": ExtendedSurnameFormatter(self.connectors, self.additional_surnames), self.connectors, self.additional_surnames
),
"reduced_native": ReducedNativeFormatter(
self.connectors, self.additional_surnames
),
"connector_added": ConnectorFormatter(
self.connectors, self.additional_surnames
),
"extended_surname": ExtendedSurnameFormatter(
self.connectors, self.additional_surnames
),
} }
def load_data(self) -> pd.DataFrame: def load_data(self) -> pd.DataFrame:
"""Load and filter NER-tagged data from CSV file""" """Load and filter NER-tagged data from CSV file"""
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"]) filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
df = self.data_loader.load_csv_complete(filepath) df = self.data_loader.load_csv_complete(filepath)
# Filter only NER-tagged rows # Filter only NER-tagged rows
ner_data = df[df["ner_tagged"] == 1].copy() ner_data = df[df["ner_tagged"] == 1].copy()
logging.info(f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records") logging.info(
f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records"
)
return ner_data return ner_data
def compute(self) -> None: def compute(self) -> None:
logging.info("Applying feature engineering transformations...") logging.info("Applying feature engineering transformations...")
input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"]) input_filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
output_filepath = self.config.paths.get_data_path( output_filepath = self.config.paths.get_data_path(
self.config.data.output_files["engineered"] self.config.data.output_files["engineered"]
) )
df = self.data_loader.load_csv_complete(input_filepath) df = self.data_loader.load_csv_complete(input_filepath)
ner_df = df[df["ner_tagged"] == 1].copy() ner_df = df[df["ner_tagged"] == 1].copy()
logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records") logging.info(
f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records"
)
del df # No need to keep in memory del df # No need to keep in memory
gc.collect() gc.collect()
ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index( ner_df = ner_df.sample(
drop=True frac=1, random_state=self.config.data.random_seed
) ).reset_index(drop=True)
total_rows = len(ner_df) total_rows = len(ner_df)
# Calculate split points # Calculate split points
@@ -94,7 +112,11 @@ class NameEngineering:
(0, split_25_1, "original"), # First 25%: original format (0, split_25_1, "original"), # First 25%: original format
(split_25_1, split_25_2, "native_only"), # Second 25%: remove surname (split_25_1, split_25_2, "native_only"), # Second 25%: remove surname
(split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions (split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions
(split_25_3, split_10_1, "reduced_native"), # Fourth 10%: reduce native components (
split_25_3,
split_10_1,
"reduced_native",
), # Fourth 10%: reduce native components
(split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors (split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors
(split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames (split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames
] ]
@@ -11,7 +11,7 @@ from spacy.training import Example
from spacy.util import minibatch from spacy.util import minibatch
from tqdm import tqdm from tqdm import tqdm
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
class NameModel: class NameModel:
@@ -87,7 +87,9 @@ class NameModel:
# Handle different annotation formats from NERNameTagger # Handle different annotation formats from NERNameTagger
if not isinstance(annotations, dict) or "entities" not in annotations: if not isinstance(annotations, dict) or "entities" not in annotations:
logging.warning(f"Skipping invalid annotations at index {i}: {annotations}") logging.warning(
f"Skipping invalid annotations at index {i}: {annotations}"
)
skipped_count += 1 skipped_count += 1
continue continue
@@ -124,7 +126,9 @@ class NameModel:
valid_entities = [] valid_entities = []
for entity in entities: for entity in entities:
if not isinstance(entity, (list, tuple)) or len(entity) != 3: if not isinstance(entity, (list, tuple)) or len(entity) != 3:
logging.warning(f"Skipping invalid entity format in '{text}': {entity}") logging.warning(
f"Skipping invalid entity format in '{text}': {entity}"
)
continue continue
start, end, label = entity start, end, label = entity
@@ -138,21 +142,30 @@ class NameModel:
or start < 0 or start < 0
or end > len(text) or end > len(text)
): ):
logging.warning(f"Skipping invalid entity bounds in '{text}': {entity}") logging.warning(
f"Skipping invalid entity bounds in '{text}': {entity}"
)
continue continue
# Check for overlaps with already validated entities # Check for overlaps with already validated entities
has_overlap = any( has_overlap = any(
start < v_end and end > v_start for v_start, v_end, _ in valid_entities start < v_end and end > v_start
for v_start, v_end, _ in valid_entities
) )
if has_overlap: if has_overlap:
logging.warning(f"Skipping overlapping entity in '{text}': {entity}") logging.warning(
f"Skipping overlapping entity in '{text}': {entity}"
)
continue continue
# Validate that the span doesn't contain spaces (matching tagger validation) # Validate that the span doesn't contain spaces (matching tagger validation)
span_text = text[start:end] span_text = text[start:end]
if not span_text or span_text != span_text.strip() or " " in span_text: if (
not span_text
or span_text != span_text.strip()
or " " in span_text
):
logging.warning( logging.warning(
f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'" f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
) )
@@ -161,7 +174,9 @@ class NameModel:
valid_entities.append((start, end, label)) valid_entities.append((start, end, label))
if not valid_entities: if not valid_entities:
logging.warning(f"Skipping training example with no valid entities: '{text}'") logging.warning(
f"Skipping training example with no valid entities: '{text}'"
)
skipped_count += 1 skipped_count += 1
continue continue
@@ -219,7 +234,9 @@ class NameModel:
batches = minibatch(examples, size=batch_size) batches = minibatch(examples, size=batch_size)
for batch in batches: for batch in batches:
batch_losses = {} batch_losses = {}
self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer) self.nlp.update(
batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer
)
logging.info( logging.info(
f"Training batch with {len(batch)} examples, current losses: {batch_losses}" f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
) )
@@ -230,7 +247,7 @@ class NameModel:
del batches # free memory del batches # free memory
losses_history.append(losses.get("ner", 0)) losses_history.append(losses.get("ner", 0))
logging.info(f"Epoch {epoch+1}/{epochs}, Total Loss: {losses['ner']:.4f}") logging.info(f"Epoch {epoch + 1}/{epochs}, Total Loss: {losses['ner']:.4f}")
# Store training statistics # Store training statistics
self.training_stats = { self.training_stats = {
@@ -242,7 +259,9 @@ class NameModel:
"dropout_rate": dropout_rate, "dropout_rate": dropout_rate,
} }
logging.info(f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}") logging.info(
f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}"
)
def evaluate(self, test_data: List[Tuple[str, Dict]]) -> Dict[str, Any]: def evaluate(self, test_data: List[Tuple[str, Dict]]) -> Dict[str, Any]:
"""Evaluate the trained model on test data""" """Evaluate the trained model on test data"""
@@ -291,10 +310,14 @@ class NameModel:
entity_stats[label]["fp"] += 1 entity_stats[label]["fp"] += 1
# Calculate overall metrics # Calculate overall metrics
precision = correct_entities / predicted_entities if predicted_entities > 0 else 0 precision = (
correct_entities / predicted_entities if predicted_entities > 0 else 0
)
recall = correct_entities / actual_entities if actual_entities > 0 else 0 recall = correct_entities / actual_entities if actual_entities > 0 else 0
f1_score = ( f1_score = (
2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 2 * (precision * recall) / (precision + recall)
if (precision + recall) > 0
else 0
) )
# Calculate per-label metrics # Calculate per-label metrics
@@ -304,7 +327,11 @@ class NameModel:
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0 label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0 label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
label_f1 = ( label_f1 = (
(2 * (label_precision * label_recall) / (label_precision + label_recall)) (
2
* (label_precision * label_recall)
/ (label_precision + label_recall)
)
if (label_precision + label_recall) > 0 if (label_precision + label_recall) > 0
else 0 else 0
) )
@@ -394,7 +421,9 @@ class NameModel:
"label": ent.label_, "label": ent.label_,
"start": ent.start_char, "start": ent.start_char,
"end": ent.end_char, "end": ent.end_char,
"confidence": getattr(ent, "score", None), # If confidence scores are available "confidence": getattr(
ent, "score", None
), # If confidence scores are available
} }
) )
@@ -48,7 +48,9 @@ class NameTagger:
# Find the first occurrence of this native word that doesn't overlap # Find the first occurrence of this native word that doesn't overlap
start_pos = 0 start_pos = 0
while True: while True:
pos = name_lower.find(native_word_lower, start_pos) # Case-insensitive search pos = name_lower.find(
native_word_lower, start_pos
) # Case-insensitive search
if pos == -1: if pos == -1:
break break
@@ -78,7 +80,9 @@ class NameTagger:
# Find the first occurrence that doesn't overlap # Find the first occurrence that doesn't overlap
start_pos = 0 start_pos = 0
while True: while True:
pos = name_lower.find(surname_lower, start_pos) # Case-insensitive search pos = name_lower.find(
surname_lower, start_pos
) # Case-insensitive search
if pos == -1: if pos == -1:
break break
@@ -120,8 +124,13 @@ class NameTagger:
continue continue
# Check for overlaps with already validated entities # Check for overlaps with already validated entities
if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities): if any(
logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'") start < v_end and end > v_start
for v_start, v_end, _ in validated_entities
):
logging.warning(
f"Overlapping span ({start}, {end}, '{label}') in '{name}'"
)
continue continue
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces) # CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
@@ -200,10 +209,16 @@ class NameTagger:
elif entities_str.startswith("[[") and entities_str.endswith("]]"): elif entities_str.startswith("[[") and entities_str.endswith("]]"):
return [tuple(e) for e in ast.literal_eval(entities_str)] return [tuple(e) for e in ast.literal_eval(entities_str)]
elif entities_str.startswith("[{") and entities_str.endswith("}]"): elif entities_str.startswith("[{") and entities_str.endswith("}]"):
return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)] return [
(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)
]
else: else:
parsed = ast.literal_eval(entities_str) parsed = ast.literal_eval(entities_str)
return [tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3] return [
tuple(e)
for e in parsed
if isinstance(e, (list, tuple)) and len(e) == 3
]
except (ValueError, SyntaxError, json.JSONDecodeError): except (ValueError, SyntaxError, json.JSONDecodeError):
return [] return []
@@ -251,7 +266,9 @@ class NameTagger:
last_end = e last_end = e
return filtered return filtered
def validate_entities(self, texts: pd.Series, entities_series: pd.Series) -> pd.Series: def validate_entities(
self, texts: pd.Series, entities_series: pd.Series
) -> pd.Series:
"""Vectorized entity validation.""" """Vectorized entity validation."""
return pd.Series(map(self.validate, texts, entities_series), index=texts.index) return pd.Series(map(self.validate, texts, entities_series), index=texts.index)
@@ -4,9 +4,9 @@ from typing import Dict, Any
import pandas as pd import pandas as pd
from processing.batch.batch_config import BatchConfig from ners.processing.batch.batch_config import BatchConfig
from processing.batch.batch_processor import BatchProcessor from ners.processing.batch.batch_processor import BatchProcessor
from processing.steps import PipelineStep from ners.processing.steps import PipelineStep
class Pipeline: class Pipeline:
@@ -8,9 +8,9 @@ from typing import List, Optional
import pandas as pd import pandas as pd
from pydantic import BaseModel from pydantic import BaseModel
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
from core.utils.data_loader import DataLoader from ners.core.utils.data_loader import DataLoader
from processing.batch.batch_config import BatchConfig from ners.processing.batch.batch_config import BatchConfig
@dataclass @dataclass
@@ -38,7 +38,10 @@ class PipelineStep(ABC):
"""Abstract base class for pipeline steps""" """Abstract base class for pipeline steps"""
def __init__( def __init__(
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None self,
name: str,
pipeline_config: PipelineConfig,
batch_config: Optional[BatchConfig] = None,
): ):
self.name = name self.name = name
self.pipeline_config = pipeline_config self.pipeline_config = pipeline_config
@@ -2,9 +2,9 @@ import logging
import pandas as pd import pandas as pd
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
from core.utils.text_cleaner import TextCleaner from ners.core.utils.text_cleaner import TextCleaner
from processing.steps import PipelineStep from ners.processing.steps import PipelineStep
class DataCleaningStep(PipelineStep): class DataCleaningStep(PipelineStep):
@@ -2,8 +2,8 @@ import logging
import pandas as pd import pandas as pd
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
from processing.steps import PipelineStep from ners.processing.steps import PipelineStep
class DataSelectionStep(PipelineStep): class DataSelectionStep(PipelineStep):
@@ -31,8 +31,12 @@ class DataSelectionStep(PipelineStep):
) )
# Check which columns exist in the batch # Check which columns exist in the batch
available_columns = [col for col in self.selected_columns if col in batch.columns] available_columns = [
missing_columns = [col for col in self.selected_columns if col not in batch.columns] col for col in self.selected_columns if col in batch.columns
]
missing_columns = [
col for col in self.selected_columns if col not in batch.columns
]
if missing_columns: if missing_columns:
logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}") logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}")
@@ -1,11 +1,11 @@
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
from core.utils.region_mapper import RegionMapper from ners.core.utils.region_mapper import RegionMapper
from processing.batch.batch_config import BatchConfig from ners.processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep from ners.processing.steps import PipelineStep
from processing.steps.feature_extraction_step import Gender from ners.processing.steps.feature_extraction_step import Gender
class DataSplittingStep(PipelineStep): class DataSplittingStep(PipelineStep):
@@ -26,7 +26,9 @@ class DataSplittingStep(PipelineStep):
if self.eval_indices is None: if self.eval_indices is None:
np.random.seed(self.pipeline_config.data.random_seed) np.random.seed(self.pipeline_config.data.random_seed)
eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction) eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
self.eval_indices = set(np.random.choice(total_size, size=eval_size, replace=False)) self.eval_indices = set(
np.random.choice(total_size, size=eval_size, replace=False)
)
return self.eval_indices return self.eval_indices
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame: def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
@@ -45,7 +47,9 @@ class DataSplittingStep(PipelineStep):
df_evaluation = df[eval_mask] df_evaluation = df[eval_mask]
df_featured = df[~eval_mask] df_featured = df[~eval_mask]
self.data_loader.save_csv(df_evaluation, data_dir / output_files["evaluation"]) self.data_loader.save_csv(
df_evaluation, data_dir / output_files["evaluation"]
)
self.data_loader.save_csv(df_featured, data_dir / output_files["featured"]) self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
else: else:
self.data_loader.save_csv(df, data_dir / output_files["featured"]) self.data_loader.save_csv(df, data_dir / output_files["featured"])
@@ -53,7 +57,9 @@ class DataSplittingStep(PipelineStep):
if self.pipeline_config.data.split_by_province: if self.pipeline_config.data.split_by_province:
for province in RegionMapper.get_provinces(): for province in RegionMapper.get_provinces():
df_region = df[df.province == province] df_region = df[df.province == province]
self.data_loader.save_csv(df_region, data_dir / "provinces" / f"{province}.csv") self.data_loader.save_csv(
df_region, data_dir / "provinces" / f"{province}.csv"
)
if self.pipeline_config.data.split_by_gender: if self.pipeline_config.data.split_by_gender:
df_males = df[df.sex == Gender.MALE.value] df_males = df[df.sex == Gender.MALE.value]
@@ -5,10 +5,10 @@ from typing import Dict, Any
import pandas as pd import pandas as pd
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
from core.utils.region_mapper import RegionMapper from ners.core.utils.region_mapper import RegionMapper
from processing.ner.name_tagger import NameTagger from ners.processing.ner.name_tagger import NameTagger
from processing.steps import PipelineStep from ners.processing.steps import PipelineStep
class Gender(Enum): class Gender(Enum):
@@ -64,10 +64,14 @@ class FeatureExtractionStep(PipelineStep):
self._assign_probable_names(result) self._assign_probable_names(result)
self._process_simple_names(result) self._process_simple_names(result)
result["identified_category"] = self._assign_identified_category(result["words"]) result["identified_category"] = self._assign_identified_category(
result["words"]
)
if "year" in result.columns: if "year" in result.columns:
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype("Int16") result["year"] = pd.to_numeric(result["year"], errors="coerce").astype(
"Int16"
)
if "region" in result.columns: if "region" in result.columns:
result["province"] = self.region_mapper.map(result["region"]).str.lower() result["province"] = self.region_mapper.map(result["region"]).str.lower()
@@ -7,12 +7,12 @@ import ollama
import pandas as pd import pandas as pd
from pydantic import ValidationError from pydantic import ValidationError
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
from core.utils.prompt_manager import PromptManager from ners.core.utils.prompt_manager import PromptManager
from core.utils.rate_limiter import RateLimitConfig from ners.core.utils.rate_limiter import RateLimitConfig
from core.utils.rate_limiter import RateLimiter from ners.core.utils.rate_limiter import RateLimiter
from processing.batch.batch_config import BatchConfig from ners.processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep, NameAnnotation from ners.processing.steps import PipelineStep, NameAnnotation
class LLMAnnotationStep(PipelineStep): class LLMAnnotationStep(PipelineStep):
@@ -24,7 +24,8 @@ class LLMAnnotationStep(PipelineStep):
batch_config = BatchConfig( batch_config = BatchConfig(
batch_size=pipeline_config.processing.batch_size, batch_size=pipeline_config.processing.batch_size,
max_workers=min( max_workers=min(
self.llm_config.max_concurrent_requests, pipeline_config.processing.max_workers self.llm_config.max_concurrent_requests,
pipeline_config.processing.max_workers,
), ),
checkpoint_interval=pipeline_config.processing.checkpoint_interval, checkpoint_interval=pipeline_config.processing.checkpoint_interval,
use_multiprocessing=pipeline_config.processing.use_multiprocessing, use_multiprocessing=pipeline_config.processing.use_multiprocessing,
@@ -33,7 +34,9 @@ class LLMAnnotationStep(PipelineStep):
self.prompt = PromptManager(pipeline_config).load_prompt() self.prompt = PromptManager(pipeline_config).load_prompt()
self.rate_limiter = ( self.rate_limiter = (
self._create_rate_limiter() if self.llm_config.enable_rate_limiting else None self._create_rate_limiter()
if self.llm_config.enable_rate_limiting
else None
) )
# Statistics # Statistics
@@ -76,7 +79,9 @@ class LLMAnnotationStep(PipelineStep):
f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout" f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout"
) )
annotation = NameAnnotation.model_validate_json(response.message.content) annotation = NameAnnotation.model_validate_json(
response.message.content
)
result = { result = {
**annotation.model_dump(), **annotation.model_dump(),
"annotated": 1, "annotated": 1,
@@ -119,7 +124,9 @@ class LLMAnnotationStep(PipelineStep):
logging.info(f"Batch {batch_id}: No entries to annotate") logging.info(f"Batch {batch_id}: No entries to annotate")
return batch return batch
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM") logging.info(
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM"
)
batch = batch.copy() batch = batch.copy()
client = ollama.Client() client = ollama.Client()
@@ -5,9 +5,9 @@ from typing import Dict
import pandas as pd import pandas as pd
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
from processing.ner.name_model import NameModel from ners.processing.ner.name_model import NameModel
from processing.steps import PipelineStep, NameAnnotation from ners.processing.steps import PipelineStep, NameAnnotation
class NERAnnotationStep(PipelineStep): class NERAnnotationStep(PipelineStep):
@@ -39,7 +39,9 @@ class NERAnnotationStep(PipelineStep):
logging.info("NER model loaded successfully") logging.info("NER model loaded successfully")
else: else:
logging.warning(f"NER model not found at {self.model_path}") logging.warning(f"NER model not found at {self.model_path}")
logging.warning("NER annotation will be skipped. Train the model first.") logging.warning(
"NER annotation will be skipped. Train the model first."
)
self.name_model.nlp = None self.name_model.nlp = None
except Exception as e: except Exception as e:
logging.error(f"Failed to load NER model: {e}") logging.error(f"Failed to load NER model: {e}")
@@ -80,7 +82,9 @@ class NERAnnotationStep(PipelineStep):
# Create annotation result in same format as LLM step # Create annotation result in same format as LLM step
annotation = NameAnnotation( annotation = NameAnnotation(
identified_name=" ".join(native_parts) if native_parts else None, identified_name=" ".join(native_parts) if native_parts else None,
identified_surname=" ".join(surname_parts) if surname_parts else None, identified_surname=" ".join(surname_parts)
if surname_parts
else None,
) )
result = { result = {
@@ -124,7 +128,9 @@ class NERAnnotationStep(PipelineStep):
logging.info(f"Batch {batch_id}: No entries to annotate") logging.info(f"Batch {batch_id}: No entries to annotate")
return batch return batch
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER") logging.info(
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER"
)
batch = batch.copy() batch = batch.copy()
@@ -7,7 +7,7 @@ import matplotlib.pyplot as plt
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from research.experiment import ExperimentConfig from ners.research.experiment import ExperimentConfig
class BaseModel(ABC): class BaseModel(ABC):
@@ -103,16 +103,25 @@ class BaseModel(ABC):
feature_names = self._get_feature_names() feature_names = self._get_feature_names()
return dict(zip(feature_names, coefficients)) return dict(zip(feature_names, coefficients))
elif hasattr(self.model, "named_steps") and "classifier" in self.model.named_steps: elif (
hasattr(self.model, "named_steps")
and "classifier" in self.model.named_steps
):
# For sklearn pipelines (like LogisticRegression with vectorizer) # For sklearn pipelines (like LogisticRegression with vectorizer)
classifier = self.model.named_steps["classifier"] classifier = self.model.named_steps["classifier"]
if hasattr(classifier, "coef_"): if hasattr(classifier, "coef_"):
coefficients = np.abs(classifier.coef_[0]) coefficients = np.abs(classifier.coef_[0])
if hasattr(self.model.named_steps["vectorizer"], "get_feature_names_out"): if hasattr(
feature_names = self.model.named_steps["vectorizer"].get_feature_names_out() self.model.named_steps["vectorizer"], "get_feature_names_out"
):
feature_names = self.model.named_steps[
"vectorizer"
].get_feature_names_out()
# Take top features to avoid too many n-grams # Take top features to avoid too many n-grams
top_indices = np.argsort(coefficients)[-20:] top_indices = np.argsort(coefficients)[-20:]
return dict(zip(feature_names[top_indices], coefficients[top_indices])) return dict(
zip(feature_names[top_indices], coefficients[top_indices])
)
return None return None
@@ -143,7 +152,7 @@ class BaseModel(ABC):
model_data = joblib.load(path) model_data = joblib.load(path)
# Recreate the model instance # Recreate the model instance
from research.experiment import ExperimentConfig from ners.research.experiment import ExperimentConfig
config = ExperimentConfig.from_dict(model_data["config"]) config = ExperimentConfig.from_dict(model_data["config"])
instance = cls(config) instance = cls(config)
@@ -221,7 +230,9 @@ class BaseModel(ABC):
if "accuracy" in self.training_history: if "accuracy" in self.training_history:
axes[0].plot(self.training_history["accuracy"], label="Training Accuracy") axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
if "val_accuracy" in self.training_history: if "val_accuracy" in self.training_history:
axes[0].plot(self.training_history["val_accuracy"], label="Validation Accuracy") axes[0].plot(
self.training_history["val_accuracy"], label="Validation Accuracy"
)
axes[0].set_title("Model Accuracy") axes[0].set_title("Model Accuracy")
axes[0].set_xlabel("Epoch") axes[0].set_xlabel("Epoch")
axes[0].set_ylabel("Accuracy") axes[0].set_ylabel("Accuracy")
@@ -18,7 +18,9 @@ class ExperimentConfig:
tags: List[str] = field(default_factory=list) tags: List[str] = field(default_factory=list)
# Model configuration # Model configuration
model_type: str = "logistic_regression" # logistic_regression, lstm, transformer, etc. model_type: str = (
"logistic_regression" # logistic_regression, lstm, transformer, etc.
)
model_params: Dict[str, Any] = field(default_factory=dict) model_params: Dict[str, Any] = field(default_factory=dict)
# Feature configuration # Feature configuration
@@ -26,7 +28,9 @@ class ExperimentConfig:
feature_params: Dict[str, Any] = field(default_factory=dict) feature_params: Dict[str, Any] = field(default_factory=dict)
# Data configuration # Data configuration
train_data_filter: Optional[Dict[str, Any]] = None # Filter criteria for training data train_data_filter: Optional[Dict[str, Any]] = (
None # Filter criteria for training data
)
test_data_filter: Optional[Dict[str, Any]] = None test_data_filter: Optional[Dict[str, Any]] = None
target_column: str = "sex" target_column: str = "sex"
@@ -36,7 +40,9 @@ class ExperimentConfig:
cross_validation_folds: int = 5 cross_validation_folds: int = 5
# Evaluation configuration # Evaluation configuration
metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"]) metrics: List[str] = field(
default_factory=lambda: ["accuracy", "precision", "recall", "f1"]
)
def to_dict(self) -> Dict[str, Any]: def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization""" """Convert to dictionary for serialization"""
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field, asdict
from datetime import datetime from datetime import datetime
from typing import Optional, Dict, List, Any from typing import Optional, Dict, List, Any
from research.experiment import ExperimentConfig, ExperimentStatus from ners.research.experiment import ExperimentConfig, ExperimentStatus
@dataclass @dataclass
@@ -51,6 +51,8 @@ class ExperimentResult:
"""Create from dictionary""" """Create from dictionary"""
data["config"] = ExperimentConfig.from_dict(data["config"]) data["config"] = ExperimentConfig.from_dict(data["config"])
data["start_time"] = datetime.fromisoformat(data["start_time"]) data["start_time"] = datetime.fromisoformat(data["start_time"])
data["end_time"] = datetime.fromisoformat(data["end_time"]) if data["end_time"] else None data["end_time"] = (
datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
)
data["status"] = ExperimentStatus(data["status"]) data["status"] = ExperimentStatus(data["status"])
return cls(**data) return cls(**data)
@@ -3,9 +3,9 @@ from typing import List, Dict
import yaml import yaml
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
from research.experiment import ExperimentConfig from ners.research.experiment import ExperimentConfig
from research.experiment.feature_extractor import FeatureType from ners.research.experiment.feature_extractor import FeatureType
class ExperimentBuilder: class ExperimentBuilder:
@@ -27,7 +27,9 @@ class ExperimentBuilder:
raise raise
@classmethod @classmethod
def find_template(cls, templates: dict, name: str, experiment_type: str = "baseline") -> dict: def find_template(
cls, templates: dict, name: str, experiment_type: str = "baseline"
) -> dict:
"""Find experiment configuration by name and type""" """Find experiment configuration by name and type"""
# Map type to section in templates # Map type to section in templates
@@ -9,12 +9,16 @@ import pandas as pd
from sklearn.metrics import confusion_matrix from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from core.config import PipelineConfig from ners.core.config import PipelineConfig
from core.utils.data_loader import DataLoader from ners.core.utils.data_loader import DataLoader
from research.base_model import BaseModel from ners.research.base_model import BaseModel
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics from ners.research.experiment import (
from research.experiment.experiment_tracker import ExperimentTracker ExperimentConfig,
from research.model_registry import create_model ExperimentStatus,
calculate_metrics,
)
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.model_registry import create_model
class ExperimentRunner: class ExperimentRunner:
@@ -32,10 +36,14 @@ class ExperimentRunner:
try: try:
logging.info(f"Starting experiment: {experiment_id}") logging.info(f"Starting experiment: {experiment_id}")
self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING) self.tracker.update_experiment(
experiment_id, status=ExperimentStatus.RUNNING
)
# Load data # Load data
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"]) filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
df = self.data_loader.load_csv_complete(filepath) df = self.data_loader.load_csv_complete(filepath)
# Apply data filters if specified # Apply data filters if specified
@@ -63,8 +71,12 @@ class ExperimentRunner:
test_pred = model.predict(X_test) test_pred = model.predict(X_test)
# Calculate metrics # Calculate metrics
train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics) train_metrics = calculate_metrics(
test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics) y_train, train_pred, experiment_config.metrics
)
test_metrics = calculate_metrics(
y_test, test_pred, experiment_config.metrics
)
# Cross-validation if requested # Cross-validation if requested
cv_metrics = {} cv_metrics = {}
@@ -125,7 +137,9 @@ class ExperimentRunner:
experiment_ids = [] experiment_ids = []
for i, config in enumerate(experiments): for i, config in enumerate(experiments):
logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}") logging.info(
f"Running experiment {i + 1}/{len(experiments)}: {config.name}"
)
try: try:
exp_id = self.run_experiment(config) exp_id = self.run_experiment(config)
experiment_ids.append(exp_id) experiment_ids.append(exp_id)
@@ -136,7 +150,9 @@ class ExperimentRunner:
return experiment_ids return experiment_ids
@classmethod @classmethod
def _apply_data_filters(cls, df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame: def _apply_data_filters(
cls, df: pd.DataFrame, config: ExperimentConfig
) -> pd.DataFrame:
"""Apply data filters specified in experiment config""" """Apply data filters specified in experiment config"""
filtered_df = df.copy() filtered_df = df.copy()
@@ -148,9 +164,13 @@ class ExperimentRunner:
filtered_df = filtered_df[filtered_df[column].isin(criteria)] filtered_df = filtered_df[filtered_df[column].isin(criteria)]
elif isinstance(criteria, dict): elif isinstance(criteria, dict):
if "min" in criteria: if "min" in criteria:
filtered_df = filtered_df[filtered_df[column] >= criteria["min"]] filtered_df = filtered_df[
filtered_df[column] >= criteria["min"]
]
if "max" in criteria: if "max" in criteria:
filtered_df = filtered_df[filtered_df[column] <= criteria["max"]] filtered_df = filtered_df[
filtered_df[column] <= criteria["max"]
]
else: else:
filtered_df = filtered_df[filtered_df[column] == criteria] filtered_df = filtered_df[filtered_df[column] == criteria]
@@ -231,7 +251,9 @@ class ExperimentRunner:
return model return model
except Exception as e: except Exception as e:
logging.error(f"Failed to load model for experiment {experiment_id}: {e}") logging.error(
f"Failed to load model for experiment {experiment_id}: {e}"
)
return None return None
return None return None
@@ -6,9 +6,9 @@ from typing import Optional, Dict, List
import pandas as pd import pandas as pd
from core.config import PipelineConfig, get_config from ners.core.config import PipelineConfig, get_config
from research.experiment import ExperimentConfig, ExperimentStatus from ners.research.experiment import ExperimentConfig, ExperimentStatus
from research.experiment.experiement_result import ExperimentResult from ners.research.experiment.experiement_result import ExperimentResult
class ExperimentTracker: class ExperimentTracker:
@@ -97,7 +97,10 @@ class ExperimentTracker:
return sorted(results, key=lambda x: x.start_time, reverse=True) return sorted(results, key=lambda x: x.start_time, reverse=True)
def get_best_experiment( def get_best_experiment(
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None self,
metric: str = "accuracy",
dataset: str = "test",
filters: Optional[Dict] = None,
) -> Optional[ExperimentResult]: ) -> Optional[ExperimentResult]:
"""Get the best experiment based on a metric""" """Get the best experiment based on a metric"""
experiments = self.list_experiments() experiments = self.list_experiments()
@@ -106,7 +109,9 @@ class ExperimentTracker:
# Apply additional filters # Apply additional filters
if "model_type" in filters: if "model_type" in filters:
experiments = [ experiments = [
e for e in experiments if e.config.model_type == filters["model_type"] e
for e in experiments
if e.config.model_type == filters["model_type"]
] ]
if "features" in filters: if "features" in filters:
experiments = [ experiments = [
@@ -118,7 +123,9 @@ class ExperimentTracker:
valid_experiments = [] valid_experiments = []
for exp in experiments: for exp in experiments:
if exp.status == ExperimentStatus.COMPLETED: if exp.status == ExperimentStatus.COMPLETED:
metrics_dict = exp.test_metrics if dataset == "test" else exp.train_metrics metrics_dict = (
exp.test_metrics if dataset == "test" else exp.train_metrics
)
if metric in metrics_dict: if metric in metrics_dict:
valid_experiments.append((exp, metrics_dict[metric])) valid_experiments.append((exp, metrics_dict[metric]))
@@ -24,7 +24,9 @@ class FeatureType(Enum):
class FeatureExtractor: class FeatureExtractor:
"""Extract different types of features from name data""" """Extract different types of features from name data"""
def __init__(self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None): def __init__(
self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None
):
self.feature_types = feature_types self.feature_types = feature_types
self.feature_params = feature_params or {} self.feature_params = feature_params or {}
@@ -1,18 +1,18 @@
from typing import List from typing import List
from research.base_model import BaseModel from ners.research.base_model import BaseModel
from research.experiment import ExperimentConfig from ners.research.experiment import ExperimentConfig
from research.models.bigru_model import BiGRUModel from ners.research.models.bigru_model import BiGRUModel
from research.models.cnn_model import CNNModel from ners.research.models.cnn_model import CNNModel
from research.models.ensemble_model import EnsembleModel from ners.research.models.ensemble_model import EnsembleModel
from research.models.lightgbm_model import LightGBMModel from ners.research.models.lightgbm_model import LightGBMModel
from research.models.logistic_regression_model import LogisticRegressionModel from ners.research.models.logistic_regression_model import LogisticRegressionModel
from research.models.lstm_model import LSTMModel from ners.research.models.lstm_model import LSTMModel
from research.models.naive_bayes_model import NaiveBayesModel from ners.research.models.naive_bayes_model import NaiveBayesModel
from research.models.random_forest_model import RandomForestModel from ners.research.models.random_forest_model import RandomForestModel
from research.models.svm_model import SVMModel from ners.research.models.svm_model import SVMModel
from research.models.transformer_model import TransformerModel from ners.research.models.transformer_model import TransformerModel
from research.models.xgboost_model import XGBoostModel from ners.research.models.xgboost_model import XGBoostModel
MODEL_REGISTRY = { MODEL_REGISTRY = {
"bigru": BiGRUModel, "bigru": BiGRUModel,
@@ -5,12 +5,12 @@ from typing import List, Dict, Any
import pandas as pd import pandas as pd
from core.config import get_config from ners.core.config import get_config
from core.utils.data_loader import DataLoader from ners.core.utils.data_loader import DataLoader
from research.experiment import FeatureType, ExperimentConfig from ners.research.experiment import FeatureType, ExperimentConfig
from research.experiment.experiment_runner import ExperimentRunner from ners.research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker from ners.research.experiment.experiment_tracker import ExperimentTracker
from research.model_registry import MODEL_REGISTRY from ners.research.model_registry import MODEL_REGISTRY
class ModelTrainer: class ModelTrainer:
@@ -66,7 +66,9 @@ class ModelTrainer:
if experiment and experiment.test_metrics: if experiment and experiment.test_metrics:
logging.info("Training completed successfully!") logging.info("Training completed successfully!")
logging.info(f"Experiment ID: {experiment_id}") logging.info(f"Experiment ID: {experiment_id}")
logging.info(f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}") logging.info(
f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}"
)
logging.info(f"Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}") logging.info(f"Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")
if save_artifacts: if save_artifacts:
@@ -144,13 +146,17 @@ class ModelTrainer:
try: try:
# Load data for learning curve generation # Load data for learning curve generation
data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"]) data_path = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
if data_path.exists(): if data_path.exists():
df = self.data_loader.load_csv_complete(data_path) df = self.data_loader.load_csv_complete(data_path)
# Generate learning curve # Generate learning curve
logging.info("Generating learning curve...") logging.info("Generating learning curve...")
trained_model.generate_learning_curve(df, df[experiment.config.target_column]) trained_model.generate_learning_curve(
df, df[experiment.config.target_column]
)
# Plot and save learning curve # Plot and save learning curve
learning_curve_path = model_dir / "learning_curve.png" learning_curve_path = model_dir / "learning_curve.png"
@@ -187,8 +193,12 @@ class ModelTrainer:
"model_path": str(model_path), "model_path": str(model_path),
"config_path": str(config_path), "config_path": str(config_path),
"results_path": str(results_path), "results_path": str(results_path),
"learning_curve_plot": str(learning_curve_path) if learning_curve_path else None, "learning_curve_plot": str(learning_curve_path)
"training_history_plot": str(training_history_path) if training_history_path else None, if learning_curve_path
else None,
"training_history_plot": str(training_history_path)
if training_history_path
else None,
"has_learning_curve": bool(trained_model.learning_curve_data), "has_learning_curve": bool(trained_model.learning_curve_data),
"has_training_history": bool(trained_model.training_history), "has_training_history": bool(trained_model.training_history),
} }
@@ -215,8 +225,12 @@ class ModelTrainer:
"config_path": str(config_path), "config_path": str(config_path),
"results_path": str(results_path), "results_path": str(results_path),
"metadata_path": str(metadata_path), "metadata_path": str(metadata_path),
"learning_curve_plot": str(learning_curve_path) if learning_curve_path else None, "learning_curve_plot": str(learning_curve_path)
"training_history_plot": str(training_history_path) if training_history_path else None, if learning_curve_path
else None,
"training_history_plot": str(training_history_path)
if training_history_path
else None,
} }
def load_trained_model(self, experiment_id: str): def load_trained_model(self, experiment_id: str):
@@ -227,7 +241,9 @@ class ModelTrainer:
model_path = model_dir / "complete_model.joblib" model_path = model_dir / "complete_model.joblib"
if not model_path.exists(): if not model_path.exists():
raise FileNotFoundError(f"Model artifacts not found for experiment {experiment_id}") raise FileNotFoundError(
f"Model artifacts not found for experiment {experiment_id}"
)
# Load the model class dynamically # Load the model class dynamically
metadata_path = model_dir / "metadata.json" metadata_path = model_dir / "metadata.json"
@@ -261,7 +277,9 @@ class ModelTrainer:
metadata = json.load(f) metadata = json.load(f)
models_data.append(metadata) models_data.append(metadata)
except Exception as e: except Exception as e:
logging.warning(f"Could not read metadata for {model_dir.name}: {e}") logging.warning(
f"Could not read metadata for {model_dir.name}: {e}"
)
if not models_data: if not models_data:
logging.info("No saved models found.") logging.info("No saved models found.")
@@ -7,7 +7,7 @@ from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer from tensorflow.keras.preprocessing.text import Tokenizer
from research.neural_network_model import NeuralNetworkModel from ners.research.neural_network_model import NeuralNetworkModel
class BiGRUModel(NeuralNetworkModel): class BiGRUModel(NeuralNetworkModel):
@@ -53,7 +53,9 @@ class BiGRUModel(NeuralNetworkModel):
) )
model.compile( model.compile(
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"] loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
) )
return model return model
@@ -15,7 +15,7 @@ from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences from tensorflow.keras.preprocessing.sequence import pad_sequences
from research.neural_network_model import NeuralNetworkModel from ners.research.neural_network_model import NeuralNetworkModel
class CNNModel(NeuralNetworkModel): class CNNModel(NeuralNetworkModel):
@@ -29,7 +29,9 @@ class CNNModel(NeuralNetworkModel):
[ [
# Learn char/subword embeddings; spatial dropout regularizes across channels # Learn char/subword embeddings; spatial dropout regularizes across channels
# to make the model robust to noisy characters and transliteration. # to make the model robust to noisy characters and transliteration.
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)), Embedding(
input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)
),
SpatialDropout1D(rate=params.get("embedding_dropout", 0.1)), SpatialDropout1D(rate=params.get("embedding_dropout", 0.1)),
# Small kernels capture short n-gram like patterns; padding='same' keeps # Small kernels capture short n-gram like patterns; padding='same' keeps
# sequence length stable for simpler pooling behavior. # sequence length stable for simpler pooling behavior.
@@ -59,7 +61,9 @@ class CNNModel(NeuralNetworkModel):
) )
model.compile( model.compile(
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"] loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
) )
return model return model
@@ -75,6 +79,8 @@ class CNNModel(NeuralNetworkModel):
self.tokenizer.fit_on_texts(text_data) self.tokenizer.fit_on_texts(text_data)
sequences = self.tokenizer.texts_to_sequences(text_data) sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 20) # Longer for character level max_len = self.config.model_params.get(
"max_len", 20
) # Longer for character level
return pad_sequences(sequences, maxlen=max_len, padding="post") return pad_sequences(sequences, maxlen=max_len, padding="post")
@@ -8,8 +8,8 @@ from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline from sklearn.pipeline import Pipeline
from research.experiment import ExperimentConfig from ners.research.experiment import ExperimentConfig
from research.traditional_model import TraditionalModel from ners.research.traditional_model import TraditionalModel
class EnsembleModel(TraditionalModel): class EnsembleModel(TraditionalModel):
@@ -40,22 +40,28 @@ class EnsembleModel(TraditionalModel):
[ [
( (
"vectorizer", "vectorizer",
CountVectorizer(analyzer="char", ngram_range=(2, 4), max_features=5000), CountVectorizer(
analyzer="char", ngram_range=(2, 4), max_features=5000
),
), ),
( (
"classifier", "classifier",
LogisticRegression(max_iter=1000, random_state=self.config.random_seed), LogisticRegression(
max_iter=1000, random_state=self.config.random_seed
),
), ),
] ]
) )
estimators.append((f"logistic_regression", model)) estimators.append(("logistic_regression", model))
elif model_type == "random_forest": elif model_type == "random_forest":
model = Pipeline( model = Pipeline(
[ [
( (
"vectorizer", "vectorizer",
TfidfVectorizer(analyzer="char", ngram_range=(2, 3), max_features=3000), TfidfVectorizer(
analyzer="char", ngram_range=(2, 3), max_features=3000
),
), ),
( (
"classifier", "classifier",
@@ -65,19 +71,21 @@ class EnsembleModel(TraditionalModel):
), ),
] ]
) )
estimators.append((f"rf", model)) estimators.append(("rf", model))
elif model_type == "naive_bayes": elif model_type == "naive_bayes":
model = Pipeline( model = Pipeline(
[ [
( (
"vectorizer", "vectorizer",
CountVectorizer(analyzer="char", ngram_range=(1, 3), max_features=4000), CountVectorizer(
analyzer="char", ngram_range=(1, 3), max_features=4000
),
), ),
("classifier", MultinomialNB()), ("classifier", MultinomialNB()),
] ]
) )
estimators.append((f"nb", model)) estimators.append(("nb", model))
# Soft voting averages probabilities (preferred when members are calibrated); # Soft voting averages probabilities (preferred when members are calibrated);
# hard voting uses majority class. Parallelize member predictions. # hard voting uses majority class. Parallelize member predictions.
@@ -5,7 +5,7 @@ from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import LabelEncoder
from research.traditional_model import TraditionalModel from ners.research.traditional_model import TraditionalModel
class LightGBMModel(TraditionalModel): class LightGBMModel(TraditionalModel):
@@ -106,7 +106,9 @@ class LightGBMModel(TraditionalModel):
lambda x: x if x in known_classes else default_class lambda x: x if x in known_classes else default_class
) )
encoded = self.label_encoders[feature_key].transform(column_mapped) encoded = self.label_encoders[feature_key].transform(
column_mapped
)
features.append(encoded.reshape(-1, 1)) features.append(encoded.reshape(-1, 1))
@@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline from sklearn.pipeline import Pipeline
from research.traditional_model import TraditionalModel from ners.research.traditional_model import TraditionalModel
class LogisticRegressionModel(TraditionalModel): class LogisticRegressionModel(TraditionalModel):
@@ -7,7 +7,7 @@ from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer from tensorflow.keras.preprocessing.text import Tokenizer
from research.neural_network_model import NeuralNetworkModel from ners.research.neural_network_model import NeuralNetworkModel
class LSTMModel(NeuralNetworkModel): class LSTMModel(NeuralNetworkModel):
@@ -50,7 +50,9 @@ class LSTMModel(NeuralNetworkModel):
) )
model.compile( model.compile(
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"] loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
) )
return model return model
@@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline from sklearn.pipeline import Pipeline
from research.traditional_model import TraditionalModel from ners.research.traditional_model import TraditionalModel
class NaiveBayesModel(TraditionalModel): class NaiveBayesModel(TraditionalModel):
@@ -6,7 +6,7 @@ from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import LabelEncoder
from research.traditional_model import TraditionalModel from ners.research.traditional_model import TraditionalModel
class RandomForestModel(TraditionalModel): class RandomForestModel(TraditionalModel):
@@ -18,7 +18,6 @@ class RandomForestModel(TraditionalModel):
self.label_encoders: Dict[str, LabelEncoder] = {} self.label_encoders: Dict[str, LabelEncoder] = {}
def build_model(self) -> BaseEstimator: def build_model(self) -> BaseEstimator:
params = self.config.model_params params = self.config.model_params
# Tree ensemble is robust to mixed numeric/categorical encodings; parallelize # Tree ensemble is robust to mixed numeric/categorical encodings; parallelize
@@ -56,10 +55,14 @@ class RandomForestModel(TraditionalModel):
column_clean = column.fillna("unknown").astype(str) column_clean = column.fillna("unknown").astype(str)
known_classes = set(encoder.classes_) known_classes = set(encoder.classes_)
default_class = ( default_class = (
"unknown" if "unknown" in known_classes else encoder.classes_[0] "unknown"
if "unknown" in known_classes
else encoder.classes_[0]
) )
column_mapped = column_clean.apply( column_mapped = column_clean.apply(
lambda value: value if value in known_classes else default_class lambda value: value
if value in known_classes
else default_class
) )
encoded = encoder.transform(column_mapped) encoded = encoder.transform(column_mapped)
@@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline from sklearn.pipeline import Pipeline
from sklearn.svm import SVC from sklearn.svm import SVC
from research.traditional_model import TraditionalModel from ners.research.traditional_model import TraditionalModel
class SVMModel(TraditionalModel): class SVMModel(TraditionalModel):
@@ -16,7 +16,7 @@ from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer from tensorflow.keras.preprocessing.text import Tokenizer
from research.neural_network_model import NeuralNetworkModel from ners.research.neural_network_model import NeuralNetworkModel
class TransformerModel(NeuralNetworkModel): class TransformerModel(NeuralNetworkModel):
@@ -37,7 +37,8 @@ class TransformerModel(NeuralNetworkModel):
# Add positional encoding # Add positional encoding
positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1) positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
pos_embedding = Embedding( pos_embedding = Embedding(
input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64) input_dim=params.get("max_len", 8),
output_dim=params.get("embedding_dim", 64),
)(positions) )(positions)
x = x + pos_embedding x = x + pos_embedding
@@ -49,7 +50,9 @@ class TransformerModel(NeuralNetworkModel):
model = Model(inputs, outputs) model = Model(inputs, outputs)
model.compile( model.compile(
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] optimizer="adam",
loss="sparse_categorical_crossentropy",
metrics=["accuracy"],
) )
return model return model
@@ -62,11 +65,15 @@ class TransformerModel(NeuralNetworkModel):
key_dim=cfg_params.get("transformer_head_size", 64), key_dim=cfg_params.get("transformer_head_size", 64),
dropout=cfg_params.get("attn_dropout", 0.1), dropout=cfg_params.get("attn_dropout", 0.1),
)(x, x) )(x, x)
x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(attn)) x = LayerNormalization(epsilon=1e-6)(
x + Dropout(cfg_params.get("dropout", 0.1))(attn)
)
ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x) ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
ff = Dense(x.shape[-1])(ff) ff = Dense(x.shape[-1])(ff)
return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(ff)) return LayerNormalization(epsilon=1e-6)(
x + Dropout(cfg_params.get("dropout", 0.1))(ff)
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray: def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = self._collect_text_corpus(X) text_data = self._collect_text_corpus(X)
@@ -5,7 +5,7 @@ from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import LabelEncoder
from research.traditional_model import TraditionalModel from ners.research.traditional_model import TraditionalModel
class XGBoostModel(TraditionalModel): class XGBoostModel(TraditionalModel):
@@ -106,7 +106,9 @@ class XGBoostModel(TraditionalModel):
lambda x: x if x in known_classes else default_class lambda x: x if x in known_classes else default_class
) )
encoded = self.label_encoders[feature_key].transform(column_mapped) encoded = self.label_encoders[feature_key].transform(
column_mapped
)
features.append(encoded.reshape(-1, 1)) features.append(encoded.reshape(-1, 1))
@@ -10,8 +10,10 @@ from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import LabelEncoder
from research.base_model import BaseModel import tensorflow as tf
from research.experiment.feature_extractor import FeatureExtractor
from ners.research.base_model import BaseModel
from ners.research.experiment.feature_extractor import FeatureExtractor
class NeuralNetworkModel(BaseModel): class NeuralNetworkModel(BaseModel):
@@ -34,8 +36,6 @@ class NeuralNetworkModel(BaseModel):
# - Enables memory growth to avoid pre-allocating all VRAM # - Enables memory growth to avoid pre-allocating all VRAM
# - Optionally enables mixed precision if requested via model params # - Optionally enables mixed precision if requested via model params
try: try:
import tensorflow as tf # Imported lazily to avoid dependency for non-NN runs
requested_gpu = bool(self.config.model_params.get("use_gpu", False)) requested_gpu = bool(self.config.model_params.get("use_gpu", False))
enable_mixed = bool(self.config.model_params.get("mixed_precision", False)) enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
@@ -49,15 +49,15 @@ class NeuralNetworkModel(BaseModel):
if enable_mixed: if enable_mixed:
try: try:
from tensorflow.keras import mixed_precision tf.keras.mixed_precision.set_global_policy("mixed_float16")
mixed_precision.set_global_policy("mixed_float16")
logging.info("Enabled TensorFlow mixed precision (float16)") logging.info("Enabled TensorFlow mixed precision (float16)")
except Exception as e: except Exception as e:
logging.warning(f"Could not enable mixed precision: {e}") logging.warning(f"Could not enable mixed precision: {e}")
else: else:
if requested_gpu: if requested_gpu:
logging.warning("Requested GPU but no TensorFlow GPU device is available.") logging.warning(
"Requested GPU but no TensorFlow GPU device is available."
)
except Exception as e: except Exception as e:
# Keep silent in non-TF environments / non-NN workflows # Keep silent in non-TF environments / non-NN workflows
logging.debug(f"TensorFlow GPU setup skipped: {e}") logging.debug(f"TensorFlow GPU setup skipped: {e}")
@@ -86,7 +86,9 @@ class NeuralNetworkModel(BaseModel):
logging.info(f"Vocabulary size: {vocab_size}") logging.info(f"Vocabulary size: {vocab_size}")
# Get additional model parameters # Get additional model parameters
self.model = self.build_model_with_vocab(vocab_size=vocab_size, **self.config.model_params) self.model = self.build_model_with_vocab(
vocab_size=vocab_size, **self.config.model_params
)
# Train the neural network # Train the neural network
logging.info( logging.info(
@@ -143,7 +145,7 @@ class NeuralNetworkModel(BaseModel):
# Keep zeros (padding) untouched; clamp negatives and > max_idx to OOV # Keep zeros (padding) untouched; clamp negatives and > max_idx to OOV
invalid_mask = (arr < 0) | (arr > max_idx) invalid_mask = (arr < 0) | (arr > max_idx)
# Avoid turning zeros into OOV # Avoid turning zeros into OOV
invalid_mask &= (arr != 0) invalid_mask &= arr != 0
if invalid_mask.any(): if invalid_mask.any():
arr[invalid_mask] = oov_index arr[invalid_mask] = oov_index
@@ -157,10 +159,14 @@ class NeuralNetworkModel(BaseModel):
"""Combine configured textual features into one string per record.""" """Combine configured textual features into one string per record."""
column_names = [ column_names = [
feature.value for feature in self.config.features if feature.value in X.columns feature.value
for feature in self.config.features
if feature.value in X.columns
] ]
if not column_names: if not column_names:
raise ValueError("No configured text features found in the provided DataFrame.") raise ValueError(
"No configured text features found in the provided DataFrame."
)
text_frame = X[column_names].fillna("").astype(str) text_frame = X[column_names].fillna("").astype(str)
@@ -193,9 +199,7 @@ class NeuralNetworkModel(BaseModel):
pass pass
if enable_mixed: if enable_mixed:
try: try:
from tensorflow.keras import mixed_precision tf.keras.mixed_precision.set_global_policy("mixed_float16")
mixed_precision.set_global_policy("mixed_float16")
except Exception: except Exception:
pass pass
else: else:
@@ -208,7 +212,9 @@ class NeuralNetworkModel(BaseModel):
X_prepared = self._sanitize_sequences(X_prepared) X_prepared = self._sanitize_sequences(X_prepared)
y_encoded = self.label_encoder.transform(y) y_encoded = self.label_encoder.transform(y)
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed) cv = StratifiedKFold(
n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
)
accuracies = [] accuracies = []
precisions = [] precisions = []
@@ -280,14 +286,14 @@ class NeuralNetworkModel(BaseModel):
pass pass
if enable_mixed: if enable_mixed:
try: try:
from tensorflow.keras import mixed_precision tf.keras.mixed_precision.set_global_policy("mixed_float16")
mixed_precision.set_global_policy("mixed_float16")
except Exception: except Exception:
pass pass
else: else:
if requested_gpu: if requested_gpu:
logging.warning("Requested GPU for learning curve but none is available.") logging.warning(
"Requested GPU for learning curve but none is available."
)
except Exception: except Exception:
pass pass
@@ -342,7 +348,7 @@ class NeuralNetworkModel(BaseModel):
# Train model # Train model
if hasattr(model, "fit"): if hasattr(model, "fit"):
history = model.fit( model.fit(
X_train_subset, X_train_subset,
y_train_subset, y_train_subset,
epochs=self.config.model_params.get("epochs", 10), epochs=self.config.model_params.get("epochs", 10),
@@ -3,12 +3,16 @@ import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
from research.statistics.utils import LETTERS, build_letter_frequencies from ners.research.statistics.utils import LETTERS, build_letter_frequencies
def plot_transition_matrix(ax, df_probs, title=""): def plot_transition_matrix(ax, df_probs, title=""):
hm = sns.heatmap( hm = sns.heatmap(
df_probs.loc[list(LETTERS), list(LETTERS)], cmap="Reds", annot=False, cbar=False, ax=ax df_probs.loc[list(LETTERS), list(LETTERS)],
cmap="Reds",
annot=False,
cbar=False,
ax=ax,
) )
ax.set_title(title, fontsize=12) ax.set_title(title, fontsize=12)
return hm return hm
@@ -31,8 +35,12 @@ def plot_letter_frequencies(males, females, sort_values=False, title=None):
x = np.arange(len(df_plot)) x = np.arange(len(df_plot))
w = 0.4 w = 0.4
fig, ax = plt.subplots(figsize=(16, 6)) fig, ax = plt.subplots(figsize=(16, 6))
ax.bar(x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8) ax.bar(
ax.bar(x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8) x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8
)
ax.bar(
x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8
)
ax.set_xticks(x) ax.set_xticks(x)
ax.set_xticklabels(df_plot["letter"]) ax.set_xticklabels(df_plot["letter"])
@@ -5,8 +5,6 @@ import numpy as np
import pandas as pd import pandas as pd
from scipy.spatial.distance import euclidean from scipy.spatial.distance import euclidean
from scipy.stats import entropy from scipy.stats import entropy
from scipy.spatial.distance import euclidean
from scipy.stats import entropy
from typing import Dict, Any from typing import Dict, Any
LETTERS = "abcdefghijklmnopqrstuvwxyz" LETTERS = "abcdefghijklmnopqrstuvwxyz"
@@ -49,7 +47,12 @@ def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFram
def build_letter_frequencies(series: pd.Series) -> pd.DataFrame: def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
# Normalize: lowercase, remove non-letters, concatenate all into one string # Normalize: lowercase, remove non-letters, concatenate all into one string
s = series.astype(str).str.lower().str.replace(r"[^a-z]", "", regex=True).str.cat(sep="") s = (
series.astype(str)
.str.lower()
.str.replace(r"[^a-z]", "", regex=True)
.str.cat(sep="")
)
# Convert string into Series of characters # Convert string into Series of characters
chars = pd.Series(list(s)) chars = pd.Series(list(s))
@@ -150,8 +153,12 @@ def build_transition_comparisons(
kl_names_mf = entropy(prepared_names["m"] + 1e-12, prepared_names["f"] + 1e-12) kl_names_mf = entropy(prepared_names["m"] + 1e-12, prepared_names["f"] + 1e-12)
kl_names_fm = entropy(prepared_names["f"] + 1e-12, prepared_names["m"] + 1e-12) kl_names_fm = entropy(prepared_names["f"] + 1e-12, prepared_names["m"] + 1e-12)
kl_surnames_mf = entropy(prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12) kl_surnames_mf = entropy(
kl_surnames_fm = entropy(prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12) prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12
)
kl_surnames_fm = entropy(
prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12
)
jsd_names = 0.5 * (kl_names_mf + kl_names_fm) jsd_names = 0.5 * (kl_names_mf + kl_names_fm)
jsd_surnames = 0.5 * (kl_surnames_mf + kl_surnames_fm) jsd_surnames = 0.5 * (kl_surnames_mf + kl_surnames_fm)
@@ -163,7 +170,9 @@ def build_transition_comparisons(
P_f = transitions["f"]["probs"].flatten() P_f = transitions["f"]["probs"].flatten()
# Calculate the observed JSD (our test statistic) # Calculate the observed JSD (our test statistic)
observed_jsd = 0.5 * (entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12)) observed_jsd = 0.5 * (
entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12)
)
# Concatenate male and female counts # Concatenate male and female counts
counts_m = transitions["m"]["counts"] counts_m = transitions["m"]["counts"]
@@ -194,10 +203,12 @@ def build_transition_comparisons(
permuted_jsd = 0.5 * ( permuted_jsd = 0.5 * (
entropy( entropy(
permuted_probs_m.mean(axis=1) + 1e-12, permuted_probs_f.mean(axis=1) + 1e-12 permuted_probs_m.mean(axis=1) + 1e-12,
permuted_probs_f.mean(axis=1) + 1e-12,
) )
+ entropy( + entropy(
permuted_probs_f.mean(axis=1) + 1e-12, permuted_probs_m.mean(axis=1) + 1e-12 permuted_probs_f.mean(axis=1) + 1e-12,
permuted_probs_m.mean(axis=1) + 1e-12,
) )
) )
permuted_jsds.append(permuted_jsd) permuted_jsds.append(permuted_jsd)
@@ -8,8 +8,8 @@ from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import learning_curve from sklearn.model_selection import learning_curve
from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import LabelEncoder
from research.base_model import BaseModel from ners.research.base_model import BaseModel
from research.experiment.feature_extractor import FeatureExtractor from ners.research.experiment.feature_extractor import FeatureExtractor
class TraditionalModel(BaseModel): class TraditionalModel(BaseModel):
@@ -52,7 +52,9 @@ class TraditionalModel(BaseModel):
# Train model # Train model
if len(X_prepared.shape) == 1: if len(X_prepared.shape) == 1:
# For text-based features (like LogisticRegression with vectorization) # For text-based features (like LogisticRegression with vectorization)
logging.info(f"Fitting model with {X_prepared.shape[0]} samples (text features)") logging.info(
f"Fitting model with {X_prepared.shape[0]} samples (text features)"
)
else: else:
# For numerical features # For numerical features
logging.info( logging.info(
@@ -74,12 +76,16 @@ class TraditionalModel(BaseModel):
return self return self
def cross_validate(self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> Dict[str, float]: def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> Dict[str, float]:
features_df = self.feature_extractor.extract_features(X) features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df) X_prepared = self.prepare_features(features_df)
y_encoded = self.label_encoder.transform(y) y_encoded = self.label_encoder.transform(y)
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed) cv = StratifiedKFold(
n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
)
# Calculate different metrics # Calculate different metrics
results = {} results = {}
@@ -95,7 +101,11 @@ class TraditionalModel(BaseModel):
for metric in ["precision", "recall", "f1"]: for metric in ["precision", "recall", "f1"]:
if metric in self.config.metrics: if metric in self.config.metrics:
scores = cross_val_score( scores = cross_val_score(
self.model, X_prepared, y_encoded, cv=cv, scoring=f"{metric}_weighted" self.model,
X_prepared,
y_encoded,
cv=cv,
scoring=f"{metric}_weighted",
) )
results[metric] = scores.mean() results[metric] = scores.mean()
results[f"{metric}_std"] = scores.std() results[f"{metric}_std"] = scores.std()
+46
View File
@@ -0,0 +1,46 @@
#!.venv/bin/python3
import logging
import traceback
from ners.core.config import setup_config
from ners.research.experiment.experiment_builder import ExperimentBuilder
from ners.research.model_trainer import ModelTrainer
def train_from_template(
    name: str,
    type: str,
    *,
    templates: str = "research_templates.yaml",
    config: str | None = None,
    env: str = "development",
) -> int:
    """Train a single model from a named research template.

    Looks up the template identified by *name* and *type* in the
    *templates* YAML file, then hands the resolved configuration to
    :class:`ModelTrainer`.

    Returns a process-style exit code: 0 on success, 1 on any failure
    (the error is logged and the traceback printed to stderr).
    """
    try:
        pipeline_cfg = setup_config(config_path=config, env=env)
        builder = ExperimentBuilder(pipeline_cfg)

        logging.info(f"Loading research templates from: {templates}")
        loaded = builder.load_templates(templates)

        logging.info(f"Looking for experiment: name='{name}', type='{type}'")
        experiment_config = builder.find_template(loaded, name, type)

        logging.info(f"Found experiment: {experiment_config.get('name')}")
        logging.info(f"Description: {experiment_config.get('description')}")
        logging.info(f"Features: {experiment_config.get('features')}")

        # Missing optional keys fall back to empty containers so the
        # trainer always receives well-formed arguments.
        ModelTrainer(pipeline_cfg).train_single_model(
            model_name=experiment_config.get("name"),
            model_type=experiment_config.get("model_type"),
            features=experiment_config.get("features"),
            model_params=experiment_config.get("model_params", {}),
            tags=experiment_config.get("tags", []),
        )
        logging.info("Training completed successfully!")
        return 0
    except Exception as e:
        logging.error(f"Training failed: {e}")
        traceback.print_exc()
        return 1
+12 -28
View File
@@ -1,19 +1,13 @@
#!.venv/bin/python3 #!.venv/bin/python3
import argparse import os
import sys
from pathlib import Path
import streamlit as st import streamlit as st
# Add parent directory to Python path to access core modules from ners.core.config import setup_config, PipelineConfig
parent_dir = Path(__file__).parent.parent from ners.core.utils.data_loader import DataLoader
sys.path.insert(0, str(parent_dir)) from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
from ners.research.experiment.experiment_runner import ExperimentRunner
from core.config import setup_config, PipelineConfig from ners.research.experiment.experiment_tracker import ExperimentTracker
from core.utils.data_loader import DataLoader
from processing.monitoring.pipeline_monitor import PipelineMonitor
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
# Page configuration # Page configuration
st.set_page_config( st.set_page_config(
@@ -65,19 +59,9 @@ class StreamlitApp:
) )
def main(): # Initialize app using environment variables when launched via Typer
parser = argparse.ArgumentParser( _config_path = os.environ.get("NERS_CONFIG")
description="DRC NERS Platform", _env = os.environ.get("NERS_ENV", "development")
formatter_class=argparse.RawDescriptionHelpFormatter, _cfg = setup_config(_config_path, env=_env)
) _app = StreamlitApp(_cfg)
parser.add_argument("--config", type=str, help="Path to configuration file") _app.run()
parser.add_argument("--env", type=str, default="development", help="Environment name")
args = parser.parse_args()
config = setup_config(args.config, env=args.env)
app = StreamlitApp(config)
app.run()
if __name__ == "__main__":
main()
@@ -1,7 +1,7 @@
import pandas as pd import pandas as pd
import streamlit as st import streamlit as st
from core.utils.data_loader import OPTIMIZED_DTYPES from ners.core.utils.data_loader import OPTIMIZED_DTYPES
@st.cache_data @st.cache_data
@@ -25,7 +25,9 @@ class Dashboard:
# Load basic statistics # Load basic statistics
try: try:
data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"]) data_path = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
if data_path.exists(): if data_path.exists():
df = load_dataset(str(data_path)) df = load_dataset(str(data_path))
@@ -37,13 +39,17 @@ class Dashboard:
st.metric("Annotated Names", f"{annotated:,}") st.metric("Annotated Names", f"{annotated:,}")
with col3: with col3:
provinces = df["province"].nunique() if "province" in df.columns else 0 provinces = (
df["province"].nunique() if "province" in df.columns else 0
)
st.metric("Provinces", provinces) st.metric("Provinces", provinces)
with col4: with col4:
if "sex" in df.columns: if "sex" in df.columns:
gender_dist = df["sex"].value_counts() gender_dist = df["sex"].value_counts()
ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1) ratio = gender_dist.get("f", 0) / max(
gender_dist.get("m", 1), 1
)
st.metric("F/M Rate", f"{ratio:.2%}") st.metric("F/M Rate", f"{ratio:.2%}")
with col5: with col5:
if "annotated" in df.columns: if "annotated" in df.columns:
@@ -79,4 +85,6 @@ class Dashboard:
st.dataframe(pd.DataFrame(exp_data), use_container_width=True) st.dataframe(pd.DataFrame(exp_data), use_container_width=True)
else: else:
st.info("No experiments found. Create your first experiment in the Experiments tab!") st.info(
"No experiments found. Create your first experiment in the Experiments tab!"
)
+52
View File
@@ -0,0 +1,52 @@
from datetime import datetime
import pandas as pd
import streamlit as st
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
    """Read *file_path* as CSV using the project's optimized dtypes.

    The result is cached by Streamlit. On any read failure the error is
    surfaced in the UI and an empty DataFrame is returned instead of
    raising.
    """
    try:
        frame = pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame()
    return frame
class DataOverview:
    """Streamlit page listing the pipeline's data files and their status."""

    def __init__(self, config):
        # Pipeline configuration providing data-file names and path resolution.
        self.config = config

    def _describe_file(self, file_path) -> str:
        """Return a human-readable size/mtime summary, or "Not found"."""
        if not file_path.exists():
            return "Not found"
        info = file_path.stat()
        modified = datetime.fromtimestamp(info.st_mtime)
        return f"Size: {info.st_size / (1024 * 1024):.1f} MB, Last Modified: {modified}"

    def index(self):
        """Render the page: a per-file status list plus a featured-data preview."""
        st.title("Data Overview")

        output_files = self.config.data.output_files
        data_files = {
            "Names": self.config.data.input_file,
            "Featured Dataset": output_files["featured"],
            "Evaluation Dataset": output_files["evaluation"],
            "Male Names": output_files["males"],
            "Female Names": output_files["females"],
        }

        st.write("Available Data Files:")
        for label, rel_path in data_files.items():
            resolved = self.config.paths.get_data_path(rel_path)
            st.write(f"- {label}: {resolved} ({self._describe_file(resolved)})")

        # Preview the featured dataset only once it has been generated.
        featured_path = self.config.paths.get_data_path(output_files["featured"])
        if featured_path.exists():
            frame = load_dataset(str(featured_path))
            st.subheader("Featured Dataset Preview")
            st.dataframe(frame.head(), use_container_width=True)
            st.write(f"Rows: {len(frame):,}")
@@ -2,8 +2,8 @@ import pandas as pd
import plotly.express as px import plotly.express as px
import streamlit as st import streamlit as st
from core.utils.data_loader import OPTIMIZED_DTYPES from ners.core.utils.data_loader import OPTIMIZED_DTYPES
from web.interfaces.log_reader import LogReader from ners.web.interfaces.log_reader import LogReader
@st.cache_data @st.cache_data
@@ -31,7 +31,9 @@ class DataProcessing:
# Step details # Step details
for step_name, step_status in status["steps"].items(): for step_name, step_status in status["steps"].items():
with st.expander(f"{step_name.replace('_', ' ').title()} - {step_status['status']}"): with st.expander(
f"{step_name.replace('_', ' ').title()} - {step_status['status']}"
):
col1, col2, col3 = st.columns(3) col1, col2, col3 = st.columns(3)
with col1: with col1:
@@ -63,14 +65,20 @@ class DataProcessing:
with col2: with col2:
num_entries = st.number_input( num_entries = st.number_input(
"Number of entries", min_value=5, max_value=50, value=10, key="num_log_entries" "Number of entries",
min_value=5,
max_value=50,
value=10,
key="num_log_entries",
) )
# Get log entries based on filter # Get log entries based on filter
if log_level_filter == "All": if log_level_filter == "All":
log_entries = log_reader.read_last_entries(num_entries) log_entries = log_reader.read_last_entries(num_entries)
else: else:
log_entries = log_reader.read_entries_by_level(log_level_filter, num_entries) log_entries = log_reader.read_entries_by_level(
log_level_filter, num_entries
)
if log_entries: if log_entries:
for entry in log_entries: for entry in log_entries:
@@ -2,13 +2,13 @@ from typing import List, Dict
import streamlit as st import streamlit as st
from core.config.pipeline_config import PipelineConfig from ners.core.config.pipeline_config import PipelineConfig
from research.experiment import ExperimentConfig, ExperimentStatus from ners.research.experiment import ExperimentConfig, ExperimentStatus
from research.experiment.experiment_builder import ExperimentBuilder from ners.research.experiment.experiment_builder import ExperimentBuilder
from research.experiment.experiment_runner import ExperimentRunner from ners.research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker from ners.research.experiment.experiment_tracker import ExperimentTracker
from research.experiment.feature_extractor import FeatureType from ners.research.experiment.feature_extractor import FeatureType
from research.model_registry import list_available_models from ners.research.model_registry import list_available_models
class Experiments: class Experiments:
@@ -46,13 +46,19 @@ class Experiments:
available_experiments = self.experiment_builder.get_templates() available_experiments = self.experiment_builder.get_templates()
# Create tabs for different experiment types # Create tabs for different experiment types
exp_tabs = st.tabs(["Baseline", "Advanced", "Feature Studies", "Hyperparameter Tuning"]) exp_tabs = st.tabs(
["Baseline", "Advanced", "Feature Studies", "Hyperparameter Tuning"]
)
with exp_tabs[0]: with exp_tabs[0]:
self._show_experiments_by_type(available_experiments["baseline"], "baseline") self._show_experiments_by_type(
available_experiments["baseline"], "baseline"
)
with exp_tabs[1]: with exp_tabs[1]:
self._show_experiments_by_type(available_experiments["advanced"], "advanced") self._show_experiments_by_type(
available_experiments["advanced"], "advanced"
)
with exp_tabs[2]: with exp_tabs[2]:
self._show_experiments_by_type( self._show_experiments_by_type(
@@ -60,7 +66,9 @@ class Experiments:
) )
with exp_tabs[3]: with exp_tabs[3]:
self._show_experiments_by_type(available_experiments["tuning"], "tuning") self._show_experiments_by_type(
available_experiments["tuning"], "tuning"
)
except Exception as e: except Exception as e:
st.error(f"Error loading experiment templates: {e}") st.error(f"Error loading experiment templates: {e}")
@@ -79,7 +87,9 @@ class Experiments:
# Show available experiments # Show available experiments
for i, exp_template in enumerate(experiments): for i, exp_template in enumerate(experiments):
exp_name = exp_template.get("name", f"Experiment {i + 1}") exp_name = exp_template.get("name", f"Experiment {i + 1}")
exp_description = exp_template.get("description", "No description available") exp_description = exp_template.get(
"description", "No description available"
)
with st.expander(f"📊 {exp_name} - {exp_description}"): with st.expander(f"📊 {exp_name} - {exp_description}"):
col1, col2 = st.columns([2, 1]) col1, col2 = st.columns([2, 1])
@@ -88,7 +98,7 @@ class Experiments:
st.json(exp_template) st.json(exp_template)
with col2: with col2:
if st.button(f"🚀 Run Experiment", key=f"run_{experiment_type}_{i}"): if st.button("🚀 Run Experiment", key=f"run_{experiment_type}_{i}"):
self._run_template_experiment(exp_template) self._run_template_experiment(exp_template)
def _run_template_experiment(self, exp_template: Dict): def _run_template_experiment(self, exp_template: Dict):
@@ -100,7 +110,9 @@ class Experiments:
# Run the experiment # Run the experiment
experiment_id = self.experiment_runner.run_experiment(experiment_config) experiment_id = self.experiment_runner.run_experiment(experiment_config)
st.success(f"Experiment '{experiment_config.name}' completed successfully!") st.success(
f"Experiment '{experiment_config.name}' completed successfully!"
)
st.info(f"Experiment ID: `{experiment_id}`") st.info(f"Experiment ID: `{experiment_id}`")
# Show results # Show results
@@ -130,13 +142,17 @@ class Experiments:
) )
with col2: with col2:
model_filter = st.selectbox("Filter by Model", ["All"] + list_available_models()) model_filter = st.selectbox(
"Filter by Model", ["All"] + list_available_models()
)
with col3: with col3:
tag_filter = st.text_input("Filter by Tags (comma-separated)") tag_filter = st.text_input("Filter by Tags (comma-separated)")
# Get and filter experiments # Get and filter experiments
experiments = self._get_filtered_experiments(status_filter, model_filter, tag_filter) experiments = self._get_filtered_experiments(
status_filter, model_filter, tag_filter
)
if not experiments: if not experiments:
st.info("No experiments found matching the filters.") st.info("No experiments found matching the filters.")
@@ -149,20 +165,28 @@ class Experiments:
): ):
self._display_experiment_details(exp, i) self._display_experiment_details(exp, i)
def _get_filtered_experiments(self, status_filter: str, model_filter: str, tag_filter: str): def _get_filtered_experiments(
self, status_filter: str, model_filter: str, tag_filter: str
):
"""Get experiments with applied filters""" """Get experiments with applied filters"""
experiments = self.experiment_tracker.list_experiments() experiments = self.experiment_tracker.list_experiments()
# Apply filters # Apply filters
if status_filter != "All": if status_filter != "All":
experiments = [e for e in experiments if e.status == ExperimentStatus(status_filter)] experiments = [
e for e in experiments if e.status == ExperimentStatus(status_filter)
]
if model_filter != "All": if model_filter != "All":
experiments = [e for e in experiments if e.config.model_type == model_filter] experiments = [
e for e in experiments if e.config.model_type == model_filter
]
if tag_filter: if tag_filter:
tags = [tag.strip() for tag in tag_filter.split(",")] tags = [tag.strip() for tag in tag_filter.split(",")]
experiments = [e for e in experiments if any(tag in e.config.tags for tag in tags)] experiments = [
e for e in experiments if any(tag in e.config.tags for tag in tags)
]
return experiments return experiments
@@ -173,7 +197,9 @@ class Experiments:
with col1: with col1:
st.write(f"**Model:** {exp.config.model_type}") st.write(f"**Model:** {exp.config.model_type}")
st.write(f"**Features:** {', '.join([f.value for f in exp.config.features])}") st.write(
f"**Features:** {', '.join([f.value for f in exp.config.features])}"
)
st.write(f"**Tags:** {', '.join(exp.config.tags)}") st.write(f"**Tags:** {', '.join(exp.config.tags)}")
with col2: with col2:
@@ -185,7 +211,7 @@ class Experiments:
st.write(f"**Train Size:** {exp.train_size:,}") st.write(f"**Train Size:** {exp.train_size:,}")
st.write(f"**Test Size:** {exp.test_size:,}") st.write(f"**Test Size:** {exp.test_size:,}")
if st.button(f"View Details", key=f"details_{index}"): if st.button("View Details", key=f"details_{index}"):
st.session_state.selected_experiment = exp.experiment_id st.session_state.selected_experiment = exp.experiment_id
st.rerun() st.rerun()
@@ -198,7 +224,9 @@ class Experiments:
st.write("Run multiple experiments with different parameter combinations.") st.write("Run multiple experiments with different parameter combinations.")
# Add option to run template batch experiments # Add option to run template batch experiments
batch_type = st.radio("Batch Type", ["Template Batch", "Custom Parameter Sweep"]) batch_type = st.radio(
"Batch Type", ["Template Batch", "Custom Parameter Sweep"]
)
if batch_type == "Template Batch": if batch_type == "Template Batch":
self._show_template_batch_experiments() self._show_template_batch_experiments()
@@ -227,10 +255,13 @@ class Experiments:
if experiments: if experiments:
st.write(f"**{exp_type.title()} Experiments:**") st.write(f"**{exp_type.title()} Experiments:**")
exp_names = [ exp_names = [
exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments) exp.get("name", f"Exp {i}")
for i, exp in enumerate(experiments)
] ]
selected_names = st.multiselect( selected_names = st.multiselect(
f"Select {exp_type} experiments", exp_names, key=f"select_{exp_type}" f"Select {exp_type} experiments",
exp_names,
key=f"select_{exp_type}",
) )
for name in selected_names: for name in selected_names:
@@ -258,13 +289,17 @@ class Experiments:
experiment_configs.append(config) experiment_configs.append(config)
# Run batch experiments # Run batch experiments
experiment_ids = self.experiment_runner.run_experiment_batch(experiment_configs) experiment_ids = self.experiment_runner.run_experiment_batch(
experiment_configs
)
st.success(f"Completed {len(experiment_ids)} template experiments!") st.success(f"Completed {len(experiment_ids)} template experiments!")
# Show summary # Show summary
if experiment_ids: if experiment_ids:
comparison = self.experiment_runner.compare_experiments(experiment_ids) comparison = self.experiment_runner.compare_experiments(
experiment_ids
)
st.write("**Template Batch Results:**") st.write("**Template Batch Results:**")
st.dataframe( st.dataframe(
comparison[["name", "model_type", "test_accuracy"]], comparison[["name", "model_type", "test_accuracy"]],
@@ -285,7 +320,9 @@ class Experiments:
with col1: with col1:
base_name = st.text_input("Base Experiment Name", "parameter_sweep") base_name = st.text_input("Base Experiment Name", "parameter_sweep")
model_types = st.multiselect( model_types = st.multiselect(
"Model Types", list_available_models(), default=["logistic_regression"] "Model Types",
list_available_models(),
default=["logistic_regression"],
) )
# N-gram ranges for logistic regression # N-gram ranges for logistic regression
@@ -301,13 +338,20 @@ class Experiments:
default=["full_name", "native_name", "surname"], default=["full_name", "native_name", "surname"],
) )
test_sizes = st.text_input("Test Sizes (comma-separated)", "0.15,0.2,0.25") test_sizes = st.text_input(
"Test Sizes (comma-separated)", "0.15,0.2,0.25"
)
tags = st.text_input("Common Tags", "parameter_sweep,batch") tags = st.text_input("Common Tags", "parameter_sweep,batch")
if st.form_submit_button("🚀 Run Parameter Sweep"): if st.form_submit_button("🚀 Run Parameter Sweep"):
self.run_batch_experiments( self.run_batch_experiments(
base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags base_name,
model_types,
ngram_ranges,
feature_combinations,
test_sizes,
tags,
) )
def run_batch_experiments( def run_batch_experiments(
@@ -369,13 +413,17 @@ class Experiments:
exp_count += 1 exp_count += 1
# Run experiments # Run experiments
experiment_ids = self.experiment_runner.run_experiment_batch(experiments) experiment_ids = self.experiment_runner.run_experiment_batch(
experiments
)
st.success(f"Completed {len(experiment_ids)} batch experiments") st.success(f"Completed {len(experiment_ids)} batch experiments")
# Show summary # Show summary
if experiment_ids: if experiment_ids:
comparison = self.experiment_runner.compare_experiments(experiment_ids) comparison = self.experiment_runner.compare_experiments(
experiment_ids
)
st.write("**Batch Results Summary:**") st.write("**Batch Results Summary:**")
st.dataframe( st.dataframe(
comparison[["name", "model_type", "test_accuracy"]], comparison[["name", "model_type", "test_accuracy"]],
+80
View File
@@ -0,0 +1,80 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List
@dataclass
class LogEntry:
    """One parsed line from the pipeline log file."""

    # Parsed from the bracketed ISO-format prefix of the log line.
    timestamp: datetime
    # Level token as it appears between the first two " - " separators.
    level: str
    # Remainder of the line after the level (may itself contain " - ").
    message: str
class LogReader:
    """Reads a text log file and parses lines into :class:`LogEntry` records.

    Expected line format (see logging config): ``[timestamp] - LEVEL - message``.
    A missing log file is treated as empty rather than an error.
    """

    def __init__(self, log_file_path: Path):
        # Normalize so str inputs work transparently.
        self.log_file_path = Path(log_file_path)

    def read_last_entries(self, num_entries: int = 20) -> List[LogEntry]:
        """Return parsed entries from the last *num_entries* lines.

        Lines that fail to parse are skipped, so the result may hold
        fewer than *num_entries* items.
        """
        if not self.log_file_path.exists():
            return []
        with open(self.log_file_path, "r") as fh:
            tail = fh.readlines()[-num_entries:]
        parsed = (self._parse_log_line(raw) for raw in tail)
        return [rec for rec in parsed if rec]

    def read_entries_by_level(
        self, level: str, num_entries: int = 20
    ) -> List[LogEntry]:
        """Return up to *num_entries* newest entries whose level equals *level*,
        in chronological order."""
        if not self.log_file_path.exists():
            return []
        matched: List[LogEntry] = []
        with open(self.log_file_path, "r") as fh:
            # Walk newest-first so we can stop as soon as enough lines match.
            for raw in reversed(fh.readlines()):
                rec = self._parse_log_line(raw)
                if rec is None or rec.level != level:
                    continue
                matched.append(rec)
                if len(matched) >= num_entries:
                    break
        matched.reverse()
        return matched

    def get_log_stats(self) -> dict:
        """Return ``{"total_lines": N, <level>: count, ...}``; {} if no file."""
        if not self.log_file_path.exists():
            return {}
        counts: dict = {"total_lines": 0}
        with open(self.log_file_path, "r") as fh:
            for raw in fh:
                counts["total_lines"] += 1
                rec = self._parse_log_line(raw)
                if rec is not None:
                    counts[rec.level] = counts.get(rec.level, 0) + 1
        return counts

    @staticmethod
    def _parse_log_line(line: str) -> LogEntry | None:
        """Parse one ``[timestamp] - LEVEL - message`` line; None on mismatch."""
        pieces = line.strip().split(" - ")
        if len(pieces) < 3:
            return None
        try:
            when = datetime.fromisoformat(pieces[0].strip("[]"))
        except Exception:
            # Malformed/absent timestamp: not a structured log line.
            return None
        # The message may legitimately contain " - ", so rejoin the tail.
        return LogEntry(when, pieces[1].strip(), " - ".join(pieces[2:]))
@@ -1,10 +1,8 @@
from pathlib import Path
import streamlit as st import streamlit as st
from spacy import displacy from spacy import displacy
from core.config import PipelineConfig from ners.core.config import PipelineConfig
from processing.ner.name_model import NameModel from ners.processing.ner.name_model import NameModel
class NERTesting: class NERTesting:
@@ -56,12 +54,15 @@ class NERTesting:
with col1: with col1:
st.metric( st.metric(
"Training Examples", f"{self.training_stats.get('training_examples', 0):,}" "Training Examples",
f"{self.training_stats.get('training_examples', 0):,}",
) )
with col2: with col2:
st.metric("Epochs", self.training_stats.get("epochs", 0)) st.metric("Epochs", self.training_stats.get("epochs", 0))
with col3: with col3:
st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}") st.metric(
"Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}"
)
with col4: with col4:
st.metric("Batch Size", f"{self.training_stats.get('batch_size', 0):,}") st.metric("Batch Size", f"{self.training_stats.get('batch_size', 0):,}")
@@ -99,9 +100,11 @@ class NERTesting:
if names_input.strip(): if names_input.strip():
if st.button("Analyze All Names", type="primary"): if st.button("Analyze All Names", type="primary"):
names = [name.strip() for name in names_input.split("\n") if name.strip()] names = [
name.strip() for name in names_input.split("\n") if name.strip()
]
for i, name in enumerate(names): for i, name in enumerate(names):
st.markdown(f"**Name {i+1}: {name}**") st.markdown(f"**Name {i + 1}: {name}**")
self.analyze_and_display(name) self.analyze_and_display(name)
if i < len(names) - 1: if i < len(names) - 1:
st.markdown("---") st.markdown("---")
@@ -127,7 +130,9 @@ class NERTesting:
else: else:
st.warning("No entities detected in the input text.") st.warning("No entities detected in the input text.")
st.info("Try using traditional Congolese names or ensure the spelling is correct.") st.info(
"Try using traditional Congolese names or ensure the spelling is correct."
)
except Exception as e: except Exception as e:
st.error(f"Error analyzing text: {e}") st.error(f"Error analyzing text: {e}")
@@ -139,14 +144,21 @@ class NERTesting:
ents = [] ents = []
for entity in entities: for entity in entities:
ents.append( ents.append(
{"start": entity["start"], "end": entity["end"], "label": entity["label"]} {
"start": entity["start"],
"end": entity["end"],
"label": entity["label"],
}
) )
# Create doc-like structure for displacy # Create doc-like structure for displacy
doc_data = {"text": text, "ents": ents, "title": None} doc_data = {"text": text, "ents": ents, "title": None}
# Custom colors for our labels # Custom colors for our labels
colors = {"NATIVE": "#74C0FC", "SURNAME": "#69DB7C"} # Light blue # Light green colors = {
"NATIVE": "#74C0FC",
"SURNAME": "#69DB7C",
} # Light blue # Light green
options = {"colors": colors, "distance": 90} options = {"colors": colors, "distance": 90}

Some files were not shown because too many files have changed in this diff Show More