refactoring: uv

This commit is contained in:
2025-10-05 18:14:15 +02:00
parent f3b06fbd07
commit 9dd4f759b3
120 changed files with 5525 additions and 3366 deletions
+16
View File
@@ -0,0 +1,16 @@
.git
.gitignore
.idea
.vscode
__pycache__
.ruff_cache
.venv
*.pyc
*.pyo
*.pyd
*.swp
*.swo
*.DS_Store
dist
build
*.egg-info
+1
View File
@@ -0,0 +1 @@
3.11
+49
View File
@@ -0,0 +1,49 @@
# syntax=docker/dockerfile:1
# Minimal Linux base (glibc); Python itself is installed by uv.
FROM debian:bookworm-slim

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    UV_INSTALL_DIR=/usr/local/bin \
    UV_LINK_MODE=copy \
    UV_PYTHON_DOWNLOADS=1 \
    UV_PROJECT_ENVIRONMENT=/app/.venv \
    PATH=/app/.venv/bin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

WORKDIR /app

# System deps for building/using common scientific stack
# Keep minimal; rely on wheels where possible
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates curl git \
    build-essential pkg-config \
    libssl-dev libffi-dev \
    libopenblas0 libstdc++6 \
    libfreetype6 libpng16-16 libjpeg62-turbo \
    && rm -rf /var/lib/apt/lists/*

# Install uv (static binary)
RUN curl -LsSf https://astral.sh/uv/install.sh | sh

# Copy project metadata first for layer caching.
# FIX: also copy the lockfile when it exists. Previously only
# pyproject.toml/README.md were copied, so the `[ -f uv.lock ]` branch below
# could never trigger and builds were silently non-reproducible. The glob
# (`uv.lock*`) keeps the build working when no lockfile is committed.
COPY pyproject.toml uv.lock* README.md ./

# Install a managed Python via uv and create the project venv
RUN uv python install 3.11 \
    && uv venv /app/.venv --python 3.11

# Resolve and install runtime deps into project venv
# Use lockfile if present for reproducibility
RUN if [ -f uv.lock ]; then uv sync --no-dev --no-install-project --frozen; else uv sync --no-dev --no-install-project; fi

# Copy source code and optional templates
COPY src ./src

# Re-sync to ensure the local package is installed
RUN uv sync --no-dev \
    && rm -rf /root/.cache

# Default command shows help; override in compose or docker run
CMD ["ners", "--help"]
+83 -137
View File
@@ -10,37 +10,23 @@ million names from the Democratic Republic of Congo (DRC) annotated with gender
### Installation & Setup
Instructions and command-line snippets below are provided to help you set up the project environment quickly and
efficiently, assuming you have Python 3.11 and Git installed and working on a Unix-like system (Linux, macOS, etc.).
**Using Makefile (Recommended)**
**Unix based**
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
git clone https://github.com/bernard-ng/drc-ners-nlp.git
cd drc-ners-nlp
# Setup environment
make setup
make activate
uv sync
```
**Manual Setup**
**macOS & Windows**
```bash
git clone https://github.com/bernard-ng/drc-ners-nlp.git
cd drc-ners-nlp
# Setup environment
python -m venv .venv
.venv/bin/pip install --upgrade pip
.venv/bin/pip install -r requirements.txt
pip install --upgrade pip
pip install -r requirements.txt
pip install jupyter notebook ipykernel pytest black flake8 mypy
source .venv/bin/activate
docker compose build
docker compose run --rm app
docker compose run --rm app ners pipeline run --env=production
docker compose run --rm app ners research train --name=lightgbm --type=baseline --env=production
docker compose run --rm --service-ports app ners web run --env=production
```
## Data Processing
@@ -55,6 +41,7 @@ the `drc-ners-nlp/config/pipeline.yaml` file.
```yaml
stages:
- "data_cleaning"
- "data_selection"
- "feature_extraction"
- "data_splitting"
```
@@ -62,37 +49,7 @@ stages:
**Running the Pipeline**
```bash
python main.py --env production
```
## NER Processing (Optional)
This project implements a custom named entity recognition (NER) pipeline tailored for Congolese names.
Its main objective is to accurately identify and tag the different components of a Congolese name,
specifically distinguishing between the native part and the surname.
```bash
python ner.py --env production
```
Once you've built and trained the NER model, you can use it to annotate **COMPOSE** names in the original dataset
**Running the Pipeline with NER Annotation**
```yaml
stages:
- "data_cleaning"
- "feature_extraction"
- "ner_annotation"
- "data_splitting"
```
**Running the Pipeline with LLM Annotation**
```yaml
stages:
- "data_cleaning"
- "feature_extraction"
- "llm_annotation"
- "data_splitting"
uv run ners pipeline run --env="production"
```
## Experiments
@@ -105,54 +62,94 @@ you can define model features, training parameters, and evaluation metrics in th
```bash
# bigru
python train.py --name="bigru" --type="baseline" --env="production"
python train.py --name="bigru_native" --type="baseline" --env="production"
python train.py --name="bigru_surname" --type="baseline" --env="production"
uv run ners research train --name="bigru" --type="baseline" --env="production"
uv run ners research train --name="bigru_native" --type="baseline" --env="production"
uv run ners research train --name="bigru_surname" --type="baseline" --env="production"
# cnn
python train.py --name="cnn" --type="baseline" --env="production"
python train.py --name="cnn_native" --type="baseline" --env="production"
python train.py --name="cnn_surname" --type="baseline" --env="production"
uv run ners research train --name="cnn" --type="baseline" --env="production"
uv run ners research train --name="cnn_native" --type="baseline" --env="production"
uv run ners research train --name="cnn_surname" --type="baseline" --env="production"
# lightgbm
python train.py --name="lightgbm" --type="baseline" --env="production"
python train.py --name="lightgbm_native" --type="baseline" --env="production"
python train.py --name="lightgbm_surname" --type="baseline" --env="production"
uv run ners research train --name="lightgbm" --type="baseline" --env="production"
uv run ners research train --name="lightgbm_native" --type="baseline" --env="production"
uv run ners research train --name="lightgbm_surname" --type="baseline" --env="production"
# logistic regression
python train.py --name="logistic_regression" --type="baseline" --env="production"
python train.py --name="logistic_regression_native" --type="baseline" --env="production"
python train.py --name="logistic_regression_surname" --type="baseline" --env="production"
uv run ners research train --name="logistic_regression" --type="baseline" --env="production"
uv run ners research train --name="logistic_regression_native" --type="baseline" --env="production"
uv run ners research train --name="logistic_regression_surname" --type="baseline" --env="production"
# lstm
python train.py --name="lstm" --type="baseline" --env="production"
python train.py --name="lstm_native" --type="baseline" --env="production"
python train.py --name="lstm_surname" --type="baseline" --env="production"
uv run ners research train --name="lstm" --type="baseline" --env="production"
uv run ners research train --name="lstm_native" --type="baseline" --env="production"
uv run ners research train --name="lstm_surname" --type="baseline" --env="production"
# random forest
python train.py --name="random_forest" --type="baseline" --env="production"
python train.py --name="random_forest_native" --type="baseline" --env="production"
python train.py --name="random_forest_surname" --type="baseline" --env="production"
uv run ners research train --name="random_forest" --type="baseline" --env="production"
uv run ners research train --name="random_forest_native" --type="baseline" --env="production"
uv run ners research train --name="random_forest_surname" --type="baseline" --env="production"
# svm
python train.py --name="svm" --type="baseline" --env="production"
python train.py --name="svm_native" --type="baseline" --env="production"
python train.py --name="svm_surname" --type="baseline" --env="production"
uv run ners research train --name="svm" --type="baseline" --env="production"
uv run ners research train --name="svm_native" --type="baseline" --env="production"
uv run ners research train --name="svm_surname" --type="baseline" --env="production"
# naive bayes
python train.py --name="naive_bayes" --type="baseline" --env="production"
python train.py --name="naive_bayes_native" --type="baseline" --env="production"
python train.py --name="naive_bayes_surname" --type="baseline" --env="production"
uv run ners research train --name="naive_bayes" --type="baseline" --env="production"
uv run ners research train --name="naive_bayes_native" --type="baseline" --env="production"
uv run ners research train --name="naive_bayes_surname" --type="baseline" --env="production"
# transformer
python train.py --name="transformer" --type="baseline" --env="production"
python train.py --name="transformer_native" --type="baseline" --env="production"
python train.py --name="transformer_surname" --type="baseline" --env="production"
uv run ners research train --name="transformer" --type="baseline" --env="production"
uv run ners research train --name="transformer_native" --type="baseline" --env="production"
uv run ners research train --name="transformer_surname" --type="baseline" --env="production"
# xgboost
python train.py --name="xgboost" --type="baseline" --env="production"
python train.py --name="xgboost_native" --type="baseline" --env="production"
python train.py --name="xgboost_surname" --type="baseline" --env="production"
uv run ners research train --name="xgboost" --type="baseline" --env="production"
uv run ners research train --name="xgboost_native" --type="baseline" --env="production"
uv run ners research train --name="xgboost_surname" --type="baseline" --env="production"
```
## TensorFlow on macOS (Intel) with uv
TensorFlow no longer publishes wheels for macOS Intel. To keep using uv and run TF reliably, use a Linux container with TF preinstalled and install project code with minimal extras inside the container.
### One-time build
```bash
docker compose -f docker/compose.tf.yml build
```

If you see a message like `tensorflow/tensorflow:<tag>: not found`, update `docker/Dockerfile.tf-cpu` to a tag that exists (e.g., `2.17.0`) and rebuild:

```bash
sed -n '1,20p' docker/Dockerfile.tf-cpu  # verify the FROM line
docker pull tensorflow/tensorflow:2.17.0 # quick availability check
docker compose -f docker/compose.tf.yml build
```
### Start a shell with uv and TF available
```bash
docker compose -f docker/compose.tf.yml run --rm tf bash
```
Inside the container:
```bash
# Install project in editable mode without pulling full deps
uv pip install -e . --no-deps
# Install only what research needs alongside TensorFlow
uv pip install typer pandas scikit-learn seaborn plotly
# Sanity check
uv run python -c "import tensorflow as tf; print(tf.__version__)"
# Run an experiment
uv run ners research train --name="lstm" --type="baseline" --env="production"
```
## Web Interface
@@ -163,60 +160,9 @@ experiments and make predictions without needing to understand the underlying co
### Running the Web Interface
```bash
streamlit run web/app.py
uv run ners web run --env="production"
```
## GPU Acceleration
This project can leverage GPUs for faster training when supported libraries and hardware are available.
- TensorFlow/Keras models (BiGRU, LSTM, CNN, Transformer)
- Uses GPU automatically if a TensorFlow GPU build is installed.
- The code enables safe GPU memory growth by default; optionally enable mixed precision for additional speed:
- Add `mixed_precision: true` in the experiment `model_params` (e.g., in `config/research_templates.yaml`).
- The final layer outputs are set to float32 for numerical stability under mixed precision.
- spaCy NER
- Automatically prefers GPU if available; otherwise falls back to CPU.
- Ensure a compatible CUDA-enabled spaCy/thinc stack is installed to use GPU.
- XGBoost
- Enable GPU by adding to the experiment `model_params`:
- `use_gpu: true` (sets `tree_method: gpu_hist` and `predictor: gpu_predictor`).
- LightGBM
- Enable GPU by adding to the experiment `model_params`:
- `use_gpu: true` (sets `device: gpu`). Optional: `gpu_platform_id`, `gpu_device_id`.
Example template snippet (GPU on):
```yaml
- name: "lstm_gpu"
description: "LSTM with GPU + mixed precision"
model_type: "lstm"
features: ["full_name"]
model_params:
embedding_dim: 128
lstm_units: 64
epochs: 5
batch_size: 128
use_gpu: true
mixed_precision: true
tags: ["gpu", "mixed_precision"]
- name: "xgboost_gpu"
description: "XGBoost with GPU"
model_type: "xgboost"
features: ["full_name"]
model_params:
n_estimators: 200
use_gpu: true
```
Notes:
- Install CUDA-enabled binaries for TensorFlow/spaCy/LightGBM/XGBoost to actually use GPU.
- If GPU is requested but not available, training will proceed on CPU with a warning.
## Contributors
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
+21
View File
@@ -0,0 +1,21 @@
services:
app:
build:
context: .
dockerfile: Dockerfile
image: drc-ners:uv
working_dir: /app
tty: true
stdin_open: true
environment:
NERS_ENV: production
STREAMLIT_SERVER_ADDRESS: 0.0.0.0
# expose Streamlit for `ners web run`
ports:
- "8501:8501"
volumes:
- ./assets:/app/assets
- ./config:/app/config
- ./data:/app/data
# default command shows CLI help; override per run
command: ["ners", "--help"]
-90
View File
@@ -1,90 +0,0 @@
#!.venv/bin/python3
import argparse
import sys
import traceback
from pathlib import Path
from core.config import setup_config
from processing.monitoring.pipeline_monitor import PipelineMonitor
def main():
    """Entry point for the pipeline monitoring CLI.

    Parses the ``clean``/``reset`` subcommands and dispatches to
    :class:`PipelineMonitor`. Returns a process exit code for ``sys.exit``:
    0 on success or cancellation, 1 on failure or when no command is given.
    """
    choices = [
        "data_cleaning",
        "data_selection",
        "feature_extraction",
        "ner_annotation",
        "llm_annotation",
        "data_splitting",
    ]
    parser = argparse.ArgumentParser(description="DRC NERS Processing Monitoring")
    parser.add_argument("--config", type=Path, help="Path to configuration file")
    parser.add_argument("--env", type=str, default="development", help="Environment")

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Clean command
    clean_parser = subparsers.add_parser("clean", help="Clean checkpoint files")
    clean_parser.add_argument("--step", type=str, choices=choices, help="default: all")
    clean_parser.add_argument("--keep-last", type=int, default=1, help="(default: 1)")
    clean_parser.add_argument("--force", action="store_true", help="Clean without confirmation")

    # Reset command
    reset_parser = subparsers.add_parser("reset", help="Reset pipeline step")
    reset_parser.add_argument("--step", type=str, choices=choices, help="(default: all)")
    reset_parser.add_argument("--all", action="store_true", help="Reset all steps")
    reset_parser.add_argument("--force", action="store_true", help="Reset without confirmation")

    args = parser.parse_args()

    try:
        setup_config(config_path=args.config, env=args.env)
        monitor = PipelineMonitor()

        if not args.command:
            parser.print_help()
            monitor.print_status(detailed=True)
            return 1
        elif args.command == "clean":
            checkpoint_info = monitor.count_checkpoint_files()
            print(f"Current checkpoint storage: {checkpoint_info['total_size_mb']:.1f} MB")
            if not args.force:
                response = input("Are you sure you want to clean checkpoints? (y/N): ")
                if response.lower() != "y":
                    print("Cancelled")
                    return 0
            if args.step:
                monitor.clean_step_checkpoints(args.step, args.keep_last)
            else:
                for step in monitor.steps:
                    monitor.clean_step_checkpoints(step, args.keep_last)
            print("Checkpoint cleaning completed")
        elif args.command == "reset":
            if not args.force:
                # FIX: --step may be omitted (reset all); the old prompt then
                # read "reset None". Name the actual target instead.
                response = input(
                    f"Are you sure you want to reset {args.step or 'all steps'}? "
                    "This will delete all checkpoints. (y/N): "
                )
                if response.lower() != "y":
                    print("Cancelled")
                    return 0
            if args.step:
                monitor.reset_step(args.step)
            else:
                for step in monitor.steps:
                    monitor.reset_step(step)
            # FIX: plain string — the old f-string had no placeholder.
            print("Reset completed")
        # FIX: explicit success code; previously fell through returning None.
        return 0
    except Exception as e:
        print(f"Monitoring failed: {e}")
        traceback.print_exc()
        return 1
if __name__ == "__main__":
sys.exit(main())
-499
View File
File diff suppressed because one or more lines are too long
+41
View File
@@ -0,0 +1,41 @@
[project]
name = "ners"
version = "0.1.0"
description = "NLP tooling for Congolese (DRC) names: processing pipeline, NER, research experiments and web UI"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"geopandas>=1.1.1",
"joblib>=1.5.2",
"lightgbm>=4.6.0",
"matplotlib>=3.10.6",
"numpy>=2.3.3",
"ollama>=0.6.0",
"pandas>=2.3.3",
"plotly>=6.3.1",
"psutil>=7.1.0",
"pydantic>=2.11.10",
"pyyaml>=6.0.3",
"scikit-learn>=1.7.2",
"seaborn>=0.13.2",
"spacy>=3.8.7",
"streamlit>=1.50.0",
"tqdm>=4.67.1",
"typer>=0.19.2",
"xgboost>=3.0.5",
]
[project.scripts]
ners = "ners.cli:app"
[build-system]
requires = ["uv_build>=0.8.12,<0.9.0"]
build-backend = "uv_build"
[dependency-groups]
dev = [
"ruff>=0.13.3",
]
[tool.uv]
required-environments = ["sys_platform == 'linux' and platform_machine == 'x86_64'"]
-170
View File
@@ -1,170 +0,0 @@
absl-py==2.3.0
altair==5.1.2
annotated-types==0.7.0
anyio==4.9.0
appnope==0.1.4
argon2-cffi==25.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==3.0.0
astunparse==1.6.3
async-lru==2.0.5
attrs==25.3.0
babel==2.17.0
beautifulsoup4==4.13.4
black==25.1.0
bleach==6.2.0
blinker==1.9.0
cachetools==6.1.0
certifi==2025.6.15
cffi==1.17.1
charset-normalizer==3.4.2
click==8.2.1
comm==0.2.2
contourpy==1.3.2
cycler==0.12.1
debugpy==1.8.14
decorator==5.2.1
defusedxml==0.7.1
executing==2.2.0
fastjsonschema==2.21.1
flake8==7.3.0
flatbuffers==25.2.10
fonttools==4.58.4
fqdn==1.5.1
gast==0.6.0
gitdb==4.0.12
GitPython==3.1.45
google-pasta==0.2.0
grpcio==1.73.0
h11==0.16.0
h5py==3.14.0
httpcore==1.0.9
httpx==0.28.1
idna==3.10
imbalanced-learn==0.13.0
ipykernel==6.29.5
ipython>=8.0,<9.0
ipython_pygments_lexers==1.1.1
isoduration==20.11.0
jedi==0.19.2
Jinja2==3.1.6
joblib==1.5.1
json5==0.12.0
jsonpointer==3.0.0
jsonschema==4.24.0
jsonschema-specifications==2025.4.1
jupyter-events==0.12.0
jupyter-lsp==2.2.5
jupyter_client==8.6.3
jupyter_core==5.8.1
jupyter_server==2.16.0
jupyter_server_terminals==0.5.3
jupyterlab==4.4.4
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
keras==3.10.0
kiwisolver==1.4.8
libclang==18.1.1
lightgbm~=4.6.0
Markdown==3.8.2
markdown-it-py==3.0.0
MarkupSafe==3.0.2
matplotlib==3.10.3
matplotlib-inline==0.1.7
mccabe==0.7.0
mdurl==0.1.2
mistune==3.1.3
ml-dtypes==0.3.2
mypy==1.17.0
mypy_extensions==1.1.0
namex==0.1.0
narwhals==2.0.1
nbclient==0.10.2
nbconvert==7.16.6
nbformat==5.10.4
nest-asyncio==1.6.0
nltk==3.9.1
notebook==7.4.4
notebook_shim==0.2.4
numpy==1.26.4
ollama~=0.5.1
opt_einsum==3.4.0
optree==0.16.0
overrides==7.7.0
packaging==25.0
pandas==2.3.0
pandocfilters==1.5.1
parso==0.8.4
pathspec==0.12.1
pexpect==4.9.0
pillow==11.2.1
platformdirs==4.3.8
plotly~=6.2.0
prometheus_client==0.22.1
prompt_toolkit==3.0.51
protobuf==4.25.8
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pyarrow==21.0.0
pycodestyle==2.14.0
pycparser==2.22
pydantic~=2.11.7
pydantic_core==2.33.2
pydeck==0.9.1
pyflakes==3.4.0
Pygments==2.19.1
pyparsing==3.2.3
python-dateutil==2.9.0.post0
python-json-logger==3.3.0
pytz==2025.2
PyYAML~=6.0.2
pyzmq==27.0.0
referencing==0.36.2
regex==2024.11.6
requests==2.32.4
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rich==14.0.0
rpds-py==0.26.0
scikit-learn~=1.6.1
scipy==1.15.3
seaborn==0.13.2
Send2Trash==1.8.3
six==1.17.0
sklearn-compat==0.1.3
smmap==5.0.2
sniffio==1.3.1
soupsieve==2.7
spacy~=3.8.7
stack-data==0.6.3
streamlit~=1.47.1
tenacity==9.1.2
tensorboard==2.16.2
tensorboard-data-server==0.7.2
tensorflow==2.16.2
tensorflow-io-gcs-filesystem==0.37.1
termcolor==3.1.0
terminado==0.18.1
threadpoolctl==3.6.0
tinycss2==1.4.0
toml==0.10.2
toolz==1.0.0
tornado==6.5.1
tqdm==4.67.1
traitlets==5.14.3
types-python-dateutil==2.9.0.20250516
types-PyYAML==6.0.12.20250516
typing-inspection==0.4.1
typing_extensions==4.14.0
tzdata==2025.2
uri-template==1.3.0
urllib3==2.5.0
wcwidth==0.2.13
webcolors==24.11.1
webencodings==0.5.1
websocket-client==1.8.0
Werkzeug==3.1.3
wrapt==1.17.2
xgboost~=3.0.3
+3
View File
@@ -0,0 +1,3 @@
"""DRC NERS NLP package."""

# The package deliberately exports nothing at the top level; consumers import
# submodules (e.g. ners.cli, ners.core.config) explicitly.
__all__: list[str] = []
+226
View File
@@ -0,0 +1,226 @@
from __future__ import annotations
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional
import typer
from ners.core.config import setup_config, PipelineConfig
app = typer.Typer(help="DRC NERS command-line interface", no_args_is_help=True)
# -------------------------
# Pipeline commands
# -------------------------
pipeline_app = typer.Typer(help="Data processing pipeline")
app.add_typer(pipeline_app, name="pipeline")
@pipeline_app.command("run")
def pipeline_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Run the full processing pipeline."""
    # Deferred import keeps CLI startup fast for unrelated commands.
    from ners.main import run_pipeline

    exit_code = run_pipeline(setup_config(config_path=config, env=env))
    raise typer.Exit(exit_code)
# -------------------------
# NER commands
# -------------------------
ner_app = typer.Typer(help="NER dataset and model")
app.add_typer(ner_app, name="ner")
def _load_config(config: Optional[Path], env: str) -> PipelineConfig:
    """Resolve the pipeline configuration for *env*, optionally from *config*."""
    return setup_config(config_path=config, env=env)
@ner_app.command("feature")
def ner_feature(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Invoke ``ners.ner.feature`` with the resolved configuration."""
    from ners.ner import feature

    feature(_load_config(config, env))
@ner_app.command("build")
def ner_build(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Invoke ``ners.ner.build`` with the resolved configuration."""
    from ners.ner import build

    build(_load_config(config, env))
@ner_app.command("train")
def ner_train(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Invoke ``ners.ner.train`` with the resolved configuration."""
    from ners.ner import train

    train(_load_config(config, env))
@ner_app.command("run")
def ner_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
    reset: bool = typer.Option(
        False, help="Reset intermediate outputs and rerun all steps"
    ),
) -> None:
    """Run the NER pipeline and exit with its return code."""
    from ners.ner import run_pipeline

    raise typer.Exit(run_pipeline(_load_config(config, env), reset))
# -------------------------
# Research commands
# -------------------------
research_app = typer.Typer(help="Research experiments and training")
app.add_typer(research_app, name="research")
@research_app.command("train")
def research_train(
    name: str = typer.Option(..., "--name", help="Model name to train"),
    # Renamed from `type` to avoid shadowing the builtin; the CLI flag is
    # declared explicitly as "--type", so the command-line interface is
    # unchanged.
    type_: str = typer.Option(..., "--type", help="Experiment type"),
    templates: str = typer.Option(
        "research_templates.yaml", help="Templates file path"
    ),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Train a single model described by a research template.

    Looks up the (name, type) template in *templates* and forwards its
    settings to ``ModelTrainer.train_single_model``.
    """
    from ners.research.experiment.experiment_builder import ExperimentBuilder
    from ners.research.model_trainer import ModelTrainer

    cfg = _load_config(config, env)

    builder = ExperimentBuilder(cfg)
    template_data = builder.load_templates(templates)
    exp_cfg = builder.find_template(template_data, name, type_)

    trainer = ModelTrainer(cfg)
    trainer.train_single_model(
        model_name=exp_cfg.get("name"),
        model_type=exp_cfg.get("model_type"),
        features=exp_cfg.get("features"),
        model_params=exp_cfg.get("model_params", {}),
        tags=exp_cfg.get("tags", []),
    )
# -------------------------
# Monitor commands
# -------------------------
monitor_app = typer.Typer(help="Monitor pipeline checkpoints")
app.add_typer(monitor_app, name="monitor")
@monitor_app.command("status")
def monitor_status(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
    detailed: bool = typer.Option(
        False, help="Show detailed status (failed batch IDs)"
    ),
) -> None:
    """Print the pipeline checkpoint status."""
    # Config is loaded for its side effects (paths/logging setup).
    _load_config(config, env)

    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    monitor = PipelineMonitor()
    monitor.print_status(detailed=detailed)
@monitor_app.command("clean")
def monitor_clean(
    step: Optional[str] = typer.Option(None, help="Step to clean; default all"),
    keep_last: int = typer.Option(1, help="Number of latest checkpoint files to keep"),
    force: bool = typer.Option(False, help="Do not ask for confirmation"),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Remove old checkpoint files for one step, or for every step."""
    _load_config(config, env)

    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    monitor = PipelineMonitor()
    if not force:
        typer.confirm("Clean checkpoints?", abort=True)

    targets = [step] if step else list(monitor.steps)
    for target in targets:
        monitor.clean_step_checkpoints(target, keep_last)
@monitor_app.command("reset")
def monitor_reset(
    step: Optional[str] = typer.Option(None, help="Step to reset; default all"),
    force: bool = typer.Option(False, help="Do not ask for confirmation"),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Reset one pipeline step, or every step, deleting its checkpoints."""
    _load_config(config, env)

    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    monitor = PipelineMonitor()
    if not force:
        typer.confirm(
            f"Reset {step or 'all steps'}? This deletes checkpoints.", abort=True
        )

    targets = [step] if step else list(monitor.steps)
    for target in targets:
        monitor.reset_step(target)
# -------------------------
# Web commands
# -------------------------
web_app = typer.Typer(help="Web UI wrapper")
app.add_typer(web_app, name="web")
@web_app.command("run")
def web_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Launch the Streamlit web app via subprocess."""
    script = Path(__file__).parent / "web" / "app.py"

    # Configuration travels through environment variables so the Streamlit
    # process does not have to parse CLI arguments itself.
    child_env = os.environ.copy()
    if config is not None:
        child_env["NERS_CONFIG"] = str(config)
    child_env["NERS_ENV"] = env

    status = subprocess.call(
        [sys.executable, "-m", "streamlit", "run", str(script)],
        env=child_env,
    )
    raise typer.Exit(status)
if __name__ == "__main__": # pragma: no cover
app()
@@ -2,10 +2,10 @@ import logging
from pathlib import Path
from typing import Optional, Union
from core.utils import ensure_directories
from .config_manager import ConfigManager
from .logging_config import LoggingConfig
from .pipeline_config import PipelineConfig
from ners.core.utils import ensure_directories
from ners.core.config.config_manager import ConfigManager
from ners.core.config.logging_config import LoggingConfig
from ners.core.config.pipeline_config import PipelineConfig
config_manager = ConfigManager()
@@ -22,7 +22,9 @@ def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfi
return config_manager.get_config()
def setup_config(config_path: Optional[Path] = None, env: str = "development") -> PipelineConfig:
def setup_config(
config_path: Optional[Path] = None, env: str = "development"
) -> PipelineConfig:
"""
Unified configuration loading and logging setup for all entrypoint scripts.
@@ -5,8 +5,8 @@ from typing import Optional, Union, Dict, Any
import yaml
from core.config.pipeline_config import PipelineConfig
from core.config.project_paths import ProjectPaths
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.config.project_paths import ProjectPaths
class ConfigManager:
@@ -36,7 +36,7 @@ class ConfigManager:
def _setup_default_paths(self):
"""Setup default project paths"""
root_dir = Path(__file__).parent.parent.parent
root_dir = Path(__file__).parent.parent.parent.parent.parent
self.default_paths = ProjectPaths(
root_dir=root_dir,
configs_dir=root_dir / "config",
@@ -53,7 +53,9 @@ class ConfigManager:
self.config_path = config_path
if not self.config_path.exists():
logging.warning(f"Config file not found: {self.config_path}. Using defaults.")
logging.warning(
f"Config file not found: {self.config_path}. Using defaults."
)
return self._create_default_config()
try:
@@ -122,7 +124,11 @@ class ConfigManager:
def _deep_update(self, base_dict: Dict, update_dict: Dict):
"""Recursively update nested dictionaries"""
for key, value in update_dict.items():
if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
if (
key in base_dict
and isinstance(base_dict[key], dict)
and isinstance(value, dict)
):
self._deep_update(base_dict[key], value)
else:
base_dict[key] = value
@@ -1,10 +1,10 @@
from pydantic import BaseModel
from core.config.annotation_config import AnnotationConfig
from core.config.data_config import DataConfig
from core.config.logging_config import LoggingConfig
from core.config.processing_config import ProcessingConfig
from core.config.project_paths import ProjectPaths
from ners.core.config.annotation_config import AnnotationConfig
from ners.core.config.data_config import DataConfig
from ners.core.config.logging_config import LoggingConfig
from ners.core.config.processing_config import ProcessingConfig
from ners.core.config.project_paths import ProjectPaths
class PipelineConfig(BaseModel):
@@ -10,6 +10,8 @@ class ProcessingConfig(BaseModel):
max_workers: int = 4
checkpoint_interval: int = 5
use_multiprocessing: bool = False
encoding_options: list = field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
encoding_options: list = field(
default_factory=lambda: ["utf-8", "utf-16", "latin1"]
)
chunk_size: int = 100_000
epochs: int = 2
@@ -4,13 +4,13 @@ from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from core.config import PipelineConfig
from ners.core.config import PipelineConfig
@contextmanager
def temporary_config_override(**overrides):
"""Context manager for temporarily overriding configuration"""
from core.config import get_config
from ners.core.config import get_config
config = get_config()
original_values = {}
@@ -5,7 +5,7 @@ from typing import Optional, Union, Iterator, Dict
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from ners.core.config.pipeline_config import PipelineConfig
OPTIMIZED_DTYPES = {
# Numeric columns with appropriate bit-width
@@ -113,7 +113,9 @@ class DataLoader:
sex_values = df["sex"].dropna().unique()
if len(sex_values) == 0:
logging.warning(f"No valid values found in sex column 'sex', using random sampling")
logging.warning(
"No valid values found in sex column 'sex', using random sampling"
)
return df.sample(n=max_size, random_state=self.config.data.random_seed)
# Calculate samples per sex category
@@ -140,18 +142,22 @@ class DataLoader:
logging.info(f"Sampled {current_samples} records for sex '{sex}'")
if not balanced_samples:
logging.warning("No balanced samples could be created, using random sampling")
logging.warning(
"No balanced samples could be created, using random sampling"
)
return df.sample(n=max_size, random_state=self.config.data.random_seed)
# Create result using iloc with indices (no copying until final step)
result = df.iloc[balanced_samples].copy()
# Shuffle the final result
result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
drop=True
)
result = result.sample(
frac=1, random_state=self.config.data.random_seed
).reset_index(drop=True)
logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total")
logging.info(
f"Created balanced dataset with {len(result)} records from {len(df)} total"
)
return result
@classmethod
@@ -1,4 +1,4 @@
from core.config.pipeline_config import PipelineConfig
from ners.core.config.pipeline_config import PipelineConfig
class PromptManager:
@@ -2,7 +2,7 @@ import json
import logging
from typing import Dict, Any
from core.config.pipeline_config import PipelineConfig
from ners.core.config.pipeline_config import PipelineConfig
class StateManager:
+11 -41
View File
@@ -1,21 +1,17 @@
#!.venv/bin/python3
import argparse
import logging
import sys
import traceback
from core.config import setup_config
from core.utils.data_loader import DataLoader
from processing.batch.batch_config import BatchConfig
from processing.pipeline import Pipeline
from processing.steps.data_cleaning_step import DataCleaningStep
from processing.steps.data_selection_step import DataSelectionStep
from processing.steps.data_splitting_step import DataSplittingStep
from processing.steps.feature_extraction_step import FeatureExtractionStep
from ners.core.utils.data_loader import DataLoader
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.pipeline import Pipeline
from ners.processing.steps.data_cleaning_step import DataCleaningStep
from ners.processing.steps.data_selection_step import DataSelectionStep
from ners.processing.steps.data_splitting_step import DataSplittingStep
from ners.processing.steps.llm_annotation_step import LLMAnnotationStep
from ners.processing.steps.ner_annotation_step import NERAnnotationStep
from ners.processing.steps.feature_extraction_step import FeatureExtractionStep
def create_pipeline(config) -> Pipeline:
"""Create pipeline from configuration"""
batch_config = BatchConfig(
batch_size=config.processing.batch_size,
max_workers=config.processing.max_workers,
@@ -23,14 +19,13 @@ def create_pipeline(config) -> Pipeline:
use_multiprocessing=config.processing.use_multiprocessing,
)
# Add steps based on configuration
pipeline = Pipeline(batch_config)
steps = [
DataCleaningStep(config),
FeatureExtractionStep(config),
DataSelectionStep(config),
# NERAnnotationStep(config),
# LLMAnnotationStep(config),
NERAnnotationStep(config),
LLMAnnotationStep(config),
]
for stage in config.stages:
@@ -42,7 +37,6 @@ def create_pipeline(config) -> Pipeline:
def run_pipeline(config) -> int:
"""Run the complete pipeline"""
try:
logging.info(f"Starting pipeline: {config.name} v{config.version}")
@@ -79,27 +73,3 @@ def run_pipeline(config) -> int:
except Exception as e:
logging.error(f"Pipeline failed: {e}", exc_info=True)
return 1
def main():
"""Main entry point with unified configuration loading"""
parser = argparse.ArgumentParser(
description="DRC NERS Processing Pipeline",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument("--config", type=str, help="Path to configuration file")
parser.add_argument("--env", type=str, default="development", help="Environment name")
args = parser.parse_args()
try:
config = setup_config(config_path=args.config, env=args.env)
return run_pipeline(config)
except Exception as e:
print(f"Pipeline failed: {e}")
traceback.print_exc()
return 1
if __name__ == "__main__":
sys.exit(main())
+14
View File
@@ -0,0 +1,14 @@
#!.venv/bin/python3
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
def status(*, detailed: bool = False) -> None:
PipelineMonitor().print_status(detailed=detailed)
def clean_step(step: str, *, keep_last: int = 1) -> None:
PipelineMonitor().clean_step_checkpoints(step, keep_last)
def reset_step(step: str) -> None:
PipelineMonitor().reset_step(step)
+10 -25
View File
@@ -1,29 +1,24 @@
#!/usr/bin/env python3
import argparse
import logging
import os
import sys
import traceback
from pathlib import Path
from core.config import setup_config, PipelineConfig
from processing.ner.name_builder import NameBuilder
from processing.ner.name_engineering import NameEngineering
from processing.ner.name_model import NameModel
from ners.core.config import PipelineConfig
from ners.processing.ner.name_builder import NameBuilder
from ners.processing.ner.name_engineering import NameEngineering
from ners.processing.ner.name_model import NameModel
def feature(config: PipelineConfig):
"""Apply feature engineering to create position-independent NER dataset."""
NameEngineering(config).compute()
def build(config: PipelineConfig):
"""Build NER dataset using NERDataBuilder."""
NameBuilder(config).build()
def train(config: PipelineConfig):
"""Train the NER model."""
name_model = NameModel(config)
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
@@ -37,7 +32,9 @@ def train(config: PipelineConfig):
split_idx = int(len(data) * 0.9)
train_data, eval_data = data[:split_idx], data[split_idx:]
logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
logging.info(
f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}"
)
name_model.train(
data=train_data,
epochs=config.processing.epochs,
@@ -75,21 +72,9 @@ def run_pipeline(config: PipelineConfig, reset: bool = False):
def main():
parser = argparse.ArgumentParser(description="NER model management for DRC names")
parser.add_argument("--config", type=str, help="Path to configuration file")
parser.add_argument("--env", type=str, default="development", help="Environment name")
parser.add_argument("--reset", action="store_true", help="Reset all steps")
args = parser.parse_args()
try:
config = setup_config(config_path=args.config, env=args.env)
return run_pipeline(config, args.reset)
except Exception as e:
print(f"Pipeline failed: {e}")
logging.error("This module is no longer a CLI. Use 'ners ner ...' instead.")
return 1
except Exception:
traceback.print_exc()
return 1
if __name__ == "__main__":
sys.exit(main())
@@ -8,4 +8,6 @@ class BatchConfig:
batch_size: int = 1000
max_workers: int = 4
checkpoint_interval: int = 5 # Save checkpoint every N batches
use_multiprocessing: bool = False # Use ProcessPoolExecutor instead of ThreadPoolExecutor
use_multiprocessing: bool = (
False # Use ProcessPoolExecutor instead of ThreadPoolExecutor
)
@@ -4,9 +4,9 @@ from typing import Iterator
import pandas as pd
from processing.batch.batch_config import BatchConfig
from processing.batch.memory_monitor import MemoryMonitor
from processing.steps import PipelineStep
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.batch.memory_monitor import MemoryMonitor
from ners.processing.steps import PipelineStep
class BatchProcessor:
@@ -33,7 +33,9 @@ class BatchProcessor:
for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
if step.batch_exists(batch_id):
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
logging.info(
f"Batch {batch_id} already processed, loading from checkpoint"
)
processed_batch = step.load_batch(batch_id)
else:
try:
@@ -80,7 +82,9 @@ class BatchProcessor:
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Memory-optimized concurrent processing"""
executor_class = (
ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
ProcessPoolExecutor
if self.config.use_multiprocessing
else ThreadPoolExecutor
)
results = {}
@@ -89,7 +93,9 @@ class BatchProcessor:
future_to_batch = {}
for batch, batch_id in self.create_batches(df):
if step.batch_exists(batch_id):
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
logging.info(
f"Batch {batch_id} already processed, loading from checkpoint"
)
results[batch_id] = step.load_batch(batch_id)
else:
# Only copy if necessary for concurrent processing
@@ -121,7 +127,9 @@ class BatchProcessor:
del results
self.memory_monitor.cleanup_memory()
result = self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
result = (
self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
)
# Final cleanup
del ordered_results
@@ -131,7 +139,9 @@ class BatchProcessor:
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Process data using the configured strategy"""
step.state.total_batches = (len(df) + self.config.batch_size - 1) // self.config.batch_size
step.state.total_batches = (
len(df) + self.config.batch_size - 1
) // self.config.batch_size
step.load_state()
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
@@ -4,8 +4,8 @@ import shutil
from datetime import datetime
from typing import Optional, Dict
from core.config.config_manager import ConfigManager
from core.config.project_paths import ProjectPaths
from ners.core.config.config_manager import ConfigManager
from ners.core.config.project_paths import ProjectPaths
class PipelineMonitor:
@@ -97,7 +97,10 @@ class PipelineMonitor:
avg_completion = total_completion / len(self.steps)
if avg_completion >= 100 and overall_status not in ["error", "completed_with_errors"]:
if avg_completion >= 100 and overall_status not in [
"error",
"completed_with_errors",
]:
overall_status = "completed"
return {
@@ -121,7 +124,9 @@ class PipelineMonitor:
print(f"{step_name.replace('_', ' ').title()}:")
print(f" Status: {step_status['status']}")
print(f" Progress: {step_status['completion_percentage']:.1f}%")
print(f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}")
print(
f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}"
)
if step_status["failed_batches"] > 0:
print(f" Failed Batches: {step_status['failed_batches']}")
@@ -141,7 +146,10 @@ class PipelineMonitor:
if step_dir.exists():
csv_files = list(step_dir.glob("*.csv"))
step_size = sum(f.stat().st_size for f in csv_files)
counts[step] = {"files": len(csv_files), "size_mb": step_size / (1024 * 1024)}
counts[step] = {
"files": len(csv_files),
"size_mb": step_size / (1024 * 1024),
}
total_size += step_size
else:
counts[step] = {"files": 0, "size_mb": 0}
@@ -160,7 +168,9 @@ class PipelineMonitor:
csv_files = sorted(step_dir.glob("batch_*.csv"))
if len(csv_files) <= keep_last:
logging.info(f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all")
logging.info(
f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all"
)
return
files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files
@@ -3,7 +3,7 @@ from typing import List, Tuple, Dict
import pandas as pd
from processing.steps.feature_extraction_step import NameCategory
from ners.processing.steps.feature_extraction_step import NameCategory
class BaseNameFormatter(ABC):
@@ -12,7 +12,9 @@ class BaseNameFormatter(ABC):
Contains common logic for NER tagging and attribute computation.
"""
def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None):
def __init__(
self, connectors: List[str] = None, additional_surnames: List[str] = None
):
self.connectors = connectors or ["wa", "ya", "ka", "ba"]
self.additional_surnames = additional_surnames or [
"jean",
@@ -46,7 +48,9 @@ class BaseNameFormatter(ABC):
end_pos = current_pos + len(word)
# Determine tag based on word content
if word in native_parts or any(connector in word for connector in self.connectors):
if word in native_parts or any(
connector in word for connector in self.connectors
):
tag = "NATIVE"
elif word == surname or word in self.additional_surnames:
tag = "SURNAME"
@@ -72,7 +76,9 @@ class BaseNameFormatter(ABC):
"words": words_count,
"length": length,
"identified_category": (
NameCategory.SIMPLE.value if words_count == 3 else NameCategory.COMPOSE.value
NameCategory.SIMPLE.value
if words_count == 3
else NameCategory.COMPOSE.value
),
}
@@ -3,7 +3,7 @@ from typing import Dict
import pandas as pd
from processing.ner.formats import BaseNameFormatter
from ners.processing.ner.formats import BaseNameFormatter
class ConnectorFormatter(BaseNameFormatter):
@@ -3,13 +3,15 @@ from typing import Dict
import pandas as pd
from processing.ner.formats import BaseNameFormatter
from ners.processing.ner.formats import BaseNameFormatter
class ExtendedSurnameFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
original_surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
original_surname = (
row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
)
# Add random additional surname
additional_surname = random.choice(self.additional_surnames)
@@ -22,7 +24,9 @@ class ExtendedSurnameFormatter(BaseNameFormatter):
"identified_name": row["probable_native"],
"probable_surname": combined_surname,
"identified_surname": combined_surname,
"ner_entities": str(self.create_ner_tags(full_name, native_parts, combined_surname)),
"ner_entities": str(
self.create_ner_tags(full_name, native_parts, combined_surname)
),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@@ -2,7 +2,7 @@ from typing import Dict
import pandas as pd
from processing.ner.formats import BaseNameFormatter
from ners.processing.ner.formats import BaseNameFormatter
class NativeOnlyFormatter(BaseNameFormatter):
@@ -2,7 +2,7 @@ from typing import Dict
import pandas as pd
from processing.ner.formats import BaseNameFormatter
from ners.processing.ner.formats import BaseNameFormatter
class OriginalFormatter(BaseNameFormatter):
@@ -2,7 +2,7 @@ from typing import Dict
import pandas as pd
from processing.ner.formats import BaseNameFormatter
from ners.processing.ner.formats import BaseNameFormatter
class PositionFlippedFormatter(BaseNameFormatter):
@@ -2,7 +2,7 @@ from typing import Dict
import pandas as pd
from processing.ner.formats import BaseNameFormatter
from ners.processing.ner.formats import BaseNameFormatter
class ReducedNativeFormatter(BaseNameFormatter):
@@ -11,7 +11,9 @@ class ReducedNativeFormatter(BaseNameFormatter):
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
# Keep only first native component + surname
reduced_native = native_parts[0] if len(native_parts) > 1 else row["probable_native"]
reduced_native = (
native_parts[0] if len(native_parts) > 1 else row["probable_native"]
)
full_name = f"{reduced_native} {surname}".strip()
return {
@@ -20,7 +22,9 @@ class ReducedNativeFormatter(BaseNameFormatter):
"identified_name": reduced_native,
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(self.create_ner_tags(full_name, [reduced_native], surname)),
"ner_entities": str(
self.create_ner_tags(full_name, [reduced_native], surname)
),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@@ -4,8 +4,8 @@ import logging
import spacy
from spacy.tokens import DocBin
from core.config import PipelineConfig
from core.utils.data_loader import DataLoader
from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from .name_tagger import NameTagger
@@ -20,7 +20,9 @@ class NameBuilder:
self.tagger = NameTagger()
def build(self) -> int:
filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
filepath = self.config.paths.get_data_path(
self.config.data.output_files["engineered"]
)
df = self.data_loader.load_csv_complete(filepath)
df = df[["name", "ner_tagged", "ner_entities"]]
@@ -38,7 +40,9 @@ class NameBuilder:
# Use NERNameTagger for parsing and validation
parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
validated_entities = self.tagger.validate_entities(ner_df["name"], parsed_entities)
validated_entities = self.tagger.validate_entities(
ner_df["name"], parsed_entities
)
# Drop rows with no valid entities
mask = validated_entities.map(bool)
@@ -51,22 +55,33 @@ class NameBuilder:
# Prepare training data
training_data = list(
zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
zip(
ner_df["name"].tolist(),
[{"entities": ents} for ents in validated_entities],
)
)
# Use NERNameTagger to create spaCy DocBin
docs = self.tagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
docs = self.tagger.create_docs(
nlp, ner_df["name"].tolist(), validated_entities.tolist()
)
doc_bin = DocBin(docs=docs)
# Save
json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
json_path = self.config.paths.get_data_path(
self.config.data.output_files["ner_data"]
)
spacy_path = self.config.paths.get_data_path(
self.config.data.output_files["ner_spacy"]
)
with open(json_path, "w", encoding="utf-8") as f:
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
doc_bin.to_disk(spacy_path)
logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
logging.info(
f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}"
)
logging.info(f"Saved NER JSON to {json_path}")
logging.info(f"Saved NER spacy to {spacy_path}")
return 0
@@ -6,14 +6,14 @@ import numpy as np
import pandas as pd
from tqdm import tqdm
from core.config import PipelineConfig
from core.utils.data_loader import DataLoader
from processing.ner.formats.connectors_format import ConnectorFormatter
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
from processing.ner.formats.native_only_format import NativeOnlyFormatter
from processing.ner.formats.original_format import OriginalFormatter
from processing.ner.formats.position_flipped_format import PositionFlippedFormatter
from processing.ner.formats.reduced_native_format import ReducedNativeFormatter
from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.ner.formats.connectors_format import ConnectorFormatter
from ners.processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
from ners.processing.ner.formats.native_only_format import NativeOnlyFormatter
from ners.processing.ner.formats.original_format import OriginalFormatter
from ners.processing.ner.formats.position_flipped_format import PositionFlippedFormatter
from ners.processing.ner.formats.reduced_native_format import ReducedNativeFormatter
class NameEngineering:
@@ -44,42 +44,60 @@ class NameEngineering:
# Initialize format classes
self.formatters = {
"original": OriginalFormatter(self.connectors, self.additional_surnames),
"native_only": NativeOnlyFormatter(self.connectors, self.additional_surnames),
"position_flipped": PositionFlippedFormatter(self.connectors, self.additional_surnames),
"reduced_native": ReducedNativeFormatter(self.connectors, self.additional_surnames),
"connector_added": ConnectorFormatter(self.connectors, self.additional_surnames),
"extended_surname": ExtendedSurnameFormatter(self.connectors, self.additional_surnames),
"native_only": NativeOnlyFormatter(
self.connectors, self.additional_surnames
),
"position_flipped": PositionFlippedFormatter(
self.connectors, self.additional_surnames
),
"reduced_native": ReducedNativeFormatter(
self.connectors, self.additional_surnames
),
"connector_added": ConnectorFormatter(
self.connectors, self.additional_surnames
),
"extended_surname": ExtendedSurnameFormatter(
self.connectors, self.additional_surnames
),
}
def load_data(self) -> pd.DataFrame:
"""Load and filter NER-tagged data from CSV file"""
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
df = self.data_loader.load_csv_complete(filepath)
# Filter only NER-tagged rows
ner_data = df[df["ner_tagged"] == 1].copy()
logging.info(f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records")
logging.info(
f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records"
)
return ner_data
def compute(self) -> None:
logging.info("Applying feature engineering transformations...")
input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
input_filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
output_filepath = self.config.paths.get_data_path(
self.config.data.output_files["engineered"]
)
df = self.data_loader.load_csv_complete(input_filepath)
ner_df = df[df["ner_tagged"] == 1].copy()
logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records")
logging.info(
f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records"
)
del df # No need to keep in memory
gc.collect()
ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
drop=True
)
ner_df = ner_df.sample(
frac=1, random_state=self.config.data.random_seed
).reset_index(drop=True)
total_rows = len(ner_df)
# Calculate split points
@@ -94,7 +112,11 @@ class NameEngineering:
(0, split_25_1, "original"), # First 25%: original format
(split_25_1, split_25_2, "native_only"), # Second 25%: remove surname
(split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions
(split_25_3, split_10_1, "reduced_native"), # Fourth 10%: reduce native components
(
split_25_3,
split_10_1,
"reduced_native",
), # Fourth 10%: reduce native components
(split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors
(split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames
]
@@ -11,7 +11,7 @@ from spacy.training import Example
from spacy.util import minibatch
from tqdm import tqdm
from core.config.pipeline_config import PipelineConfig
from ners.core.config.pipeline_config import PipelineConfig
class NameModel:
@@ -87,7 +87,9 @@ class NameModel:
# Handle different annotation formats from NERNameTagger
if not isinstance(annotations, dict) or "entities" not in annotations:
logging.warning(f"Skipping invalid annotations at index {i}: {annotations}")
logging.warning(
f"Skipping invalid annotations at index {i}: {annotations}"
)
skipped_count += 1
continue
@@ -124,7 +126,9 @@ class NameModel:
valid_entities = []
for entity in entities:
if not isinstance(entity, (list, tuple)) or len(entity) != 3:
logging.warning(f"Skipping invalid entity format in '{text}': {entity}")
logging.warning(
f"Skipping invalid entity format in '{text}': {entity}"
)
continue
start, end, label = entity
@@ -138,21 +142,30 @@ class NameModel:
or start < 0
or end > len(text)
):
logging.warning(f"Skipping invalid entity bounds in '{text}': {entity}")
logging.warning(
f"Skipping invalid entity bounds in '{text}': {entity}"
)
continue
# Check for overlaps with already validated entities
has_overlap = any(
start < v_end and end > v_start for v_start, v_end, _ in valid_entities
start < v_end and end > v_start
for v_start, v_end, _ in valid_entities
)
if has_overlap:
logging.warning(f"Skipping overlapping entity in '{text}': {entity}")
logging.warning(
f"Skipping overlapping entity in '{text}': {entity}"
)
continue
# Validate that the span doesn't contain spaces (matching tagger validation)
span_text = text[start:end]
if not span_text or span_text != span_text.strip() or " " in span_text:
if (
not span_text
or span_text != span_text.strip()
or " " in span_text
):
logging.warning(
f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
)
@@ -161,7 +174,9 @@ class NameModel:
valid_entities.append((start, end, label))
if not valid_entities:
logging.warning(f"Skipping training example with no valid entities: '{text}'")
logging.warning(
f"Skipping training example with no valid entities: '{text}'"
)
skipped_count += 1
continue
@@ -219,7 +234,9 @@ class NameModel:
batches = minibatch(examples, size=batch_size)
for batch in batches:
batch_losses = {}
self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
self.nlp.update(
batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer
)
logging.info(
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
)
@@ -242,7 +259,9 @@ class NameModel:
"dropout_rate": dropout_rate,
}
logging.info(f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}")
logging.info(
f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}"
)
def evaluate(self, test_data: List[Tuple[str, Dict]]) -> Dict[str, Any]:
"""Evaluate the trained model on test data"""
@@ -291,10 +310,14 @@ class NameModel:
entity_stats[label]["fp"] += 1
# Calculate overall metrics
precision = correct_entities / predicted_entities if predicted_entities > 0 else 0
precision = (
correct_entities / predicted_entities if predicted_entities > 0 else 0
)
recall = correct_entities / actual_entities if actual_entities > 0 else 0
f1_score = (
2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
2 * (precision * recall) / (precision + recall)
if (precision + recall) > 0
else 0
)
# Calculate per-label metrics
@@ -304,7 +327,11 @@ class NameModel:
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
label_f1 = (
(2 * (label_precision * label_recall) / (label_precision + label_recall))
(
2
* (label_precision * label_recall)
/ (label_precision + label_recall)
)
if (label_precision + label_recall) > 0
else 0
)
@@ -394,7 +421,9 @@ class NameModel:
"label": ent.label_,
"start": ent.start_char,
"end": ent.end_char,
"confidence": getattr(ent, "score", None), # If confidence scores are available
"confidence": getattr(
ent, "score", None
), # If confidence scores are available
}
)
@@ -48,7 +48,9 @@ class NameTagger:
# Find the first occurrence of this native word that doesn't overlap
start_pos = 0
while True:
pos = name_lower.find(native_word_lower, start_pos) # Case-insensitive search
pos = name_lower.find(
native_word_lower, start_pos
) # Case-insensitive search
if pos == -1:
break
@@ -78,7 +80,9 @@ class NameTagger:
# Find the first occurrence that doesn't overlap
start_pos = 0
while True:
pos = name_lower.find(surname_lower, start_pos) # Case-insensitive search
pos = name_lower.find(
surname_lower, start_pos
) # Case-insensitive search
if pos == -1:
break
@@ -120,8 +124,13 @@ class NameTagger:
continue
# Check for overlaps with already validated entities
if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities):
logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'")
if any(
start < v_end and end > v_start
for v_start, v_end, _ in validated_entities
):
logging.warning(
f"Overlapping span ({start}, {end}, '{label}') in '{name}'"
)
continue
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
@@ -200,10 +209,16 @@ class NameTagger:
elif entities_str.startswith("[[") and entities_str.endswith("]]"):
return [tuple(e) for e in ast.literal_eval(entities_str)]
elif entities_str.startswith("[{") and entities_str.endswith("}]"):
return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
return [
(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)
]
else:
parsed = ast.literal_eval(entities_str)
return [tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3]
return [
tuple(e)
for e in parsed
if isinstance(e, (list, tuple)) and len(e) == 3
]
except (ValueError, SyntaxError, json.JSONDecodeError):
return []
@@ -251,7 +266,9 @@ class NameTagger:
last_end = e
return filtered
def validate_entities(self, texts: pd.Series, entities_series: pd.Series) -> pd.Series:
def validate_entities(
self, texts: pd.Series, entities_series: pd.Series
) -> pd.Series:
"""Vectorized entity validation."""
return pd.Series(map(self.validate, texts, entities_series), index=texts.index)
@@ -4,9 +4,9 @@ from typing import Dict, Any
import pandas as pd
from processing.batch.batch_config import BatchConfig
from processing.batch.batch_processor import BatchProcessor
from processing.steps import PipelineStep
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.batch.batch_processor import BatchProcessor
from ners.processing.steps import PipelineStep
class Pipeline:
@@ -8,9 +8,9 @@ from typing import List, Optional
import pandas as pd
from pydantic import BaseModel
from core.config.pipeline_config import PipelineConfig
from core.utils.data_loader import DataLoader
from processing.batch.batch_config import BatchConfig
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.batch.batch_config import BatchConfig
@dataclass
@@ -38,7 +38,10 @@ class PipelineStep(ABC):
"""Abstract base class for pipeline steps"""
def __init__(
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
self,
name: str,
pipeline_config: PipelineConfig,
batch_config: Optional[BatchConfig] = None,
):
self.name = name
self.pipeline_config = pipeline_config
@@ -2,9 +2,9 @@ import logging
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from core.utils.text_cleaner import TextCleaner
from processing.steps import PipelineStep
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.text_cleaner import TextCleaner
from ners.processing.steps import PipelineStep
class DataCleaningStep(PipelineStep):
@@ -2,8 +2,8 @@ import logging
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from processing.steps import PipelineStep
from ners.core.config.pipeline_config import PipelineConfig
from ners.processing.steps import PipelineStep
class DataSelectionStep(PipelineStep):
@@ -31,8 +31,12 @@ class DataSelectionStep(PipelineStep):
)
# Check which columns exist in the batch
available_columns = [col for col in self.selected_columns if col in batch.columns]
missing_columns = [col for col in self.selected_columns if col not in batch.columns]
available_columns = [
col for col in self.selected_columns if col in batch.columns
]
missing_columns = [
col for col in self.selected_columns if col not in batch.columns
]
if missing_columns:
logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}")
@@ -1,11 +1,11 @@
import numpy as np
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from core.utils.region_mapper import RegionMapper
from processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep
from processing.steps.feature_extraction_step import Gender
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.region_mapper import RegionMapper
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.steps import PipelineStep
from ners.processing.steps.feature_extraction_step import Gender
class DataSplittingStep(PipelineStep):
@@ -26,7 +26,9 @@ class DataSplittingStep(PipelineStep):
if self.eval_indices is None:
np.random.seed(self.pipeline_config.data.random_seed)
eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
self.eval_indices = set(np.random.choice(total_size, size=eval_size, replace=False))
self.eval_indices = set(
np.random.choice(total_size, size=eval_size, replace=False)
)
return self.eval_indices
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
@@ -45,7 +47,9 @@ class DataSplittingStep(PipelineStep):
df_evaluation = df[eval_mask]
df_featured = df[~eval_mask]
self.data_loader.save_csv(df_evaluation, data_dir / output_files["evaluation"])
self.data_loader.save_csv(
df_evaluation, data_dir / output_files["evaluation"]
)
self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
else:
self.data_loader.save_csv(df, data_dir / output_files["featured"])
@@ -53,7 +57,9 @@ class DataSplittingStep(PipelineStep):
if self.pipeline_config.data.split_by_province:
for province in RegionMapper.get_provinces():
df_region = df[df.province == province]
self.data_loader.save_csv(df_region, data_dir / "provinces" / f"{province}.csv")
self.data_loader.save_csv(
df_region, data_dir / "provinces" / f"{province}.csv"
)
if self.pipeline_config.data.split_by_gender:
df_males = df[df.sex == Gender.MALE.value]
@@ -5,10 +5,10 @@ from typing import Dict, Any
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from core.utils.region_mapper import RegionMapper
from processing.ner.name_tagger import NameTagger
from processing.steps import PipelineStep
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.region_mapper import RegionMapper
from ners.processing.ner.name_tagger import NameTagger
from ners.processing.steps import PipelineStep
class Gender(Enum):
@@ -64,10 +64,14 @@ class FeatureExtractionStep(PipelineStep):
self._assign_probable_names(result)
self._process_simple_names(result)
result["identified_category"] = self._assign_identified_category(result["words"])
result["identified_category"] = self._assign_identified_category(
result["words"]
)
if "year" in result.columns:
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype("Int16")
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype(
"Int16"
)
if "region" in result.columns:
result["province"] = self.region_mapper.map(result["region"]).str.lower()
@@ -7,12 +7,12 @@ import ollama
import pandas as pd
from pydantic import ValidationError
from core.config.pipeline_config import PipelineConfig
from core.utils.prompt_manager import PromptManager
from core.utils.rate_limiter import RateLimitConfig
from core.utils.rate_limiter import RateLimiter
from processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep, NameAnnotation
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.prompt_manager import PromptManager
from ners.core.utils.rate_limiter import RateLimitConfig
from ners.core.utils.rate_limiter import RateLimiter
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.steps import PipelineStep, NameAnnotation
class LLMAnnotationStep(PipelineStep):
@@ -24,7 +24,8 @@ class LLMAnnotationStep(PipelineStep):
batch_config = BatchConfig(
batch_size=pipeline_config.processing.batch_size,
max_workers=min(
self.llm_config.max_concurrent_requests, pipeline_config.processing.max_workers
self.llm_config.max_concurrent_requests,
pipeline_config.processing.max_workers,
),
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
@@ -33,7 +34,9 @@ class LLMAnnotationStep(PipelineStep):
self.prompt = PromptManager(pipeline_config).load_prompt()
self.rate_limiter = (
self._create_rate_limiter() if self.llm_config.enable_rate_limiting else None
self._create_rate_limiter()
if self.llm_config.enable_rate_limiting
else None
)
# Statistics
@@ -76,7 +79,9 @@ class LLMAnnotationStep(PipelineStep):
f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout"
)
annotation = NameAnnotation.model_validate_json(response.message.content)
annotation = NameAnnotation.model_validate_json(
response.message.content
)
result = {
**annotation.model_dump(),
"annotated": 1,
@@ -119,7 +124,9 @@ class LLMAnnotationStep(PipelineStep):
logging.info(f"Batch {batch_id}: No entries to annotate")
return batch
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM")
logging.info(
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM"
)
batch = batch.copy()
client = ollama.Client()
@@ -5,9 +5,9 @@ from typing import Dict
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from processing.ner.name_model import NameModel
from processing.steps import PipelineStep, NameAnnotation
from ners.core.config.pipeline_config import PipelineConfig
from ners.processing.ner.name_model import NameModel
from ners.processing.steps import PipelineStep, NameAnnotation
class NERAnnotationStep(PipelineStep):
@@ -39,7 +39,9 @@ class NERAnnotationStep(PipelineStep):
logging.info("NER model loaded successfully")
else:
logging.warning(f"NER model not found at {self.model_path}")
logging.warning("NER annotation will be skipped. Train the model first.")
logging.warning(
"NER annotation will be skipped. Train the model first."
)
self.name_model.nlp = None
except Exception as e:
logging.error(f"Failed to load NER model: {e}")
@@ -80,7 +82,9 @@ class NERAnnotationStep(PipelineStep):
# Create annotation result in same format as LLM step
annotation = NameAnnotation(
identified_name=" ".join(native_parts) if native_parts else None,
identified_surname=" ".join(surname_parts) if surname_parts else None,
identified_surname=" ".join(surname_parts)
if surname_parts
else None,
)
result = {
@@ -124,7 +128,9 @@ class NERAnnotationStep(PipelineStep):
logging.info(f"Batch {batch_id}: No entries to annotate")
return batch
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER")
logging.info(
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER"
)
batch = batch.copy()
@@ -7,7 +7,7 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from research.experiment import ExperimentConfig
from ners.research.experiment import ExperimentConfig
class BaseModel(ABC):
@@ -103,16 +103,25 @@ class BaseModel(ABC):
feature_names = self._get_feature_names()
return dict(zip(feature_names, coefficients))
elif hasattr(self.model, "named_steps") and "classifier" in self.model.named_steps:
elif (
hasattr(self.model, "named_steps")
and "classifier" in self.model.named_steps
):
# For sklearn pipelines (like LogisticRegression with vectorizer)
classifier = self.model.named_steps["classifier"]
if hasattr(classifier, "coef_"):
coefficients = np.abs(classifier.coef_[0])
if hasattr(self.model.named_steps["vectorizer"], "get_feature_names_out"):
feature_names = self.model.named_steps["vectorizer"].get_feature_names_out()
if hasattr(
self.model.named_steps["vectorizer"], "get_feature_names_out"
):
feature_names = self.model.named_steps[
"vectorizer"
].get_feature_names_out()
# Take top features to avoid too many n-grams
top_indices = np.argsort(coefficients)[-20:]
return dict(zip(feature_names[top_indices], coefficients[top_indices]))
return dict(
zip(feature_names[top_indices], coefficients[top_indices])
)
return None
@@ -143,7 +152,7 @@ class BaseModel(ABC):
model_data = joblib.load(path)
# Recreate the model instance
from research.experiment import ExperimentConfig
from ners.research.experiment import ExperimentConfig
config = ExperimentConfig.from_dict(model_data["config"])
instance = cls(config)
@@ -221,7 +230,9 @@ class BaseModel(ABC):
if "accuracy" in self.training_history:
axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
if "val_accuracy" in self.training_history:
axes[0].plot(self.training_history["val_accuracy"], label="Validation Accuracy")
axes[0].plot(
self.training_history["val_accuracy"], label="Validation Accuracy"
)
axes[0].set_title("Model Accuracy")
axes[0].set_xlabel("Epoch")
axes[0].set_ylabel("Accuracy")
@@ -18,7 +18,9 @@ class ExperimentConfig:
tags: List[str] = field(default_factory=list)
# Model configuration
model_type: str = "logistic_regression" # logistic_regression, lstm, transformer, etc.
model_type: str = (
"logistic_regression" # logistic_regression, lstm, transformer, etc.
)
model_params: Dict[str, Any] = field(default_factory=dict)
# Feature configuration
@@ -26,7 +28,9 @@ class ExperimentConfig:
feature_params: Dict[str, Any] = field(default_factory=dict)
# Data configuration
train_data_filter: Optional[Dict[str, Any]] = None # Filter criteria for training data
train_data_filter: Optional[Dict[str, Any]] = (
None # Filter criteria for training data
)
test_data_filter: Optional[Dict[str, Any]] = None
target_column: str = "sex"
@@ -36,7 +40,9 @@ class ExperimentConfig:
cross_validation_folds: int = 5
# Evaluation configuration
metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"])
metrics: List[str] = field(
default_factory=lambda: ["accuracy", "precision", "recall", "f1"]
)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization"""
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional, Dict, List, Any
from research.experiment import ExperimentConfig, ExperimentStatus
from ners.research.experiment import ExperimentConfig, ExperimentStatus
@dataclass
@@ -51,6 +51,8 @@ class ExperimentResult:
"""Create from dictionary"""
data["config"] = ExperimentConfig.from_dict(data["config"])
data["start_time"] = datetime.fromisoformat(data["start_time"])
data["end_time"] = datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
data["end_time"] = (
datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
)
data["status"] = ExperimentStatus(data["status"])
return cls(**data)
@@ -3,9 +3,9 @@ from typing import List, Dict
import yaml
from core.config.pipeline_config import PipelineConfig
from research.experiment import ExperimentConfig
from research.experiment.feature_extractor import FeatureType
from ners.core.config.pipeline_config import PipelineConfig
from ners.research.experiment import ExperimentConfig
from ners.research.experiment.feature_extractor import FeatureType
class ExperimentBuilder:
@@ -27,7 +27,9 @@ class ExperimentBuilder:
raise
@classmethod
def find_template(cls, templates: dict, name: str, experiment_type: str = "baseline") -> dict:
def find_template(
cls, templates: dict, name: str, experiment_type: str = "baseline"
) -> dict:
"""Find experiment configuration by name and type"""
# Map type to section in templates
@@ -9,12 +9,16 @@ import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from core.config import PipelineConfig
from core.utils.data_loader import DataLoader
from research.base_model import BaseModel
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
from research.experiment.experiment_tracker import ExperimentTracker
from research.model_registry import create_model
from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.research.base_model import BaseModel
from ners.research.experiment import (
ExperimentConfig,
ExperimentStatus,
calculate_metrics,
)
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.model_registry import create_model
class ExperimentRunner:
@@ -32,10 +36,14 @@ class ExperimentRunner:
try:
logging.info(f"Starting experiment: {experiment_id}")
self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
self.tracker.update_experiment(
experiment_id, status=ExperimentStatus.RUNNING
)
# Load data
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
df = self.data_loader.load_csv_complete(filepath)
# Apply data filters if specified
@@ -63,8 +71,12 @@ class ExperimentRunner:
test_pred = model.predict(X_test)
# Calculate metrics
train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics)
test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics)
train_metrics = calculate_metrics(
y_train, train_pred, experiment_config.metrics
)
test_metrics = calculate_metrics(
y_test, test_pred, experiment_config.metrics
)
# Cross-validation if requested
cv_metrics = {}
@@ -125,7 +137,9 @@ class ExperimentRunner:
experiment_ids = []
for i, config in enumerate(experiments):
logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}")
logging.info(
f"Running experiment {i + 1}/{len(experiments)}: {config.name}"
)
try:
exp_id = self.run_experiment(config)
experiment_ids.append(exp_id)
@@ -136,7 +150,9 @@ class ExperimentRunner:
return experiment_ids
@classmethod
def _apply_data_filters(cls, df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame:
def _apply_data_filters(
cls, df: pd.DataFrame, config: ExperimentConfig
) -> pd.DataFrame:
"""Apply data filters specified in experiment config"""
filtered_df = df.copy()
@@ -148,9 +164,13 @@ class ExperimentRunner:
filtered_df = filtered_df[filtered_df[column].isin(criteria)]
elif isinstance(criteria, dict):
if "min" in criteria:
filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
filtered_df = filtered_df[
filtered_df[column] >= criteria["min"]
]
if "max" in criteria:
filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
filtered_df = filtered_df[
filtered_df[column] <= criteria["max"]
]
else:
filtered_df = filtered_df[filtered_df[column] == criteria]
@@ -231,7 +251,9 @@ class ExperimentRunner:
return model
except Exception as e:
logging.error(f"Failed to load model for experiment {experiment_id}: {e}")
logging.error(
f"Failed to load model for experiment {experiment_id}: {e}"
)
return None
return None
@@ -6,9 +6,9 @@ from typing import Optional, Dict, List
import pandas as pd
from core.config import PipelineConfig, get_config
from research.experiment import ExperimentConfig, ExperimentStatus
from research.experiment.experiement_result import ExperimentResult
from ners.core.config import PipelineConfig, get_config
from ners.research.experiment import ExperimentConfig, ExperimentStatus
from ners.research.experiment.experiement_result import ExperimentResult
class ExperimentTracker:
@@ -97,7 +97,10 @@ class ExperimentTracker:
return sorted(results, key=lambda x: x.start_time, reverse=True)
def get_best_experiment(
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
self,
metric: str = "accuracy",
dataset: str = "test",
filters: Optional[Dict] = None,
) -> Optional[ExperimentResult]:
"""Get the best experiment based on a metric"""
experiments = self.list_experiments()
@@ -106,7 +109,9 @@ class ExperimentTracker:
# Apply additional filters
if "model_type" in filters:
experiments = [
e for e in experiments if e.config.model_type == filters["model_type"]
e
for e in experiments
if e.config.model_type == filters["model_type"]
]
if "features" in filters:
experiments = [
@@ -118,7 +123,9 @@ class ExperimentTracker:
valid_experiments = []
for exp in experiments:
if exp.status == ExperimentStatus.COMPLETED:
metrics_dict = exp.test_metrics if dataset == "test" else exp.train_metrics
metrics_dict = (
exp.test_metrics if dataset == "test" else exp.train_metrics
)
if metric in metrics_dict:
valid_experiments.append((exp, metrics_dict[metric]))
@@ -24,7 +24,9 @@ class FeatureType(Enum):
class FeatureExtractor:
"""Extract different types of features from name data"""
def __init__(self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None):
def __init__(
self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None
):
self.feature_types = feature_types
self.feature_params = feature_params or {}
@@ -1,18 +1,18 @@
from typing import List
from research.base_model import BaseModel
from research.experiment import ExperimentConfig
from research.models.bigru_model import BiGRUModel
from research.models.cnn_model import CNNModel
from research.models.ensemble_model import EnsembleModel
from research.models.lightgbm_model import LightGBMModel
from research.models.logistic_regression_model import LogisticRegressionModel
from research.models.lstm_model import LSTMModel
from research.models.naive_bayes_model import NaiveBayesModel
from research.models.random_forest_model import RandomForestModel
from research.models.svm_model import SVMModel
from research.models.transformer_model import TransformerModel
from research.models.xgboost_model import XGBoostModel
from ners.research.base_model import BaseModel
from ners.research.experiment import ExperimentConfig
from ners.research.models.bigru_model import BiGRUModel
from ners.research.models.cnn_model import CNNModel
from ners.research.models.ensemble_model import EnsembleModel
from ners.research.models.lightgbm_model import LightGBMModel
from ners.research.models.logistic_regression_model import LogisticRegressionModel
from ners.research.models.lstm_model import LSTMModel
from ners.research.models.naive_bayes_model import NaiveBayesModel
from ners.research.models.random_forest_model import RandomForestModel
from ners.research.models.svm_model import SVMModel
from ners.research.models.transformer_model import TransformerModel
from ners.research.models.xgboost_model import XGBoostModel
MODEL_REGISTRY = {
"bigru": BiGRUModel,
@@ -5,12 +5,12 @@ from typing import List, Dict, Any
import pandas as pd
from core.config import get_config
from core.utils.data_loader import DataLoader
from research.experiment import FeatureType, ExperimentConfig
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
from research.model_registry import MODEL_REGISTRY
from ners.core.config import get_config
from ners.core.utils.data_loader import DataLoader
from ners.research.experiment import FeatureType, ExperimentConfig
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.model_registry import MODEL_REGISTRY
class ModelTrainer:
@@ -66,7 +66,9 @@ class ModelTrainer:
if experiment and experiment.test_metrics:
logging.info("Training completed successfully!")
logging.info(f"Experiment ID: {experiment_id}")
logging.info(f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}")
logging.info(
f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}"
)
logging.info(f"Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")
if save_artifacts:
@@ -144,13 +146,17 @@ class ModelTrainer:
try:
# Load data for learning curve generation
data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
data_path = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
if data_path.exists():
df = self.data_loader.load_csv_complete(data_path)
# Generate learning curve
logging.info("Generating learning curve...")
trained_model.generate_learning_curve(df, df[experiment.config.target_column])
trained_model.generate_learning_curve(
df, df[experiment.config.target_column]
)
# Plot and save learning curve
learning_curve_path = model_dir / "learning_curve.png"
@@ -187,8 +193,12 @@ class ModelTrainer:
"model_path": str(model_path),
"config_path": str(config_path),
"results_path": str(results_path),
"learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
"training_history_plot": str(training_history_path) if training_history_path else None,
"learning_curve_plot": str(learning_curve_path)
if learning_curve_path
else None,
"training_history_plot": str(training_history_path)
if training_history_path
else None,
"has_learning_curve": bool(trained_model.learning_curve_data),
"has_training_history": bool(trained_model.training_history),
}
@@ -215,8 +225,12 @@ class ModelTrainer:
"config_path": str(config_path),
"results_path": str(results_path),
"metadata_path": str(metadata_path),
"learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
"training_history_plot": str(training_history_path) if training_history_path else None,
"learning_curve_plot": str(learning_curve_path)
if learning_curve_path
else None,
"training_history_plot": str(training_history_path)
if training_history_path
else None,
}
def load_trained_model(self, experiment_id: str):
@@ -227,7 +241,9 @@ class ModelTrainer:
model_path = model_dir / "complete_model.joblib"
if not model_path.exists():
raise FileNotFoundError(f"Model artifacts not found for experiment {experiment_id}")
raise FileNotFoundError(
f"Model artifacts not found for experiment {experiment_id}"
)
# Load the model class dynamically
metadata_path = model_dir / "metadata.json"
@@ -261,7 +277,9 @@ class ModelTrainer:
metadata = json.load(f)
models_data.append(metadata)
except Exception as e:
logging.warning(f"Could not read metadata for {model_dir.name}: {e}")
logging.warning(
f"Could not read metadata for {model_dir.name}: {e}"
)
if not models_data:
logging.info("No saved models found.")
@@ -7,7 +7,7 @@ from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from research.neural_network_model import NeuralNetworkModel
from ners.research.neural_network_model import NeuralNetworkModel
class BiGRUModel(NeuralNetworkModel):
@@ -53,7 +53,9 @@ class BiGRUModel(NeuralNetworkModel):
)
model.compile(
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
)
return model
@@ -15,7 +15,7 @@ from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from research.neural_network_model import NeuralNetworkModel
from ners.research.neural_network_model import NeuralNetworkModel
class CNNModel(NeuralNetworkModel):
@@ -29,7 +29,9 @@ class CNNModel(NeuralNetworkModel):
[
# Learn char/subword embeddings; spatial dropout regularizes across channels
# to make the model robust to noisy characters and transliteration.
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
Embedding(
input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)
),
SpatialDropout1D(rate=params.get("embedding_dropout", 0.1)),
# Small kernels capture short n-gram like patterns; padding='same' keeps
# sequence length stable for simpler pooling behavior.
@@ -59,7 +61,9 @@ class CNNModel(NeuralNetworkModel):
)
model.compile(
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
)
return model
@@ -75,6 +79,8 @@ class CNNModel(NeuralNetworkModel):
self.tokenizer.fit_on_texts(text_data)
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 20) # Longer for character level
max_len = self.config.model_params.get(
"max_len", 20
) # Longer for character level
return pad_sequences(sequences, maxlen=max_len, padding="post")
@@ -8,8 +8,8 @@ from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from research.experiment import ExperimentConfig
from research.traditional_model import TraditionalModel
from ners.research.experiment import ExperimentConfig
from ners.research.traditional_model import TraditionalModel
class EnsembleModel(TraditionalModel):
@@ -40,22 +40,28 @@ class EnsembleModel(TraditionalModel):
[
(
"vectorizer",
CountVectorizer(analyzer="char", ngram_range=(2, 4), max_features=5000),
CountVectorizer(
analyzer="char", ngram_range=(2, 4), max_features=5000
),
),
(
"classifier",
LogisticRegression(max_iter=1000, random_state=self.config.random_seed),
LogisticRegression(
max_iter=1000, random_state=self.config.random_seed
),
),
]
)
estimators.append((f"logistic_regression", model))
estimators.append(("logistic_regression", model))
elif model_type == "random_forest":
model = Pipeline(
[
(
"vectorizer",
TfidfVectorizer(analyzer="char", ngram_range=(2, 3), max_features=3000),
TfidfVectorizer(
analyzer="char", ngram_range=(2, 3), max_features=3000
),
),
(
"classifier",
@@ -65,19 +71,21 @@ class EnsembleModel(TraditionalModel):
),
]
)
estimators.append((f"rf", model))
estimators.append(("rf", model))
elif model_type == "naive_bayes":
model = Pipeline(
[
(
"vectorizer",
CountVectorizer(analyzer="char", ngram_range=(1, 3), max_features=4000),
CountVectorizer(
analyzer="char", ngram_range=(1, 3), max_features=4000
),
),
("classifier", MultinomialNB()),
]
)
estimators.append((f"nb", model))
estimators.append(("nb", model))
# Soft voting averages probabilities (preferred when members are calibrated);
# hard voting uses majority class. Parallelize member predictions.
@@ -5,7 +5,7 @@ from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from research.traditional_model import TraditionalModel
from ners.research.traditional_model import TraditionalModel
class LightGBMModel(TraditionalModel):
@@ -106,7 +106,9 @@ class LightGBMModel(TraditionalModel):
lambda x: x if x in known_classes else default_class
)
encoded = self.label_encoders[feature_key].transform(column_mapped)
encoded = self.label_encoders[feature_key].transform(
column_mapped
)
features.append(encoded.reshape(-1, 1))
@@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from research.traditional_model import TraditionalModel
from ners.research.traditional_model import TraditionalModel
class LogisticRegressionModel(TraditionalModel):
@@ -7,7 +7,7 @@ from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from research.neural_network_model import NeuralNetworkModel
from ners.research.neural_network_model import NeuralNetworkModel
class LSTMModel(NeuralNetworkModel):
@@ -50,7 +50,9 @@ class LSTMModel(NeuralNetworkModel):
)
model.compile(
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
)
return model
@@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from research.traditional_model import TraditionalModel
from ners.research.traditional_model import TraditionalModel
class NaiveBayesModel(TraditionalModel):
@@ -6,7 +6,7 @@ from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from research.traditional_model import TraditionalModel
from ners.research.traditional_model import TraditionalModel
class RandomForestModel(TraditionalModel):
@@ -18,7 +18,6 @@ class RandomForestModel(TraditionalModel):
self.label_encoders: Dict[str, LabelEncoder] = {}
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Tree ensemble is robust to mixed numeric/categorical encodings; parallelize
@@ -56,10 +55,14 @@ class RandomForestModel(TraditionalModel):
column_clean = column.fillna("unknown").astype(str)
known_classes = set(encoder.classes_)
default_class = (
"unknown" if "unknown" in known_classes else encoder.classes_[0]
"unknown"
if "unknown" in known_classes
else encoder.classes_[0]
)
column_mapped = column_clean.apply(
lambda value: value if value in known_classes else default_class
lambda value: value
if value in known_classes
else default_class
)
encoded = encoder.transform(column_mapped)
@@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from research.traditional_model import TraditionalModel
from ners.research.traditional_model import TraditionalModel
class SVMModel(TraditionalModel):
@@ -16,7 +16,7 @@ from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from research.neural_network_model import NeuralNetworkModel
from ners.research.neural_network_model import NeuralNetworkModel
class TransformerModel(NeuralNetworkModel):
@@ -37,7 +37,8 @@ class TransformerModel(NeuralNetworkModel):
# Add positional encoding
positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
pos_embedding = Embedding(
input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64)
input_dim=params.get("max_len", 8),
output_dim=params.get("embedding_dim", 64),
)(positions)
x = x + pos_embedding
@@ -49,7 +50,9 @@ class TransformerModel(NeuralNetworkModel):
model = Model(inputs, outputs)
model.compile(
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
optimizer="adam",
loss="sparse_categorical_crossentropy",
metrics=["accuracy"],
)
return model
@@ -62,11 +65,15 @@ class TransformerModel(NeuralNetworkModel):
key_dim=cfg_params.get("transformer_head_size", 64),
dropout=cfg_params.get("attn_dropout", 0.1),
)(x, x)
x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(attn))
x = LayerNormalization(epsilon=1e-6)(
x + Dropout(cfg_params.get("dropout", 0.1))(attn)
)
ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
ff = Dense(x.shape[-1])(ff)
return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(ff))
return LayerNormalization(epsilon=1e-6)(
x + Dropout(cfg_params.get("dropout", 0.1))(ff)
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = self._collect_text_corpus(X)
@@ -5,7 +5,7 @@ from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from research.traditional_model import TraditionalModel
from ners.research.traditional_model import TraditionalModel
class XGBoostModel(TraditionalModel):
@@ -106,7 +106,9 @@ class XGBoostModel(TraditionalModel):
lambda x: x if x in known_classes else default_class
)
encoded = self.label_encoders[feature_key].transform(column_mapped)
encoded = self.label_encoders[feature_key].transform(
column_mapped
)
features.append(encoded.reshape(-1, 1))
@@ -10,8 +10,10 @@ from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from research.base_model import BaseModel
from research.experiment.feature_extractor import FeatureExtractor
import tensorflow as tf
from ners.research.base_model import BaseModel
from ners.research.experiment.feature_extractor import FeatureExtractor
class NeuralNetworkModel(BaseModel):
@@ -34,8 +36,6 @@ class NeuralNetworkModel(BaseModel):
# - Enables memory growth to avoid pre-allocating all VRAM
# - Optionally enables mixed precision if requested via model params
try:
import tensorflow as tf # Imported lazily to avoid dependency for non-NN runs
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
@@ -49,15 +49,15 @@ class NeuralNetworkModel(BaseModel):
if enable_mixed:
try:
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy("mixed_float16")
tf.keras.mixed_precision.set_global_policy("mixed_float16")
logging.info("Enabled TensorFlow mixed precision (float16)")
except Exception as e:
logging.warning(f"Could not enable mixed precision: {e}")
else:
if requested_gpu:
logging.warning("Requested GPU but no TensorFlow GPU device is available.")
logging.warning(
"Requested GPU but no TensorFlow GPU device is available."
)
except Exception as e:
# Keep silent in non-TF environments / non-NN workflows
logging.debug(f"TensorFlow GPU setup skipped: {e}")
@@ -86,7 +86,9 @@ class NeuralNetworkModel(BaseModel):
logging.info(f"Vocabulary size: {vocab_size}")
# Get additional model parameters
self.model = self.build_model_with_vocab(vocab_size=vocab_size, **self.config.model_params)
self.model = self.build_model_with_vocab(
vocab_size=vocab_size, **self.config.model_params
)
# Train the neural network
logging.info(
@@ -143,7 +145,7 @@ class NeuralNetworkModel(BaseModel):
# Keep zeros (padding) untouched; clamp negatives and > max_idx to OOV
invalid_mask = (arr < 0) | (arr > max_idx)
# Avoid turning zeros into OOV
invalid_mask &= (arr != 0)
invalid_mask &= arr != 0
if invalid_mask.any():
arr[invalid_mask] = oov_index
@@ -157,10 +159,14 @@ class NeuralNetworkModel(BaseModel):
"""Combine configured textual features into one string per record."""
column_names = [
feature.value for feature in self.config.features if feature.value in X.columns
feature.value
for feature in self.config.features
if feature.value in X.columns
]
if not column_names:
raise ValueError("No configured text features found in the provided DataFrame.")
raise ValueError(
"No configured text features found in the provided DataFrame."
)
text_frame = X[column_names].fillna("").astype(str)
@@ -193,9 +199,7 @@ class NeuralNetworkModel(BaseModel):
pass
if enable_mixed:
try:
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy("mixed_float16")
tf.keras.mixed_precision.set_global_policy("mixed_float16")
except Exception:
pass
else:
@@ -208,7 +212,9 @@ class NeuralNetworkModel(BaseModel):
X_prepared = self._sanitize_sequences(X_prepared)
y_encoded = self.label_encoder.transform(y)
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed)
cv = StratifiedKFold(
n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
)
accuracies = []
precisions = []
@@ -280,14 +286,14 @@ class NeuralNetworkModel(BaseModel):
pass
if enable_mixed:
try:
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy("mixed_float16")
tf.keras.mixed_precision.set_global_policy("mixed_float16")
except Exception:
pass
else:
if requested_gpu:
logging.warning("Requested GPU for learning curve but none is available.")
logging.warning(
"Requested GPU for learning curve but none is available."
)
except Exception:
pass
@@ -342,7 +348,7 @@ class NeuralNetworkModel(BaseModel):
# Train model
if hasattr(model, "fit"):
history = model.fit(
model.fit(
X_train_subset,
y_train_subset,
epochs=self.config.model_params.get("epochs", 10),
@@ -3,12 +3,16 @@ import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from research.statistics.utils import LETTERS, build_letter_frequencies
from ners.research.statistics.utils import LETTERS, build_letter_frequencies
def plot_transition_matrix(ax, df_probs, title=""):
hm = sns.heatmap(
df_probs.loc[list(LETTERS), list(LETTERS)], cmap="Reds", annot=False, cbar=False, ax=ax
df_probs.loc[list(LETTERS), list(LETTERS)],
cmap="Reds",
annot=False,
cbar=False,
ax=ax,
)
ax.set_title(title, fontsize=12)
return hm
@@ -31,8 +35,12 @@ def plot_letter_frequencies(males, females, sort_values=False, title=None):
x = np.arange(len(df_plot))
w = 0.4
fig, ax = plt.subplots(figsize=(16, 6))
ax.bar(x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8)
ax.bar(x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8)
ax.bar(
x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8
)
ax.bar(
x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8
)
ax.set_xticks(x)
ax.set_xticklabels(df_plot["letter"])
@@ -5,8 +5,6 @@ import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean
from scipy.stats import entropy
from scipy.spatial.distance import euclidean
from scipy.stats import entropy
from typing import Dict, Any
LETTERS = "abcdefghijklmnopqrstuvwxyz"
@@ -49,7 +47,12 @@ def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFram
def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
# Normalize: lowercase, remove non-letters, concatenate all into one string
s = series.astype(str).str.lower().str.replace(r"[^a-z]", "", regex=True).str.cat(sep="")
s = (
series.astype(str)
.str.lower()
.str.replace(r"[^a-z]", "", regex=True)
.str.cat(sep="")
)
# Convert string into Series of characters
chars = pd.Series(list(s))
@@ -150,8 +153,12 @@ def build_transition_comparisons(
kl_names_mf = entropy(prepared_names["m"] + 1e-12, prepared_names["f"] + 1e-12)
kl_names_fm = entropy(prepared_names["f"] + 1e-12, prepared_names["m"] + 1e-12)
kl_surnames_mf = entropy(prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12)
kl_surnames_fm = entropy(prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12)
kl_surnames_mf = entropy(
prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12
)
kl_surnames_fm = entropy(
prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12
)
jsd_names = 0.5 * (kl_names_mf + kl_names_fm)
jsd_surnames = 0.5 * (kl_surnames_mf + kl_surnames_fm)
@@ -163,7 +170,9 @@ def build_transition_comparisons(
P_f = transitions["f"]["probs"].flatten()
# Calculate the observed JSD (our test statistic)
observed_jsd = 0.5 * (entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12))
observed_jsd = 0.5 * (
entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12)
)
# Concatenate male and female counts
counts_m = transitions["m"]["counts"]
@@ -194,10 +203,12 @@ def build_transition_comparisons(
permuted_jsd = 0.5 * (
entropy(
permuted_probs_m.mean(axis=1) + 1e-12, permuted_probs_f.mean(axis=1) + 1e-12
permuted_probs_m.mean(axis=1) + 1e-12,
permuted_probs_f.mean(axis=1) + 1e-12,
)
+ entropy(
permuted_probs_f.mean(axis=1) + 1e-12, permuted_probs_m.mean(axis=1) + 1e-12
permuted_probs_f.mean(axis=1) + 1e-12,
permuted_probs_m.mean(axis=1) + 1e-12,
)
)
permuted_jsds.append(permuted_jsd)
@@ -8,8 +8,8 @@ from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import LabelEncoder
from research.base_model import BaseModel
from research.experiment.feature_extractor import FeatureExtractor
from ners.research.base_model import BaseModel
from ners.research.experiment.feature_extractor import FeatureExtractor
class TraditionalModel(BaseModel):
@@ -52,7 +52,9 @@ class TraditionalModel(BaseModel):
# Train model
if len(X_prepared.shape) == 1:
# For text-based features (like LogisticRegression with vectorization)
logging.info(f"Fitting model with {X_prepared.shape[0]} samples (text features)")
logging.info(
f"Fitting model with {X_prepared.shape[0]} samples (text features)"
)
else:
# For numerical features
logging.info(
@@ -74,12 +76,16 @@ class TraditionalModel(BaseModel):
return self
def cross_validate(self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> Dict[str, float]:
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> Dict[str, float]:
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
y_encoded = self.label_encoder.transform(y)
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed)
cv = StratifiedKFold(
n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
)
# Calculate different metrics
results = {}
@@ -95,7 +101,11 @@ class TraditionalModel(BaseModel):
for metric in ["precision", "recall", "f1"]:
if metric in self.config.metrics:
scores = cross_val_score(
self.model, X_prepared, y_encoded, cv=cv, scoring=f"{metric}_weighted"
self.model,
X_prepared,
y_encoded,
cv=cv,
scoring=f"{metric}_weighted",
)
results[metric] = scores.mean()
results[f"{metric}_std"] = scores.std()
+46
View File
@@ -0,0 +1,46 @@
#!.venv/bin/python3
import logging
import traceback
from ners.core.config import setup_config
from ners.research.experiment.experiment_builder import ExperimentBuilder
from ners.research.model_trainer import ModelTrainer
def train_from_template(
    name: str,
    type: str,
    *,
    templates: str = "research_templates.yaml",
    config: str | None = None,
    env: str = "development",
) -> int:
    """Train a single model from a named experiment template.

    Args:
        name: Experiment name to look up in the templates file.
        type: Template category (e.g. "baseline", "advanced").
        templates: YAML file containing the research templates.
        config: Optional path to a pipeline configuration file.
        env: Environment name passed to the configuration loader.

    Returns:
        Process-style exit code: 0 on success, 1 on any failure.
    """
    try:
        cfg = setup_config(config_path=config, env=env)
        experiment_builder = ExperimentBuilder(cfg)

        # Lazy %-style args: interpolation happens only if the record is emitted.
        logging.info("Loading research templates from: %s", templates)
        tmpl = experiment_builder.load_templates(templates)

        logging.info("Looking for experiment: name='%s', type='%s'", name, type)
        experiment_config = experiment_builder.find_template(tmpl, name, type)

        logging.info("Found experiment: %s", experiment_config.get("name"))
        logging.info("Description: %s", experiment_config.get("description"))
        logging.info("Features: %s", experiment_config.get("features"))

        trainer = ModelTrainer(cfg)
        trainer.train_single_model(
            model_name=experiment_config.get("name"),
            model_type=experiment_config.get("model_type"),
            features=experiment_config.get("features"),
            model_params=experiment_config.get("model_params", {}),
            tags=experiment_config.get("tags", []),
        )
        logging.info("Training completed successfully!")
        return 0
    except Exception as e:
        # CLI boundary: report the failure and convert it to an exit code.
        logging.error("Training failed: %s", e)
        traceback.print_exc()
        return 1
+12 -28
View File
@@ -1,19 +1,13 @@
#!.venv/bin/python3
import argparse
import sys
from pathlib import Path
import os
import streamlit as st
# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent
sys.path.insert(0, str(parent_dir))
from core.config import setup_config, PipelineConfig
from core.utils.data_loader import DataLoader
from processing.monitoring.pipeline_monitor import PipelineMonitor
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
from ners.core.config import setup_config, PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
# Page configuration
st.set_page_config(
@@ -65,19 +59,9 @@ class StreamlitApp:
)
def main():
parser = argparse.ArgumentParser(
description="DRC NERS Platform",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument("--config", type=str, help="Path to configuration file")
parser.add_argument("--env", type=str, default="development", help="Environment name")
args = parser.parse_args()
config = setup_config(args.config, env=args.env)
app = StreamlitApp(config)
app.run()
if __name__ == "__main__":
main()
# Initialize app using environment variables when launched via Typer
_config_path = os.environ.get("NERS_CONFIG")
_env = os.environ.get("NERS_ENV", "development")
_cfg = setup_config(_config_path, env=_env)
_app = StreamlitApp(_cfg)
_app.run()
@@ -1,7 +1,7 @@
import pandas as pd
import streamlit as st
from core.utils.data_loader import OPTIMIZED_DTYPES
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
@st.cache_data
@@ -25,7 +25,9 @@ class Dashboard:
# Load basic statistics
try:
data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
data_path = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
if data_path.exists():
df = load_dataset(str(data_path))
@@ -37,13 +39,17 @@ class Dashboard:
st.metric("Annotated Names", f"{annotated:,}")
with col3:
provinces = df["province"].nunique() if "province" in df.columns else 0
provinces = (
df["province"].nunique() if "province" in df.columns else 0
)
st.metric("Provinces", provinces)
with col4:
if "sex" in df.columns:
gender_dist = df["sex"].value_counts()
ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
ratio = gender_dist.get("f", 0) / max(
gender_dist.get("m", 1), 1
)
st.metric("F/M Rate", f"{ratio:.2%}")
with col5:
if "annotated" in df.columns:
@@ -79,4 +85,6 @@ class Dashboard:
st.dataframe(pd.DataFrame(exp_data), use_container_width=True)
else:
st.info("No experiments found. Create your first experiment in the Experiments tab!")
st.info(
"No experiments found. Create your first experiment in the Experiments tab!"
)
+52
View File
@@ -0,0 +1,52 @@
from datetime import datetime
import pandas as pd
import streamlit as st
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
    """Read the CSV at *file_path* with optimized dtypes.

    On any read failure the error is surfaced in the Streamlit UI and an
    empty DataFrame is returned so the page keeps rendering.
    """
    try:
        frame = pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
    except Exception as exc:
        st.error(f"Error loading dataset: {exc}")
        return pd.DataFrame()
    return frame
class DataOverview:
    """Streamlit page summarizing the configured data files."""

    def __init__(self, config):
        # Pipeline configuration providing data paths and output file names.
        self.config = config

    def index(self):
        """Render the page: an inventory of data files plus a preview.

        Lists each configured file with size and modification time, then
        previews the featured dataset when it exists on disk.
        """
        st.title("Data Overview")

        # Display name -> path relative to the configured data directory.
        data_files = {
            "Names": self.config.data.input_file,
            "Featured Dataset": self.config.data.output_files["featured"],
            "Evaluation Dataset": self.config.data.output_files["evaluation"],
            "Male Names": self.config.data.output_files["males"],
            "Female Names": self.config.data.output_files["females"],
        }

        st.write("Available Data Files:")
        for name, rel_path in data_files.items():
            file_path = self.config.paths.get_data_path(rel_path)
            # EAFP with a single stat() call: avoids the exists()/stat()
            # race and the duplicate stat() of the original LBYL version.
            try:
                info = file_path.stat()
                stats = (
                    f"Size: {info.st_size / (1024 * 1024):.1f} MB, "
                    f"Last Modified: {datetime.fromtimestamp(info.st_mtime)}"
                )
            except OSError:
                stats = "Not found"
            st.write(f"- {name}: {file_path} ({stats})")

        # Preview featured dataset if available
        data_path = self.config.paths.get_data_path(
            self.config.data.output_files["featured"]
        )
        if data_path.exists():
            df = load_dataset(str(data_path))
            st.subheader("Featured Dataset Preview")
            st.dataframe(df.head(), use_container_width=True)
            st.write(f"Rows: {len(df):,}")
@@ -2,8 +2,8 @@ import pandas as pd
import plotly.express as px
import streamlit as st
from core.utils.data_loader import OPTIMIZED_DTYPES
from web.interfaces.log_reader import LogReader
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
from ners.web.interfaces.log_reader import LogReader
@st.cache_data
@@ -31,7 +31,9 @@ class DataProcessing:
# Step details
for step_name, step_status in status["steps"].items():
with st.expander(f"{step_name.replace('_', ' ').title()} - {step_status['status']}"):
with st.expander(
f"{step_name.replace('_', ' ').title()} - {step_status['status']}"
):
col1, col2, col3 = st.columns(3)
with col1:
@@ -63,14 +65,20 @@ class DataProcessing:
with col2:
num_entries = st.number_input(
"Number of entries", min_value=5, max_value=50, value=10, key="num_log_entries"
"Number of entries",
min_value=5,
max_value=50,
value=10,
key="num_log_entries",
)
# Get log entries based on filter
if log_level_filter == "All":
log_entries = log_reader.read_last_entries(num_entries)
else:
log_entries = log_reader.read_entries_by_level(log_level_filter, num_entries)
log_entries = log_reader.read_entries_by_level(
log_level_filter, num_entries
)
if log_entries:
for entry in log_entries:
@@ -2,13 +2,13 @@ from typing import List, Dict
import streamlit as st
from core.config.pipeline_config import PipelineConfig
from research.experiment import ExperimentConfig, ExperimentStatus
from research.experiment.experiment_builder import ExperimentBuilder
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
from research.experiment.feature_extractor import FeatureType
from research.model_registry import list_available_models
from ners.core.config.pipeline_config import PipelineConfig
from ners.research.experiment import ExperimentConfig, ExperimentStatus
from ners.research.experiment.experiment_builder import ExperimentBuilder
from ners.research.experiment.experiment_runner import ExperimentRunner
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.experiment.feature_extractor import FeatureType
from ners.research.model_registry import list_available_models
class Experiments:
@@ -46,13 +46,19 @@ class Experiments:
available_experiments = self.experiment_builder.get_templates()
# Create tabs for different experiment types
exp_tabs = st.tabs(["Baseline", "Advanced", "Feature Studies", "Hyperparameter Tuning"])
exp_tabs = st.tabs(
["Baseline", "Advanced", "Feature Studies", "Hyperparameter Tuning"]
)
with exp_tabs[0]:
self._show_experiments_by_type(available_experiments["baseline"], "baseline")
self._show_experiments_by_type(
available_experiments["baseline"], "baseline"
)
with exp_tabs[1]:
self._show_experiments_by_type(available_experiments["advanced"], "advanced")
self._show_experiments_by_type(
available_experiments["advanced"], "advanced"
)
with exp_tabs[2]:
self._show_experiments_by_type(
@@ -60,7 +66,9 @@ class Experiments:
)
with exp_tabs[3]:
self._show_experiments_by_type(available_experiments["tuning"], "tuning")
self._show_experiments_by_type(
available_experiments["tuning"], "tuning"
)
except Exception as e:
st.error(f"Error loading experiment templates: {e}")
@@ -79,7 +87,9 @@ class Experiments:
# Show available experiments
for i, exp_template in enumerate(experiments):
exp_name = exp_template.get("name", f"Experiment {i + 1}")
exp_description = exp_template.get("description", "No description available")
exp_description = exp_template.get(
"description", "No description available"
)
with st.expander(f"📊 {exp_name} - {exp_description}"):
col1, col2 = st.columns([2, 1])
@@ -88,7 +98,7 @@ class Experiments:
st.json(exp_template)
with col2:
if st.button(f"🚀 Run Experiment", key=f"run_{experiment_type}_{i}"):
if st.button("🚀 Run Experiment", key=f"run_{experiment_type}_{i}"):
self._run_template_experiment(exp_template)
def _run_template_experiment(self, exp_template: Dict):
@@ -100,7 +110,9 @@ class Experiments:
# Run the experiment
experiment_id = self.experiment_runner.run_experiment(experiment_config)
st.success(f"Experiment '{experiment_config.name}' completed successfully!")
st.success(
f"Experiment '{experiment_config.name}' completed successfully!"
)
st.info(f"Experiment ID: `{experiment_id}`")
# Show results
@@ -130,13 +142,17 @@ class Experiments:
)
with col2:
model_filter = st.selectbox("Filter by Model", ["All"] + list_available_models())
model_filter = st.selectbox(
"Filter by Model", ["All"] + list_available_models()
)
with col3:
tag_filter = st.text_input("Filter by Tags (comma-separated)")
# Get and filter experiments
experiments = self._get_filtered_experiments(status_filter, model_filter, tag_filter)
experiments = self._get_filtered_experiments(
status_filter, model_filter, tag_filter
)
if not experiments:
st.info("No experiments found matching the filters.")
@@ -149,20 +165,28 @@ class Experiments:
):
self._display_experiment_details(exp, i)
def _get_filtered_experiments(self, status_filter: str, model_filter: str, tag_filter: str):
def _get_filtered_experiments(
self, status_filter: str, model_filter: str, tag_filter: str
):
"""Get experiments with applied filters"""
experiments = self.experiment_tracker.list_experiments()
# Apply filters
if status_filter != "All":
experiments = [e for e in experiments if e.status == ExperimentStatus(status_filter)]
experiments = [
e for e in experiments if e.status == ExperimentStatus(status_filter)
]
if model_filter != "All":
experiments = [e for e in experiments if e.config.model_type == model_filter]
experiments = [
e for e in experiments if e.config.model_type == model_filter
]
if tag_filter:
tags = [tag.strip() for tag in tag_filter.split(",")]
experiments = [e for e in experiments if any(tag in e.config.tags for tag in tags)]
experiments = [
e for e in experiments if any(tag in e.config.tags for tag in tags)
]
return experiments
@@ -173,7 +197,9 @@ class Experiments:
with col1:
st.write(f"**Model:** {exp.config.model_type}")
st.write(f"**Features:** {', '.join([f.value for f in exp.config.features])}")
st.write(
f"**Features:** {', '.join([f.value for f in exp.config.features])}"
)
st.write(f"**Tags:** {', '.join(exp.config.tags)}")
with col2:
@@ -185,7 +211,7 @@ class Experiments:
st.write(f"**Train Size:** {exp.train_size:,}")
st.write(f"**Test Size:** {exp.test_size:,}")
if st.button(f"View Details", key=f"details_{index}"):
if st.button("View Details", key=f"details_{index}"):
st.session_state.selected_experiment = exp.experiment_id
st.rerun()
@@ -198,7 +224,9 @@ class Experiments:
st.write("Run multiple experiments with different parameter combinations.")
# Add option to run template batch experiments
batch_type = st.radio("Batch Type", ["Template Batch", "Custom Parameter Sweep"])
batch_type = st.radio(
"Batch Type", ["Template Batch", "Custom Parameter Sweep"]
)
if batch_type == "Template Batch":
self._show_template_batch_experiments()
@@ -227,10 +255,13 @@ class Experiments:
if experiments:
st.write(f"**{exp_type.title()} Experiments:**")
exp_names = [
exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)
exp.get("name", f"Exp {i}")
for i, exp in enumerate(experiments)
]
selected_names = st.multiselect(
f"Select {exp_type} experiments", exp_names, key=f"select_{exp_type}"
f"Select {exp_type} experiments",
exp_names,
key=f"select_{exp_type}",
)
for name in selected_names:
@@ -258,13 +289,17 @@ class Experiments:
experiment_configs.append(config)
# Run batch experiments
experiment_ids = self.experiment_runner.run_experiment_batch(experiment_configs)
experiment_ids = self.experiment_runner.run_experiment_batch(
experiment_configs
)
st.success(f"Completed {len(experiment_ids)} template experiments!")
# Show summary
if experiment_ids:
comparison = self.experiment_runner.compare_experiments(experiment_ids)
comparison = self.experiment_runner.compare_experiments(
experiment_ids
)
st.write("**Template Batch Results:**")
st.dataframe(
comparison[["name", "model_type", "test_accuracy"]],
@@ -285,7 +320,9 @@ class Experiments:
with col1:
base_name = st.text_input("Base Experiment Name", "parameter_sweep")
model_types = st.multiselect(
"Model Types", list_available_models(), default=["logistic_regression"]
"Model Types",
list_available_models(),
default=["logistic_regression"],
)
# N-gram ranges for logistic regression
@@ -301,13 +338,20 @@ class Experiments:
default=["full_name", "native_name", "surname"],
)
test_sizes = st.text_input("Test Sizes (comma-separated)", "0.15,0.2,0.25")
test_sizes = st.text_input(
"Test Sizes (comma-separated)", "0.15,0.2,0.25"
)
tags = st.text_input("Common Tags", "parameter_sweep,batch")
if st.form_submit_button("🚀 Run Parameter Sweep"):
self.run_batch_experiments(
base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags
base_name,
model_types,
ngram_ranges,
feature_combinations,
test_sizes,
tags,
)
def run_batch_experiments(
@@ -369,13 +413,17 @@ class Experiments:
exp_count += 1
# Run experiments
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
experiment_ids = self.experiment_runner.run_experiment_batch(
experiments
)
st.success(f"Completed {len(experiment_ids)} batch experiments")
# Show summary
if experiment_ids:
comparison = self.experiment_runner.compare_experiments(experiment_ids)
comparison = self.experiment_runner.compare_experiments(
experiment_ids
)
st.write("**Batch Results Summary:**")
st.dataframe(
comparison[["name", "model_type", "test_accuracy"]],
+80
View File
@@ -0,0 +1,80 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List
@dataclass
class LogEntry:
    """One parsed line of the application log."""

    timestamp: datetime  # when the record was emitted
    level: str  # logging level name, e.g. "INFO"
    message: str  # remainder of the line after the level


class LogReader:
    """Lightweight reader for the application's text log file.

    Expects lines of the form ``[<ISO timestamp>] - LEVEL - message``;
    lines that do not match are silently skipped. Files are read as UTF-8
    with replacement so a corrupt byte cannot crash the log viewer.
    """

    def __init__(self, log_file_path: Path):
        # Accept str or Path; normalize to Path once.
        self.log_file_path = Path(log_file_path)

    def read_last_entries(self, num_entries: int = 20) -> List[LogEntry]:
        """Return the parseable entries among the last *num_entries* lines."""
        if not self.log_file_path.exists():
            return []
        # Encoding pinned so results do not depend on the host locale (PEP 597).
        with open(self.log_file_path, "r", encoding="utf-8", errors="replace") as f:
            lines = f.readlines()[-num_entries:]
        entries = []
        for line in lines:
            entry = self._parse_log_line(line)
            if entry:
                entries.append(entry)
        return entries

    def read_entries_by_level(
        self, level: str, num_entries: int = 20
    ) -> List[LogEntry]:
        """Return up to *num_entries* newest entries with exactly *level*, oldest first."""
        entries: List[LogEntry] = []
        if not self.log_file_path.exists():
            return entries
        with open(self.log_file_path, "r", encoding="utf-8", errors="replace") as f:
            # Scan newest-first so we can stop as soon as we have enough.
            for line in reversed(f.readlines()):
                entry = self._parse_log_line(line)
                if entry and entry.level == level:
                    entries.append(entry)
                    if len(entries) >= num_entries:
                        break
        # Restore chronological order for display.
        return list(reversed(entries))

    def get_log_stats(self) -> dict:
        """Return ``{"total_lines": n, <LEVEL>: count, ...}``; ``{}`` if the file is missing."""
        if not self.log_file_path.exists():
            return {}
        stats = {"total_lines": 0}
        with open(self.log_file_path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                stats["total_lines"] += 1
                entry = self._parse_log_line(line)
                if entry:
                    stats[entry.level] = stats.get(entry.level, 0) + 1
        return stats

    @staticmethod
    def _parse_log_line(line: str) -> LogEntry | None:
        """Parse one log line; return None for lines that do not match the format."""
        try:
            # Expected format from logging config: [timestamp] - LEVEL - message
            parts = line.strip().split(" - ")
            if len(parts) >= 3:
                timestamp = datetime.fromisoformat(parts[0].strip("[]"))
                level = parts[1].strip()
                # The message itself may contain " - ", so re-join the tail.
                message = " - ".join(parts[2:])
                return LogEntry(timestamp, level, message)
        except Exception:
            # Malformed timestamp or similar: treat as a non-log line.
            return None
        return None
@@ -1,10 +1,8 @@
from pathlib import Path
import streamlit as st
from spacy import displacy
from core.config import PipelineConfig
from processing.ner.name_model import NameModel
from ners.core.config import PipelineConfig
from ners.processing.ner.name_model import NameModel
class NERTesting:
@@ -56,12 +54,15 @@ class NERTesting:
with col1:
st.metric(
"Training Examples", f"{self.training_stats.get('training_examples', 0):,}"
"Training Examples",
f"{self.training_stats.get('training_examples', 0):,}",
)
with col2:
st.metric("Epochs", self.training_stats.get("epochs", 0))
with col3:
st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}")
st.metric(
"Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}"
)
with col4:
st.metric("Batch Size", f"{self.training_stats.get('batch_size', 0):,}")
@@ -99,7 +100,9 @@ class NERTesting:
if names_input.strip():
if st.button("Analyze All Names", type="primary"):
names = [name.strip() for name in names_input.split("\n") if name.strip()]
names = [
name.strip() for name in names_input.split("\n") if name.strip()
]
for i, name in enumerate(names):
st.markdown(f"**Name {i + 1}: {name}**")
self.analyze_and_display(name)
@@ -127,7 +130,9 @@ class NERTesting:
else:
st.warning("No entities detected in the input text.")
st.info("Try using traditional Congolese names or ensure the spelling is correct.")
st.info(
"Try using traditional Congolese names or ensure the spelling is correct."
)
except Exception as e:
st.error(f"Error analyzing text: {e}")
@@ -139,14 +144,21 @@ class NERTesting:
ents = []
for entity in entities:
ents.append(
{"start": entity["start"], "end": entity["end"], "label": entity["label"]}
{
"start": entity["start"],
"end": entity["end"],
"label": entity["label"],
}
)
# Create doc-like structure for displacy
doc_data = {"text": text, "ents": ents, "title": None}
# Custom colors for our labels
colors = {"NATIVE": "#74C0FC", "SURNAME": "#69DB7C"} # Light blue # Light green
colors = {
"NATIVE": "#74C0FC",
"SURNAME": "#69DB7C",
} # Light blue # Light green
options = {"colors": colors, "distance": 90}

Some files were not shown because too many files have changed in this diff Show More