refactoring: uv
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
.git
|
||||
.gitignore
|
||||
.idea
|
||||
.vscode
|
||||
__pycache__
|
||||
.ruff_cache
|
||||
.venv
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
*.swp
|
||||
*.swo
|
||||
*.DS_Store
|
||||
dist
|
||||
build
|
||||
*.egg-info
|
||||
@@ -0,0 +1 @@
|
||||
3.11
|
||||
+49
@@ -0,0 +1,49 @@
|
||||
# syntax=docker/dockerfile:1
|
||||
|
||||
# Minimal Linux base (glibc) – Python will be installed by uv
|
||||
FROM debian:bookworm-slim
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
UV_INSTALL_DIR=/usr/local/bin \
|
||||
UV_LINK_MODE=copy \
|
||||
UV_PYTHON_DOWNLOADS=1 \
|
||||
UV_PROJECT_ENVIRONMENT=/app/.venv \
|
||||
PATH=/app/.venv/bin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# System deps for building/using common scientific stack
|
||||
# Keep minimal; rely on wheels where possible
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ca-certificates curl git \
|
||||
build-essential pkg-config \
|
||||
libssl-dev libffi-dev \
|
||||
libopenblas0 libstdc++6 \
|
||||
libfreetype6 libpng16-16 libjpeg62-turbo \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install uv (static binary)
|
||||
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
# Copy project metadata first for layer caching
|
||||
COPY pyproject.toml README.md ./
|
||||
|
||||
# Install a managed Python via uv and create the project venv
|
||||
RUN uv python install 3.11 \
|
||||
&& uv venv /app/.venv --python 3.11
|
||||
|
||||
# Resolve and install runtime deps into project venv
|
||||
# Use lockfile if present for reproducibility
|
||||
RUN if [ -f uv.lock ]; then uv sync --no-dev --no-install-project --frozen; else uv sync --no-dev --no-install-project; fi
|
||||
|
||||
# Copy source code and optional templates
|
||||
COPY src ./src
|
||||
|
||||
# Re-sync to ensure the local package is installed
|
||||
RUN uv sync --no-dev \
|
||||
&& rm -rf /root/.cache
|
||||
|
||||
# Default command shows help; override in compose or docker run
|
||||
CMD ["ners", "--help"]
|
||||
@@ -10,37 +10,23 @@ million names from the Democratic Republic of Congo (DRC) annotated with gender
|
||||
|
||||
### Installation & Setup
|
||||
|
||||
Instructions and command line snippets below are provided to help you set up the project environment quickly and
|
||||
efficiently.
|
||||
Assuming you have Python 3.11 and Git installed and working on a Unix-like system (Linux, macOS, etc.).
|
||||
|
||||
**Using Makefile (Recommended)**
|
||||
|
||||
**Unix based**
|
||||
```bash
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
||||
cd drc-ners-nlp
|
||||
|
||||
# Setup environment
|
||||
make setup
|
||||
make activate
|
||||
uv sync
|
||||
```
|
||||
|
||||
**Manual Setup**
|
||||
|
||||
**macOS & Windows**
|
||||
```bash
|
||||
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
||||
cd drc-ners-nlp
|
||||
|
||||
# Setup environment
|
||||
python -m venv .venv
|
||||
.venv/bin/pip install --upgrade pip
|
||||
.venv/bin/pip install -r requirements.txt
|
||||
|
||||
pip install --upgrade pip
|
||||
pip install -r requirements.txt
|
||||
pip install jupyter notebook ipykernel pytest black flake8 mypy
|
||||
|
||||
source .venv/bin/activate
|
||||
docker compose build
|
||||
docker compose run --rm app
|
||||
docker compose run --rm app ners pipeline run --env=production
|
||||
docker compose run --rm app ners research train --name=lightgbm --type=baseline --env=production
|
||||
docker compose run --rm --service-ports app ners web run --env=production
|
||||
```
|
||||
|
||||
## Data Processing
|
||||
@@ -55,6 +41,7 @@ the `drc-ners-nlp/config/pipeline.yaml` file.
|
||||
```yaml
|
||||
stages:
|
||||
- "data_cleaning"
|
||||
- "data_selection"
|
||||
- "feature_extraction"
|
||||
- "data_splitting"
|
||||
```
|
||||
@@ -62,37 +49,7 @@ stages:
|
||||
**Running the Pipeline**
|
||||
|
||||
```bash
|
||||
python main.py --env production
|
||||
```
|
||||
|
||||
## NER Processing (Optional)
|
||||
|
||||
This project implements a custom named entity recognition (NER) pipeline tailored for Congolese names.
|
||||
Its main objective is to accurately identify and tag the different components of a Congolese name,
|
||||
specifically distinguishing between the native part and the surname.
|
||||
|
||||
```bash
|
||||
python ner.py --env production
|
||||
```
|
||||
|
||||
Once you've built and trained the NER model you can use it to annotate **COMPOSE** names in the original dataset
|
||||
|
||||
**Running the Pipeline with NER Annotation**
|
||||
```yaml
|
||||
stages:
|
||||
- "data_cleaning"
|
||||
- "feature_extraction"
|
||||
- "ner_annotation"
|
||||
- "data_splitting"
|
||||
```
|
||||
|
||||
**Running the Pipeline with LLM Annotation**
|
||||
```yaml
|
||||
stages:
|
||||
- "data_cleaning"
|
||||
- "feature_extraction"
|
||||
- "llm_annotation"
|
||||
- "data_splitting"
|
||||
uv run ners pipeline run --env="production"
|
||||
```
|
||||
|
||||
## Experiments
|
||||
@@ -105,54 +62,94 @@ you can define model features, training parameters, and evaluation metrics in th
|
||||
|
||||
```bash
|
||||
# bigru
|
||||
python train.py --name="bigru" --type="baseline" --env="production"
|
||||
python train.py --name="bigru_native" --type="baseline" --env="production"
|
||||
python train.py --name="bigru_surname" --type="baseline" --env="production"
|
||||
uv run ners research train --name="bigru" --type="baseline" --env="production"
|
||||
uv run ners research train --name="bigru_native" --type="baseline" --env="production"
|
||||
uv run ners research train --name="bigru_surname" --type="baseline" --env="production"
|
||||
|
||||
# cnn
|
||||
python train.py --name="cnn" --type="baseline" --env="production"
|
||||
python train.py --name="cnn_native" --type="baseline" --env="production"
|
||||
python train.py --name="cnn_surname" --type="baseline" --env="production"
|
||||
uv run ners research train --name="cnn" --type="baseline" --env="production"
|
||||
uv run ners research train --name="cnn_native" --type="baseline" --env="production"
|
||||
uv run ners research train --name="cnn_surname" --type="baseline" --env="production"
|
||||
|
||||
# lightgbm
|
||||
python train.py --name="lightgbm" --type="baseline" --env="production"
|
||||
python train.py --name="lightgbm_native" --type="baseline" --env="production"
|
||||
python train.py --name="lightgbm_surname" --type="baseline" --env="production"
|
||||
uv run ners research train --name="lightgbm" --type="baseline" --env="production"
|
||||
uv run ners research train --name="lightgbm_native" --type="baseline" --env="production"
|
||||
uv run ners research train --name="lightgbm_surname" --type="baseline" --env="production"
|
||||
|
||||
# logistic regression
|
||||
python train.py --name="logistic_regression" --type="baseline" --env="production"
|
||||
python train.py --name="logistic_regression_native" --type="baseline" --env="production"
|
||||
python train.py --name="logistic_regression_surname" --type="baseline" --env="production"
|
||||
uv run ners research train --name="logistic_regression" --type="baseline" --env="production"
|
||||
uv run ners research train --name="logistic_regression_native" --type="baseline" --env="production"
|
||||
uv run ners research train --name="logistic_regression_surname" --type="baseline" --env="production"
|
||||
|
||||
# lstm
|
||||
python train.py --name="lstm" --type="baseline" --env="production"
|
||||
python train.py --name="lstm_native" --type="baseline" --env="production"
|
||||
python train.py --name="lstm_surname" --type="baseline" --env="production"
|
||||
uv run ners research train --name="lstm" --type="baseline" --env="production"
|
||||
uv run ners research train --name="lstm_native" --type="baseline" --env="production"
|
||||
uv run ners research train --name="lstm_surname" --type="baseline" --env="production"
|
||||
|
||||
# random forest
|
||||
python train.py --name="random_forest" --type="baseline" --env="production"
|
||||
python train.py --name="random_forest_native" --type="baseline" --env="production"
|
||||
python train.py --name="random_forest_surname" --type="baseline" --env="production"
|
||||
uv run ners research train --name="random_forest" --type="baseline" --env="production"
|
||||
uv run ners research train --name="random_forest_native" --type="baseline" --env="production"
|
||||
uv run ners research train --name="random_forest_surname" --type="baseline" --env="production"
|
||||
|
||||
# svm
|
||||
python train.py --name="svm" --type="baseline" --env="production"
|
||||
python train.py --name="svm_native" --type="baseline" --env="production"
|
||||
python train.py --name="svm_surname" --type="baseline" --env="production"
|
||||
uv run ners research train --name="svm" --type="baseline" --env="production"
|
||||
uv run ners research train --name="svm_native" --type="baseline" --env="production"
|
||||
uv run ners research train --name="svm_surname" --type="baseline" --env="production"
|
||||
|
||||
# naive bayes
|
||||
python train.py --name="naive_bayes" --type="baseline" --env="production"
|
||||
python train.py --name="naive_bayes_native" --type="baseline" --env="production"
|
||||
python train.py --name="naive_bayes_surname" --type="baseline" --env="production"
|
||||
uv run ners research train --name="naive_bayes" --type="baseline" --env="production"
|
||||
uv run ners research train --name="naive_bayes_native" --type="baseline" --env="production"
|
||||
uv run ners research train --name="naive_bayes_surname" --type="baseline" --env="production"
|
||||
|
||||
# transformer
|
||||
python train.py --name="transformer" --type="baseline" --env="production"
|
||||
python train.py --name="transformer_native" --type="baseline" --env="production"
|
||||
python train.py --name="transformer_surname" --type="baseline" --env="production"
|
||||
uv run ners research train --name="transformer" --type="baseline" --env="production"
|
||||
uv run ners research train --name="transformer_native" --type="baseline" --env="production"
|
||||
uv run ners research train --name="transformer_surname" --type="baseline" --env="production"
|
||||
|
||||
# xgboost
|
||||
python train.py --name="xgboost" --type="baseline" --env="production"
|
||||
python train.py --name="xgboost_native" --type="baseline" --env="production"
|
||||
python train.py --name="xgboost_surname" --type="baseline" --env="production"
|
||||
uv run ners research train --name="xgboost" --type="baseline" --env="production"
|
||||
uv run ners research train --name="xgboost_native" --type="baseline" --env="production"
|
||||
uv run ners research train --name="xgboost_surname" --type="baseline" --env="production"
|
||||
```
|
||||
|
||||
## TensorFlow on macOS (Intel) with uv
|
||||
|
||||
TensorFlow no longer publishes wheels for macOS Intel. To keep using uv and run TF reliably, use a Linux container with TF preinstalled and install project code with minimal extras inside the container.
|
||||
|
||||
### One-time build
|
||||
|
||||
```bash
docker compose -f docker/compose.tf.yml build
```

If you see a message like `tensorflow/tensorflow:<tag>: not found`, update `docker/Dockerfile.tf-cpu` to a tag that exists (e.g., `2.17.0`) and rebuild:

```bash
sed -n '1,20p' docker/Dockerfile.tf-cpu  # verify the FROM line
docker pull tensorflow/tensorflow:2.17.0  # quick availability check
docker compose -f docker/compose.tf.yml build
```
|
||||
|
||||
### Start a shell with uv and TF available
|
||||
|
||||
```bash
|
||||
docker compose -f docker/compose.tf.yml run --rm tf bash
|
||||
```
|
||||
|
||||
Inside the container:
|
||||
|
||||
```bash
|
||||
# Install project in editable mode without pulling full deps
|
||||
uv pip install -e . --no-deps
|
||||
|
||||
# Install only what research needs alongside TensorFlow
|
||||
uv pip install typer pandas scikit-learn seaborn plotly
|
||||
|
||||
# Sanity check
|
||||
uv run python -c "import tensorflow as tf; print(tf.__version__)"
|
||||
|
||||
# Run an experiment
|
||||
uv run ners research train --name="lstm" --type="baseline" --env="production"
|
||||
```
|
||||
|
||||
## Web Interface
|
||||
@@ -163,60 +160,9 @@ experiments and make predictions without needing to understand the underlying co
|
||||
### Running the Web Interface
|
||||
|
||||
```bash
|
||||
streamlit run web/app.py
|
||||
uv run ners web run --env="production"
|
||||
```
|
||||
|
||||
## GPU Acceleration
|
||||
|
||||
This project can leverage GPUs for faster training when supported libraries and hardware are available.
|
||||
|
||||
- TensorFlow/Keras models (BiGRU, LSTM, CNN, Transformer)
|
||||
- Uses GPU automatically if a TensorFlow GPU build is installed.
|
||||
- The code enables safe GPU memory growth by default; optionally enable mixed precision for additional speed:
|
||||
- Add `mixed_precision: true` in the experiment `model_params` (e.g., in `config/research_templates.yaml`).
|
||||
- The final layer outputs are set to float32 for numerical stability under mixed precision.
|
||||
|
||||
- spaCy NER
|
||||
- Automatically prefers GPU if available; otherwise falls back to CPU.
|
||||
- Ensure a compatible CUDA-enabled spaCy/thinc stack is installed to use GPU.
|
||||
|
||||
- XGBoost
|
||||
- Enable GPU by adding to the experiment `model_params`:
|
||||
- `use_gpu: true` (sets `tree_method: gpu_hist` and `predictor: gpu_predictor`).
|
||||
|
||||
- LightGBM
|
||||
- Enable GPU by adding to the experiment `model_params`:
|
||||
- `use_gpu: true` (sets `device: gpu`). Optional: `gpu_platform_id`, `gpu_device_id`.
|
||||
|
||||
Example template snippet (GPU on):
|
||||
|
||||
```yaml
|
||||
- name: "lstm_gpu"
|
||||
description: "LSTM with GPU + mixed precision"
|
||||
model_type: "lstm"
|
||||
features: ["full_name"]
|
||||
model_params:
|
||||
embedding_dim: 128
|
||||
lstm_units: 64
|
||||
epochs: 5
|
||||
batch_size: 128
|
||||
use_gpu: true
|
||||
mixed_precision: true
|
||||
tags: ["gpu", "mixed_precision"]
|
||||
|
||||
- name: "xgboost_gpu"
|
||||
description: "XGBoost with GPU"
|
||||
model_type: "xgboost"
|
||||
features: ["full_name"]
|
||||
model_params:
|
||||
n_estimators: 200
|
||||
use_gpu: true
|
||||
```
|
||||
|
||||
Notes:
|
||||
- Install CUDA‑enabled binaries for TensorFlow/spaCy/LightGBM/XGBoost to actually use GPU.
|
||||
- If GPU is requested but not available, training will proceed on CPU with a warning.
|
||||
|
||||
## Contributors
|
||||
|
||||
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
|
||||
|
||||
+21
@@ -0,0 +1,21 @@
|
||||
services:
|
||||
app:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
image: drc-ners:uv
|
||||
working_dir: /app
|
||||
tty: true
|
||||
stdin_open: true
|
||||
environment:
|
||||
NERS_ENV: production
|
||||
STREAMLIT_SERVER_ADDRESS: 0.0.0.0
|
||||
# expose Streamlit for `ners web run`
|
||||
ports:
|
||||
- "8501:8501"
|
||||
volumes:
|
||||
- ./assets:/app/assets
|
||||
- ./config:/app/config
|
||||
- ./data:/app/data
|
||||
# default command shows CLI help; override per run
|
||||
command: ["ners", "--help"]
|
||||
-90
@@ -1,90 +0,0 @@
|
||||
#!.venv/bin/python3
|
||||
import argparse
|
||||
import sys
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
|
||||
from core.config import setup_config
|
||||
from processing.monitoring.pipeline_monitor import PipelineMonitor
|
||||
|
||||
|
||||
def main():
    """CLI entry point for pipeline checkpoint monitoring (clean/reset).

    Returns a process exit code: 0 on success or user cancellation,
    1 on missing command or failure.
    """
    choices = [
        "data_cleaning",
        "data_selection",
        "feature_extraction",
        "ner_annotation",
        "llm_annotation",
        "data_splitting",
    ]

    parser = argparse.ArgumentParser(description="DRC NERS Processing Monitoring")
    parser.add_argument("--config", type=Path, help="Path to configuration file")
    parser.add_argument("--env", type=str, default="development", help="Environment")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Clean command
    clean_parser = subparsers.add_parser("clean", help="Clean checkpoint files")
    clean_parser.add_argument("--step", type=str, choices=choices, help="default: all")
    clean_parser.add_argument("--keep-last", type=int, default=1, help="(default: 1)")
    clean_parser.add_argument("--force", action="store_true", help="Clean without confirmation")

    # Reset command
    reset_parser = subparsers.add_parser("reset", help="Reset pipeline step")
    reset_parser.add_argument("--step", type=str, choices=choices, help="(default: all)")
    reset_parser.add_argument("--all", action="store_true", help="Reset all steps")
    reset_parser.add_argument("--force", action="store_true", help="Reset without confirmation")
    args = parser.parse_args()

    try:
        setup_config(config_path=args.config, env=args.env)
        monitor = PipelineMonitor()

        if not args.command:
            # No subcommand: show help plus current status, signal misuse.
            parser.print_help()
            monitor.print_status(detailed=True)
            return 1

        if args.command == "clean":
            checkpoint_info = monitor.count_checkpoint_files()
            print(f"Current checkpoint storage: {checkpoint_info['total_size_mb']:.1f} MB")

            if not args.force:
                response = input("Are you sure you want to clean checkpoints? (y/N): ")
                if response.lower() != "y":
                    print("Cancelled")
                    return 0

            if args.step:
                monitor.clean_step_checkpoints(args.step, args.keep_last)
            else:
                for step in monitor.steps:
                    monitor.clean_step_checkpoints(step, args.keep_last)

            print("Checkpoint cleaning completed")

        elif args.command == "reset":
            if not args.force:
                response = input(
                    f"Are you sure you want to reset {args.step}? This will delete all checkpoints. (y/N): "
                )
                if response.lower() != "y":
                    print("Cancelled")
                    return 0

            if args.step:
                monitor.reset_step(args.step)
            else:
                for step in monitor.steps:
                    monitor.reset_step(step)

            # Plain string: original used an f-string with no placeholders.
            print("Reset completed")

        # Explicit success code instead of the implicit None fall-through
        # (sys.exit(None) also exits 0, so behavior is unchanged).
        return 0

    except Exception as e:
        print(f"Monitoring failed: {e}")
        traceback.print_exc()
        return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Vendored
-499
File diff suppressed because one or more lines are too long
@@ -0,0 +1,41 @@
|
||||
[project]
|
||||
name = "ners"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"geopandas>=1.1.1",
|
||||
"joblib>=1.5.2",
|
||||
"lightgbm>=4.6.0",
|
||||
"matplotlib>=3.10.6",
|
||||
"numpy>=2.3.3",
|
||||
"ollama>=0.6.0",
|
||||
"pandas>=2.3.3",
|
||||
"plotly>=6.3.1",
|
||||
"psutil>=7.1.0",
|
||||
"pydantic>=2.11.10",
|
||||
"pyyaml>=6.0.3",
|
||||
"scikit-learn>=1.7.2",
|
||||
"seaborn>=0.13.2",
|
||||
"spacy>=3.8.7",
|
||||
"streamlit>=1.50.0",
|
||||
"tqdm>=4.67.1",
|
||||
"typer>=0.19.2",
|
||||
"xgboost>=3.0.5",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
ners = "ners.cli:app"
|
||||
|
||||
[build-system]
|
||||
requires = ["uv_build>=0.8.12,<0.9.0"]
|
||||
build-backend = "uv_build"
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"ruff>=0.13.3",
|
||||
]
|
||||
|
||||
[tool.uv]
|
||||
required-environments = ["sys_platform == 'linux' and platform_machine == 'x86_64'"]
|
||||
@@ -1,170 +0,0 @@
|
||||
absl-py==2.3.0
|
||||
altair==5.1.2
|
||||
annotated-types==0.7.0
|
||||
anyio==4.9.0
|
||||
appnope==0.1.4
|
||||
argon2-cffi==25.1.0
|
||||
argon2-cffi-bindings==21.2.0
|
||||
arrow==1.3.0
|
||||
asttokens==3.0.0
|
||||
astunparse==1.6.3
|
||||
async-lru==2.0.5
|
||||
attrs==25.3.0
|
||||
babel==2.17.0
|
||||
beautifulsoup4==4.13.4
|
||||
black==25.1.0
|
||||
bleach==6.2.0
|
||||
blinker==1.9.0
|
||||
cachetools==6.1.0
|
||||
certifi==2025.6.15
|
||||
cffi==1.17.1
|
||||
charset-normalizer==3.4.2
|
||||
click==8.2.1
|
||||
comm==0.2.2
|
||||
contourpy==1.3.2
|
||||
cycler==0.12.1
|
||||
debugpy==1.8.14
|
||||
decorator==5.2.1
|
||||
defusedxml==0.7.1
|
||||
executing==2.2.0
|
||||
fastjsonschema==2.21.1
|
||||
flake8==7.3.0
|
||||
flatbuffers==25.2.10
|
||||
fonttools==4.58.4
|
||||
fqdn==1.5.1
|
||||
gast==0.6.0
|
||||
gitdb==4.0.12
|
||||
GitPython==3.1.45
|
||||
google-pasta==0.2.0
|
||||
grpcio==1.73.0
|
||||
h11==0.16.0
|
||||
h5py==3.14.0
|
||||
httpcore==1.0.9
|
||||
httpx==0.28.1
|
||||
idna==3.10
|
||||
imbalanced-learn==0.13.0
|
||||
ipykernel==6.29.5
|
||||
ipython>=8.0,<9.0
|
||||
ipython_pygments_lexers==1.1.1
|
||||
isoduration==20.11.0
|
||||
jedi==0.19.2
|
||||
Jinja2==3.1.6
|
||||
joblib==1.5.1
|
||||
json5==0.12.0
|
||||
jsonpointer==3.0.0
|
||||
jsonschema==4.24.0
|
||||
jsonschema-specifications==2025.4.1
|
||||
jupyter-events==0.12.0
|
||||
jupyter-lsp==2.2.5
|
||||
jupyter_client==8.6.3
|
||||
jupyter_core==5.8.1
|
||||
jupyter_server==2.16.0
|
||||
jupyter_server_terminals==0.5.3
|
||||
jupyterlab==4.4.4
|
||||
jupyterlab_pygments==0.3.0
|
||||
jupyterlab_server==2.27.3
|
||||
keras==3.10.0
|
||||
kiwisolver==1.4.8
|
||||
libclang==18.1.1
|
||||
lightgbm~=4.6.0
|
||||
Markdown==3.8.2
|
||||
markdown-it-py==3.0.0
|
||||
MarkupSafe==3.0.2
|
||||
matplotlib==3.10.3
|
||||
matplotlib-inline==0.1.7
|
||||
mccabe==0.7.0
|
||||
mdurl==0.1.2
|
||||
mistune==3.1.3
|
||||
ml-dtypes==0.3.2
|
||||
mypy==1.17.0
|
||||
mypy_extensions==1.1.0
|
||||
namex==0.1.0
|
||||
narwhals==2.0.1
|
||||
nbclient==0.10.2
|
||||
nbconvert==7.16.6
|
||||
nbformat==5.10.4
|
||||
nest-asyncio==1.6.0
|
||||
nltk==3.9.1
|
||||
notebook==7.4.4
|
||||
notebook_shim==0.2.4
|
||||
numpy==1.26.4
|
||||
ollama~=0.5.1
|
||||
opt_einsum==3.4.0
|
||||
optree==0.16.0
|
||||
overrides==7.7.0
|
||||
packaging==25.0
|
||||
pandas==2.3.0
|
||||
pandocfilters==1.5.1
|
||||
parso==0.8.4
|
||||
pathspec==0.12.1
|
||||
pexpect==4.9.0
|
||||
pillow==11.2.1
|
||||
platformdirs==4.3.8
|
||||
plotly~=6.2.0
|
||||
prometheus_client==0.22.1
|
||||
prompt_toolkit==3.0.51
|
||||
protobuf==4.25.8
|
||||
psutil==7.0.0
|
||||
ptyprocess==0.7.0
|
||||
pure_eval==0.2.3
|
||||
pyarrow==21.0.0
|
||||
pycodestyle==2.14.0
|
||||
pycparser==2.22
|
||||
pydantic~=2.11.7
|
||||
pydantic_core==2.33.2
|
||||
pydeck==0.9.1
|
||||
pyflakes==3.4.0
|
||||
Pygments==2.19.1
|
||||
pyparsing==3.2.3
|
||||
python-dateutil==2.9.0.post0
|
||||
python-json-logger==3.3.0
|
||||
pytz==2025.2
|
||||
PyYAML~=6.0.2
|
||||
pyzmq==27.0.0
|
||||
referencing==0.36.2
|
||||
regex==2024.11.6
|
||||
requests==2.32.4
|
||||
rfc3339-validator==0.1.4
|
||||
rfc3986-validator==0.1.1
|
||||
rich==14.0.0
|
||||
rpds-py==0.26.0
|
||||
scikit-learn~=1.6.1
|
||||
scipy==1.15.3
|
||||
seaborn==0.13.2
|
||||
Send2Trash==1.8.3
|
||||
six==1.17.0
|
||||
sklearn-compat==0.1.3
|
||||
smmap==5.0.2
|
||||
sniffio==1.3.1
|
||||
soupsieve==2.7
|
||||
spacy~=3.8.7
|
||||
stack-data==0.6.3
|
||||
streamlit~=1.47.1
|
||||
tenacity==9.1.2
|
||||
tensorboard==2.16.2
|
||||
tensorboard-data-server==0.7.2
|
||||
tensorflow==2.16.2
|
||||
tensorflow-io-gcs-filesystem==0.37.1
|
||||
termcolor==3.1.0
|
||||
terminado==0.18.1
|
||||
threadpoolctl==3.6.0
|
||||
tinycss2==1.4.0
|
||||
toml==0.10.2
|
||||
toolz==1.0.0
|
||||
tornado==6.5.1
|
||||
tqdm==4.67.1
|
||||
traitlets==5.14.3
|
||||
types-python-dateutil==2.9.0.20250516
|
||||
types-PyYAML==6.0.12.20250516
|
||||
typing-inspection==0.4.1
|
||||
typing_extensions==4.14.0
|
||||
tzdata==2025.2
|
||||
uri-template==1.3.0
|
||||
urllib3==2.5.0
|
||||
wcwidth==0.2.13
|
||||
webcolors==24.11.1
|
||||
webencodings==0.5.1
|
||||
websocket-client==1.8.0
|
||||
Werkzeug==3.1.3
|
||||
wrapt==1.17.2
|
||||
xgboost~=3.0.3
|
||||
@@ -0,0 +1,3 @@
|
||||
"""DRC NERS NLP package."""
|
||||
|
||||
__all__: list[str] = []
|
||||
+226
@@ -0,0 +1,226 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import typer
|
||||
|
||||
from ners.core.config import setup_config, PipelineConfig
|
||||
|
||||
app = typer.Typer(help="DRC NERS command-line interface", no_args_is_help=True)
|
||||
|
||||
|
||||
# -------------------------
|
||||
# Pipeline commands
|
||||
# -------------------------
|
||||
pipeline_app = typer.Typer(help="Data processing pipeline")
|
||||
app.add_typer(pipeline_app, name="pipeline")
|
||||
|
||||
|
||||
@pipeline_app.command("run")
def pipeline_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Run the full processing pipeline and exit with its status code."""
    # Imported lazily so `ners --help` stays fast and avoids heavy imports.
    from ners.main import run_pipeline as _run_pipeline

    exit_code = _run_pipeline(setup_config(config_path=config, env=env))
    raise typer.Exit(exit_code)
|
||||
|
||||
|
||||
# -------------------------
|
||||
# NER commands
|
||||
# -------------------------
|
||||
ner_app = typer.Typer(help="NER dataset and model")
|
||||
app.add_typer(ner_app, name="ner")
|
||||
|
||||
|
||||
def _load_config(config: Optional[Path], env: str) -> PipelineConfig:
    """Initialise and return the pipeline configuration for *env*."""
    return setup_config(config_path=config, env=env)
|
||||
|
||||
|
||||
@ner_app.command("feature")
def ner_feature(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Run the NER `feature` step."""
    # Lazy import keeps CLI startup light.
    from ners.ner import feature as _feature

    _feature(_load_config(config, env))
|
||||
|
||||
|
||||
@ner_app.command("build")
def ner_build(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Run the NER `build` step."""
    # Lazy import keeps CLI startup light.
    from ners.ner import build as _build

    _build(_load_config(config, env))
|
||||
|
||||
|
||||
@ner_app.command("train")
def ner_train(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Run the NER `train` step."""
    # Lazy import keeps CLI startup light.
    from ners.ner import train as _train

    _train(_load_config(config, env))
|
||||
|
||||
|
||||
@ner_app.command("run")
def ner_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
    reset: bool = typer.Option(
        False, help="Reset intermediate outputs and rerun all steps"
    ),
) -> None:
    """Run the NER pipeline end to end and exit with its status code."""
    # Lazy import keeps CLI startup light.
    from ners.ner import run_pipeline as _ner_pipeline

    exit_code = _ner_pipeline(_load_config(config, env), reset)
    raise typer.Exit(exit_code)
|
||||
|
||||
|
||||
# -------------------------
|
||||
# Research commands
|
||||
# -------------------------
|
||||
research_app = typer.Typer(help="Research experiments and training")
|
||||
app.add_typer(research_app, name="research")
|
||||
|
||||
|
||||
@research_app.command("train")
def research_train(
    name: str = typer.Option(..., "--name", help="Model name to train"),
    # Named `experiment_type` instead of `type` to avoid shadowing the
    # builtin; the CLI flag is still `--type` because it is given explicitly.
    experiment_type: str = typer.Option(..., "--type", help="Experiment type"),
    templates: str = typer.Option(
        "research_templates.yaml", help="Templates file path"
    ),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Train a single model described by a research template.

    Looks up the (name, type) template in *templates* and hands the
    resolved experiment configuration to the model trainer.
    """
    # Lazy imports keep CLI startup light.
    from ners.research.experiment.experiment_builder import ExperimentBuilder
    from ners.research.model_trainer import ModelTrainer

    cfg = _load_config(config, env)
    builder = ExperimentBuilder(cfg)
    all_templates = builder.load_templates(templates)
    exp_cfg = builder.find_template(all_templates, name, experiment_type)
    if exp_cfg is None:
        # Fail fast with a CLI-friendly message instead of an AttributeError
        # on `None.get(...)` below. (Assumes find_template returns None on a
        # miss — TODO confirm; if it raises instead, this guard is inert.)
        raise typer.BadParameter(
            f"No template named {name!r} of type {experiment_type!r} in {templates}"
        )

    trainer = ModelTrainer(cfg)
    trainer.train_single_model(
        model_name=exp_cfg.get("name"),
        model_type=exp_cfg.get("model_type"),
        features=exp_cfg.get("features"),
        model_params=exp_cfg.get("model_params", {}),
        tags=exp_cfg.get("tags", []),
    )
|
||||
|
||||
|
||||
# -------------------------
|
||||
# Monitor commands
|
||||
# -------------------------
|
||||
monitor_app = typer.Typer(help="Monitor pipeline checkpoints")
|
||||
app.add_typer(monitor_app, name="monitor")
|
||||
|
||||
|
||||
@monitor_app.command("status")
def monitor_status(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
    detailed: bool = typer.Option(
        False, help="Show detailed status (failed batch IDs)"
    ),
) -> None:
    """Print the current pipeline checkpoint status."""
    # Called for its side effect: initialises configuration for the monitor.
    _load_config(config, env)
    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    monitor = PipelineMonitor()
    monitor.print_status(detailed=detailed)
|
||||
|
||||
|
||||
@monitor_app.command("clean")
def monitor_clean(
    step: Optional[str] = typer.Option(None, help="Step to clean; default all"),
    keep_last: int = typer.Option(1, help="Number of latest checkpoint files to keep"),
    force: bool = typer.Option(False, help="Do not ask for confirmation"),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Clean checkpoint files for one step, or for every step."""
    # Called for its side effect: initialises configuration for the monitor.
    _load_config(config, env)
    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    monitor = PipelineMonitor()
    if not force:
        typer.confirm("Clean checkpoints?", abort=True)

    targets = [step] if step else list(monitor.steps)
    for target in targets:
        monitor.clean_step_checkpoints(target, keep_last)
|
||||
|
||||
|
||||
@monitor_app.command("reset")
def monitor_reset(
    step: Optional[str] = typer.Option(None, help="Step to reset; default all"),
    force: bool = typer.Option(False, help="Do not ask for confirmation"),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Reset one pipeline step, or every step, deleting its checkpoints."""
    # Called for its side effect: initialises configuration for the monitor.
    _load_config(config, env)
    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    monitor = PipelineMonitor()
    if not force:
        typer.confirm(
            f"Reset {step or 'all steps'}? This deletes checkpoints.", abort=True
        )

    targets = [step] if step else list(monitor.steps)
    for target in targets:
        monitor.reset_step(target)
|
||||
|
||||
|
||||
# -------------------------
|
||||
# Web commands
|
||||
# -------------------------
|
||||
web_app = typer.Typer(help="Web UI wrapper")
|
||||
app.add_typer(web_app, name="web")
|
||||
|
||||
|
||||
@web_app.command("run")
def web_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Launch the Streamlit web app via subprocess."""
    streamlit_app = Path(__file__).parent / "web" / "app.py"
    command = [
        sys.executable,
        "-m",
        "streamlit",
        "run",
        str(streamlit_app),
    ]
    # Pass configuration via environment variables to avoid argparse in Streamlit
    child_env = os.environ.copy()
    if config is not None:
        child_env["NERS_CONFIG"] = str(config)
    child_env["NERS_ENV"] = env

    raise typer.Exit(subprocess.call(command, env=child_env))
|
||||
|
||||
|
||||
if __name__ == "__main__": # pragma: no cover
|
||||
app()
|
||||
@@ -2,10 +2,10 @@ import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
from core.utils import ensure_directories
|
||||
from .config_manager import ConfigManager
|
||||
from .logging_config import LoggingConfig
|
||||
from .pipeline_config import PipelineConfig
|
||||
from ners.core.utils import ensure_directories
|
||||
from ners.core.config.config_manager import ConfigManager
|
||||
from ners.core.config.logging_config import LoggingConfig
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
|
||||
config_manager = ConfigManager()
|
||||
|
||||
@@ -22,7 +22,9 @@ def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfi
|
||||
return config_manager.get_config()
|
||||
|
||||
|
||||
def setup_config(config_path: Optional[Path] = None, env: str = "development") -> PipelineConfig:
|
||||
def setup_config(
|
||||
config_path: Optional[Path] = None, env: str = "development"
|
||||
) -> PipelineConfig:
|
||||
"""
|
||||
Unified configuration loading and logging setup for all entrypoint scripts.
|
||||
|
||||
@@ -5,8 +5,8 @@ from typing import Optional, Union, Dict, Any
|
||||
|
||||
import yaml
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from core.config.project_paths import ProjectPaths
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
from ners.core.config.project_paths import ProjectPaths
|
||||
|
||||
|
||||
class ConfigManager:
|
||||
@@ -36,7 +36,7 @@ class ConfigManager:
|
||||
|
||||
def _setup_default_paths(self):
|
||||
"""Setup default project paths"""
|
||||
root_dir = Path(__file__).parent.parent.parent
|
||||
root_dir = Path(__file__).parent.parent.parent.parent.parent
|
||||
self.default_paths = ProjectPaths(
|
||||
root_dir=root_dir,
|
||||
configs_dir=root_dir / "config",
|
||||
@@ -53,7 +53,9 @@ class ConfigManager:
|
||||
self.config_path = config_path
|
||||
|
||||
if not self.config_path.exists():
|
||||
logging.warning(f"Config file not found: {self.config_path}. Using defaults.")
|
||||
logging.warning(
|
||||
f"Config file not found: {self.config_path}. Using defaults."
|
||||
)
|
||||
return self._create_default_config()
|
||||
|
||||
try:
|
||||
@@ -122,7 +124,11 @@ class ConfigManager:
|
||||
def _deep_update(self, base_dict: Dict, update_dict: Dict):
|
||||
"""Recursively update nested dictionaries"""
|
||||
for key, value in update_dict.items():
|
||||
if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
|
||||
if (
|
||||
key in base_dict
|
||||
and isinstance(base_dict[key], dict)
|
||||
and isinstance(value, dict)
|
||||
):
|
||||
self._deep_update(base_dict[key], value)
|
||||
else:
|
||||
base_dict[key] = value
|
||||
@@ -1,10 +1,10 @@
|
||||
from pydantic import BaseModel
|
||||
|
||||
from core.config.annotation_config import AnnotationConfig
|
||||
from core.config.data_config import DataConfig
|
||||
from core.config.logging_config import LoggingConfig
|
||||
from core.config.processing_config import ProcessingConfig
|
||||
from core.config.project_paths import ProjectPaths
|
||||
from ners.core.config.annotation_config import AnnotationConfig
|
||||
from ners.core.config.data_config import DataConfig
|
||||
from ners.core.config.logging_config import LoggingConfig
|
||||
from ners.core.config.processing_config import ProcessingConfig
|
||||
from ners.core.config.project_paths import ProjectPaths
|
||||
|
||||
|
||||
class PipelineConfig(BaseModel):
|
||||
@@ -10,6 +10,8 @@ class ProcessingConfig(BaseModel):
|
||||
max_workers: int = 4
|
||||
checkpoint_interval: int = 5
|
||||
use_multiprocessing: bool = False
|
||||
encoding_options: list = field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
|
||||
encoding_options: list = field(
|
||||
default_factory=lambda: ["utf-8", "utf-16", "latin1"]
|
||||
)
|
||||
chunk_size: int = 100_000
|
||||
epochs: int = 2
|
||||
@@ -4,13 +4,13 @@ from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from core.config import PipelineConfig
|
||||
from ners.core.config import PipelineConfig
|
||||
|
||||
|
||||
@contextmanager
|
||||
def temporary_config_override(**overrides):
|
||||
"""Context manager for temporarily overriding configuration"""
|
||||
from core.config import get_config
|
||||
from ners.core.config import get_config
|
||||
|
||||
config = get_config()
|
||||
original_values = {}
|
||||
@@ -5,7 +5,7 @@ from typing import Optional, Union, Iterator, Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
|
||||
OPTIMIZED_DTYPES = {
|
||||
# Numeric columns with appropriate bit-width
|
||||
@@ -113,7 +113,9 @@ class DataLoader:
|
||||
sex_values = df["sex"].dropna().unique()
|
||||
|
||||
if len(sex_values) == 0:
|
||||
logging.warning(f"No valid values found in sex column 'sex', using random sampling")
|
||||
logging.warning(
|
||||
"No valid values found in sex column 'sex', using random sampling"
|
||||
)
|
||||
return df.sample(n=max_size, random_state=self.config.data.random_seed)
|
||||
|
||||
# Calculate samples per sex category
|
||||
@@ -140,18 +142,22 @@ class DataLoader:
|
||||
logging.info(f"Sampled {current_samples} records for sex '{sex}'")
|
||||
|
||||
if not balanced_samples:
|
||||
logging.warning("No balanced samples could be created, using random sampling")
|
||||
logging.warning(
|
||||
"No balanced samples could be created, using random sampling"
|
||||
)
|
||||
return df.sample(n=max_size, random_state=self.config.data.random_seed)
|
||||
|
||||
# Create result using iloc with indices (no copying until final step)
|
||||
result = df.iloc[balanced_samples].copy()
|
||||
|
||||
# Shuffle the final result
|
||||
result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
|
||||
drop=True
|
||||
)
|
||||
result = result.sample(
|
||||
frac=1, random_state=self.config.data.random_seed
|
||||
).reset_index(drop=True)
|
||||
|
||||
logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total")
|
||||
logging.info(
|
||||
f"Created balanced dataset with {len(result)} records from {len(df)} total"
|
||||
)
|
||||
return result
|
||||
|
||||
@classmethod
|
||||
@@ -1,4 +1,4 @@
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
|
||||
|
||||
class PromptManager:
|
||||
@@ -2,7 +2,7 @@ import json
|
||||
import logging
|
||||
from typing import Dict, Any
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
|
||||
|
||||
class StateManager:
|
||||
+11
-41
@@ -1,21 +1,17 @@
|
||||
#!.venv/bin/python3
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
from core.config import setup_config
|
||||
from core.utils.data_loader import DataLoader
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
from processing.pipeline import Pipeline
|
||||
from processing.steps.data_cleaning_step import DataCleaningStep
|
||||
from processing.steps.data_selection_step import DataSelectionStep
|
||||
from processing.steps.data_splitting_step import DataSplittingStep
|
||||
from processing.steps.feature_extraction_step import FeatureExtractionStep
|
||||
from ners.core.utils.data_loader import DataLoader
|
||||
from ners.processing.batch.batch_config import BatchConfig
|
||||
from ners.processing.pipeline import Pipeline
|
||||
from ners.processing.steps.data_cleaning_step import DataCleaningStep
|
||||
from ners.processing.steps.data_selection_step import DataSelectionStep
|
||||
from ners.processing.steps.data_splitting_step import DataSplittingStep
|
||||
from ners.processing.steps.llm_annotation_step import LLMAnnotationStep
|
||||
from ners.processing.steps.ner_annotation_step import NERAnnotationStep
|
||||
from ners.processing.steps.feature_extraction_step import FeatureExtractionStep
|
||||
|
||||
|
||||
def create_pipeline(config) -> Pipeline:
|
||||
"""Create pipeline from configuration"""
|
||||
batch_config = BatchConfig(
|
||||
batch_size=config.processing.batch_size,
|
||||
max_workers=config.processing.max_workers,
|
||||
@@ -23,14 +19,13 @@ def create_pipeline(config) -> Pipeline:
|
||||
use_multiprocessing=config.processing.use_multiprocessing,
|
||||
)
|
||||
|
||||
# Add steps based on configuration
|
||||
pipeline = Pipeline(batch_config)
|
||||
steps = [
|
||||
DataCleaningStep(config),
|
||||
FeatureExtractionStep(config),
|
||||
DataSelectionStep(config),
|
||||
# NERAnnotationStep(config),
|
||||
# LLMAnnotationStep(config),
|
||||
NERAnnotationStep(config),
|
||||
LLMAnnotationStep(config),
|
||||
]
|
||||
|
||||
for stage in config.stages:
|
||||
@@ -42,7 +37,6 @@ def create_pipeline(config) -> Pipeline:
|
||||
|
||||
|
||||
def run_pipeline(config) -> int:
|
||||
"""Run the complete pipeline"""
|
||||
try:
|
||||
logging.info(f"Starting pipeline: {config.name} v{config.version}")
|
||||
|
||||
@@ -79,27 +73,3 @@ def run_pipeline(config) -> int:
|
||||
except Exception as e:
|
||||
logging.error(f"Pipeline failed: {e}", exc_info=True)
|
||||
return 1
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point with unified configuration loading"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="DRC NERS Processing Pipeline",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
parser.add_argument("--config", type=str, help="Path to configuration file")
|
||||
parser.add_argument("--env", type=str, default="development", help="Environment name")
|
||||
args = parser.parse_args()
|
||||
|
||||
try:
|
||||
config = setup_config(config_path=args.config, env=args.env)
|
||||
return run_pipeline(config)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Pipeline failed: {e}")
|
||||
traceback.print_exc()
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Executable
+14
@@ -0,0 +1,14 @@
|
||||
#!.venv/bin/python3
|
||||
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
|
||||
|
||||
|
||||
def status(*, detailed: bool = False) -> None:
|
||||
PipelineMonitor().print_status(detailed=detailed)
|
||||
|
||||
|
||||
def clean_step(step: str, *, keep_last: int = 1) -> None:
|
||||
PipelineMonitor().clean_step_checkpoints(step, keep_last)
|
||||
|
||||
|
||||
def reset_step(step: str) -> None:
|
||||
PipelineMonitor().reset_step(step)
|
||||
+10
-25
@@ -1,29 +1,24 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
|
||||
from core.config import setup_config, PipelineConfig
|
||||
from processing.ner.name_builder import NameBuilder
|
||||
from processing.ner.name_engineering import NameEngineering
|
||||
from processing.ner.name_model import NameModel
|
||||
from ners.core.config import PipelineConfig
|
||||
from ners.processing.ner.name_builder import NameBuilder
|
||||
from ners.processing.ner.name_engineering import NameEngineering
|
||||
from ners.processing.ner.name_model import NameModel
|
||||
|
||||
|
||||
def feature(config: PipelineConfig):
|
||||
"""Apply feature engineering to create position-independent NER dataset."""
|
||||
NameEngineering(config).compute()
|
||||
|
||||
|
||||
def build(config: PipelineConfig):
|
||||
"""Build NER dataset using NERDataBuilder."""
|
||||
NameBuilder(config).build()
|
||||
|
||||
|
||||
def train(config: PipelineConfig):
|
||||
"""Train the NER model."""
|
||||
name_model = NameModel(config)
|
||||
|
||||
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
|
||||
@@ -37,7 +32,9 @@ def train(config: PipelineConfig):
|
||||
split_idx = int(len(data) * 0.9)
|
||||
train_data, eval_data = data[:split_idx], data[split_idx:]
|
||||
|
||||
logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
|
||||
logging.info(
|
||||
f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}"
|
||||
)
|
||||
name_model.train(
|
||||
data=train_data,
|
||||
epochs=config.processing.epochs,
|
||||
@@ -75,21 +72,9 @@ def run_pipeline(config: PipelineConfig, reset: bool = False):
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="NER model management for DRC names")
|
||||
parser.add_argument("--config", type=str, help="Path to configuration file")
|
||||
parser.add_argument("--env", type=str, default="development", help="Environment name")
|
||||
parser.add_argument("--reset", action="store_true", help="Reset all steps")
|
||||
args = parser.parse_args()
|
||||
|
||||
try:
|
||||
config = setup_config(config_path=args.config, env=args.env)
|
||||
return run_pipeline(config, args.reset)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Pipeline failed: {e}")
|
||||
logging.error("This module is no longer a CLI. Use 'ners ner ...' instead.")
|
||||
return 1
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -8,4 +8,6 @@ class BatchConfig:
|
||||
batch_size: int = 1000
|
||||
max_workers: int = 4
|
||||
checkpoint_interval: int = 5 # Save checkpoint every N batches
|
||||
use_multiprocessing: bool = False # Use ProcessPoolExecutor instead of ThreadPoolExecutor
|
||||
use_multiprocessing: bool = (
|
||||
False # Use ProcessPoolExecutor instead of ThreadPoolExecutor
|
||||
)
|
||||
@@ -4,9 +4,9 @@ from typing import Iterator
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
from processing.batch.memory_monitor import MemoryMonitor
|
||||
from processing.steps import PipelineStep
|
||||
from ners.processing.batch.batch_config import BatchConfig
|
||||
from ners.processing.batch.memory_monitor import MemoryMonitor
|
||||
from ners.processing.steps import PipelineStep
|
||||
|
||||
|
||||
class BatchProcessor:
|
||||
@@ -33,7 +33,9 @@ class BatchProcessor:
|
||||
|
||||
for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
|
||||
if step.batch_exists(batch_id):
|
||||
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
|
||||
logging.info(
|
||||
f"Batch {batch_id} already processed, loading from checkpoint"
|
||||
)
|
||||
processed_batch = step.load_batch(batch_id)
|
||||
else:
|
||||
try:
|
||||
@@ -80,7 +82,9 @@ class BatchProcessor:
|
||||
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Memory-optimized concurrent processing"""
|
||||
executor_class = (
|
||||
ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
|
||||
ProcessPoolExecutor
|
||||
if self.config.use_multiprocessing
|
||||
else ThreadPoolExecutor
|
||||
)
|
||||
results = {}
|
||||
|
||||
@@ -89,7 +93,9 @@ class BatchProcessor:
|
||||
future_to_batch = {}
|
||||
for batch, batch_id in self.create_batches(df):
|
||||
if step.batch_exists(batch_id):
|
||||
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
|
||||
logging.info(
|
||||
f"Batch {batch_id} already processed, loading from checkpoint"
|
||||
)
|
||||
results[batch_id] = step.load_batch(batch_id)
|
||||
else:
|
||||
# Only copy if necessary for concurrent processing
|
||||
@@ -121,7 +127,9 @@ class BatchProcessor:
|
||||
del results
|
||||
self.memory_monitor.cleanup_memory()
|
||||
|
||||
result = self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
|
||||
result = (
|
||||
self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
|
||||
)
|
||||
|
||||
# Final cleanup
|
||||
del ordered_results
|
||||
@@ -131,7 +139,9 @@ class BatchProcessor:
|
||||
|
||||
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process data using the configured strategy"""
|
||||
step.state.total_batches = (len(df) + self.config.batch_size - 1) // self.config.batch_size
|
||||
step.state.total_batches = (
|
||||
len(df) + self.config.batch_size - 1
|
||||
) // self.config.batch_size
|
||||
step.load_state()
|
||||
|
||||
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
|
||||
+16
-6
@@ -4,8 +4,8 @@ import shutil
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict
|
||||
|
||||
from core.config.config_manager import ConfigManager
|
||||
from core.config.project_paths import ProjectPaths
|
||||
from ners.core.config.config_manager import ConfigManager
|
||||
from ners.core.config.project_paths import ProjectPaths
|
||||
|
||||
|
||||
class PipelineMonitor:
|
||||
@@ -97,7 +97,10 @@ class PipelineMonitor:
|
||||
|
||||
avg_completion = total_completion / len(self.steps)
|
||||
|
||||
if avg_completion >= 100 and overall_status not in ["error", "completed_with_errors"]:
|
||||
if avg_completion >= 100 and overall_status not in [
|
||||
"error",
|
||||
"completed_with_errors",
|
||||
]:
|
||||
overall_status = "completed"
|
||||
|
||||
return {
|
||||
@@ -121,7 +124,9 @@ class PipelineMonitor:
|
||||
print(f"{step_name.replace('_', ' ').title()}:")
|
||||
print(f" Status: {step_status['status']}")
|
||||
print(f" Progress: {step_status['completion_percentage']:.1f}%")
|
||||
print(f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}")
|
||||
print(
|
||||
f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}"
|
||||
)
|
||||
|
||||
if step_status["failed_batches"] > 0:
|
||||
print(f" Failed Batches: {step_status['failed_batches']}")
|
||||
@@ -141,7 +146,10 @@ class PipelineMonitor:
|
||||
if step_dir.exists():
|
||||
csv_files = list(step_dir.glob("*.csv"))
|
||||
step_size = sum(f.stat().st_size for f in csv_files)
|
||||
counts[step] = {"files": len(csv_files), "size_mb": step_size / (1024 * 1024)}
|
||||
counts[step] = {
|
||||
"files": len(csv_files),
|
||||
"size_mb": step_size / (1024 * 1024),
|
||||
}
|
||||
total_size += step_size
|
||||
else:
|
||||
counts[step] = {"files": 0, "size_mb": 0}
|
||||
@@ -160,7 +168,9 @@ class PipelineMonitor:
|
||||
csv_files = sorted(step_dir.glob("batch_*.csv"))
|
||||
|
||||
if len(csv_files) <= keep_last:
|
||||
logging.info(f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all")
|
||||
logging.info(
|
||||
f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all"
|
||||
)
|
||||
return
|
||||
|
||||
files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files
|
||||
@@ -3,7 +3,7 @@ from typing import List, Tuple, Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from processing.steps.feature_extraction_step import NameCategory
|
||||
from ners.processing.steps.feature_extraction_step import NameCategory
|
||||
|
||||
|
||||
class BaseNameFormatter(ABC):
|
||||
@@ -12,7 +12,9 @@ class BaseNameFormatter(ABC):
|
||||
Contains common logic for NER tagging and attribute computation.
|
||||
"""
|
||||
|
||||
def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None):
|
||||
def __init__(
|
||||
self, connectors: List[str] = None, additional_surnames: List[str] = None
|
||||
):
|
||||
self.connectors = connectors or ["wa", "ya", "ka", "ba"]
|
||||
self.additional_surnames = additional_surnames or [
|
||||
"jean",
|
||||
@@ -46,7 +48,9 @@ class BaseNameFormatter(ABC):
|
||||
end_pos = current_pos + len(word)
|
||||
|
||||
# Determine tag based on word content
|
||||
if word in native_parts or any(connector in word for connector in self.connectors):
|
||||
if word in native_parts or any(
|
||||
connector in word for connector in self.connectors
|
||||
):
|
||||
tag = "NATIVE"
|
||||
elif word == surname or word in self.additional_surnames:
|
||||
tag = "SURNAME"
|
||||
@@ -72,7 +76,9 @@ class BaseNameFormatter(ABC):
|
||||
"words": words_count,
|
||||
"length": length,
|
||||
"identified_category": (
|
||||
NameCategory.SIMPLE.value if words_count == 3 else NameCategory.COMPOSE.value
|
||||
NameCategory.SIMPLE.value
|
||||
if words_count == 3
|
||||
else NameCategory.COMPOSE.value
|
||||
),
|
||||
}
|
||||
|
||||
+1
-1
@@ -3,7 +3,7 @@ from typing import Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from processing.ner.formats import BaseNameFormatter
|
||||
from ners.processing.ner.formats import BaseNameFormatter
|
||||
|
||||
|
||||
class ConnectorFormatter(BaseNameFormatter):
|
||||
+7
-3
@@ -3,13 +3,15 @@ from typing import Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from processing.ner.formats import BaseNameFormatter
|
||||
from ners.processing.ner.formats import BaseNameFormatter
|
||||
|
||||
|
||||
class ExtendedSurnameFormatter(BaseNameFormatter):
|
||||
def transform(self, row: pd.Series) -> Dict:
|
||||
native_parts = self.parse_native_components(row["probable_native"])
|
||||
original_surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
||||
original_surname = (
|
||||
row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
||||
)
|
||||
|
||||
# Add random additional surname
|
||||
additional_surname = random.choice(self.additional_surnames)
|
||||
@@ -22,7 +24,9 @@ class ExtendedSurnameFormatter(BaseNameFormatter):
|
||||
"identified_name": row["probable_native"],
|
||||
"probable_surname": combined_surname,
|
||||
"identified_surname": combined_surname,
|
||||
"ner_entities": str(self.create_ner_tags(full_name, native_parts, combined_surname)),
|
||||
"ner_entities": str(
|
||||
self.create_ner_tags(full_name, native_parts, combined_surname)
|
||||
),
|
||||
"transformation_type": self.transformation_type,
|
||||
**self.compute_numeric_features(full_name),
|
||||
}
|
||||
+1
-1
@@ -2,7 +2,7 @@ from typing import Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from processing.ner.formats import BaseNameFormatter
|
||||
from ners.processing.ner.formats import BaseNameFormatter
|
||||
|
||||
|
||||
class NativeOnlyFormatter(BaseNameFormatter):
|
||||
+1
-1
@@ -2,7 +2,7 @@ from typing import Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from processing.ner.formats import BaseNameFormatter
|
||||
from ners.processing.ner.formats import BaseNameFormatter
|
||||
|
||||
|
||||
class OriginalFormatter(BaseNameFormatter):
|
||||
+1
-1
@@ -2,7 +2,7 @@ from typing import Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from processing.ner.formats import BaseNameFormatter
|
||||
from ners.processing.ner.formats import BaseNameFormatter
|
||||
|
||||
|
||||
class PositionFlippedFormatter(BaseNameFormatter):
|
||||
+7
-3
@@ -2,7 +2,7 @@ from typing import Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from processing.ner.formats import BaseNameFormatter
|
||||
from ners.processing.ner.formats import BaseNameFormatter
|
||||
|
||||
|
||||
class ReducedNativeFormatter(BaseNameFormatter):
|
||||
@@ -11,7 +11,9 @@ class ReducedNativeFormatter(BaseNameFormatter):
|
||||
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
||||
|
||||
# Keep only first native component + surname
|
||||
reduced_native = native_parts[0] if len(native_parts) > 1 else row["probable_native"]
|
||||
reduced_native = (
|
||||
native_parts[0] if len(native_parts) > 1 else row["probable_native"]
|
||||
)
|
||||
full_name = f"{reduced_native} {surname}".strip()
|
||||
|
||||
return {
|
||||
@@ -20,7 +22,9 @@ class ReducedNativeFormatter(BaseNameFormatter):
|
||||
"identified_name": reduced_native,
|
||||
"probable_surname": surname,
|
||||
"identified_surname": surname,
|
||||
"ner_entities": str(self.create_ner_tags(full_name, [reduced_native], surname)),
|
||||
"ner_entities": str(
|
||||
self.create_ner_tags(full_name, [reduced_native], surname)
|
||||
),
|
||||
"transformation_type": self.transformation_type,
|
||||
**self.compute_numeric_features(full_name),
|
||||
}
|
||||
@@ -4,8 +4,8 @@ import logging
|
||||
import spacy
|
||||
from spacy.tokens import DocBin
|
||||
|
||||
from core.config import PipelineConfig
|
||||
from core.utils.data_loader import DataLoader
|
||||
from ners.core.config import PipelineConfig
|
||||
from ners.core.utils.data_loader import DataLoader
|
||||
from .name_tagger import NameTagger
|
||||
|
||||
|
||||
@@ -20,7 +20,9 @@ class NameBuilder:
|
||||
self.tagger = NameTagger()
|
||||
|
||||
def build(self) -> int:
|
||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
|
||||
filepath = self.config.paths.get_data_path(
|
||||
self.config.data.output_files["engineered"]
|
||||
)
|
||||
df = self.data_loader.load_csv_complete(filepath)
|
||||
df = df[["name", "ner_tagged", "ner_entities"]]
|
||||
|
||||
@@ -38,7 +40,9 @@ class NameBuilder:
|
||||
|
||||
# Use NERNameTagger for parsing and validation
|
||||
parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
|
||||
validated_entities = self.tagger.validate_entities(ner_df["name"], parsed_entities)
|
||||
validated_entities = self.tagger.validate_entities(
|
||||
ner_df["name"], parsed_entities
|
||||
)
|
||||
|
||||
# Drop rows with no valid entities
|
||||
mask = validated_entities.map(bool)
|
||||
@@ -51,22 +55,33 @@ class NameBuilder:
|
||||
|
||||
# Prepare training data
|
||||
training_data = list(
|
||||
zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
|
||||
zip(
|
||||
ner_df["name"].tolist(),
|
||||
[{"entities": ents} for ents in validated_entities],
|
||||
)
|
||||
)
|
||||
|
||||
# Use NERNameTagger to create spaCy DocBin
|
||||
docs = self.tagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
|
||||
docs = self.tagger.create_docs(
|
||||
nlp, ner_df["name"].tolist(), validated_entities.tolist()
|
||||
)
|
||||
doc_bin = DocBin(docs=docs)
|
||||
|
||||
# Save
|
||||
json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
|
||||
spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
|
||||
json_path = self.config.paths.get_data_path(
|
||||
self.config.data.output_files["ner_data"]
|
||||
)
|
||||
spacy_path = self.config.paths.get_data_path(
|
||||
self.config.data.output_files["ner_spacy"]
|
||||
)
|
||||
|
||||
with open(json_path, "w", encoding="utf-8") as f:
|
||||
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
|
||||
doc_bin.to_disk(spacy_path)
|
||||
|
||||
logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
|
||||
logging.info(
|
||||
f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}"
|
||||
)
|
||||
logging.info(f"Saved NER JSON to {json_path}")
|
||||
logging.info(f"Saved NER spacy to {spacy_path}")
|
||||
return 0
|
||||
@@ -6,14 +6,14 @@ import numpy as np
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
from core.config import PipelineConfig
|
||||
from core.utils.data_loader import DataLoader
|
||||
from processing.ner.formats.connectors_format import ConnectorFormatter
|
||||
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
|
||||
from processing.ner.formats.native_only_format import NativeOnlyFormatter
|
||||
from processing.ner.formats.original_format import OriginalFormatter
|
||||
from processing.ner.formats.position_flipped_format import PositionFlippedFormatter
|
||||
from processing.ner.formats.reduced_native_format import ReducedNativeFormatter
|
||||
from ners.core.config import PipelineConfig
|
||||
from ners.core.utils.data_loader import DataLoader
|
||||
from ners.processing.ner.formats.connectors_format import ConnectorFormatter
|
||||
from ners.processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
|
||||
from ners.processing.ner.formats.native_only_format import NativeOnlyFormatter
|
||||
from ners.processing.ner.formats.original_format import OriginalFormatter
|
||||
from ners.processing.ner.formats.position_flipped_format import PositionFlippedFormatter
|
||||
from ners.processing.ner.formats.reduced_native_format import ReducedNativeFormatter
|
||||
|
||||
|
||||
class NameEngineering:
|
||||
@@ -44,42 +44,60 @@ class NameEngineering:
|
||||
# Initialize format classes
|
||||
self.formatters = {
|
||||
"original": OriginalFormatter(self.connectors, self.additional_surnames),
|
||||
"native_only": NativeOnlyFormatter(self.connectors, self.additional_surnames),
|
||||
"position_flipped": PositionFlippedFormatter(self.connectors, self.additional_surnames),
|
||||
"reduced_native": ReducedNativeFormatter(self.connectors, self.additional_surnames),
|
||||
"connector_added": ConnectorFormatter(self.connectors, self.additional_surnames),
|
||||
"extended_surname": ExtendedSurnameFormatter(self.connectors, self.additional_surnames),
|
||||
"native_only": NativeOnlyFormatter(
|
||||
self.connectors, self.additional_surnames
|
||||
),
|
||||
"position_flipped": PositionFlippedFormatter(
|
||||
self.connectors, self.additional_surnames
|
||||
),
|
||||
"reduced_native": ReducedNativeFormatter(
|
||||
self.connectors, self.additional_surnames
|
||||
),
|
||||
"connector_added": ConnectorFormatter(
|
||||
self.connectors, self.additional_surnames
|
||||
),
|
||||
"extended_surname": ExtendedSurnameFormatter(
|
||||
self.connectors, self.additional_surnames
|
||||
),
|
||||
}
|
||||
|
||||
def load_data(self) -> pd.DataFrame:
|
||||
"""Load and filter NER-tagged data from CSV file"""
|
||||
|
||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
||||
filepath = self.config.paths.get_data_path(
|
||||
self.config.data.output_files["featured"]
|
||||
)
|
||||
df = self.data_loader.load_csv_complete(filepath)
|
||||
|
||||
# Filter only NER-tagged rows
|
||||
ner_data = df[df["ner_tagged"] == 1].copy()
|
||||
logging.info(f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records")
|
||||
logging.info(
|
||||
f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records"
|
||||
)
|
||||
|
||||
return ner_data
|
||||
|
||||
def compute(self) -> None:
|
||||
logging.info("Applying feature engineering transformations...")
|
||||
input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
||||
input_filepath = self.config.paths.get_data_path(
|
||||
self.config.data.output_files["featured"]
|
||||
)
|
||||
output_filepath = self.config.paths.get_data_path(
|
||||
self.config.data.output_files["engineered"]
|
||||
)
|
||||
|
||||
df = self.data_loader.load_csv_complete(input_filepath)
|
||||
ner_df = df[df["ner_tagged"] == 1].copy()
|
||||
logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records")
|
||||
logging.info(
|
||||
f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records"
|
||||
)
|
||||
|
||||
del df # No need to keep in memory
|
||||
gc.collect()
|
||||
|
||||
ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
|
||||
drop=True
|
||||
)
|
||||
ner_df = ner_df.sample(
|
||||
frac=1, random_state=self.config.data.random_seed
|
||||
).reset_index(drop=True)
|
||||
total_rows = len(ner_df)
|
||||
|
||||
# Calculate split points
|
||||
@@ -94,7 +112,11 @@ class NameEngineering:
|
||||
(0, split_25_1, "original"), # First 25%: original format
|
||||
(split_25_1, split_25_2, "native_only"), # Second 25%: remove surname
|
||||
(split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions
|
||||
(split_25_3, split_10_1, "reduced_native"), # Fourth 10%: reduce native components
|
||||
(
|
||||
split_25_3,
|
||||
split_10_1,
|
||||
"reduced_native",
|
||||
), # Fourth 10%: reduce native components
|
||||
(split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors
|
||||
(split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames
|
||||
]
|
||||
@@ -11,7 +11,7 @@ from spacy.training import Example
|
||||
from spacy.util import minibatch
|
||||
from tqdm import tqdm
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
|
||||
|
||||
class NameModel:
|
||||
@@ -87,7 +87,9 @@ class NameModel:
|
||||
|
||||
# Handle different annotation formats from NERNameTagger
|
||||
if not isinstance(annotations, dict) or "entities" not in annotations:
|
||||
logging.warning(f"Skipping invalid annotations at index {i}: {annotations}")
|
||||
logging.warning(
|
||||
f"Skipping invalid annotations at index {i}: {annotations}"
|
||||
)
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
@@ -124,7 +126,9 @@ class NameModel:
|
||||
valid_entities = []
|
||||
for entity in entities:
|
||||
if not isinstance(entity, (list, tuple)) or len(entity) != 3:
|
||||
logging.warning(f"Skipping invalid entity format in '{text}': {entity}")
|
||||
logging.warning(
|
||||
f"Skipping invalid entity format in '{text}': {entity}"
|
||||
)
|
||||
continue
|
||||
|
||||
start, end, label = entity
|
||||
@@ -138,21 +142,30 @@ class NameModel:
|
||||
or start < 0
|
||||
or end > len(text)
|
||||
):
|
||||
logging.warning(f"Skipping invalid entity bounds in '{text}': {entity}")
|
||||
logging.warning(
|
||||
f"Skipping invalid entity bounds in '{text}': {entity}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Check for overlaps with already validated entities
|
||||
has_overlap = any(
|
||||
start < v_end and end > v_start for v_start, v_end, _ in valid_entities
|
||||
start < v_end and end > v_start
|
||||
for v_start, v_end, _ in valid_entities
|
||||
)
|
||||
|
||||
if has_overlap:
|
||||
logging.warning(f"Skipping overlapping entity in '{text}': {entity}")
|
||||
logging.warning(
|
||||
f"Skipping overlapping entity in '{text}': {entity}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Validate that the span doesn't contain spaces (matching tagger validation)
|
||||
span_text = text[start:end]
|
||||
if not span_text or span_text != span_text.strip() or " " in span_text:
|
||||
if (
|
||||
not span_text
|
||||
or span_text != span_text.strip()
|
||||
or " " in span_text
|
||||
):
|
||||
logging.warning(
|
||||
f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
|
||||
)
|
||||
@@ -161,7 +174,9 @@ class NameModel:
|
||||
valid_entities.append((start, end, label))
|
||||
|
||||
if not valid_entities:
|
||||
logging.warning(f"Skipping training example with no valid entities: '{text}'")
|
||||
logging.warning(
|
||||
f"Skipping training example with no valid entities: '{text}'"
|
||||
)
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
@@ -219,7 +234,9 @@ class NameModel:
|
||||
batches = minibatch(examples, size=batch_size)
|
||||
for batch in batches:
|
||||
batch_losses = {}
|
||||
self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
|
||||
self.nlp.update(
|
||||
batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer
|
||||
)
|
||||
logging.info(
|
||||
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
|
||||
)
|
||||
@@ -230,7 +247,7 @@ class NameModel:
|
||||
|
||||
del batches # free memory
|
||||
losses_history.append(losses.get("ner", 0))
|
||||
logging.info(f"Epoch {epoch+1}/{epochs}, Total Loss: {losses['ner']:.4f}")
|
||||
logging.info(f"Epoch {epoch + 1}/{epochs}, Total Loss: {losses['ner']:.4f}")
|
||||
|
||||
# Store training statistics
|
||||
self.training_stats = {
|
||||
@@ -242,7 +259,9 @@ class NameModel:
|
||||
"dropout_rate": dropout_rate,
|
||||
}
|
||||
|
||||
logging.info(f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}")
|
||||
logging.info(
|
||||
f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}"
|
||||
)
|
||||
|
||||
def evaluate(self, test_data: List[Tuple[str, Dict]]) -> Dict[str, Any]:
|
||||
"""Evaluate the trained model on test data"""
|
||||
@@ -291,10 +310,14 @@ class NameModel:
|
||||
entity_stats[label]["fp"] += 1
|
||||
|
||||
# Calculate overall metrics
|
||||
precision = correct_entities / predicted_entities if predicted_entities > 0 else 0
|
||||
precision = (
|
||||
correct_entities / predicted_entities if predicted_entities > 0 else 0
|
||||
)
|
||||
recall = correct_entities / actual_entities if actual_entities > 0 else 0
|
||||
f1_score = (
|
||||
2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
|
||||
2 * (precision * recall) / (precision + recall)
|
||||
if (precision + recall) > 0
|
||||
else 0
|
||||
)
|
||||
|
||||
# Calculate per-label metrics
|
||||
@@ -304,7 +327,11 @@ class NameModel:
|
||||
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
|
||||
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
|
||||
label_f1 = (
|
||||
(2 * (label_precision * label_recall) / (label_precision + label_recall))
|
||||
(
|
||||
2
|
||||
* (label_precision * label_recall)
|
||||
/ (label_precision + label_recall)
|
||||
)
|
||||
if (label_precision + label_recall) > 0
|
||||
else 0
|
||||
)
|
||||
@@ -394,7 +421,9 @@ class NameModel:
|
||||
"label": ent.label_,
|
||||
"start": ent.start_char,
|
||||
"end": ent.end_char,
|
||||
"confidence": getattr(ent, "score", None), # If confidence scores are available
|
||||
"confidence": getattr(
|
||||
ent, "score", None
|
||||
), # If confidence scores are available
|
||||
}
|
||||
)
|
||||
|
||||
@@ -48,7 +48,9 @@ class NameTagger:
|
||||
# Find the first occurrence of this native word that doesn't overlap
|
||||
start_pos = 0
|
||||
while True:
|
||||
pos = name_lower.find(native_word_lower, start_pos) # Case-insensitive search
|
||||
pos = name_lower.find(
|
||||
native_word_lower, start_pos
|
||||
) # Case-insensitive search
|
||||
if pos == -1:
|
||||
break
|
||||
|
||||
@@ -78,7 +80,9 @@ class NameTagger:
|
||||
# Find the first occurrence that doesn't overlap
|
||||
start_pos = 0
|
||||
while True:
|
||||
pos = name_lower.find(surname_lower, start_pos) # Case-insensitive search
|
||||
pos = name_lower.find(
|
||||
surname_lower, start_pos
|
||||
) # Case-insensitive search
|
||||
if pos == -1:
|
||||
break
|
||||
|
||||
@@ -120,8 +124,13 @@ class NameTagger:
|
||||
continue
|
||||
|
||||
# Check for overlaps with already validated entities
|
||||
if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities):
|
||||
logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'")
|
||||
if any(
|
||||
start < v_end and end > v_start
|
||||
for v_start, v_end, _ in validated_entities
|
||||
):
|
||||
logging.warning(
|
||||
f"Overlapping span ({start}, {end}, '{label}') in '{name}'"
|
||||
)
|
||||
continue
|
||||
|
||||
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
|
||||
@@ -200,10 +209,16 @@ class NameTagger:
|
||||
elif entities_str.startswith("[[") and entities_str.endswith("]]"):
|
||||
return [tuple(e) for e in ast.literal_eval(entities_str)]
|
||||
elif entities_str.startswith("[{") and entities_str.endswith("}]"):
|
||||
return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
|
||||
return [
|
||||
(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)
|
||||
]
|
||||
else:
|
||||
parsed = ast.literal_eval(entities_str)
|
||||
return [tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3]
|
||||
return [
|
||||
tuple(e)
|
||||
for e in parsed
|
||||
if isinstance(e, (list, tuple)) and len(e) == 3
|
||||
]
|
||||
except (ValueError, SyntaxError, json.JSONDecodeError):
|
||||
return []
|
||||
|
||||
@@ -251,7 +266,9 @@ class NameTagger:
|
||||
last_end = e
|
||||
return filtered
|
||||
|
||||
def validate_entities(self, texts: pd.Series, entities_series: pd.Series) -> pd.Series:
|
||||
def validate_entities(
|
||||
self, texts: pd.Series, entities_series: pd.Series
|
||||
) -> pd.Series:
|
||||
"""Vectorized entity validation."""
|
||||
return pd.Series(map(self.validate, texts, entities_series), index=texts.index)
|
||||
|
||||
@@ -4,9 +4,9 @@ from typing import Dict, Any
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
from processing.batch.batch_processor import BatchProcessor
|
||||
from processing.steps import PipelineStep
|
||||
from ners.processing.batch.batch_config import BatchConfig
|
||||
from ners.processing.batch.batch_processor import BatchProcessor
|
||||
from ners.processing.steps import PipelineStep
|
||||
|
||||
|
||||
class Pipeline:
|
||||
@@ -8,9 +8,9 @@ from typing import List, Optional
|
||||
import pandas as pd
|
||||
from pydantic import BaseModel
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from core.utils.data_loader import DataLoader
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
from ners.core.utils.data_loader import DataLoader
|
||||
from ners.processing.batch.batch_config import BatchConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -38,7 +38,10 @@ class PipelineStep(ABC):
|
||||
"""Abstract base class for pipeline steps"""
|
||||
|
||||
def __init__(
|
||||
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
|
||||
self,
|
||||
name: str,
|
||||
pipeline_config: PipelineConfig,
|
||||
batch_config: Optional[BatchConfig] = None,
|
||||
):
|
||||
self.name = name
|
||||
self.pipeline_config = pipeline_config
|
||||
+3
-3
@@ -2,9 +2,9 @@ import logging
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from core.utils.text_cleaner import TextCleaner
|
||||
from processing.steps import PipelineStep
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
from ners.core.utils.text_cleaner import TextCleaner
|
||||
from ners.processing.steps import PipelineStep
|
||||
|
||||
|
||||
class DataCleaningStep(PipelineStep):
|
||||
+8
-4
@@ -2,8 +2,8 @@ import logging
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from processing.steps import PipelineStep
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
from ners.processing.steps import PipelineStep
|
||||
|
||||
|
||||
class DataSelectionStep(PipelineStep):
|
||||
@@ -31,8 +31,12 @@ class DataSelectionStep(PipelineStep):
|
||||
)
|
||||
|
||||
# Check which columns exist in the batch
|
||||
available_columns = [col for col in self.selected_columns if col in batch.columns]
|
||||
missing_columns = [col for col in self.selected_columns if col not in batch.columns]
|
||||
available_columns = [
|
||||
col for col in self.selected_columns if col in batch.columns
|
||||
]
|
||||
missing_columns = [
|
||||
col for col in self.selected_columns if col not in batch.columns
|
||||
]
|
||||
|
||||
if missing_columns:
|
||||
logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}")
|
||||
+14
-8
@@ -1,11 +1,11 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from core.utils.region_mapper import RegionMapper
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
from processing.steps import PipelineStep
|
||||
from processing.steps.feature_extraction_step import Gender
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
from ners.core.utils.region_mapper import RegionMapper
|
||||
from ners.processing.batch.batch_config import BatchConfig
|
||||
from ners.processing.steps import PipelineStep
|
||||
from ners.processing.steps.feature_extraction_step import Gender
|
||||
|
||||
|
||||
class DataSplittingStep(PipelineStep):
|
||||
@@ -26,7 +26,9 @@ class DataSplittingStep(PipelineStep):
|
||||
if self.eval_indices is None:
|
||||
np.random.seed(self.pipeline_config.data.random_seed)
|
||||
eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
|
||||
self.eval_indices = set(np.random.choice(total_size, size=eval_size, replace=False))
|
||||
self.eval_indices = set(
|
||||
np.random.choice(total_size, size=eval_size, replace=False)
|
||||
)
|
||||
return self.eval_indices
|
||||
|
||||
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||
@@ -45,7 +47,9 @@ class DataSplittingStep(PipelineStep):
|
||||
df_evaluation = df[eval_mask]
|
||||
df_featured = df[~eval_mask]
|
||||
|
||||
self.data_loader.save_csv(df_evaluation, data_dir / output_files["evaluation"])
|
||||
self.data_loader.save_csv(
|
||||
df_evaluation, data_dir / output_files["evaluation"]
|
||||
)
|
||||
self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
|
||||
else:
|
||||
self.data_loader.save_csv(df, data_dir / output_files["featured"])
|
||||
@@ -53,7 +57,9 @@ class DataSplittingStep(PipelineStep):
|
||||
if self.pipeline_config.data.split_by_province:
|
||||
for province in RegionMapper.get_provinces():
|
||||
df_region = df[df.province == province]
|
||||
self.data_loader.save_csv(df_region, data_dir / "provinces" / f"{province}.csv")
|
||||
self.data_loader.save_csv(
|
||||
df_region, data_dir / "provinces" / f"{province}.csv"
|
||||
)
|
||||
|
||||
if self.pipeline_config.data.split_by_gender:
|
||||
df_males = df[df.sex == Gender.MALE.value]
|
||||
+10
-6
@@ -5,10 +5,10 @@ from typing import Dict, Any
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from core.utils.region_mapper import RegionMapper
|
||||
from processing.ner.name_tagger import NameTagger
|
||||
from processing.steps import PipelineStep
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
from ners.core.utils.region_mapper import RegionMapper
|
||||
from ners.processing.ner.name_tagger import NameTagger
|
||||
from ners.processing.steps import PipelineStep
|
||||
|
||||
|
||||
class Gender(Enum):
|
||||
@@ -64,10 +64,14 @@ class FeatureExtractionStep(PipelineStep):
|
||||
|
||||
self._assign_probable_names(result)
|
||||
self._process_simple_names(result)
|
||||
result["identified_category"] = self._assign_identified_category(result["words"])
|
||||
result["identified_category"] = self._assign_identified_category(
|
||||
result["words"]
|
||||
)
|
||||
|
||||
if "year" in result.columns:
|
||||
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype("Int16")
|
||||
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype(
|
||||
"Int16"
|
||||
)
|
||||
|
||||
if "region" in result.columns:
|
||||
result["province"] = self.region_mapper.map(result["region"]).str.lower()
|
||||
+17
-10
@@ -7,12 +7,12 @@ import ollama
|
||||
import pandas as pd
|
||||
from pydantic import ValidationError
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from core.utils.prompt_manager import PromptManager
|
||||
from core.utils.rate_limiter import RateLimitConfig
|
||||
from core.utils.rate_limiter import RateLimiter
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
from processing.steps import PipelineStep, NameAnnotation
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
from ners.core.utils.prompt_manager import PromptManager
|
||||
from ners.core.utils.rate_limiter import RateLimitConfig
|
||||
from ners.core.utils.rate_limiter import RateLimiter
|
||||
from ners.processing.batch.batch_config import BatchConfig
|
||||
from ners.processing.steps import PipelineStep, NameAnnotation
|
||||
|
||||
|
||||
class LLMAnnotationStep(PipelineStep):
|
||||
@@ -24,7 +24,8 @@ class LLMAnnotationStep(PipelineStep):
|
||||
batch_config = BatchConfig(
|
||||
batch_size=pipeline_config.processing.batch_size,
|
||||
max_workers=min(
|
||||
self.llm_config.max_concurrent_requests, pipeline_config.processing.max_workers
|
||||
self.llm_config.max_concurrent_requests,
|
||||
pipeline_config.processing.max_workers,
|
||||
),
|
||||
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
||||
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
|
||||
@@ -33,7 +34,9 @@ class LLMAnnotationStep(PipelineStep):
|
||||
|
||||
self.prompt = PromptManager(pipeline_config).load_prompt()
|
||||
self.rate_limiter = (
|
||||
self._create_rate_limiter() if self.llm_config.enable_rate_limiting else None
|
||||
self._create_rate_limiter()
|
||||
if self.llm_config.enable_rate_limiting
|
||||
else None
|
||||
)
|
||||
|
||||
# Statistics
|
||||
@@ -76,7 +79,9 @@ class LLMAnnotationStep(PipelineStep):
|
||||
f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout"
|
||||
)
|
||||
|
||||
annotation = NameAnnotation.model_validate_json(response.message.content)
|
||||
annotation = NameAnnotation.model_validate_json(
|
||||
response.message.content
|
||||
)
|
||||
result = {
|
||||
**annotation.model_dump(),
|
||||
"annotated": 1,
|
||||
@@ -119,7 +124,9 @@ class LLMAnnotationStep(PipelineStep):
|
||||
logging.info(f"Batch {batch_id}: No entries to annotate")
|
||||
return batch
|
||||
|
||||
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM")
|
||||
logging.info(
|
||||
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM"
|
||||
)
|
||||
|
||||
batch = batch.copy()
|
||||
client = ollama.Client()
|
||||
+12
-6
@@ -5,9 +5,9 @@ from typing import Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from processing.ner.name_model import NameModel
|
||||
from processing.steps import PipelineStep, NameAnnotation
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
from ners.processing.ner.name_model import NameModel
|
||||
from ners.processing.steps import PipelineStep, NameAnnotation
|
||||
|
||||
|
||||
class NERAnnotationStep(PipelineStep):
|
||||
@@ -39,7 +39,9 @@ class NERAnnotationStep(PipelineStep):
|
||||
logging.info("NER model loaded successfully")
|
||||
else:
|
||||
logging.warning(f"NER model not found at {self.model_path}")
|
||||
logging.warning("NER annotation will be skipped. Train the model first.")
|
||||
logging.warning(
|
||||
"NER annotation will be skipped. Train the model first."
|
||||
)
|
||||
self.name_model.nlp = None
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to load NER model: {e}")
|
||||
@@ -80,7 +82,9 @@ class NERAnnotationStep(PipelineStep):
|
||||
# Create annotation result in same format as LLM step
|
||||
annotation = NameAnnotation(
|
||||
identified_name=" ".join(native_parts) if native_parts else None,
|
||||
identified_surname=" ".join(surname_parts) if surname_parts else None,
|
||||
identified_surname=" ".join(surname_parts)
|
||||
if surname_parts
|
||||
else None,
|
||||
)
|
||||
|
||||
result = {
|
||||
@@ -124,7 +128,9 @@ class NERAnnotationStep(PipelineStep):
|
||||
logging.info(f"Batch {batch_id}: No entries to annotate")
|
||||
return batch
|
||||
|
||||
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER")
|
||||
logging.info(
|
||||
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER"
|
||||
)
|
||||
|
||||
batch = batch.copy()
|
||||
|
||||
@@ -7,7 +7,7 @@ import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from research.experiment import ExperimentConfig
|
||||
from ners.research.experiment import ExperimentConfig
|
||||
|
||||
|
||||
class BaseModel(ABC):
|
||||
@@ -103,16 +103,25 @@ class BaseModel(ABC):
|
||||
feature_names = self._get_feature_names()
|
||||
return dict(zip(feature_names, coefficients))
|
||||
|
||||
elif hasattr(self.model, "named_steps") and "classifier" in self.model.named_steps:
|
||||
elif (
|
||||
hasattr(self.model, "named_steps")
|
||||
and "classifier" in self.model.named_steps
|
||||
):
|
||||
# For sklearn pipelines (like LogisticRegression with vectorizer)
|
||||
classifier = self.model.named_steps["classifier"]
|
||||
if hasattr(classifier, "coef_"):
|
||||
coefficients = np.abs(classifier.coef_[0])
|
||||
if hasattr(self.model.named_steps["vectorizer"], "get_feature_names_out"):
|
||||
feature_names = self.model.named_steps["vectorizer"].get_feature_names_out()
|
||||
if hasattr(
|
||||
self.model.named_steps["vectorizer"], "get_feature_names_out"
|
||||
):
|
||||
feature_names = self.model.named_steps[
|
||||
"vectorizer"
|
||||
].get_feature_names_out()
|
||||
# Take top features to avoid too many n-grams
|
||||
top_indices = np.argsort(coefficients)[-20:]
|
||||
return dict(zip(feature_names[top_indices], coefficients[top_indices]))
|
||||
return dict(
|
||||
zip(feature_names[top_indices], coefficients[top_indices])
|
||||
)
|
||||
|
||||
return None
|
||||
|
||||
@@ -143,7 +152,7 @@ class BaseModel(ABC):
|
||||
model_data = joblib.load(path)
|
||||
|
||||
# Recreate the model instance
|
||||
from research.experiment import ExperimentConfig
|
||||
from ners.research.experiment import ExperimentConfig
|
||||
|
||||
config = ExperimentConfig.from_dict(model_data["config"])
|
||||
instance = cls(config)
|
||||
@@ -221,7 +230,9 @@ class BaseModel(ABC):
|
||||
if "accuracy" in self.training_history:
|
||||
axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
|
||||
if "val_accuracy" in self.training_history:
|
||||
axes[0].plot(self.training_history["val_accuracy"], label="Validation Accuracy")
|
||||
axes[0].plot(
|
||||
self.training_history["val_accuracy"], label="Validation Accuracy"
|
||||
)
|
||||
axes[0].set_title("Model Accuracy")
|
||||
axes[0].set_xlabel("Epoch")
|
||||
axes[0].set_ylabel("Accuracy")
|
||||
@@ -18,7 +18,9 @@ class ExperimentConfig:
|
||||
tags: List[str] = field(default_factory=list)
|
||||
|
||||
# Model configuration
|
||||
model_type: str = "logistic_regression" # logistic_regression, lstm, transformer, etc.
|
||||
model_type: str = (
|
||||
"logistic_regression" # logistic_regression, lstm, transformer, etc.
|
||||
)
|
||||
model_params: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
# Feature configuration
|
||||
@@ -26,7 +28,9 @@ class ExperimentConfig:
|
||||
feature_params: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
# Data configuration
|
||||
train_data_filter: Optional[Dict[str, Any]] = None # Filter criteria for training data
|
||||
train_data_filter: Optional[Dict[str, Any]] = (
|
||||
None # Filter criteria for training data
|
||||
)
|
||||
test_data_filter: Optional[Dict[str, Any]] = None
|
||||
target_column: str = "sex"
|
||||
|
||||
@@ -36,7 +40,9 @@ class ExperimentConfig:
|
||||
cross_validation_folds: int = 5
|
||||
|
||||
# Evaluation configuration
|
||||
metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"])
|
||||
metrics: List[str] = field(
|
||||
default_factory=lambda: ["accuracy", "precision", "recall", "f1"]
|
||||
)
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""Convert to dictionary for serialization"""
|
||||
+4
-2
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field, asdict
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, List, Any
|
||||
|
||||
from research.experiment import ExperimentConfig, ExperimentStatus
|
||||
from ners.research.experiment import ExperimentConfig, ExperimentStatus
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -51,6 +51,8 @@ class ExperimentResult:
|
||||
"""Create from dictionary"""
|
||||
data["config"] = ExperimentConfig.from_dict(data["config"])
|
||||
data["start_time"] = datetime.fromisoformat(data["start_time"])
|
||||
data["end_time"] = datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
|
||||
data["end_time"] = (
|
||||
datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
|
||||
)
|
||||
data["status"] = ExperimentStatus(data["status"])
|
||||
return cls(**data)
|
||||
+6
-4
@@ -3,9 +3,9 @@ from typing import List, Dict
|
||||
|
||||
import yaml
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from research.experiment import ExperimentConfig
|
||||
from research.experiment.feature_extractor import FeatureType
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
from ners.research.experiment import ExperimentConfig
|
||||
from ners.research.experiment.feature_extractor import FeatureType
|
||||
|
||||
|
||||
class ExperimentBuilder:
|
||||
@@ -27,7 +27,9 @@ class ExperimentBuilder:
|
||||
raise
|
||||
|
||||
@classmethod
|
||||
def find_template(cls, templates: dict, name: str, experiment_type: str = "baseline") -> dict:
|
||||
def find_template(
|
||||
cls, templates: dict, name: str, experiment_type: str = "baseline"
|
||||
) -> dict:
|
||||
"""Find experiment configuration by name and type"""
|
||||
|
||||
# Map type to section in templates
|
||||
+37
-15
@@ -9,12 +9,16 @@ import pandas as pd
|
||||
from sklearn.metrics import confusion_matrix
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from core.config import PipelineConfig
|
||||
from core.utils.data_loader import DataLoader
|
||||
from research.base_model import BaseModel
|
||||
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
|
||||
from research.experiment.experiment_tracker import ExperimentTracker
|
||||
from research.model_registry import create_model
|
||||
from ners.core.config import PipelineConfig
|
||||
from ners.core.utils.data_loader import DataLoader
|
||||
from ners.research.base_model import BaseModel
|
||||
from ners.research.experiment import (
|
||||
ExperimentConfig,
|
||||
ExperimentStatus,
|
||||
calculate_metrics,
|
||||
)
|
||||
from ners.research.experiment.experiment_tracker import ExperimentTracker
|
||||
from ners.research.model_registry import create_model
|
||||
|
||||
|
||||
class ExperimentRunner:
|
||||
@@ -32,10 +36,14 @@ class ExperimentRunner:
|
||||
|
||||
try:
|
||||
logging.info(f"Starting experiment: {experiment_id}")
|
||||
self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
|
||||
self.tracker.update_experiment(
|
||||
experiment_id, status=ExperimentStatus.RUNNING
|
||||
)
|
||||
|
||||
# Load data
|
||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
||||
filepath = self.config.paths.get_data_path(
|
||||
self.config.data.output_files["featured"]
|
||||
)
|
||||
df = self.data_loader.load_csv_complete(filepath)
|
||||
|
||||
# Apply data filters if specified
|
||||
@@ -63,8 +71,12 @@ class ExperimentRunner:
|
||||
test_pred = model.predict(X_test)
|
||||
|
||||
# Calculate metrics
|
||||
train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics)
|
||||
test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics)
|
||||
train_metrics = calculate_metrics(
|
||||
y_train, train_pred, experiment_config.metrics
|
||||
)
|
||||
test_metrics = calculate_metrics(
|
||||
y_test, test_pred, experiment_config.metrics
|
||||
)
|
||||
|
||||
# Cross-validation if requested
|
||||
cv_metrics = {}
|
||||
@@ -125,7 +137,9 @@ class ExperimentRunner:
|
||||
experiment_ids = []
|
||||
|
||||
for i, config in enumerate(experiments):
|
||||
logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}")
|
||||
logging.info(
|
||||
f"Running experiment {i + 1}/{len(experiments)}: {config.name}"
|
||||
)
|
||||
try:
|
||||
exp_id = self.run_experiment(config)
|
||||
experiment_ids.append(exp_id)
|
||||
@@ -136,7 +150,9 @@ class ExperimentRunner:
|
||||
return experiment_ids
|
||||
|
||||
@classmethod
|
||||
def _apply_data_filters(cls, df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame:
|
||||
def _apply_data_filters(
|
||||
cls, df: pd.DataFrame, config: ExperimentConfig
|
||||
) -> pd.DataFrame:
|
||||
"""Apply data filters specified in experiment config"""
|
||||
filtered_df = df.copy()
|
||||
|
||||
@@ -148,9 +164,13 @@ class ExperimentRunner:
|
||||
filtered_df = filtered_df[filtered_df[column].isin(criteria)]
|
||||
elif isinstance(criteria, dict):
|
||||
if "min" in criteria:
|
||||
filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
|
||||
filtered_df = filtered_df[
|
||||
filtered_df[column] >= criteria["min"]
|
||||
]
|
||||
if "max" in criteria:
|
||||
filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
|
||||
filtered_df = filtered_df[
|
||||
filtered_df[column] <= criteria["max"]
|
||||
]
|
||||
else:
|
||||
filtered_df = filtered_df[filtered_df[column] == criteria]
|
||||
|
||||
@@ -231,7 +251,9 @@ class ExperimentRunner:
|
||||
return model
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to load model for experiment {experiment_id}: {e}")
|
||||
logging.error(
|
||||
f"Failed to load model for experiment {experiment_id}: {e}"
|
||||
)
|
||||
return None
|
||||
|
||||
return None
|
||||
+13
-6
@@ -6,9 +6,9 @@ from typing import Optional, Dict, List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core.config import PipelineConfig, get_config
|
||||
from research.experiment import ExperimentConfig, ExperimentStatus
|
||||
from research.experiment.experiement_result import ExperimentResult
|
||||
from ners.core.config import PipelineConfig, get_config
|
||||
from ners.research.experiment import ExperimentConfig, ExperimentStatus
|
||||
from ners.research.experiment.experiement_result import ExperimentResult
|
||||
|
||||
|
||||
class ExperimentTracker:
|
||||
@@ -97,7 +97,10 @@ class ExperimentTracker:
|
||||
return sorted(results, key=lambda x: x.start_time, reverse=True)
|
||||
|
||||
def get_best_experiment(
|
||||
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
|
||||
self,
|
||||
metric: str = "accuracy",
|
||||
dataset: str = "test",
|
||||
filters: Optional[Dict] = None,
|
||||
) -> Optional[ExperimentResult]:
|
||||
"""Get the best experiment based on a metric"""
|
||||
experiments = self.list_experiments()
|
||||
@@ -106,7 +109,9 @@ class ExperimentTracker:
|
||||
# Apply additional filters
|
||||
if "model_type" in filters:
|
||||
experiments = [
|
||||
e for e in experiments if e.config.model_type == filters["model_type"]
|
||||
e
|
||||
for e in experiments
|
||||
if e.config.model_type == filters["model_type"]
|
||||
]
|
||||
if "features" in filters:
|
||||
experiments = [
|
||||
@@ -118,7 +123,9 @@ class ExperimentTracker:
|
||||
valid_experiments = []
|
||||
for exp in experiments:
|
||||
if exp.status == ExperimentStatus.COMPLETED:
|
||||
metrics_dict = exp.test_metrics if dataset == "test" else exp.train_metrics
|
||||
metrics_dict = (
|
||||
exp.test_metrics if dataset == "test" else exp.train_metrics
|
||||
)
|
||||
if metric in metrics_dict:
|
||||
valid_experiments.append((exp, metrics_dict[metric]))
|
||||
|
||||
+3
-1
@@ -24,7 +24,9 @@ class FeatureType(Enum):
|
||||
class FeatureExtractor:
|
||||
"""Extract different types of features from name data"""
|
||||
|
||||
def __init__(self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None):
|
||||
def __init__(
|
||||
self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None
|
||||
):
|
||||
self.feature_types = feature_types
|
||||
self.feature_params = feature_params or {}
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
from typing import List
|
||||
|
||||
from research.base_model import BaseModel
|
||||
from research.experiment import ExperimentConfig
|
||||
from research.models.bigru_model import BiGRUModel
|
||||
from research.models.cnn_model import CNNModel
|
||||
from research.models.ensemble_model import EnsembleModel
|
||||
from research.models.lightgbm_model import LightGBMModel
|
||||
from research.models.logistic_regression_model import LogisticRegressionModel
|
||||
from research.models.lstm_model import LSTMModel
|
||||
from research.models.naive_bayes_model import NaiveBayesModel
|
||||
from research.models.random_forest_model import RandomForestModel
|
||||
from research.models.svm_model import SVMModel
|
||||
from research.models.transformer_model import TransformerModel
|
||||
from research.models.xgboost_model import XGBoostModel
|
||||
from ners.research.base_model import BaseModel
|
||||
from ners.research.experiment import ExperimentConfig
|
||||
from ners.research.models.bigru_model import BiGRUModel
|
||||
from ners.research.models.cnn_model import CNNModel
|
||||
from ners.research.models.ensemble_model import EnsembleModel
|
||||
from ners.research.models.lightgbm_model import LightGBMModel
|
||||
from ners.research.models.logistic_regression_model import LogisticRegressionModel
|
||||
from ners.research.models.lstm_model import LSTMModel
|
||||
from ners.research.models.naive_bayes_model import NaiveBayesModel
|
||||
from ners.research.models.random_forest_model import RandomForestModel
|
||||
from ners.research.models.svm_model import SVMModel
|
||||
from ners.research.models.transformer_model import TransformerModel
|
||||
from ners.research.models.xgboost_model import XGBoostModel
|
||||
|
||||
MODEL_REGISTRY = {
|
||||
"bigru": BiGRUModel,
|
||||
@@ -5,12 +5,12 @@ from typing import List, Dict, Any
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core.config import get_config
|
||||
from core.utils.data_loader import DataLoader
|
||||
from research.experiment import FeatureType, ExperimentConfig
|
||||
from research.experiment.experiment_runner import ExperimentRunner
|
||||
from research.experiment.experiment_tracker import ExperimentTracker
|
||||
from research.model_registry import MODEL_REGISTRY
|
||||
from ners.core.config import get_config
|
||||
from ners.core.utils.data_loader import DataLoader
|
||||
from ners.research.experiment import FeatureType, ExperimentConfig
|
||||
from ners.research.experiment.experiment_runner import ExperimentRunner
|
||||
from ners.research.experiment.experiment_tracker import ExperimentTracker
|
||||
from ners.research.model_registry import MODEL_REGISTRY
|
||||
|
||||
|
||||
class ModelTrainer:
|
||||
@@ -66,7 +66,9 @@ class ModelTrainer:
|
||||
if experiment and experiment.test_metrics:
|
||||
logging.info("Training completed successfully!")
|
||||
logging.info(f"Experiment ID: {experiment_id}")
|
||||
logging.info(f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}")
|
||||
logging.info(
|
||||
f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}"
|
||||
)
|
||||
logging.info(f"Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")
|
||||
|
||||
if save_artifacts:
|
||||
@@ -144,13 +146,17 @@ class ModelTrainer:
|
||||
|
||||
try:
|
||||
# Load data for learning curve generation
|
||||
data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
||||
data_path = self.config.paths.get_data_path(
|
||||
self.config.data.output_files["featured"]
|
||||
)
|
||||
if data_path.exists():
|
||||
df = self.data_loader.load_csv_complete(data_path)
|
||||
|
||||
# Generate learning curve
|
||||
logging.info("Generating learning curve...")
|
||||
trained_model.generate_learning_curve(df, df[experiment.config.target_column])
|
||||
trained_model.generate_learning_curve(
|
||||
df, df[experiment.config.target_column]
|
||||
)
|
||||
|
||||
# Plot and save learning curve
|
||||
learning_curve_path = model_dir / "learning_curve.png"
|
||||
@@ -187,8 +193,12 @@ class ModelTrainer:
|
||||
"model_path": str(model_path),
|
||||
"config_path": str(config_path),
|
||||
"results_path": str(results_path),
|
||||
"learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
|
||||
"training_history_plot": str(training_history_path) if training_history_path else None,
|
||||
"learning_curve_plot": str(learning_curve_path)
|
||||
if learning_curve_path
|
||||
else None,
|
||||
"training_history_plot": str(training_history_path)
|
||||
if training_history_path
|
||||
else None,
|
||||
"has_learning_curve": bool(trained_model.learning_curve_data),
|
||||
"has_training_history": bool(trained_model.training_history),
|
||||
}
|
||||
@@ -215,8 +225,12 @@ class ModelTrainer:
|
||||
"config_path": str(config_path),
|
||||
"results_path": str(results_path),
|
||||
"metadata_path": str(metadata_path),
|
||||
"learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
|
||||
"training_history_plot": str(training_history_path) if training_history_path else None,
|
||||
"learning_curve_plot": str(learning_curve_path)
|
||||
if learning_curve_path
|
||||
else None,
|
||||
"training_history_plot": str(training_history_path)
|
||||
if training_history_path
|
||||
else None,
|
||||
}
|
||||
|
||||
def load_trained_model(self, experiment_id: str):
|
||||
@@ -227,7 +241,9 @@ class ModelTrainer:
|
||||
model_path = model_dir / "complete_model.joblib"
|
||||
|
||||
if not model_path.exists():
|
||||
raise FileNotFoundError(f"Model artifacts not found for experiment {experiment_id}")
|
||||
raise FileNotFoundError(
|
||||
f"Model artifacts not found for experiment {experiment_id}"
|
||||
)
|
||||
|
||||
# Load the model class dynamically
|
||||
metadata_path = model_dir / "metadata.json"
|
||||
@@ -261,7 +277,9 @@ class ModelTrainer:
|
||||
metadata = json.load(f)
|
||||
models_data.append(metadata)
|
||||
except Exception as e:
|
||||
logging.warning(f"Could not read metadata for {model_dir.name}: {e}")
|
||||
logging.warning(
|
||||
f"Could not read metadata for {model_dir.name}: {e}"
|
||||
)
|
||||
|
||||
if not models_data:
|
||||
logging.info("No saved models found.")
|
||||
@@ -7,7 +7,7 @@ from tensorflow.keras.models import Sequential
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
|
||||
from research.neural_network_model import NeuralNetworkModel
|
||||
from ners.research.neural_network_model import NeuralNetworkModel
|
||||
|
||||
|
||||
class BiGRUModel(NeuralNetworkModel):
|
||||
@@ -53,7 +53,9 @@ class BiGRUModel(NeuralNetworkModel):
|
||||
)
|
||||
|
||||
model.compile(
|
||||
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
|
||||
loss="sparse_categorical_crossentropy",
|
||||
optimizer="adam",
|
||||
metrics=["accuracy"],
|
||||
)
|
||||
return model
|
||||
|
||||
@@ -15,7 +15,7 @@ from tensorflow.keras.models import Sequential
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
|
||||
from research.neural_network_model import NeuralNetworkModel
|
||||
from ners.research.neural_network_model import NeuralNetworkModel
|
||||
|
||||
|
||||
class CNNModel(NeuralNetworkModel):
|
||||
@@ -29,7 +29,9 @@ class CNNModel(NeuralNetworkModel):
|
||||
[
|
||||
# Learn char/subword embeddings; spatial dropout regularizes across channels
|
||||
# to make the model robust to noisy characters and transliteration.
|
||||
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
|
||||
Embedding(
|
||||
input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)
|
||||
),
|
||||
SpatialDropout1D(rate=params.get("embedding_dropout", 0.1)),
|
||||
# Small kernels capture short n-gram like patterns; padding='same' keeps
|
||||
# sequence length stable for simpler pooling behavior.
|
||||
@@ -59,7 +61,9 @@ class CNNModel(NeuralNetworkModel):
|
||||
)
|
||||
|
||||
model.compile(
|
||||
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
|
||||
loss="sparse_categorical_crossentropy",
|
||||
optimizer="adam",
|
||||
metrics=["accuracy"],
|
||||
)
|
||||
return model
|
||||
|
||||
@@ -75,6 +79,8 @@ class CNNModel(NeuralNetworkModel):
|
||||
self.tokenizer.fit_on_texts(text_data)
|
||||
|
||||
sequences = self.tokenizer.texts_to_sequences(text_data)
|
||||
max_len = self.config.model_params.get("max_len", 20) # Longer for character level
|
||||
max_len = self.config.model_params.get(
|
||||
"max_len", 20
|
||||
) # Longer for character level
|
||||
|
||||
return pad_sequences(sequences, maxlen=max_len, padding="post")
|
||||
@@ -8,8 +8,8 @@ from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
||||
from research.experiment import ExperimentConfig
|
||||
from research.traditional_model import TraditionalModel
|
||||
from ners.research.experiment import ExperimentConfig
|
||||
from ners.research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class EnsembleModel(TraditionalModel):
|
||||
@@ -40,22 +40,28 @@ class EnsembleModel(TraditionalModel):
|
||||
[
|
||||
(
|
||||
"vectorizer",
|
||||
CountVectorizer(analyzer="char", ngram_range=(2, 4), max_features=5000),
|
||||
CountVectorizer(
|
||||
analyzer="char", ngram_range=(2, 4), max_features=5000
|
||||
),
|
||||
),
|
||||
(
|
||||
"classifier",
|
||||
LogisticRegression(max_iter=1000, random_state=self.config.random_seed),
|
||||
LogisticRegression(
|
||||
max_iter=1000, random_state=self.config.random_seed
|
||||
),
|
||||
),
|
||||
]
|
||||
)
|
||||
estimators.append((f"logistic_regression", model))
|
||||
estimators.append(("logistic_regression", model))
|
||||
|
||||
elif model_type == "random_forest":
|
||||
model = Pipeline(
|
||||
[
|
||||
(
|
||||
"vectorizer",
|
||||
TfidfVectorizer(analyzer="char", ngram_range=(2, 3), max_features=3000),
|
||||
TfidfVectorizer(
|
||||
analyzer="char", ngram_range=(2, 3), max_features=3000
|
||||
),
|
||||
),
|
||||
(
|
||||
"classifier",
|
||||
@@ -65,19 +71,21 @@ class EnsembleModel(TraditionalModel):
|
||||
),
|
||||
]
|
||||
)
|
||||
estimators.append((f"rf", model))
|
||||
estimators.append(("rf", model))
|
||||
|
||||
elif model_type == "naive_bayes":
|
||||
model = Pipeline(
|
||||
[
|
||||
(
|
||||
"vectorizer",
|
||||
CountVectorizer(analyzer="char", ngram_range=(1, 3), max_features=4000),
|
||||
CountVectorizer(
|
||||
analyzer="char", ngram_range=(1, 3), max_features=4000
|
||||
),
|
||||
),
|
||||
("classifier", MultinomialNB()),
|
||||
]
|
||||
)
|
||||
estimators.append((f"nb", model))
|
||||
estimators.append(("nb", model))
|
||||
|
||||
# Soft voting averages probabilities (preferred when members are calibrated);
|
||||
# hard voting uses majority class. Parallelize member predictions.
|
||||
@@ -5,7 +5,7 @@ from sklearn.base import BaseEstimator
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from research.traditional_model import TraditionalModel
|
||||
from ners.research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class LightGBMModel(TraditionalModel):
|
||||
@@ -106,7 +106,9 @@ class LightGBMModel(TraditionalModel):
|
||||
lambda x: x if x in known_classes else default_class
|
||||
)
|
||||
|
||||
encoded = self.label_encoders[feature_key].transform(column_mapped)
|
||||
encoded = self.label_encoders[feature_key].transform(
|
||||
column_mapped
|
||||
)
|
||||
|
||||
features.append(encoded.reshape(-1, 1))
|
||||
|
||||
+1
-1
@@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
||||
from research.traditional_model import TraditionalModel
|
||||
from ners.research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class LogisticRegressionModel(TraditionalModel):
|
||||
@@ -7,7 +7,7 @@ from tensorflow.keras.models import Sequential
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
|
||||
from research.neural_network_model import NeuralNetworkModel
|
||||
from ners.research.neural_network_model import NeuralNetworkModel
|
||||
|
||||
|
||||
class LSTMModel(NeuralNetworkModel):
|
||||
@@ -50,7 +50,9 @@ class LSTMModel(NeuralNetworkModel):
|
||||
)
|
||||
|
||||
model.compile(
|
||||
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
|
||||
loss="sparse_categorical_crossentropy",
|
||||
optimizer="adam",
|
||||
metrics=["accuracy"],
|
||||
)
|
||||
return model
|
||||
|
||||
+1
-1
@@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
||||
from research.traditional_model import TraditionalModel
|
||||
from ners.research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class NaiveBayesModel(TraditionalModel):
|
||||
+7
-4
@@ -6,7 +6,7 @@ from sklearn.base import BaseEstimator
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from research.traditional_model import TraditionalModel
|
||||
from ners.research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class RandomForestModel(TraditionalModel):
|
||||
@@ -18,7 +18,6 @@ class RandomForestModel(TraditionalModel):
|
||||
self.label_encoders: Dict[str, LabelEncoder] = {}
|
||||
|
||||
def build_model(self) -> BaseEstimator:
|
||||
|
||||
params = self.config.model_params
|
||||
|
||||
# Tree ensemble is robust to mixed numeric/categorical encodings; parallelize
|
||||
@@ -56,10 +55,14 @@ class RandomForestModel(TraditionalModel):
|
||||
column_clean = column.fillna("unknown").astype(str)
|
||||
known_classes = set(encoder.classes_)
|
||||
default_class = (
|
||||
"unknown" if "unknown" in known_classes else encoder.classes_[0]
|
||||
"unknown"
|
||||
if "unknown" in known_classes
|
||||
else encoder.classes_[0]
|
||||
)
|
||||
column_mapped = column_clean.apply(
|
||||
lambda value: value if value in known_classes else default_class
|
||||
lambda value: value
|
||||
if value in known_classes
|
||||
else default_class
|
||||
)
|
||||
encoded = encoder.transform(column_mapped)
|
||||
|
||||
@@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.svm import SVC
|
||||
|
||||
from research.traditional_model import TraditionalModel
|
||||
from ners.research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class SVMModel(TraditionalModel):
|
||||
+12
-5
@@ -16,7 +16,7 @@ from tensorflow.keras.models import Model
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
|
||||
from research.neural_network_model import NeuralNetworkModel
|
||||
from ners.research.neural_network_model import NeuralNetworkModel
|
||||
|
||||
|
||||
class TransformerModel(NeuralNetworkModel):
|
||||
@@ -37,7 +37,8 @@ class TransformerModel(NeuralNetworkModel):
|
||||
# Add positional encoding
|
||||
positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
|
||||
pos_embedding = Embedding(
|
||||
input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64)
|
||||
input_dim=params.get("max_len", 8),
|
||||
output_dim=params.get("embedding_dim", 64),
|
||||
)(positions)
|
||||
x = x + pos_embedding
|
||||
|
||||
@@ -49,7 +50,9 @@ class TransformerModel(NeuralNetworkModel):
|
||||
|
||||
model = Model(inputs, outputs)
|
||||
model.compile(
|
||||
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
|
||||
optimizer="adam",
|
||||
loss="sparse_categorical_crossentropy",
|
||||
metrics=["accuracy"],
|
||||
)
|
||||
return model
|
||||
|
||||
@@ -62,11 +65,15 @@ class TransformerModel(NeuralNetworkModel):
|
||||
key_dim=cfg_params.get("transformer_head_size", 64),
|
||||
dropout=cfg_params.get("attn_dropout", 0.1),
|
||||
)(x, x)
|
||||
x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(attn))
|
||||
x = LayerNormalization(epsilon=1e-6)(
|
||||
x + Dropout(cfg_params.get("dropout", 0.1))(attn)
|
||||
)
|
||||
|
||||
ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
|
||||
ff = Dense(x.shape[-1])(ff)
|
||||
return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(ff))
|
||||
return LayerNormalization(epsilon=1e-6)(
|
||||
x + Dropout(cfg_params.get("dropout", 0.1))(ff)
|
||||
)
|
||||
|
||||
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
||||
text_data = self._collect_text_corpus(X)
|
||||
@@ -5,7 +5,7 @@ from sklearn.base import BaseEstimator
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from research.traditional_model import TraditionalModel
|
||||
from ners.research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class XGBoostModel(TraditionalModel):
|
||||
@@ -106,7 +106,9 @@ class XGBoostModel(TraditionalModel):
|
||||
lambda x: x if x in known_classes else default_class
|
||||
)
|
||||
|
||||
encoded = self.label_encoders[feature_key].transform(column_mapped)
|
||||
encoded = self.label_encoders[feature_key].transform(
|
||||
column_mapped
|
||||
)
|
||||
|
||||
features.append(encoded.reshape(-1, 1))
|
||||
|
||||
@@ -10,8 +10,10 @@ from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from research.base_model import BaseModel
|
||||
from research.experiment.feature_extractor import FeatureExtractor
|
||||
import tensorflow as tf
|
||||
|
||||
from ners.research.base_model import BaseModel
|
||||
from ners.research.experiment.feature_extractor import FeatureExtractor
|
||||
|
||||
|
||||
class NeuralNetworkModel(BaseModel):
|
||||
@@ -34,8 +36,6 @@ class NeuralNetworkModel(BaseModel):
|
||||
# - Enables memory growth to avoid pre-allocating all VRAM
|
||||
# - Optionally enables mixed precision if requested via model params
|
||||
try:
|
||||
import tensorflow as tf # Imported lazily to avoid dependency for non-NN runs
|
||||
|
||||
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
|
||||
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
|
||||
|
||||
@@ -49,15 +49,15 @@ class NeuralNetworkModel(BaseModel):
|
||||
|
||||
if enable_mixed:
|
||||
try:
|
||||
from tensorflow.keras import mixed_precision
|
||||
|
||||
mixed_precision.set_global_policy("mixed_float16")
|
||||
tf.keras.mixed_precision.set_global_policy("mixed_float16")
|
||||
logging.info("Enabled TensorFlow mixed precision (float16)")
|
||||
except Exception as e:
|
||||
logging.warning(f"Could not enable mixed precision: {e}")
|
||||
else:
|
||||
if requested_gpu:
|
||||
logging.warning("Requested GPU but no TensorFlow GPU device is available.")
|
||||
logging.warning(
|
||||
"Requested GPU but no TensorFlow GPU device is available."
|
||||
)
|
||||
except Exception as e:
|
||||
# Keep silent in non-TF environments / non-NN workflows
|
||||
logging.debug(f"TensorFlow GPU setup skipped: {e}")
|
||||
@@ -86,7 +86,9 @@ class NeuralNetworkModel(BaseModel):
|
||||
logging.info(f"Vocabulary size: {vocab_size}")
|
||||
|
||||
# Get additional model parameters
|
||||
self.model = self.build_model_with_vocab(vocab_size=vocab_size, **self.config.model_params)
|
||||
self.model = self.build_model_with_vocab(
|
||||
vocab_size=vocab_size, **self.config.model_params
|
||||
)
|
||||
|
||||
# Train the neural network
|
||||
logging.info(
|
||||
@@ -143,7 +145,7 @@ class NeuralNetworkModel(BaseModel):
|
||||
# Keep zeros (padding) untouched; clamp negatives and > max_idx to OOV
|
||||
invalid_mask = (arr < 0) | (arr > max_idx)
|
||||
# Avoid turning zeros into OOV
|
||||
invalid_mask &= (arr != 0)
|
||||
invalid_mask &= arr != 0
|
||||
if invalid_mask.any():
|
||||
arr[invalid_mask] = oov_index
|
||||
|
||||
@@ -157,10 +159,14 @@ class NeuralNetworkModel(BaseModel):
|
||||
"""Combine configured textual features into one string per record."""
|
||||
|
||||
column_names = [
|
||||
feature.value for feature in self.config.features if feature.value in X.columns
|
||||
feature.value
|
||||
for feature in self.config.features
|
||||
if feature.value in X.columns
|
||||
]
|
||||
if not column_names:
|
||||
raise ValueError("No configured text features found in the provided DataFrame.")
|
||||
raise ValueError(
|
||||
"No configured text features found in the provided DataFrame."
|
||||
)
|
||||
|
||||
text_frame = X[column_names].fillna("").astype(str)
|
||||
|
||||
@@ -193,9 +199,7 @@ class NeuralNetworkModel(BaseModel):
|
||||
pass
|
||||
if enable_mixed:
|
||||
try:
|
||||
from tensorflow.keras import mixed_precision
|
||||
|
||||
mixed_precision.set_global_policy("mixed_float16")
|
||||
tf.keras.mixed_precision.set_global_policy("mixed_float16")
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
@@ -208,7 +212,9 @@ class NeuralNetworkModel(BaseModel):
|
||||
X_prepared = self._sanitize_sequences(X_prepared)
|
||||
y_encoded = self.label_encoder.transform(y)
|
||||
|
||||
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed)
|
||||
cv = StratifiedKFold(
|
||||
n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
|
||||
)
|
||||
|
||||
accuracies = []
|
||||
precisions = []
|
||||
@@ -280,14 +286,14 @@ class NeuralNetworkModel(BaseModel):
|
||||
pass
|
||||
if enable_mixed:
|
||||
try:
|
||||
from tensorflow.keras import mixed_precision
|
||||
|
||||
mixed_precision.set_global_policy("mixed_float16")
|
||||
tf.keras.mixed_precision.set_global_policy("mixed_float16")
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
if requested_gpu:
|
||||
logging.warning("Requested GPU for learning curve but none is available.")
|
||||
logging.warning(
|
||||
"Requested GPU for learning curve but none is available."
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -342,7 +348,7 @@ class NeuralNetworkModel(BaseModel):
|
||||
|
||||
# Train model
|
||||
if hasattr(model, "fit"):
|
||||
history = model.fit(
|
||||
model.fit(
|
||||
X_train_subset,
|
||||
y_train_subset,
|
||||
epochs=self.config.model_params.get("epochs", 10),
|
||||
@@ -3,12 +3,16 @@ import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
from research.statistics.utils import LETTERS, build_letter_frequencies
|
||||
from ners.research.statistics.utils import LETTERS, build_letter_frequencies
|
||||
|
||||
|
||||
def plot_transition_matrix(ax, df_probs, title=""):
|
||||
hm = sns.heatmap(
|
||||
df_probs.loc[list(LETTERS), list(LETTERS)], cmap="Reds", annot=False, cbar=False, ax=ax
|
||||
df_probs.loc[list(LETTERS), list(LETTERS)],
|
||||
cmap="Reds",
|
||||
annot=False,
|
||||
cbar=False,
|
||||
ax=ax,
|
||||
)
|
||||
ax.set_title(title, fontsize=12)
|
||||
return hm
|
||||
@@ -31,8 +35,12 @@ def plot_letter_frequencies(males, females, sort_values=False, title=None):
|
||||
x = np.arange(len(df_plot))
|
||||
w = 0.4
|
||||
fig, ax = plt.subplots(figsize=(16, 6))
|
||||
ax.bar(x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8)
|
||||
ax.bar(x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8)
|
||||
ax.bar(
|
||||
x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8
|
||||
)
|
||||
ax.bar(
|
||||
x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8
|
||||
)
|
||||
|
||||
ax.set_xticks(x)
|
||||
ax.set_xticklabels(df_plot["letter"])
|
||||
@@ -5,8 +5,6 @@ import numpy as np
|
||||
import pandas as pd
|
||||
from scipy.spatial.distance import euclidean
|
||||
from scipy.stats import entropy
|
||||
from scipy.spatial.distance import euclidean
|
||||
from scipy.stats import entropy
|
||||
from typing import Dict, Any
|
||||
|
||||
LETTERS = "abcdefghijklmnopqrstuvwxyz"
|
||||
@@ -49,7 +47,12 @@ def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFram
|
||||
|
||||
def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
|
||||
# Normalize: lowercase, remove non-letters, concatenate all into one string
|
||||
s = series.astype(str).str.lower().str.replace(r"[^a-z]", "", regex=True).str.cat(sep="")
|
||||
s = (
|
||||
series.astype(str)
|
||||
.str.lower()
|
||||
.str.replace(r"[^a-z]", "", regex=True)
|
||||
.str.cat(sep="")
|
||||
)
|
||||
|
||||
# Convert string into Series of characters
|
||||
chars = pd.Series(list(s))
|
||||
@@ -150,8 +153,12 @@ def build_transition_comparisons(
|
||||
kl_names_mf = entropy(prepared_names["m"] + 1e-12, prepared_names["f"] + 1e-12)
|
||||
kl_names_fm = entropy(prepared_names["f"] + 1e-12, prepared_names["m"] + 1e-12)
|
||||
|
||||
kl_surnames_mf = entropy(prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12)
|
||||
kl_surnames_fm = entropy(prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12)
|
||||
kl_surnames_mf = entropy(
|
||||
prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12
|
||||
)
|
||||
kl_surnames_fm = entropy(
|
||||
prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12
|
||||
)
|
||||
|
||||
jsd_names = 0.5 * (kl_names_mf + kl_names_fm)
|
||||
jsd_surnames = 0.5 * (kl_surnames_mf + kl_surnames_fm)
|
||||
@@ -163,7 +170,9 @@ def build_transition_comparisons(
|
||||
P_f = transitions["f"]["probs"].flatten()
|
||||
|
||||
# Calculate the observed JSD (our test statistic)
|
||||
observed_jsd = 0.5 * (entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12))
|
||||
observed_jsd = 0.5 * (
|
||||
entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12)
|
||||
)
|
||||
|
||||
# Concatenate male and female counts
|
||||
counts_m = transitions["m"]["counts"]
|
||||
@@ -194,10 +203,12 @@ def build_transition_comparisons(
|
||||
|
||||
permuted_jsd = 0.5 * (
|
||||
entropy(
|
||||
permuted_probs_m.mean(axis=1) + 1e-12, permuted_probs_f.mean(axis=1) + 1e-12
|
||||
permuted_probs_m.mean(axis=1) + 1e-12,
|
||||
permuted_probs_f.mean(axis=1) + 1e-12,
|
||||
)
|
||||
+ entropy(
|
||||
permuted_probs_f.mean(axis=1) + 1e-12, permuted_probs_m.mean(axis=1) + 1e-12
|
||||
permuted_probs_f.mean(axis=1) + 1e-12,
|
||||
permuted_probs_m.mean(axis=1) + 1e-12,
|
||||
)
|
||||
)
|
||||
permuted_jsds.append(permuted_jsd)
|
||||
@@ -8,8 +8,8 @@ from sklearn.model_selection import StratifiedKFold, cross_val_score
|
||||
from sklearn.model_selection import learning_curve
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from research.base_model import BaseModel
|
||||
from research.experiment.feature_extractor import FeatureExtractor
|
||||
from ners.research.base_model import BaseModel
|
||||
from ners.research.experiment.feature_extractor import FeatureExtractor
|
||||
|
||||
|
||||
class TraditionalModel(BaseModel):
|
||||
@@ -52,7 +52,9 @@ class TraditionalModel(BaseModel):
|
||||
# Train model
|
||||
if len(X_prepared.shape) == 1:
|
||||
# For text-based features (like LogisticRegression with vectorization)
|
||||
logging.info(f"Fitting model with {X_prepared.shape[0]} samples (text features)")
|
||||
logging.info(
|
||||
f"Fitting model with {X_prepared.shape[0]} samples (text features)"
|
||||
)
|
||||
else:
|
||||
# For numerical features
|
||||
logging.info(
|
||||
@@ -74,12 +76,16 @@ class TraditionalModel(BaseModel):
|
||||
|
||||
return self
|
||||
|
||||
def cross_validate(self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> Dict[str, float]:
|
||||
def cross_validate(
|
||||
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
|
||||
) -> Dict[str, float]:
|
||||
features_df = self.feature_extractor.extract_features(X)
|
||||
X_prepared = self.prepare_features(features_df)
|
||||
y_encoded = self.label_encoder.transform(y)
|
||||
|
||||
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed)
|
||||
cv = StratifiedKFold(
|
||||
n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
|
||||
)
|
||||
|
||||
# Calculate different metrics
|
||||
results = {}
|
||||
@@ -95,7 +101,11 @@ class TraditionalModel(BaseModel):
|
||||
for metric in ["precision", "recall", "f1"]:
|
||||
if metric in self.config.metrics:
|
||||
scores = cross_val_score(
|
||||
self.model, X_prepared, y_encoded, cv=cv, scoring=f"{metric}_weighted"
|
||||
self.model,
|
||||
X_prepared,
|
||||
y_encoded,
|
||||
cv=cv,
|
||||
scoring=f"{metric}_weighted",
|
||||
)
|
||||
results[metric] = scores.mean()
|
||||
results[f"{metric}_std"] = scores.std()
|
||||
Executable
+46
@@ -0,0 +1,46 @@
|
||||
#!.venv/bin/python3
|
||||
import logging
|
||||
import traceback
|
||||
|
||||
from ners.core.config import setup_config
|
||||
from ners.research.experiment.experiment_builder import ExperimentBuilder
|
||||
from ners.research.model_trainer import ModelTrainer
|
||||
|
||||
|
||||
def train_from_template(
|
||||
name: str,
|
||||
type: str,
|
||||
*,
|
||||
templates: str = "research_templates.yaml",
|
||||
config: str | None = None,
|
||||
env: str = "development",
|
||||
) -> int:
|
||||
try:
|
||||
cfg = setup_config(config_path=config, env=env)
|
||||
experiment_builder = ExperimentBuilder(cfg)
|
||||
|
||||
logging.info(f"Loading research templates from: {templates}")
|
||||
tmpl = experiment_builder.load_templates(templates)
|
||||
|
||||
logging.info(f"Looking for experiment: name='{name}', type='{type}'")
|
||||
experiment_config = experiment_builder.find_template(tmpl, name, type)
|
||||
|
||||
logging.info(f"Found experiment: {experiment_config.get('name')}")
|
||||
logging.info(f"Description: {experiment_config.get('description')}")
|
||||
logging.info(f"Features: {experiment_config.get('features')}")
|
||||
|
||||
trainer = ModelTrainer(cfg)
|
||||
trainer.train_single_model(
|
||||
model_name=experiment_config.get("name"),
|
||||
model_type=experiment_config.get("model_type"),
|
||||
features=experiment_config.get("features"),
|
||||
model_params=experiment_config.get("model_params", {}),
|
||||
tags=experiment_config.get("tags", []),
|
||||
)
|
||||
|
||||
logging.info("Training completed successfully!")
|
||||
return 0
|
||||
except Exception as e:
|
||||
logging.error(f"Training failed: {e}")
|
||||
traceback.print_exc()
|
||||
return 1
|
||||
@@ -1,19 +1,13 @@
|
||||
#!.venv/bin/python3
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
import streamlit as st
|
||||
|
||||
# Add parent directory to Python path to access core modules
|
||||
parent_dir = Path(__file__).parent.parent
|
||||
sys.path.insert(0, str(parent_dir))
|
||||
|
||||
from core.config import setup_config, PipelineConfig
|
||||
from core.utils.data_loader import DataLoader
|
||||
from processing.monitoring.pipeline_monitor import PipelineMonitor
|
||||
from research.experiment.experiment_runner import ExperimentRunner
|
||||
from research.experiment.experiment_tracker import ExperimentTracker
|
||||
from ners.core.config import setup_config, PipelineConfig
|
||||
from ners.core.utils.data_loader import DataLoader
|
||||
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
|
||||
from ners.research.experiment.experiment_runner import ExperimentRunner
|
||||
from ners.research.experiment.experiment_tracker import ExperimentTracker
|
||||
|
||||
# Page configuration
|
||||
st.set_page_config(
|
||||
@@ -65,19 +59,9 @@ class StreamlitApp:
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="DRC NERS Platform",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
parser.add_argument("--config", type=str, help="Path to configuration file")
|
||||
parser.add_argument("--env", type=str, default="development", help="Environment name")
|
||||
args = parser.parse_args()
|
||||
|
||||
config = setup_config(args.config, env=args.env)
|
||||
app = StreamlitApp(config)
|
||||
app.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
# Initialize app using environment variables when launched via Typer
|
||||
_config_path = os.environ.get("NERS_CONFIG")
|
||||
_env = os.environ.get("NERS_ENV", "development")
|
||||
_cfg = setup_config(_config_path, env=_env)
|
||||
_app = StreamlitApp(_cfg)
|
||||
_app.run()
|
||||
@@ -1,7 +1,7 @@
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
|
||||
|
||||
|
||||
@st.cache_data
|
||||
@@ -25,7 +25,9 @@ class Dashboard:
|
||||
|
||||
# Load basic statistics
|
||||
try:
|
||||
data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
||||
data_path = self.config.paths.get_data_path(
|
||||
self.config.data.output_files["featured"]
|
||||
)
|
||||
if data_path.exists():
|
||||
df = load_dataset(str(data_path))
|
||||
|
||||
@@ -37,13 +39,17 @@ class Dashboard:
|
||||
st.metric("Annotated Names", f"{annotated:,}")
|
||||
|
||||
with col3:
|
||||
provinces = df["province"].nunique() if "province" in df.columns else 0
|
||||
provinces = (
|
||||
df["province"].nunique() if "province" in df.columns else 0
|
||||
)
|
||||
st.metric("Provinces", provinces)
|
||||
|
||||
with col4:
|
||||
if "sex" in df.columns:
|
||||
gender_dist = df["sex"].value_counts()
|
||||
ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
|
||||
ratio = gender_dist.get("f", 0) / max(
|
||||
gender_dist.get("m", 1), 1
|
||||
)
|
||||
st.metric("F/M Rate", f"{ratio:.2%}")
|
||||
with col5:
|
||||
if "annotated" in df.columns:
|
||||
@@ -79,4 +85,6 @@ class Dashboard:
|
||||
|
||||
st.dataframe(pd.DataFrame(exp_data), use_container_width=True)
|
||||
else:
|
||||
st.info("No experiments found. Create your first experiment in the Experiments tab!")
|
||||
st.info(
|
||||
"No experiments found. Create your first experiment in the Experiments tab!"
|
||||
)
|
||||
@@ -0,0 +1,52 @@
|
||||
from datetime import datetime
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
|
||||
|
||||
|
||||
@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
    """Load a CSV with the project's optimized dtypes.

    Cached by Streamlit; on any read failure the error is surfaced in the
    UI and an empty DataFrame is returned so callers need no try/except.
    """
    try:
        frame = pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame()
    return frame
|
||||
|
||||
|
||||
class DataOverview:
    """Streamlit page summarising the configured data files."""

    def __init__(self, config):
        # Pipeline configuration providing data paths and file names.
        self.config = config

    def index(self):
        """Render availability/size stats per data file and preview the featured dataset."""
        st.title("Data Overview")
        output_files = self.config.data.output_files
        data_files = {
            "Names": self.config.data.input_file,
            "Featured Dataset": output_files["featured"],
            "Evaluation Dataset": output_files["evaluation"],
            "Male Names": output_files["males"],
            "Female Names": output_files["females"],
        }

        st.write("Available Data Files:")
        for label, relative_path in data_files.items():
            path = self.config.paths.get_data_path(relative_path)
            if path.exists():
                size = path.stat().st_size
                stats = (
                    f"Size: {size / (1024 * 1024):.1f} MB, "
                    f"Last Modified: {datetime.fromtimestamp(path.stat().st_mtime)}"
                )
            else:
                stats = "Not found"
            st.write(f"- {label}: {path} ({stats})")

        # Preview featured dataset if available
        featured_path = self.config.paths.get_data_path(output_files["featured"])
        if featured_path.exists():
            frame = load_dataset(str(featured_path))
            st.subheader("Featured Dataset Preview")
            st.dataframe(frame.head(), use_container_width=True)
            st.write(f"Rows: {len(frame):,}")
|
||||
@@ -2,8 +2,8 @@ import pandas as pd
|
||||
import plotly.express as px
|
||||
import streamlit as st
|
||||
|
||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||
from web.interfaces.log_reader import LogReader
|
||||
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
|
||||
from ners.web.interfaces.log_reader import LogReader
|
||||
|
||||
|
||||
@st.cache_data
|
||||
@@ -31,7 +31,9 @@ class DataProcessing:
|
||||
|
||||
# Step details
|
||||
for step_name, step_status in status["steps"].items():
|
||||
with st.expander(f"{step_name.replace('_', ' ').title()} - {step_status['status']}"):
|
||||
with st.expander(
|
||||
f"{step_name.replace('_', ' ').title()} - {step_status['status']}"
|
||||
):
|
||||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
@@ -63,14 +65,20 @@ class DataProcessing:
|
||||
|
||||
with col2:
|
||||
num_entries = st.number_input(
|
||||
"Number of entries", min_value=5, max_value=50, value=10, key="num_log_entries"
|
||||
"Number of entries",
|
||||
min_value=5,
|
||||
max_value=50,
|
||||
value=10,
|
||||
key="num_log_entries",
|
||||
)
|
||||
|
||||
# Get log entries based on filter
|
||||
if log_level_filter == "All":
|
||||
log_entries = log_reader.read_last_entries(num_entries)
|
||||
else:
|
||||
log_entries = log_reader.read_entries_by_level(log_level_filter, num_entries)
|
||||
log_entries = log_reader.read_entries_by_level(
|
||||
log_level_filter, num_entries
|
||||
)
|
||||
|
||||
if log_entries:
|
||||
for entry in log_entries:
|
||||
@@ -2,13 +2,13 @@ from typing import List, Dict
|
||||
|
||||
import streamlit as st
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from research.experiment import ExperimentConfig, ExperimentStatus
|
||||
from research.experiment.experiment_builder import ExperimentBuilder
|
||||
from research.experiment.experiment_runner import ExperimentRunner
|
||||
from research.experiment.experiment_tracker import ExperimentTracker
|
||||
from research.experiment.feature_extractor import FeatureType
|
||||
from research.model_registry import list_available_models
|
||||
from ners.core.config.pipeline_config import PipelineConfig
|
||||
from ners.research.experiment import ExperimentConfig, ExperimentStatus
|
||||
from ners.research.experiment.experiment_builder import ExperimentBuilder
|
||||
from ners.research.experiment.experiment_runner import ExperimentRunner
|
||||
from ners.research.experiment.experiment_tracker import ExperimentTracker
|
||||
from ners.research.experiment.feature_extractor import FeatureType
|
||||
from ners.research.model_registry import list_available_models
|
||||
|
||||
|
||||
class Experiments:
|
||||
@@ -46,13 +46,19 @@ class Experiments:
|
||||
available_experiments = self.experiment_builder.get_templates()
|
||||
|
||||
# Create tabs for different experiment types
|
||||
exp_tabs = st.tabs(["Baseline", "Advanced", "Feature Studies", "Hyperparameter Tuning"])
|
||||
exp_tabs = st.tabs(
|
||||
["Baseline", "Advanced", "Feature Studies", "Hyperparameter Tuning"]
|
||||
)
|
||||
|
||||
with exp_tabs[0]:
|
||||
self._show_experiments_by_type(available_experiments["baseline"], "baseline")
|
||||
self._show_experiments_by_type(
|
||||
available_experiments["baseline"], "baseline"
|
||||
)
|
||||
|
||||
with exp_tabs[1]:
|
||||
self._show_experiments_by_type(available_experiments["advanced"], "advanced")
|
||||
self._show_experiments_by_type(
|
||||
available_experiments["advanced"], "advanced"
|
||||
)
|
||||
|
||||
with exp_tabs[2]:
|
||||
self._show_experiments_by_type(
|
||||
@@ -60,7 +66,9 @@ class Experiments:
|
||||
)
|
||||
|
||||
with exp_tabs[3]:
|
||||
self._show_experiments_by_type(available_experiments["tuning"], "tuning")
|
||||
self._show_experiments_by_type(
|
||||
available_experiments["tuning"], "tuning"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"Error loading experiment templates: {e}")
|
||||
@@ -79,7 +87,9 @@ class Experiments:
|
||||
# Show available experiments
|
||||
for i, exp_template in enumerate(experiments):
|
||||
exp_name = exp_template.get("name", f"Experiment {i + 1}")
|
||||
exp_description = exp_template.get("description", "No description available")
|
||||
exp_description = exp_template.get(
|
||||
"description", "No description available"
|
||||
)
|
||||
|
||||
with st.expander(f"📊 {exp_name} - {exp_description}"):
|
||||
col1, col2 = st.columns([2, 1])
|
||||
@@ -88,7 +98,7 @@ class Experiments:
|
||||
st.json(exp_template)
|
||||
|
||||
with col2:
|
||||
if st.button(f"🚀 Run Experiment", key=f"run_{experiment_type}_{i}"):
|
||||
if st.button("🚀 Run Experiment", key=f"run_{experiment_type}_{i}"):
|
||||
self._run_template_experiment(exp_template)
|
||||
|
||||
def _run_template_experiment(self, exp_template: Dict):
|
||||
@@ -100,7 +110,9 @@ class Experiments:
|
||||
|
||||
# Run the experiment
|
||||
experiment_id = self.experiment_runner.run_experiment(experiment_config)
|
||||
st.success(f"Experiment '{experiment_config.name}' completed successfully!")
|
||||
st.success(
|
||||
f"Experiment '{experiment_config.name}' completed successfully!"
|
||||
)
|
||||
st.info(f"Experiment ID: `{experiment_id}`")
|
||||
|
||||
# Show results
|
||||
@@ -130,13 +142,17 @@ class Experiments:
|
||||
)
|
||||
|
||||
with col2:
|
||||
model_filter = st.selectbox("Filter by Model", ["All"] + list_available_models())
|
||||
model_filter = st.selectbox(
|
||||
"Filter by Model", ["All"] + list_available_models()
|
||||
)
|
||||
|
||||
with col3:
|
||||
tag_filter = st.text_input("Filter by Tags (comma-separated)")
|
||||
|
||||
# Get and filter experiments
|
||||
experiments = self._get_filtered_experiments(status_filter, model_filter, tag_filter)
|
||||
experiments = self._get_filtered_experiments(
|
||||
status_filter, model_filter, tag_filter
|
||||
)
|
||||
|
||||
if not experiments:
|
||||
st.info("No experiments found matching the filters.")
|
||||
@@ -149,20 +165,28 @@ class Experiments:
|
||||
):
|
||||
self._display_experiment_details(exp, i)
|
||||
|
||||
def _get_filtered_experiments(self, status_filter: str, model_filter: str, tag_filter: str):
|
||||
def _get_filtered_experiments(
|
||||
self, status_filter: str, model_filter: str, tag_filter: str
|
||||
):
|
||||
"""Get experiments with applied filters"""
|
||||
experiments = self.experiment_tracker.list_experiments()
|
||||
|
||||
# Apply filters
|
||||
if status_filter != "All":
|
||||
experiments = [e for e in experiments if e.status == ExperimentStatus(status_filter)]
|
||||
experiments = [
|
||||
e for e in experiments if e.status == ExperimentStatus(status_filter)
|
||||
]
|
||||
|
||||
if model_filter != "All":
|
||||
experiments = [e for e in experiments if e.config.model_type == model_filter]
|
||||
experiments = [
|
||||
e for e in experiments if e.config.model_type == model_filter
|
||||
]
|
||||
|
||||
if tag_filter:
|
||||
tags = [tag.strip() for tag in tag_filter.split(",")]
|
||||
experiments = [e for e in experiments if any(tag in e.config.tags for tag in tags)]
|
||||
experiments = [
|
||||
e for e in experiments if any(tag in e.config.tags for tag in tags)
|
||||
]
|
||||
|
||||
return experiments
|
||||
|
||||
@@ -173,7 +197,9 @@ class Experiments:
|
||||
|
||||
with col1:
|
||||
st.write(f"**Model:** {exp.config.model_type}")
|
||||
st.write(f"**Features:** {', '.join([f.value for f in exp.config.features])}")
|
||||
st.write(
|
||||
f"**Features:** {', '.join([f.value for f in exp.config.features])}"
|
||||
)
|
||||
st.write(f"**Tags:** {', '.join(exp.config.tags)}")
|
||||
|
||||
with col2:
|
||||
@@ -185,7 +211,7 @@ class Experiments:
|
||||
st.write(f"**Train Size:** {exp.train_size:,}")
|
||||
st.write(f"**Test Size:** {exp.test_size:,}")
|
||||
|
||||
if st.button(f"View Details", key=f"details_{index}"):
|
||||
if st.button("View Details", key=f"details_{index}"):
|
||||
st.session_state.selected_experiment = exp.experiment_id
|
||||
st.rerun()
|
||||
|
||||
@@ -198,7 +224,9 @@ class Experiments:
|
||||
st.write("Run multiple experiments with different parameter combinations.")
|
||||
|
||||
# Add option to run template batch experiments
|
||||
batch_type = st.radio("Batch Type", ["Template Batch", "Custom Parameter Sweep"])
|
||||
batch_type = st.radio(
|
||||
"Batch Type", ["Template Batch", "Custom Parameter Sweep"]
|
||||
)
|
||||
|
||||
if batch_type == "Template Batch":
|
||||
self._show_template_batch_experiments()
|
||||
@@ -227,10 +255,13 @@ class Experiments:
|
||||
if experiments:
|
||||
st.write(f"**{exp_type.title()} Experiments:**")
|
||||
exp_names = [
|
||||
exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)
|
||||
exp.get("name", f"Exp {i}")
|
||||
for i, exp in enumerate(experiments)
|
||||
]
|
||||
selected_names = st.multiselect(
|
||||
f"Select {exp_type} experiments", exp_names, key=f"select_{exp_type}"
|
||||
f"Select {exp_type} experiments",
|
||||
exp_names,
|
||||
key=f"select_{exp_type}",
|
||||
)
|
||||
|
||||
for name in selected_names:
|
||||
@@ -258,13 +289,17 @@ class Experiments:
|
||||
experiment_configs.append(config)
|
||||
|
||||
# Run batch experiments
|
||||
experiment_ids = self.experiment_runner.run_experiment_batch(experiment_configs)
|
||||
experiment_ids = self.experiment_runner.run_experiment_batch(
|
||||
experiment_configs
|
||||
)
|
||||
|
||||
st.success(f"Completed {len(experiment_ids)} template experiments!")
|
||||
|
||||
# Show summary
|
||||
if experiment_ids:
|
||||
comparison = self.experiment_runner.compare_experiments(experiment_ids)
|
||||
comparison = self.experiment_runner.compare_experiments(
|
||||
experiment_ids
|
||||
)
|
||||
st.write("**Template Batch Results:**")
|
||||
st.dataframe(
|
||||
comparison[["name", "model_type", "test_accuracy"]],
|
||||
@@ -285,7 +320,9 @@ class Experiments:
|
||||
with col1:
|
||||
base_name = st.text_input("Base Experiment Name", "parameter_sweep")
|
||||
model_types = st.multiselect(
|
||||
"Model Types", list_available_models(), default=["logistic_regression"]
|
||||
"Model Types",
|
||||
list_available_models(),
|
||||
default=["logistic_regression"],
|
||||
)
|
||||
|
||||
# N-gram ranges for logistic regression
|
||||
@@ -301,13 +338,20 @@ class Experiments:
|
||||
default=["full_name", "native_name", "surname"],
|
||||
)
|
||||
|
||||
test_sizes = st.text_input("Test Sizes (comma-separated)", "0.15,0.2,0.25")
|
||||
test_sizes = st.text_input(
|
||||
"Test Sizes (comma-separated)", "0.15,0.2,0.25"
|
||||
)
|
||||
|
||||
tags = st.text_input("Common Tags", "parameter_sweep,batch")
|
||||
|
||||
if st.form_submit_button("🚀 Run Parameter Sweep"):
|
||||
self.run_batch_experiments(
|
||||
base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags
|
||||
base_name,
|
||||
model_types,
|
||||
ngram_ranges,
|
||||
feature_combinations,
|
||||
test_sizes,
|
||||
tags,
|
||||
)
|
||||
|
||||
def run_batch_experiments(
|
||||
@@ -369,13 +413,17 @@ class Experiments:
|
||||
exp_count += 1
|
||||
|
||||
# Run experiments
|
||||
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
|
||||
experiment_ids = self.experiment_runner.run_experiment_batch(
|
||||
experiments
|
||||
)
|
||||
|
||||
st.success(f"Completed {len(experiment_ids)} batch experiments")
|
||||
|
||||
# Show summary
|
||||
if experiment_ids:
|
||||
comparison = self.experiment_runner.compare_experiments(experiment_ids)
|
||||
comparison = self.experiment_runner.compare_experiments(
|
||||
experiment_ids
|
||||
)
|
||||
st.write("**Batch Results Summary:**")
|
||||
st.dataframe(
|
||||
comparison[["name", "model_type", "test_accuracy"]],
|
||||
@@ -0,0 +1,80 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
|
||||
@dataclass
class LogEntry:
    """One parsed line from the pipeline log."""

    # Emission time, parsed from the leading "[<ISO-8601>]" field.
    timestamp: datetime
    # Severity name as written in the log, e.g. "INFO" or "ERROR".
    level: str
    # Message text; may itself contain " - " separators.
    message: str


class LogReader:
    """Read and summarise entries from a pipeline log file.

    Expected line format (see `_parse_log_line`):
        [ISO-8601 timestamp] - LEVEL - message

    Lines that do not match are skipped by the entry readers but still
    counted in `get_log_stats()["total_lines"]`.
    """

    def __init__(self, log_file_path: Path):
        # Accept str or Path; normalise so .exists()/.stat() work either way.
        self.log_file_path = Path(log_file_path)

    def read_last_entries(self, num_entries: int = 20) -> List[LogEntry]:
        """Return up to the last `num_entries` parseable entries, oldest first.

        Returns an empty list when the log file does not exist.
        """
        entries: List[LogEntry] = []
        if not self.log_file_path.exists():
            return entries

        # Explicit UTF-8 with errors="replace": the platform default
        # encoding is unreliable, and one undecodable byte must not raise
        # UnicodeDecodeError and take down the whole page.
        with open(self.log_file_path, "r", encoding="utf-8", errors="replace") as f:
            lines = f.readlines()[-num_entries:]

        for line in lines:
            entry = self._parse_log_line(line)
            if entry:
                entries.append(entry)

        return entries

    def read_entries_by_level(
        self, level: str, num_entries: int = 20
    ) -> List[LogEntry]:
        """Return up to the last `num_entries` entries matching `level`, oldest first."""
        entries: List[LogEntry] = []
        if not self.log_file_path.exists():
            return entries

        with open(self.log_file_path, "r", encoding="utf-8", errors="replace") as f:
            # Scan newest-first so we can stop once we have enough matches.
            for line in reversed(f.readlines()):
                entry = self._parse_log_line(line)
                if entry and entry.level == level:
                    entries.append(entry)
                    if len(entries) >= num_entries:
                        break

        # Collected newest-first; flip back to chronological order.
        return list(reversed(entries))

    def get_log_stats(self) -> dict:
        """Return {"total_lines": n, <LEVEL>: count, ...}; {} if no log file."""
        if not self.log_file_path.exists():
            return {}

        stats = {"total_lines": 0}
        with open(self.log_file_path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                stats["total_lines"] += 1
                entry = self._parse_log_line(line)
                if entry:
                    stats[entry.level] = stats.get(entry.level, 0) + 1

        return stats

    @staticmethod
    def _parse_log_line(line: str) -> LogEntry | None:
        """Parse one log line; return None for malformed or unparseable lines."""
        try:
            # Expected format from logging config: [timestamp] - LEVEL - message
            parts = line.strip().split(" - ")
            if len(parts) >= 3:
                timestamp_str = parts[0].strip("[]")
                timestamp = datetime.fromisoformat(timestamp_str)
                level = parts[1].strip()
                # Re-join the remainder so messages containing " - " survive.
                message = " - ".join(parts[2:])
                return LogEntry(timestamp, level, message)
        except Exception:
            # Bad timestamp or otherwise malformed line.
            return None

        return None
|
||||
@@ -1,10 +1,8 @@
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
from spacy import displacy
|
||||
|
||||
from core.config import PipelineConfig
|
||||
from processing.ner.name_model import NameModel
|
||||
from ners.core.config import PipelineConfig
|
||||
from ners.processing.ner.name_model import NameModel
|
||||
|
||||
|
||||
class NERTesting:
|
||||
@@ -56,12 +54,15 @@ class NERTesting:
|
||||
|
||||
with col1:
|
||||
st.metric(
|
||||
"Training Examples", f"{self.training_stats.get('training_examples', 0):,}"
|
||||
"Training Examples",
|
||||
f"{self.training_stats.get('training_examples', 0):,}",
|
||||
)
|
||||
with col2:
|
||||
st.metric("Epochs", self.training_stats.get("epochs", 0))
|
||||
with col3:
|
||||
st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}")
|
||||
st.metric(
|
||||
"Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}"
|
||||
)
|
||||
with col4:
|
||||
st.metric("Batch Size", f"{self.training_stats.get('batch_size', 0):,}")
|
||||
|
||||
@@ -99,9 +100,11 @@ class NERTesting:
|
||||
|
||||
if names_input.strip():
|
||||
if st.button("Analyze All Names", type="primary"):
|
||||
names = [name.strip() for name in names_input.split("\n") if name.strip()]
|
||||
names = [
|
||||
name.strip() for name in names_input.split("\n") if name.strip()
|
||||
]
|
||||
for i, name in enumerate(names):
|
||||
st.markdown(f"**Name {i+1}: {name}**")
|
||||
st.markdown(f"**Name {i + 1}: {name}**")
|
||||
self.analyze_and_display(name)
|
||||
if i < len(names) - 1:
|
||||
st.markdown("---")
|
||||
@@ -127,7 +130,9 @@ class NERTesting:
|
||||
|
||||
else:
|
||||
st.warning("No entities detected in the input text.")
|
||||
st.info("Try using traditional Congolese names or ensure the spelling is correct.")
|
||||
st.info(
|
||||
"Try using traditional Congolese names or ensure the spelling is correct."
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"Error analyzing text: {e}")
|
||||
@@ -139,14 +144,21 @@ class NERTesting:
|
||||
ents = []
|
||||
for entity in entities:
|
||||
ents.append(
|
||||
{"start": entity["start"], "end": entity["end"], "label": entity["label"]}
|
||||
{
|
||||
"start": entity["start"],
|
||||
"end": entity["end"],
|
||||
"label": entity["label"],
|
||||
}
|
||||
)
|
||||
|
||||
# Create doc-like structure for displacy
|
||||
doc_data = {"text": text, "ents": ents, "title": None}
|
||||
|
||||
# Custom colors for our labels
|
||||
colors = {"NATIVE": "#74C0FC", "SURNAME": "#69DB7C"} # Light blue # Light green
|
||||
colors = {
|
||||
"NATIVE": "#74C0FC",
|
||||
"SURNAME": "#69DB7C",
|
||||
} # Light blue # Light green
|
||||
|
||||
options = {"colors": colors, "distance": 90}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user