refactoring: uv
This commit is contained in:
@@ -0,0 +1,16 @@
|
|||||||
|
.git
|
||||||
|
.gitignore
|
||||||
|
.idea
|
||||||
|
.vscode
|
||||||
|
__pycache__
|
||||||
|
.ruff_cache
|
||||||
|
.venv
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
*.pyd
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*.DS_Store
|
||||||
|
dist
|
||||||
|
build
|
||||||
|
*.egg-info
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
3.11
|
||||||
+49
@@ -0,0 +1,49 @@
|
|||||||
|
# syntax=docker/dockerfile:1

# Minimal Linux base (glibc) – Python will be installed by uv
FROM debian:bookworm-slim

# UV_PROJECT_ENVIRONMENT points uv at the venv created below, and the venv's
# bin directory is first on PATH so the `ners` entry point resolves directly.
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    UV_INSTALL_DIR=/usr/local/bin \
    UV_LINK_MODE=copy \
    UV_PYTHON_DOWNLOADS=1 \
    UV_PROJECT_ENVIRONMENT=/app/.venv \
    PATH=/app/.venv/bin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

WORKDIR /app

# System deps for building/using common scientific stack
# Keep minimal; rely on wheels where possible
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates curl git \
        build-essential pkg-config \
        libssl-dev libffi-dev \
        libopenblas0 libstdc++6 \
        libfreetype6 libpng16-16 libjpeg62-turbo \
    && rm -rf /var/lib/apt/lists/*

# Install uv (static binary)
RUN curl -LsSf https://astral.sh/uv/install.sh | sh

# Copy project metadata first for layer caching.
# `uv.lock*` is a glob so the build still succeeds when no lockfile exists;
# without copying it the `[ -f uv.lock ]` checks below could never be true.
COPY pyproject.toml README.md uv.lock* ./

# Install a managed Python via uv and create the project venv
RUN uv python install 3.11 \
    && uv venv /app/.venv --python 3.11

# Resolve and install runtime deps into project venv
# Use lockfile if present for reproducibility
RUN if [ -f uv.lock ]; then uv sync --no-dev --no-install-project --frozen; else uv sync --no-dev --no-install-project; fi

# Copy source code and optional templates
COPY src ./src

# Re-sync to ensure the local package is installed (same lockfile policy as
# the dependency layer above).
# NOTE: removing /root/.cache here only cleans this layer; cache data written
# by earlier RUN layers still counts towards the image size.
RUN if [ -f uv.lock ]; then uv sync --no-dev --frozen; else uv sync --no-dev; fi \
    && rm -rf /root/.cache

# Default command shows help; override in compose or docker run
CMD ["ners", "--help"]
|
||||||
@@ -10,37 +10,23 @@ million names from the Democratic Republic of Congo (DRC) annotated with gender
|
|||||||
|
|
||||||
### Installation & Setup
|
### Installation & Setup
|
||||||
|
|
||||||
Instructions and command line snippets below are provided to help you set up the project environment quickly and
|
**Unix based**
|
||||||
efficiently.
|
|
||||||
assuming you have Python 3.11 and Git installed and working on a Unix-like system (Linux, macOS, etc.).
|
|
||||||
|
|
||||||
**Using Makefile (Recommended)**
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
|
|
||||||
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
||||||
cd drc-ners-nlp
|
cd drc-ners-nlp
|
||||||
|
|
||||||
# Setup environment
|
uv sync
|
||||||
make setup
|
|
||||||
make activate
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Manual Setup**
|
**macOS & Windows**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
docker compose build
|
||||||
cd drc-ners-nlp
|
docker compose run --rm app
|
||||||
|
docker compose run --rm app ners pipeline run --env=production
|
||||||
# Setup environment
|
docker compose run --rm app ners research train --name=lightgbm --type=baseline --env=production
|
||||||
python -m venv .venv
|
docker compose run --rm --service-ports app ners web run --env=production
|
||||||
.venv/bin/pip install --upgrade pip
|
|
||||||
.venv/bin/pip install -r requirements.txt
|
|
||||||
|
|
||||||
pip install --upgrade pip
|
|
||||||
pip install -r requirements.txt
|
|
||||||
pip install jupyter notebook ipykernel pytest black flake8 mypy
|
|
||||||
|
|
||||||
source .venv/bin/activate
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Data Processing
|
## Data Processing
|
||||||
@@ -55,6 +41,7 @@ the `drc-ners-nlp/config/pipeline.yaml` file.
|
|||||||
```yaml
|
```yaml
|
||||||
stages:
|
stages:
|
||||||
- "data_cleaning"
|
- "data_cleaning"
|
||||||
|
- "data_selection"
|
||||||
- "feature_extraction"
|
- "feature_extraction"
|
||||||
- "data_splitting"
|
- "data_splitting"
|
||||||
```
|
```
|
||||||
@@ -62,37 +49,7 @@ stages:
|
|||||||
**Running the Pipeline**
|
**Running the Pipeline**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python main.py --env production
|
uv run ners pipeline run --env="production"
|
||||||
```
|
|
||||||
|
|
||||||
## NER Processing (Optional)
|
|
||||||
|
|
||||||
This project implements a custom named entity recognition (NER) pipeline tailored for Congolese names.
|
|
||||||
Its main objective is to accurately identify and tag the different components of a Congolese name,
|
|
||||||
specifically distinguishing between the native part and the surname.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python ner.py --env production
|
|
||||||
```
|
|
||||||
|
|
||||||
Once you've built and trained the NER model, you can use it to annotate **COMPOSE** names in the original dataset
|
|
||||||
|
|
||||||
**Running the Pipeline with NER Annotation**
|
|
||||||
```yaml
|
|
||||||
stages:
|
|
||||||
- "data_cleaning"
|
|
||||||
- "feature_extraction"
|
|
||||||
- "ner_annotation"
|
|
||||||
- "data_splitting"
|
|
||||||
```
|
|
||||||
|
|
||||||
**Running the Pipeline with LLM Annotation**
|
|
||||||
```yaml
|
|
||||||
stages:
|
|
||||||
- "data_cleaning"
|
|
||||||
- "feature_extraction"
|
|
||||||
- "llm_annotation"
|
|
||||||
- "data_splitting"
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Experiments
|
## Experiments
|
||||||
@@ -105,54 +62,94 @@ you can define model features, training parameters, and evaluation metrics in th
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# bigru
|
# bigru
|
||||||
python train.py --name="bigru" --type="baseline" --env="production"
|
uv run ners research train --name="bigru" --type="baseline" --env="production"
|
||||||
python train.py --name="bigru_native" --type="baseline" --env="production"
|
uv run ners research train --name="bigru_native" --type="baseline" --env="production"
|
||||||
python train.py --name="bigru_surname" --type="baseline" --env="production"
|
uv run ners research train --name="bigru_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# cnn
|
# cnn
|
||||||
python train.py --name="cnn" --type="baseline" --env="production"
|
uv run ners research train --name="cnn" --type="baseline" --env="production"
|
||||||
python train.py --name="cnn_native" --type="baseline" --env="production"
|
uv run ners research train --name="cnn_native" --type="baseline" --env="production"
|
||||||
python train.py --name="cnn_surname" --type="baseline" --env="production"
|
uv run ners research train --name="cnn_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# lightgbm
|
# lightgbm
|
||||||
python train.py --name="lightgbm" --type="baseline" --env="production"
|
uv run ners research train --name="lightgbm" --type="baseline" --env="production"
|
||||||
python train.py --name="lightgbm_native" --type="baseline" --env="production"
|
uv run ners research train --name="lightgbm_native" --type="baseline" --env="production"
|
||||||
python train.py --name="lightgbm_surname" --type="baseline" --env="production"
|
uv run ners research train --name="lightgbm_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# logistic regression
|
# logistic regression
|
||||||
python train.py --name="logistic_regression" --type="baseline" --env="production"
|
uv run ners research train --name="logistic_regression" --type="baseline" --env="production"
|
||||||
python train.py --name="logistic_regression_native" --type="baseline" --env="production"
|
uv run ners research train --name="logistic_regression_native" --type="baseline" --env="production"
|
||||||
python train.py --name="logistic_regression_surname" --type="baseline" --env="production"
|
uv run ners research train --name="logistic_regression_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# lstm
|
# lstm
|
||||||
python train.py --name="lstm" --type="baseline" --env="production"
|
uv run ners research train --name="lstm" --type="baseline" --env="production"
|
||||||
python train.py --name="lstm_native" --type="baseline" --env="production"
|
uv run ners research train --name="lstm_native" --type="baseline" --env="production"
|
||||||
python train.py --name="lstm_surname" --type="baseline" --env="production"
|
uv run ners research train --name="lstm_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# random forest
|
# random forest
|
||||||
python train.py --name="random_forest" --type="baseline" --env="production"
|
uv run ners research train --name="random_forest" --type="baseline" --env="production"
|
||||||
python train.py --name="random_forest_native" --type="baseline" --env="production"
|
uv run ners research train --name="random_forest_native" --type="baseline" --env="production"
|
||||||
python train.py --name="random_forest_surname" --type="baseline" --env="production"
|
uv run ners research train --name="random_forest_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# svm
|
# svm
|
||||||
python train.py --name="svm" --type="baseline" --env="production"
|
uv run ners research train --name="svm" --type="baseline" --env="production"
|
||||||
python train.py --name="svm_native" --type="baseline" --env="production"
|
uv run ners research train --name="svm_native" --type="baseline" --env="production"
|
||||||
python train.py --name="svm_surname" --type="baseline" --env="production"
|
uv run ners research train --name="svm_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# naive bayes
|
# naive bayes
|
||||||
python train.py --name="naive_bayes" --type="baseline" --env="production"
|
uv run ners research train --name="naive_bayes" --type="baseline" --env="production"
|
||||||
python train.py --name="naive_bayes_native" --type="baseline" --env="production"
|
uv run ners research train --name="naive_bayes_native" --type="baseline" --env="production"
|
||||||
python train.py --name="naive_bayes_surname" --type="baseline" --env="production"
|
uv run ners research train --name="naive_bayes_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# transformer
|
# transformer
|
||||||
python train.py --name="transformer" --type="baseline" --env="production"
|
uv run ners research train --name="transformer" --type="baseline" --env="production"
|
||||||
python train.py --name="transformer_native" --type="baseline" --env="production"
|
uv run ners research train --name="transformer_native" --type="baseline" --env="production"
|
||||||
python train.py --name="transformer_surname" --type="baseline" --env="production"
|
uv run ners research train --name="transformer_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# xgboost
|
# xgboost
|
||||||
python train.py --name="xgboost" --type="baseline" --env="production"
|
uv run ners research train --name="xgboost" --type="baseline" --env="production"
|
||||||
python train.py --name="xgboost_native" --type="baseline" --env="production"
|
uv run ners research train --name="xgboost_native" --type="baseline" --env="production"
|
||||||
python train.py --name="xgboost_surname" --type="baseline" --env="production"
|
uv run ners research train --name="xgboost_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
## TensorFlow on macOS (Intel) with uv
|
||||||
|
|
||||||
|
TensorFlow no longer publishes wheels for macOS Intel. To keep using uv and run TF reliably, use a Linux container with TF preinstalled and install project code with minimal extras inside the container.
|
||||||
|
|
||||||
|
### One-time build
|
||||||
|
|
||||||
|
```bash
docker compose -f docker/compose.tf.yml build
```

If you see a message like `tensorflow/tensorflow:<tag>: not found`, update `docker/Dockerfile.tf-cpu` to a tag that exists (e.g., `2.17.0`) and rebuild:

```bash
sed -n '1,20p' docker/Dockerfile.tf-cpu # verify the FROM line
docker pull tensorflow/tensorflow:2.17.0 # quick availability check
docker compose -f docker/compose.tf.yml build
```
|
||||||
|
|
||||||
|
### Start a shell with uv and TF available
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose -f docker/compose.tf.yml run --rm tf bash
|
||||||
|
```
|
||||||
|
|
||||||
|
Inside the container:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install project in editable mode without pulling full deps
|
||||||
|
uv pip install -e . --no-deps
|
||||||
|
|
||||||
|
# Install only what research needs alongside TensorFlow
|
||||||
|
uv pip install typer pandas scikit-learn seaborn plotly
|
||||||
|
|
||||||
|
# Sanity check
|
||||||
|
uv run python -c "import tensorflow as tf; print(tf.__version__)"
|
||||||
|
|
||||||
|
# Run an experiment
|
||||||
|
uv run ners research train --name="lstm" --type="baseline" --env="production"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Web Interface
|
## Web Interface
|
||||||
@@ -163,60 +160,9 @@ experiments and make predictions without needing to understand the underlying co
|
|||||||
### Running the Web Interface
|
### Running the Web Interface
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
streamlit run web/app.py
|
uv run ners web run --env="production"
|
||||||
```
|
```
|
||||||
|
|
||||||
## GPU Acceleration
|
|
||||||
|
|
||||||
This project can leverage GPUs for faster training when supported libraries and hardware are available.
|
|
||||||
|
|
||||||
- TensorFlow/Keras models (BiGRU, LSTM, CNN, Transformer)
|
|
||||||
- Uses GPU automatically if a TensorFlow GPU build is installed.
|
|
||||||
- The code enables safe GPU memory growth by default; optionally enable mixed precision for additional speed:
|
|
||||||
- Add `mixed_precision: true` in the experiment `model_params` (e.g., in `config/research_templates.yaml`).
|
|
||||||
- The final layer outputs are set to float32 for numerical stability under mixed precision.
|
|
||||||
|
|
||||||
- spaCy NER
|
|
||||||
- Automatically prefers GPU if available; otherwise falls back to CPU.
|
|
||||||
- Ensure a compatible CUDA-enabled spaCy/thinc stack is installed to use GPU.
|
|
||||||
|
|
||||||
- XGBoost
|
|
||||||
- Enable GPU by adding to the experiment `model_params`:
|
|
||||||
- `use_gpu: true` (sets `tree_method: gpu_hist` and `predictor: gpu_predictor`).
|
|
||||||
|
|
||||||
- LightGBM
|
|
||||||
- Enable GPU by adding to the experiment `model_params`:
|
|
||||||
- `use_gpu: true` (sets `device: gpu`). Optional: `gpu_platform_id`, `gpu_device_id`.
|
|
||||||
|
|
||||||
Example template snippet (GPU on):
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
- name: "lstm_gpu"
|
|
||||||
description: "LSTM with GPU + mixed precision"
|
|
||||||
model_type: "lstm"
|
|
||||||
features: ["full_name"]
|
|
||||||
model_params:
|
|
||||||
embedding_dim: 128
|
|
||||||
lstm_units: 64
|
|
||||||
epochs: 5
|
|
||||||
batch_size: 128
|
|
||||||
use_gpu: true
|
|
||||||
mixed_precision: true
|
|
||||||
tags: ["gpu", "mixed_precision"]
|
|
||||||
|
|
||||||
- name: "xgboost_gpu"
|
|
||||||
description: "XGBoost with GPU"
|
|
||||||
model_type: "xgboost"
|
|
||||||
features: ["full_name"]
|
|
||||||
model_params:
|
|
||||||
n_estimators: 200
|
|
||||||
use_gpu: true
|
|
||||||
```
|
|
||||||
|
|
||||||
Notes:
|
|
||||||
- Install CUDA‑enabled binaries for TensorFlow/spaCy/LightGBM/XGBoost to actually use GPU.
|
|
||||||
- If GPU is requested but not available, training will proceed on CPU with a warning.
|
|
||||||
|
|
||||||
## Contributors
|
## Contributors
|
||||||
|
|
||||||
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
|
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
|
||||||
|
|||||||
+21
@@ -0,0 +1,21 @@
|
|||||||
|
services:
|
||||||
|
app:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
image: drc-ners:uv
|
||||||
|
working_dir: /app
|
||||||
|
tty: true
|
||||||
|
stdin_open: true
|
||||||
|
environment:
|
||||||
|
NERS_ENV: production
|
||||||
|
STREAMLIT_SERVER_ADDRESS: 0.0.0.0
|
||||||
|
# expose Streamlit for `ners web run`
|
||||||
|
ports:
|
||||||
|
- "8501:8501"
|
||||||
|
volumes:
|
||||||
|
- ./assets:/app/assets
|
||||||
|
- ./config:/app/config
|
||||||
|
- ./data:/app/data
|
||||||
|
# default command shows CLI help; override per run
|
||||||
|
command: ["ners", "--help"]
|
||||||
-90
@@ -1,90 +0,0 @@
|
|||||||
#!.venv/bin/python3
|
|
||||||
import argparse
|
|
||||||
import sys
|
|
||||||
import traceback
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from core.config import setup_config
|
|
||||||
from processing.monitoring.pipeline_monitor import PipelineMonitor
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Command-line entry point for pipeline checkpoint monitoring.

    Sub-commands:
        clean: remove checkpoint files for one step, or for every step in
            ``monitor.steps`` when ``--step`` is omitted.
        reset: reset one step, or every step when ``--step`` is omitted or
            ``--all`` is given.

    With no sub-command, the help text and a detailed status are printed.

    Returns:
        int: 0 on success or when the user cancels a confirmation prompt,
        1 when no sub-command is given or when any exception is raised.
    """
    choices = [
        "data_cleaning",
        "data_selection",
        "feature_extraction",
        "ner_annotation",
        "llm_annotation",
        "data_splitting",
    ]

    parser = argparse.ArgumentParser(description="DRC NERS Processing Monitoring")
    parser.add_argument("--config", type=Path, help="Path to configuration file")
    parser.add_argument("--env", type=str, default="development", help="Environment")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Clean command
    clean_parser = subparsers.add_parser("clean", help="Clean checkpoint files")
    clean_parser.add_argument("--step", type=str, choices=choices, help="default: all")
    clean_parser.add_argument("--keep-last", type=int, default=1, help="(default: 1)")
    clean_parser.add_argument("--force", action="store_true", help="Clean without confirmation")

    # Reset command
    reset_parser = subparsers.add_parser("reset", help="Reset pipeline step")
    reset_parser.add_argument("--step", type=str, choices=choices, help="(default: all)")
    reset_parser.add_argument("--all", action="store_true", help="Reset all steps")
    reset_parser.add_argument("--force", action="store_true", help="Reset without confirmation")
    args = parser.parse_args()

    try:
        # Configuration is set up before the monitor is constructed, as in the
        # original ordering.
        setup_config(config_path=args.config, env=args.env)
        monitor = PipelineMonitor()

        if not args.command:
            parser.print_help()
            monitor.print_status(detailed=True)
            return 1

        if args.command == "clean":
            checkpoint_info = monitor.count_checkpoint_files()
            print(f"Current checkpoint storage: {checkpoint_info['total_size_mb']:.1f} MB")

            if not args.force:
                response = input("Are you sure you want to clean checkpoints? (y/N): ")
                if response.strip().lower() != "y":
                    print("Cancelled")
                    return 0

            if args.step:
                monitor.clean_step_checkpoints(args.step, args.keep_last)
            else:
                for step in monitor.steps:
                    monitor.clean_step_checkpoints(step, args.keep_last)

            print("Checkpoint cleaning completed")

        elif args.command == "reset":
            # `--all` used to be parsed but ignored; it now selects every step,
            # which is also what happens when `--step` is omitted.
            reset_all = args.all or not args.step
            # Name the real target in the prompt instead of printing "None".
            target = "all steps" if reset_all else args.step

            if not args.force:
                response = input(
                    f"Are you sure you want to reset {target}? This will delete all checkpoints. (y/N): "
                )
                if response.strip().lower() != "y":
                    print("Cancelled")
                    return 0

            if reset_all:
                for step in monitor.steps:
                    monitor.reset_step(step)
            else:
                monitor.reset_step(args.step)

            print("Reset completed")

        return 0

    except Exception as e:
        # Top-level boundary: report the failure with its traceback and turn
        # it into a non-zero exit status.
        print(f"Monitoring failed: {e}")
        traceback.print_exc()
        return 1
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
sys.exit(main())
|
|
||||||
Vendored
-499
File diff suppressed because one or more lines are too long
@@ -0,0 +1,41 @@
|
|||||||
|
[project]
|
||||||
|
name = "ners"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Add your description here"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = [
|
||||||
|
"geopandas>=1.1.1",
|
||||||
|
"joblib>=1.5.2",
|
||||||
|
"lightgbm>=4.6.0",
|
||||||
|
"matplotlib>=3.10.6",
|
||||||
|
"numpy>=2.3.3",
|
||||||
|
"ollama>=0.6.0",
|
||||||
|
"pandas>=2.3.3",
|
||||||
|
"plotly>=6.3.1",
|
||||||
|
"psutil>=7.1.0",
|
||||||
|
"pydantic>=2.11.10",
|
||||||
|
"pyyaml>=6.0.3",
|
||||||
|
"scikit-learn>=1.7.2",
|
||||||
|
"seaborn>=0.13.2",
|
||||||
|
"spacy>=3.8.7",
|
||||||
|
"streamlit>=1.50.0",
|
||||||
|
"tqdm>=4.67.1",
|
||||||
|
"typer>=0.19.2",
|
||||||
|
"xgboost>=3.0.5",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
ners = "ners.cli:app"
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["uv_build>=0.8.12,<0.9.0"]
|
||||||
|
build-backend = "uv_build"
|
||||||
|
|
||||||
|
[dependency-groups]
|
||||||
|
dev = [
|
||||||
|
"ruff>=0.13.3",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.uv]
|
||||||
|
required-environments = ["sys_platform == 'linux' and platform_machine == 'x86_64'"]
|
||||||
@@ -1,170 +0,0 @@
|
|||||||
absl-py==2.3.0
|
|
||||||
altair==5.1.2
|
|
||||||
annotated-types==0.7.0
|
|
||||||
anyio==4.9.0
|
|
||||||
appnope==0.1.4
|
|
||||||
argon2-cffi==25.1.0
|
|
||||||
argon2-cffi-bindings==21.2.0
|
|
||||||
arrow==1.3.0
|
|
||||||
asttokens==3.0.0
|
|
||||||
astunparse==1.6.3
|
|
||||||
async-lru==2.0.5
|
|
||||||
attrs==25.3.0
|
|
||||||
babel==2.17.0
|
|
||||||
beautifulsoup4==4.13.4
|
|
||||||
black==25.1.0
|
|
||||||
bleach==6.2.0
|
|
||||||
blinker==1.9.0
|
|
||||||
cachetools==6.1.0
|
|
||||||
certifi==2025.6.15
|
|
||||||
cffi==1.17.1
|
|
||||||
charset-normalizer==3.4.2
|
|
||||||
click==8.2.1
|
|
||||||
comm==0.2.2
|
|
||||||
contourpy==1.3.2
|
|
||||||
cycler==0.12.1
|
|
||||||
debugpy==1.8.14
|
|
||||||
decorator==5.2.1
|
|
||||||
defusedxml==0.7.1
|
|
||||||
executing==2.2.0
|
|
||||||
fastjsonschema==2.21.1
|
|
||||||
flake8==7.3.0
|
|
||||||
flatbuffers==25.2.10
|
|
||||||
fonttools==4.58.4
|
|
||||||
fqdn==1.5.1
|
|
||||||
gast==0.6.0
|
|
||||||
gitdb==4.0.12
|
|
||||||
GitPython==3.1.45
|
|
||||||
google-pasta==0.2.0
|
|
||||||
grpcio==1.73.0
|
|
||||||
h11==0.16.0
|
|
||||||
h5py==3.14.0
|
|
||||||
httpcore==1.0.9
|
|
||||||
httpx==0.28.1
|
|
||||||
idna==3.10
|
|
||||||
imbalanced-learn==0.13.0
|
|
||||||
ipykernel==6.29.5
|
|
||||||
ipython>=8.0,<9.0
|
|
||||||
ipython_pygments_lexers==1.1.1
|
|
||||||
isoduration==20.11.0
|
|
||||||
jedi==0.19.2
|
|
||||||
Jinja2==3.1.6
|
|
||||||
joblib==1.5.1
|
|
||||||
json5==0.12.0
|
|
||||||
jsonpointer==3.0.0
|
|
||||||
jsonschema==4.24.0
|
|
||||||
jsonschema-specifications==2025.4.1
|
|
||||||
jupyter-events==0.12.0
|
|
||||||
jupyter-lsp==2.2.5
|
|
||||||
jupyter_client==8.6.3
|
|
||||||
jupyter_core==5.8.1
|
|
||||||
jupyter_server==2.16.0
|
|
||||||
jupyter_server_terminals==0.5.3
|
|
||||||
jupyterlab==4.4.4
|
|
||||||
jupyterlab_pygments==0.3.0
|
|
||||||
jupyterlab_server==2.27.3
|
|
||||||
keras==3.10.0
|
|
||||||
kiwisolver==1.4.8
|
|
||||||
libclang==18.1.1
|
|
||||||
lightgbm~=4.6.0
|
|
||||||
Markdown==3.8.2
|
|
||||||
markdown-it-py==3.0.0
|
|
||||||
MarkupSafe==3.0.2
|
|
||||||
matplotlib==3.10.3
|
|
||||||
matplotlib-inline==0.1.7
|
|
||||||
mccabe==0.7.0
|
|
||||||
mdurl==0.1.2
|
|
||||||
mistune==3.1.3
|
|
||||||
ml-dtypes==0.3.2
|
|
||||||
mypy==1.17.0
|
|
||||||
mypy_extensions==1.1.0
|
|
||||||
namex==0.1.0
|
|
||||||
narwhals==2.0.1
|
|
||||||
nbclient==0.10.2
|
|
||||||
nbconvert==7.16.6
|
|
||||||
nbformat==5.10.4
|
|
||||||
nest-asyncio==1.6.0
|
|
||||||
nltk==3.9.1
|
|
||||||
notebook==7.4.4
|
|
||||||
notebook_shim==0.2.4
|
|
||||||
numpy==1.26.4
|
|
||||||
ollama~=0.5.1
|
|
||||||
opt_einsum==3.4.0
|
|
||||||
optree==0.16.0
|
|
||||||
overrides==7.7.0
|
|
||||||
packaging==25.0
|
|
||||||
pandas==2.3.0
|
|
||||||
pandocfilters==1.5.1
|
|
||||||
parso==0.8.4
|
|
||||||
pathspec==0.12.1
|
|
||||||
pexpect==4.9.0
|
|
||||||
pillow==11.2.1
|
|
||||||
platformdirs==4.3.8
|
|
||||||
plotly~=6.2.0
|
|
||||||
prometheus_client==0.22.1
|
|
||||||
prompt_toolkit==3.0.51
|
|
||||||
protobuf==4.25.8
|
|
||||||
psutil==7.0.0
|
|
||||||
ptyprocess==0.7.0
|
|
||||||
pure_eval==0.2.3
|
|
||||||
pyarrow==21.0.0
|
|
||||||
pycodestyle==2.14.0
|
|
||||||
pycparser==2.22
|
|
||||||
pydantic~=2.11.7
|
|
||||||
pydantic_core==2.33.2
|
|
||||||
pydeck==0.9.1
|
|
||||||
pyflakes==3.4.0
|
|
||||||
Pygments==2.19.1
|
|
||||||
pyparsing==3.2.3
|
|
||||||
python-dateutil==2.9.0.post0
|
|
||||||
python-json-logger==3.3.0
|
|
||||||
pytz==2025.2
|
|
||||||
PyYAML~=6.0.2
|
|
||||||
pyzmq==27.0.0
|
|
||||||
referencing==0.36.2
|
|
||||||
regex==2024.11.6
|
|
||||||
requests==2.32.4
|
|
||||||
rfc3339-validator==0.1.4
|
|
||||||
rfc3986-validator==0.1.1
|
|
||||||
rich==14.0.0
|
|
||||||
rpds-py==0.26.0
|
|
||||||
scikit-learn~=1.6.1
|
|
||||||
scipy==1.15.3
|
|
||||||
seaborn==0.13.2
|
|
||||||
Send2Trash==1.8.3
|
|
||||||
six==1.17.0
|
|
||||||
sklearn-compat==0.1.3
|
|
||||||
smmap==5.0.2
|
|
||||||
sniffio==1.3.1
|
|
||||||
soupsieve==2.7
|
|
||||||
spacy~=3.8.7
|
|
||||||
stack-data==0.6.3
|
|
||||||
streamlit~=1.47.1
|
|
||||||
tenacity==9.1.2
|
|
||||||
tensorboard==2.16.2
|
|
||||||
tensorboard-data-server==0.7.2
|
|
||||||
tensorflow==2.16.2
|
|
||||||
tensorflow-io-gcs-filesystem==0.37.1
|
|
||||||
termcolor==3.1.0
|
|
||||||
terminado==0.18.1
|
|
||||||
threadpoolctl==3.6.0
|
|
||||||
tinycss2==1.4.0
|
|
||||||
toml==0.10.2
|
|
||||||
toolz==1.0.0
|
|
||||||
tornado==6.5.1
|
|
||||||
tqdm==4.67.1
|
|
||||||
traitlets==5.14.3
|
|
||||||
types-python-dateutil==2.9.0.20250516
|
|
||||||
types-PyYAML==6.0.12.20250516
|
|
||||||
typing-inspection==0.4.1
|
|
||||||
typing_extensions==4.14.0
|
|
||||||
tzdata==2025.2
|
|
||||||
uri-template==1.3.0
|
|
||||||
urllib3==2.5.0
|
|
||||||
wcwidth==0.2.13
|
|
||||||
webcolors==24.11.1
|
|
||||||
webencodings==0.5.1
|
|
||||||
websocket-client==1.8.0
|
|
||||||
Werkzeug==3.1.3
|
|
||||||
wrapt==1.17.2
|
|
||||||
xgboost~=3.0.3
|
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
"""DRC NERS NLP package."""
|
||||||
|
|
||||||
|
__all__: list[str] = []
|
||||||
+226
@@ -0,0 +1,226 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from ners.core.config import setup_config, PipelineConfig
|
||||||
|
|
||||||
|
app = typer.Typer(help="DRC NERS command-line interface", no_args_is_help=True)
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Pipeline commands
|
||||||
|
# -------------------------
|
||||||
|
pipeline_app = typer.Typer(help="Data processing pipeline")
|
||||||
|
app.add_typer(pipeline_app, name="pipeline")
|
||||||
|
|
||||||
|
|
||||||
|
@pipeline_app.command("run")
def pipeline_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Run the full processing pipeline."""
    # Imported inside the command rather than at module level (presumably to
    # keep CLI start-up light — confirm before hoisting).
    from ners.main import run_pipeline as _run_pipeline

    settings = setup_config(config_path=config, env=env)
    exit_code = _run_pipeline(settings)
    # The pipeline's return value becomes the process exit status.
    raise typer.Exit(exit_code)
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# NER commands
|
||||||
|
# -------------------------
|
||||||
|
ner_app = typer.Typer(help="NER dataset and model")
|
||||||
|
app.add_typer(ner_app, name="ner")
|
||||||
|
|
||||||
|
|
||||||
|
def _load_config(config: Optional[Path], env: str) -> PipelineConfig:
    """Return the configuration for a CLI command.

    Thin wrapper that forwards ``config`` (as ``config_path``) and ``env`` to
    ``setup_config`` and returns its result unchanged.
    """
    loaded = setup_config(config_path=config, env=env)
    return loaded
|
||||||
|
|
||||||
|
|
||||||
|
@ner_app.command("feature")
def ner_feature(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Run the NER feature step with the selected configuration."""
    # The docstring above is what Typer shows as this command's help text.
    # Imported inside the command rather than at module level (presumably to
    # keep CLI start-up light — confirm before hoisting).
    from ners.ner import feature as _feature

    cfg = _load_config(config, env)
    _feature(cfg)
|
||||||
|
|
||||||
|
|
||||||
|
@ner_app.command("build")
def ner_build(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Run the NER build step with the selected configuration."""
    # The docstring above is what Typer shows as this command's help text.
    # Imported inside the command rather than at module level (presumably to
    # keep CLI start-up light — confirm before hoisting).
    from ners.ner import build as _build

    cfg = _load_config(config, env)
    _build(cfg)
|
||||||
|
|
||||||
|
|
||||||
|
@ner_app.command("train")
def ner_train(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Run the NER training step with the selected configuration."""
    # The docstring above is what Typer shows as this command's help text.
    # Imported inside the command rather than at module level (presumably to
    # keep CLI start-up light — confirm before hoisting).
    from ners.ner import train as _train

    cfg = _load_config(config, env)
    _train(cfg)
|
||||||
|
|
||||||
|
|
||||||
|
@ner_app.command("run")
def ner_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
    reset: bool = typer.Option(
        False, help="Reset intermediate outputs and rerun all steps"
    ),
) -> None:
    """Run the NER pipeline and exit with its return code."""
    # The docstring above is what Typer shows as this command's help text.
    # Imported inside the command rather than at module level (presumably to
    # keep CLI start-up light — confirm before hoisting).
    from ners.ner import run_pipeline as _ner_pipeline

    cfg = _load_config(config, env)
    code = _ner_pipeline(cfg, reset)
    # The pipeline's return value becomes the process exit status.
    raise typer.Exit(code)
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Research commands
|
||||||
|
# -------------------------
|
||||||
|
research_app = typer.Typer(help="Research experiments and training")
|
||||||
|
app.add_typer(research_app, name="research")
|
||||||
|
|
||||||
|
|
||||||
|
@research_app.command("train")
def research_train(
    name: str = typer.Option(..., "--name", help="Model name to train"),
    # NOTE: `type` shadows the builtin inside this function; the name is kept
    # because it is part of the command's existing signature.
    type: str = typer.Option(..., "--type", help="Experiment type"),
    templates: str = typer.Option(
        "research_templates.yaml", help="Templates file path"
    ),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Train a single model from an experiment template."""
    # The docstring above is what Typer shows as this command's help text.
    # Imported inside the command rather than at module level (presumably to
    # keep CLI start-up light — confirm before hoisting).
    from ners.research.experiment.experiment_builder import ExperimentBuilder
    from ners.research.model_trainer import ModelTrainer

    cfg = _load_config(config, env)

    # Look up the template matching --name / --type in the templates file.
    exp_builder = ExperimentBuilder(cfg)
    tmpl = exp_builder.load_templates(templates)
    exp_cfg = exp_builder.find_template(tmpl, name, type)

    # Optional template keys fall back to empty params / tags.
    trainer = ModelTrainer(cfg)
    trainer.train_single_model(
        model_name=exp_cfg.get("name"),
        model_type=exp_cfg.get("model_type"),
        features=exp_cfg.get("features"),
        model_params=exp_cfg.get("model_params", {}),
        tags=exp_cfg.get("tags", []),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Monitor commands
|
||||||
|
# -------------------------
|
||||||
|
monitor_app = typer.Typer(help="Monitor pipeline checkpoints")
|
||||||
|
app.add_typer(monitor_app, name="monitor")
|
||||||
|
|
||||||
|
|
||||||
|
@monitor_app.command("status")
def monitor_status(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
    detailed: bool = typer.Option(
        False, help="Show detailed status (failed batch IDs)"
    ),
) -> None:
    """Print the status of the pipeline checkpoints."""
    # NOTE: the one-line docstring doubles as the command's help text in Typer.
    #
    # The configuration is loaded before the monitor is imported and built; the
    # returned object itself is not needed here.
    _load_config(config, env)
    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    monitor = PipelineMonitor()
    monitor.print_status(detailed=detailed)
|
||||||
|
|
||||||
|
|
||||||
|
@monitor_app.command("clean")
def monitor_clean(
    step: Optional[str] = typer.Option(None, help="Step to clean; default all"),
    keep_last: int = typer.Option(1, help="Number of latest checkpoint files to keep"),
    force: bool = typer.Option(False, help="Do not ask for confirmation"),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Clean checkpoint files for one step or all steps, keeping the latest ones."""
    # NOTE: the one-line docstring doubles as the command's help text in Typer.
    #
    # The configuration is loaded before the monitor is imported and built; the
    # returned object itself is not needed here.
    _load_config(config, env)
    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    monitor = PipelineMonitor()

    # Unless --force is given, ask first; a negative answer aborts the command.
    if not force:
        typer.confirm("Clean checkpoints?", abort=True)

    # A named step restricts the clean-up; otherwise every known step is cleaned.
    targets = [step] if step else monitor.steps
    for target in targets:
        monitor.clean_step_checkpoints(target, keep_last)
|
||||||
|
|
||||||
|
|
||||||
|
@monitor_app.command("reset")
def monitor_reset(
    step: Optional[str] = typer.Option(None, help="Step to reset; default all"),
    force: bool = typer.Option(False, help="Do not ask for confirmation"),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Reset one pipeline step, or all steps, deleting their checkpoints."""
    # NOTE: the one-line docstring doubles as the command's help text in Typer.
    #
    # The configuration is loaded before the monitor is imported and built; the
    # returned object itself is not needed here.
    _load_config(config, env)
    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    monitor = PipelineMonitor()

    # Unless --force is given, ask first; a negative answer aborts the command.
    if not force:
        msg = f"Reset {step or 'all steps'}? This deletes checkpoints."
        typer.confirm(msg, abort=True)

    # A named step restricts the reset; otherwise every known step is reset.
    targets = [step] if step else monitor.steps
    for target in targets:
        monitor.reset_step(target)
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------
# Web commands
# -------------------------
# Sub-application holding the commands mounted under the ``web`` group.
web_app = typer.Typer(
    help="Web UI wrapper",
)
app.add_typer(
    web_app,
    name="web",
)
|
||||||
|
|
||||||
|
|
||||||
|
@web_app.command("run")
def web_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Launch the Streamlit web app via subprocess."""
    # The Streamlit script sits in the ``web`` directory next to this module.
    streamlit_script = Path(__file__).parent / "web" / "app.py"
    command = [sys.executable, "-m", "streamlit", "run", str(streamlit_script)]

    # Pass configuration via environment variables to avoid argparse in Streamlit
    child_env = os.environ.copy()
    if config is not None:
        child_env["NERS_CONFIG"] = str(config)
    # NOTE(review): the environment name is assumed to be exported whether or
    # not --config was given — confirm against the original indentation.
    child_env["NERS_ENV"] = env

    # Propagate the child's return code as this command's exit status.
    return_code = subprocess.call(command, env=child_env)
    raise typer.Exit(return_code)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":  # pragma: no cover
    app()
|
||||||
@@ -2,10 +2,10 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
from core.utils import ensure_directories
|
from ners.core.utils import ensure_directories
|
||||||
from .config_manager import ConfigManager
|
from ners.core.config.config_manager import ConfigManager
|
||||||
from .logging_config import LoggingConfig
|
from ners.core.config.logging_config import LoggingConfig
|
||||||
from .pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
config_manager = ConfigManager()
|
config_manager = ConfigManager()
|
||||||
|
|
||||||
@@ -22,7 +22,9 @@ def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfi
|
|||||||
return config_manager.get_config()
|
return config_manager.get_config()
|
||||||
|
|
||||||
|
|
||||||
def setup_config(config_path: Optional[Path] = None, env: str = "development") -> PipelineConfig:
|
def setup_config(
|
||||||
|
config_path: Optional[Path] = None, env: str = "development"
|
||||||
|
) -> PipelineConfig:
|
||||||
"""
|
"""
|
||||||
Unified configuration loading and logging setup for all entrypoint scripts.
|
Unified configuration loading and logging setup for all entrypoint scripts.
|
||||||
|
|
||||||
@@ -5,8 +5,8 @@ from typing import Optional, Union, Dict, Any
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from core.config.project_paths import ProjectPaths
|
from ners.core.config.project_paths import ProjectPaths
|
||||||
|
|
||||||
|
|
||||||
class ConfigManager:
|
class ConfigManager:
|
||||||
@@ -36,7 +36,7 @@ class ConfigManager:
|
|||||||
|
|
||||||
def _setup_default_paths(self):
|
def _setup_default_paths(self):
|
||||||
"""Setup default project paths"""
|
"""Setup default project paths"""
|
||||||
root_dir = Path(__file__).parent.parent.parent
|
root_dir = Path(__file__).parent.parent.parent.parent.parent
|
||||||
self.default_paths = ProjectPaths(
|
self.default_paths = ProjectPaths(
|
||||||
root_dir=root_dir,
|
root_dir=root_dir,
|
||||||
configs_dir=root_dir / "config",
|
configs_dir=root_dir / "config",
|
||||||
@@ -53,7 +53,9 @@ class ConfigManager:
|
|||||||
self.config_path = config_path
|
self.config_path = config_path
|
||||||
|
|
||||||
if not self.config_path.exists():
|
if not self.config_path.exists():
|
||||||
logging.warning(f"Config file not found: {self.config_path}. Using defaults.")
|
logging.warning(
|
||||||
|
f"Config file not found: {self.config_path}. Using defaults."
|
||||||
|
)
|
||||||
return self._create_default_config()
|
return self._create_default_config()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -122,7 +124,11 @@ class ConfigManager:
|
|||||||
def _deep_update(self, base_dict: Dict, update_dict: Dict):
|
def _deep_update(self, base_dict: Dict, update_dict: Dict):
|
||||||
"""Recursively update nested dictionaries"""
|
"""Recursively update nested dictionaries"""
|
||||||
for key, value in update_dict.items():
|
for key, value in update_dict.items():
|
||||||
if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
|
if (
|
||||||
|
key in base_dict
|
||||||
|
and isinstance(base_dict[key], dict)
|
||||||
|
and isinstance(value, dict)
|
||||||
|
):
|
||||||
self._deep_update(base_dict[key], value)
|
self._deep_update(base_dict[key], value)
|
||||||
else:
|
else:
|
||||||
base_dict[key] = value
|
base_dict[key] = value
|
||||||
@@ -1,10 +1,10 @@
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from core.config.annotation_config import AnnotationConfig
|
from ners.core.config.annotation_config import AnnotationConfig
|
||||||
from core.config.data_config import DataConfig
|
from ners.core.config.data_config import DataConfig
|
||||||
from core.config.logging_config import LoggingConfig
|
from ners.core.config.logging_config import LoggingConfig
|
||||||
from core.config.processing_config import ProcessingConfig
|
from ners.core.config.processing_config import ProcessingConfig
|
||||||
from core.config.project_paths import ProjectPaths
|
from ners.core.config.project_paths import ProjectPaths
|
||||||
|
|
||||||
|
|
||||||
class PipelineConfig(BaseModel):
|
class PipelineConfig(BaseModel):
|
||||||
@@ -10,6 +10,8 @@ class ProcessingConfig(BaseModel):
|
|||||||
max_workers: int = 4
|
max_workers: int = 4
|
||||||
checkpoint_interval: int = 5
|
checkpoint_interval: int = 5
|
||||||
use_multiprocessing: bool = False
|
use_multiprocessing: bool = False
|
||||||
encoding_options: list = field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
|
encoding_options: list = field(
|
||||||
|
default_factory=lambda: ["utf-8", "utf-16", "latin1"]
|
||||||
|
)
|
||||||
chunk_size: int = 100_000
|
chunk_size: int = 100_000
|
||||||
epochs: int = 2
|
epochs: int = 2
|
||||||
@@ -4,13 +4,13 @@ from pathlib import Path
|
|||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from core.config import PipelineConfig
|
from ners.core.config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def temporary_config_override(**overrides):
|
def temporary_config_override(**overrides):
|
||||||
"""Context manager for temporarily overriding configuration"""
|
"""Context manager for temporarily overriding configuration"""
|
||||||
from core.config import get_config
|
from ners.core.config import get_config
|
||||||
|
|
||||||
config = get_config()
|
config = get_config()
|
||||||
original_values = {}
|
original_values = {}
|
||||||
@@ -5,7 +5,7 @@ from typing import Optional, Union, Iterator, Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
OPTIMIZED_DTYPES = {
|
OPTIMIZED_DTYPES = {
|
||||||
# Numeric columns with appropriate bit-width
|
# Numeric columns with appropriate bit-width
|
||||||
@@ -113,7 +113,9 @@ class DataLoader:
|
|||||||
sex_values = df["sex"].dropna().unique()
|
sex_values = df["sex"].dropna().unique()
|
||||||
|
|
||||||
if len(sex_values) == 0:
|
if len(sex_values) == 0:
|
||||||
logging.warning(f"No valid values found in sex column 'sex', using random sampling")
|
logging.warning(
|
||||||
|
"No valid values found in sex column 'sex', using random sampling"
|
||||||
|
)
|
||||||
return df.sample(n=max_size, random_state=self.config.data.random_seed)
|
return df.sample(n=max_size, random_state=self.config.data.random_seed)
|
||||||
|
|
||||||
# Calculate samples per sex category
|
# Calculate samples per sex category
|
||||||
@@ -140,18 +142,22 @@ class DataLoader:
|
|||||||
logging.info(f"Sampled {current_samples} records for sex '{sex}'")
|
logging.info(f"Sampled {current_samples} records for sex '{sex}'")
|
||||||
|
|
||||||
if not balanced_samples:
|
if not balanced_samples:
|
||||||
logging.warning("No balanced samples could be created, using random sampling")
|
logging.warning(
|
||||||
|
"No balanced samples could be created, using random sampling"
|
||||||
|
)
|
||||||
return df.sample(n=max_size, random_state=self.config.data.random_seed)
|
return df.sample(n=max_size, random_state=self.config.data.random_seed)
|
||||||
|
|
||||||
# Create result using iloc with indices (no copying until final step)
|
# Create result using iloc with indices (no copying until final step)
|
||||||
result = df.iloc[balanced_samples].copy()
|
result = df.iloc[balanced_samples].copy()
|
||||||
|
|
||||||
# Shuffle the final result
|
# Shuffle the final result
|
||||||
result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
|
result = result.sample(
|
||||||
drop=True
|
frac=1, random_state=self.config.data.random_seed
|
||||||
)
|
).reset_index(drop=True)
|
||||||
|
|
||||||
logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total")
|
logging.info(
|
||||||
|
f"Created balanced dataset with {len(result)} records from {len(df)} total"
|
||||||
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
class PromptManager:
|
class PromptManager:
|
||||||
@@ -2,7 +2,7 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
from typing import Dict, Any
|
from typing import Dict, Any
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
class StateManager:
|
class StateManager:
|
||||||
+11
-41
@@ -1,21 +1,17 @@
|
|||||||
#!.venv/bin/python3
|
#!.venv/bin/python3
|
||||||
import argparse
|
|
||||||
import logging
|
import logging
|
||||||
import sys
|
from ners.core.utils.data_loader import DataLoader
|
||||||
import traceback
|
from ners.processing.batch.batch_config import BatchConfig
|
||||||
|
from ners.processing.pipeline import Pipeline
|
||||||
from core.config import setup_config
|
from ners.processing.steps.data_cleaning_step import DataCleaningStep
|
||||||
from core.utils.data_loader import DataLoader
|
from ners.processing.steps.data_selection_step import DataSelectionStep
|
||||||
from processing.batch.batch_config import BatchConfig
|
from ners.processing.steps.data_splitting_step import DataSplittingStep
|
||||||
from processing.pipeline import Pipeline
|
from ners.processing.steps.llm_annotation_step import LLMAnnotationStep
|
||||||
from processing.steps.data_cleaning_step import DataCleaningStep
|
from ners.processing.steps.ner_annotation_step import NERAnnotationStep
|
||||||
from processing.steps.data_selection_step import DataSelectionStep
|
from ners.processing.steps.feature_extraction_step import FeatureExtractionStep
|
||||||
from processing.steps.data_splitting_step import DataSplittingStep
|
|
||||||
from processing.steps.feature_extraction_step import FeatureExtractionStep
|
|
||||||
|
|
||||||
|
|
||||||
def create_pipeline(config) -> Pipeline:
|
def create_pipeline(config) -> Pipeline:
|
||||||
"""Create pipeline from configuration"""
|
|
||||||
batch_config = BatchConfig(
|
batch_config = BatchConfig(
|
||||||
batch_size=config.processing.batch_size,
|
batch_size=config.processing.batch_size,
|
||||||
max_workers=config.processing.max_workers,
|
max_workers=config.processing.max_workers,
|
||||||
@@ -23,14 +19,13 @@ def create_pipeline(config) -> Pipeline:
|
|||||||
use_multiprocessing=config.processing.use_multiprocessing,
|
use_multiprocessing=config.processing.use_multiprocessing,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add steps based on configuration
|
|
||||||
pipeline = Pipeline(batch_config)
|
pipeline = Pipeline(batch_config)
|
||||||
steps = [
|
steps = [
|
||||||
DataCleaningStep(config),
|
DataCleaningStep(config),
|
||||||
FeatureExtractionStep(config),
|
FeatureExtractionStep(config),
|
||||||
DataSelectionStep(config),
|
DataSelectionStep(config),
|
||||||
# NERAnnotationStep(config),
|
NERAnnotationStep(config),
|
||||||
# LLMAnnotationStep(config),
|
LLMAnnotationStep(config),
|
||||||
]
|
]
|
||||||
|
|
||||||
for stage in config.stages:
|
for stage in config.stages:
|
||||||
@@ -42,7 +37,6 @@ def create_pipeline(config) -> Pipeline:
|
|||||||
|
|
||||||
|
|
||||||
def run_pipeline(config) -> int:
|
def run_pipeline(config) -> int:
|
||||||
"""Run the complete pipeline"""
|
|
||||||
try:
|
try:
|
||||||
logging.info(f"Starting pipeline: {config.name} v{config.version}")
|
logging.info(f"Starting pipeline: {config.name} v{config.version}")
|
||||||
|
|
||||||
@@ -79,27 +73,3 @@ def run_pipeline(config) -> int:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Pipeline failed: {e}", exc_info=True)
|
logging.error(f"Pipeline failed: {e}", exc_info=True)
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
"""Main entry point with unified configuration loading"""
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="DRC NERS Processing Pipeline",
|
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
||||||
)
|
|
||||||
parser.add_argument("--config", type=str, help="Path to configuration file")
|
|
||||||
parser.add_argument("--env", type=str, default="development", help="Environment name")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
try:
|
|
||||||
config = setup_config(config_path=args.config, env=args.env)
|
|
||||||
return run_pipeline(config)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Pipeline failed: {e}")
|
|
||||||
traceback.print_exc()
|
|
||||||
return 1
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
sys.exit(main())
|
|
||||||
Executable
+14
@@ -0,0 +1,14 @@
|
|||||||
|
#!.venv/bin/python3
|
||||||
|
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
|
||||||
|
|
||||||
|
|
||||||
|
def status(*, detailed: bool = False) -> None:
    """Print the pipeline status using a freshly created ``PipelineMonitor``.

    ``detailed`` is forwarded unchanged to ``PipelineMonitor.print_status``.
    """
    monitor = PipelineMonitor()
    monitor.print_status(detailed=detailed)
|
||||||
|
|
||||||
|
|
||||||
|
def clean_step(step: str, *, keep_last: int = 1) -> None:
    """Clean the checkpoints of ``step`` using a freshly created ``PipelineMonitor``.

    ``step`` and ``keep_last`` are forwarded positionally to
    ``PipelineMonitor.clean_step_checkpoints``.
    """
    monitor = PipelineMonitor()
    monitor.clean_step_checkpoints(step, keep_last)
|
||||||
|
|
||||||
|
|
||||||
|
def reset_step(step: str) -> None:
    """Reset ``step`` using a freshly created ``PipelineMonitor``."""
    monitor = PipelineMonitor()
    monitor.reset_step(step)
|
||||||
+10
-25
@@ -1,29 +1,24 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import argparse
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
import traceback
|
import traceback
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from core.config import setup_config, PipelineConfig
|
from ners.core.config import PipelineConfig
|
||||||
from processing.ner.name_builder import NameBuilder
|
from ners.processing.ner.name_builder import NameBuilder
|
||||||
from processing.ner.name_engineering import NameEngineering
|
from ners.processing.ner.name_engineering import NameEngineering
|
||||||
from processing.ner.name_model import NameModel
|
from ners.processing.ner.name_model import NameModel
|
||||||
|
|
||||||
|
|
||||||
def feature(config: PipelineConfig):
|
def feature(config: PipelineConfig):
|
||||||
"""Apply feature engineering to create position-independent NER dataset."""
|
|
||||||
NameEngineering(config).compute()
|
NameEngineering(config).compute()
|
||||||
|
|
||||||
|
|
||||||
def build(config: PipelineConfig):
|
def build(config: PipelineConfig):
|
||||||
"""Build NER dataset using NERDataBuilder."""
|
|
||||||
NameBuilder(config).build()
|
NameBuilder(config).build()
|
||||||
|
|
||||||
|
|
||||||
def train(config: PipelineConfig):
|
def train(config: PipelineConfig):
|
||||||
"""Train the NER model."""
|
|
||||||
name_model = NameModel(config)
|
name_model = NameModel(config)
|
||||||
|
|
||||||
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
|
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
|
||||||
@@ -37,7 +32,9 @@ def train(config: PipelineConfig):
|
|||||||
split_idx = int(len(data) * 0.9)
|
split_idx = int(len(data) * 0.9)
|
||||||
train_data, eval_data = data[:split_idx], data[split_idx:]
|
train_data, eval_data = data[:split_idx], data[split_idx:]
|
||||||
|
|
||||||
logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
|
logging.info(
|
||||||
|
f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}"
|
||||||
|
)
|
||||||
name_model.train(
|
name_model.train(
|
||||||
data=train_data,
|
data=train_data,
|
||||||
epochs=config.processing.epochs,
|
epochs=config.processing.epochs,
|
||||||
@@ -75,21 +72,9 @@ def run_pipeline(config: PipelineConfig, reset: bool = False):
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="NER model management for DRC names")
|
|
||||||
parser.add_argument("--config", type=str, help="Path to configuration file")
|
|
||||||
parser.add_argument("--env", type=str, default="development", help="Environment name")
|
|
||||||
parser.add_argument("--reset", action="store_true", help="Reset all steps")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
config = setup_config(config_path=args.config, env=args.env)
|
logging.error("This module is no longer a CLI. Use 'ners ner ...' instead.")
|
||||||
return run_pipeline(config, args.reset)
|
return 1
|
||||||
|
except Exception:
|
||||||
except Exception as e:
|
|
||||||
print(f"Pipeline failed: {e}")
|
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
sys.exit(main())
|
|
||||||
@@ -8,4 +8,6 @@ class BatchConfig:
|
|||||||
batch_size: int = 1000
|
batch_size: int = 1000
|
||||||
max_workers: int = 4
|
max_workers: int = 4
|
||||||
checkpoint_interval: int = 5 # Save checkpoint every N batches
|
checkpoint_interval: int = 5 # Save checkpoint every N batches
|
||||||
use_multiprocessing: bool = False # Use ProcessPoolExecutor instead of ThreadPoolExecutor
|
use_multiprocessing: bool = (
|
||||||
|
False # Use ProcessPoolExecutor instead of ThreadPoolExecutor
|
||||||
|
)
|
||||||
@@ -4,9 +4,9 @@ from typing import Iterator
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.batch.batch_config import BatchConfig
|
from ners.processing.batch.batch_config import BatchConfig
|
||||||
from processing.batch.memory_monitor import MemoryMonitor
|
from ners.processing.batch.memory_monitor import MemoryMonitor
|
||||||
from processing.steps import PipelineStep
|
from ners.processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
class BatchProcessor:
|
class BatchProcessor:
|
||||||
@@ -33,7 +33,9 @@ class BatchProcessor:
|
|||||||
|
|
||||||
for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
|
for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
|
||||||
if step.batch_exists(batch_id):
|
if step.batch_exists(batch_id):
|
||||||
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
|
logging.info(
|
||||||
|
f"Batch {batch_id} already processed, loading from checkpoint"
|
||||||
|
)
|
||||||
processed_batch = step.load_batch(batch_id)
|
processed_batch = step.load_batch(batch_id)
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
@@ -80,7 +82,9 @@ class BatchProcessor:
|
|||||||
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""Memory-optimized concurrent processing"""
|
"""Memory-optimized concurrent processing"""
|
||||||
executor_class = (
|
executor_class = (
|
||||||
ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
|
ProcessPoolExecutor
|
||||||
|
if self.config.use_multiprocessing
|
||||||
|
else ThreadPoolExecutor
|
||||||
)
|
)
|
||||||
results = {}
|
results = {}
|
||||||
|
|
||||||
@@ -89,7 +93,9 @@ class BatchProcessor:
|
|||||||
future_to_batch = {}
|
future_to_batch = {}
|
||||||
for batch, batch_id in self.create_batches(df):
|
for batch, batch_id in self.create_batches(df):
|
||||||
if step.batch_exists(batch_id):
|
if step.batch_exists(batch_id):
|
||||||
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
|
logging.info(
|
||||||
|
f"Batch {batch_id} already processed, loading from checkpoint"
|
||||||
|
)
|
||||||
results[batch_id] = step.load_batch(batch_id)
|
results[batch_id] = step.load_batch(batch_id)
|
||||||
else:
|
else:
|
||||||
# Only copy if necessary for concurrent processing
|
# Only copy if necessary for concurrent processing
|
||||||
@@ -121,7 +127,9 @@ class BatchProcessor:
|
|||||||
del results
|
del results
|
||||||
self.memory_monitor.cleanup_memory()
|
self.memory_monitor.cleanup_memory()
|
||||||
|
|
||||||
result = self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
|
result = (
|
||||||
|
self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
|
||||||
|
)
|
||||||
|
|
||||||
# Final cleanup
|
# Final cleanup
|
||||||
del ordered_results
|
del ordered_results
|
||||||
@@ -131,7 +139,9 @@ class BatchProcessor:
|
|||||||
|
|
||||||
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""Process data using the configured strategy"""
|
"""Process data using the configured strategy"""
|
||||||
step.state.total_batches = (len(df) + self.config.batch_size - 1) // self.config.batch_size
|
step.state.total_batches = (
|
||||||
|
len(df) + self.config.batch_size - 1
|
||||||
|
) // self.config.batch_size
|
||||||
step.load_state()
|
step.load_state()
|
||||||
|
|
||||||
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
|
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
|
||||||
+16
-6
@@ -4,8 +4,8 @@ import shutil
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional, Dict
|
from typing import Optional, Dict
|
||||||
|
|
||||||
from core.config.config_manager import ConfigManager
|
from ners.core.config.config_manager import ConfigManager
|
||||||
from core.config.project_paths import ProjectPaths
|
from ners.core.config.project_paths import ProjectPaths
|
||||||
|
|
||||||
|
|
||||||
class PipelineMonitor:
|
class PipelineMonitor:
|
||||||
@@ -97,7 +97,10 @@ class PipelineMonitor:
|
|||||||
|
|
||||||
avg_completion = total_completion / len(self.steps)
|
avg_completion = total_completion / len(self.steps)
|
||||||
|
|
||||||
if avg_completion >= 100 and overall_status not in ["error", "completed_with_errors"]:
|
if avg_completion >= 100 and overall_status not in [
|
||||||
|
"error",
|
||||||
|
"completed_with_errors",
|
||||||
|
]:
|
||||||
overall_status = "completed"
|
overall_status = "completed"
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -121,7 +124,9 @@ class PipelineMonitor:
|
|||||||
print(f"{step_name.replace('_', ' ').title()}:")
|
print(f"{step_name.replace('_', ' ').title()}:")
|
||||||
print(f" Status: {step_status['status']}")
|
print(f" Status: {step_status['status']}")
|
||||||
print(f" Progress: {step_status['completion_percentage']:.1f}%")
|
print(f" Progress: {step_status['completion_percentage']:.1f}%")
|
||||||
print(f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}")
|
print(
|
||||||
|
f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}"
|
||||||
|
)
|
||||||
|
|
||||||
if step_status["failed_batches"] > 0:
|
if step_status["failed_batches"] > 0:
|
||||||
print(f" Failed Batches: {step_status['failed_batches']}")
|
print(f" Failed Batches: {step_status['failed_batches']}")
|
||||||
@@ -141,7 +146,10 @@ class PipelineMonitor:
|
|||||||
if step_dir.exists():
|
if step_dir.exists():
|
||||||
csv_files = list(step_dir.glob("*.csv"))
|
csv_files = list(step_dir.glob("*.csv"))
|
||||||
step_size = sum(f.stat().st_size for f in csv_files)
|
step_size = sum(f.stat().st_size for f in csv_files)
|
||||||
counts[step] = {"files": len(csv_files), "size_mb": step_size / (1024 * 1024)}
|
counts[step] = {
|
||||||
|
"files": len(csv_files),
|
||||||
|
"size_mb": step_size / (1024 * 1024),
|
||||||
|
}
|
||||||
total_size += step_size
|
total_size += step_size
|
||||||
else:
|
else:
|
||||||
counts[step] = {"files": 0, "size_mb": 0}
|
counts[step] = {"files": 0, "size_mb": 0}
|
||||||
@@ -160,7 +168,9 @@ class PipelineMonitor:
|
|||||||
csv_files = sorted(step_dir.glob("batch_*.csv"))
|
csv_files = sorted(step_dir.glob("batch_*.csv"))
|
||||||
|
|
||||||
if len(csv_files) <= keep_last:
|
if len(csv_files) <= keep_last:
|
||||||
logging.info(f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all")
|
logging.info(
|
||||||
|
f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all"
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files
|
files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files
|
||||||
@@ -3,7 +3,7 @@ from typing import List, Tuple, Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.steps.feature_extraction_step import NameCategory
|
from ners.processing.steps.feature_extraction_step import NameCategory
|
||||||
|
|
||||||
|
|
||||||
class BaseNameFormatter(ABC):
|
class BaseNameFormatter(ABC):
|
||||||
@@ -12,7 +12,9 @@ class BaseNameFormatter(ABC):
|
|||||||
Contains common logic for NER tagging and attribute computation.
|
Contains common logic for NER tagging and attribute computation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None):
|
def __init__(
|
||||||
|
self, connectors: List[str] = None, additional_surnames: List[str] = None
|
||||||
|
):
|
||||||
self.connectors = connectors or ["wa", "ya", "ka", "ba"]
|
self.connectors = connectors or ["wa", "ya", "ka", "ba"]
|
||||||
self.additional_surnames = additional_surnames or [
|
self.additional_surnames = additional_surnames or [
|
||||||
"jean",
|
"jean",
|
||||||
@@ -46,7 +48,9 @@ class BaseNameFormatter(ABC):
|
|||||||
end_pos = current_pos + len(word)
|
end_pos = current_pos + len(word)
|
||||||
|
|
||||||
# Determine tag based on word content
|
# Determine tag based on word content
|
||||||
if word in native_parts or any(connector in word for connector in self.connectors):
|
if word in native_parts or any(
|
||||||
|
connector in word for connector in self.connectors
|
||||||
|
):
|
||||||
tag = "NATIVE"
|
tag = "NATIVE"
|
||||||
elif word == surname or word in self.additional_surnames:
|
elif word == surname or word in self.additional_surnames:
|
||||||
tag = "SURNAME"
|
tag = "SURNAME"
|
||||||
@@ -72,7 +76,9 @@ class BaseNameFormatter(ABC):
|
|||||||
"words": words_count,
|
"words": words_count,
|
||||||
"length": length,
|
"length": length,
|
||||||
"identified_category": (
|
"identified_category": (
|
||||||
NameCategory.SIMPLE.value if words_count == 3 else NameCategory.COMPOSE.value
|
NameCategory.SIMPLE.value
|
||||||
|
if words_count == 3
|
||||||
|
else NameCategory.COMPOSE.value
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
+1
-1
@@ -3,7 +3,7 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.ner.formats import BaseNameFormatter
|
from ners.processing.ner.formats import BaseNameFormatter
|
||||||
|
|
||||||
|
|
||||||
class ConnectorFormatter(BaseNameFormatter):
|
class ConnectorFormatter(BaseNameFormatter):
|
||||||
+7
-3
@@ -3,13 +3,15 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.ner.formats import BaseNameFormatter
|
from ners.processing.ner.formats import BaseNameFormatter
|
||||||
|
|
||||||
|
|
||||||
class ExtendedSurnameFormatter(BaseNameFormatter):
|
class ExtendedSurnameFormatter(BaseNameFormatter):
|
||||||
def transform(self, row: pd.Series) -> Dict:
|
def transform(self, row: pd.Series) -> Dict:
|
||||||
native_parts = self.parse_native_components(row["probable_native"])
|
native_parts = self.parse_native_components(row["probable_native"])
|
||||||
original_surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
original_surname = (
|
||||||
|
row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
||||||
|
)
|
||||||
|
|
||||||
# Add random additional surname
|
# Add random additional surname
|
||||||
additional_surname = random.choice(self.additional_surnames)
|
additional_surname = random.choice(self.additional_surnames)
|
||||||
@@ -22,7 +24,9 @@ class ExtendedSurnameFormatter(BaseNameFormatter):
|
|||||||
"identified_name": row["probable_native"],
|
"identified_name": row["probable_native"],
|
||||||
"probable_surname": combined_surname,
|
"probable_surname": combined_surname,
|
||||||
"identified_surname": combined_surname,
|
"identified_surname": combined_surname,
|
||||||
"ner_entities": str(self.create_ner_tags(full_name, native_parts, combined_surname)),
|
"ner_entities": str(
|
||||||
|
self.create_ner_tags(full_name, native_parts, combined_surname)
|
||||||
|
),
|
||||||
"transformation_type": self.transformation_type,
|
"transformation_type": self.transformation_type,
|
||||||
**self.compute_numeric_features(full_name),
|
**self.compute_numeric_features(full_name),
|
||||||
}
|
}
|
||||||
+1
-1
@@ -2,7 +2,7 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.ner.formats import BaseNameFormatter
|
from ners.processing.ner.formats import BaseNameFormatter
|
||||||
|
|
||||||
|
|
||||||
class NativeOnlyFormatter(BaseNameFormatter):
|
class NativeOnlyFormatter(BaseNameFormatter):
|
||||||
+1
-1
@@ -2,7 +2,7 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.ner.formats import BaseNameFormatter
|
from ners.processing.ner.formats import BaseNameFormatter
|
||||||
|
|
||||||
|
|
||||||
class OriginalFormatter(BaseNameFormatter):
|
class OriginalFormatter(BaseNameFormatter):
|
||||||
+1
-1
@@ -2,7 +2,7 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.ner.formats import BaseNameFormatter
|
from ners.processing.ner.formats import BaseNameFormatter
|
||||||
|
|
||||||
|
|
||||||
class PositionFlippedFormatter(BaseNameFormatter):
|
class PositionFlippedFormatter(BaseNameFormatter):
|
||||||
+7
-3
@@ -2,7 +2,7 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.ner.formats import BaseNameFormatter
|
from ners.processing.ner.formats import BaseNameFormatter
|
||||||
|
|
||||||
|
|
||||||
class ReducedNativeFormatter(BaseNameFormatter):
|
class ReducedNativeFormatter(BaseNameFormatter):
|
||||||
@@ -11,7 +11,9 @@ class ReducedNativeFormatter(BaseNameFormatter):
|
|||||||
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
||||||
|
|
||||||
# Keep only first native component + surname
|
# Keep only first native component + surname
|
||||||
reduced_native = native_parts[0] if len(native_parts) > 1 else row["probable_native"]
|
reduced_native = (
|
||||||
|
native_parts[0] if len(native_parts) > 1 else row["probable_native"]
|
||||||
|
)
|
||||||
full_name = f"{reduced_native} {surname}".strip()
|
full_name = f"{reduced_native} {surname}".strip()
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -20,7 +22,9 @@ class ReducedNativeFormatter(BaseNameFormatter):
|
|||||||
"identified_name": reduced_native,
|
"identified_name": reduced_native,
|
||||||
"probable_surname": surname,
|
"probable_surname": surname,
|
||||||
"identified_surname": surname,
|
"identified_surname": surname,
|
||||||
"ner_entities": str(self.create_ner_tags(full_name, [reduced_native], surname)),
|
"ner_entities": str(
|
||||||
|
self.create_ner_tags(full_name, [reduced_native], surname)
|
||||||
|
),
|
||||||
"transformation_type": self.transformation_type,
|
"transformation_type": self.transformation_type,
|
||||||
**self.compute_numeric_features(full_name),
|
**self.compute_numeric_features(full_name),
|
||||||
}
|
}
|
||||||
@@ -4,8 +4,8 @@ import logging
|
|||||||
import spacy
|
import spacy
|
||||||
from spacy.tokens import DocBin
|
from spacy.tokens import DocBin
|
||||||
|
|
||||||
from core.config import PipelineConfig
|
from ners.core.config import PipelineConfig
|
||||||
from core.utils.data_loader import DataLoader
|
from ners.core.utils.data_loader import DataLoader
|
||||||
from .name_tagger import NameTagger
|
from .name_tagger import NameTagger
|
||||||
|
|
||||||
|
|
||||||
@@ -20,7 +20,9 @@ class NameBuilder:
|
|||||||
self.tagger = NameTagger()
|
self.tagger = NameTagger()
|
||||||
|
|
||||||
def build(self) -> int:
|
def build(self) -> int:
|
||||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
|
filepath = self.config.paths.get_data_path(
|
||||||
|
self.config.data.output_files["engineered"]
|
||||||
|
)
|
||||||
df = self.data_loader.load_csv_complete(filepath)
|
df = self.data_loader.load_csv_complete(filepath)
|
||||||
df = df[["name", "ner_tagged", "ner_entities"]]
|
df = df[["name", "ner_tagged", "ner_entities"]]
|
||||||
|
|
||||||
@@ -38,7 +40,9 @@ class NameBuilder:
|
|||||||
|
|
||||||
# Use NERNameTagger for parsing and validation
|
# Use NERNameTagger for parsing and validation
|
||||||
parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
|
parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
|
||||||
validated_entities = self.tagger.validate_entities(ner_df["name"], parsed_entities)
|
validated_entities = self.tagger.validate_entities(
|
||||||
|
ner_df["name"], parsed_entities
|
||||||
|
)
|
||||||
|
|
||||||
# Drop rows with no valid entities
|
# Drop rows with no valid entities
|
||||||
mask = validated_entities.map(bool)
|
mask = validated_entities.map(bool)
|
||||||
@@ -51,22 +55,33 @@ class NameBuilder:
|
|||||||
|
|
||||||
# Prepare training data
|
# Prepare training data
|
||||||
training_data = list(
|
training_data = list(
|
||||||
zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
|
zip(
|
||||||
|
ner_df["name"].tolist(),
|
||||||
|
[{"entities": ents} for ents in validated_entities],
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use NERNameTagger to create spaCy DocBin
|
# Use NERNameTagger to create spaCy DocBin
|
||||||
docs = self.tagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
|
docs = self.tagger.create_docs(
|
||||||
|
nlp, ner_df["name"].tolist(), validated_entities.tolist()
|
||||||
|
)
|
||||||
doc_bin = DocBin(docs=docs)
|
doc_bin = DocBin(docs=docs)
|
||||||
|
|
||||||
# Save
|
# Save
|
||||||
json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
|
json_path = self.config.paths.get_data_path(
|
||||||
spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
|
self.config.data.output_files["ner_data"]
|
||||||
|
)
|
||||||
|
spacy_path = self.config.paths.get_data_path(
|
||||||
|
self.config.data.output_files["ner_spacy"]
|
||||||
|
)
|
||||||
|
|
||||||
with open(json_path, "w", encoding="utf-8") as f:
|
with open(json_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
|
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
|
||||||
doc_bin.to_disk(spacy_path)
|
doc_bin.to_disk(spacy_path)
|
||||||
|
|
||||||
logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
|
logging.info(
|
||||||
|
f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}"
|
||||||
|
)
|
||||||
logging.info(f"Saved NER JSON to {json_path}")
|
logging.info(f"Saved NER JSON to {json_path}")
|
||||||
logging.info(f"Saved NER spacy to {spacy_path}")
|
logging.info(f"Saved NER spacy to {spacy_path}")
|
||||||
return 0
|
return 0
|
||||||
@@ -6,14 +6,14 @@ import numpy as np
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from core.config import PipelineConfig
|
from ners.core.config import PipelineConfig
|
||||||
from core.utils.data_loader import DataLoader
|
from ners.core.utils.data_loader import DataLoader
|
||||||
from processing.ner.formats.connectors_format import ConnectorFormatter
|
from ners.processing.ner.formats.connectors_format import ConnectorFormatter
|
||||||
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
|
from ners.processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
|
||||||
from processing.ner.formats.native_only_format import NativeOnlyFormatter
|
from ners.processing.ner.formats.native_only_format import NativeOnlyFormatter
|
||||||
from processing.ner.formats.original_format import OriginalFormatter
|
from ners.processing.ner.formats.original_format import OriginalFormatter
|
||||||
from processing.ner.formats.position_flipped_format import PositionFlippedFormatter
|
from ners.processing.ner.formats.position_flipped_format import PositionFlippedFormatter
|
||||||
from processing.ner.formats.reduced_native_format import ReducedNativeFormatter
|
from ners.processing.ner.formats.reduced_native_format import ReducedNativeFormatter
|
||||||
|
|
||||||
|
|
||||||
class NameEngineering:
|
class NameEngineering:
|
||||||
@@ -44,42 +44,60 @@ class NameEngineering:
|
|||||||
# Initialize format classes
|
# Initialize format classes
|
||||||
self.formatters = {
|
self.formatters = {
|
||||||
"original": OriginalFormatter(self.connectors, self.additional_surnames),
|
"original": OriginalFormatter(self.connectors, self.additional_surnames),
|
||||||
"native_only": NativeOnlyFormatter(self.connectors, self.additional_surnames),
|
"native_only": NativeOnlyFormatter(
|
||||||
"position_flipped": PositionFlippedFormatter(self.connectors, self.additional_surnames),
|
self.connectors, self.additional_surnames
|
||||||
"reduced_native": ReducedNativeFormatter(self.connectors, self.additional_surnames),
|
),
|
||||||
"connector_added": ConnectorFormatter(self.connectors, self.additional_surnames),
|
"position_flipped": PositionFlippedFormatter(
|
||||||
"extended_surname": ExtendedSurnameFormatter(self.connectors, self.additional_surnames),
|
self.connectors, self.additional_surnames
|
||||||
|
),
|
||||||
|
"reduced_native": ReducedNativeFormatter(
|
||||||
|
self.connectors, self.additional_surnames
|
||||||
|
),
|
||||||
|
"connector_added": ConnectorFormatter(
|
||||||
|
self.connectors, self.additional_surnames
|
||||||
|
),
|
||||||
|
"extended_surname": ExtendedSurnameFormatter(
|
||||||
|
self.connectors, self.additional_surnames
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
def load_data(self) -> pd.DataFrame:
|
def load_data(self) -> pd.DataFrame:
|
||||||
"""Load and filter NER-tagged data from CSV file"""
|
"""Load and filter NER-tagged data from CSV file"""
|
||||||
|
|
||||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
filepath = self.config.paths.get_data_path(
|
||||||
|
self.config.data.output_files["featured"]
|
||||||
|
)
|
||||||
df = self.data_loader.load_csv_complete(filepath)
|
df = self.data_loader.load_csv_complete(filepath)
|
||||||
|
|
||||||
# Filter only NER-tagged rows
|
# Filter only NER-tagged rows
|
||||||
ner_data = df[df["ner_tagged"] == 1].copy()
|
ner_data = df[df["ner_tagged"] == 1].copy()
|
||||||
logging.info(f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records")
|
logging.info(
|
||||||
|
f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records"
|
||||||
|
)
|
||||||
|
|
||||||
return ner_data
|
return ner_data
|
||||||
|
|
||||||
def compute(self) -> None:
|
def compute(self) -> None:
|
||||||
logging.info("Applying feature engineering transformations...")
|
logging.info("Applying feature engineering transformations...")
|
||||||
input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
input_filepath = self.config.paths.get_data_path(
|
||||||
|
self.config.data.output_files["featured"]
|
||||||
|
)
|
||||||
output_filepath = self.config.paths.get_data_path(
|
output_filepath = self.config.paths.get_data_path(
|
||||||
self.config.data.output_files["engineered"]
|
self.config.data.output_files["engineered"]
|
||||||
)
|
)
|
||||||
|
|
||||||
df = self.data_loader.load_csv_complete(input_filepath)
|
df = self.data_loader.load_csv_complete(input_filepath)
|
||||||
ner_df = df[df["ner_tagged"] == 1].copy()
|
ner_df = df[df["ner_tagged"] == 1].copy()
|
||||||
logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records")
|
logging.info(
|
||||||
|
f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records"
|
||||||
|
)
|
||||||
|
|
||||||
del df # No need to keep in memory
|
del df # No need to keep in memory
|
||||||
gc.collect()
|
gc.collect()
|
||||||
|
|
||||||
ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
|
ner_df = ner_df.sample(
|
||||||
drop=True
|
frac=1, random_state=self.config.data.random_seed
|
||||||
)
|
).reset_index(drop=True)
|
||||||
total_rows = len(ner_df)
|
total_rows = len(ner_df)
|
||||||
|
|
||||||
# Calculate split points
|
# Calculate split points
|
||||||
@@ -94,7 +112,11 @@ class NameEngineering:
|
|||||||
(0, split_25_1, "original"), # First 25%: original format
|
(0, split_25_1, "original"), # First 25%: original format
|
||||||
(split_25_1, split_25_2, "native_only"), # Second 25%: remove surname
|
(split_25_1, split_25_2, "native_only"), # Second 25%: remove surname
|
||||||
(split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions
|
(split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions
|
||||||
(split_25_3, split_10_1, "reduced_native"), # Fourth 10%: reduce native components
|
(
|
||||||
|
split_25_3,
|
||||||
|
split_10_1,
|
||||||
|
"reduced_native",
|
||||||
|
), # Fourth 10%: reduce native components
|
||||||
(split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors
|
(split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors
|
||||||
(split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames
|
(split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames
|
||||||
]
|
]
|
||||||
@@ -11,7 +11,7 @@ from spacy.training import Example
|
|||||||
from spacy.util import minibatch
|
from spacy.util import minibatch
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
class NameModel:
|
class NameModel:
|
||||||
@@ -87,7 +87,9 @@ class NameModel:
|
|||||||
|
|
||||||
# Handle different annotation formats from NERNameTagger
|
# Handle different annotation formats from NERNameTagger
|
||||||
if not isinstance(annotations, dict) or "entities" not in annotations:
|
if not isinstance(annotations, dict) or "entities" not in annotations:
|
||||||
logging.warning(f"Skipping invalid annotations at index {i}: {annotations}")
|
logging.warning(
|
||||||
|
f"Skipping invalid annotations at index {i}: {annotations}"
|
||||||
|
)
|
||||||
skipped_count += 1
|
skipped_count += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -124,7 +126,9 @@ class NameModel:
|
|||||||
valid_entities = []
|
valid_entities = []
|
||||||
for entity in entities:
|
for entity in entities:
|
||||||
if not isinstance(entity, (list, tuple)) or len(entity) != 3:
|
if not isinstance(entity, (list, tuple)) or len(entity) != 3:
|
||||||
logging.warning(f"Skipping invalid entity format in '{text}': {entity}")
|
logging.warning(
|
||||||
|
f"Skipping invalid entity format in '{text}': {entity}"
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
start, end, label = entity
|
start, end, label = entity
|
||||||
@@ -138,21 +142,30 @@ class NameModel:
|
|||||||
or start < 0
|
or start < 0
|
||||||
or end > len(text)
|
or end > len(text)
|
||||||
):
|
):
|
||||||
logging.warning(f"Skipping invalid entity bounds in '{text}': {entity}")
|
logging.warning(
|
||||||
|
f"Skipping invalid entity bounds in '{text}': {entity}"
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check for overlaps with already validated entities
|
# Check for overlaps with already validated entities
|
||||||
has_overlap = any(
|
has_overlap = any(
|
||||||
start < v_end and end > v_start for v_start, v_end, _ in valid_entities
|
start < v_end and end > v_start
|
||||||
|
for v_start, v_end, _ in valid_entities
|
||||||
)
|
)
|
||||||
|
|
||||||
if has_overlap:
|
if has_overlap:
|
||||||
logging.warning(f"Skipping overlapping entity in '{text}': {entity}")
|
logging.warning(
|
||||||
|
f"Skipping overlapping entity in '{text}': {entity}"
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Validate that the span doesn't contain spaces (matching tagger validation)
|
# Validate that the span doesn't contain spaces (matching tagger validation)
|
||||||
span_text = text[start:end]
|
span_text = text[start:end]
|
||||||
if not span_text or span_text != span_text.strip() or " " in span_text:
|
if (
|
||||||
|
not span_text
|
||||||
|
or span_text != span_text.strip()
|
||||||
|
or " " in span_text
|
||||||
|
):
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
|
f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
|
||||||
)
|
)
|
||||||
@@ -161,7 +174,9 @@ class NameModel:
|
|||||||
valid_entities.append((start, end, label))
|
valid_entities.append((start, end, label))
|
||||||
|
|
||||||
if not valid_entities:
|
if not valid_entities:
|
||||||
logging.warning(f"Skipping training example with no valid entities: '{text}'")
|
logging.warning(
|
||||||
|
f"Skipping training example with no valid entities: '{text}'"
|
||||||
|
)
|
||||||
skipped_count += 1
|
skipped_count += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -219,7 +234,9 @@ class NameModel:
|
|||||||
batches = minibatch(examples, size=batch_size)
|
batches = minibatch(examples, size=batch_size)
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
batch_losses = {}
|
batch_losses = {}
|
||||||
self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
|
self.nlp.update(
|
||||||
|
batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer
|
||||||
|
)
|
||||||
logging.info(
|
logging.info(
|
||||||
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
|
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
|
||||||
)
|
)
|
||||||
@@ -230,7 +247,7 @@ class NameModel:
|
|||||||
|
|
||||||
del batches # free memory
|
del batches # free memory
|
||||||
losses_history.append(losses.get("ner", 0))
|
losses_history.append(losses.get("ner", 0))
|
||||||
logging.info(f"Epoch {epoch+1}/{epochs}, Total Loss: {losses['ner']:.4f}")
|
logging.info(f"Epoch {epoch + 1}/{epochs}, Total Loss: {losses['ner']:.4f}")
|
||||||
|
|
||||||
# Store training statistics
|
# Store training statistics
|
||||||
self.training_stats = {
|
self.training_stats = {
|
||||||
@@ -242,7 +259,9 @@ class NameModel:
|
|||||||
"dropout_rate": dropout_rate,
|
"dropout_rate": dropout_rate,
|
||||||
}
|
}
|
||||||
|
|
||||||
logging.info(f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}")
|
logging.info(
|
||||||
|
f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}"
|
||||||
|
)
|
||||||
|
|
||||||
def evaluate(self, test_data: List[Tuple[str, Dict]]) -> Dict[str, Any]:
|
def evaluate(self, test_data: List[Tuple[str, Dict]]) -> Dict[str, Any]:
|
||||||
"""Evaluate the trained model on test data"""
|
"""Evaluate the trained model on test data"""
|
||||||
@@ -291,10 +310,14 @@ class NameModel:
|
|||||||
entity_stats[label]["fp"] += 1
|
entity_stats[label]["fp"] += 1
|
||||||
|
|
||||||
# Calculate overall metrics
|
# Calculate overall metrics
|
||||||
precision = correct_entities / predicted_entities if predicted_entities > 0 else 0
|
precision = (
|
||||||
|
correct_entities / predicted_entities if predicted_entities > 0 else 0
|
||||||
|
)
|
||||||
recall = correct_entities / actual_entities if actual_entities > 0 else 0
|
recall = correct_entities / actual_entities if actual_entities > 0 else 0
|
||||||
f1_score = (
|
f1_score = (
|
||||||
2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
|
2 * (precision * recall) / (precision + recall)
|
||||||
|
if (precision + recall) > 0
|
||||||
|
else 0
|
||||||
)
|
)
|
||||||
|
|
||||||
# Calculate per-label metrics
|
# Calculate per-label metrics
|
||||||
@@ -304,7 +327,11 @@ class NameModel:
|
|||||||
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
|
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
|
||||||
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
|
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
|
||||||
label_f1 = (
|
label_f1 = (
|
||||||
(2 * (label_precision * label_recall) / (label_precision + label_recall))
|
(
|
||||||
|
2
|
||||||
|
* (label_precision * label_recall)
|
||||||
|
/ (label_precision + label_recall)
|
||||||
|
)
|
||||||
if (label_precision + label_recall) > 0
|
if (label_precision + label_recall) > 0
|
||||||
else 0
|
else 0
|
||||||
)
|
)
|
||||||
@@ -394,7 +421,9 @@ class NameModel:
|
|||||||
"label": ent.label_,
|
"label": ent.label_,
|
||||||
"start": ent.start_char,
|
"start": ent.start_char,
|
||||||
"end": ent.end_char,
|
"end": ent.end_char,
|
||||||
"confidence": getattr(ent, "score", None), # If confidence scores are available
|
"confidence": getattr(
|
||||||
|
ent, "score", None
|
||||||
|
), # If confidence scores are available
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -48,7 +48,9 @@ class NameTagger:
|
|||||||
# Find the first occurrence of this native word that doesn't overlap
|
# Find the first occurrence of this native word that doesn't overlap
|
||||||
start_pos = 0
|
start_pos = 0
|
||||||
while True:
|
while True:
|
||||||
pos = name_lower.find(native_word_lower, start_pos) # Case-insensitive search
|
pos = name_lower.find(
|
||||||
|
native_word_lower, start_pos
|
||||||
|
) # Case-insensitive search
|
||||||
if pos == -1:
|
if pos == -1:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -78,7 +80,9 @@ class NameTagger:
|
|||||||
# Find the first occurrence that doesn't overlap
|
# Find the first occurrence that doesn't overlap
|
||||||
start_pos = 0
|
start_pos = 0
|
||||||
while True:
|
while True:
|
||||||
pos = name_lower.find(surname_lower, start_pos) # Case-insensitive search
|
pos = name_lower.find(
|
||||||
|
surname_lower, start_pos
|
||||||
|
) # Case-insensitive search
|
||||||
if pos == -1:
|
if pos == -1:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -120,8 +124,13 @@ class NameTagger:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Check for overlaps with already validated entities
|
# Check for overlaps with already validated entities
|
||||||
if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities):
|
if any(
|
||||||
logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'")
|
start < v_end and end > v_start
|
||||||
|
for v_start, v_end, _ in validated_entities
|
||||||
|
):
|
||||||
|
logging.warning(
|
||||||
|
f"Overlapping span ({start}, {end}, '{label}') in '{name}'"
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
|
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
|
||||||
@@ -200,10 +209,16 @@ class NameTagger:
|
|||||||
elif entities_str.startswith("[[") and entities_str.endswith("]]"):
|
elif entities_str.startswith("[[") and entities_str.endswith("]]"):
|
||||||
return [tuple(e) for e in ast.literal_eval(entities_str)]
|
return [tuple(e) for e in ast.literal_eval(entities_str)]
|
||||||
elif entities_str.startswith("[{") and entities_str.endswith("}]"):
|
elif entities_str.startswith("[{") and entities_str.endswith("}]"):
|
||||||
return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
|
return [
|
||||||
|
(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)
|
||||||
|
]
|
||||||
else:
|
else:
|
||||||
parsed = ast.literal_eval(entities_str)
|
parsed = ast.literal_eval(entities_str)
|
||||||
return [tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3]
|
return [
|
||||||
|
tuple(e)
|
||||||
|
for e in parsed
|
||||||
|
if isinstance(e, (list, tuple)) and len(e) == 3
|
||||||
|
]
|
||||||
except (ValueError, SyntaxError, json.JSONDecodeError):
|
except (ValueError, SyntaxError, json.JSONDecodeError):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
@@ -251,7 +266,9 @@ class NameTagger:
|
|||||||
last_end = e
|
last_end = e
|
||||||
return filtered
|
return filtered
|
||||||
|
|
||||||
def validate_entities(self, texts: pd.Series, entities_series: pd.Series) -> pd.Series:
|
def validate_entities(
|
||||||
|
self, texts: pd.Series, entities_series: pd.Series
|
||||||
|
) -> pd.Series:
|
||||||
"""Vectorized entity validation."""
|
"""Vectorized entity validation."""
|
||||||
return pd.Series(map(self.validate, texts, entities_series), index=texts.index)
|
return pd.Series(map(self.validate, texts, entities_series), index=texts.index)
|
||||||
|
|
||||||
@@ -4,9 +4,9 @@ from typing import Dict, Any
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.batch.batch_config import BatchConfig
|
from ners.processing.batch.batch_config import BatchConfig
|
||||||
from processing.batch.batch_processor import BatchProcessor
|
from ners.processing.batch.batch_processor import BatchProcessor
|
||||||
from processing.steps import PipelineStep
|
from ners.processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
class Pipeline:
|
class Pipeline:
|
||||||
@@ -8,9 +8,9 @@ from typing import List, Optional
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from core.utils.data_loader import DataLoader
|
from ners.core.utils.data_loader import DataLoader
|
||||||
from processing.batch.batch_config import BatchConfig
|
from ners.processing.batch.batch_config import BatchConfig
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -38,7 +38,10 @@ class PipelineStep(ABC):
|
|||||||
"""Abstract base class for pipeline steps"""
|
"""Abstract base class for pipeline steps"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
|
self,
|
||||||
|
name: str,
|
||||||
|
pipeline_config: PipelineConfig,
|
||||||
|
batch_config: Optional[BatchConfig] = None,
|
||||||
):
|
):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.pipeline_config = pipeline_config
|
self.pipeline_config = pipeline_config
|
||||||
+3
-3
@@ -2,9 +2,9 @@ import logging
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from core.utils.text_cleaner import TextCleaner
|
from ners.core.utils.text_cleaner import TextCleaner
|
||||||
from processing.steps import PipelineStep
|
from ners.processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
class DataCleaningStep(PipelineStep):
|
class DataCleaningStep(PipelineStep):
|
||||||
+8
-4
@@ -2,8 +2,8 @@ import logging
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from processing.steps import PipelineStep
|
from ners.processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
class DataSelectionStep(PipelineStep):
|
class DataSelectionStep(PipelineStep):
|
||||||
@@ -31,8 +31,12 @@ class DataSelectionStep(PipelineStep):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Check which columns exist in the batch
|
# Check which columns exist in the batch
|
||||||
available_columns = [col for col in self.selected_columns if col in batch.columns]
|
available_columns = [
|
||||||
missing_columns = [col for col in self.selected_columns if col not in batch.columns]
|
col for col in self.selected_columns if col in batch.columns
|
||||||
|
]
|
||||||
|
missing_columns = [
|
||||||
|
col for col in self.selected_columns if col not in batch.columns
|
||||||
|
]
|
||||||
|
|
||||||
if missing_columns:
|
if missing_columns:
|
||||||
logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}")
|
logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}")
|
||||||
+14
-8
@@ -1,11 +1,11 @@
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from core.utils.region_mapper import RegionMapper
|
from ners.core.utils.region_mapper import RegionMapper
|
||||||
from processing.batch.batch_config import BatchConfig
|
from ners.processing.batch.batch_config import BatchConfig
|
||||||
from processing.steps import PipelineStep
|
from ners.processing.steps import PipelineStep
|
||||||
from processing.steps.feature_extraction_step import Gender
|
from ners.processing.steps.feature_extraction_step import Gender
|
||||||
|
|
||||||
|
|
||||||
class DataSplittingStep(PipelineStep):
|
class DataSplittingStep(PipelineStep):
|
||||||
@@ -26,7 +26,9 @@ class DataSplittingStep(PipelineStep):
|
|||||||
if self.eval_indices is None:
|
if self.eval_indices is None:
|
||||||
np.random.seed(self.pipeline_config.data.random_seed)
|
np.random.seed(self.pipeline_config.data.random_seed)
|
||||||
eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
|
eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
|
||||||
self.eval_indices = set(np.random.choice(total_size, size=eval_size, replace=False))
|
self.eval_indices = set(
|
||||||
|
np.random.choice(total_size, size=eval_size, replace=False)
|
||||||
|
)
|
||||||
return self.eval_indices
|
return self.eval_indices
|
||||||
|
|
||||||
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
@@ -45,7 +47,9 @@ class DataSplittingStep(PipelineStep):
|
|||||||
df_evaluation = df[eval_mask]
|
df_evaluation = df[eval_mask]
|
||||||
df_featured = df[~eval_mask]
|
df_featured = df[~eval_mask]
|
||||||
|
|
||||||
self.data_loader.save_csv(df_evaluation, data_dir / output_files["evaluation"])
|
self.data_loader.save_csv(
|
||||||
|
df_evaluation, data_dir / output_files["evaluation"]
|
||||||
|
)
|
||||||
self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
|
self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
|
||||||
else:
|
else:
|
||||||
self.data_loader.save_csv(df, data_dir / output_files["featured"])
|
self.data_loader.save_csv(df, data_dir / output_files["featured"])
|
||||||
@@ -53,7 +57,9 @@ class DataSplittingStep(PipelineStep):
|
|||||||
if self.pipeline_config.data.split_by_province:
|
if self.pipeline_config.data.split_by_province:
|
||||||
for province in RegionMapper.get_provinces():
|
for province in RegionMapper.get_provinces():
|
||||||
df_region = df[df.province == province]
|
df_region = df[df.province == province]
|
||||||
self.data_loader.save_csv(df_region, data_dir / "provinces" / f"{province}.csv")
|
self.data_loader.save_csv(
|
||||||
|
df_region, data_dir / "provinces" / f"{province}.csv"
|
||||||
|
)
|
||||||
|
|
||||||
if self.pipeline_config.data.split_by_gender:
|
if self.pipeline_config.data.split_by_gender:
|
||||||
df_males = df[df.sex == Gender.MALE.value]
|
df_males = df[df.sex == Gender.MALE.value]
|
||||||
+10
-6
@@ -5,10 +5,10 @@ from typing import Dict, Any
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from core.utils.region_mapper import RegionMapper
|
from ners.core.utils.region_mapper import RegionMapper
|
||||||
from processing.ner.name_tagger import NameTagger
|
from ners.processing.ner.name_tagger import NameTagger
|
||||||
from processing.steps import PipelineStep
|
from ners.processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
class Gender(Enum):
|
class Gender(Enum):
|
||||||
@@ -64,10 +64,14 @@ class FeatureExtractionStep(PipelineStep):
|
|||||||
|
|
||||||
self._assign_probable_names(result)
|
self._assign_probable_names(result)
|
||||||
self._process_simple_names(result)
|
self._process_simple_names(result)
|
||||||
result["identified_category"] = self._assign_identified_category(result["words"])
|
result["identified_category"] = self._assign_identified_category(
|
||||||
|
result["words"]
|
||||||
|
)
|
||||||
|
|
||||||
if "year" in result.columns:
|
if "year" in result.columns:
|
||||||
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype("Int16")
|
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype(
|
||||||
|
"Int16"
|
||||||
|
)
|
||||||
|
|
||||||
if "region" in result.columns:
|
if "region" in result.columns:
|
||||||
result["province"] = self.region_mapper.map(result["region"]).str.lower()
|
result["province"] = self.region_mapper.map(result["region"]).str.lower()
|
||||||
+17
-10
@@ -7,12 +7,12 @@ import ollama
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pydantic import ValidationError
|
from pydantic import ValidationError
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from core.utils.prompt_manager import PromptManager
|
from ners.core.utils.prompt_manager import PromptManager
|
||||||
from core.utils.rate_limiter import RateLimitConfig
|
from ners.core.utils.rate_limiter import RateLimitConfig
|
||||||
from core.utils.rate_limiter import RateLimiter
|
from ners.core.utils.rate_limiter import RateLimiter
|
||||||
from processing.batch.batch_config import BatchConfig
|
from ners.processing.batch.batch_config import BatchConfig
|
||||||
from processing.steps import PipelineStep, NameAnnotation
|
from ners.processing.steps import PipelineStep, NameAnnotation
|
||||||
|
|
||||||
|
|
||||||
class LLMAnnotationStep(PipelineStep):
|
class LLMAnnotationStep(PipelineStep):
|
||||||
@@ -24,7 +24,8 @@ class LLMAnnotationStep(PipelineStep):
|
|||||||
batch_config = BatchConfig(
|
batch_config = BatchConfig(
|
||||||
batch_size=pipeline_config.processing.batch_size,
|
batch_size=pipeline_config.processing.batch_size,
|
||||||
max_workers=min(
|
max_workers=min(
|
||||||
self.llm_config.max_concurrent_requests, pipeline_config.processing.max_workers
|
self.llm_config.max_concurrent_requests,
|
||||||
|
pipeline_config.processing.max_workers,
|
||||||
),
|
),
|
||||||
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
||||||
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
|
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
|
||||||
@@ -33,7 +34,9 @@ class LLMAnnotationStep(PipelineStep):
|
|||||||
|
|
||||||
self.prompt = PromptManager(pipeline_config).load_prompt()
|
self.prompt = PromptManager(pipeline_config).load_prompt()
|
||||||
self.rate_limiter = (
|
self.rate_limiter = (
|
||||||
self._create_rate_limiter() if self.llm_config.enable_rate_limiting else None
|
self._create_rate_limiter()
|
||||||
|
if self.llm_config.enable_rate_limiting
|
||||||
|
else None
|
||||||
)
|
)
|
||||||
|
|
||||||
# Statistics
|
# Statistics
|
||||||
@@ -76,7 +79,9 @@ class LLMAnnotationStep(PipelineStep):
|
|||||||
f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout"
|
f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout"
|
||||||
)
|
)
|
||||||
|
|
||||||
annotation = NameAnnotation.model_validate_json(response.message.content)
|
annotation = NameAnnotation.model_validate_json(
|
||||||
|
response.message.content
|
||||||
|
)
|
||||||
result = {
|
result = {
|
||||||
**annotation.model_dump(),
|
**annotation.model_dump(),
|
||||||
"annotated": 1,
|
"annotated": 1,
|
||||||
@@ -119,7 +124,9 @@ class LLMAnnotationStep(PipelineStep):
|
|||||||
logging.info(f"Batch {batch_id}: No entries to annotate")
|
logging.info(f"Batch {batch_id}: No entries to annotate")
|
||||||
return batch
|
return batch
|
||||||
|
|
||||||
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM")
|
logging.info(
|
||||||
|
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM"
|
||||||
|
)
|
||||||
|
|
||||||
batch = batch.copy()
|
batch = batch.copy()
|
||||||
client = ollama.Client()
|
client = ollama.Client()
|
||||||
+12
-6
@@ -5,9 +5,9 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from processing.ner.name_model import NameModel
|
from ners.processing.ner.name_model import NameModel
|
||||||
from processing.steps import PipelineStep, NameAnnotation
|
from ners.processing.steps import PipelineStep, NameAnnotation
|
||||||
|
|
||||||
|
|
||||||
class NERAnnotationStep(PipelineStep):
|
class NERAnnotationStep(PipelineStep):
|
||||||
@@ -39,7 +39,9 @@ class NERAnnotationStep(PipelineStep):
|
|||||||
logging.info("NER model loaded successfully")
|
logging.info("NER model loaded successfully")
|
||||||
else:
|
else:
|
||||||
logging.warning(f"NER model not found at {self.model_path}")
|
logging.warning(f"NER model not found at {self.model_path}")
|
||||||
logging.warning("NER annotation will be skipped. Train the model first.")
|
logging.warning(
|
||||||
|
"NER annotation will be skipped. Train the model first."
|
||||||
|
)
|
||||||
self.name_model.nlp = None
|
self.name_model.nlp = None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Failed to load NER model: {e}")
|
logging.error(f"Failed to load NER model: {e}")
|
||||||
@@ -80,7 +82,9 @@ class NERAnnotationStep(PipelineStep):
|
|||||||
# Create annotation result in same format as LLM step
|
# Create annotation result in same format as LLM step
|
||||||
annotation = NameAnnotation(
|
annotation = NameAnnotation(
|
||||||
identified_name=" ".join(native_parts) if native_parts else None,
|
identified_name=" ".join(native_parts) if native_parts else None,
|
||||||
identified_surname=" ".join(surname_parts) if surname_parts else None,
|
identified_surname=" ".join(surname_parts)
|
||||||
|
if surname_parts
|
||||||
|
else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
@@ -124,7 +128,9 @@ class NERAnnotationStep(PipelineStep):
|
|||||||
logging.info(f"Batch {batch_id}: No entries to annotate")
|
logging.info(f"Batch {batch_id}: No entries to annotate")
|
||||||
return batch
|
return batch
|
||||||
|
|
||||||
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER")
|
logging.info(
|
||||||
|
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER"
|
||||||
|
)
|
||||||
|
|
||||||
batch = batch.copy()
|
batch = batch.copy()
|
||||||
|
|
||||||
@@ -7,7 +7,7 @@ import matplotlib.pyplot as plt
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from research.experiment import ExperimentConfig
|
from ners.research.experiment import ExperimentConfig
|
||||||
|
|
||||||
|
|
||||||
class BaseModel(ABC):
|
class BaseModel(ABC):
|
||||||
@@ -103,16 +103,25 @@ class BaseModel(ABC):
|
|||||||
feature_names = self._get_feature_names()
|
feature_names = self._get_feature_names()
|
||||||
return dict(zip(feature_names, coefficients))
|
return dict(zip(feature_names, coefficients))
|
||||||
|
|
||||||
elif hasattr(self.model, "named_steps") and "classifier" in self.model.named_steps:
|
elif (
|
||||||
|
hasattr(self.model, "named_steps")
|
||||||
|
and "classifier" in self.model.named_steps
|
||||||
|
):
|
||||||
# For sklearn pipelines (like LogisticRegression with vectorizer)
|
# For sklearn pipelines (like LogisticRegression with vectorizer)
|
||||||
classifier = self.model.named_steps["classifier"]
|
classifier = self.model.named_steps["classifier"]
|
||||||
if hasattr(classifier, "coef_"):
|
if hasattr(classifier, "coef_"):
|
||||||
coefficients = np.abs(classifier.coef_[0])
|
coefficients = np.abs(classifier.coef_[0])
|
||||||
if hasattr(self.model.named_steps["vectorizer"], "get_feature_names_out"):
|
if hasattr(
|
||||||
feature_names = self.model.named_steps["vectorizer"].get_feature_names_out()
|
self.model.named_steps["vectorizer"], "get_feature_names_out"
|
||||||
|
):
|
||||||
|
feature_names = self.model.named_steps[
|
||||||
|
"vectorizer"
|
||||||
|
].get_feature_names_out()
|
||||||
# Take top features to avoid too many n-grams
|
# Take top features to avoid too many n-grams
|
||||||
top_indices = np.argsort(coefficients)[-20:]
|
top_indices = np.argsort(coefficients)[-20:]
|
||||||
return dict(zip(feature_names[top_indices], coefficients[top_indices]))
|
return dict(
|
||||||
|
zip(feature_names[top_indices], coefficients[top_indices])
|
||||||
|
)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -143,7 +152,7 @@ class BaseModel(ABC):
|
|||||||
model_data = joblib.load(path)
|
model_data = joblib.load(path)
|
||||||
|
|
||||||
# Recreate the model instance
|
# Recreate the model instance
|
||||||
from research.experiment import ExperimentConfig
|
from ners.research.experiment import ExperimentConfig
|
||||||
|
|
||||||
config = ExperimentConfig.from_dict(model_data["config"])
|
config = ExperimentConfig.from_dict(model_data["config"])
|
||||||
instance = cls(config)
|
instance = cls(config)
|
||||||
@@ -221,7 +230,9 @@ class BaseModel(ABC):
|
|||||||
if "accuracy" in self.training_history:
|
if "accuracy" in self.training_history:
|
||||||
axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
|
axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
|
||||||
if "val_accuracy" in self.training_history:
|
if "val_accuracy" in self.training_history:
|
||||||
axes[0].plot(self.training_history["val_accuracy"], label="Validation Accuracy")
|
axes[0].plot(
|
||||||
|
self.training_history["val_accuracy"], label="Validation Accuracy"
|
||||||
|
)
|
||||||
axes[0].set_title("Model Accuracy")
|
axes[0].set_title("Model Accuracy")
|
||||||
axes[0].set_xlabel("Epoch")
|
axes[0].set_xlabel("Epoch")
|
||||||
axes[0].set_ylabel("Accuracy")
|
axes[0].set_ylabel("Accuracy")
|
||||||
@@ -18,7 +18,9 @@ class ExperimentConfig:
|
|||||||
tags: List[str] = field(default_factory=list)
|
tags: List[str] = field(default_factory=list)
|
||||||
|
|
||||||
# Model configuration
|
# Model configuration
|
||||||
model_type: str = "logistic_regression" # logistic_regression, lstm, transformer, etc.
|
model_type: str = (
|
||||||
|
"logistic_regression" # logistic_regression, lstm, transformer, etc.
|
||||||
|
)
|
||||||
model_params: Dict[str, Any] = field(default_factory=dict)
|
model_params: Dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
# Feature configuration
|
# Feature configuration
|
||||||
@@ -26,7 +28,9 @@ class ExperimentConfig:
|
|||||||
feature_params: Dict[str, Any] = field(default_factory=dict)
|
feature_params: Dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
# Data configuration
|
# Data configuration
|
||||||
train_data_filter: Optional[Dict[str, Any]] = None # Filter criteria for training data
|
train_data_filter: Optional[Dict[str, Any]] = (
|
||||||
|
None # Filter criteria for training data
|
||||||
|
)
|
||||||
test_data_filter: Optional[Dict[str, Any]] = None
|
test_data_filter: Optional[Dict[str, Any]] = None
|
||||||
target_column: str = "sex"
|
target_column: str = "sex"
|
||||||
|
|
||||||
@@ -36,7 +40,9 @@ class ExperimentConfig:
|
|||||||
cross_validation_folds: int = 5
|
cross_validation_folds: int = 5
|
||||||
|
|
||||||
# Evaluation configuration
|
# Evaluation configuration
|
||||||
metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"])
|
metrics: List[str] = field(
|
||||||
|
default_factory=lambda: ["accuracy", "precision", "recall", "f1"]
|
||||||
|
)
|
||||||
|
|
||||||
def to_dict(self) -> Dict[str, Any]:
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
"""Convert to dictionary for serialization"""
|
"""Convert to dictionary for serialization"""
|
||||||
+4
-2
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field, asdict
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional, Dict, List, Any
|
from typing import Optional, Dict, List, Any
|
||||||
|
|
||||||
from research.experiment import ExperimentConfig, ExperimentStatus
|
from ners.research.experiment import ExperimentConfig, ExperimentStatus
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -51,6 +51,8 @@ class ExperimentResult:
|
|||||||
"""Create from dictionary"""
|
"""Create from dictionary"""
|
||||||
data["config"] = ExperimentConfig.from_dict(data["config"])
|
data["config"] = ExperimentConfig.from_dict(data["config"])
|
||||||
data["start_time"] = datetime.fromisoformat(data["start_time"])
|
data["start_time"] = datetime.fromisoformat(data["start_time"])
|
||||||
data["end_time"] = datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
|
data["end_time"] = (
|
||||||
|
datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
|
||||||
|
)
|
||||||
data["status"] = ExperimentStatus(data["status"])
|
data["status"] = ExperimentStatus(data["status"])
|
||||||
return cls(**data)
|
return cls(**data)
|
||||||
+6
-4
@@ -3,9 +3,9 @@ from typing import List, Dict
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from research.experiment import ExperimentConfig
|
from ners.research.experiment import ExperimentConfig
|
||||||
from research.experiment.feature_extractor import FeatureType
|
from ners.research.experiment.feature_extractor import FeatureType
|
||||||
|
|
||||||
|
|
||||||
class ExperimentBuilder:
|
class ExperimentBuilder:
|
||||||
@@ -27,7 +27,9 @@ class ExperimentBuilder:
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def find_template(cls, templates: dict, name: str, experiment_type: str = "baseline") -> dict:
|
def find_template(
|
||||||
|
cls, templates: dict, name: str, experiment_type: str = "baseline"
|
||||||
|
) -> dict:
|
||||||
"""Find experiment configuration by name and type"""
|
"""Find experiment configuration by name and type"""
|
||||||
|
|
||||||
# Map type to section in templates
|
# Map type to section in templates
|
||||||
+37
-15
@@ -9,12 +9,16 @@ import pandas as pd
|
|||||||
from sklearn.metrics import confusion_matrix
|
from sklearn.metrics import confusion_matrix
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
from core.config import PipelineConfig
|
from ners.core.config import PipelineConfig
|
||||||
from core.utils.data_loader import DataLoader
|
from ners.core.utils.data_loader import DataLoader
|
||||||
from research.base_model import BaseModel
|
from ners.research.base_model import BaseModel
|
||||||
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
|
from ners.research.experiment import (
|
||||||
from research.experiment.experiment_tracker import ExperimentTracker
|
ExperimentConfig,
|
||||||
from research.model_registry import create_model
|
ExperimentStatus,
|
||||||
|
calculate_metrics,
|
||||||
|
)
|
||||||
|
from ners.research.experiment.experiment_tracker import ExperimentTracker
|
||||||
|
from ners.research.model_registry import create_model
|
||||||
|
|
||||||
|
|
||||||
class ExperimentRunner:
|
class ExperimentRunner:
|
||||||
@@ -32,10 +36,14 @@ class ExperimentRunner:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
logging.info(f"Starting experiment: {experiment_id}")
|
logging.info(f"Starting experiment: {experiment_id}")
|
||||||
self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
|
self.tracker.update_experiment(
|
||||||
|
experiment_id, status=ExperimentStatus.RUNNING
|
||||||
|
)
|
||||||
|
|
||||||
# Load data
|
# Load data
|
||||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
filepath = self.config.paths.get_data_path(
|
||||||
|
self.config.data.output_files["featured"]
|
||||||
|
)
|
||||||
df = self.data_loader.load_csv_complete(filepath)
|
df = self.data_loader.load_csv_complete(filepath)
|
||||||
|
|
||||||
# Apply data filters if specified
|
# Apply data filters if specified
|
||||||
@@ -63,8 +71,12 @@ class ExperimentRunner:
|
|||||||
test_pred = model.predict(X_test)
|
test_pred = model.predict(X_test)
|
||||||
|
|
||||||
# Calculate metrics
|
# Calculate metrics
|
||||||
train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics)
|
train_metrics = calculate_metrics(
|
||||||
test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics)
|
y_train, train_pred, experiment_config.metrics
|
||||||
|
)
|
||||||
|
test_metrics = calculate_metrics(
|
||||||
|
y_test, test_pred, experiment_config.metrics
|
||||||
|
)
|
||||||
|
|
||||||
# Cross-validation if requested
|
# Cross-validation if requested
|
||||||
cv_metrics = {}
|
cv_metrics = {}
|
||||||
@@ -125,7 +137,9 @@ class ExperimentRunner:
|
|||||||
experiment_ids = []
|
experiment_ids = []
|
||||||
|
|
||||||
for i, config in enumerate(experiments):
|
for i, config in enumerate(experiments):
|
||||||
logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}")
|
logging.info(
|
||||||
|
f"Running experiment {i + 1}/{len(experiments)}: {config.name}"
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
exp_id = self.run_experiment(config)
|
exp_id = self.run_experiment(config)
|
||||||
experiment_ids.append(exp_id)
|
experiment_ids.append(exp_id)
|
||||||
@@ -136,7 +150,9 @@ class ExperimentRunner:
|
|||||||
return experiment_ids
|
return experiment_ids
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _apply_data_filters(cls, df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame:
|
def _apply_data_filters(
|
||||||
|
cls, df: pd.DataFrame, config: ExperimentConfig
|
||||||
|
) -> pd.DataFrame:
|
||||||
"""Apply data filters specified in experiment config"""
|
"""Apply data filters specified in experiment config"""
|
||||||
filtered_df = df.copy()
|
filtered_df = df.copy()
|
||||||
|
|
||||||
@@ -148,9 +164,13 @@ class ExperimentRunner:
|
|||||||
filtered_df = filtered_df[filtered_df[column].isin(criteria)]
|
filtered_df = filtered_df[filtered_df[column].isin(criteria)]
|
||||||
elif isinstance(criteria, dict):
|
elif isinstance(criteria, dict):
|
||||||
if "min" in criteria:
|
if "min" in criteria:
|
||||||
filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
|
filtered_df = filtered_df[
|
||||||
|
filtered_df[column] >= criteria["min"]
|
||||||
|
]
|
||||||
if "max" in criteria:
|
if "max" in criteria:
|
||||||
filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
|
filtered_df = filtered_df[
|
||||||
|
filtered_df[column] <= criteria["max"]
|
||||||
|
]
|
||||||
else:
|
else:
|
||||||
filtered_df = filtered_df[filtered_df[column] == criteria]
|
filtered_df = filtered_df[filtered_df[column] == criteria]
|
||||||
|
|
||||||
@@ -231,7 +251,9 @@ class ExperimentRunner:
|
|||||||
return model
|
return model
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Failed to load model for experiment {experiment_id}: {e}")
|
logging.error(
|
||||||
|
f"Failed to load model for experiment {experiment_id}: {e}"
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return None
|
return None
|
||||||
+13
-6
@@ -6,9 +6,9 @@ from typing import Optional, Dict, List
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config import PipelineConfig, get_config
|
from ners.core.config import PipelineConfig, get_config
|
||||||
from research.experiment import ExperimentConfig, ExperimentStatus
|
from ners.research.experiment import ExperimentConfig, ExperimentStatus
|
||||||
from research.experiment.experiement_result import ExperimentResult
|
from ners.research.experiment.experiement_result import ExperimentResult
|
||||||
|
|
||||||
|
|
||||||
class ExperimentTracker:
|
class ExperimentTracker:
|
||||||
@@ -97,7 +97,10 @@ class ExperimentTracker:
|
|||||||
return sorted(results, key=lambda x: x.start_time, reverse=True)
|
return sorted(results, key=lambda x: x.start_time, reverse=True)
|
||||||
|
|
||||||
def get_best_experiment(
|
def get_best_experiment(
|
||||||
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
|
self,
|
||||||
|
metric: str = "accuracy",
|
||||||
|
dataset: str = "test",
|
||||||
|
filters: Optional[Dict] = None,
|
||||||
) -> Optional[ExperimentResult]:
|
) -> Optional[ExperimentResult]:
|
||||||
"""Get the best experiment based on a metric"""
|
"""Get the best experiment based on a metric"""
|
||||||
experiments = self.list_experiments()
|
experiments = self.list_experiments()
|
||||||
@@ -106,7 +109,9 @@ class ExperimentTracker:
|
|||||||
# Apply additional filters
|
# Apply additional filters
|
||||||
if "model_type" in filters:
|
if "model_type" in filters:
|
||||||
experiments = [
|
experiments = [
|
||||||
e for e in experiments if e.config.model_type == filters["model_type"]
|
e
|
||||||
|
for e in experiments
|
||||||
|
if e.config.model_type == filters["model_type"]
|
||||||
]
|
]
|
||||||
if "features" in filters:
|
if "features" in filters:
|
||||||
experiments = [
|
experiments = [
|
||||||
@@ -118,7 +123,9 @@ class ExperimentTracker:
|
|||||||
valid_experiments = []
|
valid_experiments = []
|
||||||
for exp in experiments:
|
for exp in experiments:
|
||||||
if exp.status == ExperimentStatus.COMPLETED:
|
if exp.status == ExperimentStatus.COMPLETED:
|
||||||
metrics_dict = exp.test_metrics if dataset == "test" else exp.train_metrics
|
metrics_dict = (
|
||||||
|
exp.test_metrics if dataset == "test" else exp.train_metrics
|
||||||
|
)
|
||||||
if metric in metrics_dict:
|
if metric in metrics_dict:
|
||||||
valid_experiments.append((exp, metrics_dict[metric]))
|
valid_experiments.append((exp, metrics_dict[metric]))
|
||||||
|
|
||||||
+3
-1
@@ -24,7 +24,9 @@ class FeatureType(Enum):
|
|||||||
class FeatureExtractor:
|
class FeatureExtractor:
|
||||||
"""Extract different types of features from name data"""
|
"""Extract different types of features from name data"""
|
||||||
|
|
||||||
def __init__(self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None):
|
def __init__(
|
||||||
|
self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None
|
||||||
|
):
|
||||||
self.feature_types = feature_types
|
self.feature_types = feature_types
|
||||||
self.feature_params = feature_params or {}
|
self.feature_params = feature_params or {}
|
||||||
|
|
||||||
@@ -1,18 +1,18 @@
|
|||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from research.base_model import BaseModel
|
from ners.research.base_model import BaseModel
|
||||||
from research.experiment import ExperimentConfig
|
from ners.research.experiment import ExperimentConfig
|
||||||
from research.models.bigru_model import BiGRUModel
|
from ners.research.models.bigru_model import BiGRUModel
|
||||||
from research.models.cnn_model import CNNModel
|
from ners.research.models.cnn_model import CNNModel
|
||||||
from research.models.ensemble_model import EnsembleModel
|
from ners.research.models.ensemble_model import EnsembleModel
|
||||||
from research.models.lightgbm_model import LightGBMModel
|
from ners.research.models.lightgbm_model import LightGBMModel
|
||||||
from research.models.logistic_regression_model import LogisticRegressionModel
|
from ners.research.models.logistic_regression_model import LogisticRegressionModel
|
||||||
from research.models.lstm_model import LSTMModel
|
from ners.research.models.lstm_model import LSTMModel
|
||||||
from research.models.naive_bayes_model import NaiveBayesModel
|
from ners.research.models.naive_bayes_model import NaiveBayesModel
|
||||||
from research.models.random_forest_model import RandomForestModel
|
from ners.research.models.random_forest_model import RandomForestModel
|
||||||
from research.models.svm_model import SVMModel
|
from ners.research.models.svm_model import SVMModel
|
||||||
from research.models.transformer_model import TransformerModel
|
from ners.research.models.transformer_model import TransformerModel
|
||||||
from research.models.xgboost_model import XGBoostModel
|
from ners.research.models.xgboost_model import XGBoostModel
|
||||||
|
|
||||||
MODEL_REGISTRY = {
|
MODEL_REGISTRY = {
|
||||||
"bigru": BiGRUModel,
|
"bigru": BiGRUModel,
|
||||||
@@ -5,12 +5,12 @@ from typing import List, Dict, Any
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config import get_config
|
from ners.core.config import get_config
|
||||||
from core.utils.data_loader import DataLoader
|
from ners.core.utils.data_loader import DataLoader
|
||||||
from research.experiment import FeatureType, ExperimentConfig
|
from ners.research.experiment import FeatureType, ExperimentConfig
|
||||||
from research.experiment.experiment_runner import ExperimentRunner
|
from ners.research.experiment.experiment_runner import ExperimentRunner
|
||||||
from research.experiment.experiment_tracker import ExperimentTracker
|
from ners.research.experiment.experiment_tracker import ExperimentTracker
|
||||||
from research.model_registry import MODEL_REGISTRY
|
from ners.research.model_registry import MODEL_REGISTRY
|
||||||
|
|
||||||
|
|
||||||
class ModelTrainer:
|
class ModelTrainer:
|
||||||
@@ -66,7 +66,9 @@ class ModelTrainer:
|
|||||||
if experiment and experiment.test_metrics:
|
if experiment and experiment.test_metrics:
|
||||||
logging.info("Training completed successfully!")
|
logging.info("Training completed successfully!")
|
||||||
logging.info(f"Experiment ID: {experiment_id}")
|
logging.info(f"Experiment ID: {experiment_id}")
|
||||||
logging.info(f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}")
|
logging.info(
|
||||||
|
f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}"
|
||||||
|
)
|
||||||
logging.info(f"Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")
|
logging.info(f"Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")
|
||||||
|
|
||||||
if save_artifacts:
|
if save_artifacts:
|
||||||
@@ -144,13 +146,17 @@ class ModelTrainer:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# Load data for learning curve generation
|
# Load data for learning curve generation
|
||||||
data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
data_path = self.config.paths.get_data_path(
|
||||||
|
self.config.data.output_files["featured"]
|
||||||
|
)
|
||||||
if data_path.exists():
|
if data_path.exists():
|
||||||
df = self.data_loader.load_csv_complete(data_path)
|
df = self.data_loader.load_csv_complete(data_path)
|
||||||
|
|
||||||
# Generate learning curve
|
# Generate learning curve
|
||||||
logging.info("Generating learning curve...")
|
logging.info("Generating learning curve...")
|
||||||
trained_model.generate_learning_curve(df, df[experiment.config.target_column])
|
trained_model.generate_learning_curve(
|
||||||
|
df, df[experiment.config.target_column]
|
||||||
|
)
|
||||||
|
|
||||||
# Plot and save learning curve
|
# Plot and save learning curve
|
||||||
learning_curve_path = model_dir / "learning_curve.png"
|
learning_curve_path = model_dir / "learning_curve.png"
|
||||||
@@ -187,8 +193,12 @@ class ModelTrainer:
|
|||||||
"model_path": str(model_path),
|
"model_path": str(model_path),
|
||||||
"config_path": str(config_path),
|
"config_path": str(config_path),
|
||||||
"results_path": str(results_path),
|
"results_path": str(results_path),
|
||||||
"learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
|
"learning_curve_plot": str(learning_curve_path)
|
||||||
"training_history_plot": str(training_history_path) if training_history_path else None,
|
if learning_curve_path
|
||||||
|
else None,
|
||||||
|
"training_history_plot": str(training_history_path)
|
||||||
|
if training_history_path
|
||||||
|
else None,
|
||||||
"has_learning_curve": bool(trained_model.learning_curve_data),
|
"has_learning_curve": bool(trained_model.learning_curve_data),
|
||||||
"has_training_history": bool(trained_model.training_history),
|
"has_training_history": bool(trained_model.training_history),
|
||||||
}
|
}
|
||||||
@@ -215,8 +225,12 @@ class ModelTrainer:
|
|||||||
"config_path": str(config_path),
|
"config_path": str(config_path),
|
||||||
"results_path": str(results_path),
|
"results_path": str(results_path),
|
||||||
"metadata_path": str(metadata_path),
|
"metadata_path": str(metadata_path),
|
||||||
"learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
|
"learning_curve_plot": str(learning_curve_path)
|
||||||
"training_history_plot": str(training_history_path) if training_history_path else None,
|
if learning_curve_path
|
||||||
|
else None,
|
||||||
|
"training_history_plot": str(training_history_path)
|
||||||
|
if training_history_path
|
||||||
|
else None,
|
||||||
}
|
}
|
||||||
|
|
||||||
def load_trained_model(self, experiment_id: str):
|
def load_trained_model(self, experiment_id: str):
|
||||||
@@ -227,7 +241,9 @@ class ModelTrainer:
|
|||||||
model_path = model_dir / "complete_model.joblib"
|
model_path = model_dir / "complete_model.joblib"
|
||||||
|
|
||||||
if not model_path.exists():
|
if not model_path.exists():
|
||||||
raise FileNotFoundError(f"Model artifacts not found for experiment {experiment_id}")
|
raise FileNotFoundError(
|
||||||
|
f"Model artifacts not found for experiment {experiment_id}"
|
||||||
|
)
|
||||||
|
|
||||||
# Load the model class dynamically
|
# Load the model class dynamically
|
||||||
metadata_path = model_dir / "metadata.json"
|
metadata_path = model_dir / "metadata.json"
|
||||||
@@ -261,7 +277,9 @@ class ModelTrainer:
|
|||||||
metadata = json.load(f)
|
metadata = json.load(f)
|
||||||
models_data.append(metadata)
|
models_data.append(metadata)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Could not read metadata for {model_dir.name}: {e}")
|
logging.warning(
|
||||||
|
f"Could not read metadata for {model_dir.name}: {e}"
|
||||||
|
)
|
||||||
|
|
||||||
if not models_data:
|
if not models_data:
|
||||||
logging.info("No saved models found.")
|
logging.info("No saved models found.")
|
||||||
@@ -7,7 +7,7 @@ from tensorflow.keras.models import Sequential
|
|||||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||||
|
|
||||||
from research.neural_network_model import NeuralNetworkModel
|
from ners.research.neural_network_model import NeuralNetworkModel
|
||||||
|
|
||||||
|
|
||||||
class BiGRUModel(NeuralNetworkModel):
|
class BiGRUModel(NeuralNetworkModel):
|
||||||
@@ -53,7 +53,9 @@ class BiGRUModel(NeuralNetworkModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
model.compile(
|
model.compile(
|
||||||
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
|
loss="sparse_categorical_crossentropy",
|
||||||
|
optimizer="adam",
|
||||||
|
metrics=["accuracy"],
|
||||||
)
|
)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
@@ -15,7 +15,7 @@ from tensorflow.keras.models import Sequential
|
|||||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||||
|
|
||||||
from research.neural_network_model import NeuralNetworkModel
|
from ners.research.neural_network_model import NeuralNetworkModel
|
||||||
|
|
||||||
|
|
||||||
class CNNModel(NeuralNetworkModel):
|
class CNNModel(NeuralNetworkModel):
|
||||||
@@ -29,7 +29,9 @@ class CNNModel(NeuralNetworkModel):
|
|||||||
[
|
[
|
||||||
# Learn char/subword embeddings; spatial dropout regularizes across channels
|
# Learn char/subword embeddings; spatial dropout regularizes across channels
|
||||||
# to make the model robust to noisy characters and transliteration.
|
# to make the model robust to noisy characters and transliteration.
|
||||||
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
|
Embedding(
|
||||||
|
input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)
|
||||||
|
),
|
||||||
SpatialDropout1D(rate=params.get("embedding_dropout", 0.1)),
|
SpatialDropout1D(rate=params.get("embedding_dropout", 0.1)),
|
||||||
# Small kernels capture short n-gram like patterns; padding='same' keeps
|
# Small kernels capture short n-gram like patterns; padding='same' keeps
|
||||||
# sequence length stable for simpler pooling behavior.
|
# sequence length stable for simpler pooling behavior.
|
||||||
@@ -59,7 +61,9 @@ class CNNModel(NeuralNetworkModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
model.compile(
|
model.compile(
|
||||||
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
|
loss="sparse_categorical_crossentropy",
|
||||||
|
optimizer="adam",
|
||||||
|
metrics=["accuracy"],
|
||||||
)
|
)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
@@ -75,6 +79,8 @@ class CNNModel(NeuralNetworkModel):
|
|||||||
self.tokenizer.fit_on_texts(text_data)
|
self.tokenizer.fit_on_texts(text_data)
|
||||||
|
|
||||||
sequences = self.tokenizer.texts_to_sequences(text_data)
|
sequences = self.tokenizer.texts_to_sequences(text_data)
|
||||||
max_len = self.config.model_params.get("max_len", 20) # Longer for character level
|
max_len = self.config.model_params.get(
|
||||||
|
"max_len", 20
|
||||||
|
) # Longer for character level
|
||||||
|
|
||||||
return pad_sequences(sequences, maxlen=max_len, padding="post")
|
return pad_sequences(sequences, maxlen=max_len, padding="post")
|
||||||
@@ -8,8 +8,8 @@ from sklearn.linear_model import LogisticRegression
|
|||||||
from sklearn.naive_bayes import MultinomialNB
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
from sklearn.pipeline import Pipeline
|
from sklearn.pipeline import Pipeline
|
||||||
|
|
||||||
from research.experiment import ExperimentConfig
|
from ners.research.experiment import ExperimentConfig
|
||||||
from research.traditional_model import TraditionalModel
|
from ners.research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
class EnsembleModel(TraditionalModel):
|
class EnsembleModel(TraditionalModel):
|
||||||
@@ -40,22 +40,28 @@ class EnsembleModel(TraditionalModel):
|
|||||||
[
|
[
|
||||||
(
|
(
|
||||||
"vectorizer",
|
"vectorizer",
|
||||||
CountVectorizer(analyzer="char", ngram_range=(2, 4), max_features=5000),
|
CountVectorizer(
|
||||||
|
analyzer="char", ngram_range=(2, 4), max_features=5000
|
||||||
|
),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"classifier",
|
"classifier",
|
||||||
LogisticRegression(max_iter=1000, random_state=self.config.random_seed),
|
LogisticRegression(
|
||||||
|
max_iter=1000, random_state=self.config.random_seed
|
||||||
|
),
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
estimators.append((f"logistic_regression", model))
|
estimators.append(("logistic_regression", model))
|
||||||
|
|
||||||
elif model_type == "random_forest":
|
elif model_type == "random_forest":
|
||||||
model = Pipeline(
|
model = Pipeline(
|
||||||
[
|
[
|
||||||
(
|
(
|
||||||
"vectorizer",
|
"vectorizer",
|
||||||
TfidfVectorizer(analyzer="char", ngram_range=(2, 3), max_features=3000),
|
TfidfVectorizer(
|
||||||
|
analyzer="char", ngram_range=(2, 3), max_features=3000
|
||||||
|
),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"classifier",
|
"classifier",
|
||||||
@@ -65,19 +71,21 @@ class EnsembleModel(TraditionalModel):
|
|||||||
),
|
),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
estimators.append((f"rf", model))
|
estimators.append(("rf", model))
|
||||||
|
|
||||||
elif model_type == "naive_bayes":
|
elif model_type == "naive_bayes":
|
||||||
model = Pipeline(
|
model = Pipeline(
|
||||||
[
|
[
|
||||||
(
|
(
|
||||||
"vectorizer",
|
"vectorizer",
|
||||||
CountVectorizer(analyzer="char", ngram_range=(1, 3), max_features=4000),
|
CountVectorizer(
|
||||||
|
analyzer="char", ngram_range=(1, 3), max_features=4000
|
||||||
|
),
|
||||||
),
|
),
|
||||||
("classifier", MultinomialNB()),
|
("classifier", MultinomialNB()),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
estimators.append((f"nb", model))
|
estimators.append(("nb", model))
|
||||||
|
|
||||||
# Soft voting averages probabilities (preferred when members are calibrated);
|
# Soft voting averages probabilities (preferred when members are calibrated);
|
||||||
# hard voting uses majority class. Parallelize member predictions.
|
# hard voting uses majority class. Parallelize member predictions.
|
||||||
@@ -5,7 +5,7 @@ from sklearn.base import BaseEstimator
|
|||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
from sklearn.preprocessing import LabelEncoder
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
from research.traditional_model import TraditionalModel
|
from ners.research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
class LightGBMModel(TraditionalModel):
|
class LightGBMModel(TraditionalModel):
|
||||||
@@ -106,7 +106,9 @@ class LightGBMModel(TraditionalModel):
|
|||||||
lambda x: x if x in known_classes else default_class
|
lambda x: x if x in known_classes else default_class
|
||||||
)
|
)
|
||||||
|
|
||||||
encoded = self.label_encoders[feature_key].transform(column_mapped)
|
encoded = self.label_encoders[feature_key].transform(
|
||||||
|
column_mapped
|
||||||
|
)
|
||||||
|
|
||||||
features.append(encoded.reshape(-1, 1))
|
features.append(encoded.reshape(-1, 1))
|
||||||
|
|
||||||
+1
-1
@@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import CountVectorizer
|
|||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
from sklearn.pipeline import Pipeline
|
from sklearn.pipeline import Pipeline
|
||||||
|
|
||||||
from research.traditional_model import TraditionalModel
|
from ners.research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
class LogisticRegressionModel(TraditionalModel):
|
class LogisticRegressionModel(TraditionalModel):
|
||||||
@@ -7,7 +7,7 @@ from tensorflow.keras.models import Sequential
|
|||||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||||
|
|
||||||
from research.neural_network_model import NeuralNetworkModel
|
from ners.research.neural_network_model import NeuralNetworkModel
|
||||||
|
|
||||||
|
|
||||||
class LSTMModel(NeuralNetworkModel):
|
class LSTMModel(NeuralNetworkModel):
|
||||||
@@ -50,7 +50,9 @@ class LSTMModel(NeuralNetworkModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
model.compile(
|
model.compile(
|
||||||
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
|
loss="sparse_categorical_crossentropy",
|
||||||
|
optimizer="adam",
|
||||||
|
metrics=["accuracy"],
|
||||||
)
|
)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
+1
-1
@@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import CountVectorizer
|
|||||||
from sklearn.naive_bayes import MultinomialNB
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
from sklearn.pipeline import Pipeline
|
from sklearn.pipeline import Pipeline
|
||||||
|
|
||||||
from research.traditional_model import TraditionalModel
|
from ners.research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
class NaiveBayesModel(TraditionalModel):
|
class NaiveBayesModel(TraditionalModel):
|
||||||
+7
-4
@@ -6,7 +6,7 @@ from sklearn.base import BaseEstimator
|
|||||||
from sklearn.ensemble import RandomForestClassifier
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
from sklearn.preprocessing import LabelEncoder
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
from research.traditional_model import TraditionalModel
|
from ners.research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
class RandomForestModel(TraditionalModel):
|
class RandomForestModel(TraditionalModel):
|
||||||
@@ -18,7 +18,6 @@ class RandomForestModel(TraditionalModel):
|
|||||||
self.label_encoders: Dict[str, LabelEncoder] = {}
|
self.label_encoders: Dict[str, LabelEncoder] = {}
|
||||||
|
|
||||||
def build_model(self) -> BaseEstimator:
|
def build_model(self) -> BaseEstimator:
|
||||||
|
|
||||||
params = self.config.model_params
|
params = self.config.model_params
|
||||||
|
|
||||||
# Tree ensemble is robust to mixed numeric/categorical encodings; parallelize
|
# Tree ensemble is robust to mixed numeric/categorical encodings; parallelize
|
||||||
@@ -56,10 +55,14 @@ class RandomForestModel(TraditionalModel):
|
|||||||
column_clean = column.fillna("unknown").astype(str)
|
column_clean = column.fillna("unknown").astype(str)
|
||||||
known_classes = set(encoder.classes_)
|
known_classes = set(encoder.classes_)
|
||||||
default_class = (
|
default_class = (
|
||||||
"unknown" if "unknown" in known_classes else encoder.classes_[0]
|
"unknown"
|
||||||
|
if "unknown" in known_classes
|
||||||
|
else encoder.classes_[0]
|
||||||
)
|
)
|
||||||
column_mapped = column_clean.apply(
|
column_mapped = column_clean.apply(
|
||||||
lambda value: value if value in known_classes else default_class
|
lambda value: value
|
||||||
|
if value in known_classes
|
||||||
|
else default_class
|
||||||
)
|
)
|
||||||
encoded = encoder.transform(column_mapped)
|
encoded = encoder.transform(column_mapped)
|
||||||
|
|
||||||
@@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
|||||||
from sklearn.pipeline import Pipeline
|
from sklearn.pipeline import Pipeline
|
||||||
from sklearn.svm import SVC
|
from sklearn.svm import SVC
|
||||||
|
|
||||||
from research.traditional_model import TraditionalModel
|
from ners.research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
class SVMModel(TraditionalModel):
|
class SVMModel(TraditionalModel):
|
||||||
+12
-5
@@ -16,7 +16,7 @@ from tensorflow.keras.models import Model
|
|||||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||||
|
|
||||||
from research.neural_network_model import NeuralNetworkModel
|
from ners.research.neural_network_model import NeuralNetworkModel
|
||||||
|
|
||||||
|
|
||||||
class TransformerModel(NeuralNetworkModel):
|
class TransformerModel(NeuralNetworkModel):
|
||||||
@@ -37,7 +37,8 @@ class TransformerModel(NeuralNetworkModel):
|
|||||||
# Add positional encoding
|
# Add positional encoding
|
||||||
positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
|
positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
|
||||||
pos_embedding = Embedding(
|
pos_embedding = Embedding(
|
||||||
input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64)
|
input_dim=params.get("max_len", 8),
|
||||||
|
output_dim=params.get("embedding_dim", 64),
|
||||||
)(positions)
|
)(positions)
|
||||||
x = x + pos_embedding
|
x = x + pos_embedding
|
||||||
|
|
||||||
@@ -49,7 +50,9 @@ class TransformerModel(NeuralNetworkModel):
|
|||||||
|
|
||||||
model = Model(inputs, outputs)
|
model = Model(inputs, outputs)
|
||||||
model.compile(
|
model.compile(
|
||||||
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
|
optimizer="adam",
|
||||||
|
loss="sparse_categorical_crossentropy",
|
||||||
|
metrics=["accuracy"],
|
||||||
)
|
)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
@@ -62,11 +65,15 @@ class TransformerModel(NeuralNetworkModel):
|
|||||||
key_dim=cfg_params.get("transformer_head_size", 64),
|
key_dim=cfg_params.get("transformer_head_size", 64),
|
||||||
dropout=cfg_params.get("attn_dropout", 0.1),
|
dropout=cfg_params.get("attn_dropout", 0.1),
|
||||||
)(x, x)
|
)(x, x)
|
||||||
x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(attn))
|
x = LayerNormalization(epsilon=1e-6)(
|
||||||
|
x + Dropout(cfg_params.get("dropout", 0.1))(attn)
|
||||||
|
)
|
||||||
|
|
||||||
ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
|
ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
|
||||||
ff = Dense(x.shape[-1])(ff)
|
ff = Dense(x.shape[-1])(ff)
|
||||||
return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(ff))
|
return LayerNormalization(epsilon=1e-6)(
|
||||||
|
x + Dropout(cfg_params.get("dropout", 0.1))(ff)
|
||||||
|
)
|
||||||
|
|
||||||
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
||||||
text_data = self._collect_text_corpus(X)
|
text_data = self._collect_text_corpus(X)
|
||||||
@@ -5,7 +5,7 @@ from sklearn.base import BaseEstimator
|
|||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
from sklearn.preprocessing import LabelEncoder
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
from research.traditional_model import TraditionalModel
|
from ners.research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
class XGBoostModel(TraditionalModel):
|
class XGBoostModel(TraditionalModel):
|
||||||
@@ -106,7 +106,9 @@ class XGBoostModel(TraditionalModel):
|
|||||||
lambda x: x if x in known_classes else default_class
|
lambda x: x if x in known_classes else default_class
|
||||||
)
|
)
|
||||||
|
|
||||||
encoded = self.label_encoders[feature_key].transform(column_mapped)
|
encoded = self.label_encoders[feature_key].transform(
|
||||||
|
column_mapped
|
||||||
|
)
|
||||||
|
|
||||||
features.append(encoded.reshape(-1, 1))
|
features.append(encoded.reshape(-1, 1))
|
||||||
|
|
||||||
@@ -10,8 +10,10 @@ from sklearn.model_selection import StratifiedKFold
|
|||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
from sklearn.preprocessing import LabelEncoder
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
from research.base_model import BaseModel
|
import tensorflow as tf
|
||||||
from research.experiment.feature_extractor import FeatureExtractor
|
|
||||||
|
from ners.research.base_model import BaseModel
|
||||||
|
from ners.research.experiment.feature_extractor import FeatureExtractor
|
||||||
|
|
||||||
|
|
||||||
class NeuralNetworkModel(BaseModel):
|
class NeuralNetworkModel(BaseModel):
|
||||||
@@ -34,8 +36,6 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
# - Enables memory growth to avoid pre-allocating all VRAM
|
# - Enables memory growth to avoid pre-allocating all VRAM
|
||||||
# - Optionally enables mixed precision if requested via model params
|
# - Optionally enables mixed precision if requested via model params
|
||||||
try:
|
try:
|
||||||
import tensorflow as tf # Imported lazily to avoid dependency for non-NN runs
|
|
||||||
|
|
||||||
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
|
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
|
||||||
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
|
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
|
||||||
|
|
||||||
@@ -49,15 +49,15 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
|
|
||||||
if enable_mixed:
|
if enable_mixed:
|
||||||
try:
|
try:
|
||||||
from tensorflow.keras import mixed_precision
|
tf.keras.mixed_precision.set_global_policy("mixed_float16")
|
||||||
|
|
||||||
mixed_precision.set_global_policy("mixed_float16")
|
|
||||||
logging.info("Enabled TensorFlow mixed precision (float16)")
|
logging.info("Enabled TensorFlow mixed precision (float16)")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Could not enable mixed precision: {e}")
|
logging.warning(f"Could not enable mixed precision: {e}")
|
||||||
else:
|
else:
|
||||||
if requested_gpu:
|
if requested_gpu:
|
||||||
logging.warning("Requested GPU but no TensorFlow GPU device is available.")
|
logging.warning(
|
||||||
|
"Requested GPU but no TensorFlow GPU device is available."
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Keep silent in non-TF environments / non-NN workflows
|
# Keep silent in non-TF environments / non-NN workflows
|
||||||
logging.debug(f"TensorFlow GPU setup skipped: {e}")
|
logging.debug(f"TensorFlow GPU setup skipped: {e}")
|
||||||
@@ -86,7 +86,9 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
logging.info(f"Vocabulary size: {vocab_size}")
|
logging.info(f"Vocabulary size: {vocab_size}")
|
||||||
|
|
||||||
# Get additional model parameters
|
# Get additional model parameters
|
||||||
self.model = self.build_model_with_vocab(vocab_size=vocab_size, **self.config.model_params)
|
self.model = self.build_model_with_vocab(
|
||||||
|
vocab_size=vocab_size, **self.config.model_params
|
||||||
|
)
|
||||||
|
|
||||||
# Train the neural network
|
# Train the neural network
|
||||||
logging.info(
|
logging.info(
|
||||||
@@ -143,7 +145,7 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
# Keep zeros (padding) untouched; clamp negatives and > max_idx to OOV
|
# Keep zeros (padding) untouched; clamp negatives and > max_idx to OOV
|
||||||
invalid_mask = (arr < 0) | (arr > max_idx)
|
invalid_mask = (arr < 0) | (arr > max_idx)
|
||||||
# Avoid turning zeros into OOV
|
# Avoid turning zeros into OOV
|
||||||
invalid_mask &= (arr != 0)
|
invalid_mask &= arr != 0
|
||||||
if invalid_mask.any():
|
if invalid_mask.any():
|
||||||
arr[invalid_mask] = oov_index
|
arr[invalid_mask] = oov_index
|
||||||
|
|
||||||
@@ -157,10 +159,14 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
"""Combine configured textual features into one string per record."""
|
"""Combine configured textual features into one string per record."""
|
||||||
|
|
||||||
column_names = [
|
column_names = [
|
||||||
feature.value for feature in self.config.features if feature.value in X.columns
|
feature.value
|
||||||
|
for feature in self.config.features
|
||||||
|
if feature.value in X.columns
|
||||||
]
|
]
|
||||||
if not column_names:
|
if not column_names:
|
||||||
raise ValueError("No configured text features found in the provided DataFrame.")
|
raise ValueError(
|
||||||
|
"No configured text features found in the provided DataFrame."
|
||||||
|
)
|
||||||
|
|
||||||
text_frame = X[column_names].fillna("").astype(str)
|
text_frame = X[column_names].fillna("").astype(str)
|
||||||
|
|
||||||
@@ -193,9 +199,7 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
pass
|
pass
|
||||||
if enable_mixed:
|
if enable_mixed:
|
||||||
try:
|
try:
|
||||||
from tensorflow.keras import mixed_precision
|
tf.keras.mixed_precision.set_global_policy("mixed_float16")
|
||||||
|
|
||||||
mixed_precision.set_global_policy("mixed_float16")
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
@@ -208,7 +212,9 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
X_prepared = self._sanitize_sequences(X_prepared)
|
X_prepared = self._sanitize_sequences(X_prepared)
|
||||||
y_encoded = self.label_encoder.transform(y)
|
y_encoded = self.label_encoder.transform(y)
|
||||||
|
|
||||||
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed)
|
cv = StratifiedKFold(
|
||||||
|
n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
|
||||||
|
)
|
||||||
|
|
||||||
accuracies = []
|
accuracies = []
|
||||||
precisions = []
|
precisions = []
|
||||||
@@ -280,14 +286,14 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
pass
|
pass
|
||||||
if enable_mixed:
|
if enable_mixed:
|
||||||
try:
|
try:
|
||||||
from tensorflow.keras import mixed_precision
|
tf.keras.mixed_precision.set_global_policy("mixed_float16")
|
||||||
|
|
||||||
mixed_precision.set_global_policy("mixed_float16")
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
if requested_gpu:
|
if requested_gpu:
|
||||||
logging.warning("Requested GPU for learning curve but none is available.")
|
logging.warning(
|
||||||
|
"Requested GPU for learning curve but none is available."
|
||||||
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -342,7 +348,7 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
|
|
||||||
# Train model
|
# Train model
|
||||||
if hasattr(model, "fit"):
|
if hasattr(model, "fit"):
|
||||||
history = model.fit(
|
model.fit(
|
||||||
X_train_subset,
|
X_train_subset,
|
||||||
y_train_subset,
|
y_train_subset,
|
||||||
epochs=self.config.model_params.get("epochs", 10),
|
epochs=self.config.model_params.get("epochs", 10),
|
||||||
@@ -3,12 +3,16 @@ import pandas as pd
|
|||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
|
|
||||||
from research.statistics.utils import LETTERS, build_letter_frequencies
|
from ners.research.statistics.utils import LETTERS, build_letter_frequencies
|
||||||
|
|
||||||
|
|
||||||
def plot_transition_matrix(ax, df_probs, title=""):
|
def plot_transition_matrix(ax, df_probs, title=""):
|
||||||
hm = sns.heatmap(
|
hm = sns.heatmap(
|
||||||
df_probs.loc[list(LETTERS), list(LETTERS)], cmap="Reds", annot=False, cbar=False, ax=ax
|
df_probs.loc[list(LETTERS), list(LETTERS)],
|
||||||
|
cmap="Reds",
|
||||||
|
annot=False,
|
||||||
|
cbar=False,
|
||||||
|
ax=ax,
|
||||||
)
|
)
|
||||||
ax.set_title(title, fontsize=12)
|
ax.set_title(title, fontsize=12)
|
||||||
return hm
|
return hm
|
||||||
@@ -31,8 +35,12 @@ def plot_letter_frequencies(males, females, sort_values=False, title=None):
|
|||||||
x = np.arange(len(df_plot))
|
x = np.arange(len(df_plot))
|
||||||
w = 0.4
|
w = 0.4
|
||||||
fig, ax = plt.subplots(figsize=(16, 6))
|
fig, ax = plt.subplots(figsize=(16, 6))
|
||||||
ax.bar(x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8)
|
ax.bar(
|
||||||
ax.bar(x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8)
|
x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8
|
||||||
|
)
|
||||||
|
ax.bar(
|
||||||
|
x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8
|
||||||
|
)
|
||||||
|
|
||||||
ax.set_xticks(x)
|
ax.set_xticks(x)
|
||||||
ax.set_xticklabels(df_plot["letter"])
|
ax.set_xticklabels(df_plot["letter"])
|
||||||
@@ -5,8 +5,6 @@ import numpy as np
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from scipy.spatial.distance import euclidean
|
from scipy.spatial.distance import euclidean
|
||||||
from scipy.stats import entropy
|
from scipy.stats import entropy
|
||||||
from scipy.spatial.distance import euclidean
|
|
||||||
from scipy.stats import entropy
|
|
||||||
from typing import Dict, Any
|
from typing import Dict, Any
|
||||||
|
|
||||||
LETTERS = "abcdefghijklmnopqrstuvwxyz"
|
LETTERS = "abcdefghijklmnopqrstuvwxyz"
|
||||||
@@ -49,7 +47,12 @@ def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFram
|
|||||||
|
|
||||||
def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
|
def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
|
||||||
# Normalize: lowercase, remove non-letters, concatenate all into one string
|
# Normalize: lowercase, remove non-letters, concatenate all into one string
|
||||||
s = series.astype(str).str.lower().str.replace(r"[^a-z]", "", regex=True).str.cat(sep="")
|
s = (
|
||||||
|
series.astype(str)
|
||||||
|
.str.lower()
|
||||||
|
.str.replace(r"[^a-z]", "", regex=True)
|
||||||
|
.str.cat(sep="")
|
||||||
|
)
|
||||||
|
|
||||||
# Convert string into Series of characters
|
# Convert string into Series of characters
|
||||||
chars = pd.Series(list(s))
|
chars = pd.Series(list(s))
|
||||||
@@ -150,8 +153,12 @@ def build_transition_comparisons(
|
|||||||
kl_names_mf = entropy(prepared_names["m"] + 1e-12, prepared_names["f"] + 1e-12)
|
kl_names_mf = entropy(prepared_names["m"] + 1e-12, prepared_names["f"] + 1e-12)
|
||||||
kl_names_fm = entropy(prepared_names["f"] + 1e-12, prepared_names["m"] + 1e-12)
|
kl_names_fm = entropy(prepared_names["f"] + 1e-12, prepared_names["m"] + 1e-12)
|
||||||
|
|
||||||
kl_surnames_mf = entropy(prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12)
|
kl_surnames_mf = entropy(
|
||||||
kl_surnames_fm = entropy(prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12)
|
prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12
|
||||||
|
)
|
||||||
|
kl_surnames_fm = entropy(
|
||||||
|
prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12
|
||||||
|
)
|
||||||
|
|
||||||
jsd_names = 0.5 * (kl_names_mf + kl_names_fm)
|
jsd_names = 0.5 * (kl_names_mf + kl_names_fm)
|
||||||
jsd_surnames = 0.5 * (kl_surnames_mf + kl_surnames_fm)
|
jsd_surnames = 0.5 * (kl_surnames_mf + kl_surnames_fm)
|
||||||
@@ -163,7 +170,9 @@ def build_transition_comparisons(
|
|||||||
P_f = transitions["f"]["probs"].flatten()
|
P_f = transitions["f"]["probs"].flatten()
|
||||||
|
|
||||||
# Calculate the observed JSD (our test statistic)
|
# Calculate the observed JSD (our test statistic)
|
||||||
observed_jsd = 0.5 * (entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12))
|
observed_jsd = 0.5 * (
|
||||||
|
entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12)
|
||||||
|
)
|
||||||
|
|
||||||
# Concatenate male and female counts
|
# Concatenate male and female counts
|
||||||
counts_m = transitions["m"]["counts"]
|
counts_m = transitions["m"]["counts"]
|
||||||
@@ -194,10 +203,12 @@ def build_transition_comparisons(
|
|||||||
|
|
||||||
permuted_jsd = 0.5 * (
|
permuted_jsd = 0.5 * (
|
||||||
entropy(
|
entropy(
|
||||||
permuted_probs_m.mean(axis=1) + 1e-12, permuted_probs_f.mean(axis=1) + 1e-12
|
permuted_probs_m.mean(axis=1) + 1e-12,
|
||||||
|
permuted_probs_f.mean(axis=1) + 1e-12,
|
||||||
)
|
)
|
||||||
+ entropy(
|
+ entropy(
|
||||||
permuted_probs_f.mean(axis=1) + 1e-12, permuted_probs_m.mean(axis=1) + 1e-12
|
permuted_probs_f.mean(axis=1) + 1e-12,
|
||||||
|
permuted_probs_m.mean(axis=1) + 1e-12,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
permuted_jsds.append(permuted_jsd)
|
permuted_jsds.append(permuted_jsd)
|
||||||
@@ -8,8 +8,8 @@ from sklearn.model_selection import StratifiedKFold, cross_val_score
|
|||||||
from sklearn.model_selection import learning_curve
|
from sklearn.model_selection import learning_curve
|
||||||
from sklearn.preprocessing import LabelEncoder
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
from research.base_model import BaseModel
|
from ners.research.base_model import BaseModel
|
||||||
from research.experiment.feature_extractor import FeatureExtractor
|
from ners.research.experiment.feature_extractor import FeatureExtractor
|
||||||
|
|
||||||
|
|
||||||
class TraditionalModel(BaseModel):
|
class TraditionalModel(BaseModel):
|
||||||
@@ -52,7 +52,9 @@ class TraditionalModel(BaseModel):
|
|||||||
# Train model
|
# Train model
|
||||||
if len(X_prepared.shape) == 1:
|
if len(X_prepared.shape) == 1:
|
||||||
# For text-based features (like LogisticRegression with vectorization)
|
# For text-based features (like LogisticRegression with vectorization)
|
||||||
logging.info(f"Fitting model with {X_prepared.shape[0]} samples (text features)")
|
logging.info(
|
||||||
|
f"Fitting model with {X_prepared.shape[0]} samples (text features)"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# For numerical features
|
# For numerical features
|
||||||
logging.info(
|
logging.info(
|
||||||
@@ -74,12 +76,16 @@ class TraditionalModel(BaseModel):
|
|||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def cross_validate(self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> Dict[str, float]:
|
def cross_validate(
|
||||||
|
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
|
||||||
|
) -> Dict[str, float]:
|
||||||
features_df = self.feature_extractor.extract_features(X)
|
features_df = self.feature_extractor.extract_features(X)
|
||||||
X_prepared = self.prepare_features(features_df)
|
X_prepared = self.prepare_features(features_df)
|
||||||
y_encoded = self.label_encoder.transform(y)
|
y_encoded = self.label_encoder.transform(y)
|
||||||
|
|
||||||
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed)
|
cv = StratifiedKFold(
|
||||||
|
n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
|
||||||
|
)
|
||||||
|
|
||||||
# Calculate different metrics
|
# Calculate different metrics
|
||||||
results = {}
|
results = {}
|
||||||
@@ -95,7 +101,11 @@ class TraditionalModel(BaseModel):
|
|||||||
for metric in ["precision", "recall", "f1"]:
|
for metric in ["precision", "recall", "f1"]:
|
||||||
if metric in self.config.metrics:
|
if metric in self.config.metrics:
|
||||||
scores = cross_val_score(
|
scores = cross_val_score(
|
||||||
self.model, X_prepared, y_encoded, cv=cv, scoring=f"{metric}_weighted"
|
self.model,
|
||||||
|
X_prepared,
|
||||||
|
y_encoded,
|
||||||
|
cv=cv,
|
||||||
|
scoring=f"{metric}_weighted",
|
||||||
)
|
)
|
||||||
results[metric] = scores.mean()
|
results[metric] = scores.mean()
|
||||||
results[f"{metric}_std"] = scores.std()
|
results[f"{metric}_std"] = scores.std()
|
||||||
Executable
+46
@@ -0,0 +1,46 @@
|
|||||||
|
#!.venv/bin/python3
|
||||||
|
import logging
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
from ners.core.config import setup_config
|
||||||
|
from ners.research.experiment.experiment_builder import ExperimentBuilder
|
||||||
|
from ners.research.model_trainer import ModelTrainer
|
||||||
|
|
||||||
|
|
||||||
|
def train_from_template(
    name: str,
    type: str,
    *,
    templates: str = "research_templates.yaml",
    config: str | None = None,
    env: str = "development",
) -> int:
    """Train a single model from a named research template.

    Builds the pipeline configuration, loads the templates, looks up the
    entry matching ``name`` and ``type``, and forwards that entry's settings
    to ``ModelTrainer.train_single_model``.

    Args:
        name: Experiment name forwarded to ``ExperimentBuilder.find_template``.
        type: Experiment type forwarded to ``ExperimentBuilder.find_template``.
        templates: Templates source handed to
            ``ExperimentBuilder.load_templates``.
        config: Optional configuration path handed to ``setup_config``.
        env: Environment name handed to ``setup_config``.

    Returns:
        ``0`` when every step completes; ``1`` when any exception is raised
        (the error is logged and the traceback is printed).
    """
    try:
        pipeline_cfg = setup_config(config_path=config, env=env)
        builder = ExperimentBuilder(pipeline_cfg)

        logging.info(f"Loading research templates from: {templates}")
        available = builder.load_templates(templates)

        logging.info(f"Looking for experiment: name='{name}', type='{type}'")
        spec = builder.find_template(available, name, type)

        logging.info(f"Found experiment: {spec.get('name')}")
        logging.info(f"Description: {spec.get('description')}")
        logging.info(f"Features: {spec.get('features')}")

        trainer = ModelTrainer(pipeline_cfg)
        # Missing optional entries fall back to empty containers.
        training_kwargs = {
            "model_name": spec.get("name"),
            "model_type": spec.get("model_type"),
            "features": spec.get("features"),
            "model_params": spec.get("model_params", {}),
            "tags": spec.get("tags", []),
        }
        trainer.train_single_model(**training_kwargs)

        logging.info("Training completed successfully!")
    except Exception as exc:
        # Boundary handler: report the failure and turn it into a status code.
        logging.error(f"Training failed: {exc}")
        traceback.print_exc()
        return 1
    return 0
|
||||||
@@ -1,19 +1,13 @@
|
|||||||
#!.venv/bin/python3
|
#!.venv/bin/python3
|
||||||
import argparse
|
import os
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
# Add parent directory to Python path to access core modules
|
from ners.core.config import setup_config, PipelineConfig
|
||||||
parent_dir = Path(__file__).parent.parent
|
from ners.core.utils.data_loader import DataLoader
|
||||||
sys.path.insert(0, str(parent_dir))
|
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
|
||||||
|
from ners.research.experiment.experiment_runner import ExperimentRunner
|
||||||
from core.config import setup_config, PipelineConfig
|
from ners.research.experiment.experiment_tracker import ExperimentTracker
|
||||||
from core.utils.data_loader import DataLoader
|
|
||||||
from processing.monitoring.pipeline_monitor import PipelineMonitor
|
|
||||||
from research.experiment.experiment_runner import ExperimentRunner
|
|
||||||
from research.experiment.experiment_tracker import ExperimentTracker
|
|
||||||
|
|
||||||
# Page configuration
|
# Page configuration
|
||||||
st.set_page_config(
|
st.set_page_config(
|
||||||
@@ -65,19 +59,9 @@ class StreamlitApp:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
# Initialize app using environment variables when launched via Typer
|
||||||
parser = argparse.ArgumentParser(
|
_config_path = os.environ.get("NERS_CONFIG")
|
||||||
description="DRC NERS Platform",
|
_env = os.environ.get("NERS_ENV", "development")
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
_cfg = setup_config(_config_path, env=_env)
|
||||||
)
|
_app = StreamlitApp(_cfg)
|
||||||
parser.add_argument("--config", type=str, help="Path to configuration file")
|
_app.run()
|
||||||
parser.add_argument("--env", type=str, default="development", help="Environment name")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
config = setup_config(args.config, env=args.env)
|
|
||||||
app = StreamlitApp(config)
|
|
||||||
app.run()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
|
||||||
|
|
||||||
|
|
||||||
@st.cache_data
|
@st.cache_data
|
||||||
@@ -25,7 +25,9 @@ class Dashboard:
|
|||||||
|
|
||||||
# Load basic statistics
|
# Load basic statistics
|
||||||
try:
|
try:
|
||||||
data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
data_path = self.config.paths.get_data_path(
|
||||||
|
self.config.data.output_files["featured"]
|
||||||
|
)
|
||||||
if data_path.exists():
|
if data_path.exists():
|
||||||
df = load_dataset(str(data_path))
|
df = load_dataset(str(data_path))
|
||||||
|
|
||||||
@@ -37,13 +39,17 @@ class Dashboard:
|
|||||||
st.metric("Annotated Names", f"{annotated:,}")
|
st.metric("Annotated Names", f"{annotated:,}")
|
||||||
|
|
||||||
with col3:
|
with col3:
|
||||||
provinces = df["province"].nunique() if "province" in df.columns else 0
|
provinces = (
|
||||||
|
df["province"].nunique() if "province" in df.columns else 0
|
||||||
|
)
|
||||||
st.metric("Provinces", provinces)
|
st.metric("Provinces", provinces)
|
||||||
|
|
||||||
with col4:
|
with col4:
|
||||||
if "sex" in df.columns:
|
if "sex" in df.columns:
|
||||||
gender_dist = df["sex"].value_counts()
|
gender_dist = df["sex"].value_counts()
|
||||||
ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
|
ratio = gender_dist.get("f", 0) / max(
|
||||||
|
gender_dist.get("m", 1), 1
|
||||||
|
)
|
||||||
st.metric("F/M Rate", f"{ratio:.2%}")
|
st.metric("F/M Rate", f"{ratio:.2%}")
|
||||||
with col5:
|
with col5:
|
||||||
if "annotated" in df.columns:
|
if "annotated" in df.columns:
|
||||||
@@ -79,4 +85,6 @@ class Dashboard:
|
|||||||
|
|
||||||
st.dataframe(pd.DataFrame(exp_data), use_container_width=True)
|
st.dataframe(pd.DataFrame(exp_data), use_container_width=True)
|
||||||
else:
|
else:
|
||||||
st.info("No experiments found. Create your first experiment in the Experiments tab!")
|
st.info(
|
||||||
|
"No experiments found. Create your first experiment in the Experiments tab!"
|
||||||
|
)
|
||||||
@@ -0,0 +1,52 @@
|
|||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
|
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data
def load_dataset(file_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame using the ``OPTIMIZED_DTYPES`` mapping.

    Any exception raised while reading is reported through ``st.error`` and an
    empty DataFrame is returned instead of propagating the failure.
    """
    try:
        frame = pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
    except Exception as exc:
        st.error(f"Error loading dataset: {exc}")
        frame = pd.DataFrame()
    return frame
|
||||||
|
|
||||||
|
|
||||||
|
class DataOverview:
    """Streamlit page listing the configured data files and previewing one."""

    def __init__(self, config):
        # Configuration object; ``index`` reads ``config.data`` and ``config.paths``.
        self.config = config

    def index(self):
        """Render the page: file inventory, then a preview of the featured dataset."""
        st.title("Data Overview")

        data_cfg = self.config.data
        data_files = {
            "Names": data_cfg.input_file,
            "Featured Dataset": data_cfg.output_files["featured"],
            "Evaluation Dataset": data_cfg.output_files["evaluation"],
            "Male Names": data_cfg.output_files["males"],
            "Female Names": data_cfg.output_files["females"],
        }

        st.write("Available Data Files:")
        for name, rel_path in data_files.items():
            file_path = self.config.paths.get_data_path(rel_path)
            stats = self._describe_file(file_path)
            st.write(f"- {name}: {file_path} ({stats})")

        # Preview featured dataset if available
        data_path = self.config.paths.get_data_path(data_cfg.output_files["featured"])
        if data_path.exists():
            df = load_dataset(str(data_path))
            st.subheader("Featured Dataset Preview")
            st.dataframe(df.head(), use_container_width=True)
            st.write(f"Rows: {len(df):,}")

    @staticmethod
    def _describe_file(file_path) -> str:
        """Return the size / modification-time summary for *file_path*.

        A single ``stat()`` call supplies both values, so a file that vanishes
        while the page is rendering yields "Not found" instead of raising
        between an existence check and a later ``stat()``.
        """
        try:
            info = file_path.stat()
        except OSError:
            # Missing (or otherwise un-stat-able) files are reported, not fatal.
            return "Not found"
        return f"Size: {info.st_size / (1024 * 1024):.1f} MB, Last Modified: {datetime.fromtimestamp(info.st_mtime)}"
|
||||||
@@ -2,8 +2,8 @@ import pandas as pd
|
|||||||
import plotly.express as px
|
import plotly.express as px
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
from ners.core.utils.data_loader import OPTIMIZED_DTYPES
|
||||||
from web.interfaces.log_reader import LogReader
|
from ners.web.interfaces.log_reader import LogReader
|
||||||
|
|
||||||
|
|
||||||
@st.cache_data
|
@st.cache_data
|
||||||
@@ -31,7 +31,9 @@ class DataProcessing:
|
|||||||
|
|
||||||
# Step details
|
# Step details
|
||||||
for step_name, step_status in status["steps"].items():
|
for step_name, step_status in status["steps"].items():
|
||||||
with st.expander(f"{step_name.replace('_', ' ').title()} - {step_status['status']}"):
|
with st.expander(
|
||||||
|
f"{step_name.replace('_', ' ').title()} - {step_status['status']}"
|
||||||
|
):
|
||||||
col1, col2, col3 = st.columns(3)
|
col1, col2, col3 = st.columns(3)
|
||||||
|
|
||||||
with col1:
|
with col1:
|
||||||
@@ -63,14 +65,20 @@ class DataProcessing:
|
|||||||
|
|
||||||
with col2:
|
with col2:
|
||||||
num_entries = st.number_input(
|
num_entries = st.number_input(
|
||||||
"Number of entries", min_value=5, max_value=50, value=10, key="num_log_entries"
|
"Number of entries",
|
||||||
|
min_value=5,
|
||||||
|
max_value=50,
|
||||||
|
value=10,
|
||||||
|
key="num_log_entries",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get log entries based on filter
|
# Get log entries based on filter
|
||||||
if log_level_filter == "All":
|
if log_level_filter == "All":
|
||||||
log_entries = log_reader.read_last_entries(num_entries)
|
log_entries = log_reader.read_last_entries(num_entries)
|
||||||
else:
|
else:
|
||||||
log_entries = log_reader.read_entries_by_level(log_level_filter, num_entries)
|
log_entries = log_reader.read_entries_by_level(
|
||||||
|
log_level_filter, num_entries
|
||||||
|
)
|
||||||
|
|
||||||
if log_entries:
|
if log_entries:
|
||||||
for entry in log_entries:
|
for entry in log_entries:
|
||||||
@@ -2,13 +2,13 @@ from typing import List, Dict
|
|||||||
|
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from research.experiment import ExperimentConfig, ExperimentStatus
|
from ners.research.experiment import ExperimentConfig, ExperimentStatus
|
||||||
from research.experiment.experiment_builder import ExperimentBuilder
|
from ners.research.experiment.experiment_builder import ExperimentBuilder
|
||||||
from research.experiment.experiment_runner import ExperimentRunner
|
from ners.research.experiment.experiment_runner import ExperimentRunner
|
||||||
from research.experiment.experiment_tracker import ExperimentTracker
|
from ners.research.experiment.experiment_tracker import ExperimentTracker
|
||||||
from research.experiment.feature_extractor import FeatureType
|
from ners.research.experiment.feature_extractor import FeatureType
|
||||||
from research.model_registry import list_available_models
|
from ners.research.model_registry import list_available_models
|
||||||
|
|
||||||
|
|
||||||
class Experiments:
|
class Experiments:
|
||||||
@@ -46,13 +46,19 @@ class Experiments:
|
|||||||
available_experiments = self.experiment_builder.get_templates()
|
available_experiments = self.experiment_builder.get_templates()
|
||||||
|
|
||||||
# Create tabs for different experiment types
|
# Create tabs for different experiment types
|
||||||
exp_tabs = st.tabs(["Baseline", "Advanced", "Feature Studies", "Hyperparameter Tuning"])
|
exp_tabs = st.tabs(
|
||||||
|
["Baseline", "Advanced", "Feature Studies", "Hyperparameter Tuning"]
|
||||||
|
)
|
||||||
|
|
||||||
with exp_tabs[0]:
|
with exp_tabs[0]:
|
||||||
self._show_experiments_by_type(available_experiments["baseline"], "baseline")
|
self._show_experiments_by_type(
|
||||||
|
available_experiments["baseline"], "baseline"
|
||||||
|
)
|
||||||
|
|
||||||
with exp_tabs[1]:
|
with exp_tabs[1]:
|
||||||
self._show_experiments_by_type(available_experiments["advanced"], "advanced")
|
self._show_experiments_by_type(
|
||||||
|
available_experiments["advanced"], "advanced"
|
||||||
|
)
|
||||||
|
|
||||||
with exp_tabs[2]:
|
with exp_tabs[2]:
|
||||||
self._show_experiments_by_type(
|
self._show_experiments_by_type(
|
||||||
@@ -60,7 +66,9 @@ class Experiments:
|
|||||||
)
|
)
|
||||||
|
|
||||||
with exp_tabs[3]:
|
with exp_tabs[3]:
|
||||||
self._show_experiments_by_type(available_experiments["tuning"], "tuning")
|
self._show_experiments_by_type(
|
||||||
|
available_experiments["tuning"], "tuning"
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Error loading experiment templates: {e}")
|
st.error(f"Error loading experiment templates: {e}")
|
||||||
@@ -79,7 +87,9 @@ class Experiments:
|
|||||||
# Show available experiments
|
# Show available experiments
|
||||||
for i, exp_template in enumerate(experiments):
|
for i, exp_template in enumerate(experiments):
|
||||||
exp_name = exp_template.get("name", f"Experiment {i + 1}")
|
exp_name = exp_template.get("name", f"Experiment {i + 1}")
|
||||||
exp_description = exp_template.get("description", "No description available")
|
exp_description = exp_template.get(
|
||||||
|
"description", "No description available"
|
||||||
|
)
|
||||||
|
|
||||||
with st.expander(f"📊 {exp_name} - {exp_description}"):
|
with st.expander(f"📊 {exp_name} - {exp_description}"):
|
||||||
col1, col2 = st.columns([2, 1])
|
col1, col2 = st.columns([2, 1])
|
||||||
@@ -88,7 +98,7 @@ class Experiments:
|
|||||||
st.json(exp_template)
|
st.json(exp_template)
|
||||||
|
|
||||||
with col2:
|
with col2:
|
||||||
if st.button(f"🚀 Run Experiment", key=f"run_{experiment_type}_{i}"):
|
if st.button("🚀 Run Experiment", key=f"run_{experiment_type}_{i}"):
|
||||||
self._run_template_experiment(exp_template)
|
self._run_template_experiment(exp_template)
|
||||||
|
|
||||||
def _run_template_experiment(self, exp_template: Dict):
|
def _run_template_experiment(self, exp_template: Dict):
|
||||||
@@ -100,7 +110,9 @@ class Experiments:
|
|||||||
|
|
||||||
# Run the experiment
|
# Run the experiment
|
||||||
experiment_id = self.experiment_runner.run_experiment(experiment_config)
|
experiment_id = self.experiment_runner.run_experiment(experiment_config)
|
||||||
st.success(f"Experiment '{experiment_config.name}' completed successfully!")
|
st.success(
|
||||||
|
f"Experiment '{experiment_config.name}' completed successfully!"
|
||||||
|
)
|
||||||
st.info(f"Experiment ID: `{experiment_id}`")
|
st.info(f"Experiment ID: `{experiment_id}`")
|
||||||
|
|
||||||
# Show results
|
# Show results
|
||||||
@@ -130,13 +142,17 @@ class Experiments:
|
|||||||
)
|
)
|
||||||
|
|
||||||
with col2:
|
with col2:
|
||||||
model_filter = st.selectbox("Filter by Model", ["All"] + list_available_models())
|
model_filter = st.selectbox(
|
||||||
|
"Filter by Model", ["All"] + list_available_models()
|
||||||
|
)
|
||||||
|
|
||||||
with col3:
|
with col3:
|
||||||
tag_filter = st.text_input("Filter by Tags (comma-separated)")
|
tag_filter = st.text_input("Filter by Tags (comma-separated)")
|
||||||
|
|
||||||
# Get and filter experiments
|
# Get and filter experiments
|
||||||
experiments = self._get_filtered_experiments(status_filter, model_filter, tag_filter)
|
experiments = self._get_filtered_experiments(
|
||||||
|
status_filter, model_filter, tag_filter
|
||||||
|
)
|
||||||
|
|
||||||
if not experiments:
|
if not experiments:
|
||||||
st.info("No experiments found matching the filters.")
|
st.info("No experiments found matching the filters.")
|
||||||
@@ -149,20 +165,28 @@ class Experiments:
|
|||||||
):
|
):
|
||||||
self._display_experiment_details(exp, i)
|
self._display_experiment_details(exp, i)
|
||||||
|
|
||||||
def _get_filtered_experiments(self, status_filter: str, model_filter: str, tag_filter: str):
|
def _get_filtered_experiments(
|
||||||
|
self, status_filter: str, model_filter: str, tag_filter: str
|
||||||
|
):
|
||||||
"""Get experiments with applied filters"""
|
"""Get experiments with applied filters"""
|
||||||
experiments = self.experiment_tracker.list_experiments()
|
experiments = self.experiment_tracker.list_experiments()
|
||||||
|
|
||||||
# Apply filters
|
# Apply filters
|
||||||
if status_filter != "All":
|
if status_filter != "All":
|
||||||
experiments = [e for e in experiments if e.status == ExperimentStatus(status_filter)]
|
experiments = [
|
||||||
|
e for e in experiments if e.status == ExperimentStatus(status_filter)
|
||||||
|
]
|
||||||
|
|
||||||
if model_filter != "All":
|
if model_filter != "All":
|
||||||
experiments = [e for e in experiments if e.config.model_type == model_filter]
|
experiments = [
|
||||||
|
e for e in experiments if e.config.model_type == model_filter
|
||||||
|
]
|
||||||
|
|
||||||
if tag_filter:
|
if tag_filter:
|
||||||
tags = [tag.strip() for tag in tag_filter.split(",")]
|
tags = [tag.strip() for tag in tag_filter.split(",")]
|
||||||
experiments = [e for e in experiments if any(tag in e.config.tags for tag in tags)]
|
experiments = [
|
||||||
|
e for e in experiments if any(tag in e.config.tags for tag in tags)
|
||||||
|
]
|
||||||
|
|
||||||
return experiments
|
return experiments
|
||||||
|
|
||||||
@@ -173,7 +197,9 @@ class Experiments:
|
|||||||
|
|
||||||
with col1:
|
with col1:
|
||||||
st.write(f"**Model:** {exp.config.model_type}")
|
st.write(f"**Model:** {exp.config.model_type}")
|
||||||
st.write(f"**Features:** {', '.join([f.value for f in exp.config.features])}")
|
st.write(
|
||||||
|
f"**Features:** {', '.join([f.value for f in exp.config.features])}"
|
||||||
|
)
|
||||||
st.write(f"**Tags:** {', '.join(exp.config.tags)}")
|
st.write(f"**Tags:** {', '.join(exp.config.tags)}")
|
||||||
|
|
||||||
with col2:
|
with col2:
|
||||||
@@ -185,7 +211,7 @@ class Experiments:
|
|||||||
st.write(f"**Train Size:** {exp.train_size:,}")
|
st.write(f"**Train Size:** {exp.train_size:,}")
|
||||||
st.write(f"**Test Size:** {exp.test_size:,}")
|
st.write(f"**Test Size:** {exp.test_size:,}")
|
||||||
|
|
||||||
if st.button(f"View Details", key=f"details_{index}"):
|
if st.button("View Details", key=f"details_{index}"):
|
||||||
st.session_state.selected_experiment = exp.experiment_id
|
st.session_state.selected_experiment = exp.experiment_id
|
||||||
st.rerun()
|
st.rerun()
|
||||||
|
|
||||||
@@ -198,7 +224,9 @@ class Experiments:
|
|||||||
st.write("Run multiple experiments with different parameter combinations.")
|
st.write("Run multiple experiments with different parameter combinations.")
|
||||||
|
|
||||||
# Add option to run template batch experiments
|
# Add option to run template batch experiments
|
||||||
batch_type = st.radio("Batch Type", ["Template Batch", "Custom Parameter Sweep"])
|
batch_type = st.radio(
|
||||||
|
"Batch Type", ["Template Batch", "Custom Parameter Sweep"]
|
||||||
|
)
|
||||||
|
|
||||||
if batch_type == "Template Batch":
|
if batch_type == "Template Batch":
|
||||||
self._show_template_batch_experiments()
|
self._show_template_batch_experiments()
|
||||||
@@ -227,10 +255,13 @@ class Experiments:
|
|||||||
if experiments:
|
if experiments:
|
||||||
st.write(f"**{exp_type.title()} Experiments:**")
|
st.write(f"**{exp_type.title()} Experiments:**")
|
||||||
exp_names = [
|
exp_names = [
|
||||||
exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)
|
exp.get("name", f"Exp {i}")
|
||||||
|
for i, exp in enumerate(experiments)
|
||||||
]
|
]
|
||||||
selected_names = st.multiselect(
|
selected_names = st.multiselect(
|
||||||
f"Select {exp_type} experiments", exp_names, key=f"select_{exp_type}"
|
f"Select {exp_type} experiments",
|
||||||
|
exp_names,
|
||||||
|
key=f"select_{exp_type}",
|
||||||
)
|
)
|
||||||
|
|
||||||
for name in selected_names:
|
for name in selected_names:
|
||||||
@@ -258,13 +289,17 @@ class Experiments:
|
|||||||
experiment_configs.append(config)
|
experiment_configs.append(config)
|
||||||
|
|
||||||
# Run batch experiments
|
# Run batch experiments
|
||||||
experiment_ids = self.experiment_runner.run_experiment_batch(experiment_configs)
|
experiment_ids = self.experiment_runner.run_experiment_batch(
|
||||||
|
experiment_configs
|
||||||
|
)
|
||||||
|
|
||||||
st.success(f"Completed {len(experiment_ids)} template experiments!")
|
st.success(f"Completed {len(experiment_ids)} template experiments!")
|
||||||
|
|
||||||
# Show summary
|
# Show summary
|
||||||
if experiment_ids:
|
if experiment_ids:
|
||||||
comparison = self.experiment_runner.compare_experiments(experiment_ids)
|
comparison = self.experiment_runner.compare_experiments(
|
||||||
|
experiment_ids
|
||||||
|
)
|
||||||
st.write("**Template Batch Results:**")
|
st.write("**Template Batch Results:**")
|
||||||
st.dataframe(
|
st.dataframe(
|
||||||
comparison[["name", "model_type", "test_accuracy"]],
|
comparison[["name", "model_type", "test_accuracy"]],
|
||||||
@@ -285,7 +320,9 @@ class Experiments:
|
|||||||
with col1:
|
with col1:
|
||||||
base_name = st.text_input("Base Experiment Name", "parameter_sweep")
|
base_name = st.text_input("Base Experiment Name", "parameter_sweep")
|
||||||
model_types = st.multiselect(
|
model_types = st.multiselect(
|
||||||
"Model Types", list_available_models(), default=["logistic_regression"]
|
"Model Types",
|
||||||
|
list_available_models(),
|
||||||
|
default=["logistic_regression"],
|
||||||
)
|
)
|
||||||
|
|
||||||
# N-gram ranges for logistic regression
|
# N-gram ranges for logistic regression
|
||||||
@@ -301,13 +338,20 @@ class Experiments:
|
|||||||
default=["full_name", "native_name", "surname"],
|
default=["full_name", "native_name", "surname"],
|
||||||
)
|
)
|
||||||
|
|
||||||
test_sizes = st.text_input("Test Sizes (comma-separated)", "0.15,0.2,0.25")
|
test_sizes = st.text_input(
|
||||||
|
"Test Sizes (comma-separated)", "0.15,0.2,0.25"
|
||||||
|
)
|
||||||
|
|
||||||
tags = st.text_input("Common Tags", "parameter_sweep,batch")
|
tags = st.text_input("Common Tags", "parameter_sweep,batch")
|
||||||
|
|
||||||
if st.form_submit_button("🚀 Run Parameter Sweep"):
|
if st.form_submit_button("🚀 Run Parameter Sweep"):
|
||||||
self.run_batch_experiments(
|
self.run_batch_experiments(
|
||||||
base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags
|
base_name,
|
||||||
|
model_types,
|
||||||
|
ngram_ranges,
|
||||||
|
feature_combinations,
|
||||||
|
test_sizes,
|
||||||
|
tags,
|
||||||
)
|
)
|
||||||
|
|
||||||
def run_batch_experiments(
|
def run_batch_experiments(
|
||||||
@@ -369,13 +413,17 @@ class Experiments:
|
|||||||
exp_count += 1
|
exp_count += 1
|
||||||
|
|
||||||
# Run experiments
|
# Run experiments
|
||||||
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
|
experiment_ids = self.experiment_runner.run_experiment_batch(
|
||||||
|
experiments
|
||||||
|
)
|
||||||
|
|
||||||
st.success(f"Completed {len(experiment_ids)} batch experiments")
|
st.success(f"Completed {len(experiment_ids)} batch experiments")
|
||||||
|
|
||||||
# Show summary
|
# Show summary
|
||||||
if experiment_ids:
|
if experiment_ids:
|
||||||
comparison = self.experiment_runner.compare_experiments(experiment_ids)
|
comparison = self.experiment_runner.compare_experiments(
|
||||||
|
experiment_ids
|
||||||
|
)
|
||||||
st.write("**Batch Results Summary:**")
|
st.write("**Batch Results Summary:**")
|
||||||
st.dataframe(
|
st.dataframe(
|
||||||
comparison[["name", "model_type", "test_accuracy"]],
|
comparison[["name", "model_type", "test_accuracy"]],
|
||||||
@@ -0,0 +1,80 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class LogEntry:
    """One successfully parsed log line."""

    # Parsed from the first field with ``datetime.fromisoformat``.
    timestamp: datetime
    # Second field of the line, whitespace-stripped (compared verbatim by the reader).
    level: str
    # Everything after the level field, with any inner " - " separators preserved.
    message: str


class LogReader:
    """Read entries from a text log whose lines look like ``[timestamp] - LEVEL - message``.

    Lines that do not match that shape are skipped by every reader method.
    """

    def __init__(self, log_file_path: Path):
        self.log_file_path = Path(log_file_path)

    def _open(self):
        """Open the log for reading as UTF-8, replacing undecodable bytes.

        An explicit encoding keeps results independent of the platform default,
        and ``errors="replace"`` stops one corrupt byte from failing a whole read.
        """
        return open(self.log_file_path, "r", encoding="utf-8", errors="replace")

    def read_last_entries(self, num_entries: int = 20) -> List[LogEntry]:
        """Return the parseable entries among the last *num_entries* lines.

        Returns an empty list when the file is missing or *num_entries* is not
        positive (a plain ``lines[-0:]`` slice would return the whole file).
        Unparseable lines inside the tail window are dropped, so fewer than
        *num_entries* entries may come back.
        """
        from collections import deque

        if num_entries <= 0 or not self.log_file_path.exists():
            return []

        with self._open() as f:
            # A bounded deque keeps only the tail instead of the whole file.
            tail = deque(f, maxlen=num_entries)

        parsed = (self._parse_log_line(line) for line in tail)
        return [entry for entry in parsed if entry is not None]

    def read_entries_by_level(
        self, level: str, num_entries: int = 20
    ) -> List[LogEntry]:
        """Return up to *num_entries* most recent entries whose level equals *level*.

        Entries are returned oldest-first. Returns an empty list when the file
        is missing or *num_entries* is not positive.
        """
        if num_entries <= 0 or not self.log_file_path.exists():
            return []

        matches: List[LogEntry] = []
        with self._open() as f:
            # Scan from the end so the search stops once enough matches are found.
            for line in reversed(f.readlines()):
                entry = self._parse_log_line(line)
                if entry is not None and entry.level == level:
                    matches.append(entry)
                    if len(matches) >= num_entries:
                        break

        matches.reverse()
        return matches

    def get_log_stats(self) -> dict:
        """Count lines and parsed entries per level.

        Returns an empty dict when the file is missing; otherwise a dict with
        ``"total_lines"`` plus one counter per level seen in parseable lines.
        """
        if not self.log_file_path.exists():
            return {}

        stats = {"total_lines": 0}
        with self._open() as f:
            for line in f:
                stats["total_lines"] += 1
                entry = self._parse_log_line(line)
                if entry is not None:
                    stats[entry.level] = stats.get(entry.level, 0) + 1

        return stats

    @staticmethod
    def _parse_log_line(line: str) -> LogEntry | None:
        """Parse one log line, returning None when it does not match the format."""
        # Expected format from logging config: [timestamp] - LEVEL - message
        parts = line.strip().split(" - ")
        if len(parts) < 3:
            return None

        try:
            timestamp = datetime.fromisoformat(parts[0].strip("[]"))
        except ValueError:
            # Not an ISO timestamp, so this is not a log record line.
            return None

        # The message may itself contain " - "; rejoin the remaining fields.
        return LogEntry(timestamp, parts[1].strip(), " - ".join(parts[2:]))
|
||||||
@@ -1,10 +1,8 @@
|
|||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
from spacy import displacy
|
from spacy import displacy
|
||||||
|
|
||||||
from core.config import PipelineConfig
|
from ners.core.config import PipelineConfig
|
||||||
from processing.ner.name_model import NameModel
|
from ners.processing.ner.name_model import NameModel
|
||||||
|
|
||||||
|
|
||||||
class NERTesting:
|
class NERTesting:
|
||||||
@@ -56,12 +54,15 @@ class NERTesting:
|
|||||||
|
|
||||||
with col1:
|
with col1:
|
||||||
st.metric(
|
st.metric(
|
||||||
"Training Examples", f"{self.training_stats.get('training_examples', 0):,}"
|
"Training Examples",
|
||||||
|
f"{self.training_stats.get('training_examples', 0):,}",
|
||||||
)
|
)
|
||||||
with col2:
|
with col2:
|
||||||
st.metric("Epochs", self.training_stats.get("epochs", 0))
|
st.metric("Epochs", self.training_stats.get("epochs", 0))
|
||||||
with col3:
|
with col3:
|
||||||
st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}")
|
st.metric(
|
||||||
|
"Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}"
|
||||||
|
)
|
||||||
with col4:
|
with col4:
|
||||||
st.metric("Batch Size", f"{self.training_stats.get('batch_size', 0):,}")
|
st.metric("Batch Size", f"{self.training_stats.get('batch_size', 0):,}")
|
||||||
|
|
||||||
@@ -99,9 +100,11 @@ class NERTesting:
|
|||||||
|
|
||||||
if names_input.strip():
|
if names_input.strip():
|
||||||
if st.button("Analyze All Names", type="primary"):
|
if st.button("Analyze All Names", type="primary"):
|
||||||
names = [name.strip() for name in names_input.split("\n") if name.strip()]
|
names = [
|
||||||
|
name.strip() for name in names_input.split("\n") if name.strip()
|
||||||
|
]
|
||||||
for i, name in enumerate(names):
|
for i, name in enumerate(names):
|
||||||
st.markdown(f"**Name {i+1}: {name}**")
|
st.markdown(f"**Name {i + 1}: {name}**")
|
||||||
self.analyze_and_display(name)
|
self.analyze_and_display(name)
|
||||||
if i < len(names) - 1:
|
if i < len(names) - 1:
|
||||||
st.markdown("---")
|
st.markdown("---")
|
||||||
@@ -127,7 +130,9 @@ class NERTesting:
|
|||||||
|
|
||||||
else:
|
else:
|
||||||
st.warning("No entities detected in the input text.")
|
st.warning("No entities detected in the input text.")
|
||||||
st.info("Try using traditional Congolese names or ensure the spelling is correct.")
|
st.info(
|
||||||
|
"Try using traditional Congolese names or ensure the spelling is correct."
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Error analyzing text: {e}")
|
st.error(f"Error analyzing text: {e}")
|
||||||
@@ -139,14 +144,21 @@ class NERTesting:
|
|||||||
ents = []
|
ents = []
|
||||||
for entity in entities:
|
for entity in entities:
|
||||||
ents.append(
|
ents.append(
|
||||||
{"start": entity["start"], "end": entity["end"], "label": entity["label"]}
|
{
|
||||||
|
"start": entity["start"],
|
||||||
|
"end": entity["end"],
|
||||||
|
"label": entity["label"],
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create doc-like structure for displacy
|
# Create doc-like structure for displacy
|
||||||
doc_data = {"text": text, "ents": ents, "title": None}
|
doc_data = {"text": text, "ents": ents, "title": None}
|
||||||
|
|
||||||
# Custom colors for our labels
|
# Custom colors for our labels
|
||||||
colors = {"NATIVE": "#74C0FC", "SURNAME": "#69DB7C"} # Light blue # Light green
|
colors = {
|
||||||
|
"NATIVE": "#74C0FC",
|
||||||
|
"SURNAME": "#69DB7C",
|
||||||
|
} # Light blue # Light green
|
||||||
|
|
||||||
options = {"colors": colors, "distance": 90}
|
options = {"colors": colors, "distance": 90}
|
||||||
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user