Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| fad7ff9277 | |||
| 8f90fdd625 | |||
| 137dea7fe5 | |||
| 9dd4f759b3 | |||
| f3b06fbd07 | |||
| 912d518106 | |||
| a1d500830b | |||
| 9e35f95107 | |||
| 9039e9a4cf |
@@ -0,0 +1,16 @@
|
|||||||
|
.git
|
||||||
|
.gitignore
|
||||||
|
.idea
|
||||||
|
.vscode
|
||||||
|
__pycache__
|
||||||
|
.ruff_cache
|
||||||
|
.venv
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
*.pyd
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*.DS_Store
|
||||||
|
dist
|
||||||
|
build
|
||||||
|
*.egg-info
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
3.11
|
||||||
@@ -0,0 +1,49 @@
|
|||||||
|
# syntax=docker/dockerfile:1
|
||||||
|
|
||||||
|
# Minimal Linux base (glibc) – Python will be installed by uv
|
||||||
|
FROM debian:bookworm-slim
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive \
|
||||||
|
PYTHONDONTWRITEBYTECODE=1 \
|
||||||
|
PYTHONUNBUFFERED=1 \
|
||||||
|
UV_INSTALL_DIR=/usr/local/bin \
|
||||||
|
UV_LINK_MODE=copy \
|
||||||
|
UV_PYTHON_DOWNLOADS=1 \
|
||||||
|
UV_PROJECT_ENVIRONMENT=/app/.venv \
|
||||||
|
PATH=/app/.venv/bin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# System deps for building/using common scientific stack
|
||||||
|
# Keep minimal; rely on wheels where possible
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
ca-certificates curl git \
|
||||||
|
build-essential pkg-config \
|
||||||
|
libssl-dev libffi-dev \
|
||||||
|
libopenblas0 libstdc++6 \
|
||||||
|
libfreetype6 libpng16-16 libjpeg62-turbo \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install uv (static binary)
|
||||||
|
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
|
|
||||||
|
# Copy project metadata first for layer caching
|
||||||
|
COPY pyproject.toml README.md ./
|
||||||
|
|
||||||
|
# Install a managed Python via uv and create the project venv
|
||||||
|
RUN uv python install 3.11 \
|
||||||
|
&& uv venv /app/.venv --python 3.11
|
||||||
|
|
||||||
|
# Resolve and install runtime deps into project venv
|
||||||
|
# Use lockfile if present for reproducibility
|
||||||
|
RUN if [ -f uv.lock ]; then uv sync --no-dev --no-install-project --frozen; else uv sync --no-dev --no-install-project; fi
|
||||||
|
|
||||||
|
# Copy source code and optional templates
|
||||||
|
COPY src ./src
|
||||||
|
|
||||||
|
# Re-sync to ensure the local package is installed
|
||||||
|
RUN uv sync --no-dev \
|
||||||
|
&& rm -rf /root/.cache
|
||||||
|
|
||||||
|
# Default command shows help; override in compose or docker run
|
||||||
|
CMD ["ners", "--help"]
|
||||||
@@ -1,52 +0,0 @@
|
|||||||
.PHONY: default
|
|
||||||
default: help
|
|
||||||
|
|
||||||
.PHONY: help
|
|
||||||
help: ## Show this help message
|
|
||||||
@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# ENVIRONMENT SETUP
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
.PHONY: setup
|
|
||||||
setup: ## Setup virtual environment and install dependencies
|
|
||||||
python -m venv .venv
|
|
||||||
source .venv/bin/activate
|
|
||||||
.venv/bin/pip install --upgrade pip
|
|
||||||
.venv/bin/pip install -r requirements.txt
|
|
||||||
|
|
||||||
.PHONY: install
|
|
||||||
install: ## Install/update dependencies
|
|
||||||
pip install --upgrade pip
|
|
||||||
pip install -r requirements.txt
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# DEVELOPMENT & CODE QUALITY
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
.PHONY: format
|
|
||||||
format: ## Format code with black
|
|
||||||
black . --line-length 100
|
|
||||||
|
|
||||||
.PHONY: lint
|
|
||||||
lint: ## Lint code with flake8
|
|
||||||
flake8 . --max-line-length=100 --ignore=E203,W503 --exclude=.venv
|
|
||||||
|
|
||||||
.PHONY: type-check
|
|
||||||
type-check: ## Type check with mypy
|
|
||||||
mypy . --ignore-missing-imports
|
|
||||||
|
|
||||||
.PHONY: notebook
|
|
||||||
notebook: ## Start Jupyter notebook
|
|
||||||
jupyter notebook notebooks/
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# DEPLOYMENT & PRODUCTION
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
.PHONY: backup
|
|
||||||
backup: ## Backup datasets and results
|
|
||||||
@mkdir -p backups/$(shell date +%Y%m%d_%H%M%S)
|
|
||||||
@cp -r data/ backups/$(shell date +%Y%m%d_%H%M%S)/data/
|
|
||||||
@echo "Backup created in backups/$(shell date +%Y%m%d_%H%M%S)/"
|
|
||||||
@@ -10,51 +10,41 @@ million names from the Democratic Republic of Congo (DRC) annotated with gender
|
|||||||
|
|
||||||
### Installation & Setup
|
### Installation & Setup
|
||||||
|
|
||||||
The instructions and command-line snippets below are provided to help you set up the project environment quickly and
|
> Download [the dataset](https://drive.google.com/file/d/1a5wQnOZdsRWBOeoMA_0lNtbneTvS9xqy/view?usp=drive_link). If you need access, please reach us at mlec.academia@gmail.com.
|
||||||
efficiently.
|
|
||||||
assuming you have Python 3.11 and Git installed and working on a Unix-like system (Linux, macOS, etc.).
|
|
||||||
|
|
||||||
**Using Makefile (Recommended)**
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
||||||
cd drc-ners-nlp
|
|
||||||
|
|
||||||
# Setup environment
|
mkdir -p drc-ners-nlp/data/dataset
|
||||||
make setup
|
cp names.csv drc-ners-nlp/data/dataset
|
||||||
make activate
|
|
||||||
|
cd drc-ners-nlp
|
||||||
```
|
```
|
||||||
|
|
||||||
**Manual Setup**
|
**Linux**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
cd drc-ners-nlp
|
|
||||||
|
|
||||||
# Setup environment
|
uv sync
|
||||||
python -m venv .venv
|
```
|
||||||
.venv/bin/pip install --upgrade pip
|
|
||||||
.venv/bin/pip install -r requirements.txt
|
|
||||||
|
|
||||||
pip install --upgrade pip
|
**macOS & Windows**
|
||||||
pip install -r requirements.txt
|
```bash
|
||||||
pip install jupyter notebook ipykernel pytest black flake8 mypy
|
docker compose build
|
||||||
|
docker compose exec app bash
|
||||||
source .venv/bin/activate
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Data Processing
|
## Data Processing
|
||||||
|
|
||||||
This project includes a robust data processing pipeline designed to handle large datasets efficiently with batching,
|
This project includes a robust data processing pipeline designed to handle large datasets efficiently with batching,
|
||||||
checkpointing, and parallel processing capabilities.
|
checkpointing, and parallel processing capabilities.
|
||||||
Steps are defined in the `drc-ners-nlp/processing/steps` directory, and the configuration to enable them is managed through
|
|
||||||
the `drc-ners-nlp/config/pipeline.yaml` file.
|
|
||||||
|
|
||||||
**Pipeline Configuration**
|
**Pipeline Configuration**
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
stages:
|
stages:
|
||||||
- "data_cleaning"
|
- "data_cleaning"
|
||||||
|
- "data_selection"
|
||||||
- "feature_extraction"
|
- "feature_extraction"
|
||||||
- "data_splitting"
|
- "data_splitting"
|
||||||
```
|
```
|
||||||
@@ -62,97 +52,77 @@ stages:
|
|||||||
**Running the Pipeline**
|
**Running the Pipeline**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python main.py --env production
|
uv run ners pipeline run --env="production"
|
||||||
```
|
|
||||||
|
|
||||||
## NER Processing (Optional)
|
|
||||||
|
|
||||||
This project implements a custom named entity recognition (NER) pipeline tailored for Congolese names.
|
|
||||||
Its main objective is to accurately identify and tag the different components of a Congolese name,
|
|
||||||
specifically distinguishing between the native part and the surname.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python ner.py --env production
|
|
||||||
```
|
|
||||||
|
|
||||||
Once you've built and trained the NER model, you can use it to annotate **COMPOSE** names in the original dataset
|
|
||||||
|
|
||||||
**Running the Pipeline with NER Annotation**
|
|
||||||
```yaml
|
|
||||||
stages:
|
|
||||||
- "data_cleaning"
|
|
||||||
- "feature_extraction"
|
|
||||||
- "ner_annotation"
|
|
||||||
- "data_splitting"
|
|
||||||
```
|
|
||||||
|
|
||||||
**Running the Pipeline with LLM Annotation**
|
|
||||||
```yaml
|
|
||||||
stages:
|
|
||||||
- "data_cleaning"
|
|
||||||
- "feature_extraction"
|
|
||||||
- "llm_annotation"
|
|
||||||
- "data_splitting"
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Experiments
|
## Experiments
|
||||||
|
|
||||||
This project provides a modular experiment (model training and evaluation) framework for systematic model comparison and
|
This project provides a modular experiment (model training and evaluation) framework for systematic model comparison and
|
||||||
research iteration. models are defined in the `drc-ners-nlp/research/models` directory.
|
research iteration. You can define model features, training parameters, and evaluation metrics in the `config/research_templates.yaml` file.
|
||||||
you can define model features, training parameters, and evaluation metrics in the `research_templates.yaml` file.
|
|
||||||
|
|
||||||
**Running Experiments**
|
**Running Experiments**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# bigru
|
# bigru
|
||||||
python train.py --name="bigru" --type="baseline" --env="production"
|
uv run ners research train --name="bigru" --type="baseline" --env="production"
|
||||||
python train.py --name="bigru_native" --type="baseline" --env="production"
|
uv run ners research train --name="bigru_native" --type="baseline" --env="production"
|
||||||
python train.py --name="bigru_surname" --type="baseline" --env="production"
|
uv run ners research train --name="bigru_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# cnn
|
# cnn
|
||||||
python train.py --name="cnn" --type="baseline" --env="production"
|
uv run ners research train --name="cnn" --type="baseline" --env="production"
|
||||||
python train.py --name="cnn_native" --type="baseline" --env="production"
|
uv run ners research train --name="cnn_native" --type="baseline" --env="production"
|
||||||
python train.py --name="cnn_surname" --type="baseline" --env="production"
|
uv run ners research train --name="cnn_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# lightgbm
|
# lightgbm
|
||||||
python train.py --name="lightgbm" --type="baseline" --env="production"
|
uv run ners research train --name="lightgbm" --type="baseline" --env="production"
|
||||||
python train.py --name="lightgbm_native" --type="baseline" --env="production"
|
uv run ners research train --name="lightgbm_native" --type="baseline" --env="production"
|
||||||
python train.py --name="lightgbm_surname" --type="baseline" --env="production"
|
uv run ners research train --name="lightgbm_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# logistic regression
|
# logistic regression
|
||||||
python train.py --name="logistic_regression" --type="baseline" --env="production"
|
uv run ners research train --name="logistic_regression" --type="baseline" --env="production"
|
||||||
python train.py --name="logistic_regression_native" --type="baseline" --env="production"
|
uv run ners research train --name="logistic_regression_native" --type="baseline" --env="production"
|
||||||
python train.py --name="logistic_regression_surname" --type="baseline" --env="production"
|
uv run ners research train --name="logistic_regression_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# lstm
|
# lstm
|
||||||
python train.py --name="lstm" --type="baseline" --env="production"
|
uv run ners research train --name="lstm" --type="baseline" --env="production"
|
||||||
python train.py --name="lstm_native" --type="baseline" --env="production"
|
uv run ners research train --name="lstm_native" --type="baseline" --env="production"
|
||||||
python train.py --name="lstm_surname" --type="baseline" --env="production"
|
uv run ners research train --name="lstm_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# random forest
|
# random forest
|
||||||
python train.py --name="random_forest" --type="baseline" --env="production"
|
uv run ners research train --name="random_forest" --type="baseline" --env="production"
|
||||||
python train.py --name="random_forest_native" --type="baseline" --env="production"
|
uv run ners research train --name="random_forest_native" --type="baseline" --env="production"
|
||||||
python train.py --name="random_forest_surname" --type="baseline" --env="production"
|
uv run ners research train --name="random_forest_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
# svm
|
|
||||||
python train.py --name="svm" --type="baseline" --env="production"
|
|
||||||
python train.py --name="svm_native" --type="baseline" --env="production"
|
|
||||||
python train.py --name="svm_surname" --type="baseline" --env="production"
|
|
||||||
|
|
||||||
|
```bash
|
||||||
# naive bayes
|
# naive bayes
|
||||||
python train.py --name="naive_bayes" --type="baseline" --env="production"
|
uv run ners research train --name="naive_bayes" --type="baseline" --env="production"
|
||||||
python train.py --name="naive_bayes_native" --type="baseline" --env="production"
|
uv run ners research train --name="naive_bayes_native" --type="baseline" --env="production"
|
||||||
python train.py --name="naive_bayes_surname" --type="baseline" --env="production"
|
uv run ners research train --name="naive_bayes_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# transformer
|
# transformer
|
||||||
python train.py --name="transformer" --type="baseline" --env="production"
|
uv run ners research train --name="transformer" --type="baseline" --env="production"
|
||||||
python train.py --name="transformer_native" --type="baseline" --env="production"
|
uv run ners research train --name="transformer_native" --type="baseline" --env="production"
|
||||||
python train.py --name="transformer_surname" --type="baseline" --env="production"
|
uv run ners research train --name="transformer_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# xgboost
|
# xgboost
|
||||||
python train.py --name="xgboost" --type="baseline" --env="production"
|
uv run ners research train --name="xgboost" --type="baseline" --env="production"
|
||||||
python train.py --name="xgboost_native" --type="baseline" --env="production"
|
uv run ners research train --name="xgboost_native" --type="baseline" --env="production"
|
||||||
python train.py --name="xgboost_surname" --type="baseline" --env="production"
|
uv run ners research train --name="xgboost_surname" --type="baseline" --env="production"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Web Interface
|
## Web Interface
|
||||||
@@ -162,10 +132,18 @@ experiments and make predictions without needing to understand the underlying co
|
|||||||
|
|
||||||
### Running the Web Interface
|
### Running the Web Interface
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
streamlit run web/app.py
|
uv run ners web run --env="production"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose run --rm --service-ports app ners web run --env=production
|
||||||
|
```
|
||||||
|
|
||||||
|
Then open: http://localhost:8501/
|
||||||
|
|
||||||
## Contributors
|
## Contributors
|
||||||
|
|
||||||
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
|
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
|
||||||
|
|||||||
|
After Width: | Height: | Size: 38 KiB |
@@ -0,0 +1,930 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8" standalone="no"?>
|
||||||
|
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
|
||||||
|
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||||
|
<svg xmlns:xlink="http://www.w3.org/1999/xlink" width="720pt" height="720pt" viewBox="0 0 720 720" xmlns="http://www.w3.org/2000/svg" version="1.1">
|
||||||
|
<metadata>
|
||||||
|
<rdf:RDF xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||||
|
<cc:Work>
|
||||||
|
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/>
|
||||||
|
<dc:date>2025-10-05T23:19:48.322300</dc:date>
|
||||||
|
<dc:format>image/svg+xml</dc:format>
|
||||||
|
<dc:creator>
|
||||||
|
<cc:Agent>
|
||||||
|
<dc:title>Matplotlib v3.10.6, https://matplotlib.org/</dc:title>
|
||||||
|
</cc:Agent>
|
||||||
|
</dc:creator>
|
||||||
|
</cc:Work>
|
||||||
|
</rdf:RDF>
|
||||||
|
</metadata>
|
||||||
|
<defs>
|
||||||
|
<style type="text/css">*{stroke-linejoin: round; stroke-linecap: butt}</style>
|
||||||
|
</defs>
|
||||||
|
<g id="figure_1">
|
||||||
|
<g id="patch_1">
|
||||||
|
<path d="M 0 720
|
||||||
|
L 720 720
|
||||||
|
L 720 0
|
||||||
|
L 0 0
|
||||||
|
z
|
||||||
|
" style="fill: #ffffff"/>
|
||||||
|
</g>
|
||||||
|
<g id="axes_1">
|
||||||
|
<g id="patch_2">
|
||||||
|
<path d="M 178.972717 225.461826
|
||||||
|
C 165.311247 225.461826 151.729128 227.542675 138.694269 231.632688
|
||||||
|
C 125.659409 235.722701 113.323232 241.774364 102.110723 249.579196
|
||||||
|
C 90.898214 257.384027 80.939626 266.851361 72.578035 277.655044
|
||||||
|
C 64.216444 288.458727 57.548986 300.473255 52.805355 313.284724
|
||||||
|
C 48.061725 326.096193 45.297029 339.555775 44.606511 353.199783
|
||||||
|
C 43.915994 366.843791 45.307677 380.513725 48.733616 393.738652
|
||||||
|
C 52.159555 406.96358 57.579952 419.589869 64.808073 431.182541
|
||||||
|
C 72.036193 442.775213 80.98807 453.199597 91.355308 462.096571
|
||||||
|
C 101.722546 470.993545 113.38471 478.259754 125.940037 483.644875
|
||||||
|
C 138.495365 489.029996 151.798001 492.471471 165.389667 493.850741
|
||||||
|
C 178.981332 495.230011 192.704136 494.531053 206.085323 491.777945
|
||||||
|
C 219.46651 489.024838 232.350634 484.249563 244.293832 477.616632
|
||||||
|
C 256.237031 470.983701 267.100562 462.570166 276.510175 452.665876
|
||||||
|
C 285.919787 442.761586 293.766171 431.481598 299.779017 419.21451
|
||||||
|
C 305.791862 406.947421 309.901319 393.835738 311.965815 380.33116
|
||||||
|
C 314.030311 366.826582 314.025864 353.08599 311.952626 339.582752
|
||||||
|
L 178.972717 360
|
||||||
|
z
|
||||||
|
" style="fill: #1f77b4"/>
|
||||||
|
</g>
|
||||||
|
<g id="patch_3">
|
||||||
|
<path d="M 311.952627 339.582734
|
||||||
|
C 307.075828 307.819612 290.976562 278.831415 266.590126 257.903424
|
||||||
|
C 242.20369 236.975434 211.108058 225.461834 178.972736 225.461826
|
||||||
|
L 178.972717 360
|
||||||
|
z
|
||||||
|
" style="fill: #ff7f0e"/>
|
||||||
|
</g>
|
||||||
|
<g id="matplotlib.axis_1"/>
|
||||||
|
<g id="matplotlib.axis_2"/>
|
||||||
|
<g id="text_1">
|
||||||
|
<!-- Simple -->
|
||||||
|
<g transform="translate(48.446708 475.065617) scale(0.1 -0.1)">
|
||||||
|
<defs>
|
||||||
|
<path id="DejaVuSans-53" d="M 3425 4513
|
||||||
|
L 3425 3897
|
||||||
|
Q 3066 4069 2747 4153
|
||||||
|
Q 2428 4238 2131 4238
|
||||||
|
Q 1616 4238 1336 4038
|
||||||
|
Q 1056 3838 1056 3469
|
||||||
|
Q 1056 3159 1242 3001
|
||||||
|
Q 1428 2844 1947 2747
|
||||||
|
L 2328 2669
|
||||||
|
Q 3034 2534 3370 2195
|
||||||
|
Q 3706 1856 3706 1288
|
||||||
|
Q 3706 609 3251 259
|
||||||
|
Q 2797 -91 1919 -91
|
||||||
|
Q 1588 -91 1214 -16
|
||||||
|
Q 841 59 441 206
|
||||||
|
L 441 856
|
||||||
|
Q 825 641 1194 531
|
||||||
|
Q 1563 422 1919 422
|
||||||
|
Q 2459 422 2753 634
|
||||||
|
Q 3047 847 3047 1241
|
||||||
|
Q 3047 1584 2836 1778
|
||||||
|
Q 2625 1972 2144 2069
|
||||||
|
L 1759 2144
|
||||||
|
Q 1053 2284 737 2584
|
||||||
|
Q 422 2884 422 3419
|
||||||
|
Q 422 4038 858 4394
|
||||||
|
Q 1294 4750 2059 4750
|
||||||
|
Q 2388 4750 2728 4690
|
||||||
|
Q 3069 4631 3425 4513
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-69" d="M 603 3500
|
||||||
|
L 1178 3500
|
||||||
|
L 1178 0
|
||||||
|
L 603 0
|
||||||
|
L 603 3500
|
||||||
|
z
|
||||||
|
M 603 4863
|
||||||
|
L 1178 4863
|
||||||
|
L 1178 4134
|
||||||
|
L 603 4134
|
||||||
|
L 603 4863
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-6d" d="M 3328 2828
|
||||||
|
Q 3544 3216 3844 3400
|
||||||
|
Q 4144 3584 4550 3584
|
||||||
|
Q 5097 3584 5394 3201
|
||||||
|
Q 5691 2819 5691 2113
|
||||||
|
L 5691 0
|
||||||
|
L 5113 0
|
||||||
|
L 5113 2094
|
||||||
|
Q 5113 2597 4934 2840
|
||||||
|
Q 4756 3084 4391 3084
|
||||||
|
Q 3944 3084 3684 2787
|
||||||
|
Q 3425 2491 3425 1978
|
||||||
|
L 3425 0
|
||||||
|
L 2847 0
|
||||||
|
L 2847 2094
|
||||||
|
Q 2847 2600 2669 2842
|
||||||
|
Q 2491 3084 2119 3084
|
||||||
|
Q 1678 3084 1418 2786
|
||||||
|
Q 1159 2488 1159 1978
|
||||||
|
L 1159 0
|
||||||
|
L 581 0
|
||||||
|
L 581 3500
|
||||||
|
L 1159 3500
|
||||||
|
L 1159 2956
|
||||||
|
Q 1356 3278 1631 3431
|
||||||
|
Q 1906 3584 2284 3584
|
||||||
|
Q 2666 3584 2933 3390
|
||||||
|
Q 3200 3197 3328 2828
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-70" d="M 1159 525
|
||||||
|
L 1159 -1331
|
||||||
|
L 581 -1331
|
||||||
|
L 581 3500
|
||||||
|
L 1159 3500
|
||||||
|
L 1159 2969
|
||||||
|
Q 1341 3281 1617 3432
|
||||||
|
Q 1894 3584 2278 3584
|
||||||
|
Q 2916 3584 3314 3078
|
||||||
|
Q 3713 2572 3713 1747
|
||||||
|
Q 3713 922 3314 415
|
||||||
|
Q 2916 -91 2278 -91
|
||||||
|
Q 1894 -91 1617 61
|
||||||
|
Q 1341 213 1159 525
|
||||||
|
z
|
||||||
|
M 3116 1747
|
||||||
|
Q 3116 2381 2855 2742
|
||||||
|
Q 2594 3103 2138 3103
|
||||||
|
Q 1681 3103 1420 2742
|
||||||
|
Q 1159 2381 1159 1747
|
||||||
|
Q 1159 1113 1420 752
|
||||||
|
Q 1681 391 2138 391
|
||||||
|
Q 2594 391 2855 752
|
||||||
|
Q 3116 1113 3116 1747
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-6c" d="M 603 4863
|
||||||
|
L 1178 4863
|
||||||
|
L 1178 0
|
||||||
|
L 603 0
|
||||||
|
L 603 4863
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-65" d="M 3597 1894
|
||||||
|
L 3597 1613
|
||||||
|
L 953 1613
|
||||||
|
Q 991 1019 1311 708
|
||||||
|
Q 1631 397 2203 397
|
||||||
|
Q 2534 397 2845 478
|
||||||
|
Q 3156 559 3463 722
|
||||||
|
L 3463 178
|
||||||
|
Q 3153 47 2828 -22
|
||||||
|
Q 2503 -91 2169 -91
|
||||||
|
Q 1331 -91 842 396
|
||||||
|
Q 353 884 353 1716
|
||||||
|
Q 353 2575 817 3079
|
||||||
|
Q 1281 3584 2069 3584
|
||||||
|
Q 2775 3584 3186 3129
|
||||||
|
Q 3597 2675 3597 1894
|
||||||
|
z
|
||||||
|
M 3022 2063
|
||||||
|
Q 3016 2534 2758 2815
|
||||||
|
Q 2500 3097 2075 3097
|
||||||
|
Q 1594 3097 1305 2825
|
||||||
|
Q 1016 2553 972 2059
|
||||||
|
L 3022 2063
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#DejaVuSans-53"/>
|
||||||
|
<use xlink:href="#DejaVuSans-69" transform="translate(63.476562 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-6d" transform="translate(91.259766 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-70" transform="translate(188.671875 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-6c" transform="translate(252.148438 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-65" transform="translate(279.931641 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
<g id="text_2">
|
||||||
|
<!-- 77.4% -->
|
||||||
|
<g transform="translate(110.518687 424.017325) scale(0.1 -0.1)">
|
||||||
|
<defs>
|
||||||
|
<path id="DejaVuSans-37" d="M 525 4666
|
||||||
|
L 3525 4666
|
||||||
|
L 3525 4397
|
||||||
|
L 1831 0
|
||||||
|
L 1172 0
|
||||||
|
L 2766 4134
|
||||||
|
L 525 4134
|
||||||
|
L 525 4666
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-2e" d="M 684 794
|
||||||
|
L 1344 794
|
||||||
|
L 1344 0
|
||||||
|
L 684 0
|
||||||
|
L 684 794
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-34" d="M 2419 4116
|
||||||
|
L 825 1625
|
||||||
|
L 2419 1625
|
||||||
|
L 2419 4116
|
||||||
|
z
|
||||||
|
M 2253 4666
|
||||||
|
L 3047 4666
|
||||||
|
L 3047 1625
|
||||||
|
L 3713 1625
|
||||||
|
L 3713 1100
|
||||||
|
L 3047 1100
|
||||||
|
L 3047 0
|
||||||
|
L 2419 0
|
||||||
|
L 2419 1100
|
||||||
|
L 313 1100
|
||||||
|
L 313 1709
|
||||||
|
L 2253 4666
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-25" d="M 4653 2053
|
||||||
|
Q 4381 2053 4226 1822
|
||||||
|
Q 4072 1591 4072 1178
|
||||||
|
Q 4072 772 4226 539
|
||||||
|
Q 4381 306 4653 306
|
||||||
|
Q 4919 306 5073 539
|
||||||
|
Q 5228 772 5228 1178
|
||||||
|
Q 5228 1588 5073 1820
|
||||||
|
Q 4919 2053 4653 2053
|
||||||
|
z
|
||||||
|
M 4653 2450
|
||||||
|
Q 5147 2450 5437 2106
|
||||||
|
Q 5728 1763 5728 1178
|
||||||
|
Q 5728 594 5436 251
|
||||||
|
Q 5144 -91 4653 -91
|
||||||
|
Q 4153 -91 3862 251
|
||||||
|
Q 3572 594 3572 1178
|
||||||
|
Q 3572 1766 3864 2108
|
||||||
|
Q 4156 2450 4653 2450
|
||||||
|
z
|
||||||
|
M 1428 4353
|
||||||
|
Q 1159 4353 1004 4120
|
||||||
|
Q 850 3888 850 3481
|
||||||
|
Q 850 3069 1003 2837
|
||||||
|
Q 1156 2606 1428 2606
|
||||||
|
Q 1700 2606 1854 2837
|
||||||
|
Q 2009 3069 2009 3481
|
||||||
|
Q 2009 3884 1853 4118
|
||||||
|
Q 1697 4353 1428 4353
|
||||||
|
z
|
||||||
|
M 4250 4750
|
||||||
|
L 4750 4750
|
||||||
|
L 1831 -91
|
||||||
|
L 1331 -91
|
||||||
|
L 4250 4750
|
||||||
|
z
|
||||||
|
M 1428 4750
|
||||||
|
Q 1922 4750 2215 4408
|
||||||
|
Q 2509 4066 2509 3481
|
||||||
|
Q 2509 2891 2217 2550
|
||||||
|
Q 1925 2209 1428 2209
|
||||||
|
Q 931 2209 642 2551
|
||||||
|
Q 353 2894 353 3481
|
||||||
|
Q 353 4063 643 4406
|
||||||
|
Q 934 4750 1428 4750
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#DejaVuSans-37"/>
|
||||||
|
<use xlink:href="#DejaVuSans-37" transform="translate(63.623047 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-2e" transform="translate(127.246094 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-34" transform="translate(159.033203 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-25" transform="translate(222.65625 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
<g id="text_3">
|
||||||
|
<!-- Compose -->
|
||||||
|
<g transform="translate(275.351869 250.453148) scale(0.1 -0.1)">
|
||||||
|
<defs>
|
||||||
|
<path id="DejaVuSans-43" d="M 4122 4306
|
||||||
|
L 4122 3641
|
||||||
|
Q 3803 3938 3442 4084
|
||||||
|
Q 3081 4231 2675 4231
|
||||||
|
Q 1875 4231 1450 3742
|
||||||
|
Q 1025 3253 1025 2328
|
||||||
|
Q 1025 1406 1450 917
|
||||||
|
Q 1875 428 2675 428
|
||||||
|
Q 3081 428 3442 575
|
||||||
|
Q 3803 722 4122 1019
|
||||||
|
L 4122 359
|
||||||
|
Q 3791 134 3420 21
|
||||||
|
Q 3050 -91 2638 -91
|
||||||
|
Q 1578 -91 968 557
|
||||||
|
Q 359 1206 359 2328
|
||||||
|
Q 359 3453 968 4101
|
||||||
|
Q 1578 4750 2638 4750
|
||||||
|
Q 3056 4750 3426 4639
|
||||||
|
Q 3797 4528 4122 4306
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-6f" d="M 1959 3097
|
||||||
|
Q 1497 3097 1228 2736
|
||||||
|
Q 959 2375 959 1747
|
||||||
|
Q 959 1119 1226 758
|
||||||
|
Q 1494 397 1959 397
|
||||||
|
Q 2419 397 2687 759
|
||||||
|
Q 2956 1122 2956 1747
|
||||||
|
Q 2956 2369 2687 2733
|
||||||
|
Q 2419 3097 1959 3097
|
||||||
|
z
|
||||||
|
M 1959 3584
|
||||||
|
Q 2709 3584 3137 3096
|
||||||
|
Q 3566 2609 3566 1747
|
||||||
|
Q 3566 888 3137 398
|
||||||
|
Q 2709 -91 1959 -91
|
||||||
|
Q 1206 -91 779 398
|
||||||
|
Q 353 888 353 1747
|
||||||
|
Q 353 2609 779 3096
|
||||||
|
Q 1206 3584 1959 3584
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-73" d="M 2834 3397
|
||||||
|
L 2834 2853
|
||||||
|
Q 2591 2978 2328 3040
|
||||||
|
Q 2066 3103 1784 3103
|
||||||
|
Q 1356 3103 1142 2972
|
||||||
|
Q 928 2841 928 2578
|
||||||
|
Q 928 2378 1081 2264
|
||||||
|
Q 1234 2150 1697 2047
|
||||||
|
L 1894 2003
|
||||||
|
Q 2506 1872 2764 1633
|
||||||
|
Q 3022 1394 3022 966
|
||||||
|
Q 3022 478 2636 193
|
||||||
|
Q 2250 -91 1575 -91
|
||||||
|
Q 1294 -91 989 -36
|
||||||
|
Q 684 19 347 128
|
||||||
|
L 347 722
|
||||||
|
Q 666 556 975 473
|
||||||
|
Q 1284 391 1588 391
|
||||||
|
Q 1994 391 2212 530
|
||||||
|
Q 2431 669 2431 922
|
||||||
|
Q 2431 1156 2273 1281
|
||||||
|
Q 2116 1406 1581 1522
|
||||||
|
L 1381 1569
|
||||||
|
Q 847 1681 609 1914
|
||||||
|
Q 372 2147 372 2553
|
||||||
|
Q 372 3047 722 3315
|
||||||
|
Q 1072 3584 1716 3584
|
||||||
|
Q 2034 3584 2315 3537
|
||||||
|
Q 2597 3491 2834 3397
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#DejaVuSans-43"/>
|
||||||
|
<use xlink:href="#DejaVuSans-6f" transform="translate(69.824219 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-6d" transform="translate(131.005859 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-70" transform="translate(228.417969 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-6f" transform="translate(291.894531 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-73" transform="translate(353.076172 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-65" transform="translate(405.175781 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
<g id="text_4">
|
||||||
|
<!-- 22.6% -->
|
||||||
|
<g transform="translate(215.65957 301.501433) scale(0.1 -0.1)">
|
||||||
|
<defs>
|
||||||
|
<path id="DejaVuSans-32" d="M 1228 531
|
||||||
|
L 3431 531
|
||||||
|
L 3431 0
|
||||||
|
L 469 0
|
||||||
|
L 469 531
|
||||||
|
Q 828 903 1448 1529
|
||||||
|
Q 2069 2156 2228 2338
|
||||||
|
Q 2531 2678 2651 2914
|
||||||
|
Q 2772 3150 2772 3378
|
||||||
|
Q 2772 3750 2511 3984
|
||||||
|
Q 2250 4219 1831 4219
|
||||||
|
Q 1534 4219 1204 4116
|
||||||
|
Q 875 4013 500 3803
|
||||||
|
L 500 4441
|
||||||
|
Q 881 4594 1212 4672
|
||||||
|
Q 1544 4750 1819 4750
|
||||||
|
Q 2544 4750 2975 4387
|
||||||
|
Q 3406 4025 3406 3419
|
||||||
|
Q 3406 3131 3298 2873
|
||||||
|
Q 3191 2616 2906 2266
|
||||||
|
Q 2828 2175 2409 1742
|
||||||
|
Q 1991 1309 1228 531
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-36" d="M 2113 2584
|
||||||
|
Q 1688 2584 1439 2293
|
||||||
|
Q 1191 2003 1191 1497
|
||||||
|
Q 1191 994 1439 701
|
||||||
|
Q 1688 409 2113 409
|
||||||
|
Q 2538 409 2786 701
|
||||||
|
Q 3034 994 3034 1497
|
||||||
|
Q 3034 2003 2786 2293
|
||||||
|
Q 2538 2584 2113 2584
|
||||||
|
z
|
||||||
|
M 3366 4563
|
||||||
|
L 3366 3988
|
||||||
|
Q 3128 4100 2886 4159
|
||||||
|
Q 2644 4219 2406 4219
|
||||||
|
Q 1781 4219 1451 3797
|
||||||
|
Q 1122 3375 1075 2522
|
||||||
|
Q 1259 2794 1537 2939
|
||||||
|
Q 1816 3084 2150 3084
|
||||||
|
Q 2853 3084 3261 2657
|
||||||
|
Q 3669 2231 3669 1497
|
||||||
|
Q 3669 778 3244 343
|
||||||
|
Q 2819 -91 2113 -91
|
||||||
|
Q 1303 -91 875 529
|
||||||
|
Q 447 1150 447 2328
|
||||||
|
Q 447 3434 972 4092
|
||||||
|
Q 1497 4750 2381 4750
|
||||||
|
Q 2619 4750 2861 4703
|
||||||
|
Q 3103 4656 3366 4563
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#DejaVuSans-32"/>
|
||||||
|
<use xlink:href="#DejaVuSans-32" transform="translate(63.623047 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-2e" transform="translate(127.246094 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-36" transform="translate(159.033203 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-25" transform="translate(222.65625 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
<g id="text_5">
|
||||||
|
<!-- Distribution by Category -->
|
||||||
|
<g transform="translate(105.387405 185.827283) scale(0.12 -0.12)">
|
||||||
|
<defs>
|
||||||
|
<path id="DejaVuSans-44" d="M 1259 4147
|
||||||
|
L 1259 519
|
||||||
|
L 2022 519
|
||||||
|
Q 2988 519 3436 956
|
||||||
|
Q 3884 1394 3884 2338
|
||||||
|
Q 3884 3275 3436 3711
|
||||||
|
Q 2988 4147 2022 4147
|
||||||
|
L 1259 4147
|
||||||
|
z
|
||||||
|
M 628 4666
|
||||||
|
L 1925 4666
|
||||||
|
Q 3281 4666 3915 4102
|
||||||
|
Q 4550 3538 4550 2338
|
||||||
|
Q 4550 1131 3912 565
|
||||||
|
Q 3275 0 1925 0
|
||||||
|
L 628 0
|
||||||
|
L 628 4666
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-74" d="M 1172 4494
|
||||||
|
L 1172 3500
|
||||||
|
L 2356 3500
|
||||||
|
L 2356 3053
|
||||||
|
L 1172 3053
|
||||||
|
L 1172 1153
|
||||||
|
Q 1172 725 1289 603
|
||||||
|
Q 1406 481 1766 481
|
||||||
|
L 2356 481
|
||||||
|
L 2356 0
|
||||||
|
L 1766 0
|
||||||
|
Q 1100 0 847 248
|
||||||
|
Q 594 497 594 1153
|
||||||
|
L 594 3053
|
||||||
|
L 172 3053
|
||||||
|
L 172 3500
|
||||||
|
L 594 3500
|
||||||
|
L 594 4494
|
||||||
|
L 1172 4494
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-72" d="M 2631 2963
|
||||||
|
Q 2534 3019 2420 3045
|
||||||
|
Q 2306 3072 2169 3072
|
||||||
|
Q 1681 3072 1420 2755
|
||||||
|
Q 1159 2438 1159 1844
|
||||||
|
L 1159 0
|
||||||
|
L 581 0
|
||||||
|
L 581 3500
|
||||||
|
L 1159 3500
|
||||||
|
L 1159 2956
|
||||||
|
Q 1341 3275 1631 3429
|
||||||
|
Q 1922 3584 2338 3584
|
||||||
|
Q 2397 3584 2469 3576
|
||||||
|
Q 2541 3569 2628 3553
|
||||||
|
L 2631 2963
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-62" d="M 3116 1747
|
||||||
|
Q 3116 2381 2855 2742
|
||||||
|
Q 2594 3103 2138 3103
|
||||||
|
Q 1681 3103 1420 2742
|
||||||
|
Q 1159 2381 1159 1747
|
||||||
|
Q 1159 1113 1420 752
|
||||||
|
Q 1681 391 2138 391
|
||||||
|
Q 2594 391 2855 752
|
||||||
|
Q 3116 1113 3116 1747
|
||||||
|
z
|
||||||
|
M 1159 2969
|
||||||
|
Q 1341 3281 1617 3432
|
||||||
|
Q 1894 3584 2278 3584
|
||||||
|
Q 2916 3584 3314 3078
|
||||||
|
Q 3713 2572 3713 1747
|
||||||
|
Q 3713 922 3314 415
|
||||||
|
Q 2916 -91 2278 -91
|
||||||
|
Q 1894 -91 1617 61
|
||||||
|
Q 1341 213 1159 525
|
||||||
|
L 1159 0
|
||||||
|
L 581 0
|
||||||
|
L 581 4863
|
||||||
|
L 1159 4863
|
||||||
|
L 1159 2969
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-75" d="M 544 1381
|
||||||
|
L 544 3500
|
||||||
|
L 1119 3500
|
||||||
|
L 1119 1403
|
||||||
|
Q 1119 906 1312 657
|
||||||
|
Q 1506 409 1894 409
|
||||||
|
Q 2359 409 2629 706
|
||||||
|
Q 2900 1003 2900 1516
|
||||||
|
L 2900 3500
|
||||||
|
L 3475 3500
|
||||||
|
L 3475 0
|
||||||
|
L 2900 0
|
||||||
|
L 2900 538
|
||||||
|
Q 2691 219 2414 64
|
||||||
|
Q 2138 -91 1772 -91
|
||||||
|
Q 1169 -91 856 284
|
||||||
|
Q 544 659 544 1381
|
||||||
|
z
|
||||||
|
M 1991 3584
|
||||||
|
L 1991 3584
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-6e" d="M 3513 2113
|
||||||
|
L 3513 0
|
||||||
|
L 2938 0
|
||||||
|
L 2938 2094
|
||||||
|
Q 2938 2591 2744 2837
|
||||||
|
Q 2550 3084 2163 3084
|
||||||
|
Q 1697 3084 1428 2787
|
||||||
|
Q 1159 2491 1159 1978
|
||||||
|
L 1159 0
|
||||||
|
L 581 0
|
||||||
|
L 581 3500
|
||||||
|
L 1159 3500
|
||||||
|
L 1159 2956
|
||||||
|
Q 1366 3272 1645 3428
|
||||||
|
Q 1925 3584 2291 3584
|
||||||
|
Q 2894 3584 3203 3211
|
||||||
|
Q 3513 2838 3513 2113
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-20" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-79" d="M 2059 -325
|
||||||
|
Q 1816 -950 1584 -1140
|
||||||
|
Q 1353 -1331 966 -1331
|
||||||
|
L 506 -1331
|
||||||
|
L 506 -850
|
||||||
|
L 844 -850
|
||||||
|
Q 1081 -850 1212 -737
|
||||||
|
Q 1344 -625 1503 -206
|
||||||
|
L 1606 56
|
||||||
|
L 191 3500
|
||||||
|
L 800 3500
|
||||||
|
L 1894 763
|
||||||
|
L 2988 3500
|
||||||
|
L 3597 3500
|
||||||
|
L 2059 -325
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-61" d="M 2194 1759
|
||||||
|
Q 1497 1759 1228 1600
|
||||||
|
Q 959 1441 959 1056
|
||||||
|
Q 959 750 1161 570
|
||||||
|
Q 1363 391 1709 391
|
||||||
|
Q 2188 391 2477 730
|
||||||
|
Q 2766 1069 2766 1631
|
||||||
|
L 2766 1759
|
||||||
|
L 2194 1759
|
||||||
|
z
|
||||||
|
M 3341 1997
|
||||||
|
L 3341 0
|
||||||
|
L 2766 0
|
||||||
|
L 2766 531
|
||||||
|
Q 2569 213 2275 61
|
||||||
|
Q 1981 -91 1556 -91
|
||||||
|
Q 1019 -91 701 211
|
||||||
|
Q 384 513 384 1019
|
||||||
|
Q 384 1609 779 1909
|
||||||
|
Q 1175 2209 1959 2209
|
||||||
|
L 2766 2209
|
||||||
|
L 2766 2266
|
||||||
|
Q 2766 2663 2505 2880
|
||||||
|
Q 2244 3097 1772 3097
|
||||||
|
Q 1472 3097 1187 3025
|
||||||
|
Q 903 2953 641 2809
|
||||||
|
L 641 3341
|
||||||
|
Q 956 3463 1253 3523
|
||||||
|
Q 1550 3584 1831 3584
|
||||||
|
Q 2591 3584 2966 3190
|
||||||
|
Q 3341 2797 3341 1997
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-67" d="M 2906 1791
|
||||||
|
Q 2906 2416 2648 2759
|
||||||
|
Q 2391 3103 1925 3103
|
||||||
|
Q 1463 3103 1205 2759
|
||||||
|
Q 947 2416 947 1791
|
||||||
|
Q 947 1169 1205 825
|
||||||
|
Q 1463 481 1925 481
|
||||||
|
Q 2391 481 2648 825
|
||||||
|
Q 2906 1169 2906 1791
|
||||||
|
z
|
||||||
|
M 3481 434
|
||||||
|
Q 3481 -459 3084 -895
|
||||||
|
Q 2688 -1331 1869 -1331
|
||||||
|
Q 1566 -1331 1297 -1286
|
||||||
|
Q 1028 -1241 775 -1147
|
||||||
|
L 775 -588
|
||||||
|
Q 1028 -725 1275 -790
|
||||||
|
Q 1522 -856 1778 -856
|
||||||
|
Q 2344 -856 2625 -561
|
||||||
|
Q 2906 -266 2906 331
|
||||||
|
L 2906 616
|
||||||
|
Q 2728 306 2450 153
|
||||||
|
Q 2172 0 1784 0
|
||||||
|
Q 1141 0 747 490
|
||||||
|
Q 353 981 353 1791
|
||||||
|
Q 353 2603 747 3093
|
||||||
|
Q 1141 3584 1784 3584
|
||||||
|
Q 2172 3584 2450 3431
|
||||||
|
Q 2728 3278 2906 2969
|
||||||
|
L 2906 3500
|
||||||
|
L 3481 3500
|
||||||
|
L 3481 434
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#DejaVuSans-44"/>
|
||||||
|
<use xlink:href="#DejaVuSans-69" transform="translate(77.001953 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-73" transform="translate(104.785156 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-74" transform="translate(156.884766 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-72" transform="translate(196.09375 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-69" transform="translate(237.207031 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-62" transform="translate(264.990234 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-75" transform="translate(328.466797 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-74" transform="translate(391.845703 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-69" transform="translate(431.054688 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-6f" transform="translate(458.837891 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-6e" transform="translate(520.019531 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-20" transform="translate(583.398438 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-62" transform="translate(615.185547 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-79" transform="translate(678.662109 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-20" transform="translate(737.841797 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-43" transform="translate(769.628906 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-61" transform="translate(839.453125 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-74" transform="translate(900.732422 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-65" transform="translate(939.941406 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-67" transform="translate(1001.464844 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-6f" transform="translate(1064.941406 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-72" transform="translate(1126.123047 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-79" transform="translate(1167.236328 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
<g id="axes_2">
|
||||||
|
<g id="patch_4">
|
||||||
|
<path d="M 527.362719 225.461826
|
||||||
|
C 505.569772 225.461826 484.099667 230.756969 464.80585 240.890142
|
||||||
|
C 445.512033 251.023314 428.966145 265.69429 416.596539 283.636569
|
||||||
|
C 404.226933 301.578849 396.400097 322.260836 393.791699 343.897121
|
||||||
|
C 391.1833 365.533405 393.87062 387.482944 401.621671 407.850903
|
||||||
|
C 409.372721 428.218863 421.957854 446.401777 438.290633 460.829974
|
||||||
|
C 454.623413 475.258171 474.219929 485.50417 495.388478 490.683465
|
||||||
|
C 516.557026 495.862761 538.670422 495.8219 559.819686 490.564411
|
||||||
|
C 580.96895 485.306922 600.527468 474.988574 616.806816 460.500118
|
||||||
|
L 527.362719 360
|
||||||
|
z
|
||||||
|
" style="fill: #1f77b4"/>
|
||||||
|
</g>
|
||||||
|
<g id="patch_5">
|
||||||
|
<path d="M 616.806831 460.500101
|
||||||
|
C 637.179098 442.368988 651.564722 418.464933 658.046183 391.974229
|
||||||
|
C 664.527645 365.483527 662.803571 337.637926 653.103764 312.149096
|
||||||
|
C 643.403956 286.660262 626.179443 264.713372 603.726155 249.233823
|
||||||
|
C 581.272867 233.754273 554.634822 225.461826 527.362738 225.461826
|
||||||
|
L 527.362719 360
|
||||||
|
z
|
||||||
|
" style="fill: #ff7f0e"/>
|
||||||
|
</g>
|
||||||
|
<g id="matplotlib.axis_3"/>
|
||||||
|
<g id="matplotlib.axis_4"/>
|
||||||
|
<g id="text_6">
|
||||||
|
<!-- Male -->
|
||||||
|
<g transform="translate(365.360062 415.395359) scale(0.1 -0.1)">
|
||||||
|
<defs>
|
||||||
|
<path id="DejaVuSans-4d" d="M 628 4666
|
||||||
|
L 1569 4666
|
||||||
|
L 2759 1491
|
||||||
|
L 3956 4666
|
||||||
|
L 4897 4666
|
||||||
|
L 4897 0
|
||||||
|
L 4281 0
|
||||||
|
L 4281 4097
|
||||||
|
L 3078 897
|
||||||
|
L 2444 897
|
||||||
|
L 1241 4097
|
||||||
|
L 1241 0
|
||||||
|
L 628 0
|
||||||
|
L 628 4666
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#DejaVuSans-4d"/>
|
||||||
|
<use xlink:href="#DejaVuSans-61" transform="translate(86.279297 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-6c" transform="translate(147.558594 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-65" transform="translate(175.341797 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
<g id="text_7">
|
||||||
|
<!-- 61.6% -->
|
||||||
|
<g transform="translate(436.034494 391.469912) scale(0.1 -0.1)">
|
||||||
|
<defs>
|
||||||
|
<path id="DejaVuSans-31" d="M 794 531
|
||||||
|
L 1825 531
|
||||||
|
L 1825 4091
|
||||||
|
L 703 3866
|
||||||
|
L 703 4441
|
||||||
|
L 1819 4666
|
||||||
|
L 2450 4666
|
||||||
|
L 2450 531
|
||||||
|
L 3481 531
|
||||||
|
L 3481 0
|
||||||
|
L 794 0
|
||||||
|
L 794 531
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#DejaVuSans-36"/>
|
||||||
|
<use xlink:href="#DejaVuSans-31" transform="translate(63.623047 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-2e" transform="translate(127.246094 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-36" transform="translate(159.033203 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-25" transform="translate(222.65625 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
<g id="text_8">
|
||||||
|
<!-- Female -->
|
||||||
|
<g transform="translate(665.677871 310.123379) scale(0.1 -0.1)">
|
||||||
|
<defs>
|
||||||
|
<path id="DejaVuSans-46" d="M 628 4666
|
||||||
|
L 3309 4666
|
||||||
|
L 3309 4134
|
||||||
|
L 1259 4134
|
||||||
|
L 1259 2759
|
||||||
|
L 3109 2759
|
||||||
|
L 3109 2228
|
||||||
|
L 1259 2228
|
||||||
|
L 1259 0
|
||||||
|
L 628 0
|
||||||
|
L 628 4666
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#DejaVuSans-46"/>
|
||||||
|
<use xlink:href="#DejaVuSans-65" transform="translate(52.019531 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-6d" transform="translate(113.542969 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-61" transform="translate(210.955078 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-6c" transform="translate(272.234375 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-65" transform="translate(300.017578 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
<g id="text_9">
|
||||||
|
<!-- 38.4% -->
|
||||||
|
<g transform="translate(586.923754 334.048832) scale(0.1 -0.1)">
|
||||||
|
<defs>
|
||||||
|
<path id="DejaVuSans-33" d="M 2597 2516
|
||||||
|
Q 3050 2419 3304 2112
|
||||||
|
Q 3559 1806 3559 1356
|
||||||
|
Q 3559 666 3084 287
|
||||||
|
Q 2609 -91 1734 -91
|
||||||
|
Q 1441 -91 1130 -33
|
||||||
|
Q 819 25 488 141
|
||||||
|
L 488 750
|
||||||
|
Q 750 597 1062 519
|
||||||
|
Q 1375 441 1716 441
|
||||||
|
Q 2309 441 2620 675
|
||||||
|
Q 2931 909 2931 1356
|
||||||
|
Q 2931 1769 2642 2001
|
||||||
|
Q 2353 2234 1838 2234
|
||||||
|
L 1294 2234
|
||||||
|
L 1294 2753
|
||||||
|
L 1863 2753
|
||||||
|
Q 2328 2753 2575 2939
|
||||||
|
Q 2822 3125 2822 3475
|
||||||
|
Q 2822 3834 2567 4026
|
||||||
|
Q 2313 4219 1838 4219
|
||||||
|
Q 1578 4219 1281 4162
|
||||||
|
Q 984 4106 628 3988
|
||||||
|
L 628 4550
|
||||||
|
Q 988 4650 1302 4700
|
||||||
|
Q 1616 4750 1894 4750
|
||||||
|
Q 2613 4750 3031 4423
|
||||||
|
Q 3450 4097 3450 3541
|
||||||
|
Q 3450 3153 3228 2886
|
||||||
|
Q 3006 2619 2597 2516
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="DejaVuSans-38" d="M 2034 2216
|
||||||
|
Q 1584 2216 1326 1975
|
||||||
|
Q 1069 1734 1069 1313
|
||||||
|
Q 1069 891 1326 650
|
||||||
|
Q 1584 409 2034 409
|
||||||
|
Q 2484 409 2743 651
|
||||||
|
Q 3003 894 3003 1313
|
||||||
|
Q 3003 1734 2745 1975
|
||||||
|
Q 2488 2216 2034 2216
|
||||||
|
z
|
||||||
|
M 1403 2484
|
||||||
|
Q 997 2584 770 2862
|
||||||
|
Q 544 3141 544 3541
|
||||||
|
Q 544 4100 942 4425
|
||||||
|
Q 1341 4750 2034 4750
|
||||||
|
Q 2731 4750 3128 4425
|
||||||
|
Q 3525 4100 3525 3541
|
||||||
|
Q 3525 3141 3298 2862
|
||||||
|
Q 3072 2584 2669 2484
|
||||||
|
Q 3125 2378 3379 2068
|
||||||
|
Q 3634 1759 3634 1313
|
||||||
|
Q 3634 634 3220 271
|
||||||
|
Q 2806 -91 2034 -91
|
||||||
|
Q 1263 -91 848 271
|
||||||
|
Q 434 634 434 1313
|
||||||
|
Q 434 1759 690 2068
|
||||||
|
Q 947 2378 1403 2484
|
||||||
|
z
|
||||||
|
M 1172 3481
|
||||||
|
Q 1172 3119 1398 2916
|
||||||
|
Q 1625 2713 2034 2713
|
||||||
|
Q 2441 2713 2670 2916
|
||||||
|
Q 2900 3119 2900 3481
|
||||||
|
Q 2900 3844 2670 4047
|
||||||
|
Q 2441 4250 2034 4250
|
||||||
|
Q 1625 4250 1398 4047
|
||||||
|
Q 1172 3844 1172 3481
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#DejaVuSans-33"/>
|
||||||
|
<use xlink:href="#DejaVuSans-38" transform="translate(63.623047 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-2e" transform="translate(127.246094 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-34" transform="translate(159.033203 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-25" transform="translate(222.65625 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
<g id="text_10">
|
||||||
|
<!-- Distribution by Sex -->
|
||||||
|
<g transform="translate(470.238969 185.827283) scale(0.12 -0.12)">
|
||||||
|
<defs>
|
||||||
|
<path id="DejaVuSans-78" d="M 3513 3500
|
||||||
|
L 2247 1797
|
||||||
|
L 3578 0
|
||||||
|
L 2900 0
|
||||||
|
L 1881 1375
|
||||||
|
L 863 0
|
||||||
|
L 184 0
|
||||||
|
L 1544 1831
|
||||||
|
L 300 3500
|
||||||
|
L 978 3500
|
||||||
|
L 1906 2253
|
||||||
|
L 2834 3500
|
||||||
|
L 3513 3500
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#DejaVuSans-44"/>
|
||||||
|
<use xlink:href="#DejaVuSans-69" transform="translate(77.001953 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-73" transform="translate(104.785156 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-74" transform="translate(156.884766 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-72" transform="translate(196.09375 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-69" transform="translate(237.207031 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-62" transform="translate(264.990234 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-75" transform="translate(328.466797 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-74" transform="translate(391.845703 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-69" transform="translate(431.054688 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-6f" transform="translate(458.837891 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-6e" transform="translate(520.019531 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-20" transform="translate(583.398438 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-62" transform="translate(615.185547 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-79" transform="translate(678.662109 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-20" transform="translate(737.841797 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-53" transform="translate(769.628906 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-65" transform="translate(833.105469 0)"/>
|
||||||
|
<use xlink:href="#DejaVuSans-78" transform="translate(892.878906 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 24 KiB |
@@ -0,0 +1,13 @@
|
|||||||
|
compose,simple
|
||||||
|
0.2062165520477412,0.7937834479522587
|
||||||
|
0.6269061385346485,0.3730938614653515
|
||||||
|
0.09081330148566008,0.90918669851434
|
||||||
|
0.12423822403788959,0.8757617759621105
|
||||||
|
0.2612655252892886,0.7387344747107114
|
||||||
|
0.07622377139542966,0.9237762286045703
|
||||||
|
0.18062352012628255,0.8193764798737174
|
||||||
|
0.07679244621346286,0.9232075537865372
|
||||||
|
0.4611502742287561,0.5388497257712439
|
||||||
|
0.11962561930536533,0.8803743806946347
|
||||||
|
0.16090483213325235,0.8390951678667476
|
||||||
|
0.409646629226467,0.590353370773533
|
||||||
|
|
After Width: | Height: | Size: 20 KiB |
@@ -0,0 +1,487 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8" standalone="no"?>
|
||||||
|
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
|
||||||
|
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||||
|
<svg xmlns:xlink="http://www.w3.org/1999/xlink" width="432pt" height="432pt" viewBox="0 0 432 432" xmlns="http://www.w3.org/2000/svg" version="1.1">
|
||||||
|
<metadata>
|
||||||
|
<rdf:RDF xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||||
|
<cc:Work>
|
||||||
|
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/>
|
||||||
|
<dc:date>2025-09-28T16:57:45.798630</dc:date>
|
||||||
|
<dc:format>image/svg+xml</dc:format>
|
||||||
|
<dc:creator>
|
||||||
|
<cc:Agent>
|
||||||
|
<dc:title>Matplotlib v3.10.3, https://matplotlib.org/</dc:title>
|
||||||
|
</cc:Agent>
|
||||||
|
</dc:creator>
|
||||||
|
</cc:Work>
|
||||||
|
</rdf:RDF>
|
||||||
|
</metadata>
|
||||||
|
<defs>
|
||||||
|
<style type="text/css">*{stroke-linejoin: round; stroke-linecap: butt}</style>
|
||||||
|
</defs>
|
||||||
|
<g id="figure_1">
|
||||||
|
<g id="patch_1">
|
||||||
|
<path d="M 0 432
|
||||||
|
L 432 432
|
||||||
|
L 432 0
|
||||||
|
L 0 0
|
||||||
|
z
|
||||||
|
" style="fill: #ffffff"/>
|
||||||
|
</g>
|
||||||
|
<g id="axes_1">
|
||||||
|
<g id="matplotlib.axis_1"/>
|
||||||
|
<g id="matplotlib.axis_2"/>
|
||||||
|
<g id="patch_2">
|
||||||
|
<path d="M 373.581818 218.160006
|
||||||
|
C 373.581818 202.706938 371.228081 187.343628 366.601695 172.599346
|
||||||
|
C 361.975308 157.855065 355.130015 143.901092 346.301639 131.218148
|
||||||
|
C 337.473263 118.535204 326.76436 107.270623 314.543856 97.812474
|
||||||
|
C 302.323352 88.354325 288.73321 80.812481 274.241615 75.44676
|
||||||
|
C 259.750019 70.08104 244.525316 66.953775 229.092 66.172703
|
||||||
|
C 213.658685 65.39163 198.196043 66.965822 183.236767 70.841048
|
||||||
|
C 168.277492 74.716274 153.995362 80.847516 140.8824 89.02355
|
||||||
|
C 127.769438 97.199585 115.977976 107.325433 105.914233 119.052255
|
||||||
|
C 95.85049 130.779078 87.631374 143.970648 81.540037 158.172511
|
||||||
|
C 75.4487 172.374374 71.555903 187.42155 69.995754 202.795659
|
||||||
|
C 68.435605 218.169768 69.226228 233.692213 72.340387 248.82824
|
||||||
|
C 75.454545 263.964267 80.856062 278.538045 88.358853 292.047502
|
||||||
|
C 95.861645 305.556959 105.378552 317.845158 116.581715 328.488768
|
||||||
|
C 127.784879 339.132378 140.544153 348.007753 154.419977 354.809136
|
||||||
|
C 168.295802 361.610518 183.126983 366.258896 198.402584 368.594133
|
||||||
|
C 213.678185 370.929369 229.22075 370.924336 244.494835 368.579206
|
||||||
|
L 221.399995 218.160006
|
||||||
|
z
|
||||||
|
" style="fill: #1f77b4; stroke: #ffffff; stroke-linejoin: miter"/>
|
||||||
|
</g>
|
||||||
|
<g id="patch_3">
|
||||||
|
<path d="M 244.494835 368.579206
|
||||||
|
C 280.423454 363.062851 313.213222 344.852289 336.885761 317.267751
|
||||||
|
C 360.5583 289.683214 373.581822 254.509625 373.581818 218.159992
|
||||||
|
L 221.399995 218.160006
|
||||||
|
z
|
||||||
|
" style="fill: #ff7f0e; stroke: #ffffff; stroke-linejoin: miter"/>
|
||||||
|
</g>
|
||||||
|
<g id="text_1">
|
||||||
|
<!-- Simple -->
|
||||||
|
<g style="fill: #262626" transform="translate(63.800032 111.787574) scale(0.1 -0.1)">
|
||||||
|
<defs>
|
||||||
|
<path id="ArialMT-53" d="M 288 1472
|
||||||
|
L 859 1522
|
||||||
|
Q 900 1178 1048 958
|
||||||
|
Q 1197 738 1509 602
|
||||||
|
Q 1822 466 2213 466
|
||||||
|
Q 2559 466 2825 569
|
||||||
|
Q 3091 672 3220 851
|
||||||
|
Q 3350 1031 3350 1244
|
||||||
|
Q 3350 1459 3225 1620
|
||||||
|
Q 3100 1781 2813 1891
|
||||||
|
Q 2628 1963 1997 2114
|
||||||
|
Q 1366 2266 1113 2400
|
||||||
|
Q 784 2572 623 2826
|
||||||
|
Q 463 3081 463 3397
|
||||||
|
Q 463 3744 659 4045
|
||||||
|
Q 856 4347 1234 4503
|
||||||
|
Q 1613 4659 2075 4659
|
||||||
|
Q 2584 4659 2973 4495
|
||||||
|
Q 3363 4331 3572 4012
|
||||||
|
Q 3781 3694 3797 3291
|
||||||
|
L 3216 3247
|
||||||
|
Q 3169 3681 2898 3903
|
||||||
|
Q 2628 4125 2100 4125
|
||||||
|
Q 1550 4125 1298 3923
|
||||||
|
Q 1047 3722 1047 3438
|
||||||
|
Q 1047 3191 1225 3031
|
||||||
|
Q 1400 2872 2139 2705
|
||||||
|
Q 2878 2538 3153 2413
|
||||||
|
Q 3553 2228 3743 1945
|
||||||
|
Q 3934 1663 3934 1294
|
||||||
|
Q 3934 928 3725 604
|
||||||
|
Q 3516 281 3123 101
|
||||||
|
Q 2731 -78 2241 -78
|
||||||
|
Q 1619 -78 1198 103
|
||||||
|
Q 778 284 539 648
|
||||||
|
Q 300 1013 288 1472
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="ArialMT-69" d="M 425 3934
|
||||||
|
L 425 4581
|
||||||
|
L 988 4581
|
||||||
|
L 988 3934
|
||||||
|
L 425 3934
|
||||||
|
z
|
||||||
|
M 425 0
|
||||||
|
L 425 3319
|
||||||
|
L 988 3319
|
||||||
|
L 988 0
|
||||||
|
L 425 0
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="ArialMT-6d" d="M 422 0
|
||||||
|
L 422 3319
|
||||||
|
L 925 3319
|
||||||
|
L 925 2853
|
||||||
|
Q 1081 3097 1340 3245
|
||||||
|
Q 1600 3394 1931 3394
|
||||||
|
Q 2300 3394 2536 3241
|
||||||
|
Q 2772 3088 2869 2813
|
||||||
|
Q 3263 3394 3894 3394
|
||||||
|
Q 4388 3394 4653 3120
|
||||||
|
Q 4919 2847 4919 2278
|
||||||
|
L 4919 0
|
||||||
|
L 4359 0
|
||||||
|
L 4359 2091
|
||||||
|
Q 4359 2428 4304 2576
|
||||||
|
Q 4250 2725 4106 2815
|
||||||
|
Q 3963 2906 3769 2906
|
||||||
|
Q 3419 2906 3187 2673
|
||||||
|
Q 2956 2441 2956 1928
|
||||||
|
L 2956 0
|
||||||
|
L 2394 0
|
||||||
|
L 2394 2156
|
||||||
|
Q 2394 2531 2256 2718
|
||||||
|
Q 2119 2906 1806 2906
|
||||||
|
Q 1569 2906 1367 2781
|
||||||
|
Q 1166 2656 1075 2415
|
||||||
|
Q 984 2175 984 1722
|
||||||
|
L 984 0
|
||||||
|
L 422 0
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="ArialMT-70" d="M 422 -1272
|
||||||
|
L 422 3319
|
||||||
|
L 934 3319
|
||||||
|
L 934 2888
|
||||||
|
Q 1116 3141 1344 3267
|
||||||
|
Q 1572 3394 1897 3394
|
||||||
|
Q 2322 3394 2647 3175
|
||||||
|
Q 2972 2956 3137 2557
|
||||||
|
Q 3303 2159 3303 1684
|
||||||
|
Q 3303 1175 3120 767
|
||||||
|
Q 2938 359 2589 142
|
||||||
|
Q 2241 -75 1856 -75
|
||||||
|
Q 1575 -75 1351 44
|
||||||
|
Q 1128 163 984 344
|
||||||
|
L 984 -1272
|
||||||
|
L 422 -1272
|
||||||
|
z
|
||||||
|
M 931 1641
|
||||||
|
Q 931 1000 1190 694
|
||||||
|
Q 1450 388 1819 388
|
||||||
|
Q 2194 388 2461 705
|
||||||
|
Q 2728 1022 2728 1688
|
||||||
|
Q 2728 2322 2467 2637
|
||||||
|
Q 2206 2953 1844 2953
|
||||||
|
Q 1484 2953 1207 2617
|
||||||
|
Q 931 2281 931 1641
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="ArialMT-6c" d="M 409 0
|
||||||
|
L 409 4581
|
||||||
|
L 972 4581
|
||||||
|
L 972 0
|
||||||
|
L 409 0
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="ArialMT-65" d="M 2694 1069
|
||||||
|
L 3275 997
|
||||||
|
Q 3138 488 2766 206
|
||||||
|
Q 2394 -75 1816 -75
|
||||||
|
Q 1088 -75 661 373
|
||||||
|
Q 234 822 234 1631
|
||||||
|
Q 234 2469 665 2931
|
||||||
|
Q 1097 3394 1784 3394
|
||||||
|
Q 2450 3394 2872 2941
|
||||||
|
Q 3294 2488 3294 1666
|
||||||
|
Q 3294 1616 3291 1516
|
||||||
|
L 816 1516
|
||||||
|
Q 847 969 1125 678
|
||||||
|
Q 1403 388 1819 388
|
||||||
|
Q 2128 388 2347 550
|
||||||
|
Q 2566 713 2694 1069
|
||||||
|
z
|
||||||
|
M 847 1978
|
||||||
|
L 2700 1978
|
||||||
|
Q 2663 2397 2488 2606
|
||||||
|
Q 2219 2931 1791 2931
|
||||||
|
Q 1403 2931 1139 2672
|
||||||
|
Q 875 2413 847 1978
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#ArialMT-53"/>
|
||||||
|
<use xlink:href="#ArialMT-69" transform="translate(66.699219 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-6d" transform="translate(88.916016 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-70" transform="translate(172.216797 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-6c" transform="translate(227.832031 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-65" transform="translate(250.048828 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
<g id="text_2">
|
||||||
|
<!-- 77.4% -->
|
||||||
|
<g style="fill: #262626" transform="translate(137.931975 161.280512) scale(0.1 -0.1)">
|
||||||
|
<defs>
|
||||||
|
<path id="ArialMT-37" d="M 303 3981
|
||||||
|
L 303 4522
|
||||||
|
L 3269 4522
|
||||||
|
L 3269 4084
|
||||||
|
Q 2831 3619 2401 2847
|
||||||
|
Q 1972 2075 1738 1259
|
||||||
|
Q 1569 684 1522 0
|
||||||
|
L 944 0
|
||||||
|
Q 953 541 1156 1306
|
||||||
|
Q 1359 2072 1739 2783
|
||||||
|
Q 2119 3494 2547 3981
|
||||||
|
L 303 3981
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="ArialMT-2e" d="M 581 0
|
||||||
|
L 581 641
|
||||||
|
L 1222 641
|
||||||
|
L 1222 0
|
||||||
|
L 581 0
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="ArialMT-34" d="M 2069 0
|
||||||
|
L 2069 1097
|
||||||
|
L 81 1097
|
||||||
|
L 81 1613
|
||||||
|
L 2172 4581
|
||||||
|
L 2631 4581
|
||||||
|
L 2631 1613
|
||||||
|
L 3250 1613
|
||||||
|
L 3250 1097
|
||||||
|
L 2631 1097
|
||||||
|
L 2631 0
|
||||||
|
L 2069 0
|
||||||
|
z
|
||||||
|
M 2069 1613
|
||||||
|
L 2069 3678
|
||||||
|
L 634 1613
|
||||||
|
L 2069 1613
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="ArialMT-25" d="M 372 3481
|
||||||
|
Q 372 3972 619 4315
|
||||||
|
Q 866 4659 1334 4659
|
||||||
|
Q 1766 4659 2048 4351
|
||||||
|
Q 2331 4044 2331 3447
|
||||||
|
Q 2331 2866 2045 2552
|
||||||
|
Q 1759 2238 1341 2238
|
||||||
|
Q 925 2238 648 2547
|
||||||
|
Q 372 2856 372 3481
|
||||||
|
z
|
||||||
|
M 1350 4272
|
||||||
|
Q 1141 4272 1002 4090
|
||||||
|
Q 863 3909 863 3425
|
||||||
|
Q 863 2984 1003 2804
|
||||||
|
Q 1144 2625 1350 2625
|
||||||
|
Q 1563 2625 1702 2806
|
||||||
|
Q 1841 2988 1841 3469
|
||||||
|
Q 1841 3913 1700 4092
|
||||||
|
Q 1559 4272 1350 4272
|
||||||
|
z
|
||||||
|
M 1353 -169
|
||||||
|
L 3859 4659
|
||||||
|
L 4316 4659
|
||||||
|
L 1819 -169
|
||||||
|
L 1353 -169
|
||||||
|
z
|
||||||
|
M 3334 1075
|
||||||
|
Q 3334 1569 3581 1911
|
||||||
|
Q 3828 2253 4300 2253
|
||||||
|
Q 4731 2253 5014 1945
|
||||||
|
Q 5297 1638 5297 1041
|
||||||
|
Q 5297 459 5011 145
|
||||||
|
Q 4725 -169 4303 -169
|
||||||
|
Q 3888 -169 3611 142
|
||||||
|
Q 3334 453 3334 1075
|
||||||
|
z
|
||||||
|
M 4316 1866
|
||||||
|
Q 4103 1866 3964 1684
|
||||||
|
Q 3825 1503 3825 1019
|
||||||
|
Q 3825 581 3965 400
|
||||||
|
Q 4106 219 4313 219
|
||||||
|
Q 4528 219 4667 400
|
||||||
|
Q 4806 581 4806 1063
|
||||||
|
Q 4806 1506 4665 1686
|
||||||
|
Q 4525 1866 4316 1866
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#ArialMT-37"/>
|
||||||
|
<use xlink:href="#ArialMT-37" transform="translate(55.615234 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-2e" transform="translate(111.230469 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-34" transform="translate(139.013672 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-25" transform="translate(194.628906 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
<g id="text_3">
|
||||||
|
<!-- Compose -->
|
||||||
|
<g style="fill: #262626" transform="translate(348.434338 329.82462) scale(0.1 -0.1)">
|
||||||
|
<defs>
|
||||||
|
<path id="ArialMT-43" d="M 3763 1606
|
||||||
|
L 4369 1453
|
||||||
|
Q 4178 706 3683 314
|
||||||
|
Q 3188 -78 2472 -78
|
||||||
|
Q 1731 -78 1267 223
|
||||||
|
Q 803 525 561 1097
|
||||||
|
Q 319 1669 319 2325
|
||||||
|
Q 319 3041 592 3573
|
||||||
|
Q 866 4106 1370 4382
|
||||||
|
Q 1875 4659 2481 4659
|
||||||
|
Q 3169 4659 3637 4309
|
||||||
|
Q 4106 3959 4291 3325
|
||||||
|
L 3694 3184
|
||||||
|
Q 3534 3684 3231 3912
|
||||||
|
Q 2928 4141 2469 4141
|
||||||
|
Q 1941 4141 1586 3887
|
||||||
|
Q 1231 3634 1087 3207
|
||||||
|
Q 944 2781 944 2328
|
||||||
|
Q 944 1744 1114 1308
|
||||||
|
Q 1284 872 1643 656
|
||||||
|
Q 2003 441 2422 441
|
||||||
|
Q 2931 441 3284 734
|
||||||
|
Q 3638 1028 3763 1606
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="ArialMT-6f" d="M 213 1659
|
||||||
|
Q 213 2581 725 3025
|
||||||
|
Q 1153 3394 1769 3394
|
||||||
|
Q 2453 3394 2887 2945
|
||||||
|
Q 3322 2497 3322 1706
|
||||||
|
Q 3322 1066 3130 698
|
||||||
|
Q 2938 331 2570 128
|
||||||
|
Q 2203 -75 1769 -75
|
||||||
|
Q 1072 -75 642 372
|
||||||
|
Q 213 819 213 1659
|
||||||
|
z
|
||||||
|
M 791 1659
|
||||||
|
Q 791 1022 1069 705
|
||||||
|
Q 1347 388 1769 388
|
||||||
|
Q 2188 388 2466 706
|
||||||
|
Q 2744 1025 2744 1678
|
||||||
|
Q 2744 2294 2464 2611
|
||||||
|
Q 2184 2928 1769 2928
|
||||||
|
Q 1347 2928 1069 2612
|
||||||
|
Q 791 2297 791 1659
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="ArialMT-73" d="M 197 991
|
||||||
|
L 753 1078
|
||||||
|
Q 800 744 1014 566
|
||||||
|
Q 1228 388 1613 388
|
||||||
|
Q 2000 388 2187 545
|
||||||
|
Q 2375 703 2375 916
|
||||||
|
Q 2375 1106 2209 1216
|
||||||
|
Q 2094 1291 1634 1406
|
||||||
|
Q 1016 1563 777 1677
|
||||||
|
Q 538 1791 414 1992
|
||||||
|
Q 291 2194 291 2438
|
||||||
|
Q 291 2659 392 2848
|
||||||
|
Q 494 3038 669 3163
|
||||||
|
Q 800 3259 1026 3326
|
||||||
|
Q 1253 3394 1513 3394
|
||||||
|
Q 1903 3394 2198 3281
|
||||||
|
Q 2494 3169 2634 2976
|
||||||
|
Q 2775 2784 2828 2463
|
||||||
|
L 2278 2388
|
||||||
|
Q 2241 2644 2061 2787
|
||||||
|
Q 1881 2931 1553 2931
|
||||||
|
Q 1166 2931 1000 2803
|
||||||
|
Q 834 2675 834 2503
|
||||||
|
Q 834 2394 903 2306
|
||||||
|
Q 972 2216 1119 2156
|
||||||
|
Q 1203 2125 1616 2013
|
||||||
|
Q 2213 1853 2448 1751
|
||||||
|
Q 2684 1650 2818 1456
|
||||||
|
Q 2953 1263 2953 975
|
||||||
|
Q 2953 694 2789 445
|
||||||
|
Q 2625 197 2315 61
|
||||||
|
Q 2006 -75 1616 -75
|
||||||
|
Q 969 -75 630 194
|
||||||
|
Q 291 463 197 991
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#ArialMT-43"/>
|
||||||
|
<use xlink:href="#ArialMT-6f" transform="translate(72.216797 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-6d" transform="translate(127.832031 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-70" transform="translate(211.132812 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-6f" transform="translate(266.748047 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-73" transform="translate(322.363281 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-65" transform="translate(372.363281 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
<g id="text_4">
|
||||||
|
<!-- 22.6% -->
|
||||||
|
<g style="fill: #262626" transform="translate(276.514892 280.209809) scale(0.1 -0.1)">
|
||||||
|
<defs>
|
||||||
|
<path id="ArialMT-32" d="M 3222 541
|
||||||
|
L 3222 0
|
||||||
|
L 194 0
|
||||||
|
Q 188 203 259 391
|
||||||
|
Q 375 700 629 1000
|
||||||
|
Q 884 1300 1366 1694
|
||||||
|
Q 2113 2306 2375 2664
|
||||||
|
Q 2638 3022 2638 3341
|
||||||
|
Q 2638 3675 2398 3904
|
||||||
|
Q 2159 4134 1775 4134
|
||||||
|
Q 1369 4134 1125 3890
|
||||||
|
Q 881 3647 878 3216
|
||||||
|
L 300 3275
|
||||||
|
Q 359 3922 746 4261
|
||||||
|
Q 1134 4600 1788 4600
|
||||||
|
Q 2447 4600 2831 4234
|
||||||
|
Q 3216 3869 3216 3328
|
||||||
|
Q 3216 3053 3103 2787
|
||||||
|
Q 2991 2522 2730 2228
|
||||||
|
Q 2469 1934 1863 1422
|
||||||
|
Q 1356 997 1212 845
|
||||||
|
Q 1069 694 975 541
|
||||||
|
L 3222 541
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
<path id="ArialMT-36" d="M 3184 3459
|
||||||
|
L 2625 3416
|
||||||
|
Q 2550 3747 2413 3897
|
||||||
|
Q 2184 4138 1850 4138
|
||||||
|
Q 1581 4138 1378 3988
|
||||||
|
Q 1113 3794 959 3422
|
||||||
|
Q 806 3050 800 2363
|
||||||
|
Q 1003 2672 1297 2822
|
||||||
|
Q 1591 2972 1913 2972
|
||||||
|
Q 2475 2972 2870 2558
|
||||||
|
Q 3266 2144 3266 1488
|
||||||
|
Q 3266 1056 3080 686
|
||||||
|
Q 2894 316 2569 119
|
||||||
|
Q 2244 -78 1831 -78
|
||||||
|
Q 1128 -78 684 439
|
||||||
|
Q 241 956 241 2144
|
||||||
|
Q 241 3472 731 4075
|
||||||
|
Q 1159 4600 1884 4600
|
||||||
|
Q 2425 4600 2770 4297
|
||||||
|
Q 3116 3994 3184 3459
|
||||||
|
z
|
||||||
|
M 888 1484
|
||||||
|
Q 888 1194 1011 928
|
||||||
|
Q 1134 663 1356 523
|
||||||
|
Q 1578 384 1822 384
|
||||||
|
Q 2178 384 2434 671
|
||||||
|
Q 2691 959 2691 1453
|
||||||
|
Q 2691 1928 2437 2201
|
||||||
|
Q 2184 2475 1800 2475
|
||||||
|
Q 1419 2475 1153 2201
|
||||||
|
Q 888 1928 888 1484
|
||||||
|
z
|
||||||
|
" transform="scale(0.015625)"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#ArialMT-32"/>
|
||||||
|
<use xlink:href="#ArialMT-32" transform="translate(55.615234 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-2e" transform="translate(111.230469 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-36" transform="translate(139.013672 0)"/>
|
||||||
|
<use xlink:href="#ArialMT-25" transform="translate(194.628906 0)"/>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 12 KiB |
|
After Width: | Height: | Size: 41 KiB |
|
After Width: | Height: | Size: 44 KiB |
|
After Width: | Height: | Size: 49 KiB |
|
After Width: | Height: | Size: 30 KiB |
@@ -0,0 +1,27 @@
|
|||||||
|
letter,Male,Female
|
||||||
|
a,0.1726871198212362,0.1780007719968084
|
||||||
|
b,0.06275449167118631,0.05757115683434764
|
||||||
|
c,0.002527913112031784,0.002525362815502787
|
||||||
|
d,0.02639274743484273,0.025798130028588412
|
||||||
|
e,0.060460557268468315,0.05992111228866155
|
||||||
|
f,0.004168185425527368,0.005738163668905593
|
||||||
|
g,0.03710295718248242,0.035944081768606244
|
||||||
|
h,0.015744753594548896,0.016324638692088497
|
||||||
|
i,0.07320872667180656,0.07954877144283247
|
||||||
|
j,0.00442530712700423,0.004397881604276826
|
||||||
|
k,0.06012485644271973,0.05719911396875115
|
||||||
|
l,0.04930645065793003,0.04598845291218479
|
||||||
|
m,0.08281339976187696,0.08014229460267776
|
||||||
|
n,0.08138893330151427,0.08430430794896865
|
||||||
|
o,0.06920807306069308,0.06452478894803111
|
||||||
|
p,0.009832203366545821,0.009371006578405026
|
||||||
|
q,5.0822826402147366e-05,8.43622136063042e-05
|
||||||
|
r,0.009139850680293098,0.010064380634131025
|
||||||
|
s,0.032639239825093015,0.034139532349508485
|
||||||
|
t,0.0277669772899704,0.027953179679053274
|
||||||
|
u,0.06917254296038988,0.06619473621457156
|
||||||
|
v,0.0035449558612418576,0.006171217790567778
|
||||||
|
w,0.013512780220408454,0.014295070954872152
|
||||||
|
x,4.796818419701171e-05,1.6707334940670683e-05
|
||||||
|
y,0.020592394840652214,0.020809516372185803
|
||||||
|
z,0.01138579141093724,0.012971260356926069
|
||||||
|
@@ -0,0 +1,7 @@
|
|||||||
|
sex,position,2-grams,3-grams,4-grams
|
||||||
|
female,prefix,"ka, ma, mu, mb, ng, ba, ki, lu, ts, bo","tsh, kab, ngo, mas, kas, kal, muk, kav, mbu, man","tshi, kavi, ngoy, kaso, ilun, mbuy, kaba, ntum, kavu, ngal"
|
||||||
|
female,suffix,"ba, ga, la, ka, ma, da, go, ya, bo, na","nga, mba, ngo, nda, ala, mbo, ngu, ndo, mbe, mbu","anga, amba, ongo, umba, inga, ombo, unga, enga, anda, ungu"
|
||||||
|
female,any,"ng, ka, mb, an, ba, ma, nd, ga, la, am","nga, mba, ang, ngo, amb, ong, nda, ala, mbo, eng","anga, amba, ongo, tshi, umba, inga, ombo, unga, anda, enga"
|
||||||
|
male,prefix,"ka, mu, ma, ba, mb, ng, ki, lu, ts, bo","kab, tsh, kal, kas, muk, ngo, kam, mut, mul, mbu","tshi, ngoy, ilun, kaba, kaso, kamb, muke, kabe, kalo, muto"
|
||||||
|
male,suffix,"ba, ga, la, go, ka, da, bo, le, di, ma","nga, mba, ngo, nda, ala, mbo, ngu, mbe, ndo, ele","amba, ongo, anga, umba, unga, ombo, anda, enga, onga, angu"
|
||||||
|
male,any,"ng, ka, mb, ba, an, ma, mu, nd, am, al","nga, mba, ngo, amb, ang, ong, ala, nda, shi, mbo","amba, ongo, anga, tshi, umba, unga, ombo, anda, lung, enga"
|
||||||
|
|
After Width: | Height: | Size: 37 KiB |
|
After Width: | Height: | Size: 454 KiB |
@@ -0,0 +1,29 @@
|
|||||||
|
^,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,$
|
||||||
|
0.0,0.03240135083510582,0.09260231861008805,0.008075675043119726,0.017857766912447937,0.02028878683448995,0.008457996285223429,0.0056965765224603,0.0036982016623235744,0.026885400880147632,0.0030749451480349046,0.18878761331096985,0.060296922518691204,0.26743305983402327,0.1199608033359101,0.014603892627340732,0.012508764232704531,9.485640637203402e-06,0.005467323565586415,0.032579081785992364,0.042149993340081786,0.004793643382647348,0.005066031042208305,0.01062281917633087,8.287454451451392e-06,0.010728658956072298,0.005944601062910965,0.0
|
||||||
|
0.0,0.0009791886146586694,0.048581928219493954,0.0020218103912897157,0.020990537577026614,0.001131268956960644,0.006787613741763863,0.006186016110784558,0.011832275288244888,0.004851088661689449,0.0032179280340603345,0.04386672984621395,0.07634733982374604,0.08293271059721305,0.13517553734169577,0.001808420172771762,0.01144167917930759,6.458327509042543e-06,0.010475318749427155,0.046487395291047624,0.033763074574396215,0.004567187662011127,0.011274824306950683,0.008258962300707514,1.6278524132381206e-05,0.032413284125006325,0.011652857641829295,0.3829322859400618
|
||||||
|
0.0,0.39420804956524974,0.00012606255453160502,2.546718273365758e-07,0.0006208899150465718,0.11658188641534632,1.273359136682879e-06,5.781050480540271e-05,0.0016016311221197252,0.11411997386048364,6.366795683414395e-06,1.4261622330848246e-05,0.0003718208679114007,2.2156448978282096e-05,3.8200774100486376e-05,0.19131406078812782,6.366795683414395e-06,2.546718273365758e-07,0.0013390644681357156,1.884571522290661e-05,9.422857611453305e-06,0.14365426567670633,1.222424771215564e-05,0.02738409757801999,2.546718273365758e-07,0.004381374117498451,1.3242935021501942e-05,0.004095886999054148
|
||||||
|
0.0,0.07019293726020524,3.059582305823609e-05,0.004779067561696477,1.8357493834941653e-05,0.10123545933509158,0.0,7.954913995141383e-05,0.33171991359739567,0.3235936629931282,1.8357493834941653e-05,0.03783479479381475,0.015714014722710057,8.566830456306106e-05,4.283415228153053e-05,0.03822642132896017,2.4476658446588873e-05,0.00897069532067482,0.0032064422565031424,8.566830456306106e-05,0.013223514725769638,0.018045416439747646,6.119164611647218e-06,0.0037694054007746864,1.8357493834941653e-05,0.008328183036451864,1.2238329223294436e-05,0.02073784886887242
|
||||||
|
0.0,0.2424810241211335,1.5959776634059462e-05,5.911028382984986e-07,0.0006726750299836915,0.10030246732235734,9.457645412775978e-06,4.0786095842596404e-05,0.004887238267051987,0.3104158822239417,0.06585713162618893,1.5959776634059462e-05,2.6008524885133942e-05,0.00012176718468949073,7.152344343411834e-05,0.1405991300148426,1.773308514895496e-06,5.911028382984986e-07,0.009874964016614718,9.398535128946128e-05,1.4777570957462466e-05,0.0932293307592775,3.014624475322343e-05,0.00875600634371566,5.911028382984986e-07,0.004086293921157521,0.0022692437962279362,0.016124694325944745
|
||||||
|
0.0,0.006205216662872565,0.028941081239369993,0.0011474697534826386,0.01425752342098435,0.003730367449534814,0.004639925223263839,0.005866185675542982,0.003561493573938398,0.0016397191355402771,0.0038625407716201547,0.05415821120988892,0.10151501424777083,0.09480497248870047,0.17112338854414036,0.0027153276653648665,0.008680322523437667,5.646239001703881e-06,0.03422185458932722,0.03327456967317772,0.03469819184328915,0.002615748541152998,0.002961965650848386,0.005427832211228881,0.00028205530285784386,0.028868963368484594,0.01293322372785744,0.33786118926732095
|
||||||
|
0.0,0.23077320736794152,3.230986452473805e-06,6.46197290494761e-06,3.5540850977211856e-05,0.08830932171901404,0.0015282565920201096,3.877183742968566e-05,0.000132470444551426,0.14398568026804265,6.46197290494761e-06,9.692959357421415e-05,0.0038481048648963015,3.5540850977211856e-05,1.938591871484283e-05,0.08549513251890935,6.46197290494761e-06,6.46197290494761e-06,0.013518447317150399,0.0002003211600533759,0.0005040338865859135,0.3976407336924036,0.0,0.02652962976126241,0.0,0.004814169814185969,6.46197290494761e-06,0.0024587806903325652
|
||||||
|
0.0,0.38404120076639636,0.02137884680216012,5.907040897397653e-06,4.261508075979735e-05,0.09362322277181143,5.485109404726393e-06,4.514666971582492e-05,0.015463789206401714,0.07891806638923264,1.0548287316781523e-05,1.4345670750822873e-05,0.000878039436248894,2.9113272994317006e-05,0.0005134906265809245,0.2519884576420865,7.172835375411436e-06,8.438629853425219e-07,0.0015463789206401712,2.3206232096919352e-05,2.3628163589590613e-05,0.11810284411361265,5.485109404726393e-06,0.020353131343476286,2.5315889560275656e-06,0.002251004513401177,6.750903882740175e-06,0.010718747639820713
|
||||||
|
0.0,0.2614230396902226,4.453049370764763e-05,3.872216844143272e-06,2.5169409486931267e-05,0.09374152952565344,2.5169409486931267e-05,3.484995159728945e-05,2.420135527589545e-05,0.4197734753146176,1.1616650532429817e-05,9.486931268151016e-05,7.454017424975799e-05,0.0002468538238141336,0.0006030977734753146,0.1158296224588577,2.3233301064859633e-05,9.68054211035818e-07,0.002967086156824782,0.00020909970958373668,0.00020716360116166505,0.07815295256534366,1.1616650532429817e-05,0.007239109390125847,0.0,0.0033049370764762828,1.0648596321393998e-05,0.01591674733785092
|
||||||
|
0.0,0.057525431732821986,0.041996980300875106,0.0033912381338324768,0.016022951674192765,0.02875699673928052,0.007807629805802953,0.006448520094153173,0.005039572274652094,0.0003533603548473279,0.0023910036496201273,0.05831058618602033,0.08421986612340406,0.07337456279315632,0.10815890676433847,0.011699700072632914,0.009224952061459374,0.0004542620977921718,0.03307003561382166,0.050486207507906694,0.048087850694833095,0.002131396128804547,0.005388030520530441,0.005741799384458517,9.109752500688338e-05,0.017802008720851855,0.013278383415184936,0.3087466696297192
|
||||||
|
0.0,0.22479395580338848,3.1523973982213474e-05,4.2031965309617963e-05,0.00024168380053030328,0.10204660644420081,1.0507991327404491e-05,2.1015982654808982e-05,0.0002346784729787003,0.42807805335957994,9.457192194664042e-05,8.756659439503743e-05,3.502663775801497e-05,0.0007320567291425129,0.0001260958959288539,0.12674739139115296,2.8021310206411978e-05,0.0,3.502663775801497e-05,1.0507991327404491e-05,7.355593929183144e-05,0.06349628892772954,7.0053275516029945e-06,0.001986010360879449,0.0,0.00035727170513175267,1.0507991327404491e-05,0.05067303684452026
|
||||||
|
0.0,0.5034268610320017,6.895672925910058e-05,5.506050625251378e-06,1.3109644345836613e-05,0.07313660826232712,0.00021106527396796946,4.719471964501181e-06,0.00408968465012719,0.14606084028625171,6.292629286001574e-06,2.9627796221590745e-05,0.0002089677308726356,3.9853318811343305e-05,5.322515604409665e-05,0.12817613908388756,0.00323913092496931,2.8841217560840547e-06,0.0001966446651875492,0.0006470920449104952,2.0975430953338582e-05,0.10610631502055855,3.146314643000787e-06,0.021257812692547902,7.865786607501968e-07,0.01034980201815109,1.0225522589752559e-05,0.0026337275490785753
|
||||||
|
0.0,0.2907864656030218,0.0008378546668440517,4.703067334072724e-05,0.00035917260804733476,0.17203916946407904,0.00028798234223705586,9.696049777780069e-05,0.00010694646266521537,0.11111415343016283,4.831918493910333e-06,0.00016943927518645567,0.0027625688669183344,0.00026962105196019656,7.408941690662511e-05,0.17521728332147365,0.000927728350830784,3.5434068955342444e-06,9.663836987820666e-06,0.00025609168017724765,0.0001262741366408567,0.2173715845181466,0.0004770714192987469,0.010590921082852263,6.442557991880444e-07,0.004126136265899831,2.0938313473611443e-05,0.011915833133882475
|
||||||
|
0.0,0.2455144244761935,0.34861793685557824,2.345028152630318e-05,5.805835829496029e-05,0.02770991169645199,0.00440562708416741,6.392092867653609e-05,1.3805407672743e-05,0.04043547172794345,6.6190310759726715e-06,7.224199631490173e-05,7.98066032588705e-05,0.0006679547931524422,0.00019195190120320746,0.05560269776577443,0.03787277201050043,5.673455207976575e-07,1.7209480797528944e-05,0.0004308043654590213,0.00011952078971470652,0.19084103867348565,0.006140002341245849,0.037190633579328045,5.673455207976575e-07,0.0008513965115436847,0.0001529941754417683,0.00291861447415675
|
||||||
|
0.0,0.06852883077720875,0.00014519177304071664,0.0020409279683942802,0.189448848282653,0.02723591583598238,0.0006794974978305538,0.3949594662038515,0.00015362226308824212,0.04945662681480349,0.007555592524815616,0.040859213064337006,0.003013431831432164,4.027900800484397e-05,0.0018093705084222468,0.014132311483001883,5.18943498481013e-05,1.5924258978659246e-05,0.0002334309022048167,0.0345440266418473,0.03498709572990059,0.011673605896696897,0.0013887827271623645,0.0007604302022867985,0.0,0.03952251203124602,0.06717414469868305,0.009589026724278762
|
||||||
|
0.0,0.002811754741536618,0.022217453078125466,0.0009318958571949934,0.010439299379577709,0.0016556070980166227,0.007270623934115422,0.004762079642666141,0.004112736445620663,0.006897636060238115,0.0017462718427437218,0.0702396992225785,0.08710265314882101,0.10264970585602438,0.195075412409823,0.004091849124683534,0.008484842920460622,2.75437199170934e-06,0.007752409501665248,0.028939268392892802,0.03214650504623902,0.0037128934448241906,0.0025411376933511757,0.007407424409703652,2.2723568931602058e-05,0.045256168071778936,0.005312035917010772,0.33641715881938433
|
||||||
|
0.0,0.2755187546363871,0.00010093905364029487,3.204414401279202e-06,6.408828802558404e-06,0.20437434609918623,0.00369629201187556,3.204414401279202e-06,0.055202446890836816,0.1479045533126435,6.408828802558404e-06,0.0002483421160991382,0.0013074010757219146,6.729270242686325e-05,0.00011055229684413248,0.14242500468645605,0.0011712134636675483,0.0,0.0033470108421361265,0.0001538118912614017,0.0002675686025068134,0.14199881757108593,9.613243203837607e-06,0.012518044858597203,0.0,0.007613688617439385,8.011036003198005e-06,0.0019370685055732778
|
||||||
|
0.0,0.003864734299516908,0.0007246376811594203,0.0,0.0004830917874396135,0.0007246376811594203,0.0,0.0,0.0004830917874396135,0.00024154589371980676,0.0,0.0004830917874396135,0.0004830917874396135,0.0007246376811594203,0.0016908212560386474,0.0007246376811594203,0.0,0.0,0.00024154589371980676,0.001932367149758454,0.0007246376811594203,0.9835748792270531,0.0,0.00024154589371980676,0.0,0.0,0.0,0.0026570048309178746
|
||||||
|
0.0,0.2822393929187961,0.0012267882391033057,0.007813046583785243,0.008807168087886199,0.18842588750970124,0.00023592081521217416,0.004038313540390457,0.03821103686384766,0.19376257905381108,5.694640367190411e-05,0.000777725170147719,0.003924420733046649,0.0033565837364325194,0.004537814852598302,0.0694306823968904,0.0003530677027658055,0.00015294176986168533,0.0043604674811629435,0.0014366764126368952,0.011607304108438968,0.1015468270277394,0.0017295436315209734,0.0317907366098667,3.254080209823092e-06,0.008437829984071277,0.00010738464692416203,0.03162965963948045
|
||||||
|
0.0,0.23379934463089652,3.257301707105292e-05,0.0016072457280488111,4.281025100766955e-05,0.13016736016171107,7.445261044812096e-06,1.2098549197819656e-05,0.244367427355192,0.15211273242011003,1.535585090492495e-05,0.0005100003815696286,0.0002601188077531226,0.0006663508635106826,0.00011400555974868522,0.11456581564230732,0.001241497279222417,9.306576306015119e-06,0.00021265526859244547,0.016044072222754766,0.009141384576583351,0.07100824655726476,6.97993222951134e-06,0.0097877263010361,1.3959864459022679e-06,0.0037649754445984165,8.375918675413608e-06,0.010492699456216746
|
||||||
|
0.0,0.22270715194197135,1.4995917777938228e-05,0.004287166271403895,0.0006853689828879916,0.1378752450721748,7.386878016539942e-05,6.1094479836044635e-06,0.02331143188798605,0.08055140545073841,6.6648523457503235e-06,9.719576337552555e-05,2.221617448583441e-05,6.831473654394081e-05,4.6653966420252264e-05,0.14458064193636178,6.1094479836044635e-06,1.6662130864375809e-06,0.003284661397730618,0.19145788091019666,0.0033663058389660594,0.16493732261773184,8.331065432187904e-06,0.011131414226127331,4.443234897166882e-06,0.0022449444317935675,1.943915267510511e-05,0.009203050280756905
|
||||||
|
0.0,0.052226842718488986,0.03074039814488894,0.0014204842607524314,0.01637797660464469,0.022525374073740573,0.006430721206713545,0.012709093373135088,0.013169096520166011,0.009173231318596861,0.007254770344054158,0.09023383531205176,0.09354867756485294,0.11036573774681681,0.12381315549879986,0.0011981001392515705,0.007636325104666064,1.5917063911104562e-06,0.014746477553756472,0.05062194790299505,0.05958757523086564,0.0004966123940264624,0.005729233461488436,0.007166089559406575,7.64019067733019e-05,0.0306126068603455,0.018867178013714143,0.21327046547861703
|
||||||
|
0.0,0.18540060189078825,2.026568310551328e-05,0.0,1.68880692545944e-05,0.08221449874521645,3.3776138509188797e-06,4.053136621102656e-05,0.00018239114794961952,0.2600120918575863,3.3776138509188797e-06,3.3776138509188797e-06,9.119557397480976e-05,6.755227701837759e-06,4.390898006194544e-05,0.05670675894307707,0.0,3.3776138509188797e-06,0.0005843271962089662,1.013284155275664e-05,1.68880692545944e-05,0.3652484066106658,2.364329695643216e-05,0.011642634944117379,0.0,0.01438187977721259,3.37761385091888e-05,0.02330891318519119
|
||||||
|
0.0,0.5743844833643246,2.79704006032656e-05,0.0,2.0138688434351233e-05,0.2559571359204835,2.237632048261248e-06,1.5663424337828736e-05,0.00014768371518524237,0.09644529772813218,4.475264096522496e-06,2.1257504458481857e-05,3.915856084457184e-05,6.601014542370682e-05,7.94359377132743e-05,0.03731027677270805,1.3425792289567489e-05,5.59408012065312e-06,2.5732768555004354e-05,3.132684867565747e-05,3.692092879631059e-05,0.02981420941103287,7.831712168914368e-06,5.258435313413933e-05,2.237632048261248e-06,0.00013425792289567488,3.4683296748049344e-05,0.005319970194741118
|
||||||
|
0.0,0.2527091460771565,0.0013003901170351106,0.008669267446900737,0.0,0.045947117468573904,0.0,0.0,0.0034677069787602947,0.2492414390983962,0.0,0.0,0.0008669267446900737,0.0,0.00043346337234503684,0.03034243606415258,0.002600780234070221,0.0,0.0,0.002600780234070221,0.007368877329865626,0.014304291287386216,0.0,0.0013003901170351106,0.009102730819245773,0.0039011703511053317,0.0,0.3658430862592111
|
||||||
|
0.0,0.37652174563292656,0.0002512562814070352,0.00022957047140464225,0.0004613843024647045,0.13366460277578368,3.514596793491266e-05,9.496889207944485e-05,8.001316104331179e-05,0.20418461354391002,6.730078966259871e-06,0.0003701543431442929,0.0011328966259870784,0.0010469011725293131,0.0007941493180186647,0.10861898779612347,0.00036940655659248626,7.477865518066523e-07,0.0004120303900454654,0.0007156317300789662,0.0002736898779612347,0.05130638310600622,0.0006640344580043072,0.0017565506101938262,3.7389327590332615e-06,3.0659248624072745e-05,0.00010543790380473797,0.11686856903565446
|
||||||
|
0.0,0.42030782462700017,0.00013905412295474562,5.150152702027615e-06,0.00033475992563179497,0.11766811385957594,3.8626145265207114e-06,1.1587843579562133e-05,0.00012746627937518349,0.2313873481348722,6.308937059983828e-05,0.00010042797768953849,3.090091621216569e-05,9.656536316301779e-05,0.0001660924246403906,0.11382481240568783,3.8626145265207114e-06,0.0,0.0002060061080811046,7.338967600389351e-05,3.3475992563179495e-05,0.09470873311393682,0.00025750763510138075,0.005945851294490882,0.0,0.00251584959494049,0.0002600827114523946,0.011728185240692387
|
||||||
|
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
||||||
|
|
After Width: | Height: | Size: 224 KiB |
@@ -0,0 +1,27 @@
|
|||||||
|
letter,Male,Female
|
||||||
|
a,0.10419923843992125,0.11230333493368445
|
||||||
|
b,0.018546448249013497,0.015225074098649389
|
||||||
|
c,0.041480767577953326,0.04405987823025858
|
||||||
|
d,0.03868872722439995,0.026961499859145175
|
||||||
|
e,0.13138825792008238,0.18038344738539613
|
||||||
|
f,0.010247500256025038,0.007027086508994281
|
||||||
|
g,0.0180527572420696,0.017807501663033867
|
||||||
|
h,0.031508761634381516,0.03697185263156448
|
||||||
|
i,0.09337919525658041,0.10271520299704247
|
||||||
|
j,0.026664242619993696,0.012183083323972286
|
||||||
|
k,0.012803255631156278,0.004848323140290566
|
||||||
|
l,0.0509726758057992,0.06672341587576307
|
||||||
|
m,0.03320386129622267,0.030360801411648666
|
||||||
|
n,0.07989188838489009,0.08206189166389144
|
||||||
|
o,0.057005660062330925,0.03761362219276409
|
||||||
|
p,0.021467218695097115,0.011531157247822707
|
||||||
|
q,0.0018784195453980996,0.001950967247682959
|
||||||
|
r,0.0734505638264324,0.06822482369855525
|
||||||
|
s,0.05242163399917173,0.0432875249054165
|
||||||
|
t,0.03949576796436023,0.04783038894946737
|
||||||
|
u,0.031398878017324786,0.020461941209356852
|
||||||
|
v,0.011932256018971252,0.013217596918162086
|
||||||
|
w,0.0020867055933035655,0.0015356192415710282
|
||||||
|
x,0.002258056881804058,0.0006618534967773684
|
||||||
|
y,0.013323645811687641,0.011759717016584062
|
||||||
|
z,0.002253616045629267,0.002292394152504883
|
||||||
|
@@ -0,0 +1,7 @@
|
|||||||
|
sex,position,2-grams,3-grams,4-grams
|
||||||
|
female,prefix,"ma, ch, be, na, an, sa, es, jo, ju, me","mar, cha, est, chr, gra, dor, sar, rut, ben, mer","mari, chri, esth, grac, sara, dorc, ruth, rach, naom, jean"
|
||||||
|
female,suffix,"ne, ie, te, le, ce, ia, se, el, ah, th","ine, tte, lle, rah, nce, ene, nne, her, lie, rie","ette, line, tine, elle, ther, arie, ille, rcas, ruth, arah"
|
||||||
|
female,any,"ne, in, el, an, ie, ri, ra, li, ar, er","ine, tte, ett, mar, ari, lle, lin, eli, the, ell","ette, line, ther, mari, tine, elle, rist, chri, ance, hris"
|
||||||
|
male,prefix,"jo, je, pa, ch, ma, da, al, ju, be, fr","jea, jos, chr, pat, mar, jon, fra, dan, cha, ben","jean, chri, jose, jona, patr, fran, mich, emma, davi, dieu"
|
||||||
|
male,suffix,"in, el, an, ck, on, re, ce, er, se, is","ean, tin, ick, ier, ard, uel, ert, ain, iel, ise","jean, stin, rick, bert, seph, than, oise, avid, ndre, tian"
|
||||||
|
male,any,"an, er, el, ie, in, ri, is, on, en, re","ric, sti, jea, ean, tin, ris, ier, ist, ick, jos","jean, stin, rist, rick, chri, hris, usti, bert, jose, atha"
|
||||||
|
|
After Width: | Height: | Size: 38 KiB |
|
After Width: | Height: | Size: 455 KiB |
@@ -0,0 +1,29 @@
|
|||||||
|
^,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,$
|
||||||
|
0.0,0.08177487512565312,0.05064544871095967,0.07327059962450833,0.06495363065149523,0.061627841497941024,0.044425593979353147,0.05752526940789172,0.024078074472915836,0.016592203335813446,0.11910658380449511,0.017117380488251007,0.03302685352589563,0.08869583137140728,0.035182675783582286,0.013984289415344039,0.05503217558728984,0.00021346554218849907,0.05022750354744187,0.05679281701430199,0.024045325783562315,0.0015048422132203263,0.015574997094552255,0.0054400764881583534,0.00016613969233005726,0.0062853521103135605,0.002710153731134059,0.0
|
||||||
|
0.0,0.005939042406371192,0.02544883058808627,0.060741665942489004,0.03423882544765226,0.014545218880858291,0.0021159734936245364,0.009683458260117744,0.022014608259160358,0.0375319435995705,0.0012071918943162118,0.00483524835366186,0.06685892969977804,0.04498424729464408,0.22067338212572413,0.008778800791243883,0.010449662778691797,0.0001958961956369646,0.11578879149722136,0.04779808257392872,0.05734632827194516,0.041184450260777605,0.016855321085647877,0.0012893799222601413,0.0007600183229223588,0.005051176039980499,0.003479293182959682,0.1402042328307295
|
||||||
|
0.0,0.10740025063072939,0.008207325873470239,4.244850350569272e-05,0.003728085960065186,0.4237910943039645,3.691174217886323e-06,0.00013288227184390762,0.00039495564131383656,0.12999208243130264,3.1374980852033744e-05,2.7683806634147422e-05,0.046482956925842464,3.3220567960976905e-05,0.0004447864932553019,0.11562418678818012,9.227935544715807e-06,1.8455871089431614e-06,0.10180812169063161,0.0005019996936325399,4.060291639674955e-05,0.021691185291408975,2.214704530731794e-05,0.0056788715342181075,1.8455871089431614e-06,0.012356205694374466,1.291910976260213e-05,0.021538001561366694
|
||||||
|
0.0,0.10050204481617504,5.957221194601566e-06,0.012163156374077748,5.7338253998040075e-05,0.21854065952395846,8.191179142577154e-06,1.5637705635829112e-05,0.2589321085286557,0.08507730983805294,2.0105621531780285e-05,0.1057168473193994,0.04785063459298775,4.393450631018655e-05,2.978610597300783e-05,0.049474722021166005,5.957221194601566e-06,0.022256178383031452,0.010718530234386868,0.00032913647100173653,0.026146243823106274,0.0030076520506244655,1.1169789739877937e-05,1.71270109344795e-05,4.021124306356057e-05,0.017885067331492553,5.957221194601566e-06,0.04113833561197044
|
||||||
|
0.0,0.13123437278970704,4.496701950163427e-05,2.716757428223737e-05,0.01997097753616332,0.19341720307012325,3.6535703345077846e-05,0.0005414778598321794,0.001204741397481285,0.22505150128952298,0.023330388618097914,1.405219359426071e-05,0.000277296620260078,0.0018839307545372192,0.0003981454851707201,0.12731568440272087,7.494503250272378e-06,0.0,0.09698730337468113,0.00034755758823138154,4.215658078278213e-05,0.0051580918619999645,0.0001564477553494359,0.0013602523399244366,9.368129062840473e-07,0.04583919231738472,0.00014333237466145925,0.12520879217648806
|
||||||
|
0.0,0.031895413152132776,0.012309605339111969,0.00870721680660756,0.0298675163088017,0.01029528350039639,0.0020813614768936716,0.00574201810613231,0.0009116137714871438,0.010444608551166907,0.00043419130147119637,0.0026210201219160722,0.12830321631540123,0.027835024848523834,0.08354099609624636,0.013881382027362197,0.017625577146542382,3.717462802398891e-05,0.11198919346094116,0.05457966355708561,0.03163080498524293,0.014952763160862552,0.010523970116611377,0.0003372866531390005,0.007001151995776294,0.0020375037696743585,0.0024278373639262424,0.37798660543852275
|
||||||
|
0.0,0.24238728488124023,1.7778409899018633e-05,2.133409187882236e-05,4.977954771725217e-05,0.09607097141231688,0.010471483430521974,2.133409187882236e-05,0.00032712274214194285,0.18898094154458825,7.111363959607453e-06,5.689091167685962e-05,0.08809913241359693,7.111363959607453e-05,0.00019556250888920495,0.025647134120324277,1.066704593941118e-05,0.0,0.3050632911392405,0.00016711705305077514,0.0022969705589532072,0.02134475892476177,3.5556819798037263e-06,0.0007609159436779974,0.0,0.002400085336367515,3.5556819798037263e-06,0.015524107523823069
|
||||||
|
0.0,0.1356264584296794,0.001080252906268644,1.588607215100947e-05,0.0006989871746444167,0.31163178202896913,1.4120953023119531e-05,0.0013097183928943365,0.009215686966713384,0.0887413641546668,1.9416310406789355e-05,1.0590714767339647e-05,0.09359544175636414,8.649083726660713e-05,0.014772281981310918,0.05647322137771078,3.5302382557798828e-06,3.5302382557798828e-06,0.12695442815435615,0.00037420525511266755,0.00015709560238220477,0.14264104184391405,2.1181429534679295e-05,0.002757116077764088,8.825595639449706e-06,0.004749935573151832,3.883262081357871e-05,0.00899857731398292
|
||||||
|
0.0,0.23289963854076665,3.564871753738659e-05,4.690620728603499e-05,6.848306263761108e-05,0.26556981191549,1.7824358768693296e-05,3.940121412026939e-05,5.159682801463849e-05,0.12588969348669787,1.1257489748648397e-05,0.00019419169816418486,0.0004596808314031429,0.0005131539077092228,0.01605974725059266,0.07720574094452215,3.0019972663062393e-05,2.814372437162099e-06,0.09786511088158341,0.00017824358768693297,0.004121179372151034,0.016572901158301883,8.536929726058368e-05,0.00026642725738467874,0.0,0.010226491312501348,7.504993165765598e-06,0.15158116134140495
|
||||||
|
0.0,0.06228411243453732,0.005795446918619388,0.09845124598199072,0.03328635202876481,0.18937896710200938,0.005852137258390008,0.010564929010010733,0.0003991130242471751,0.00012739036120869017,0.0015853746742749012,0.004633946566422507,0.05203358674566824,0.029307928241758888,0.20644406259916737,0.010186993411539938,0.0038761205301784834,0.008475857408808376,0.03351050693544404,0.1267853384445865,0.034354671707545616,0.0007083034405823336,0.02195903569080437,0.0006861485951547353,0.002158142589888398,0.000965039002302149,0.0035929946378757934,0.052596254658219155
|
||||||
|
0.0,0.12490546761508407,4.295183412001663e-05,8.130168601288861e-05,4.295183412001663e-05,0.29643208317929476,7.669970378574399e-06,4.601982227144639e-06,0.0029621425602054325,0.02271998625541308,4.295183412001663e-05,3.374786966572735e-05,1.8407928908578555e-05,6.902973340716958e-05,5.6757780801450546e-05,0.36199192198719726,0.00021475917060008315,7.669970378574399e-06,0.00026691496917438906,3.988384596858687e-05,4.601982227144639e-06,0.1879909739788585,1.8407928908578555e-05,5.0621804498591024e-05,0.0,0.0005844517428473692,3.067988151429759e-06,0.0014066725674305445
|
||||||
|
0.0,0.17051293593552538,0.00010003301089359489,0.00018339385330492396,4.334763805389112e-05,0.16333056575336527,4.001320435743795e-05,1.0003301089359488e-05,0.0020973587950690394,0.09480795329125279,3.0009903268078467e-05,0.00013004291416167336,0.0040513369411905925,0.000266754695716253,0.00026342026201979984,0.04065008119346051,0.0005101683555573339,6.668867392906325e-06,0.002400792261446277,0.022800857616346728,0.00011337074567940754,0.031060249882461213,6.33542402326101e-05,0.007512479118108976,3.667877066098479e-05,0.03316094311122671,3.0009903268078467e-05,0.42578717643489017
|
||||||
|
0.0,0.14258031599664123,0.018121651433974158,0.0015416351691170474,0.010372152272561979,0.18043688111447329,0.003440854764523217,0.003236993073110312,0.0002837534353449891,0.16829829483469572,4.903700144796899e-05,0.0011846017203722846,0.08546157592801147,0.0018463257511476862,0.00020716755667906,0.0888495368482762,0.0208693764476935,4.297624846001776e-05,0.00014490709416647014,0.005683884347609529,0.0020215366102539125,0.020202693619018865,0.024516847791351416,0.001074406211500444,9.366618254106435e-06,0.0360691939639308,0.00015206980224313977,0.18330196434514115
|
||||||
|
0.0,0.31932204396028435,0.038368037519108326,0.00011873135546683685,0.00011081593176904773,0.17818311343296872,0.0006362021797098007,5.2439681997852944e-05,5.936567773341842e-05,0.20796192681201364,2.374627109336737e-05,0.00010092165214681132,0.00023053671519810822,0.04005699105062408,0.00020184330429362263,0.08672138203297763,0.010948020402004582,2.968283886670921e-06,0.0001612767578424534,0.005299376165669818,0.0002523041303670283,0.047970435892488755,0.0004581051465095455,0.006624220207087273,9.894279622236404e-07,0.018100595140919277,0.00036806720194719424,0.03766554366592954
|
||||||
|
0.0,0.09272466504327583,6.54951531625727e-05,0.0676682588133141,0.04243183895879178,0.2208676970453058,0.0016707146854644294,0.033734710117275545,0.0024899923798154134,0.0862990806755178,0.006808358436540491,0.00222879614025689,0.0004584660721380089,4.667019896015659e-05,0.05049048810360941,0.03840682843838769,0.0003961084113425055,0.0005533752162418567,0.005650231566546017,0.015463131131101295,0.04694629828845869,0.019948568656372817,0.007369969570245905,0.00011177316557684563,1.686402147299776e-05,0.014610517580350198,0.003478302475442259,0.23906279965503272
|
||||||
|
0.0,0.003164071575712822,0.016308332918058378,0.029613095334402047,0.041853890953758216,0.032495642848868175,0.0018930066187277572,0.011876432294110452,0.012510670409103,0.07853874127032717,0.0004316702272449279,0.00471407186694461,0.0692342092506865,0.04876514486192701,0.2053281179113374,0.0007785596452408519,0.01644424108555678,7.118999249916351e-05,0.17295026214096784,0.10292454961004063,0.022388605459236936,0.03825103015155055,0.011361275621116504,0.0008639876362398481,0.0006983091082417949,0.007007036807167667,0.002350564115972381,0.06718329028496059
|
||||||
|
0.0,0.26933300206615723,2.192209253680628e-05,0.0001096104626840314,0.00010778362163929754,0.08803364310467982,8.03810059682897e-05,2.5575774626273995e-05,0.34018520514511513,0.09085245883670416,2.9229456715741707e-05,2.5575774626273995e-05,0.04719826523174392,3.6536820894677135e-05,0.00012422519104190227,0.0354936946581341,0.013204407071336317,3.6536820894677134e-06,0.08679504487635026,0.003801656214091156,0.001415801809668739,0.0039459766566251305,1.8268410447338567e-06,0.0004694981484966012,3.6536820894677134e-06,0.01590082445336349,0.0,0.0028005473215770025
|
||||||
|
0.0,0.00038185048063353973,4.9806584430461705e-05,0.00014941975329138513,4.9806584430461705e-05,0.0014941975329138511,0.0,4.9806584430461705e-05,3.320438962030781e-05,0.00029883950658277027,0.0,0.0,3.320438962030781e-05,0.00011621536367107731,4.9806584430461705e-05,0.00018262414291169292,0.0,8.30109740507695e-05,8.30109740507695e-05,0.00023243072734215463,9.961316886092341e-05,0.9891919711785898,4.9806584430461705e-05,4.9806584430461705e-05,0.0,0.00011621536367107731,0.0,0.007205352547606794
|
||||||
|
0.0,0.1627793575149433,0.0035916362881505785,0.036496892212239035,0.040730844643505654,0.1503575411693532,0.0012619623047103333,0.015429811736916198,0.0012770756257248282,0.23819394035839017,0.00021692060750216365,0.000525410159974503,0.026246059979436992,0.01374067585882558,0.020855493981119907,0.0587779279725791,0.0009934786019822454,0.00013068577583122155,0.016929586592884027,0.006149343615133057,0.04790433801209155,0.02665767572236118,0.017815938419439997,0.00039161281805206186,2.5781547612962073e-05,0.01068822952333919,0.00010534873765986227,0.1017264302202411
|
||||||
|
0.0,0.11064837045719457,0.00019155378133028236,0.03693078718830975,0.00084374879871672,0.1936315534685894,6.319971696951493e-05,3.192563022171373e-05,0.02285614506485138,0.058747068868796735,1.1076239056512926e-05,0.0013578165996337023,0.008225084814671716,0.006192269176064638,0.0010717890145861037,0.04918110758481304,0.02873958263428148,0.0002521473244041472,0.005270335160478415,0.04403717185827366,0.20958133770996804,0.02021609090855783,8.274602118689069e-05,0.0006600135390733879,1.954630421737575e-06,0.024633555661684747,5.863891265212725e-06,0.17653570425659867
|
||||||
|
0.0,0.06351954441846226,1.4753288139094001e-05,0.0013484505359131916,1.99169389877769e-05,0.16353651069982222,1.84416101738675e-05,3.39325627199162e-05,0.18805131193614777,0.2149259016103214,1.4753288139094001e-05,1.69662813599581e-05,0.0001386809085074836,0.00011802630511275201,0.00019253041021517672,0.07676357118092694,5.38495017076931e-05,2.2129932208641e-06,0.06875179805699196,0.010426886392304685,0.08126775004979235,0.009717990897221218,0.0003732581899190782,0.0005030871255431055,1.10649661043205e-05,0.008581250046104025,3.98338779755538e-05,0.11155772592816623
|
||||||
|
0.0,0.018356080355773654,0.01654418269968032,0.045542802543262595,0.07255847970462294,0.18801741120403903,0.0036002028947896146,0.04503556556922605,0.002494898139737888,0.03139678907199226,0.0007679331862739315,0.011800927182003705,0.10299977587203472,0.02639873544642752,0.06185106106897243,0.0003314734644285327,0.009195144680499687,9.672891132789921e-05,0.06532858339329739,0.1206587002937256,0.050628148113196415,0.00014745260873155367,0.0076580986870819715,0.0010781734750451204,0.0062708645441355145,0.020688190815471907,0.010810045651327663,0.07974355042289408
|
||||||
|
0.0,0.12830504252517705,1.01674063454783e-05,3.304407062280448e-05,0.00014742739200943535,0.30917049215330417,5.08370317273915e-06,7.625554759108725e-06,0.00013471813407758747,0.5037873588636906,2.541851586369575e-06,7.625554759108725e-06,0.0008693132425383947,2.541851586369575e-05,0.00019572257215045728,0.04173211934501568,5.08370317273915e-06,0.0,0.00263081639189251,5.8462586486500224e-05,2.7960367450065326e-05,0.00580050532009537,4.321147696828278e-05,0.00031010589353708815,0.0,0.002524058625264988,5.08370317273915e-06,0.004161011046886994
|
||||||
|
0.0,0.36613233287858116,0.0001534788540245566,1.7053206002728514e-05,0.0002899045020463847,0.16732605729877217,1.7053206002728514e-05,6.821282401091406e-05,0.001892905866302865,0.4231412005457026,1.7053206002728514e-05,5.1159618008185536e-05,0.0003751705320600273,0.00042633015006821284,0.0036493860845839016,0.013250341064120055,1.7053206002728514e-05,5.1159618008185536e-05,0.0007162346521145975,0.0006309686221009549,0.0020804911323328784,0.007929740791268758,3.410641200545703e-05,3.410641200545703e-05,0.0,0.004280354706684857,6.821282401091406e-05,0.007349931787175989
|
||||||
|
0.0,0.42772328342798144,3.971248163297724e-05,0.012668281640919741,0.00015884992653190897,0.0298439299471824,1.985624081648862e-05,3.971248163297724e-05,0.0013899368571542036,0.19326079186688377,0.0,3.971248163297724e-05,0.00013899368571542036,0.00013899368571542036,0.00011913744489893174,0.02454231364917994,0.00230332393471268,3.971248163297724e-05,1.985624081648862e-05,0.0007148246693935904,0.002521742583694055,0.0016480679877685556,1.985624081648862e-05,0.0005956872244946587,7.942496326595449e-05,0.005877447281680632,0.00011913744489893174,0.29593741312894645
|
||||||
|
0.0,0.07903953601739935,0.0008749671887304226,0.020399235028686423,0.025884029348899416,0.017196855117933077,0.0011899553766733747,0.0022049173156006648,0.00033248753171756057,0.011642063422621652,9.499643763358873e-05,0.0003999850005624789,0.07594465207554717,0.013214504456082897,0.04049848130695099,0.024181593190255365,0.006899741259702761,5.749784383085634e-05,0.01051710560853968,0.03773608489681637,0.0037698586303013637,0.0075047185730535105,0.028846418259315276,0.00024249090659100284,0.00047998200067497467,6.749746884491831e-05,0.0010774595952651777,0.5897028861417697
|
||||||
|
0.0,0.40388373911101183,0.0005723698905517087,0.0004886084431538977,0.0008794951976770158,0.21811480902389993,2.792048246593701e-05,0.00011168192986374804,0.000991177127540764,0.20157192316283226,2.792048246593701e-05,0.00015356265356265356,0.0005584096493187402,0.0017589903953540316,0.0011168192986374804,0.07377987491623855,5.584096493187402e-05,2.792048246593701e-05,0.004844203707840071,6.980120616484253e-05,4.1880723698905516e-05,0.027669198123743577,6.980120616484253e-05,0.0018846325664507483,1.3960241232968505e-05,0.018092472637927185,0.004606879606879607,0.03858610676792495
|
||||||
|
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
||||||
|
@@ -0,0 +1,3 @@
|
|||||||
|
category,l2,kl_mf,kl_fm,jsd,permutation_p_value
|
||||||
|
names,0.3189041485139616,0.04320097944655348,0.0215380760498496,0.03236952774820154,0.978
|
||||||
|
surnames,1.2770018925640299,0.2936188220992242,0.23989460296618093,0.26675671253270256,0.001
|
||||||
|
|
After Width: | Height: | Size: 48 KiB |
|
After Width: | Height: | Size: 158 KiB |
@@ -0,0 +1,23 @@
|
|||||||
|
services:
|
||||||
|
app:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
image: drc-ners:uv
|
||||||
|
working_dir: /app
|
||||||
|
tty: true
|
||||||
|
stdin_open: true
|
||||||
|
environment:
|
||||||
|
NERS_ENV: production
|
||||||
|
STREAMLIT_SERVER_ADDRESS: 0.0.0.0
|
||||||
|
PYTHONPATH: /app/src
|
||||||
|
# expose Streamlit for `ners web run`
|
||||||
|
ports:
|
||||||
|
- "8501:8501"
|
||||||
|
volumes:
|
||||||
|
- ./src:/app/src
|
||||||
|
- ./assets:/app/assets
|
||||||
|
- ./config:/app/config
|
||||||
|
- ./data:/app/data
|
||||||
|
# default command shows CLI help; override per run
|
||||||
|
command: ["ners", "--help"]
|
||||||
@@ -30,7 +30,7 @@ llm:
|
|||||||
# Data handling configuration
|
# Data handling configuration
|
||||||
data:
|
data:
|
||||||
split_evaluation: false
|
split_evaluation: false
|
||||||
max_dataset_size: 100_000
|
max_dataset_size: 10_000
|
||||||
balance_by_sex: true
|
balance_by_sex: true
|
||||||
|
|
||||||
# Enhanced logging for development
|
# Enhanced logging for development
|
||||||
|
|||||||
@@ -73,37 +73,6 @@ baseline_experiments:
|
|||||||
batch_size: 32
|
batch_size: 32
|
||||||
tags: [ "baseline", "neural", "cnn", "surname" ]
|
tags: [ "baseline", "neural", "cnn", "surname" ]
|
||||||
|
|
||||||
## Ensemble Models
|
|
||||||
- name: "ensemble"
|
|
||||||
description: "Baseline Ensemble with multiple models"
|
|
||||||
model_type: "ensemble"
|
|
||||||
features: [ "full_name" ]
|
|
||||||
model_params:
|
|
||||||
base_models: [ "logistic_regression", "random_forest", "xgboost" ]
|
|
||||||
voting: "soft"
|
|
||||||
cv_folds: 5
|
|
||||||
tags: [ "baseline", "ensemble" ]
|
|
||||||
|
|
||||||
- name: "ensemble_native"
|
|
||||||
description: "Baseline Ensemble with native name"
|
|
||||||
model_type: "ensemble"
|
|
||||||
features: [ "native_name" ]
|
|
||||||
model_params:
|
|
||||||
base_models: [ "logistic_regression", "random_forest", "xgboost" ]
|
|
||||||
voting: "soft"
|
|
||||||
cv_folds: 5
|
|
||||||
tags: [ "baseline", "ensemble", "native" ]
|
|
||||||
|
|
||||||
- name: "ensemble_surname"
|
|
||||||
description: "Baseline Ensemble with surname"
|
|
||||||
model_type: "ensemble"
|
|
||||||
features: [ "surname" ]
|
|
||||||
model_params:
|
|
||||||
base_models: [ "logistic_regression", "random_forest", "xgboost" ]
|
|
||||||
voting: "soft"
|
|
||||||
cv_folds: 5
|
|
||||||
tags: [ "baseline", "ensemble", "surname" ]
|
|
||||||
|
|
||||||
# LightGBM Models
|
# LightGBM Models
|
||||||
- name: "lightgbm"
|
- name: "lightgbm"
|
||||||
description: "Baseline LightGBM with engineered features"
|
description: "Baseline LightGBM with engineered features"
|
||||||
@@ -262,40 +231,6 @@ baseline_experiments:
|
|||||||
min_samples_leaf: 1
|
min_samples_leaf: 1
|
||||||
tags: [ "baseline", "random_forest", "engineered", "surname" ]
|
tags: [ "baseline", "random_forest", "engineered", "surname" ]
|
||||||
|
|
||||||
# SVM Models
|
|
||||||
- name: "svm"
|
|
||||||
description: "Baseline SVM with full name features"
|
|
||||||
model_type: "svm"
|
|
||||||
features: [ "full_name" ]
|
|
||||||
model_params:
|
|
||||||
C: 1.0
|
|
||||||
kernel: "rbf"
|
|
||||||
ngram_range: [ 2, 4 ]
|
|
||||||
max_features: 5000
|
|
||||||
tags: [ "baseline", "svm" ]
|
|
||||||
|
|
||||||
- name: "svm_native"
|
|
||||||
description: "Baseline SVM with native name features"
|
|
||||||
model_type: "svm"
|
|
||||||
features: [ "native_name" ]
|
|
||||||
model_params:
|
|
||||||
C: 1.0
|
|
||||||
kernel: "rbf"
|
|
||||||
ngram_range: [ 2, 4 ]
|
|
||||||
max_features: 5000
|
|
||||||
tags: [ "baseline", "svm", "native" ]
|
|
||||||
|
|
||||||
- name: "svm_surname"
|
|
||||||
description: "Baseline SVM with surname features"
|
|
||||||
model_type: "svm"
|
|
||||||
features: [ "surname" ]
|
|
||||||
model_params:
|
|
||||||
C: 1.0
|
|
||||||
kernel: "rbf"
|
|
||||||
ngram_range: [ 2, 4 ]
|
|
||||||
max_features: 5000
|
|
||||||
tags: [ "baseline", "svm", "surname" ]
|
|
||||||
|
|
||||||
# Transformer Models
|
# Transformer Models
|
||||||
- name: "transformer"
|
- name: "transformer"
|
||||||
description: "Baseline Transformer with attention mechanism"
|
description: "Baseline Transformer with attention mechanism"
|
||||||
|
|||||||
@@ -1,90 +0,0 @@
|
|||||||
#!.venv/bin/python3
|
|
||||||
import argparse
|
|
||||||
import sys
|
|
||||||
import traceback
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from core.config import setup_config
|
|
||||||
from processing.monitoring.pipeline_monitor import PipelineMonitor
|
|
||||||
|
|
||||||
|
|
||||||
def _confirm(prompt: str) -> bool:
    """Ask the user a yes/no question on stdin; only an explicit "y" confirms."""
    return input(prompt).strip().lower() == "y"


def main() -> int:
    """Entry point of the checkpoint monitoring command line.

    Sub-commands:
        clean: call ``clean_step_checkpoints`` for one step (``--step``) or
            for every step in ``monitor.steps``, passing ``--keep-last``.
        reset: call ``reset_step`` for one step, or for every step when
            ``--all`` is given or no ``--step`` is specified.

    Without a sub-command the help text and the detailed pipeline status
    are printed.

    Returns:
        Process exit code: 0 on success or when the user cancels, 1 when no
        sub-command was given or when any step raised an exception.
    """
    choices = [
        "data_cleaning",
        "data_selection",
        "feature_extraction",
        "ner_annotation",
        "llm_annotation",
        "data_splitting",
    ]

    parser = argparse.ArgumentParser(description="DRC NERS Processing Monitoring")
    parser.add_argument("--config", type=Path, help="Path to configuration file")
    parser.add_argument("--env", type=str, default="development", help="Environment")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Clean command
    clean_parser = subparsers.add_parser("clean", help="Clean checkpoint files")
    clean_parser.add_argument("--step", type=str, choices=choices, help="default: all")
    clean_parser.add_argument("--keep-last", type=int, default=1, help="(default: 1)")
    clean_parser.add_argument("--force", action="store_true", help="Clean without confirmation")

    # Reset command
    reset_parser = subparsers.add_parser("reset", help="Reset pipeline step")
    reset_parser.add_argument("--step", type=str, choices=choices, help="(default: all)")
    reset_parser.add_argument("--all", action="store_true", help="Reset all steps")
    reset_parser.add_argument("--force", action="store_true", help="Reset without confirmation")
    args = parser.parse_args()

    # Top-level boundary: any failure is reported with a traceback and turned
    # into a non-zero exit code instead of propagating.
    try:
        setup_config(config_path=args.config, env=args.env)
        monitor = PipelineMonitor()

        if not args.command:
            parser.print_help()
            monitor.print_status(detailed=True)
            return 1

        if args.command == "clean":
            checkpoint_info = monitor.count_checkpoint_files()
            print(f"Current checkpoint storage: {checkpoint_info['total_size_mb']:.1f} MB")

            if not args.force and not _confirm(
                "Are you sure you want to clean checkpoints? (y/N): "
            ):
                print("Cancelled")
                return 0

            steps = [args.step] if args.step else monitor.steps
            for step in steps:
                monitor.clean_step_checkpoints(step, args.keep_last)

            print("Checkpoint cleaning completed")
            return 0

        if args.command == "reset":
            # --all was previously parsed but ignored; it now explicitly
            # selects every step. Omitting --step keeps meaning "all steps".
            reset_all = args.all or not args.step
            # Name the real target in the prompt instead of printing "None".
            target = "all steps" if reset_all else args.step

            if not args.force and not _confirm(
                f"Are you sure you want to reset {target}? "
                "This will delete all checkpoints. (y/N): "
            ):
                print("Cancelled")
                return 0

            steps = monitor.steps if reset_all else [args.step]
            for step in steps:
                monitor.reset_step(step)

            print("Reset completed")
            return 0

        return 0

    except Exception as e:
        print(f"Monitoring failed: {e}")
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())
|
|
||||||
@@ -1,107 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"metadata": {},
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"source": "# Qualitative Analysis",
|
|
||||||
"id": "d20715dd63f57364"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"id": "c93a55c8",
|
|
||||||
"metadata": {
|
|
||||||
"ExecuteTime": {
|
|
||||||
"end_time": "2025-09-21T13:34:50.973298Z",
|
|
||||||
"start_time": "2025-09-21T13:34:50.969142Z"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import geopandas as gpd\n",
|
|
||||||
"import matplotlib.pyplot as plt\n",
|
|
||||||
"import seaborn as sns\n",
|
|
||||||
"import sys\n",
|
|
||||||
"import os\n",
|
|
||||||
"\n",
|
|
||||||
"sys.path.append(os.path.abspath(\"..\"))\n",
|
|
||||||
"from core.utils.data_loader import DataLoader\n",
|
|
||||||
"from core.config.pipeline_config import PipelineConfig"
|
|
||||||
],
|
|
||||||
"outputs": [],
|
|
||||||
"execution_count": 3
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"id": "c0b00261",
|
|
||||||
"metadata": {
|
|
||||||
"ExecuteTime": {
|
|
||||||
"end_time": "2025-09-21T13:34:51.002610Z",
|
|
||||||
"start_time": "2025-09-21T13:34:50.998586Z"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"config = PipelineConfig(\n",
|
|
||||||
" paths={\n",
|
|
||||||
" \"root_dir\": \"../data\",\n",
|
|
||||||
" \"data_dir\": \"../data/dataset\",\n",
|
|
||||||
" \"models_dir\": \"../models\",\n",
|
|
||||||
" \"outputs_dir\": \"../data/processed\",\n",
|
|
||||||
" \"logs_dir\": \"../logs\",\n",
|
|
||||||
" \"configs_dir\": \"../configs\",\n",
|
|
||||||
" \"checkpoints_dir\": \"../checkpoints\"\n",
|
|
||||||
" }\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"loader = DataLoader(config)"
|
|
||||||
],
|
|
||||||
"outputs": [],
|
|
||||||
"execution_count": 4
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"metadata": {
|
|
||||||
"ExecuteTime": {
|
|
||||||
"end_time": "2025-09-21T13:35:27.430639Z",
|
|
||||||
"start_time": "2025-09-21T13:34:51.013412Z"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"cell_type": "code",
|
|
||||||
"outputs": [],
|
|
||||||
"execution_count": 5,
|
|
||||||
"source": [
|
|
||||||
"gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
|
|
||||||
"gdf_proj = gdf.to_crs(epsg=32732)\n",
|
|
||||||
"gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
|
|
||||||
"\n",
|
|
||||||
"df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
|
|
||||||
],
|
|
||||||
"id": "b38394ce38864379"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"metadata": {},
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"source": "## Exploration",
|
|
||||||
"id": "a1af5626d2a948d6"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.11.11"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
@@ -1,107 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"metadata": {},
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"source": "# Quantitative Analysis",
|
|
||||||
"id": "a605c0f92056a825"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"id": "c93a55c8",
|
|
||||||
"metadata": {
|
|
||||||
"ExecuteTime": {
|
|
||||||
"end_time": "2025-09-21T14:14:47.287549Z",
|
|
||||||
"start_time": "2025-09-21T14:14:47.279199Z"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import geopandas as gpd\n",
|
|
||||||
"import matplotlib.pyplot as plt\n",
|
|
||||||
"import seaborn as sns\n",
|
|
||||||
"import sys\n",
|
|
||||||
"import os\n",
|
|
||||||
"\n",
|
|
||||||
"sys.path.append(os.path.abspath(\"..\"))\n",
|
|
||||||
"from core.utils.data_loader import DataLoader\n",
|
|
||||||
"from core.config.pipeline_config import PipelineConfig"
|
|
||||||
],
|
|
||||||
"outputs": [],
|
|
||||||
"execution_count": 30
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"id": "c0b00261",
|
|
||||||
"metadata": {
|
|
||||||
"ExecuteTime": {
|
|
||||||
"end_time": "2025-09-21T14:14:47.315980Z",
|
|
||||||
"start_time": "2025-09-21T14:14:47.308376Z"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"config = PipelineConfig(\n",
|
|
||||||
" paths={\n",
|
|
||||||
" \"root_dir\": \"../data\",\n",
|
|
||||||
" \"data_dir\": \"../data/dataset\",\n",
|
|
||||||
" \"models_dir\": \"../models\",\n",
|
|
||||||
" \"outputs_dir\": \"../data/processed\",\n",
|
|
||||||
" \"logs_dir\": \"../logs\",\n",
|
|
||||||
" \"configs_dir\": \"../configs\",\n",
|
|
||||||
" \"checkpoints_dir\": \"../checkpoints\"\n",
|
|
||||||
" }\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"loader = DataLoader(config)"
|
|
||||||
],
|
|
||||||
"outputs": [],
|
|
||||||
"execution_count": 31
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"metadata": {
|
|
||||||
"ExecuteTime": {
|
|
||||||
"end_time": "2025-09-21T14:15:47.899044Z",
|
|
||||||
"start_time": "2025-09-21T14:14:47.339266Z"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"cell_type": "code",
|
|
||||||
"source": [
|
|
||||||
"gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
|
|
||||||
"gdf_proj = gdf.to_crs(epsg=32732)\n",
|
|
||||||
"gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
|
|
||||||
"\n",
|
|
||||||
"df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
|
|
||||||
],
|
|
||||||
"id": "b38394ce38864379",
|
|
||||||
"outputs": [],
|
|
||||||
"execution_count": 32
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"metadata": {},
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"source": "## Exploration",
|
|
||||||
"id": "a1af5626d2a948d6"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.11.11"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
@@ -0,0 +1,41 @@
|
|||||||
|
[project]
|
||||||
|
name = "ners"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Add your description here"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = [
|
||||||
|
"geopandas>=1.1.1",
|
||||||
|
"joblib>=1.5.2",
|
||||||
|
"lightgbm>=4.6.0",
|
||||||
|
"matplotlib>=3.10.6",
|
||||||
|
"numpy>=2.3.3",
|
||||||
|
"ollama>=0.6.0",
|
||||||
|
"pandas>=2.3.3",
|
||||||
|
"plotly>=6.3.1",
|
||||||
|
"psutil>=7.1.0",
|
||||||
|
"pydantic>=2.11.10",
|
||||||
|
"pyyaml>=6.0.3",
|
||||||
|
"scikit-learn>=1.7.2",
|
||||||
|
"seaborn>=0.13.2",
|
||||||
|
"spacy>=3.8.7",
|
||||||
|
"streamlit>=1.50.0",
|
||||||
|
"tqdm>=4.67.1",
|
||||||
|
"typer>=0.19.2",
|
||||||
|
"tensorflow==2.20.0; sys_platform == 'linux' and platform_machine == 'x86_64'",
|
||||||
|
"xgboost>=3.0.5",
|
||||||
|
"networkx>=3.5",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
ners = "ners.cli:app"
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["uv_build>=0.8.12,<0.9.0"]
|
||||||
|
build-backend = "uv_build"
|
||||||
|
|
||||||
|
[dependency-groups]
|
||||||
|
dev = [
|
||||||
|
"ipykernel>=6.30.1",
|
||||||
|
"ruff>=0.13.3",
|
||||||
|
]
|
||||||
@@ -1,170 +0,0 @@
|
|||||||
absl-py==2.3.0
|
|
||||||
altair==5.1.2
|
|
||||||
annotated-types==0.7.0
|
|
||||||
anyio==4.9.0
|
|
||||||
appnope==0.1.4
|
|
||||||
argon2-cffi==25.1.0
|
|
||||||
argon2-cffi-bindings==21.2.0
|
|
||||||
arrow==1.3.0
|
|
||||||
asttokens==3.0.0
|
|
||||||
astunparse==1.6.3
|
|
||||||
async-lru==2.0.5
|
|
||||||
attrs==25.3.0
|
|
||||||
babel==2.17.0
|
|
||||||
beautifulsoup4==4.13.4
|
|
||||||
black==25.1.0
|
|
||||||
bleach==6.2.0
|
|
||||||
blinker==1.9.0
|
|
||||||
cachetools==6.1.0
|
|
||||||
certifi==2025.6.15
|
|
||||||
cffi==1.17.1
|
|
||||||
charset-normalizer==3.4.2
|
|
||||||
click==8.2.1
|
|
||||||
comm==0.2.2
|
|
||||||
contourpy==1.3.2
|
|
||||||
cycler==0.12.1
|
|
||||||
debugpy==1.8.14
|
|
||||||
decorator==5.2.1
|
|
||||||
defusedxml==0.7.1
|
|
||||||
executing==2.2.0
|
|
||||||
fastjsonschema==2.21.1
|
|
||||||
flake8==7.3.0
|
|
||||||
flatbuffers==25.2.10
|
|
||||||
fonttools==4.58.4
|
|
||||||
fqdn==1.5.1
|
|
||||||
gast==0.6.0
|
|
||||||
gitdb==4.0.12
|
|
||||||
GitPython==3.1.45
|
|
||||||
google-pasta==0.2.0
|
|
||||||
grpcio==1.73.0
|
|
||||||
h11==0.16.0
|
|
||||||
h5py==3.14.0
|
|
||||||
httpcore==1.0.9
|
|
||||||
httpx==0.28.1
|
|
||||||
idna==3.10
|
|
||||||
imbalanced-learn==0.13.0
|
|
||||||
ipykernel==6.29.5
|
|
||||||
ipython>=8.0,<9.0
|
|
||||||
ipython_pygments_lexers==1.1.1
|
|
||||||
isoduration==20.11.0
|
|
||||||
jedi==0.19.2
|
|
||||||
Jinja2==3.1.6
|
|
||||||
joblib==1.5.1
|
|
||||||
json5==0.12.0
|
|
||||||
jsonpointer==3.0.0
|
|
||||||
jsonschema==4.24.0
|
|
||||||
jsonschema-specifications==2025.4.1
|
|
||||||
jupyter-events==0.12.0
|
|
||||||
jupyter-lsp==2.2.5
|
|
||||||
jupyter_client==8.6.3
|
|
||||||
jupyter_core==5.8.1
|
|
||||||
jupyter_server==2.16.0
|
|
||||||
jupyter_server_terminals==0.5.3
|
|
||||||
jupyterlab==4.4.4
|
|
||||||
jupyterlab_pygments==0.3.0
|
|
||||||
jupyterlab_server==2.27.3
|
|
||||||
keras==3.10.0
|
|
||||||
kiwisolver==1.4.8
|
|
||||||
libclang==18.1.1
|
|
||||||
lightgbm~=4.6.0
|
|
||||||
Markdown==3.8.2
|
|
||||||
markdown-it-py==3.0.0
|
|
||||||
MarkupSafe==3.0.2
|
|
||||||
matplotlib==3.10.3
|
|
||||||
matplotlib-inline==0.1.7
|
|
||||||
mccabe==0.7.0
|
|
||||||
mdurl==0.1.2
|
|
||||||
mistune==3.1.3
|
|
||||||
ml-dtypes==0.3.2
|
|
||||||
mypy==1.17.0
|
|
||||||
mypy_extensions==1.1.0
|
|
||||||
namex==0.1.0
|
|
||||||
narwhals==2.0.1
|
|
||||||
nbclient==0.10.2
|
|
||||||
nbconvert==7.16.6
|
|
||||||
nbformat==5.10.4
|
|
||||||
nest-asyncio==1.6.0
|
|
||||||
nltk==3.9.1
|
|
||||||
notebook==7.4.4
|
|
||||||
notebook_shim==0.2.4
|
|
||||||
numpy==1.26.4
|
|
||||||
ollama~=0.5.1
|
|
||||||
opt_einsum==3.4.0
|
|
||||||
optree==0.16.0
|
|
||||||
overrides==7.7.0
|
|
||||||
packaging==25.0
|
|
||||||
pandas==2.3.0
|
|
||||||
pandocfilters==1.5.1
|
|
||||||
parso==0.8.4
|
|
||||||
pathspec==0.12.1
|
|
||||||
pexpect==4.9.0
|
|
||||||
pillow==11.2.1
|
|
||||||
platformdirs==4.3.8
|
|
||||||
plotly~=6.2.0
|
|
||||||
prometheus_client==0.22.1
|
|
||||||
prompt_toolkit==3.0.51
|
|
||||||
protobuf==4.25.8
|
|
||||||
psutil==7.0.0
|
|
||||||
ptyprocess==0.7.0
|
|
||||||
pure_eval==0.2.3
|
|
||||||
pyarrow==21.0.0
|
|
||||||
pycodestyle==2.14.0
|
|
||||||
pycparser==2.22
|
|
||||||
pydantic~=2.11.7
|
|
||||||
pydantic_core==2.33.2
|
|
||||||
pydeck==0.9.1
|
|
||||||
pyflakes==3.4.0
|
|
||||||
Pygments==2.19.1
|
|
||||||
pyparsing==3.2.3
|
|
||||||
python-dateutil==2.9.0.post0
|
|
||||||
python-json-logger==3.3.0
|
|
||||||
pytz==2025.2
|
|
||||||
PyYAML~=6.0.2
|
|
||||||
pyzmq==27.0.0
|
|
||||||
referencing==0.36.2
|
|
||||||
regex==2024.11.6
|
|
||||||
requests==2.32.4
|
|
||||||
rfc3339-validator==0.1.4
|
|
||||||
rfc3986-validator==0.1.1
|
|
||||||
rich==14.0.0
|
|
||||||
rpds-py==0.26.0
|
|
||||||
scikit-learn~=1.6.1
|
|
||||||
scipy==1.15.3
|
|
||||||
seaborn==0.13.2
|
|
||||||
Send2Trash==1.8.3
|
|
||||||
six==1.17.0
|
|
||||||
sklearn-compat==0.1.3
|
|
||||||
smmap==5.0.2
|
|
||||||
sniffio==1.3.1
|
|
||||||
soupsieve==2.7
|
|
||||||
spacy~=3.8.7
|
|
||||||
stack-data==0.6.3
|
|
||||||
streamlit~=1.47.1
|
|
||||||
tenacity==9.1.2
|
|
||||||
tensorboard==2.16.2
|
|
||||||
tensorboard-data-server==0.7.2
|
|
||||||
tensorflow==2.16.2
|
|
||||||
tensorflow-io-gcs-filesystem==0.37.1
|
|
||||||
termcolor==3.1.0
|
|
||||||
terminado==0.18.1
|
|
||||||
threadpoolctl==3.6.0
|
|
||||||
tinycss2==1.4.0
|
|
||||||
toml==0.10.2
|
|
||||||
toolz==1.0.0
|
|
||||||
tornado==6.5.1
|
|
||||||
tqdm==4.67.1
|
|
||||||
traitlets==5.14.3
|
|
||||||
types-python-dateutil==2.9.0.20250516
|
|
||||||
types-PyYAML==6.0.12.20250516
|
|
||||||
typing-inspection==0.4.1
|
|
||||||
typing_extensions==4.14.0
|
|
||||||
tzdata==2025.2
|
|
||||||
uri-template==1.3.0
|
|
||||||
urllib3==2.5.0
|
|
||||||
wcwidth==0.2.13
|
|
||||||
webcolors==24.11.1
|
|
||||||
webencodings==0.5.1
|
|
||||||
websocket-client==1.8.0
|
|
||||||
Werkzeug==3.1.3
|
|
||||||
wrapt==1.17.2
|
|
||||||
xgboost~=3.0.3
|
|
||||||
@@ -1,52 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
from sklearn.base import BaseEstimator
|
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
||||||
from sklearn.pipeline import Pipeline
|
|
||||||
from sklearn.svm import SVC
|
|
||||||
|
|
||||||
from research.traditional_model import TraditionalModel
|
|
||||||
|
|
||||||
|
|
||||||
class SVMModel(TraditionalModel):
    """Support Vector Machine with character n-grams and RBF kernel"""

    def build_model(self) -> BaseEstimator:
        """Build the TF-IDF + SVC pipeline from ``self.config.model_params``.

        Recognised keys (all optional): ``ngram_range``, ``max_features``,
        ``kernel``, ``C``, ``gamma``, ``class_weight`` and ``cache_size``.

        Returns:
            A scikit-learn ``Pipeline`` with a ``"vectorizer"`` step followed
            by a ``"classifier"`` step.
        """
        params = self.config.model_params

        # TF-IDF downweights very common patterns; char n-grams (2,4) are effective
        # for distinguishing name morphology under RBF kernels.
        vectorizer = TfidfVectorizer(
            analyzer="char",
            # Values loaded from YAML arrive as a list, while scikit-learn's
            # parameter validation expects a tuple; normalise either form.
            ngram_range=tuple(params.get("ngram_range", (2, 4))),
            max_features=params.get("max_features", 5000),
        )

        # RBF kernel captures non-linear interactions between n-grams; probability=True
        # adds calibration at some cost. Larger cache helps speed kernel computations.
        classifier = SVC(
            kernel=params.get("kernel", "rbf"),
            C=params.get("C", 1.0),
            gamma=params.get("gamma", "scale"),
            probability=True,  # Enable probability prediction
            class_weight=params.get("class_weight", None),
            cache_size=params.get("cache_size", 1000),
            random_state=self.config.random_seed,
            verbose=2,
        )

        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Turn the configured feature columns into one text value per row.

        Each feature in ``self.config.features`` whose ``.value`` names a
        column of ``X`` is cast to ``str``; when several are present they are
        joined with a single space, in configuration order.

        Args:
            X: Input frame containing some or all configured feature columns.

        Returns:
            Array of strings, one entry per row of ``X``.

        Raises:
            ValueError: If none of the configured features is a column of
                ``X`` (previously surfaced as an opaque ``IndexError``).
        """
        columns = [
            X[feature_type.value].astype(str)
            for feature_type in self.config.features
            if feature_type.value in X.columns
        ]

        if not columns:
            raise ValueError(
                "None of the configured features are present in the input columns"
            )

        combined = columns[0]
        for column in columns[1:]:
            combined = combined + " " + column
        return combined.values
|
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
"""DRC NERS NLP package."""
|
||||||
|
|
||||||
|
__all__: list[str] = []
|
||||||
@@ -0,0 +1,225 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from ners.core.config import setup_config, PipelineConfig
|
||||||
|
|
||||||
|
app = typer.Typer(help="DRC NERS command-line interface", no_args_is_help=True)
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Pipeline commands
|
||||||
|
# -------------------------
|
||||||
|
pipeline_app = typer.Typer(help="Data processing pipeline")
|
||||||
|
app.add_typer(pipeline_app, name="pipeline")
|
||||||
|
|
||||||
|
|
||||||
|
@pipeline_app.command("run")
def pipeline_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Run the full processing pipeline."""
    # Imported inside the command rather than at module import time.
    from ners.main import run_pipeline as execute_pipeline

    # Load the configuration, run the pipeline with it, and hand the value it
    # returns to Typer as the process exit status.
    settings = setup_config(config_path=config, env=env)
    raise typer.Exit(execute_pipeline(settings))
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# NER commands
|
||||||
|
# -------------------------
|
||||||
|
ner_app = typer.Typer(help="NER dataset and model")
|
||||||
|
app.add_typer(ner_app, name="ner")
|
||||||
|
|
||||||
|
|
||||||
|
def _load_config(config: Optional[Path], env: str) -> PipelineConfig:
    """Return the result of ``setup_config`` for the given file and environment.

    Args:
        config: Optional path to a configuration file, forwarded as
            ``config_path``.
        env: Environment name, forwarded as ``env``.
    """
    settings = setup_config(config_path=config, env=env)
    return settings
|
||||||
|
|
||||||
|
|
||||||
|
@ner_app.command("feature")
def ner_feature(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    # `ner feature`: load the configuration and pass it to ners.ner.feature.
    # The callee is imported inside the command rather than at module import.
    from ners.ner import feature as run_feature

    settings = _load_config(config, env)
    run_feature(settings)
|
||||||
|
|
||||||
|
|
||||||
|
@ner_app.command("build")
def ner_build(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    # `ner build`: load the configuration and pass it to ners.ner.build.
    # The callee is imported inside the command rather than at module import.
    from ners.ner import build as run_build

    settings = _load_config(config, env)
    run_build(settings)
|
||||||
|
|
||||||
|
|
||||||
|
@ner_app.command("train")
def ner_train(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    # `ner train`: load the configuration and pass it to ners.ner.train.
    # The callee is imported inside the command rather than at module import.
    from ners.ner import train as run_train

    settings = _load_config(config, env)
    run_train(settings)
|
||||||
|
|
||||||
|
|
||||||
|
@ner_app.command("run")
def ner_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
    reset: bool = typer.Option(
        False, help="Reset intermediate outputs and rerun all steps"
    ),
) -> None:
    # `ner run`: call ners.ner.run_pipeline with the loaded configuration and
    # the reset flag, then exit with the value it returns.
    from ners.ner import run_pipeline as execute_ner_pipeline

    settings = _load_config(config, env)
    exit_code = execute_ner_pipeline(settings, reset)
    raise typer.Exit(exit_code)
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Research commands
|
||||||
|
# -------------------------
|
||||||
|
research_app = typer.Typer(help="Research experiments and training")
|
||||||
|
app.add_typer(research_app, name="research")
|
||||||
|
|
||||||
|
|
||||||
|
@research_app.command("train")
def research_train(
    name: str = typer.Option(..., "--name", help="Model name to train"),
    type: str = typer.Option(..., "--type", help="Experiment type"),
    templates: str = typer.Option(
        "research_templates.yaml", help="Templates file path"
    ),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    # `train` command of `research_app`: look up the experiment template
    # matching --name/--type in the templates file and train that single model.
    # NOTE: documented with comments rather than a docstring because Typer
    # would surface a docstring as the command's help text.

    # Function-scope imports: the research modules are only loaded when this
    # command runs.
    from ners.research.experiment.experiment_builder import ExperimentBuilder
    from ners.research.model_trainer import ModelTrainer

    pipeline_config = _load_config(config, env)

    # Resolve the experiment definition from the templates file.
    builder = ExperimentBuilder(pipeline_config)
    loaded_templates = builder.load_templates(templates)
    experiment = builder.find_template(loaded_templates, name, type)

    model_trainer = ModelTrainer(pipeline_config)
    # Map template keys onto the trainer's keyword arguments; only
    # `model_params` and `tags` have fallbacks when absent from the template.
    training_kwargs = {
        "model_name": experiment.get("name"),
        "model_type": experiment.get("model_type"),
        "features": experiment.get("features"),
        "model_params": experiment.get("model_params", {}),
        "tags": experiment.get("tags", []),
    }
    model_trainer.train_single_model(**training_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------
# Monitor commands
# -------------------------
# Typer sub-application collecting the checkpoint monitoring commands; it is
# mounted on the root `app` under the "monitor" name.
monitor_app = typer.Typer(help="Monitor pipeline checkpoints")
app.add_typer(monitor_app, name="monitor")
|
||||||
|
|
||||||
|
|
||||||
|
@monitor_app.command("status")
def monitor_status(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
    detailed: bool = typer.Option(
        False, help="Show detailed status (failed batch IDs)"
    ),
) -> None:
    # `status` command of `monitor_app`: delegate to
    # `PipelineMonitor.print_status`, forwarding the --detailed flag.
    # NOTE: documented with comments rather than a docstring because Typer
    # would surface a docstring as the command's help text.

    # The configuration is loaded before the monitor is imported and its
    # return value is not used here.
    # NOTE(review): presumably called for its setup side effects - confirm.
    _load_config(config, env)

    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    monitor = PipelineMonitor()
    monitor.print_status(detailed=detailed)
|
||||||
|
|
||||||
|
|
||||||
|
@monitor_app.command("clean")
def monitor_clean(
    step: Optional[str] = typer.Option(None, help="Step to clean; default all"),
    keep_last: int = typer.Option(
        # min=0 rejects negative counts at the CLI boundary. The checkpoint
        # cleanup in PipelineMonitor treats any non-positive count as "delete
        # every file", so a stray negative value would silently wipe all
        # checkpoints instead of keeping some.
        1,
        min=0,
        help="Number of latest checkpoint files to keep",
    ),
    force: bool = typer.Option(False, help="Do not ask for confirmation"),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    # `clean` command of `monitor_app`: prune checkpoint files for one step
    # (--step) or, when no step is given, for every step known to the monitor,
    # keeping the newest `keep_last` files of each.
    # Unless --force is passed the user must confirm; declining aborts the
    # command (typer.confirm(..., abort=True)).
    # NOTE: documented with comments rather than a docstring because Typer
    # would surface a docstring as the command's help text.

    # The configuration is loaded before the monitor is imported and its
    # return value is not used here.
    # NOTE(review): presumably called for its setup side effects - confirm.
    _ = _load_config(config, env)

    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    mon = PipelineMonitor()
    if not force:
        typer.confirm("Clean checkpoints?", abort=True)

    # A falsy --step (missing or empty) means "all steps".
    targets = [step] if step else mon.steps
    for target in targets:
        mon.clean_step_checkpoints(target, keep_last)
|
||||||
|
|
||||||
|
|
||||||
|
@monitor_app.command("reset")
def monitor_reset(
    step: Optional[str] = typer.Option(None, help="Step to reset; default all"),
    force: bool = typer.Option(False, help="Do not ask for confirmation"),
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    # `reset` command of `monitor_app`: call `PipelineMonitor.reset_step` for
    # the step given with --step, or for every step known to the monitor when
    # no step is given. Unless --force is passed the user must confirm first;
    # declining aborts the command.
    # NOTE: documented with comments rather than a docstring because Typer
    # would surface a docstring as the command's help text.

    # The configuration is loaded before the monitor is imported and its
    # return value is not used here.
    # NOTE(review): presumably called for its setup side effects - confirm.
    _load_config(config, env)

    from ners.processing.monitoring.pipeline_monitor import PipelineMonitor

    monitor = PipelineMonitor()
    if not force:
        typer.confirm(
            f"Reset {step or 'all steps'}? This deletes checkpoints.", abort=True
        )

    # A falsy --step (missing or empty) means "all steps".
    for target in [step] if step else monitor.steps:
        monitor.reset_step(target)
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------
# Web commands
# -------------------------
# Typer sub-application wrapping the web UI launcher; it is mounted on the
# root `app` under the "web" name.
web_app = typer.Typer(help="Web UI wrapper")
app.add_typer(web_app, name="web")
|
||||||
|
|
||||||
|
|
||||||
|
@web_app.command("run")
def web_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    # `run` command of `web_app`: start `web/app.py` (located next to this
    # module) with Streamlit in a child process of the current interpreter and
    # exit with the child's return code.
    # NOTE: documented with comments rather than a docstring because Typer
    # would surface a docstring as the command's help text.
    streamlit_script = Path(__file__).parent / "web" / "app.py"
    command = [sys.executable, "-m", "streamlit", "run", str(streamlit_script)]

    # Pass configuration via environment variables to avoid argparse in Streamlit
    child_env = dict(os.environ)
    if config is not None:
        child_env["NERS_CONFIG"] = str(config)
    child_env["NERS_ENV"] = env

    return_code = subprocess.call(command, env=child_env)
    raise typer.Exit(return_code)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the root Typer application when this module is executed as a script.
if __name__ == "__main__": # pragma: no cover
    app()
|
||||||
@@ -2,10 +2,9 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
from core.utils import ensure_directories
|
from ners.core.utils import ensure_directories
|
||||||
from .config_manager import ConfigManager
|
from ners.core.config.config_manager import ConfigManager
|
||||||
from .logging_config import LoggingConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from .pipeline_config import PipelineConfig
|
|
||||||
|
|
||||||
config_manager = ConfigManager()
|
config_manager = ConfigManager()
|
||||||
|
|
||||||
@@ -22,7 +21,9 @@ def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfi
|
|||||||
return config_manager.get_config()
|
return config_manager.get_config()
|
||||||
|
|
||||||
|
|
||||||
def setup_config(config_path: Optional[Path] = None, env: str = "development") -> PipelineConfig:
|
def setup_config(
|
||||||
|
config_path: Optional[Path] = None, env: str = "development"
|
||||||
|
) -> PipelineConfig:
|
||||||
"""
|
"""
|
||||||
Unified configuration loading and logging setup for all entrypoint scripts.
|
Unified configuration loading and logging setup for all entrypoint scripts.
|
||||||
|
|
||||||
@@ -5,8 +5,8 @@ from typing import Optional, Union, Dict, Any
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from core.config.project_paths import ProjectPaths
|
from ners.core.config.project_paths import ProjectPaths
|
||||||
|
|
||||||
|
|
||||||
class ConfigManager:
|
class ConfigManager:
|
||||||
@@ -36,7 +36,7 @@ class ConfigManager:
|
|||||||
|
|
||||||
def _setup_default_paths(self):
|
def _setup_default_paths(self):
|
||||||
"""Setup default project paths"""
|
"""Setup default project paths"""
|
||||||
root_dir = Path(__file__).parent.parent.parent
|
root_dir = Path(__file__).parent.parent.parent.parent.parent
|
||||||
self.default_paths = ProjectPaths(
|
self.default_paths = ProjectPaths(
|
||||||
root_dir=root_dir,
|
root_dir=root_dir,
|
||||||
configs_dir=root_dir / "config",
|
configs_dir=root_dir / "config",
|
||||||
@@ -53,7 +53,9 @@ class ConfigManager:
|
|||||||
self.config_path = config_path
|
self.config_path = config_path
|
||||||
|
|
||||||
if not self.config_path.exists():
|
if not self.config_path.exists():
|
||||||
logging.warning(f"Config file not found: {self.config_path}. Using defaults.")
|
logging.warning(
|
||||||
|
f"Config file not found: {self.config_path}. Using defaults."
|
||||||
|
)
|
||||||
return self._create_default_config()
|
return self._create_default_config()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -122,7 +124,11 @@ class ConfigManager:
|
|||||||
def _deep_update(self, base_dict: Dict, update_dict: Dict):
|
def _deep_update(self, base_dict: Dict, update_dict: Dict):
|
||||||
"""Recursively update nested dictionaries"""
|
"""Recursively update nested dictionaries"""
|
||||||
for key, value in update_dict.items():
|
for key, value in update_dict.items():
|
||||||
if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
|
if (
|
||||||
|
key in base_dict
|
||||||
|
and isinstance(base_dict[key], dict)
|
||||||
|
and isinstance(value, dict)
|
||||||
|
):
|
||||||
self._deep_update(base_dict[key], value)
|
self._deep_update(base_dict[key], value)
|
||||||
else:
|
else:
|
||||||
base_dict[key] = value
|
base_dict[key] = value
|
||||||
@@ -1,10 +1,10 @@
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from core.config.annotation_config import AnnotationConfig
|
from ners.core.config.annotation_config import AnnotationConfig
|
||||||
from core.config.data_config import DataConfig
|
from ners.core.config.data_config import DataConfig
|
||||||
from core.config.logging_config import LoggingConfig
|
from ners.core.config.logging_config import LoggingConfig
|
||||||
from core.config.processing_config import ProcessingConfig
|
from ners.core.config.processing_config import ProcessingConfig
|
||||||
from core.config.project_paths import ProjectPaths
|
from ners.core.config.project_paths import ProjectPaths
|
||||||
|
|
||||||
|
|
||||||
class PipelineConfig(BaseModel):
|
class PipelineConfig(BaseModel):
|
||||||
@@ -10,6 +10,8 @@ class ProcessingConfig(BaseModel):
|
|||||||
max_workers: int = 4
|
max_workers: int = 4
|
||||||
checkpoint_interval: int = 5
|
checkpoint_interval: int = 5
|
||||||
use_multiprocessing: bool = False
|
use_multiprocessing: bool = False
|
||||||
encoding_options: list = field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
|
encoding_options: list = field(
|
||||||
|
default_factory=lambda: ["utf-8", "utf-16", "latin1"]
|
||||||
|
)
|
||||||
chunk_size: int = 100_000
|
chunk_size: int = 100_000
|
||||||
epochs: int = 2
|
epochs: int = 2
|
||||||
@@ -4,13 +4,13 @@ from pathlib import Path
|
|||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from core.config import PipelineConfig
|
from ners.core.config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def temporary_config_override(**overrides):
|
def temporary_config_override(**overrides):
|
||||||
"""Context manager for temporarily overriding configuration"""
|
"""Context manager for temporarily overriding configuration"""
|
||||||
from core.config import get_config
|
from ners.core.config import get_config
|
||||||
|
|
||||||
config = get_config()
|
config = get_config()
|
||||||
original_values = {}
|
original_values = {}
|
||||||
@@ -5,7 +5,7 @@ from typing import Optional, Union, Iterator, Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
OPTIMIZED_DTYPES = {
|
OPTIMIZED_DTYPES = {
|
||||||
# Numeric columns with appropriate bit-width
|
# Numeric columns with appropriate bit-width
|
||||||
@@ -113,7 +113,9 @@ class DataLoader:
|
|||||||
sex_values = df["sex"].dropna().unique()
|
sex_values = df["sex"].dropna().unique()
|
||||||
|
|
||||||
if len(sex_values) == 0:
|
if len(sex_values) == 0:
|
||||||
logging.warning(f"No valid values found in sex column 'sex', using random sampling")
|
logging.warning(
|
||||||
|
"No valid values found in sex column 'sex', using random sampling"
|
||||||
|
)
|
||||||
return df.sample(n=max_size, random_state=self.config.data.random_seed)
|
return df.sample(n=max_size, random_state=self.config.data.random_seed)
|
||||||
|
|
||||||
# Calculate samples per sex category
|
# Calculate samples per sex category
|
||||||
@@ -140,18 +142,22 @@ class DataLoader:
|
|||||||
logging.info(f"Sampled {current_samples} records for sex '{sex}'")
|
logging.info(f"Sampled {current_samples} records for sex '{sex}'")
|
||||||
|
|
||||||
if not balanced_samples:
|
if not balanced_samples:
|
||||||
logging.warning("No balanced samples could be created, using random sampling")
|
logging.warning(
|
||||||
|
"No balanced samples could be created, using random sampling"
|
||||||
|
)
|
||||||
return df.sample(n=max_size, random_state=self.config.data.random_seed)
|
return df.sample(n=max_size, random_state=self.config.data.random_seed)
|
||||||
|
|
||||||
# Create result using iloc with indices (no copying until final step)
|
# Create result using iloc with indices (no copying until final step)
|
||||||
result = df.iloc[balanced_samples].copy()
|
result = df.iloc[balanced_samples].copy()
|
||||||
|
|
||||||
# Shuffle the final result
|
# Shuffle the final result
|
||||||
result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
|
result = result.sample(
|
||||||
drop=True
|
frac=1, random_state=self.config.data.random_seed
|
||||||
)
|
).reset_index(drop=True)
|
||||||
|
|
||||||
logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total")
|
logging.info(
|
||||||
|
f"Created balanced dataset with {len(result)} records from {len(df)} total"
|
||||||
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
class PromptManager:
|
class PromptManager:
|
||||||
@@ -19,9 +19,15 @@ class RegionMapper:
|
|||||||
return (
|
return (
|
||||||
series.str.upper()
|
series.str.upper()
|
||||||
.str.strip()
|
.str.strip()
|
||||||
.apply(lambda x: unicodedata.normalize("NFKD", x)
|
.apply(
|
||||||
|
lambda x: (
|
||||||
|
unicodedata.normalize("NFKD", x)
|
||||||
.encode("ascii", errors="ignore")
|
.encode("ascii", errors="ignore")
|
||||||
.decode("utf-8") if isinstance(x, str) else x)
|
.decode("utf-8")
|
||||||
|
if isinstance(x, str)
|
||||||
|
else x
|
||||||
|
)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -2,7 +2,7 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
from typing import Dict, Any
|
from typing import Dict, Any
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
class StateManager:
|
class StateManager:
|
||||||
@@ -1,21 +1,17 @@
|
|||||||
#!.venv/bin/python3
|
#!.venv/bin/python3
|
||||||
import argparse
|
|
||||||
import logging
|
import logging
|
||||||
import sys
|
from ners.core.utils.data_loader import DataLoader
|
||||||
import traceback
|
from ners.processing.batch.batch_config import BatchConfig
|
||||||
|
from ners.processing.pipeline import Pipeline
|
||||||
from core.config import setup_config
|
from ners.processing.steps.data_cleaning_step import DataCleaningStep
|
||||||
from core.utils.data_loader import DataLoader
|
from ners.processing.steps.data_selection_step import DataSelectionStep
|
||||||
from processing.batch.batch_config import BatchConfig
|
from ners.processing.steps.data_splitting_step import DataSplittingStep
|
||||||
from processing.pipeline import Pipeline
|
from ners.processing.steps.llm_annotation_step import LLMAnnotationStep
|
||||||
from processing.steps.data_cleaning_step import DataCleaningStep
|
from ners.processing.steps.ner_annotation_step import NERAnnotationStep
|
||||||
from processing.steps.data_selection_step import DataSelectionStep
|
from ners.processing.steps.feature_extraction_step import FeatureExtractionStep
|
||||||
from processing.steps.data_splitting_step import DataSplittingStep
|
|
||||||
from processing.steps.feature_extraction_step import FeatureExtractionStep
|
|
||||||
|
|
||||||
|
|
||||||
def create_pipeline(config) -> Pipeline:
|
def create_pipeline(config) -> Pipeline:
|
||||||
"""Create pipeline from configuration"""
|
|
||||||
batch_config = BatchConfig(
|
batch_config = BatchConfig(
|
||||||
batch_size=config.processing.batch_size,
|
batch_size=config.processing.batch_size,
|
||||||
max_workers=config.processing.max_workers,
|
max_workers=config.processing.max_workers,
|
||||||
@@ -23,14 +19,13 @@ def create_pipeline(config) -> Pipeline:
|
|||||||
use_multiprocessing=config.processing.use_multiprocessing,
|
use_multiprocessing=config.processing.use_multiprocessing,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add steps based on configuration
|
|
||||||
pipeline = Pipeline(batch_config)
|
pipeline = Pipeline(batch_config)
|
||||||
steps = [
|
steps = [
|
||||||
DataCleaningStep(config),
|
DataCleaningStep(config),
|
||||||
FeatureExtractionStep(config),
|
FeatureExtractionStep(config),
|
||||||
DataSelectionStep(config),
|
DataSelectionStep(config),
|
||||||
# NERAnnotationStep(config),
|
NERAnnotationStep(config),
|
||||||
# LLMAnnotationStep(config),
|
LLMAnnotationStep(config),
|
||||||
]
|
]
|
||||||
|
|
||||||
for stage in config.stages:
|
for stage in config.stages:
|
||||||
@@ -42,7 +37,6 @@ def create_pipeline(config) -> Pipeline:
|
|||||||
|
|
||||||
|
|
||||||
def run_pipeline(config) -> int:
|
def run_pipeline(config) -> int:
|
||||||
"""Run the complete pipeline"""
|
|
||||||
try:
|
try:
|
||||||
logging.info(f"Starting pipeline: {config.name} v{config.version}")
|
logging.info(f"Starting pipeline: {config.name} v{config.version}")
|
||||||
|
|
||||||
@@ -79,27 +73,3 @@ def run_pipeline(config) -> int:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Pipeline failed: {e}", exc_info=True)
|
logging.error(f"Pipeline failed: {e}", exc_info=True)
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
"""Main entry point with unified configuration loading"""
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="DRC NERS Processing Pipeline",
|
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
||||||
)
|
|
||||||
parser.add_argument("--config", type=str, help="Path to configuration file")
|
|
||||||
parser.add_argument("--env", type=str, default="development", help="Environment name")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
try:
|
|
||||||
config = setup_config(config_path=args.config, env=args.env)
|
|
||||||
return run_pipeline(config)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Pipeline failed: {e}")
|
|
||||||
traceback.print_exc()
|
|
||||||
return 1
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
sys.exit(main())
|
|
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
#!.venv/bin/python3
|
||||||
|
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
|
||||||
|
|
||||||
|
|
||||||
|
def status(*, detailed: bool = False) -> None:
    """Delegate to `PipelineMonitor.print_status` on a fresh monitor.

    Args:
        detailed: Forwarded unchanged as the `detailed` keyword argument.
    """
    monitor = PipelineMonitor()
    monitor.print_status(detailed=detailed)
|
||||||
|
|
||||||
|
|
||||||
|
def clean_step(step: str, *, keep_last: int = 1) -> None:
    """Delegate to `PipelineMonitor.clean_step_checkpoints` on a fresh monitor.

    Args:
        step: Name of the pipeline step whose checkpoints are cleaned.
        keep_last: Forwarded as the second positional argument; presumably
            the number of newest checkpoint files to keep (confirm against
            `PipelineMonitor`).
    """
    monitor = PipelineMonitor()
    monitor.clean_step_checkpoints(step, keep_last)
|
||||||
|
|
||||||
|
|
||||||
|
def reset_step(step: str) -> None:
    """Delegate to `PipelineMonitor.reset_step` on a fresh monitor.

    Args:
        step: Name of the pipeline step to reset.
    """
    monitor = PipelineMonitor()
    monitor.reset_step(step)
|
||||||
@@ -1,29 +1,24 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import argparse
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
import traceback
|
import traceback
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from core.config import setup_config, PipelineConfig
|
from ners.core.config import PipelineConfig
|
||||||
from processing.ner.name_builder import NameBuilder
|
from ners.processing.ner.name_builder import NameBuilder
|
||||||
from processing.ner.name_engineering import NameEngineering
|
from ners.processing.ner.name_engineering import NameEngineering
|
||||||
from processing.ner.name_model import NameModel
|
from ners.processing.ner.name_model import NameModel
|
||||||
|
|
||||||
|
|
||||||
def feature(config: PipelineConfig):
|
def feature(config: PipelineConfig):
|
||||||
"""Apply feature engineering to create position-independent NER dataset."""
|
|
||||||
NameEngineering(config).compute()
|
NameEngineering(config).compute()
|
||||||
|
|
||||||
|
|
||||||
def build(config: PipelineConfig):
|
def build(config: PipelineConfig):
|
||||||
"""Build NER dataset using NERDataBuilder."""
|
|
||||||
NameBuilder(config).build()
|
NameBuilder(config).build()
|
||||||
|
|
||||||
|
|
||||||
def train(config: PipelineConfig):
|
def train(config: PipelineConfig):
|
||||||
"""Train the NER model."""
|
|
||||||
name_model = NameModel(config)
|
name_model = NameModel(config)
|
||||||
|
|
||||||
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
|
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
|
||||||
@@ -37,7 +32,9 @@ def train(config: PipelineConfig):
|
|||||||
split_idx = int(len(data) * 0.9)
|
split_idx = int(len(data) * 0.9)
|
||||||
train_data, eval_data = data[:split_idx], data[split_idx:]
|
train_data, eval_data = data[:split_idx], data[split_idx:]
|
||||||
|
|
||||||
logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
|
logging.info(
|
||||||
|
f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}"
|
||||||
|
)
|
||||||
name_model.train(
|
name_model.train(
|
||||||
data=train_data,
|
data=train_data,
|
||||||
epochs=config.processing.epochs,
|
epochs=config.processing.epochs,
|
||||||
@@ -75,21 +72,9 @@ def run_pipeline(config: PipelineConfig, reset: bool = False):
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="NER model management for DRC names")
|
|
||||||
parser.add_argument("--config", type=str, help="Path to configuration file")
|
|
||||||
parser.add_argument("--env", type=str, default="development", help="Environment name")
|
|
||||||
parser.add_argument("--reset", action="store_true", help="Reset all steps")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
config = setup_config(config_path=args.config, env=args.env)
|
logging.error("This module is no longer a CLI. Use 'ners ner ...' instead.")
|
||||||
return run_pipeline(config, args.reset)
|
return 1
|
||||||
|
except Exception:
|
||||||
except Exception as e:
|
|
||||||
print(f"Pipeline failed: {e}")
|
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
sys.exit(main())
|
|
||||||
@@ -8,4 +8,6 @@ class BatchConfig:
|
|||||||
batch_size: int = 1000
|
batch_size: int = 1000
|
||||||
max_workers: int = 4
|
max_workers: int = 4
|
||||||
checkpoint_interval: int = 5 # Save checkpoint every N batches
|
checkpoint_interval: int = 5 # Save checkpoint every N batches
|
||||||
use_multiprocessing: bool = False # Use ProcessPoolExecutor instead of ThreadPoolExecutor
|
use_multiprocessing: bool = (
|
||||||
|
False # Use ProcessPoolExecutor instead of ThreadPoolExecutor
|
||||||
|
)
|
||||||
@@ -4,9 +4,9 @@ from typing import Iterator
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.batch.batch_config import BatchConfig
|
from ners.processing.batch.batch_config import BatchConfig
|
||||||
from processing.batch.memory_monitor import MemoryMonitor
|
from ners.processing.batch.memory_monitor import MemoryMonitor
|
||||||
from processing.steps import PipelineStep
|
from ners.processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
class BatchProcessor:
|
class BatchProcessor:
|
||||||
@@ -33,7 +33,9 @@ class BatchProcessor:
|
|||||||
|
|
||||||
for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
|
for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
|
||||||
if step.batch_exists(batch_id):
|
if step.batch_exists(batch_id):
|
||||||
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
|
logging.info(
|
||||||
|
f"Batch {batch_id} already processed, loading from checkpoint"
|
||||||
|
)
|
||||||
processed_batch = step.load_batch(batch_id)
|
processed_batch = step.load_batch(batch_id)
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
@@ -80,7 +82,9 @@ class BatchProcessor:
|
|||||||
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""Memory-optimized concurrent processing"""
|
"""Memory-optimized concurrent processing"""
|
||||||
executor_class = (
|
executor_class = (
|
||||||
ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
|
ProcessPoolExecutor
|
||||||
|
if self.config.use_multiprocessing
|
||||||
|
else ThreadPoolExecutor
|
||||||
)
|
)
|
||||||
results = {}
|
results = {}
|
||||||
|
|
||||||
@@ -89,7 +93,9 @@ class BatchProcessor:
|
|||||||
future_to_batch = {}
|
future_to_batch = {}
|
||||||
for batch, batch_id in self.create_batches(df):
|
for batch, batch_id in self.create_batches(df):
|
||||||
if step.batch_exists(batch_id):
|
if step.batch_exists(batch_id):
|
||||||
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
|
logging.info(
|
||||||
|
f"Batch {batch_id} already processed, loading from checkpoint"
|
||||||
|
)
|
||||||
results[batch_id] = step.load_batch(batch_id)
|
results[batch_id] = step.load_batch(batch_id)
|
||||||
else:
|
else:
|
||||||
# Only copy if necessary for concurrent processing
|
# Only copy if necessary for concurrent processing
|
||||||
@@ -121,7 +127,9 @@ class BatchProcessor:
|
|||||||
del results
|
del results
|
||||||
self.memory_monitor.cleanup_memory()
|
self.memory_monitor.cleanup_memory()
|
||||||
|
|
||||||
result = self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
|
result = (
|
||||||
|
self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
|
||||||
|
)
|
||||||
|
|
||||||
# Final cleanup
|
# Final cleanup
|
||||||
del ordered_results
|
del ordered_results
|
||||||
@@ -131,7 +139,9 @@ class BatchProcessor:
|
|||||||
|
|
||||||
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""Process data using the configured strategy"""
|
"""Process data using the configured strategy"""
|
||||||
step.state.total_batches = (len(df) + self.config.batch_size - 1) // self.config.batch_size
|
step.state.total_batches = (
|
||||||
|
len(df) + self.config.batch_size - 1
|
||||||
|
) // self.config.batch_size
|
||||||
step.load_state()
|
step.load_state()
|
||||||
|
|
||||||
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
|
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
|
||||||
@@ -4,8 +4,8 @@ import shutil
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional, Dict
|
from typing import Optional, Dict
|
||||||
|
|
||||||
from core.config.config_manager import ConfigManager
|
from ners.core.config.config_manager import ConfigManager
|
||||||
from core.config.project_paths import ProjectPaths
|
from ners.core.config.project_paths import ProjectPaths
|
||||||
|
|
||||||
|
|
||||||
class PipelineMonitor:
|
class PipelineMonitor:
|
||||||
@@ -97,7 +97,10 @@ class PipelineMonitor:
|
|||||||
|
|
||||||
avg_completion = total_completion / len(self.steps)
|
avg_completion = total_completion / len(self.steps)
|
||||||
|
|
||||||
if avg_completion >= 100 and overall_status not in ["error", "completed_with_errors"]:
|
if avg_completion >= 100 and overall_status not in [
|
||||||
|
"error",
|
||||||
|
"completed_with_errors",
|
||||||
|
]:
|
||||||
overall_status = "completed"
|
overall_status = "completed"
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -121,7 +124,9 @@ class PipelineMonitor:
|
|||||||
print(f"{step_name.replace('_', ' ').title()}:")
|
print(f"{step_name.replace('_', ' ').title()}:")
|
||||||
print(f" Status: {step_status['status']}")
|
print(f" Status: {step_status['status']}")
|
||||||
print(f" Progress: {step_status['completion_percentage']:.1f}%")
|
print(f" Progress: {step_status['completion_percentage']:.1f}%")
|
||||||
print(f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}")
|
print(
|
||||||
|
f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}"
|
||||||
|
)
|
||||||
|
|
||||||
if step_status["failed_batches"] > 0:
|
if step_status["failed_batches"] > 0:
|
||||||
print(f" Failed Batches: {step_status['failed_batches']}")
|
print(f" Failed Batches: {step_status['failed_batches']}")
|
||||||
@@ -141,7 +146,10 @@ class PipelineMonitor:
|
|||||||
if step_dir.exists():
|
if step_dir.exists():
|
||||||
csv_files = list(step_dir.glob("*.csv"))
|
csv_files = list(step_dir.glob("*.csv"))
|
||||||
step_size = sum(f.stat().st_size for f in csv_files)
|
step_size = sum(f.stat().st_size for f in csv_files)
|
||||||
counts[step] = {"files": len(csv_files), "size_mb": step_size / (1024 * 1024)}
|
counts[step] = {
|
||||||
|
"files": len(csv_files),
|
||||||
|
"size_mb": step_size / (1024 * 1024),
|
||||||
|
}
|
||||||
total_size += step_size
|
total_size += step_size
|
||||||
else:
|
else:
|
||||||
counts[step] = {"files": 0, "size_mb": 0}
|
counts[step] = {"files": 0, "size_mb": 0}
|
||||||
@@ -160,7 +168,9 @@ class PipelineMonitor:
|
|||||||
csv_files = sorted(step_dir.glob("batch_*.csv"))
|
csv_files = sorted(step_dir.glob("batch_*.csv"))
|
||||||
|
|
||||||
if len(csv_files) <= keep_last:
|
if len(csv_files) <= keep_last:
|
||||||
logging.info(f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all")
|
logging.info(
|
||||||
|
f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all"
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files
|
files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files
|
||||||
@@ -3,7 +3,7 @@ from typing import List, Tuple, Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.steps.feature_extraction_step import NameCategory
|
from ners.processing.steps.feature_extraction_step import NameCategory
|
||||||
|
|
||||||
|
|
||||||
class BaseNameFormatter(ABC):
|
class BaseNameFormatter(ABC):
|
||||||
@@ -12,7 +12,9 @@ class BaseNameFormatter(ABC):
|
|||||||
Contains common logic for NER tagging and attribute computation.
|
Contains common logic for NER tagging and attribute computation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None):
|
def __init__(
|
||||||
|
self, connectors: List[str] = None, additional_surnames: List[str] = None
|
||||||
|
):
|
||||||
self.connectors = connectors or ["wa", "ya", "ka", "ba"]
|
self.connectors = connectors or ["wa", "ya", "ka", "ba"]
|
||||||
self.additional_surnames = additional_surnames or [
|
self.additional_surnames = additional_surnames or [
|
||||||
"jean",
|
"jean",
|
||||||
@@ -46,7 +48,9 @@ class BaseNameFormatter(ABC):
|
|||||||
end_pos = current_pos + len(word)
|
end_pos = current_pos + len(word)
|
||||||
|
|
||||||
# Determine tag based on word content
|
# Determine tag based on word content
|
||||||
if word in native_parts or any(connector in word for connector in self.connectors):
|
if word in native_parts or any(
|
||||||
|
connector in word for connector in self.connectors
|
||||||
|
):
|
||||||
tag = "NATIVE"
|
tag = "NATIVE"
|
||||||
elif word == surname or word in self.additional_surnames:
|
elif word == surname or word in self.additional_surnames:
|
||||||
tag = "SURNAME"
|
tag = "SURNAME"
|
||||||
@@ -72,7 +76,9 @@ class BaseNameFormatter(ABC):
|
|||||||
"words": words_count,
|
"words": words_count,
|
||||||
"length": length,
|
"length": length,
|
||||||
"identified_category": (
|
"identified_category": (
|
||||||
NameCategory.SIMPLE.value if words_count == 3 else NameCategory.COMPOSE.value
|
NameCategory.SIMPLE.value
|
||||||
|
if words_count == 3
|
||||||
|
else NameCategory.COMPOSE.value
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3,7 +3,7 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.ner.formats import BaseNameFormatter
|
from ners.processing.ner.formats import BaseNameFormatter
|
||||||
|
|
||||||
|
|
||||||
class ConnectorFormatter(BaseNameFormatter):
|
class ConnectorFormatter(BaseNameFormatter):
|
||||||
@@ -3,13 +3,15 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.ner.formats import BaseNameFormatter
|
from ners.processing.ner.formats import BaseNameFormatter
|
||||||
|
|
||||||
|
|
||||||
class ExtendedSurnameFormatter(BaseNameFormatter):
|
class ExtendedSurnameFormatter(BaseNameFormatter):
|
||||||
def transform(self, row: pd.Series) -> Dict:
|
def transform(self, row: pd.Series) -> Dict:
|
||||||
native_parts = self.parse_native_components(row["probable_native"])
|
native_parts = self.parse_native_components(row["probable_native"])
|
||||||
original_surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
original_surname = (
|
||||||
|
row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
||||||
|
)
|
||||||
|
|
||||||
# Add random additional surname
|
# Add random additional surname
|
||||||
additional_surname = random.choice(self.additional_surnames)
|
additional_surname = random.choice(self.additional_surnames)
|
||||||
@@ -22,7 +24,9 @@ class ExtendedSurnameFormatter(BaseNameFormatter):
|
|||||||
"identified_name": row["probable_native"],
|
"identified_name": row["probable_native"],
|
||||||
"probable_surname": combined_surname,
|
"probable_surname": combined_surname,
|
||||||
"identified_surname": combined_surname,
|
"identified_surname": combined_surname,
|
||||||
"ner_entities": str(self.create_ner_tags(full_name, native_parts, combined_surname)),
|
"ner_entities": str(
|
||||||
|
self.create_ner_tags(full_name, native_parts, combined_surname)
|
||||||
|
),
|
||||||
"transformation_type": self.transformation_type,
|
"transformation_type": self.transformation_type,
|
||||||
**self.compute_numeric_features(full_name),
|
**self.compute_numeric_features(full_name),
|
||||||
}
|
}
|
||||||
@@ -2,7 +2,7 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.ner.formats import BaseNameFormatter
|
from ners.processing.ner.formats import BaseNameFormatter
|
||||||
|
|
||||||
|
|
||||||
class NativeOnlyFormatter(BaseNameFormatter):
|
class NativeOnlyFormatter(BaseNameFormatter):
|
||||||
@@ -2,7 +2,7 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.ner.formats import BaseNameFormatter
|
from ners.processing.ner.formats import BaseNameFormatter
|
||||||
|
|
||||||
|
|
||||||
class OriginalFormatter(BaseNameFormatter):
|
class OriginalFormatter(BaseNameFormatter):
|
||||||
@@ -2,7 +2,7 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.ner.formats import BaseNameFormatter
|
from ners.processing.ner.formats import BaseNameFormatter
|
||||||
|
|
||||||
|
|
||||||
class PositionFlippedFormatter(BaseNameFormatter):
|
class PositionFlippedFormatter(BaseNameFormatter):
|
||||||
@@ -2,7 +2,7 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.ner.formats import BaseNameFormatter
|
from ners.processing.ner.formats import BaseNameFormatter
|
||||||
|
|
||||||
|
|
||||||
class ReducedNativeFormatter(BaseNameFormatter):
|
class ReducedNativeFormatter(BaseNameFormatter):
|
||||||
@@ -11,7 +11,9 @@ class ReducedNativeFormatter(BaseNameFormatter):
|
|||||||
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
||||||
|
|
||||||
# Keep only first native component + surname
|
# Keep only first native component + surname
|
||||||
reduced_native = native_parts[0] if len(native_parts) > 1 else row["probable_native"]
|
reduced_native = (
|
||||||
|
native_parts[0] if len(native_parts) > 1 else row["probable_native"]
|
||||||
|
)
|
||||||
full_name = f"{reduced_native} {surname}".strip()
|
full_name = f"{reduced_native} {surname}".strip()
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -20,7 +22,9 @@ class ReducedNativeFormatter(BaseNameFormatter):
|
|||||||
"identified_name": reduced_native,
|
"identified_name": reduced_native,
|
||||||
"probable_surname": surname,
|
"probable_surname": surname,
|
||||||
"identified_surname": surname,
|
"identified_surname": surname,
|
||||||
"ner_entities": str(self.create_ner_tags(full_name, [reduced_native], surname)),
|
"ner_entities": str(
|
||||||
|
self.create_ner_tags(full_name, [reduced_native], surname)
|
||||||
|
),
|
||||||
"transformation_type": self.transformation_type,
|
"transformation_type": self.transformation_type,
|
||||||
**self.compute_numeric_features(full_name),
|
**self.compute_numeric_features(full_name),
|
||||||
}
|
}
|
||||||
@@ -4,8 +4,8 @@ import logging
|
|||||||
import spacy
|
import spacy
|
||||||
from spacy.tokens import DocBin
|
from spacy.tokens import DocBin
|
||||||
|
|
||||||
from core.config import PipelineConfig
|
from ners.core.config import PipelineConfig
|
||||||
from core.utils.data_loader import DataLoader
|
from ners.core.utils.data_loader import DataLoader
|
||||||
from .name_tagger import NameTagger
|
from .name_tagger import NameTagger
|
||||||
|
|
||||||
|
|
||||||
@@ -20,7 +20,9 @@ class NameBuilder:
|
|||||||
self.tagger = NameTagger()
|
self.tagger = NameTagger()
|
||||||
|
|
||||||
def build(self) -> int:
|
def build(self) -> int:
|
||||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
|
filepath = self.config.paths.get_data_path(
|
||||||
|
self.config.data.output_files["engineered"]
|
||||||
|
)
|
||||||
df = self.data_loader.load_csv_complete(filepath)
|
df = self.data_loader.load_csv_complete(filepath)
|
||||||
df = df[["name", "ner_tagged", "ner_entities"]]
|
df = df[["name", "ner_tagged", "ner_entities"]]
|
||||||
|
|
||||||
@@ -38,7 +40,9 @@ class NameBuilder:
|
|||||||
|
|
||||||
# Use NERNameTagger for parsing and validation
|
# Use NERNameTagger for parsing and validation
|
||||||
parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
|
parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
|
||||||
validated_entities = self.tagger.validate_entities(ner_df["name"], parsed_entities)
|
validated_entities = self.tagger.validate_entities(
|
||||||
|
ner_df["name"], parsed_entities
|
||||||
|
)
|
||||||
|
|
||||||
# Drop rows with no valid entities
|
# Drop rows with no valid entities
|
||||||
mask = validated_entities.map(bool)
|
mask = validated_entities.map(bool)
|
||||||
@@ -51,22 +55,33 @@ class NameBuilder:
|
|||||||
|
|
||||||
# Prepare training data
|
# Prepare training data
|
||||||
training_data = list(
|
training_data = list(
|
||||||
zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
|
zip(
|
||||||
|
ner_df["name"].tolist(),
|
||||||
|
[{"entities": ents} for ents in validated_entities],
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use NERNameTagger to create spaCy DocBin
|
# Use NERNameTagger to create spaCy DocBin
|
||||||
docs = self.tagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
|
docs = self.tagger.create_docs(
|
||||||
|
nlp, ner_df["name"].tolist(), validated_entities.tolist()
|
||||||
|
)
|
||||||
doc_bin = DocBin(docs=docs)
|
doc_bin = DocBin(docs=docs)
|
||||||
|
|
||||||
# Save
|
# Save
|
||||||
json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
|
json_path = self.config.paths.get_data_path(
|
||||||
spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
|
self.config.data.output_files["ner_data"]
|
||||||
|
)
|
||||||
|
spacy_path = self.config.paths.get_data_path(
|
||||||
|
self.config.data.output_files["ner_spacy"]
|
||||||
|
)
|
||||||
|
|
||||||
with open(json_path, "w", encoding="utf-8") as f:
|
with open(json_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
|
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
|
||||||
doc_bin.to_disk(spacy_path)
|
doc_bin.to_disk(spacy_path)
|
||||||
|
|
||||||
logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
|
logging.info(
|
||||||
|
f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}"
|
||||||
|
)
|
||||||
logging.info(f"Saved NER JSON to {json_path}")
|
logging.info(f"Saved NER JSON to {json_path}")
|
||||||
logging.info(f"Saved NER spacy to {spacy_path}")
|
logging.info(f"Saved NER spacy to {spacy_path}")
|
||||||
return 0
|
return 0
|
||||||
@@ -6,14 +6,14 @@ import numpy as np
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from core.config import PipelineConfig
|
from ners.core.config import PipelineConfig
|
||||||
from core.utils.data_loader import DataLoader
|
from ners.core.utils.data_loader import DataLoader
|
||||||
from processing.ner.formats.connectors_format import ConnectorFormatter
|
from ners.processing.ner.formats.connectors_format import ConnectorFormatter
|
||||||
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
|
from ners.processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
|
||||||
from processing.ner.formats.native_only_format import NativeOnlyFormatter
|
from ners.processing.ner.formats.native_only_format import NativeOnlyFormatter
|
||||||
from processing.ner.formats.original_format import OriginalFormatter
|
from ners.processing.ner.formats.original_format import OriginalFormatter
|
||||||
from processing.ner.formats.position_flipped_format import PositionFlippedFormatter
|
from ners.processing.ner.formats.position_flipped_format import PositionFlippedFormatter
|
||||||
from processing.ner.formats.reduced_native_format import ReducedNativeFormatter
|
from ners.processing.ner.formats.reduced_native_format import ReducedNativeFormatter
|
||||||
|
|
||||||
|
|
||||||
class NameEngineering:
|
class NameEngineering:
|
||||||
@@ -44,42 +44,60 @@ class NameEngineering:
|
|||||||
# Initialize format classes
|
# Initialize format classes
|
||||||
self.formatters = {
|
self.formatters = {
|
||||||
"original": OriginalFormatter(self.connectors, self.additional_surnames),
|
"original": OriginalFormatter(self.connectors, self.additional_surnames),
|
||||||
"native_only": NativeOnlyFormatter(self.connectors, self.additional_surnames),
|
"native_only": NativeOnlyFormatter(
|
||||||
"position_flipped": PositionFlippedFormatter(self.connectors, self.additional_surnames),
|
self.connectors, self.additional_surnames
|
||||||
"reduced_native": ReducedNativeFormatter(self.connectors, self.additional_surnames),
|
),
|
||||||
"connector_added": ConnectorFormatter(self.connectors, self.additional_surnames),
|
"position_flipped": PositionFlippedFormatter(
|
||||||
"extended_surname": ExtendedSurnameFormatter(self.connectors, self.additional_surnames),
|
self.connectors, self.additional_surnames
|
||||||
|
),
|
||||||
|
"reduced_native": ReducedNativeFormatter(
|
||||||
|
self.connectors, self.additional_surnames
|
||||||
|
),
|
||||||
|
"connector_added": ConnectorFormatter(
|
||||||
|
self.connectors, self.additional_surnames
|
||||||
|
),
|
||||||
|
"extended_surname": ExtendedSurnameFormatter(
|
||||||
|
self.connectors, self.additional_surnames
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
def load_data(self) -> pd.DataFrame:
|
def load_data(self) -> pd.DataFrame:
|
||||||
"""Load and filter NER-tagged data from CSV file"""
|
"""Load and filter NER-tagged data from CSV file"""
|
||||||
|
|
||||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
filepath = self.config.paths.get_data_path(
|
||||||
|
self.config.data.output_files["featured"]
|
||||||
|
)
|
||||||
df = self.data_loader.load_csv_complete(filepath)
|
df = self.data_loader.load_csv_complete(filepath)
|
||||||
|
|
||||||
# Filter only NER-tagged rows
|
# Filter only NER-tagged rows
|
||||||
ner_data = df[df["ner_tagged"] == 1].copy()
|
ner_data = df[df["ner_tagged"] == 1].copy()
|
||||||
logging.info(f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records")
|
logging.info(
|
||||||
|
f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records"
|
||||||
|
)
|
||||||
|
|
||||||
return ner_data
|
return ner_data
|
||||||
|
|
||||||
def compute(self) -> None:
|
def compute(self) -> None:
|
||||||
logging.info("Applying feature engineering transformations...")
|
logging.info("Applying feature engineering transformations...")
|
||||||
input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
input_filepath = self.config.paths.get_data_path(
|
||||||
|
self.config.data.output_files["featured"]
|
||||||
|
)
|
||||||
output_filepath = self.config.paths.get_data_path(
|
output_filepath = self.config.paths.get_data_path(
|
||||||
self.config.data.output_files["engineered"]
|
self.config.data.output_files["engineered"]
|
||||||
)
|
)
|
||||||
|
|
||||||
df = self.data_loader.load_csv_complete(input_filepath)
|
df = self.data_loader.load_csv_complete(input_filepath)
|
||||||
ner_df = df[df["ner_tagged"] == 1].copy()
|
ner_df = df[df["ner_tagged"] == 1].copy()
|
||||||
logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records")
|
logging.info(
|
||||||
|
f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records"
|
||||||
|
)
|
||||||
|
|
||||||
del df # No need to keep in memory
|
del df # No need to keep in memory
|
||||||
gc.collect()
|
gc.collect()
|
||||||
|
|
||||||
ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
|
ner_df = ner_df.sample(
|
||||||
drop=True
|
frac=1, random_state=self.config.data.random_seed
|
||||||
)
|
).reset_index(drop=True)
|
||||||
total_rows = len(ner_df)
|
total_rows = len(ner_df)
|
||||||
|
|
||||||
# Calculate split points
|
# Calculate split points
|
||||||
@@ -94,7 +112,11 @@ class NameEngineering:
|
|||||||
(0, split_25_1, "original"), # First 25%: original format
|
(0, split_25_1, "original"), # First 25%: original format
|
||||||
(split_25_1, split_25_2, "native_only"), # Second 25%: remove surname
|
(split_25_1, split_25_2, "native_only"), # Second 25%: remove surname
|
||||||
(split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions
|
(split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions
|
||||||
(split_25_3, split_10_1, "reduced_native"), # Fourth 10%: reduce native components
|
(
|
||||||
|
split_25_3,
|
||||||
|
split_10_1,
|
||||||
|
"reduced_native",
|
||||||
|
), # Fourth 10%: reduce native components
|
||||||
(split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors
|
(split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors
|
||||||
(split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames
|
(split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames
|
||||||
]
|
]
|
||||||
@@ -11,7 +11,7 @@ from spacy.training import Example
|
|||||||
from spacy.util import minibatch
|
from spacy.util import minibatch
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
class NameModel:
|
class NameModel:
|
||||||
@@ -29,6 +29,15 @@ class NameModel:
|
|||||||
"""Create a blank spaCy model with NER pipeline"""
|
"""Create a blank spaCy model with NER pipeline"""
|
||||||
logging.info(f"Creating blank {language} model for NER training")
|
logging.info(f"Creating blank {language} model for NER training")
|
||||||
|
|
||||||
|
# Prefer GPU for spaCy if available (falls back to CPU automatically)
|
||||||
|
try:
|
||||||
|
if spacy.prefer_gpu():
|
||||||
|
logging.info("spaCy GPU enabled (cupy) for NER training")
|
||||||
|
else:
|
||||||
|
logging.info("spaCy running on CPU")
|
||||||
|
except Exception as e:
|
||||||
|
logging.debug(f"spaCy GPU selection skipped: {e}")
|
||||||
|
|
||||||
# Create blank model - French tokenizer works well for DRC names
|
# Create blank model - French tokenizer works well for DRC names
|
||||||
self.nlp = spacy.blank(language)
|
self.nlp = spacy.blank(language)
|
||||||
|
|
||||||
@@ -78,7 +87,9 @@ class NameModel:
|
|||||||
|
|
||||||
# Handle different annotation formats from NERNameTagger
|
# Handle different annotation formats from NERNameTagger
|
||||||
if not isinstance(annotations, dict) or "entities" not in annotations:
|
if not isinstance(annotations, dict) or "entities" not in annotations:
|
||||||
logging.warning(f"Skipping invalid annotations at index {i}: {annotations}")
|
logging.warning(
|
||||||
|
f"Skipping invalid annotations at index {i}: {annotations}"
|
||||||
|
)
|
||||||
skipped_count += 1
|
skipped_count += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -115,7 +126,9 @@ class NameModel:
|
|||||||
valid_entities = []
|
valid_entities = []
|
||||||
for entity in entities:
|
for entity in entities:
|
||||||
if not isinstance(entity, (list, tuple)) or len(entity) != 3:
|
if not isinstance(entity, (list, tuple)) or len(entity) != 3:
|
||||||
logging.warning(f"Skipping invalid entity format in '{text}': {entity}")
|
logging.warning(
|
||||||
|
f"Skipping invalid entity format in '{text}': {entity}"
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
start, end, label = entity
|
start, end, label = entity
|
||||||
@@ -129,21 +142,30 @@ class NameModel:
|
|||||||
or start < 0
|
or start < 0
|
||||||
or end > len(text)
|
or end > len(text)
|
||||||
):
|
):
|
||||||
logging.warning(f"Skipping invalid entity bounds in '{text}': {entity}")
|
logging.warning(
|
||||||
|
f"Skipping invalid entity bounds in '{text}': {entity}"
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check for overlaps with already validated entities
|
# Check for overlaps with already validated entities
|
||||||
has_overlap = any(
|
has_overlap = any(
|
||||||
start < v_end and end > v_start for v_start, v_end, _ in valid_entities
|
start < v_end and end > v_start
|
||||||
|
for v_start, v_end, _ in valid_entities
|
||||||
)
|
)
|
||||||
|
|
||||||
if has_overlap:
|
if has_overlap:
|
||||||
logging.warning(f"Skipping overlapping entity in '{text}': {entity}")
|
logging.warning(
|
||||||
|
f"Skipping overlapping entity in '{text}': {entity}"
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Validate that the span doesn't contain spaces (matching tagger validation)
|
# Validate that the span doesn't contain spaces (matching tagger validation)
|
||||||
span_text = text[start:end]
|
span_text = text[start:end]
|
||||||
if not span_text or span_text != span_text.strip() or " " in span_text:
|
if (
|
||||||
|
not span_text
|
||||||
|
or span_text != span_text.strip()
|
||||||
|
or " " in span_text
|
||||||
|
):
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
|
f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
|
||||||
)
|
)
|
||||||
@@ -152,7 +174,9 @@ class NameModel:
|
|||||||
valid_entities.append((start, end, label))
|
valid_entities.append((start, end, label))
|
||||||
|
|
||||||
if not valid_entities:
|
if not valid_entities:
|
||||||
logging.warning(f"Skipping training example with no valid entities: '{text}'")
|
logging.warning(
|
||||||
|
f"Skipping training example with no valid entities: '{text}'"
|
||||||
|
)
|
||||||
skipped_count += 1
|
skipped_count += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -210,7 +234,9 @@ class NameModel:
|
|||||||
batches = minibatch(examples, size=batch_size)
|
batches = minibatch(examples, size=batch_size)
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
batch_losses = {}
|
batch_losses = {}
|
||||||
self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
|
self.nlp.update(
|
||||||
|
batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer
|
||||||
|
)
|
||||||
logging.info(
|
logging.info(
|
||||||
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
|
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
|
||||||
)
|
)
|
||||||
@@ -233,7 +259,9 @@ class NameModel:
|
|||||||
"dropout_rate": dropout_rate,
|
"dropout_rate": dropout_rate,
|
||||||
}
|
}
|
||||||
|
|
||||||
logging.info(f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}")
|
logging.info(
|
||||||
|
f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}"
|
||||||
|
)
|
||||||
|
|
||||||
def evaluate(self, test_data: List[Tuple[str, Dict]]) -> Dict[str, Any]:
|
def evaluate(self, test_data: List[Tuple[str, Dict]]) -> Dict[str, Any]:
|
||||||
"""Evaluate the trained model on test data"""
|
"""Evaluate the trained model on test data"""
|
||||||
@@ -282,10 +310,14 @@ class NameModel:
|
|||||||
entity_stats[label]["fp"] += 1
|
entity_stats[label]["fp"] += 1
|
||||||
|
|
||||||
# Calculate overall metrics
|
# Calculate overall metrics
|
||||||
precision = correct_entities / predicted_entities if predicted_entities > 0 else 0
|
precision = (
|
||||||
|
correct_entities / predicted_entities if predicted_entities > 0 else 0
|
||||||
|
)
|
||||||
recall = correct_entities / actual_entities if actual_entities > 0 else 0
|
recall = correct_entities / actual_entities if actual_entities > 0 else 0
|
||||||
f1_score = (
|
f1_score = (
|
||||||
2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
|
2 * (precision * recall) / (precision + recall)
|
||||||
|
if (precision + recall) > 0
|
||||||
|
else 0
|
||||||
)
|
)
|
||||||
|
|
||||||
# Calculate per-label metrics
|
# Calculate per-label metrics
|
||||||
@@ -295,7 +327,11 @@ class NameModel:
|
|||||||
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
|
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
|
||||||
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
|
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
|
||||||
label_f1 = (
|
label_f1 = (
|
||||||
(2 * (label_precision * label_recall) / (label_precision + label_recall))
|
(
|
||||||
|
2
|
||||||
|
* (label_precision * label_recall)
|
||||||
|
/ (label_precision + label_recall)
|
||||||
|
)
|
||||||
if (label_precision + label_recall) > 0
|
if (label_precision + label_recall) > 0
|
||||||
else 0
|
else 0
|
||||||
)
|
)
|
||||||
@@ -385,7 +421,9 @@ class NameModel:
|
|||||||
"label": ent.label_,
|
"label": ent.label_,
|
||||||
"start": ent.start_char,
|
"start": ent.start_char,
|
||||||
"end": ent.end_char,
|
"end": ent.end_char,
|
||||||
"confidence": getattr(ent, "score", None), # If confidence scores are available
|
"confidence": getattr(
|
||||||
|
ent, "score", None
|
||||||
|
), # If confidence scores are available
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -48,7 +48,9 @@ class NameTagger:
|
|||||||
# Find the first occurrence of this native word that doesn't overlap
|
# Find the first occurrence of this native word that doesn't overlap
|
||||||
start_pos = 0
|
start_pos = 0
|
||||||
while True:
|
while True:
|
||||||
pos = name_lower.find(native_word_lower, start_pos) # Case-insensitive search
|
pos = name_lower.find(
|
||||||
|
native_word_lower, start_pos
|
||||||
|
) # Case-insensitive search
|
||||||
if pos == -1:
|
if pos == -1:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -78,7 +80,9 @@ class NameTagger:
|
|||||||
# Find the first occurrence that doesn't overlap
|
# Find the first occurrence that doesn't overlap
|
||||||
start_pos = 0
|
start_pos = 0
|
||||||
while True:
|
while True:
|
||||||
pos = name_lower.find(surname_lower, start_pos) # Case-insensitive search
|
pos = name_lower.find(
|
||||||
|
surname_lower, start_pos
|
||||||
|
) # Case-insensitive search
|
||||||
if pos == -1:
|
if pos == -1:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -120,8 +124,13 @@ class NameTagger:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Check for overlaps with already validated entities
|
# Check for overlaps with already validated entities
|
||||||
if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities):
|
if any(
|
||||||
logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'")
|
start < v_end and end > v_start
|
||||||
|
for v_start, v_end, _ in validated_entities
|
||||||
|
):
|
||||||
|
logging.warning(
|
||||||
|
f"Overlapping span ({start}, {end}, '{label}') in '{name}'"
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
|
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
|
||||||
@@ -200,10 +209,16 @@ class NameTagger:
|
|||||||
elif entities_str.startswith("[[") and entities_str.endswith("]]"):
|
elif entities_str.startswith("[[") and entities_str.endswith("]]"):
|
||||||
return [tuple(e) for e in ast.literal_eval(entities_str)]
|
return [tuple(e) for e in ast.literal_eval(entities_str)]
|
||||||
elif entities_str.startswith("[{") and entities_str.endswith("}]"):
|
elif entities_str.startswith("[{") and entities_str.endswith("}]"):
|
||||||
return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
|
return [
|
||||||
|
(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)
|
||||||
|
]
|
||||||
else:
|
else:
|
||||||
parsed = ast.literal_eval(entities_str)
|
parsed = ast.literal_eval(entities_str)
|
||||||
return [tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3]
|
return [
|
||||||
|
tuple(e)
|
||||||
|
for e in parsed
|
||||||
|
if isinstance(e, (list, tuple)) and len(e) == 3
|
||||||
|
]
|
||||||
except (ValueError, SyntaxError, json.JSONDecodeError):
|
except (ValueError, SyntaxError, json.JSONDecodeError):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
@@ -251,7 +266,9 @@ class NameTagger:
|
|||||||
last_end = e
|
last_end = e
|
||||||
return filtered
|
return filtered
|
||||||
|
|
||||||
def validate_entities(self, texts: pd.Series, entities_series: pd.Series) -> pd.Series:
|
def validate_entities(
|
||||||
|
self, texts: pd.Series, entities_series: pd.Series
|
||||||
|
) -> pd.Series:
|
||||||
"""Vectorized entity validation."""
|
"""Vectorized entity validation."""
|
||||||
return pd.Series(map(self.validate, texts, entities_series), index=texts.index)
|
return pd.Series(map(self.validate, texts, entities_series), index=texts.index)
|
||||||
|
|
||||||
@@ -4,9 +4,9 @@ from typing import Dict, Any
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from processing.batch.batch_config import BatchConfig
|
from ners.processing.batch.batch_config import BatchConfig
|
||||||
from processing.batch.batch_processor import BatchProcessor
|
from ners.processing.batch.batch_processor import BatchProcessor
|
||||||
from processing.steps import PipelineStep
|
from ners.processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
class Pipeline:
|
class Pipeline:
|
||||||
@@ -8,9 +8,9 @@ from typing import List, Optional
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from core.utils.data_loader import DataLoader
|
from ners.core.utils.data_loader import DataLoader
|
||||||
from processing.batch.batch_config import BatchConfig
|
from ners.processing.batch.batch_config import BatchConfig
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -38,7 +38,10 @@ class PipelineStep(ABC):
|
|||||||
"""Abstract base class for pipeline steps"""
|
"""Abstract base class for pipeline steps"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
|
self,
|
||||||
|
name: str,
|
||||||
|
pipeline_config: PipelineConfig,
|
||||||
|
batch_config: Optional[BatchConfig] = None,
|
||||||
):
|
):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.pipeline_config = pipeline_config
|
self.pipeline_config = pipeline_config
|
||||||
@@ -2,9 +2,9 @@ import logging
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from core.utils.text_cleaner import TextCleaner
|
from ners.core.utils.text_cleaner import TextCleaner
|
||||||
from processing.steps import PipelineStep
|
from ners.processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
class DataCleaningStep(PipelineStep):
|
class DataCleaningStep(PipelineStep):
|
||||||
@@ -2,8 +2,8 @@ import logging
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from processing.steps import PipelineStep
|
from ners.processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
class DataSelectionStep(PipelineStep):
|
class DataSelectionStep(PipelineStep):
|
||||||
@@ -20,15 +20,23 @@ class DataSelectionStep(PipelineStep):
|
|||||||
# Remove rows where region == "global" only for specific years
|
# Remove rows where region == "global" only for specific years
|
||||||
if "region" in batch.columns and "year" in batch.columns:
|
if "region" in batch.columns and "year" in batch.columns:
|
||||||
target_years = {2015, 2021, 2022}
|
target_years = {2015, 2021, 2022}
|
||||||
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(target_years)
|
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(
|
||||||
|
target_years
|
||||||
|
)
|
||||||
removed = int(mask_remove.sum())
|
removed = int(mask_remove.sum())
|
||||||
if removed:
|
if removed:
|
||||||
batch = batch[~mask_remove]
|
batch = batch[~mask_remove]
|
||||||
logging.info(f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}")
|
logging.info(
|
||||||
|
f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}"
|
||||||
|
)
|
||||||
|
|
||||||
# Check which columns exist in the batch
|
# Check which columns exist in the batch
|
||||||
available_columns = [col for col in self.selected_columns if col in batch.columns]
|
available_columns = [
|
||||||
missing_columns = [col for col in self.selected_columns if col not in batch.columns]
|
col for col in self.selected_columns if col in batch.columns
|
||||||
|
]
|
||||||
|
missing_columns = [
|
||||||
|
col for col in self.selected_columns if col not in batch.columns
|
||||||
|
]
|
||||||
|
|
||||||
if missing_columns:
|
if missing_columns:
|
||||||
logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}")
|
logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}")
|
||||||
@@ -1,11 +1,11 @@
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from core.utils.region_mapper import RegionMapper
|
from ners.core.utils.region_mapper import RegionMapper
|
||||||
from processing.batch.batch_config import BatchConfig
|
from ners.processing.batch.batch_config import BatchConfig
|
||||||
from processing.steps import PipelineStep
|
from ners.processing.steps import PipelineStep
|
||||||
from processing.steps.feature_extraction_step import Gender
|
from ners.processing.steps.feature_extraction_step import Gender
|
||||||
|
|
||||||
|
|
||||||
class DataSplittingStep(PipelineStep):
|
class DataSplittingStep(PipelineStep):
|
||||||
@@ -26,7 +26,9 @@ class DataSplittingStep(PipelineStep):
|
|||||||
if self.eval_indices is None:
|
if self.eval_indices is None:
|
||||||
np.random.seed(self.pipeline_config.data.random_seed)
|
np.random.seed(self.pipeline_config.data.random_seed)
|
||||||
eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
|
eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
|
||||||
self.eval_indices = set(np.random.choice(total_size, size=eval_size, replace=False))
|
self.eval_indices = set(
|
||||||
|
np.random.choice(total_size, size=eval_size, replace=False)
|
||||||
|
)
|
||||||
return self.eval_indices
|
return self.eval_indices
|
||||||
|
|
||||||
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
@@ -45,7 +47,9 @@ class DataSplittingStep(PipelineStep):
|
|||||||
df_evaluation = df[eval_mask]
|
df_evaluation = df[eval_mask]
|
||||||
df_featured = df[~eval_mask]
|
df_featured = df[~eval_mask]
|
||||||
|
|
||||||
self.data_loader.save_csv(df_evaluation, data_dir / output_files["evaluation"])
|
self.data_loader.save_csv(
|
||||||
|
df_evaluation, data_dir / output_files["evaluation"]
|
||||||
|
)
|
||||||
self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
|
self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
|
||||||
else:
|
else:
|
||||||
self.data_loader.save_csv(df, data_dir / output_files["featured"])
|
self.data_loader.save_csv(df, data_dir / output_files["featured"])
|
||||||
@@ -53,7 +57,9 @@ class DataSplittingStep(PipelineStep):
|
|||||||
if self.pipeline_config.data.split_by_province:
|
if self.pipeline_config.data.split_by_province:
|
||||||
for province in RegionMapper.get_provinces():
|
for province in RegionMapper.get_provinces():
|
||||||
df_region = df[df.province == province]
|
df_region = df[df.province == province]
|
||||||
self.data_loader.save_csv(df_region, data_dir / "provinces" / f"{province}.csv")
|
self.data_loader.save_csv(
|
||||||
|
df_region, data_dir / "provinces" / f"{province}.csv"
|
||||||
|
)
|
||||||
|
|
||||||
if self.pipeline_config.data.split_by_gender:
|
if self.pipeline_config.data.split_by_gender:
|
||||||
df_males = df[df.sex == Gender.MALE.value]
|
df_males = df[df.sex == Gender.MALE.value]
|
||||||
@@ -5,10 +5,10 @@ from typing import Dict, Any
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from core.utils.region_mapper import RegionMapper
|
from ners.core.utils.region_mapper import RegionMapper
|
||||||
from processing.ner.name_tagger import NameTagger
|
from ners.processing.ner.name_tagger import NameTagger
|
||||||
from processing.steps import PipelineStep
|
from ners.processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
class Gender(Enum):
|
class Gender(Enum):
|
||||||
@@ -64,10 +64,14 @@ class FeatureExtractionStep(PipelineStep):
|
|||||||
|
|
||||||
self._assign_probable_names(result)
|
self._assign_probable_names(result)
|
||||||
self._process_simple_names(result)
|
self._process_simple_names(result)
|
||||||
result["identified_category"] = self._assign_identified_category(result["words"])
|
result["identified_category"] = self._assign_identified_category(
|
||||||
|
result["words"]
|
||||||
|
)
|
||||||
|
|
||||||
if "year" in result.columns:
|
if "year" in result.columns:
|
||||||
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype("Int16")
|
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype(
|
||||||
|
"Int16"
|
||||||
|
)
|
||||||
|
|
||||||
if "region" in result.columns:
|
if "region" in result.columns:
|
||||||
result["province"] = self.region_mapper.map(result["region"]).str.lower()
|
result["province"] = self.region_mapper.map(result["region"]).str.lower()
|
||||||
@@ -7,12 +7,12 @@ import ollama
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pydantic import ValidationError
|
from pydantic import ValidationError
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from core.utils.prompt_manager import PromptManager
|
from ners.core.utils.prompt_manager import PromptManager
|
||||||
from core.utils.rate_limiter import RateLimitConfig
|
from ners.core.utils.rate_limiter import RateLimitConfig
|
||||||
from core.utils.rate_limiter import RateLimiter
|
from ners.core.utils.rate_limiter import RateLimiter
|
||||||
from processing.batch.batch_config import BatchConfig
|
from ners.processing.batch.batch_config import BatchConfig
|
||||||
from processing.steps import PipelineStep, NameAnnotation
|
from ners.processing.steps import PipelineStep, NameAnnotation
|
||||||
|
|
||||||
|
|
||||||
class LLMAnnotationStep(PipelineStep):
|
class LLMAnnotationStep(PipelineStep):
|
||||||
@@ -24,7 +24,8 @@ class LLMAnnotationStep(PipelineStep):
|
|||||||
batch_config = BatchConfig(
|
batch_config = BatchConfig(
|
||||||
batch_size=pipeline_config.processing.batch_size,
|
batch_size=pipeline_config.processing.batch_size,
|
||||||
max_workers=min(
|
max_workers=min(
|
||||||
self.llm_config.max_concurrent_requests, pipeline_config.processing.max_workers
|
self.llm_config.max_concurrent_requests,
|
||||||
|
pipeline_config.processing.max_workers,
|
||||||
),
|
),
|
||||||
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
||||||
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
|
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
|
||||||
@@ -33,7 +34,9 @@ class LLMAnnotationStep(PipelineStep):
|
|||||||
|
|
||||||
self.prompt = PromptManager(pipeline_config).load_prompt()
|
self.prompt = PromptManager(pipeline_config).load_prompt()
|
||||||
self.rate_limiter = (
|
self.rate_limiter = (
|
||||||
self._create_rate_limiter() if self.llm_config.enable_rate_limiting else None
|
self._create_rate_limiter()
|
||||||
|
if self.llm_config.enable_rate_limiting
|
||||||
|
else None
|
||||||
)
|
)
|
||||||
|
|
||||||
# Statistics
|
# Statistics
|
||||||
@@ -76,7 +79,9 @@ class LLMAnnotationStep(PipelineStep):
|
|||||||
f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout"
|
f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout"
|
||||||
)
|
)
|
||||||
|
|
||||||
annotation = NameAnnotation.model_validate_json(response.message.content)
|
annotation = NameAnnotation.model_validate_json(
|
||||||
|
response.message.content
|
||||||
|
)
|
||||||
result = {
|
result = {
|
||||||
**annotation.model_dump(),
|
**annotation.model_dump(),
|
||||||
"annotated": 1,
|
"annotated": 1,
|
||||||
@@ -119,7 +124,9 @@ class LLMAnnotationStep(PipelineStep):
|
|||||||
logging.info(f"Batch {batch_id}: No entries to annotate")
|
logging.info(f"Batch {batch_id}: No entries to annotate")
|
||||||
return batch
|
return batch
|
||||||
|
|
||||||
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM")
|
logging.info(
|
||||||
|
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM"
|
||||||
|
)
|
||||||
|
|
||||||
batch = batch.copy()
|
batch = batch.copy()
|
||||||
client = ollama.Client()
|
client = ollama.Client()
|
||||||
@@ -5,9 +5,9 @@ from typing import Dict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from processing.ner.name_model import NameModel
|
from ners.processing.ner.name_model import NameModel
|
||||||
from processing.steps import PipelineStep, NameAnnotation
|
from ners.processing.steps import PipelineStep, NameAnnotation
|
||||||
|
|
||||||
|
|
||||||
class NERAnnotationStep(PipelineStep):
|
class NERAnnotationStep(PipelineStep):
|
||||||
@@ -39,7 +39,9 @@ class NERAnnotationStep(PipelineStep):
|
|||||||
logging.info("NER model loaded successfully")
|
logging.info("NER model loaded successfully")
|
||||||
else:
|
else:
|
||||||
logging.warning(f"NER model not found at {self.model_path}")
|
logging.warning(f"NER model not found at {self.model_path}")
|
||||||
logging.warning("NER annotation will be skipped. Train the model first.")
|
logging.warning(
|
||||||
|
"NER annotation will be skipped. Train the model first."
|
||||||
|
)
|
||||||
self.name_model.nlp = None
|
self.name_model.nlp = None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Failed to load NER model: {e}")
|
logging.error(f"Failed to load NER model: {e}")
|
||||||
@@ -80,7 +82,9 @@ class NERAnnotationStep(PipelineStep):
|
|||||||
# Create annotation result in same format as LLM step
|
# Create annotation result in same format as LLM step
|
||||||
annotation = NameAnnotation(
|
annotation = NameAnnotation(
|
||||||
identified_name=" ".join(native_parts) if native_parts else None,
|
identified_name=" ".join(native_parts) if native_parts else None,
|
||||||
identified_surname=" ".join(surname_parts) if surname_parts else None,
|
identified_surname=" ".join(surname_parts)
|
||||||
|
if surname_parts
|
||||||
|
else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
@@ -124,7 +128,9 @@ class NERAnnotationStep(PipelineStep):
|
|||||||
logging.info(f"Batch {batch_id}: No entries to annotate")
|
logging.info(f"Batch {batch_id}: No entries to annotate")
|
||||||
return batch
|
return batch
|
||||||
|
|
||||||
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER")
|
logging.info(
|
||||||
|
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER"
|
||||||
|
)
|
||||||
|
|
||||||
batch = batch.copy()
|
batch = batch.copy()
|
||||||
|
|
||||||
@@ -7,7 +7,7 @@ import matplotlib.pyplot as plt
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from research.experiment import ExperimentConfig
|
from ners.research.experiment import ExperimentConfig
|
||||||
|
|
||||||
|
|
||||||
class BaseModel(ABC):
|
class BaseModel(ABC):
|
||||||
@@ -103,16 +103,25 @@ class BaseModel(ABC):
|
|||||||
feature_names = self._get_feature_names()
|
feature_names = self._get_feature_names()
|
||||||
return dict(zip(feature_names, coefficients))
|
return dict(zip(feature_names, coefficients))
|
||||||
|
|
||||||
elif hasattr(self.model, "named_steps") and "classifier" in self.model.named_steps:
|
elif (
|
||||||
|
hasattr(self.model, "named_steps")
|
||||||
|
and "classifier" in self.model.named_steps
|
||||||
|
):
|
||||||
# For sklearn pipelines (like LogisticRegression with vectorizer)
|
# For sklearn pipelines (like LogisticRegression with vectorizer)
|
||||||
classifier = self.model.named_steps["classifier"]
|
classifier = self.model.named_steps["classifier"]
|
||||||
if hasattr(classifier, "coef_"):
|
if hasattr(classifier, "coef_"):
|
||||||
coefficients = np.abs(classifier.coef_[0])
|
coefficients = np.abs(classifier.coef_[0])
|
||||||
if hasattr(self.model.named_steps["vectorizer"], "get_feature_names_out"):
|
if hasattr(
|
||||||
feature_names = self.model.named_steps["vectorizer"].get_feature_names_out()
|
self.model.named_steps["vectorizer"], "get_feature_names_out"
|
||||||
|
):
|
||||||
|
feature_names = self.model.named_steps[
|
||||||
|
"vectorizer"
|
||||||
|
].get_feature_names_out()
|
||||||
# Take top features to avoid too many n-grams
|
# Take top features to avoid too many n-grams
|
||||||
top_indices = np.argsort(coefficients)[-20:]
|
top_indices = np.argsort(coefficients)[-20:]
|
||||||
return dict(zip(feature_names[top_indices], coefficients[top_indices]))
|
return dict(
|
||||||
|
zip(feature_names[top_indices], coefficients[top_indices])
|
||||||
|
)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -143,7 +152,7 @@ class BaseModel(ABC):
|
|||||||
model_data = joblib.load(path)
|
model_data = joblib.load(path)
|
||||||
|
|
||||||
# Recreate the model instance
|
# Recreate the model instance
|
||||||
from research.experiment import ExperimentConfig
|
from ners.research.experiment import ExperimentConfig
|
||||||
|
|
||||||
config = ExperimentConfig.from_dict(model_data["config"])
|
config = ExperimentConfig.from_dict(model_data["config"])
|
||||||
instance = cls(config)
|
instance = cls(config)
|
||||||
@@ -221,7 +230,9 @@ class BaseModel(ABC):
|
|||||||
if "accuracy" in self.training_history:
|
if "accuracy" in self.training_history:
|
||||||
axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
|
axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
|
||||||
if "val_accuracy" in self.training_history:
|
if "val_accuracy" in self.training_history:
|
||||||
axes[0].plot(self.training_history["val_accuracy"], label="Validation Accuracy")
|
axes[0].plot(
|
||||||
|
self.training_history["val_accuracy"], label="Validation Accuracy"
|
||||||
|
)
|
||||||
axes[0].set_title("Model Accuracy")
|
axes[0].set_title("Model Accuracy")
|
||||||
axes[0].set_xlabel("Epoch")
|
axes[0].set_xlabel("Epoch")
|
||||||
axes[0].set_ylabel("Accuracy")
|
axes[0].set_ylabel("Accuracy")
|
||||||
@@ -18,7 +18,9 @@ class ExperimentConfig:
|
|||||||
tags: List[str] = field(default_factory=list)
|
tags: List[str] = field(default_factory=list)
|
||||||
|
|
||||||
# Model configuration
|
# Model configuration
|
||||||
model_type: str = "logistic_regression" # logistic_regression, lstm, transformer, etc.
|
model_type: str = (
|
||||||
|
"logistic_regression" # logistic_regression, lstm, transformer, etc.
|
||||||
|
)
|
||||||
model_params: Dict[str, Any] = field(default_factory=dict)
|
model_params: Dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
# Feature configuration
|
# Feature configuration
|
||||||
@@ -26,7 +28,9 @@ class ExperimentConfig:
|
|||||||
feature_params: Dict[str, Any] = field(default_factory=dict)
|
feature_params: Dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
# Data configuration
|
# Data configuration
|
||||||
train_data_filter: Optional[Dict[str, Any]] = None # Filter criteria for training data
|
train_data_filter: Optional[Dict[str, Any]] = (
|
||||||
|
None # Filter criteria for training data
|
||||||
|
)
|
||||||
test_data_filter: Optional[Dict[str, Any]] = None
|
test_data_filter: Optional[Dict[str, Any]] = None
|
||||||
target_column: str = "sex"
|
target_column: str = "sex"
|
||||||
|
|
||||||
@@ -36,7 +40,9 @@ class ExperimentConfig:
|
|||||||
cross_validation_folds: int = 5
|
cross_validation_folds: int = 5
|
||||||
|
|
||||||
# Evaluation configuration
|
# Evaluation configuration
|
||||||
metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"])
|
metrics: List[str] = field(
|
||||||
|
default_factory=lambda: ["accuracy", "precision", "recall", "f1"]
|
||||||
|
)
|
||||||
|
|
||||||
def to_dict(self) -> Dict[str, Any]:
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
"""Convert to dictionary for serialization"""
|
"""Convert to dictionary for serialization"""
|
||||||
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field, asdict
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional, Dict, List, Any
|
from typing import Optional, Dict, List, Any
|
||||||
|
|
||||||
from research.experiment import ExperimentConfig, ExperimentStatus
|
from ners.research.experiment import ExperimentConfig, ExperimentStatus
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -51,6 +51,8 @@ class ExperimentResult:
|
|||||||
"""Create from dictionary"""
|
"""Create from dictionary"""
|
||||||
data["config"] = ExperimentConfig.from_dict(data["config"])
|
data["config"] = ExperimentConfig.from_dict(data["config"])
|
||||||
data["start_time"] = datetime.fromisoformat(data["start_time"])
|
data["start_time"] = datetime.fromisoformat(data["start_time"])
|
||||||
data["end_time"] = datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
|
data["end_time"] = (
|
||||||
|
datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
|
||||||
|
)
|
||||||
data["status"] = ExperimentStatus(data["status"])
|
data["status"] = ExperimentStatus(data["status"])
|
||||||
return cls(**data)
|
return cls(**data)
|
||||||
@@ -3,9 +3,9 @@ from typing import List, Dict
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from ners.core.config.pipeline_config import PipelineConfig
|
||||||
from research.experiment import ExperimentConfig
|
from ners.research.experiment import ExperimentConfig
|
||||||
from research.experiment.feature_extractor import FeatureType
|
from ners.research.experiment.feature_extractor import FeatureType
|
||||||
|
|
||||||
|
|
||||||
class ExperimentBuilder:
|
class ExperimentBuilder:
|
||||||
@@ -27,7 +27,9 @@ class ExperimentBuilder:
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def find_template(cls, templates: dict, name: str, experiment_type: str = "baseline") -> dict:
|
def find_template(
|
||||||
|
cls, templates: dict, name: str, experiment_type: str = "baseline"
|
||||||
|
) -> dict:
|
||||||
"""Find experiment configuration by name and type"""
|
"""Find experiment configuration by name and type"""
|
||||||
|
|
||||||
# Map type to section in templates
|
# Map type to section in templates
|
||||||
@@ -9,12 +9,16 @@ import pandas as pd
|
|||||||
from sklearn.metrics import confusion_matrix
|
from sklearn.metrics import confusion_matrix
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
from core.config import PipelineConfig
|
from ners.core.config import PipelineConfig
|
||||||
from core.utils.data_loader import DataLoader
|
from ners.core.utils.data_loader import DataLoader
|
||||||
from research.base_model import BaseModel
|
from ners.research.base_model import BaseModel
|
||||||
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
|
from ners.research.experiment import (
|
||||||
from research.experiment.experiment_tracker import ExperimentTracker
|
ExperimentConfig,
|
||||||
from research.model_registry import create_model
|
ExperimentStatus,
|
||||||
|
calculate_metrics,
|
||||||
|
)
|
||||||
|
from ners.research.experiment.experiment_tracker import ExperimentTracker
|
||||||
|
from ners.research.model_registry import create_model
|
||||||
|
|
||||||
|
|
||||||
class ExperimentRunner:
|
class ExperimentRunner:
|
||||||
@@ -32,10 +36,14 @@ class ExperimentRunner:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
logging.info(f"Starting experiment: {experiment_id}")
|
logging.info(f"Starting experiment: {experiment_id}")
|
||||||
self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
|
self.tracker.update_experiment(
|
||||||
|
experiment_id, status=ExperimentStatus.RUNNING
|
||||||
|
)
|
||||||
|
|
||||||
# Load data
|
# Load data
|
||||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
filepath = self.config.paths.get_data_path(
|
||||||
|
self.config.data.output_files["featured"]
|
||||||
|
)
|
||||||
df = self.data_loader.load_csv_complete(filepath)
|
df = self.data_loader.load_csv_complete(filepath)
|
||||||
|
|
||||||
# Apply data filters if specified
|
# Apply data filters if specified
|
||||||
@@ -63,8 +71,12 @@ class ExperimentRunner:
|
|||||||
test_pred = model.predict(X_test)
|
test_pred = model.predict(X_test)
|
||||||
|
|
||||||
# Calculate metrics
|
# Calculate metrics
|
||||||
train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics)
|
train_metrics = calculate_metrics(
|
||||||
test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics)
|
y_train, train_pred, experiment_config.metrics
|
||||||
|
)
|
||||||
|
test_metrics = calculate_metrics(
|
||||||
|
y_test, test_pred, experiment_config.metrics
|
||||||
|
)
|
||||||
|
|
||||||
# Cross-validation if requested
|
# Cross-validation if requested
|
||||||
cv_metrics = {}
|
cv_metrics = {}
|
||||||
@@ -125,7 +137,9 @@ class ExperimentRunner:
|
|||||||
experiment_ids = []
|
experiment_ids = []
|
||||||
|
|
||||||
for i, config in enumerate(experiments):
|
for i, config in enumerate(experiments):
|
||||||
logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}")
|
logging.info(
|
||||||
|
f"Running experiment {i + 1}/{len(experiments)}: {config.name}"
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
exp_id = self.run_experiment(config)
|
exp_id = self.run_experiment(config)
|
||||||
experiment_ids.append(exp_id)
|
experiment_ids.append(exp_id)
|
||||||
@@ -136,7 +150,9 @@ class ExperimentRunner:
|
|||||||
return experiment_ids
|
return experiment_ids
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _apply_data_filters(cls, df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame:
|
def _apply_data_filters(
|
||||||
|
cls, df: pd.DataFrame, config: ExperimentConfig
|
||||||
|
) -> pd.DataFrame:
|
||||||
"""Apply data filters specified in experiment config"""
|
"""Apply data filters specified in experiment config"""
|
||||||
filtered_df = df.copy()
|
filtered_df = df.copy()
|
||||||
|
|
||||||
@@ -148,9 +164,13 @@ class ExperimentRunner:
|
|||||||
filtered_df = filtered_df[filtered_df[column].isin(criteria)]
|
filtered_df = filtered_df[filtered_df[column].isin(criteria)]
|
||||||
elif isinstance(criteria, dict):
|
elif isinstance(criteria, dict):
|
||||||
if "min" in criteria:
|
if "min" in criteria:
|
||||||
filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
|
filtered_df = filtered_df[
|
||||||
|
filtered_df[column] >= criteria["min"]
|
||||||
|
]
|
||||||
if "max" in criteria:
|
if "max" in criteria:
|
||||||
filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
|
filtered_df = filtered_df[
|
||||||
|
filtered_df[column] <= criteria["max"]
|
||||||
|
]
|
||||||
else:
|
else:
|
||||||
filtered_df = filtered_df[filtered_df[column] == criteria]
|
filtered_df = filtered_df[filtered_df[column] == criteria]
|
||||||
|
|
||||||
@@ -231,7 +251,9 @@ class ExperimentRunner:
|
|||||||
return model
|
return model
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Failed to load model for experiment {experiment_id}: {e}")
|
logging.error(
|
||||||
|
f"Failed to load model for experiment {experiment_id}: {e}"
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return None
|
return None
|
||||||