9 Commits

Author SHA1 Message Date
bernard-ng fad7ff9277 chore(release): v1.0.0 2025-10-05 23:45:35 +02:00
bernard-ng 8f90fdd625 fix: notebooks 2025-10-05 23:23:58 +02:00
bernard-ng 137dea7fe5 fix: models 2025-10-05 21:54:25 +02:00
bernard-ng 9dd4f759b3 refactoring: uv 2025-10-05 18:14:15 +02:00
bernard-ng f3b06fbd07 feat: regions clusters 2025-10-03 11:58:36 +02:00
bernard-ng 912d518106 feat: support gpu 2025-09-29 22:52:08 +02:00
bernard-ng a1d500830b feat: support gpu 2025-09-29 21:07:23 +02:00
bernard-ng 9e35f95107 feat: statistics tests 2025-09-28 23:50:40 +02:00
bernard-ng 9039e9a4cf feat: statistics tests 2025-09-28 17:16:02 +02:00
152 changed files with 42751 additions and 8277 deletions
.dockerignore +16
@@ -0,0 +1,16 @@
.git
.gitignore
.idea
.vscode
__pycache__
.ruff_cache
.venv
*.pyc
*.pyo
*.pyd
*.swp
*.swo
*.DS_Store
dist
build
*.egg-info
.python-version +1
@@ -0,0 +1 @@
3.11
Dockerfile +49
@@ -0,0 +1,49 @@
# syntax=docker/dockerfile:1
# Minimal Linux base (glibc); Python will be installed by uv
FROM debian:bookworm-slim
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
UV_INSTALL_DIR=/usr/local/bin \
UV_LINK_MODE=copy \
UV_PYTHON_DOWNLOADS=1 \
UV_PROJECT_ENVIRONMENT=/app/.venv \
PATH=/app/.venv/bin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
WORKDIR /app
# System deps for building/using common scientific stack
# Keep minimal; rely on wheels where possible
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates curl git \
build-essential pkg-config \
libssl-dev libffi-dev \
libopenblas0 libstdc++6 \
libfreetype6 libpng16-16 libjpeg62-turbo \
&& rm -rf /var/lib/apt/lists/*
# Install uv (static binary)
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
# Copy project metadata first for layer caching
COPY pyproject.toml README.md ./
# Install a managed Python via uv and create the project venv
RUN uv python install 3.11 \
&& uv venv /app/.venv --python 3.11
# Resolve and install runtime deps into project venv
# Use lockfile if present for reproducibility
RUN if [ -f uv.lock ]; then uv sync --no-dev --no-install-project --frozen; else uv sync --no-dev --no-install-project; fi
# Copy source code and optional templates
COPY src ./src
# Re-sync to ensure the local package is installed
RUN uv sync --no-dev \
&& rm -rf /root/.cache
# Default command shows help; override in compose or docker run
CMD ["ners", "--help"]
Makefile -52
@@ -1,52 +0,0 @@
.PHONY: default
default: help
.PHONY: help
help: ## Show this help message
@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
# =============================================================================
# ENVIRONMENT SETUP
# =============================================================================
.PHONY: setup
setup: ## Setup virtual environment and install dependencies
python -m venv .venv
source .venv/bin/activate
.venv/bin/pip install --upgrade pip
.venv/bin/pip install -r requirements.txt
.PHONY: install
install: ## Install/update dependencies
pip install --upgrade pip
pip install -r requirements.txt
# =============================================================================
# DEVELOPMENT & CODE QUALITY
# =============================================================================
.PHONY: format
format: ## Format code with black
black . --line-length 100
.PHONY: lint
lint: ## Lint code with flake8
flake8 . --max-line-length=100 --ignore=E203,W503 --exclude=.venv
.PHONY: type-check
type-check: ## Type check with mypy
mypy . --ignore-missing-imports
.PHONY: notebook
notebook: ## Start Jupyter notebook
jupyter notebook notebooks/
# =============================================================================
# DEPLOYMENT & PRODUCTION
# =============================================================================
.PHONY: backup
backup: ## Backup datasets and results
@mkdir -p backups/$(shell date +%Y%m%d_%H%M%S)
@cp -r data/ backups/$(shell date +%Y%m%d_%H%M%S)/data/
@echo "Backup created in backups/$(shell date +%Y%m%d_%H%M%S)/"
README.md +68 -90
@@ -10,51 +10,41 @@ million names from the Democratic Republic of Congo (DRC) annotated with gender
### Installation & Setup
Instructions and command-line snippets below are provided to help you set up the project environment quickly and
efficiently, assuming you have Python 3.11 and Git installed and working on a Unix-like system (Linux, macOS, etc.).
**Using Makefile (Recommended)**
> Download [the dataset](https://drive.google.com/file/d/1a5wQnOZdsRWBOeoMA_0lNtbneTvS9xqy/view?usp=drive_link); if you need access, please reach out to us at mlec.academia@gmail.com.
```bash
git clone https://github.com/bernard-ng/drc-ners-nlp.git
cd drc-ners-nlp
# Setup environment
make setup
make activate
mkdir -p data/dataset
cp names.csv data/dataset
```
**Manual Setup**
**Linux**
```bash
git clone https://github.com/bernard-ng/drc-ners-nlp.git
cd drc-ners-nlp
curl -LsSf https://astral.sh/uv/install.sh | sh
# Setup environment
uv sync
```
**macOS & Windows**
```bash
docker compose build
docker compose exec app bash
```
## Data Processing
This project includes a robust data processing pipeline designed to handle large datasets efficiently with batching,
checkpointing, and parallel processing capabilities.
Steps are defined in the `drc-ners-nlp/processing/steps` directory, and the configuration that enables them is managed through
the `drc-ners-nlp/config/pipeline.yaml` file.
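To make the batching and checkpointing idea concrete, here is a minimal, self-contained sketch of that pattern. The column name, file names, and the `clean_name` step are hypothetical; the project's real step interface lives in `processing/steps` and is enabled through `config/pipeline.yaml`.

```python
# Illustrative sketch only: batched processing with a resumable checkpoint and a worker pool.
# Not the project's actual step API.
from multiprocessing import Pool
from pathlib import Path

import pandas as pd

CHECKPOINT = Path("checkpoint.txt")   # hypothetical checkpoint file
BATCH_SIZE = 10_000                   # hypothetical batch size


def clean_name(name: str) -> str:
    """Hypothetical per-row step: normalise a raw name."""
    return " ".join(name.strip().lower().split())


def last_completed_batch() -> int:
    return int(CHECKPOINT.read_text()) if CHECKPOINT.exists() else -1


def run(dataset: str = "names.csv") -> None:
    start = last_completed_batch() + 1
    reader = pd.read_csv(dataset, chunksize=BATCH_SIZE)
    with Pool() as pool:
        for i, batch in enumerate(reader):
            if i < start:                   # skip batches already processed
                continue
            batch["name"] = pool.map(clean_name, batch["name"].astype(str))
            batch.to_csv(f"batch_{i:05d}.csv", index=False)
            CHECKPOINT.write_text(str(i))   # record progress so a crash can resume here


if __name__ == "__main__":
    run()
```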
**Pipeline Configuration**
```yaml
stages:
- "data_cleaning"
- "data_selection"
- "feature_extraction"
- "data_splitting"
```
@@ -62,97 +52,77 @@ stages:
**Running the Pipeline**
```bash
uv run ners pipeline run --env="production"
```
## NER Processing (Optional)
This project implements a custom named entity recognition (NER) pipeline tailored for Congolese names.
Its main objective is to accurately identify and tag the different components of a Congolese name,
specifically distinguishing between the native part and the surname.
```bash
python ner.py --env production
```
Once you've built and trained the NER model, you can use it to annotate **COMPOSE** names in the original dataset.
**Running the Pipeline with NER Annotation**
```yaml
stages:
- "data_cleaning"
- "feature_extraction"
- "ner_annotation"
- "data_splitting"
```
**Running the Pipeline with LLM Annotation**
```yaml
stages:
- "data_cleaning"
- "feature_extraction"
- "llm_annotation"
- "data_splitting"
```
## Experiments
This project provides a modular experiment framework (model training and evaluation) for systematic model comparison and
research iteration. Models are defined in the `drc-ners-nlp/research/models` directory, and you can define model features,
training parameters, and evaluation metrics in the `config/research_templates.yaml` file.
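The exact schema of `config/research_templates.yaml` is not reproduced in this diff; the sketch below only illustrates the kind of entry described above (model name, features, training parameters, evaluation metrics). All field names are hypothetical.

```yaml
# Hypothetical template entry -- field names are illustrative, not the actual schema
bigru:
  type: baseline
  features:
    - name
    - native_name
    - surname
  training:
    epochs: 10
    batch_size: 64
  metrics:
    - accuracy
    - f1
```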
**Running Experiments**
```bash
# bigru
python train.py --name="bigru" --type="baseline" --env="production"
python train.py --name="bigru_native" --type="baseline" --env="production"
python train.py --name="bigru_surname" --type="baseline" --env="production"
uv run ners research train --name="bigru" --type="baseline" --env="production"
uv run ners research train --name="bigru_native" --type="baseline" --env="production"
uv run ners research train --name="bigru_surname" --type="baseline" --env="production"
```
```bash
# cnn
python train.py --name="cnn" --type="baseline" --env="production"
python train.py --name="cnn_native" --type="baseline" --env="production"
python train.py --name="cnn_surname" --type="baseline" --env="production"
uv run ners research train --name="cnn" --type="baseline" --env="production"
uv run ners research train --name="cnn_native" --type="baseline" --env="production"
uv run ners research train --name="cnn_surname" --type="baseline" --env="production"
```
```bash
# lightgbm
python train.py --name="lightgbm" --type="baseline" --env="production"
python train.py --name="lightgbm_native" --type="baseline" --env="production"
python train.py --name="lightgbm_surname" --type="baseline" --env="production"
uv run ners research train --name="lightgbm" --type="baseline" --env="production"
uv run ners research train --name="lightgbm_native" --type="baseline" --env="production"
uv run ners research train --name="lightgbm_surname" --type="baseline" --env="production"
```
```bash
# logistic regression
python train.py --name="logistic_regression" --type="baseline" --env="production"
python train.py --name="logistic_regression_native" --type="baseline" --env="production"
python train.py --name="logistic_regression_surname" --type="baseline" --env="production"
uv run ners research train --name="logistic_regression" --type="baseline" --env="production"
uv run ners research train --name="logistic_regression_native" --type="baseline" --env="production"
uv run ners research train --name="logistic_regression_surname" --type="baseline" --env="production"
```
```bash
# lstm
python train.py --name="lstm" --type="baseline" --env="production"
python train.py --name="lstm_native" --type="baseline" --env="production"
python train.py --name="lstm_surname" --type="baseline" --env="production"
uv run ners research train --name="lstm" --type="baseline" --env="production"
uv run ners research train --name="lstm_native" --type="baseline" --env="production"
uv run ners research train --name="lstm_surname" --type="baseline" --env="production"
```
```bash
# random forest
uv run ners research train --name="random_forest" --type="baseline" --env="production"
uv run ners research train --name="random_forest_native" --type="baseline" --env="production"
uv run ners research train --name="random_forest_surname" --type="baseline" --env="production"
# svm
python train.py --name="svm" --type="baseline" --env="production"
python train.py --name="svm_native" --type="baseline" --env="production"
python train.py --name="svm_surname" --type="baseline" --env="production"
```
```bash
# naive bayes
python train.py --name="naive_bayes" --type="baseline" --env="production"
python train.py --name="naive_bayes_native" --type="baseline" --env="production"
python train.py --name="naive_bayes_surname" --type="baseline" --env="production"
uv run ners research train --name="naive_bayes" --type="baseline" --env="production"
uv run ners research train --name="naive_bayes_native" --type="baseline" --env="production"
uv run ners research train --name="naive_bayes_surname" --type="baseline" --env="production"
```
```bash
# transformer
python train.py --name="transformer" --type="baseline" --env="production"
python train.py --name="transformer_native" --type="baseline" --env="production"
python train.py --name="transformer_surname" --type="baseline" --env="production"
uv run ners research train --name="transformer" --type="baseline" --env="production"
uv run ners research train --name="transformer_native" --type="baseline" --env="production"
uv run ners research train --name="transformer_surname" --type="baseline" --env="production"
```
```bash
# xgboost
python train.py --name="xgboost" --type="baseline" --env="production"
python train.py --name="xgboost_native" --type="baseline" --env="production"
python train.py --name="xgboost_surname" --type="baseline" --env="production"
uv run ners research train --name="xgboost" --type="baseline" --env="production"
uv run ners research train --name="xgboost_native" --type="baseline" --env="production"
uv run ners research train --name="xgboost_surname" --type="baseline" --env="production"
```
## Web Interface
@@ -162,10 +132,18 @@ experiments and make predictions without needing to understand the underlying co
### Running the Web Interface
![web](./assets/web.png)
```bash
uv run ners web run --env="production"
```
```bash
docker compose run --rm --service-ports app ners web run --env=production
```
Then open: http://localhost:8501/
## Contributors
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
Binary file not shown (image, 38 KiB)

+930
@@ -0,0 +1,930 @@
[SVG content omitted: Matplotlib pie charts — "Distribution by Category" (Simple 77.4%, Compose 22.6%) and "Distribution by Sex" (Male 61.6%, Female 38.4%); 24 KiB]

@@ -0,0 +1,13 @@
compose,simple
0.2062165520477412,0.7937834479522587
0.6269061385346485,0.3730938614653515
0.09081330148566008,0.90918669851434
0.12423822403788959,0.8757617759621105
0.2612655252892886,0.7387344747107114
0.07622377139542966,0.9237762286045703
0.18062352012628255,0.8193764798737174
0.07679244621346286,0.9232075537865372
0.4611502742287561,0.5388497257712439
0.11962561930536533,0.8803743806946347
0.16090483213325235,0.8390951678667476
0.409646629226467,0.590353370773533
Binary file not shown (image, 20 KiB)

+487
@@ -0,0 +1,487 @@
[SVG content omitted: Matplotlib pie chart — Simple 77.4%, Compose 22.6%; 12 KiB]

Binary file not shown (image, 41 KiB)

File diff suppressed because it is too large (image, 44 KiB)

Binary file not shown (image, 49 KiB)

File diff suppressed because it is too large (image, 30 KiB)

+27
@@ -0,0 +1,27 @@
letter,Male,Female
a,0.1726871198212362,0.1780007719968084
b,0.06275449167118631,0.05757115683434764
c,0.002527913112031784,0.002525362815502787
d,0.02639274743484273,0.025798130028588412
e,0.060460557268468315,0.05992111228866155
f,0.004168185425527368,0.005738163668905593
g,0.03710295718248242,0.035944081768606244
h,0.015744753594548896,0.016324638692088497
i,0.07320872667180656,0.07954877144283247
j,0.00442530712700423,0.004397881604276826
k,0.06012485644271973,0.05719911396875115
l,0.04930645065793003,0.04598845291218479
m,0.08281339976187696,0.08014229460267776
n,0.08138893330151427,0.08430430794896865
o,0.06920807306069308,0.06452478894803111
p,0.009832203366545821,0.009371006578405026
q,5.0822826402147366e-05,8.43622136063042e-05
r,0.009139850680293098,0.010064380634131025
s,0.032639239825093015,0.034139532349508485
t,0.0277669772899704,0.027953179679053274
u,0.06917254296038988,0.06619473621457156
v,0.0035449558612418576,0.006171217790567778
w,0.013512780220408454,0.014295070954872152
x,4.796818419701171e-05,1.6707334940670683e-05
y,0.020592394840652214,0.020809516372185803
z,0.01138579141093724,0.012971260356926069
+7
@@ -0,0 +1,7 @@
sex,position,2-grams,3-grams,4-grams
female,prefix,"ka, ma, mu, mb, ng, ba, ki, lu, ts, bo","tsh, kab, ngo, mas, kas, kal, muk, kav, mbu, man","tshi, kavi, ngoy, kaso, ilun, mbuy, kaba, ntum, kavu, ngal"
female,suffix,"ba, ga, la, ka, ma, da, go, ya, bo, na","nga, mba, ngo, nda, ala, mbo, ngu, ndo, mbe, mbu","anga, amba, ongo, umba, inga, ombo, unga, enga, anda, ungu"
female,any,"ng, ka, mb, an, ba, ma, nd, ga, la, am","nga, mba, ang, ngo, amb, ong, nda, ala, mbo, eng","anga, amba, ongo, tshi, umba, inga, ombo, unga, anda, enga"
male,prefix,"ka, mu, ma, ba, mb, ng, ki, lu, ts, bo","kab, tsh, kal, kas, muk, ngo, kam, mut, mul, mbu","tshi, ngoy, ilun, kaba, kaso, kamb, muke, kabe, kalo, muto"
male,suffix,"ba, ga, la, go, ka, da, bo, le, di, ma","nga, mba, ngo, nda, ala, mbo, ngu, mbe, ndo, ele","amba, ongo, anga, umba, unga, ombo, anda, enga, onga, angu"
male,any,"ng, ka, mb, ba, an, ma, mu, nd, am, al","nga, mba, ngo, amb, ang, ong, ala, nda, shi, mbo","amba, ongo, anga, tshi, umba, unga, ombo, anda, lung, enga"
Binary file not shown (image, 37 KiB)

File diff suppressed because it is too large (image, 454 KiB)

+29
@@ -0,0 +1,29 @@
^,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,$
0.0,0.03240135083510582,0.09260231861008805,0.008075675043119726,0.017857766912447937,0.02028878683448995,0.008457996285223429,0.0056965765224603,0.0036982016623235744,0.026885400880147632,0.0030749451480349046,0.18878761331096985,0.060296922518691204,0.26743305983402327,0.1199608033359101,0.014603892627340732,0.012508764232704531,9.485640637203402e-06,0.005467323565586415,0.032579081785992364,0.042149993340081786,0.004793643382647348,0.005066031042208305,0.01062281917633087,8.287454451451392e-06,0.010728658956072298,0.005944601062910965,0.0
0.0,0.0009791886146586694,0.048581928219493954,0.0020218103912897157,0.020990537577026614,0.001131268956960644,0.006787613741763863,0.006186016110784558,0.011832275288244888,0.004851088661689449,0.0032179280340603345,0.04386672984621395,0.07634733982374604,0.08293271059721305,0.13517553734169577,0.001808420172771762,0.01144167917930759,6.458327509042543e-06,0.010475318749427155,0.046487395291047624,0.033763074574396215,0.004567187662011127,0.011274824306950683,0.008258962300707514,1.6278524132381206e-05,0.032413284125006325,0.011652857641829295,0.3829322859400618
0.0,0.39420804956524974,0.00012606255453160502,2.546718273365758e-07,0.0006208899150465718,0.11658188641534632,1.273359136682879e-06,5.781050480540271e-05,0.0016016311221197252,0.11411997386048364,6.366795683414395e-06,1.4261622330848246e-05,0.0003718208679114007,2.2156448978282096e-05,3.8200774100486376e-05,0.19131406078812782,6.366795683414395e-06,2.546718273365758e-07,0.0013390644681357156,1.884571522290661e-05,9.422857611453305e-06,0.14365426567670633,1.222424771215564e-05,0.02738409757801999,2.546718273365758e-07,0.004381374117498451,1.3242935021501942e-05,0.004095886999054148
0.0,0.07019293726020524,3.059582305823609e-05,0.004779067561696477,1.8357493834941653e-05,0.10123545933509158,0.0,7.954913995141383e-05,0.33171991359739567,0.3235936629931282,1.8357493834941653e-05,0.03783479479381475,0.015714014722710057,8.566830456306106e-05,4.283415228153053e-05,0.03822642132896017,2.4476658446588873e-05,0.00897069532067482,0.0032064422565031424,8.566830456306106e-05,0.013223514725769638,0.018045416439747646,6.119164611647218e-06,0.0037694054007746864,1.8357493834941653e-05,0.008328183036451864,1.2238329223294436e-05,0.02073784886887242
0.0,0.2424810241211335,1.5959776634059462e-05,5.911028382984986e-07,0.0006726750299836915,0.10030246732235734,9.457645412775978e-06,4.0786095842596404e-05,0.004887238267051987,0.3104158822239417,0.06585713162618893,1.5959776634059462e-05,2.6008524885133942e-05,0.00012176718468949073,7.152344343411834e-05,0.1405991300148426,1.773308514895496e-06,5.911028382984986e-07,0.009874964016614718,9.398535128946128e-05,1.4777570957462466e-05,0.0932293307592775,3.014624475322343e-05,0.00875600634371566,5.911028382984986e-07,0.004086293921157521,0.0022692437962279362,0.016124694325944745
0.0,0.006205216662872565,0.028941081239369993,0.0011474697534826386,0.01425752342098435,0.003730367449534814,0.004639925223263839,0.005866185675542982,0.003561493573938398,0.0016397191355402771,0.0038625407716201547,0.05415821120988892,0.10151501424777083,0.09480497248870047,0.17112338854414036,0.0027153276653648665,0.008680322523437667,5.646239001703881e-06,0.03422185458932722,0.03327456967317772,0.03469819184328915,0.002615748541152998,0.002961965650848386,0.005427832211228881,0.00028205530285784386,0.028868963368484594,0.01293322372785744,0.33786118926732095
0.0,0.23077320736794152,3.230986452473805e-06,6.46197290494761e-06,3.5540850977211856e-05,0.08830932171901404,0.0015282565920201096,3.877183742968566e-05,0.000132470444551426,0.14398568026804265,6.46197290494761e-06,9.692959357421415e-05,0.0038481048648963015,3.5540850977211856e-05,1.938591871484283e-05,0.08549513251890935,6.46197290494761e-06,6.46197290494761e-06,0.013518447317150399,0.0002003211600533759,0.0005040338865859135,0.3976407336924036,0.0,0.02652962976126241,0.0,0.004814169814185969,6.46197290494761e-06,0.0024587806903325652
0.0,0.38404120076639636,0.02137884680216012,5.907040897397653e-06,4.261508075979735e-05,0.09362322277181143,5.485109404726393e-06,4.514666971582492e-05,0.015463789206401714,0.07891806638923264,1.0548287316781523e-05,1.4345670750822873e-05,0.000878039436248894,2.9113272994317006e-05,0.0005134906265809245,0.2519884576420865,7.172835375411436e-06,8.438629853425219e-07,0.0015463789206401712,2.3206232096919352e-05,2.3628163589590613e-05,0.11810284411361265,5.485109404726393e-06,0.020353131343476286,2.5315889560275656e-06,0.002251004513401177,6.750903882740175e-06,0.010718747639820713
0.0,0.2614230396902226,4.453049370764763e-05,3.872216844143272e-06,2.5169409486931267e-05,0.09374152952565344,2.5169409486931267e-05,3.484995159728945e-05,2.420135527589545e-05,0.4197734753146176,1.1616650532429817e-05,9.486931268151016e-05,7.454017424975799e-05,0.0002468538238141336,0.0006030977734753146,0.1158296224588577,2.3233301064859633e-05,9.68054211035818e-07,0.002967086156824782,0.00020909970958373668,0.00020716360116166505,0.07815295256534366,1.1616650532429817e-05,0.007239109390125847,0.0,0.0033049370764762828,1.0648596321393998e-05,0.01591674733785092
0.0,0.057525431732821986,0.041996980300875106,0.0033912381338324768,0.016022951674192765,0.02875699673928052,0.007807629805802953,0.006448520094153173,0.005039572274652094,0.0003533603548473279,0.0023910036496201273,0.05831058618602033,0.08421986612340406,0.07337456279315632,0.10815890676433847,0.011699700072632914,0.009224952061459374,0.0004542620977921718,0.03307003561382166,0.050486207507906694,0.048087850694833095,0.002131396128804547,0.005388030520530441,0.005741799384458517,9.109752500688338e-05,0.017802008720851855,0.013278383415184936,0.3087466696297192
0.0,0.22479395580338848,3.1523973982213474e-05,4.2031965309617963e-05,0.00024168380053030328,0.10204660644420081,1.0507991327404491e-05,2.1015982654808982e-05,0.0002346784729787003,0.42807805335957994,9.457192194664042e-05,8.756659439503743e-05,3.502663775801497e-05,0.0007320567291425129,0.0001260958959288539,0.12674739139115296,2.8021310206411978e-05,0.0,3.502663775801497e-05,1.0507991327404491e-05,7.355593929183144e-05,0.06349628892772954,7.0053275516029945e-06,0.001986010360879449,0.0,0.00035727170513175267,1.0507991327404491e-05,0.05067303684452026
0.0,0.5034268610320017,6.895672925910058e-05,5.506050625251378e-06,1.3109644345836613e-05,0.07313660826232712,0.00021106527396796946,4.719471964501181e-06,0.00408968465012719,0.14606084028625171,6.292629286001574e-06,2.9627796221590745e-05,0.0002089677308726356,3.9853318811343305e-05,5.322515604409665e-05,0.12817613908388756,0.00323913092496931,2.8841217560840547e-06,0.0001966446651875492,0.0006470920449104952,2.0975430953338582e-05,0.10610631502055855,3.146314643000787e-06,0.021257812692547902,7.865786607501968e-07,0.01034980201815109,1.0225522589752559e-05,0.0026337275490785753
0.0,0.2907864656030218,0.0008378546668440517,4.703067334072724e-05,0.00035917260804733476,0.17203916946407904,0.00028798234223705586,9.696049777780069e-05,0.00010694646266521537,0.11111415343016283,4.831918493910333e-06,0.00016943927518645567,0.0027625688669183344,0.00026962105196019656,7.408941690662511e-05,0.17521728332147365,0.000927728350830784,3.5434068955342444e-06,9.663836987820666e-06,0.00025609168017724765,0.0001262741366408567,0.2173715845181466,0.0004770714192987469,0.010590921082852263,6.442557991880444e-07,0.004126136265899831,2.0938313473611443e-05,0.011915833133882475
0.0,0.2455144244761935,0.34861793685557824,2.345028152630318e-05,5.805835829496029e-05,0.02770991169645199,0.00440562708416741,6.392092867653609e-05,1.3805407672743e-05,0.04043547172794345,6.6190310759726715e-06,7.224199631490173e-05,7.98066032588705e-05,0.0006679547931524422,0.00019195190120320746,0.05560269776577443,0.03787277201050043,5.673455207976575e-07,1.7209480797528944e-05,0.0004308043654590213,0.00011952078971470652,0.19084103867348565,0.006140002341245849,0.037190633579328045,5.673455207976575e-07,0.0008513965115436847,0.0001529941754417683,0.00291861447415675
0.0,0.06852883077720875,0.00014519177304071664,0.0020409279683942802,0.189448848282653,0.02723591583598238,0.0006794974978305538,0.3949594662038515,0.00015362226308824212,0.04945662681480349,0.007555592524815616,0.040859213064337006,0.003013431831432164,4.027900800484397e-05,0.0018093705084222468,0.014132311483001883,5.18943498481013e-05,1.5924258978659246e-05,0.0002334309022048167,0.0345440266418473,0.03498709572990059,0.011673605896696897,0.0013887827271623645,0.0007604302022867985,0.0,0.03952251203124602,0.06717414469868305,0.009589026724278762
0.0,0.002811754741536618,0.022217453078125466,0.0009318958571949934,0.010439299379577709,0.0016556070980166227,0.007270623934115422,0.004762079642666141,0.004112736445620663,0.006897636060238115,0.0017462718427437218,0.0702396992225785,0.08710265314882101,0.10264970585602438,0.195075412409823,0.004091849124683534,0.008484842920460622,2.75437199170934e-06,0.007752409501665248,0.028939268392892802,0.03214650504623902,0.0037128934448241906,0.0025411376933511757,0.007407424409703652,2.2723568931602058e-05,0.045256168071778936,0.005312035917010772,0.33641715881938433
0.0,0.2755187546363871,0.00010093905364029487,3.204414401279202e-06,6.408828802558404e-06,0.20437434609918623,0.00369629201187556,3.204414401279202e-06,0.055202446890836816,0.1479045533126435,6.408828802558404e-06,0.0002483421160991382,0.0013074010757219146,6.729270242686325e-05,0.00011055229684413248,0.14242500468645605,0.0011712134636675483,0.0,0.0033470108421361265,0.0001538118912614017,0.0002675686025068134,0.14199881757108593,9.613243203837607e-06,0.012518044858597203,0.0,0.007613688617439385,8.011036003198005e-06,0.0019370685055732778
0.0,0.003864734299516908,0.0007246376811594203,0.0,0.0004830917874396135,0.0007246376811594203,0.0,0.0,0.0004830917874396135,0.00024154589371980676,0.0,0.0004830917874396135,0.0004830917874396135,0.0007246376811594203,0.0016908212560386474,0.0007246376811594203,0.0,0.0,0.00024154589371980676,0.001932367149758454,0.0007246376811594203,0.9835748792270531,0.0,0.00024154589371980676,0.0,0.0,0.0,0.0026570048309178746
0.0,0.2822393929187961,0.0012267882391033057,0.007813046583785243,0.008807168087886199,0.18842588750970124,0.00023592081521217416,0.004038313540390457,0.03821103686384766,0.19376257905381108,5.694640367190411e-05,0.000777725170147719,0.003924420733046649,0.0033565837364325194,0.004537814852598302,0.0694306823968904,0.0003530677027658055,0.00015294176986168533,0.0043604674811629435,0.0014366764126368952,0.011607304108438968,0.1015468270277394,0.0017295436315209734,0.0317907366098667,3.254080209823092e-06,0.008437829984071277,0.00010738464692416203,0.03162965963948045
0.0,0.23379934463089652,3.257301707105292e-05,0.0016072457280488111,4.281025100766955e-05,0.13016736016171107,7.445261044812096e-06,1.2098549197819656e-05,0.244367427355192,0.15211273242011003,1.535585090492495e-05,0.0005100003815696286,0.0002601188077531226,0.0006663508635106826,0.00011400555974868522,0.11456581564230732,0.001241497279222417,9.306576306015119e-06,0.00021265526859244547,0.016044072222754766,0.009141384576583351,0.07100824655726476,6.97993222951134e-06,0.0097877263010361,1.3959864459022679e-06,0.0037649754445984165,8.375918675413608e-06,0.010492699456216746
0.0,0.22270715194197135,1.4995917777938228e-05,0.004287166271403895,0.0006853689828879916,0.1378752450721748,7.386878016539942e-05,6.1094479836044635e-06,0.02331143188798605,0.08055140545073841,6.6648523457503235e-06,9.719576337552555e-05,2.221617448583441e-05,6.831473654394081e-05,4.6653966420252264e-05,0.14458064193636178,6.1094479836044635e-06,1.6662130864375809e-06,0.003284661397730618,0.19145788091019666,0.0033663058389660594,0.16493732261773184,8.331065432187904e-06,0.011131414226127331,4.443234897166882e-06,0.0022449444317935675,1.943915267510511e-05,0.009203050280756905
0.0,0.052226842718488986,0.03074039814488894,0.0014204842607524314,0.01637797660464469,0.022525374073740573,0.006430721206713545,0.012709093373135088,0.013169096520166011,0.009173231318596861,0.007254770344054158,0.09023383531205176,0.09354867756485294,0.11036573774681681,0.12381315549879986,0.0011981001392515705,0.007636325104666064,1.5917063911104562e-06,0.014746477553756472,0.05062194790299505,0.05958757523086564,0.0004966123940264624,0.005729233461488436,0.007166089559406575,7.64019067733019e-05,0.0306126068603455,0.018867178013714143,0.21327046547861703
0.0,0.18540060189078825,2.026568310551328e-05,0.0,1.68880692545944e-05,0.08221449874521645,3.3776138509188797e-06,4.053136621102656e-05,0.00018239114794961952,0.2600120918575863,3.3776138509188797e-06,3.3776138509188797e-06,9.119557397480976e-05,6.755227701837759e-06,4.390898006194544e-05,0.05670675894307707,0.0,3.3776138509188797e-06,0.0005843271962089662,1.013284155275664e-05,1.68880692545944e-05,0.3652484066106658,2.364329695643216e-05,0.011642634944117379,0.0,0.01438187977721259,3.37761385091888e-05,0.02330891318519119
0.0,0.5743844833643246,2.79704006032656e-05,0.0,2.0138688434351233e-05,0.2559571359204835,2.237632048261248e-06,1.5663424337828736e-05,0.00014768371518524237,0.09644529772813218,4.475264096522496e-06,2.1257504458481857e-05,3.915856084457184e-05,6.601014542370682e-05,7.94359377132743e-05,0.03731027677270805,1.3425792289567489e-05,5.59408012065312e-06,2.5732768555004354e-05,3.132684867565747e-05,3.692092879631059e-05,0.02981420941103287,7.831712168914368e-06,5.258435313413933e-05,2.237632048261248e-06,0.00013425792289567488,3.4683296748049344e-05,0.005319970194741118
0.0,0.2527091460771565,0.0013003901170351106,0.008669267446900737,0.0,0.045947117468573904,0.0,0.0,0.0034677069787602947,0.2492414390983962,0.0,0.0,0.0008669267446900737,0.0,0.00043346337234503684,0.03034243606415258,0.002600780234070221,0.0,0.0,0.002600780234070221,0.007368877329865626,0.014304291287386216,0.0,0.0013003901170351106,0.009102730819245773,0.0039011703511053317,0.0,0.3658430862592111
0.0,0.37652174563292656,0.0002512562814070352,0.00022957047140464225,0.0004613843024647045,0.13366460277578368,3.514596793491266e-05,9.496889207944485e-05,8.001316104331179e-05,0.20418461354391002,6.730078966259871e-06,0.0003701543431442929,0.0011328966259870784,0.0010469011725293131,0.0007941493180186647,0.10861898779612347,0.00036940655659248626,7.477865518066523e-07,0.0004120303900454654,0.0007156317300789662,0.0002736898779612347,0.05130638310600622,0.0006640344580043072,0.0017565506101938262,3.7389327590332615e-06,3.0659248624072745e-05,0.00010543790380473797,0.11686856903565446
0.0,0.42030782462700017,0.00013905412295474562,5.150152702027615e-06,0.00033475992563179497,0.11766811385957594,3.8626145265207114e-06,1.1587843579562133e-05,0.00012746627937518349,0.2313873481348722,6.308937059983828e-05,0.00010042797768953849,3.090091621216569e-05,9.656536316301779e-05,0.0001660924246403906,0.11382481240568783,3.8626145265207114e-06,0.0,0.0002060061080811046,7.338967600389351e-05,3.3475992563179495e-05,0.09470873311393682,0.00025750763510138075,0.005945851294490882,0.0,0.00251584959494049,0.0002600827114523946,0.011728185240692387
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
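The rows above come from a letter-transition matrix: each row is a source state ('^' marks the start of a name, 'a'–'z' are letters, '$' marks the end) and each column holds the probability of moving to that state next. As a minimal sketch of how such a file might be consumed (the path and function names below are illustrative, not taken from this repository), the matrix can be loaded with the standard csv module and used to score how plausible a name is under the chain:

```python
import csv
import math

def load_transition_matrix(path):
    """Load a transition-matrix CSV whose header lists the states
    ('^', 'a'..'z', '$') and whose data rows appear in the same order."""
    with open(path, newline="") as f:
        reader = csv.reader(f)
        states = next(reader)
        return {src: {dst: float(p) for dst, p in zip(states, row)}
                for src, row in zip(states, reader)}

def name_log_likelihood(matrix, name, floor=1e-12):
    """Log-probability of a name under the first-order letter Markov chain.
    Unseen transitions are floored so the log never hits -inf."""
    chain = ["^"] + [c for c in name.lower() if c.isalpha()] + ["$"]
    return sum(math.log(max(matrix[a].get(b, 0.0), floor))
               for a, b in zip(chain, chain[1:]))

# Hypothetical usage: compare two candidate names under the same chain.
# probs = load_transition_matrix("transition_matrix.csv")
# print(name_log_likelihood(probs, "Marie"), name_log_likelihood(probs, "Xqzt"))
```

Higher (less negative) scores indicate letter sequences that are more typical of the names the matrix was estimated from.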
Binary file not shown (image added; 224 KiB).

+27
View File
@@ -0,0 +1,27 @@
letter,Male,Female
a,0.10419923843992125,0.11230333493368445
b,0.018546448249013497,0.015225074098649389
c,0.041480767577953326,0.04405987823025858
d,0.03868872722439995,0.026961499859145175
e,0.13138825792008238,0.18038344738539613
f,0.010247500256025038,0.007027086508994281
g,0.0180527572420696,0.017807501663033867
h,0.031508761634381516,0.03697185263156448
i,0.09337919525658041,0.10271520299704247
j,0.026664242619993696,0.012183083323972286
k,0.012803255631156278,0.004848323140290566
l,0.0509726758057992,0.06672341587576307
m,0.03320386129622267,0.030360801411648666
n,0.07989188838489009,0.08206189166389144
o,0.057005660062330925,0.03761362219276409
p,0.021467218695097115,0.011531157247822707
q,0.0018784195453980996,0.001950967247682959
r,0.0734505638264324,0.06822482369855525
s,0.05242163399917173,0.0432875249054165
t,0.03949576796436023,0.04783038894946737
u,0.031398878017324786,0.020461941209356852
v,0.011932256018971252,0.013217596918162086
w,0.0020867055933035655,0.0015356192415710282
x,0.002258056881804058,0.0006618534967773684
y,0.013323645811687641,0.011759717016584062
z,0.002253616045629267,0.002292394152504883
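This table gives, for each letter, its relative frequency in male and female names. A small sketch (with illustrative path and function names, not the project's actual API) that loads the table and reports the letters most skewed toward one group:

```python
import csv

def load_letter_frequencies(path):
    """Read a letter,Male,Female CSV into {letter: (male, female)}."""
    with open(path, newline="") as f:
        return {row["letter"]: (float(row["Male"]), float(row["Female"]))
                for row in csv.DictReader(f)}

def most_skewed(freqs, top=3):
    """Letters whose female-minus-male frequency difference is largest
    in either direction."""
    diff = sorted(freqs.items(), key=lambda kv: kv[1][1] - kv[1][0])
    return {"male_leaning": [letter for letter, _ in diff[:top]],
            "female_leaning": [letter for letter, _ in diff[-top:][::-1]]}

# On the table above this would flag 'o', 'j', 'd' as male-leaning
# and 'e', 'l', 'i' as female-leaning.
```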
+7
View File
@@ -0,0 +1,7 @@
sex,position,2-grams,3-grams,4-grams
female,prefix,"ma, ch, be, na, an, sa, es, jo, ju, me","mar, cha, est, chr, gra, dor, sar, rut, ben, mer","mari, chri, esth, grac, sara, dorc, ruth, rach, naom, jean"
female,suffix,"ne, ie, te, le, ce, ia, se, el, ah, th","ine, tte, lle, rah, nce, ene, nne, her, lie, rie","ette, line, tine, elle, ther, arie, ille, rcas, ruth, arah"
female,any,"ne, in, el, an, ie, ri, ra, li, ar, er","ine, tte, ett, mar, ari, lle, lin, eli, the, ell","ette, line, ther, mari, tine, elle, rist, chri, ance, hris"
male,prefix,"jo, je, pa, ch, ma, da, al, ju, be, fr","jea, jos, chr, pat, mar, jon, fra, dan, cha, ben","jean, chri, jose, jona, patr, fran, mich, emma, davi, dieu"
male,suffix,"in, el, an, ck, on, re, ce, er, se, is","ean, tin, ick, ier, ard, uel, ert, ain, iel, ise","jean, stin, rick, bert, seph, than, oise, avid, ndre, tian"
male,any,"an, er, el, ie, in, ri, is, on, en, re","ric, sti, jea, ean, tin, ris, ier, ist, ick, jos","jean, stin, rist, rick, chri, hris, usti, bert, jose, atha"
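The table above lists the ten most common 2-, 3- and 4-grams per sex, split by where they occur in the name (prefix, suffix, or anywhere). A minimal sketch of how such counts could be produced from a list of names (toy input and function name; not the project's corpus or code):

```python
from collections import Counter

def top_ngrams(names, n, position="any", top=10):
    """Most frequent n-grams at a given position: 'prefix', 'suffix' or 'any'."""
    counts = Counter()
    for name in names:
        name = name.lower()
        if len(name) < n:
            continue
        if position == "prefix":
            counts[name[:n]] += 1
        elif position == "suffix":
            counts[name[-n:]] += 1
        else:  # every overlapping n-gram inside the name
            counts.update(name[i:i + n] for i in range(len(name) - n + 1))
    return [gram for gram, _ in counts.most_common(top)]

# Toy example: top_ngrams(["marie", "martha", "marc"], 3, "prefix") -> ['mar']
```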
Binary file not shown (image added; 38 KiB).
File diff suppressed because it is too large (image; 455 KiB).

+29
View File
@@ -0,0 +1,29 @@
^,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,$
0.0,0.08177487512565312,0.05064544871095967,0.07327059962450833,0.06495363065149523,0.061627841497941024,0.044425593979353147,0.05752526940789172,0.024078074472915836,0.016592203335813446,0.11910658380449511,0.017117380488251007,0.03302685352589563,0.08869583137140728,0.035182675783582286,0.013984289415344039,0.05503217558728984,0.00021346554218849907,0.05022750354744187,0.05679281701430199,0.024045325783562315,0.0015048422132203263,0.015574997094552255,0.0054400764881583534,0.00016613969233005726,0.0062853521103135605,0.002710153731134059,0.0
0.0,0.005939042406371192,0.02544883058808627,0.060741665942489004,0.03423882544765226,0.014545218880858291,0.0021159734936245364,0.009683458260117744,0.022014608259160358,0.0375319435995705,0.0012071918943162118,0.00483524835366186,0.06685892969977804,0.04498424729464408,0.22067338212572413,0.008778800791243883,0.010449662778691797,0.0001958961956369646,0.11578879149722136,0.04779808257392872,0.05734632827194516,0.041184450260777605,0.016855321085647877,0.0012893799222601413,0.0007600183229223588,0.005051176039980499,0.003479293182959682,0.1402042328307295
0.0,0.10740025063072939,0.008207325873470239,4.244850350569272e-05,0.003728085960065186,0.4237910943039645,3.691174217886323e-06,0.00013288227184390762,0.00039495564131383656,0.12999208243130264,3.1374980852033744e-05,2.7683806634147422e-05,0.046482956925842464,3.3220567960976905e-05,0.0004447864932553019,0.11562418678818012,9.227935544715807e-06,1.8455871089431614e-06,0.10180812169063161,0.0005019996936325399,4.060291639674955e-05,0.021691185291408975,2.214704530731794e-05,0.0056788715342181075,1.8455871089431614e-06,0.012356205694374466,1.291910976260213e-05,0.021538001561366694
0.0,0.10050204481617504,5.957221194601566e-06,0.012163156374077748,5.7338253998040075e-05,0.21854065952395846,8.191179142577154e-06,1.5637705635829112e-05,0.2589321085286557,0.08507730983805294,2.0105621531780285e-05,0.1057168473193994,0.04785063459298775,4.393450631018655e-05,2.978610597300783e-05,0.049474722021166005,5.957221194601566e-06,0.022256178383031452,0.010718530234386868,0.00032913647100173653,0.026146243823106274,0.0030076520506244655,1.1169789739877937e-05,1.71270109344795e-05,4.021124306356057e-05,0.017885067331492553,5.957221194601566e-06,0.04113833561197044
0.0,0.13123437278970704,4.496701950163427e-05,2.716757428223737e-05,0.01997097753616332,0.19341720307012325,3.6535703345077846e-05,0.0005414778598321794,0.001204741397481285,0.22505150128952298,0.023330388618097914,1.405219359426071e-05,0.000277296620260078,0.0018839307545372192,0.0003981454851707201,0.12731568440272087,7.494503250272378e-06,0.0,0.09698730337468113,0.00034755758823138154,4.215658078278213e-05,0.0051580918619999645,0.0001564477553494359,0.0013602523399244366,9.368129062840473e-07,0.04583919231738472,0.00014333237466145925,0.12520879217648806
0.0,0.031895413152132776,0.012309605339111969,0.00870721680660756,0.0298675163088017,0.01029528350039639,0.0020813614768936716,0.00574201810613231,0.0009116137714871438,0.010444608551166907,0.00043419130147119637,0.0026210201219160722,0.12830321631540123,0.027835024848523834,0.08354099609624636,0.013881382027362197,0.017625577146542382,3.717462802398891e-05,0.11198919346094116,0.05457966355708561,0.03163080498524293,0.014952763160862552,0.010523970116611377,0.0003372866531390005,0.007001151995776294,0.0020375037696743585,0.0024278373639262424,0.37798660543852275
0.0,0.24238728488124023,1.7778409899018633e-05,2.133409187882236e-05,4.977954771725217e-05,0.09607097141231688,0.010471483430521974,2.133409187882236e-05,0.00032712274214194285,0.18898094154458825,7.111363959607453e-06,5.689091167685962e-05,0.08809913241359693,7.111363959607453e-05,0.00019556250888920495,0.025647134120324277,1.066704593941118e-05,0.0,0.3050632911392405,0.00016711705305077514,0.0022969705589532072,0.02134475892476177,3.5556819798037263e-06,0.0007609159436779974,0.0,0.002400085336367515,3.5556819798037263e-06,0.015524107523823069
0.0,0.1356264584296794,0.001080252906268644,1.588607215100947e-05,0.0006989871746444167,0.31163178202896913,1.4120953023119531e-05,0.0013097183928943365,0.009215686966713384,0.0887413641546668,1.9416310406789355e-05,1.0590714767339647e-05,0.09359544175636414,8.649083726660713e-05,0.014772281981310918,0.05647322137771078,3.5302382557798828e-06,3.5302382557798828e-06,0.12695442815435615,0.00037420525511266755,0.00015709560238220477,0.14264104184391405,2.1181429534679295e-05,0.002757116077764088,8.825595639449706e-06,0.004749935573151832,3.883262081357871e-05,0.00899857731398292
0.0,0.23289963854076665,3.564871753738659e-05,4.690620728603499e-05,6.848306263761108e-05,0.26556981191549,1.7824358768693296e-05,3.940121412026939e-05,5.159682801463849e-05,0.12588969348669787,1.1257489748648397e-05,0.00019419169816418486,0.0004596808314031429,0.0005131539077092228,0.01605974725059266,0.07720574094452215,3.0019972663062393e-05,2.814372437162099e-06,0.09786511088158341,0.00017824358768693297,0.004121179372151034,0.016572901158301883,8.536929726058368e-05,0.00026642725738467874,0.0,0.010226491312501348,7.504993165765598e-06,0.15158116134140495
0.0,0.06228411243453732,0.005795446918619388,0.09845124598199072,0.03328635202876481,0.18937896710200938,0.005852137258390008,0.010564929010010733,0.0003991130242471751,0.00012739036120869017,0.0015853746742749012,0.004633946566422507,0.05203358674566824,0.029307928241758888,0.20644406259916737,0.010186993411539938,0.0038761205301784834,0.008475857408808376,0.03351050693544404,0.1267853384445865,0.034354671707545616,0.0007083034405823336,0.02195903569080437,0.0006861485951547353,0.002158142589888398,0.000965039002302149,0.0035929946378757934,0.052596254658219155
0.0,0.12490546761508407,4.295183412001663e-05,8.130168601288861e-05,4.295183412001663e-05,0.29643208317929476,7.669970378574399e-06,4.601982227144639e-06,0.0029621425602054325,0.02271998625541308,4.295183412001663e-05,3.374786966572735e-05,1.8407928908578555e-05,6.902973340716958e-05,5.6757780801450546e-05,0.36199192198719726,0.00021475917060008315,7.669970378574399e-06,0.00026691496917438906,3.988384596858687e-05,4.601982227144639e-06,0.1879909739788585,1.8407928908578555e-05,5.0621804498591024e-05,0.0,0.0005844517428473692,3.067988151429759e-06,0.0014066725674305445
0.0,0.17051293593552538,0.00010003301089359489,0.00018339385330492396,4.334763805389112e-05,0.16333056575336527,4.001320435743795e-05,1.0003301089359488e-05,0.0020973587950690394,0.09480795329125279,3.0009903268078467e-05,0.00013004291416167336,0.0040513369411905925,0.000266754695716253,0.00026342026201979984,0.04065008119346051,0.0005101683555573339,6.668867392906325e-06,0.002400792261446277,0.022800857616346728,0.00011337074567940754,0.031060249882461213,6.33542402326101e-05,0.007512479118108976,3.667877066098479e-05,0.03316094311122671,3.0009903268078467e-05,0.42578717643489017
0.0,0.14258031599664123,0.018121651433974158,0.0015416351691170474,0.010372152272561979,0.18043688111447329,0.003440854764523217,0.003236993073110312,0.0002837534353449891,0.16829829483469572,4.903700144796899e-05,0.0011846017203722846,0.08546157592801147,0.0018463257511476862,0.00020716755667906,0.0888495368482762,0.0208693764476935,4.297624846001776e-05,0.00014490709416647014,0.005683884347609529,0.0020215366102539125,0.020202693619018865,0.024516847791351416,0.001074406211500444,9.366618254106435e-06,0.0360691939639308,0.00015206980224313977,0.18330196434514115
0.0,0.31932204396028435,0.038368037519108326,0.00011873135546683685,0.00011081593176904773,0.17818311343296872,0.0006362021797098007,5.2439681997852944e-05,5.936567773341842e-05,0.20796192681201364,2.374627109336737e-05,0.00010092165214681132,0.00023053671519810822,0.04005699105062408,0.00020184330429362263,0.08672138203297763,0.010948020402004582,2.968283886670921e-06,0.0001612767578424534,0.005299376165669818,0.0002523041303670283,0.047970435892488755,0.0004581051465095455,0.006624220207087273,9.894279622236404e-07,0.018100595140919277,0.00036806720194719424,0.03766554366592954
0.0,0.09272466504327583,6.54951531625727e-05,0.0676682588133141,0.04243183895879178,0.2208676970453058,0.0016707146854644294,0.033734710117275545,0.0024899923798154134,0.0862990806755178,0.006808358436540491,0.00222879614025689,0.0004584660721380089,4.667019896015659e-05,0.05049048810360941,0.03840682843838769,0.0003961084113425055,0.0005533752162418567,0.005650231566546017,0.015463131131101295,0.04694629828845869,0.019948568656372817,0.007369969570245905,0.00011177316557684563,1.686402147299776e-05,0.014610517580350198,0.003478302475442259,0.23906279965503272
0.0,0.003164071575712822,0.016308332918058378,0.029613095334402047,0.041853890953758216,0.032495642848868175,0.0018930066187277572,0.011876432294110452,0.012510670409103,0.07853874127032717,0.0004316702272449279,0.00471407186694461,0.0692342092506865,0.04876514486192701,0.2053281179113374,0.0007785596452408519,0.01644424108555678,7.118999249916351e-05,0.17295026214096784,0.10292454961004063,0.022388605459236936,0.03825103015155055,0.011361275621116504,0.0008639876362398481,0.0006983091082417949,0.007007036807167667,0.002350564115972381,0.06718329028496059
0.0,0.26933300206615723,2.192209253680628e-05,0.0001096104626840314,0.00010778362163929754,0.08803364310467982,8.03810059682897e-05,2.5575774626273995e-05,0.34018520514511513,0.09085245883670416,2.9229456715741707e-05,2.5575774626273995e-05,0.04719826523174392,3.6536820894677135e-05,0.00012422519104190227,0.0354936946581341,0.013204407071336317,3.6536820894677134e-06,0.08679504487635026,0.003801656214091156,0.001415801809668739,0.0039459766566251305,1.8268410447338567e-06,0.0004694981484966012,3.6536820894677134e-06,0.01590082445336349,0.0,0.0028005473215770025
0.0,0.00038185048063353973,4.9806584430461705e-05,0.00014941975329138513,4.9806584430461705e-05,0.0014941975329138511,0.0,4.9806584430461705e-05,3.320438962030781e-05,0.00029883950658277027,0.0,0.0,3.320438962030781e-05,0.00011621536367107731,4.9806584430461705e-05,0.00018262414291169292,0.0,8.30109740507695e-05,8.30109740507695e-05,0.00023243072734215463,9.961316886092341e-05,0.9891919711785898,4.9806584430461705e-05,4.9806584430461705e-05,0.0,0.00011621536367107731,0.0,0.007205352547606794
0.0,0.1627793575149433,0.0035916362881505785,0.036496892212239035,0.040730844643505654,0.1503575411693532,0.0012619623047103333,0.015429811736916198,0.0012770756257248282,0.23819394035839017,0.00021692060750216365,0.000525410159974503,0.026246059979436992,0.01374067585882558,0.020855493981119907,0.0587779279725791,0.0009934786019822454,0.00013068577583122155,0.016929586592884027,0.006149343615133057,0.04790433801209155,0.02665767572236118,0.017815938419439997,0.00039161281805206186,2.5781547612962073e-05,0.01068822952333919,0.00010534873765986227,0.1017264302202411
0.0,0.11064837045719457,0.00019155378133028236,0.03693078718830975,0.00084374879871672,0.1936315534685894,6.319971696951493e-05,3.192563022171373e-05,0.02285614506485138,0.058747068868796735,1.1076239056512926e-05,0.0013578165996337023,0.008225084814671716,0.006192269176064638,0.0010717890145861037,0.04918110758481304,0.02873958263428148,0.0002521473244041472,0.005270335160478415,0.04403717185827366,0.20958133770996804,0.02021609090855783,8.274602118689069e-05,0.0006600135390733879,1.954630421737575e-06,0.024633555661684747,5.863891265212725e-06,0.17653570425659867
0.0,0.06351954441846226,1.4753288139094001e-05,0.0013484505359131916,1.99169389877769e-05,0.16353651069982222,1.84416101738675e-05,3.39325627199162e-05,0.18805131193614777,0.2149259016103214,1.4753288139094001e-05,1.69662813599581e-05,0.0001386809085074836,0.00011802630511275201,0.00019253041021517672,0.07676357118092694,5.38495017076931e-05,2.2129932208641e-06,0.06875179805699196,0.010426886392304685,0.08126775004979235,0.009717990897221218,0.0003732581899190782,0.0005030871255431055,1.10649661043205e-05,0.008581250046104025,3.98338779755538e-05,0.11155772592816623
0.0,0.018356080355773654,0.01654418269968032,0.045542802543262595,0.07255847970462294,0.18801741120403903,0.0036002028947896146,0.04503556556922605,0.002494898139737888,0.03139678907199226,0.0007679331862739315,0.011800927182003705,0.10299977587203472,0.02639873544642752,0.06185106106897243,0.0003314734644285327,0.009195144680499687,9.672891132789921e-05,0.06532858339329739,0.1206587002937256,0.050628148113196415,0.00014745260873155367,0.0076580986870819715,0.0010781734750451204,0.0062708645441355145,0.020688190815471907,0.010810045651327663,0.07974355042289408
0.0,0.12830504252517705,1.01674063454783e-05,3.304407062280448e-05,0.00014742739200943535,0.30917049215330417,5.08370317273915e-06,7.625554759108725e-06,0.00013471813407758747,0.5037873588636906,2.541851586369575e-06,7.625554759108725e-06,0.0008693132425383947,2.541851586369575e-05,0.00019572257215045728,0.04173211934501568,5.08370317273915e-06,0.0,0.00263081639189251,5.8462586486500224e-05,2.7960367450065326e-05,0.00580050532009537,4.321147696828278e-05,0.00031010589353708815,0.0,0.002524058625264988,5.08370317273915e-06,0.004161011046886994
0.0,0.36613233287858116,0.0001534788540245566,1.7053206002728514e-05,0.0002899045020463847,0.16732605729877217,1.7053206002728514e-05,6.821282401091406e-05,0.001892905866302865,0.4231412005457026,1.7053206002728514e-05,5.1159618008185536e-05,0.0003751705320600273,0.00042633015006821284,0.0036493860845839016,0.013250341064120055,1.7053206002728514e-05,5.1159618008185536e-05,0.0007162346521145975,0.0006309686221009549,0.0020804911323328784,0.007929740791268758,3.410641200545703e-05,3.410641200545703e-05,0.0,0.004280354706684857,6.821282401091406e-05,0.007349931787175989
0.0,0.42772328342798144,3.971248163297724e-05,0.012668281640919741,0.00015884992653190897,0.0298439299471824,1.985624081648862e-05,3.971248163297724e-05,0.0013899368571542036,0.19326079186688377,0.0,3.971248163297724e-05,0.00013899368571542036,0.00013899368571542036,0.00011913744489893174,0.02454231364917994,0.00230332393471268,3.971248163297724e-05,1.985624081648862e-05,0.0007148246693935904,0.002521742583694055,0.0016480679877685556,1.985624081648862e-05,0.0005956872244946587,7.942496326595449e-05,0.005877447281680632,0.00011913744489893174,0.29593741312894645
0.0,0.07903953601739935,0.0008749671887304226,0.020399235028686423,0.025884029348899416,0.017196855117933077,0.0011899553766733747,0.0022049173156006648,0.00033248753171756057,0.011642063422621652,9.499643763358873e-05,0.0003999850005624789,0.07594465207554717,0.013214504456082897,0.04049848130695099,0.024181593190255365,0.006899741259702761,5.749784383085634e-05,0.01051710560853968,0.03773608489681637,0.0037698586303013637,0.0075047185730535105,0.028846418259315276,0.00024249090659100284,0.00047998200067497467,6.749746884491831e-05,0.0010774595952651777,0.5897028861417697
0.0,0.40388373911101183,0.0005723698905517087,0.0004886084431538977,0.0008794951976770158,0.21811480902389993,2.792048246593701e-05,0.00011168192986374804,0.000991177127540764,0.20157192316283226,2.792048246593701e-05,0.00015356265356265356,0.0005584096493187402,0.0017589903953540316,0.0011168192986374804,0.07377987491623855,5.584096493187402e-05,2.792048246593701e-05,0.004844203707840071,6.980120616484253e-05,4.1880723698905516e-05,0.027669198123743577,6.980120616484253e-05,0.0018846325664507483,1.3960241232968505e-05,0.018092472637927185,0.004606879606879607,0.03858610676792495
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
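This second matrix has the same layout ('^' start state, letters, '$' end state), presumably estimated on a different subset of the data. Beyond scoring, the same structure supports sampling; the sketch below (illustrative only, reusing the loader shape from the earlier example) walks the chain from '^' to draw synthetic names:

```python
import csv
import random

def load_transition_matrix(path):
    """Same loader as in the scoring sketch above."""
    with open(path, newline="") as f:
        reader = csv.reader(f)
        states = next(reader)
        return {src: dict(zip(states, map(float, row)))
                for src, row in zip(states, reader)}

def sample_name(matrix, max_len=12, rng=random):
    """Walk the chain from '^' until it emits '$' (or reaches max_len letters)."""
    state, letters = "^", []
    while len(letters) < max_len:
        nxt = rng.choices(list(matrix[state]),
                          weights=list(matrix[state].values()))[0]
        if nxt in ("^", "$"):
            break
        letters.append(nxt)
        state = nxt
    return "".join(letters).capitalize()

# Hypothetical usage:
# probs = load_transition_matrix("transition_matrix.csv")
# print([sample_name(probs) for _ in range(5)])
```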
27 0.0 0.07903953601739935 0.0008749671887304226 0.020399235028686423 0.025884029348899416 0.017196855117933077 0.0011899553766733747 0.0022049173156006648 0.00033248753171756057 0.011642063422621652 9.499643763358873e-05 0.0003999850005624789 0.07594465207554717 0.013214504456082897 0.04049848130695099 0.024181593190255365 0.006899741259702761 5.749784383085634e-05 0.01051710560853968 0.03773608489681637 0.0037698586303013637 0.0075047185730535105 0.028846418259315276 0.00024249090659100284 0.00047998200067497467 6.749746884491831e-05 0.0010774595952651777 0.5897028861417697
28 0.0 0.40388373911101183 0.0005723698905517087 0.0004886084431538977 0.0008794951976770158 0.21811480902389993 2.792048246593701e-05 0.00011168192986374804 0.000991177127540764 0.20157192316283226 2.792048246593701e-05 0.00015356265356265356 0.0005584096493187402 0.0017589903953540316 0.0011168192986374804 0.07377987491623855 5.584096493187402e-05 2.792048246593701e-05 0.004844203707840071 6.980120616484253e-05 4.1880723698905516e-05 0.027669198123743577 6.980120616484253e-05 0.0018846325664507483 1.3960241232968505e-05 0.018092472637927185 0.004606879606879607 0.03858610676792495
29 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
+3
View File
@@ -0,0 +1,3 @@
category,l2,kl_mf,kl_fm,jsd,permutation_p_value
names,0.3189041485139616,0.04320097944655348,0.0215380760498496,0.03236952774820154,0.978
surnames,1.2770018925640299,0.2936188220992242,0.23989460296618093,0.26675671253270256,0.001
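The summary above reports, per category, the L2 distance, the two directed KL divergences (male-to-female and female-to-male), the Jensen-Shannon divergence and a permutation p-value between the male and female name distributions. Below is a minimal sketch of how such metrics can be computed from two aligned frequency vectors; the function names and the choice of squaring SciPy's Jensen-Shannon distance are illustrative, not the project's actual implementation.

import numpy as np
from scipy.spatial.distance import jensenshannon
from scipy.stats import entropy


def divergence_metrics(p, q, eps=1e-12):
    # p, q: aligned frequency vectors over the same categories (e.g. names or letters)
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p, q = p / p.sum(), q / q.sum()
    return {
        "l2": float(np.linalg.norm(p - q)),
        "kl_mf": float(entropy(p, q)),           # KL(male || female)
        "kl_fm": float(entropy(q, p)),           # KL(female || male)
        "jsd": float(jensenshannon(p, q) ** 2),  # squared distance = divergence
    }


def permutation_p_value(values_m, values_f, n_perm=1000, seed=0):
    # Shuffle the pooled values and count how often the permuted L2 reaches the observed one
    rng = np.random.default_rng(seed)
    values_m, values_f = np.asarray(values_m), np.asarray(values_f)
    pooled = np.concatenate([values_m, values_f])
    n_m = len(values_m)

    def l2(a, b):
        cats = np.union1d(a, b)
        return np.linalg.norm(np.array([(a == c).mean() - (b == c).mean() for c in cats]))

    observed = l2(values_m, values_f)
    hits = 0
    for _ in range(n_perm):
        perm = rng.permutation(pooled)
        if l2(perm[:n_m], perm[n_m:]) >= observed:
            hits += 1
    return (hits + 1) / (n_perm + 1)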
BIN
View File
Binary file not shown. (new image, 48 KiB)
BIN
View File
Binary file not shown. (new image, 158 KiB)
+23
View File
@@ -0,0 +1,23 @@
services:
  app:
    build:
      context: .
      dockerfile: Dockerfile
    image: drc-ners:uv
    working_dir: /app
    tty: true
    stdin_open: true
    environment:
      NERS_ENV: production
      STREAMLIT_SERVER_ADDRESS: 0.0.0.0
      PYTHONPATH: /app/src
    # expose Streamlit for `ners web run`
    ports:
      - "8501:8501"
    volumes:
      - ./src:/app/src
      - ./assets:/app/assets
      - ./config:/app/config
      - ./data:/app/data
    # default command shows CLI help; override per run
    command: ["ners", "--help"]
+1 -1
View File
@@ -30,7 +30,7 @@ llm:
# Data handling configuration
data:
split_evaluation: false
max_dataset_size: 100_000
max_dataset_size: 10_000
balance_by_sex: true
# Enhanced logging for development
-65
View File
@@ -73,37 +73,6 @@ baseline_experiments:
batch_size: 32
tags: [ "baseline", "neural", "cnn", "surname" ]
## Ensemble Models
- name: "ensemble"
description: "Baseline Ensemble with multiple models"
model_type: "ensemble"
features: [ "full_name" ]
model_params:
base_models: [ "logistic_regression", "random_forest", "xgboost" ]
voting: "soft"
cv_folds: 5
tags: [ "baseline", "ensemble" ]
- name: "ensemble_native"
description: "Baseline Ensemble with native name"
model_type: "ensemble"
features: [ "native_name" ]
model_params:
base_models: [ "logistic_regression", "random_forest", "xgboost" ]
voting: "soft"
cv_folds: 5
tags: [ "baseline", "ensemble", "native" ]
- name: "ensemble_surname"
description: "Baseline Ensemble with surname"
model_type: "ensemble"
features: [ "surname" ]
model_params:
base_models: [ "logistic_regression", "random_forest", "xgboost" ]
voting: "soft"
cv_folds: 5
tags: [ "baseline", "ensemble", "surname" ]
# LightGBM Models
- name: "lightgbm"
description: "Baseline LightGBM with engineered features"
@@ -262,40 +231,6 @@ baseline_experiments:
min_samples_leaf: 1
tags: [ "baseline", "random_forest", "engineered", "surname" ]
# SVM Models
- name: "svm"
description: "Baseline SVM with full name features"
model_type: "svm"
features: [ "full_name" ]
model_params:
C: 1.0
kernel: "rbf"
ngram_range: [ 2, 4 ]
max_features: 5000
tags: [ "baseline", "svm" ]
- name: "svm_native"
description: "Baseline SVM with native name features"
model_type: "svm"
features: [ "native_name" ]
model_params:
C: 1.0
kernel: "rbf"
ngram_range: [ 2, 4 ]
max_features: 5000
tags: [ "baseline", "svm", "native" ]
- name: "svm_surname"
description: "Baseline SVM with surname features"
model_type: "svm"
features: [ "surname" ]
model_params:
C: 1.0
kernel: "rbf"
ngram_range: [ 2, 4 ]
max_features: 5000
tags: [ "baseline", "svm", "surname" ]
# Transformer Models
- name: "transformer"
description: "Baseline Transformer with attention mechanism"
-90
View File
@@ -1,90 +0,0 @@
#!.venv/bin/python3
import argparse
import sys
import traceback
from pathlib import Path
from core.config import setup_config
from processing.monitoring.pipeline_monitor import PipelineMonitor
def main():
choices = [
"data_cleaning",
"data_selection",
"feature_extraction",
"ner_annotation",
"llm_annotation",
"data_splitting",
]
parser = argparse.ArgumentParser(description="DRC NERS Processing Monitoring")
parser.add_argument("--config", type=Path, help="Path to configuration file")
parser.add_argument("--env", type=str, default="development", help="Environment")
subparsers = parser.add_subparsers(dest="command", help="Available commands")
# Clean command
clean_parser = subparsers.add_parser("clean", help="Clean checkpoint files")
clean_parser.add_argument("--step", type=str, choices=choices, help="default: all")
clean_parser.add_argument("--keep-last", type=int, default=1, help="(default: 1)")
clean_parser.add_argument("--force", action="store_true", help="Clean without confirmation")
# Reset command
reset_parser = subparsers.add_parser("reset", help="Reset pipeline step")
reset_parser.add_argument("--step", type=str, choices=choices, help="(default: all)")
reset_parser.add_argument("--all", action="store_true", help="Reset all steps")
reset_parser.add_argument("--force", action="store_true", help="Reset without confirmation")
args = parser.parse_args()
try:
setup_config(config_path=args.config, env=args.env)
monitor = PipelineMonitor()
if not args.command:
parser.print_help()
monitor.print_status(detailed=True)
return 1
elif args.command == "clean":
checkpoint_info = monitor.count_checkpoint_files()
print(f"Current checkpoint storage: {checkpoint_info['total_size_mb']:.1f} MB")
if not args.force:
response = input("Are you sure you want to clean checkpoints? (y/N): ")
if response.lower() != "y":
print("Cancelled")
return 0
if args.step:
monitor.clean_step_checkpoints(args.step, args.keep_last)
else:
for step in monitor.steps:
monitor.clean_step_checkpoints(step, args.keep_last)
print("Checkpoint cleaning completed")
elif args.command == "reset":
if not args.force:
response = input(
f"Are you sure you want to reset {args.step}? This will delete all checkpoints. (y/N): "
)
if response.lower() != "y":
print("Cancelled")
return 0
if args.step:
monitor.reset_step(args.step)
else:
for step in monitor.steps:
monitor.reset_step(step)
print(f"Reset completed")
except Exception as e:
print(f"Monitoring failed: {e}")
traceback.print_exc()
return 1
if __name__ == "__main__":
sys.exit(main())
-499
View File
File diff suppressed because one or more lines are too long
-2957
View File
File diff suppressed because one or more lines are too long
-1852
View File
File diff suppressed because one or more lines are too long
-832
View File
File diff suppressed because one or more lines are too long
-107
View File
@@ -1,107 +0,0 @@
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Qualitative Analysis",
"id": "d20715dd63f57364"
},
{
"cell_type": "code",
"id": "c93a55c8",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T13:34:50.973298Z",
"start_time": "2025-09-21T13:34:50.969142Z"
}
},
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import sys\n",
"import os\n",
"\n",
"sys.path.append(os.path.abspath(\"..\"))\n",
"from core.utils.data_loader import DataLoader\n",
"from core.config.pipeline_config import PipelineConfig"
],
"outputs": [],
"execution_count": 3
},
{
"cell_type": "code",
"id": "c0b00261",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T13:34:51.002610Z",
"start_time": "2025-09-21T13:34:50.998586Z"
}
},
"source": [
"config = PipelineConfig(\n",
" paths={\n",
" \"root_dir\": \"../data\",\n",
" \"data_dir\": \"../data/dataset\",\n",
" \"models_dir\": \"../models\",\n",
" \"outputs_dir\": \"../data/processed\",\n",
" \"logs_dir\": \"../logs\",\n",
" \"configs_dir\": \"../configs\",\n",
" \"checkpoints_dir\": \"../checkpoints\"\n",
" }\n",
")\n",
"\n",
"loader = DataLoader(config)"
],
"outputs": [],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T13:35:27.430639Z",
"start_time": "2025-09-21T13:34:51.013412Z"
}
},
"cell_type": "code",
"outputs": [],
"execution_count": 5,
"source": [
"gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
"gdf_proj = gdf.to_crs(epsg=32732)\n",
"gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
"\n",
"df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
],
"id": "b38394ce38864379"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Exploration",
"id": "a1af5626d2a948d6"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
-107
View File
@@ -1,107 +0,0 @@
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Quantitative Analysis",
"id": "a605c0f92056a825"
},
{
"cell_type": "code",
"id": "c93a55c8",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T14:14:47.287549Z",
"start_time": "2025-09-21T14:14:47.279199Z"
}
},
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import sys\n",
"import os\n",
"\n",
"sys.path.append(os.path.abspath(\"..\"))\n",
"from core.utils.data_loader import DataLoader\n",
"from core.config.pipeline_config import PipelineConfig"
],
"outputs": [],
"execution_count": 30
},
{
"cell_type": "code",
"id": "c0b00261",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T14:14:47.315980Z",
"start_time": "2025-09-21T14:14:47.308376Z"
}
},
"source": [
"config = PipelineConfig(\n",
" paths={\n",
" \"root_dir\": \"../data\",\n",
" \"data_dir\": \"../data/dataset\",\n",
" \"models_dir\": \"../models\",\n",
" \"outputs_dir\": \"../data/processed\",\n",
" \"logs_dir\": \"../logs\",\n",
" \"configs_dir\": \"../configs\",\n",
" \"checkpoints_dir\": \"../checkpoints\"\n",
" }\n",
")\n",
"\n",
"loader = DataLoader(config)"
],
"outputs": [],
"execution_count": 31
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T14:15:47.899044Z",
"start_time": "2025-09-21T14:14:47.339266Z"
}
},
"cell_type": "code",
"source": [
"gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
"gdf_proj = gdf.to_crs(epsg=32732)\n",
"gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
"\n",
"df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
],
"id": "b38394ce38864379",
"outputs": [],
"execution_count": 32
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Exploration",
"id": "a1af5626d2a948d6"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+41
View File
@@ -0,0 +1,41 @@
[project]
name = "ners"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"geopandas>=1.1.1",
"joblib>=1.5.2",
"lightgbm>=4.6.0",
"matplotlib>=3.10.6",
"numpy>=2.3.3",
"ollama>=0.6.0",
"pandas>=2.3.3",
"plotly>=6.3.1",
"psutil>=7.1.0",
"pydantic>=2.11.10",
"pyyaml>=6.0.3",
"scikit-learn>=1.7.2",
"seaborn>=0.13.2",
"spacy>=3.8.7",
"streamlit>=1.50.0",
"tqdm>=4.67.1",
"typer>=0.19.2",
"tensorflow==2.20.0; sys_platform == 'linux' and platform_machine == 'x86_64'",
"xgboost>=3.0.5",
"networkx>=3.5",
]
[project.scripts]
ners = "ners.cli:app"
[build-system]
requires = ["uv_build>=0.8.12,<0.9.0"]
build-backend = "uv_build"
[dependency-groups]
dev = [
"ipykernel>=6.30.1",
"ruff>=0.13.3",
]
-170
View File
@@ -1,170 +0,0 @@
absl-py==2.3.0
altair==5.1.2
annotated-types==0.7.0
anyio==4.9.0
appnope==0.1.4
argon2-cffi==25.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==3.0.0
astunparse==1.6.3
async-lru==2.0.5
attrs==25.3.0
babel==2.17.0
beautifulsoup4==4.13.4
black==25.1.0
bleach==6.2.0
blinker==1.9.0
cachetools==6.1.0
certifi==2025.6.15
cffi==1.17.1
charset-normalizer==3.4.2
click==8.2.1
comm==0.2.2
contourpy==1.3.2
cycler==0.12.1
debugpy==1.8.14
decorator==5.2.1
defusedxml==0.7.1
executing==2.2.0
fastjsonschema==2.21.1
flake8==7.3.0
flatbuffers==25.2.10
fonttools==4.58.4
fqdn==1.5.1
gast==0.6.0
gitdb==4.0.12
GitPython==3.1.45
google-pasta==0.2.0
grpcio==1.73.0
h11==0.16.0
h5py==3.14.0
httpcore==1.0.9
httpx==0.28.1
idna==3.10
imbalanced-learn==0.13.0
ipykernel==6.29.5
ipython>=8.0,<9.0
ipython_pygments_lexers==1.1.1
isoduration==20.11.0
jedi==0.19.2
Jinja2==3.1.6
joblib==1.5.1
json5==0.12.0
jsonpointer==3.0.0
jsonschema==4.24.0
jsonschema-specifications==2025.4.1
jupyter-events==0.12.0
jupyter-lsp==2.2.5
jupyter_client==8.6.3
jupyter_core==5.8.1
jupyter_server==2.16.0
jupyter_server_terminals==0.5.3
jupyterlab==4.4.4
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
keras==3.10.0
kiwisolver==1.4.8
libclang==18.1.1
lightgbm~=4.6.0
Markdown==3.8.2
markdown-it-py==3.0.0
MarkupSafe==3.0.2
matplotlib==3.10.3
matplotlib-inline==0.1.7
mccabe==0.7.0
mdurl==0.1.2
mistune==3.1.3
ml-dtypes==0.3.2
mypy==1.17.0
mypy_extensions==1.1.0
namex==0.1.0
narwhals==2.0.1
nbclient==0.10.2
nbconvert==7.16.6
nbformat==5.10.4
nest-asyncio==1.6.0
nltk==3.9.1
notebook==7.4.4
notebook_shim==0.2.4
numpy==1.26.4
ollama~=0.5.1
opt_einsum==3.4.0
optree==0.16.0
overrides==7.7.0
packaging==25.0
pandas==2.3.0
pandocfilters==1.5.1
parso==0.8.4
pathspec==0.12.1
pexpect==4.9.0
pillow==11.2.1
platformdirs==4.3.8
plotly~=6.2.0
prometheus_client==0.22.1
prompt_toolkit==3.0.51
protobuf==4.25.8
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pyarrow==21.0.0
pycodestyle==2.14.0
pycparser==2.22
pydantic~=2.11.7
pydantic_core==2.33.2
pydeck==0.9.1
pyflakes==3.4.0
Pygments==2.19.1
pyparsing==3.2.3
python-dateutil==2.9.0.post0
python-json-logger==3.3.0
pytz==2025.2
PyYAML~=6.0.2
pyzmq==27.0.0
referencing==0.36.2
regex==2024.11.6
requests==2.32.4
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rich==14.0.0
rpds-py==0.26.0
scikit-learn~=1.6.1
scipy==1.15.3
seaborn==0.13.2
Send2Trash==1.8.3
six==1.17.0
sklearn-compat==0.1.3
smmap==5.0.2
sniffio==1.3.1
soupsieve==2.7
spacy~=3.8.7
stack-data==0.6.3
streamlit~=1.47.1
tenacity==9.1.2
tensorboard==2.16.2
tensorboard-data-server==0.7.2
tensorflow==2.16.2
tensorflow-io-gcs-filesystem==0.37.1
termcolor==3.1.0
terminado==0.18.1
threadpoolctl==3.6.0
tinycss2==1.4.0
toml==0.10.2
toolz==1.0.0
tornado==6.5.1
tqdm==4.67.1
traitlets==5.14.3
types-python-dateutil==2.9.0.20250516
types-PyYAML==6.0.12.20250516
typing-inspection==0.4.1
typing_extensions==4.14.0
tzdata==2025.2
uri-template==1.3.0
urllib3==2.5.0
wcwidth==0.2.13
webcolors==24.11.1
webencodings==0.5.1
websocket-client==1.8.0
Werkzeug==3.1.3
wrapt==1.17.2
xgboost~=3.0.3
-52
View File
@@ -1,52 +0,0 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from research.traditional_model import TraditionalModel
class SVMModel(TraditionalModel):
"""Support Vector Machine with character n-grams and RBF kernel"""
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# TF-IDF downweights very common patterns; char n-grams (2,4) are effective
# for distinguishing name morphology under RBF kernels.
vectorizer = TfidfVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 4)),
max_features=params.get("max_features", 5000),
)
# RBF kernel captures non-linear interactions between n-grams; probability=True
# adds calibration at some cost. Larger cache helps speed kernel computations.
classifier = SVC(
kernel=params.get("kernel", "rbf"),
C=params.get("C", 1.0),
gamma=params.get("gamma", "scale"),
probability=True, # Enable probability prediction
class_weight=params.get("class_weight", None),
cache_size=params.get("cache_size", 1000),
random_state=self.config.random_seed,
verbose=2,
)
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_features.append(X[feature_type.value].astype(str))
if len(text_features) == 1:
return text_features[0].values
else:
combined = text_features[0].astype(str)
for feature in text_features[1:]:
combined = combined + " " + feature.astype(str)
return combined.values
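The removed svm_model.py wraps a character n-gram TF-IDF vectorizer and an RBF-kernel SVC in a scikit-learn Pipeline. Here is a standalone sketch of the same construction; the toy names and labels are invented for illustration, and probability calibration is left off here even though the project's model enables it.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Invented toy examples; the real pipeline is trained on the DRC names dataset.
names = ["mwamba jean", "kalombo marie", "ilunga patrick", "mbuyi chantal",
         "tshibangu paul", "kahindo esperance"]
labels = ["m", "f", "m", "f", "m", "f"]

model = Pipeline([
    ("vectorizer", TfidfVectorizer(analyzer="char", ngram_range=(2, 4), max_features=5000)),
    ("classifier", SVC(kernel="rbf", C=1.0, gamma="scale", random_state=42)),
])
model.fit(names, labels)
print(model.predict(["kabeya solange"]))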
+3
View File
@@ -0,0 +1,3 @@
"""DRC NERS NLP package."""
__all__: list[str] = []
+225
View File
@@ -0,0 +1,225 @@
from __future__ import annotations
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional
import typer
from ners.core.config import setup_config, PipelineConfig
app = typer.Typer(help="DRC NERS command-line interface", no_args_is_help=True)
# -------------------------
# Pipeline commands
# -------------------------
pipeline_app = typer.Typer(help="Data processing pipeline")
app.add_typer(pipeline_app, name="pipeline")
@pipeline_app.command("run")
def pipeline_run(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
"""Run the full processing pipeline."""
from ners.main import run_pipeline as _run_pipeline
cfg = setup_config(config_path=config, env=env)
code = _run_pipeline(cfg)
raise typer.Exit(code)
# -------------------------
# NER commands
# -------------------------
ner_app = typer.Typer(help="NER dataset and model")
app.add_typer(ner_app, name="ner")
def _load_config(config: Optional[Path], env: str) -> PipelineConfig:
return setup_config(config_path=config, env=env)
@ner_app.command("feature")
def ner_feature(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
from ners.ner import feature as _feature
cfg = _load_config(config, env)
_feature(cfg)
@ner_app.command("build")
def ner_build(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
from ners.ner import build as _build
cfg = _load_config(config, env)
_build(cfg)
@ner_app.command("train")
def ner_train(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
from ners.ner import train as _train
cfg = _load_config(config, env)
_train(cfg)
@ner_app.command("run")
def ner_run(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
reset: bool = typer.Option(
False, help="Reset intermediate outputs and rerun all steps"
),
) -> None:
from ners.ner import run_pipeline as _ner_pipeline
cfg = _load_config(config, env)
code = _ner_pipeline(cfg, reset)
raise typer.Exit(code)
# -------------------------
# Research commands
# -------------------------
research_app = typer.Typer(help="Research experiments and training")
app.add_typer(research_app, name="research")
@research_app.command("train")
def research_train(
name: str = typer.Option(..., "--name", help="Model name to train"),
type: str = typer.Option(..., "--type", help="Experiment type"),
templates: str = typer.Option(
"research_templates.yaml", help="Templates file path"
),
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
from ners.research.experiment.experiment_builder import ExperimentBuilder
from ners.research.model_trainer import ModelTrainer
cfg = _load_config(config, env)
exp_builder = ExperimentBuilder(cfg)
tmpl = exp_builder.load_templates(templates)
exp_cfg = exp_builder.find_template(tmpl, name, type)
trainer = ModelTrainer(cfg)
trainer.train_single_model(
model_name=exp_cfg.get("name"),
model_type=exp_cfg.get("model_type"),
features=exp_cfg.get("features"),
model_params=exp_cfg.get("model_params", {}),
tags=exp_cfg.get("tags", []),
)
# -------------------------
# Monitor commands
# -------------------------
monitor_app = typer.Typer(help="Monitor pipeline checkpoints")
app.add_typer(monitor_app, name="monitor")
@monitor_app.command("status")
def monitor_status(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
detailed: bool = typer.Option(
False, help="Show detailed status (failed batch IDs)"
),
) -> None:
_ = _load_config(config, env)
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
PipelineMonitor().print_status(detailed=detailed)
@monitor_app.command("clean")
def monitor_clean(
step: Optional[str] = typer.Option(None, help="Step to clean; default all"),
keep_last: int = typer.Option(1, help="Number of latest checkpoint files to keep"),
force: bool = typer.Option(False, help="Do not ask for confirmation"),
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
_ = _load_config(config, env)
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
mon = PipelineMonitor()
if not force:
typer.confirm("Clean checkpoints?", abort=True)
if step:
mon.clean_step_checkpoints(step, keep_last)
else:
for s in mon.steps:
mon.clean_step_checkpoints(s, keep_last)
@monitor_app.command("reset")
def monitor_reset(
step: Optional[str] = typer.Option(None, help="Step to reset; default all"),
force: bool = typer.Option(False, help="Do not ask for confirmation"),
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
_ = _load_config(config, env)
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
mon = PipelineMonitor()
if not force:
msg = f"Reset {step or 'all steps'}? This deletes checkpoints."
typer.confirm(msg, abort=True)
if step:
mon.reset_step(step)
else:
for s in mon.steps:
mon.reset_step(s)
# -------------------------
# Web commands
# -------------------------
web_app = typer.Typer(help="Web UI wrapper")
app.add_typer(web_app, name="web")
@web_app.command("run")
def web_run(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
app_path = Path(__file__).parent / "web" / "app.py"
cmd = [
sys.executable,
"-m",
"streamlit",
"run",
str(app_path),
]
# Pass configuration via environment variables to avoid argparse in Streamlit
env_vars = os.environ.copy()
if config is not None:
env_vars["NERS_CONFIG"] = str(config)
env_vars["NERS_ENV"] = env
raise typer.Exit(subprocess.call(cmd, env=env_vars))
if __name__ == "__main__": # pragma: no cover
app()
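The Typer application above groups the pipeline, ner, research, monitor and web commands under a single `ners` entry point. A small sketch of exercising it in-process with Typer's test runner, assuming the package is installed so that `ners.cli` is importable:

from typer.testing import CliRunner

from ners.cli import app  # assumes the ners package is installed

runner = CliRunner()
# Equivalent to running `ners monitor status --env development` from a shell
result = runner.invoke(app, ["monitor", "status", "--env", "development"])
print(result.exit_code)
print(result.output)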
@@ -2,10 +2,9 @@ import logging
from pathlib import Path
from typing import Optional, Union
from core.utils import ensure_directories
from .config_manager import ConfigManager
from .logging_config import LoggingConfig
from .pipeline_config import PipelineConfig
from ners.core.utils import ensure_directories
from ners.core.config.config_manager import ConfigManager
from ners.core.config.pipeline_config import PipelineConfig
config_manager = ConfigManager()
@@ -22,7 +21,9 @@ def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig
return config_manager.get_config()
def setup_config(config_path: Optional[Path] = None, env: str = "development") -> PipelineConfig:
def setup_config(
config_path: Optional[Path] = None, env: str = "development"
) -> PipelineConfig:
"""
Unified configuration loading and logging setup for all entrypoint scripts.
@@ -5,8 +5,8 @@ from typing import Optional, Union, Dict, Any
import yaml
from core.config.pipeline_config import PipelineConfig
from core.config.project_paths import ProjectPaths
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.config.project_paths import ProjectPaths
class ConfigManager:
@@ -36,7 +36,7 @@ class ConfigManager:
def _setup_default_paths(self):
"""Setup default project paths"""
root_dir = Path(__file__).parent.parent.parent
root_dir = Path(__file__).parent.parent.parent.parent.parent
self.default_paths = ProjectPaths(
root_dir=root_dir,
configs_dir=root_dir / "config",
@@ -53,7 +53,9 @@ class ConfigManager:
self.config_path = config_path
if not self.config_path.exists():
logging.warning(f"Config file not found: {self.config_path}. Using defaults.")
logging.warning(
f"Config file not found: {self.config_path}. Using defaults."
)
return self._create_default_config()
try:
@@ -122,7 +124,11 @@ class ConfigManager:
def _deep_update(self, base_dict: Dict, update_dict: Dict):
"""Recursively update nested dictionaries"""
for key, value in update_dict.items():
if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
if (
key in base_dict
and isinstance(base_dict[key], dict)
and isinstance(value, dict)
):
self._deep_update(base_dict[key], value)
else:
base_dict[key] = value
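The `_deep_update` helper above merges an override mapping into the default configuration, recursing only when both sides are dictionaries. A self-contained sketch of the same merge rule, using the max_dataset_size override from the development config as the example:

def deep_update(base: dict, update: dict) -> dict:
    """Recursively merge `update` into `base`, replacing non-dict values."""
    for key, value in update.items():
        if key in base and isinstance(base[key], dict) and isinstance(value, dict):
            deep_update(base[key], value)
        else:
            base[key] = value
    return base

defaults = {"data": {"max_dataset_size": 100_000, "balance_by_sex": True}, "name": "ners"}
overrides = {"data": {"max_dataset_size": 10_000}}
print(deep_update(defaults, overrides))
# {'data': {'max_dataset_size': 10000, 'balance_by_sex': True}, 'name': 'ners'}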
@@ -1,10 +1,10 @@
from pydantic import BaseModel
from core.config.annotation_config import AnnotationConfig
from core.config.data_config import DataConfig
from core.config.logging_config import LoggingConfig
from core.config.processing_config import ProcessingConfig
from core.config.project_paths import ProjectPaths
from ners.core.config.annotation_config import AnnotationConfig
from ners.core.config.data_config import DataConfig
from ners.core.config.logging_config import LoggingConfig
from ners.core.config.processing_config import ProcessingConfig
from ners.core.config.project_paths import ProjectPaths
class PipelineConfig(BaseModel):
@@ -10,6 +10,8 @@ class ProcessingConfig(BaseModel):
max_workers: int = 4
checkpoint_interval: int = 5
use_multiprocessing: bool = False
encoding_options: list = field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
encoding_options: list = field(
default_factory=lambda: ["utf-8", "utf-16", "latin1"]
)
chunk_size: int = 100_000
epochs: int = 2
@@ -4,13 +4,13 @@ from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from core.config import PipelineConfig
from ners.core.config import PipelineConfig
@contextmanager
def temporary_config_override(**overrides):
"""Context manager for temporarily overriding configuration"""
from core.config import get_config
from ners.core.config import get_config
config = get_config()
original_values = {}
@@ -5,7 +5,7 @@ from typing import Optional, Union, Iterator, Dict
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from ners.core.config.pipeline_config import PipelineConfig
OPTIMIZED_DTYPES = {
# Numeric columns with appropriate bit-width
@@ -113,7 +113,9 @@ class DataLoader:
sex_values = df["sex"].dropna().unique()
if len(sex_values) == 0:
logging.warning(f"No valid values found in sex column 'sex', using random sampling")
logging.warning(
"No valid values found in sex column 'sex', using random sampling"
)
return df.sample(n=max_size, random_state=self.config.data.random_seed)
# Calculate samples per sex category
@@ -140,18 +142,22 @@ class DataLoader:
logging.info(f"Sampled {current_samples} records for sex '{sex}'")
if not balanced_samples:
logging.warning("No balanced samples could be created, using random sampling")
logging.warning(
"No balanced samples could be created, using random sampling"
)
return df.sample(n=max_size, random_state=self.config.data.random_seed)
# Create result using iloc with indices (no copying until final step)
result = df.iloc[balanced_samples].copy()
# Shuffle the final result
result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
drop=True
)
result = result.sample(
frac=1, random_state=self.config.data.random_seed
).reset_index(drop=True)
logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total")
logging.info(
f"Created balanced dataset with {len(result)} records from {len(df)} total"
)
return result
@classmethod
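The DataLoader caps the dataset at `max_dataset_size` and, when `balance_by_sex` is set, samples an even number of records per sex before shuffling, falling back to plain random sampling when the sex column has no usable values. A simplified pandas sketch of that behaviour; the real implementation collects index positions to avoid intermediate copies.

import pandas as pd

def balanced_sample(df: pd.DataFrame, max_size: int, seed: int = 42) -> pd.DataFrame:
    valid = df["sex"].dropna().unique()
    if len(valid) == 0:
        # No usable sex values: fall back to plain random sampling
        return df.sample(n=min(max_size, len(df)), random_state=seed)
    per_sex = max_size // len(valid)
    parts = [
        df[df["sex"] == s].sample(n=min(per_sex, int((df["sex"] == s).sum())), random_state=seed)
        for s in valid
    ]
    out = pd.concat(parts)
    # Shuffle the combined sample so classes are interleaved
    return out.sample(frac=1, random_state=seed).reset_index(drop=True)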
@@ -1,4 +1,4 @@
from core.config.pipeline_config import PipelineConfig
from ners.core.config.pipeline_config import PipelineConfig
class PromptManager:
@@ -19,9 +19,15 @@ class RegionMapper:
return (
series.str.upper()
.str.strip()
.apply(lambda x: unicodedata.normalize("NFKD", x)
.apply(
lambda x: (
unicodedata.normalize("NFKD", x)
.encode("ascii", errors="ignore")
.decode("utf-8") if isinstance(x, str) else x)
.decode("utf-8")
if isinstance(x, str)
else x
)
)
)
@staticmethod
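The RegionMapper normalises province labels by upper-casing, trimming whitespace and stripping diacritics via NFKD decomposition before mapping them to regions. The same normalisation applied to a single string, with an illustrative example value:

import unicodedata

def normalize_region(value: str) -> str:
    value = value.upper().strip()
    return (
        unicodedata.normalize("NFKD", value)
        .encode("ascii", errors="ignore")
        .decode("utf-8")
    )

print(normalize_region("  kasaï oriental "))  # -> "KASAI ORIENTAL"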
@@ -2,7 +2,7 @@ import json
import logging
from typing import Dict, Any
from core.config.pipeline_config import PipelineConfig
from ners.core.config.pipeline_config import PipelineConfig
class StateManager:
+11 -41
View File
@@ -1,21 +1,17 @@
#!.venv/bin/python3
import argparse
import logging
import sys
import traceback
from core.config import setup_config
from core.utils.data_loader import DataLoader
from processing.batch.batch_config import BatchConfig
from processing.pipeline import Pipeline
from processing.steps.data_cleaning_step import DataCleaningStep
from processing.steps.data_selection_step import DataSelectionStep
from processing.steps.data_splitting_step import DataSplittingStep
from processing.steps.feature_extraction_step import FeatureExtractionStep
from ners.core.utils.data_loader import DataLoader
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.pipeline import Pipeline
from ners.processing.steps.data_cleaning_step import DataCleaningStep
from ners.processing.steps.data_selection_step import DataSelectionStep
from ners.processing.steps.data_splitting_step import DataSplittingStep
from ners.processing.steps.llm_annotation_step import LLMAnnotationStep
from ners.processing.steps.ner_annotation_step import NERAnnotationStep
from ners.processing.steps.feature_extraction_step import FeatureExtractionStep
def create_pipeline(config) -> Pipeline:
"""Create pipeline from configuration"""
batch_config = BatchConfig(
batch_size=config.processing.batch_size,
max_workers=config.processing.max_workers,
@@ -23,14 +19,13 @@ def create_pipeline(config) -> Pipeline:
use_multiprocessing=config.processing.use_multiprocessing,
)
# Add steps based on configuration
pipeline = Pipeline(batch_config)
steps = [
DataCleaningStep(config),
FeatureExtractionStep(config),
DataSelectionStep(config),
# NERAnnotationStep(config),
# LLMAnnotationStep(config),
NERAnnotationStep(config),
LLMAnnotationStep(config),
]
for stage in config.stages:
@@ -42,7 +37,6 @@ def create_pipeline(config) -> Pipeline:
def run_pipeline(config) -> int:
"""Run the complete pipeline"""
try:
logging.info(f"Starting pipeline: {config.name} v{config.version}")
@@ -79,27 +73,3 @@ def run_pipeline(config) -> int:
except Exception as e:
logging.error(f"Pipeline failed: {e}", exc_info=True)
return 1
def main():
"""Main entry point with unified configuration loading"""
parser = argparse.ArgumentParser(
description="DRC NERS Processing Pipeline",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument("--config", type=str, help="Path to configuration file")
parser.add_argument("--env", type=str, default="development", help="Environment name")
args = parser.parse_args()
try:
config = setup_config(config_path=args.config, env=args.env)
return run_pipeline(config)
except Exception as e:
print(f"Pipeline failed: {e}")
traceback.print_exc()
return 1
if __name__ == "__main__":
sys.exit(main())
+14
View File
@@ -0,0 +1,14 @@
#!.venv/bin/python3
from ners.processing.monitoring.pipeline_monitor import PipelineMonitor
def status(*, detailed: bool = False) -> None:
PipelineMonitor().print_status(detailed=detailed)
def clean_step(step: str, *, keep_last: int = 1) -> None:
PipelineMonitor().clean_step_checkpoints(step, keep_last)
def reset_step(step: str) -> None:
PipelineMonitor().reset_step(step)
+10 -25
View File
@@ -1,29 +1,24 @@
#!/usr/bin/env python3
import argparse
import logging
import os
import sys
import traceback
from pathlib import Path
from core.config import setup_config, PipelineConfig
from processing.ner.name_builder import NameBuilder
from processing.ner.name_engineering import NameEngineering
from processing.ner.name_model import NameModel
from ners.core.config import PipelineConfig
from ners.processing.ner.name_builder import NameBuilder
from ners.processing.ner.name_engineering import NameEngineering
from ners.processing.ner.name_model import NameModel
def feature(config: PipelineConfig):
"""Apply feature engineering to create position-independent NER dataset."""
NameEngineering(config).compute()
def build(config: PipelineConfig):
"""Build NER dataset using NERDataBuilder."""
NameBuilder(config).build()
def train(config: PipelineConfig):
"""Train the NER model."""
name_model = NameModel(config)
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
@@ -37,7 +32,9 @@ def train(config: PipelineConfig):
split_idx = int(len(data) * 0.9)
train_data, eval_data = data[:split_idx], data[split_idx:]
logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
logging.info(
f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}"
)
name_model.train(
data=train_data,
epochs=config.processing.epochs,
@@ -75,21 +72,9 @@ def run_pipeline(config: PipelineConfig, reset: bool = False):
def main():
parser = argparse.ArgumentParser(description="NER model management for DRC names")
parser.add_argument("--config", type=str, help="Path to configuration file")
parser.add_argument("--env", type=str, default="development", help="Environment name")
parser.add_argument("--reset", action="store_true", help="Reset all steps")
args = parser.parse_args()
try:
config = setup_config(config_path=args.config, env=args.env)
return run_pipeline(config, args.reset)
except Exception as e:
print(f"Pipeline failed: {e}")
logging.error("This module is no longer a CLI. Use 'ners ner ...' instead.")
return 1
except Exception:
traceback.print_exc()
return 1
if __name__ == "__main__":
sys.exit(main())
@@ -8,4 +8,6 @@ class BatchConfig:
batch_size: int = 1000
max_workers: int = 4
checkpoint_interval: int = 5 # Save checkpoint every N batches
use_multiprocessing: bool = False # Use ProcessPoolExecutor instead of ThreadPoolExecutor
use_multiprocessing: bool = (
False # Use ProcessPoolExecutor instead of ThreadPoolExecutor
)
@@ -4,9 +4,9 @@ from typing import Iterator
import pandas as pd
from processing.batch.batch_config import BatchConfig
from processing.batch.memory_monitor import MemoryMonitor
from processing.steps import PipelineStep
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.batch.memory_monitor import MemoryMonitor
from ners.processing.steps import PipelineStep
class BatchProcessor:
@@ -33,7 +33,9 @@ class BatchProcessor:
for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
if step.batch_exists(batch_id):
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
logging.info(
f"Batch {batch_id} already processed, loading from checkpoint"
)
processed_batch = step.load_batch(batch_id)
else:
try:
@@ -80,7 +82,9 @@ class BatchProcessor:
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Memory-optimized concurrent processing"""
executor_class = (
ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
ProcessPoolExecutor
if self.config.use_multiprocessing
else ThreadPoolExecutor
)
results = {}
@@ -89,7 +93,9 @@ class BatchProcessor:
future_to_batch = {}
for batch, batch_id in self.create_batches(df):
if step.batch_exists(batch_id):
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
logging.info(
f"Batch {batch_id} already processed, loading from checkpoint"
)
results[batch_id] = step.load_batch(batch_id)
else:
# Only copy if necessary for concurrent processing
@@ -121,7 +127,9 @@ class BatchProcessor:
del results
self.memory_monitor.cleanup_memory()
result = self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
result = (
self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
)
# Final cleanup
del ordered_results
@@ -131,7 +139,9 @@ class BatchProcessor:
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Process data using the configured strategy"""
step.state.total_batches = (len(df) + self.config.batch_size - 1) // self.config.batch_size
step.state.total_batches = (
len(df) + self.config.batch_size - 1
) // self.config.batch_size
step.load_state()
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
@@ -4,8 +4,8 @@ import shutil
from datetime import datetime
from typing import Optional, Dict
from core.config.config_manager import ConfigManager
from core.config.project_paths import ProjectPaths
from ners.core.config.config_manager import ConfigManager
from ners.core.config.project_paths import ProjectPaths
class PipelineMonitor:
@@ -97,7 +97,10 @@ class PipelineMonitor:
avg_completion = total_completion / len(self.steps)
if avg_completion >= 100 and overall_status not in ["error", "completed_with_errors"]:
if avg_completion >= 100 and overall_status not in [
"error",
"completed_with_errors",
]:
overall_status = "completed"
return {
@@ -121,7 +124,9 @@ class PipelineMonitor:
print(f"{step_name.replace('_', ' ').title()}:")
print(f" Status: {step_status['status']}")
print(f" Progress: {step_status['completion_percentage']:.1f}%")
print(f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}")
print(
f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}"
)
if step_status["failed_batches"] > 0:
print(f" Failed Batches: {step_status['failed_batches']}")
@@ -141,7 +146,10 @@ class PipelineMonitor:
if step_dir.exists():
csv_files = list(step_dir.glob("*.csv"))
step_size = sum(f.stat().st_size for f in csv_files)
counts[step] = {"files": len(csv_files), "size_mb": step_size / (1024 * 1024)}
counts[step] = {
"files": len(csv_files),
"size_mb": step_size / (1024 * 1024),
}
total_size += step_size
else:
counts[step] = {"files": 0, "size_mb": 0}
@@ -160,7 +168,9 @@ class PipelineMonitor:
csv_files = sorted(step_dir.glob("batch_*.csv"))
if len(csv_files) <= keep_last:
logging.info(f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all")
logging.info(
f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all"
)
return
files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files
@@ -3,7 +3,7 @@ from typing import List, Tuple, Dict
import pandas as pd
from processing.steps.feature_extraction_step import NameCategory
from ners.processing.steps.feature_extraction_step import NameCategory
class BaseNameFormatter(ABC):
@@ -12,7 +12,9 @@ class BaseNameFormatter(ABC):
Contains common logic for NER tagging and attribute computation.
"""
def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None):
def __init__(
self, connectors: List[str] = None, additional_surnames: List[str] = None
):
self.connectors = connectors or ["wa", "ya", "ka", "ba"]
self.additional_surnames = additional_surnames or [
"jean",
@@ -46,7 +48,9 @@ class BaseNameFormatter(ABC):
end_pos = current_pos + len(word)
# Determine tag based on word content
if word in native_parts or any(connector in word for connector in self.connectors):
if word in native_parts or any(
connector in word for connector in self.connectors
):
tag = "NATIVE"
elif word == surname or word in self.additional_surnames:
tag = "SURNAME"
@@ -72,7 +76,9 @@ class BaseNameFormatter(ABC):
"words": words_count,
"length": length,
"identified_category": (
NameCategory.SIMPLE.value if words_count == 3 else NameCategory.COMPOSE.value
NameCategory.SIMPLE.value
if words_count == 3
else NameCategory.COMPOSE.value
),
}
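The base formatter walks the words of the full name and assigns character-level spans: NATIVE when a word is one of the native components or contains a connector such as "wa" or "ya", SURNAME when it matches the recorded surname or one of the additional surnames; three-word names are classed SIMPLE, anything else COMPOSE. A compact sketch of the span tagging; the fallback "O" label and the shortened surname list are assumptions, not the project's exact behaviour.

from typing import List, Tuple

CONNECTORS = ["wa", "ya", "ka", "ba"]
ADDITIONAL_SURNAMES = ["jean"]  # the real list is longer

def create_ner_tags(full_name: str, native_parts: List[str], surname: str) -> List[Tuple[int, int, str]]:
    tags, pos = [], 0
    for word in full_name.split():
        start = full_name.index(word, pos)
        end = start + len(word)
        if word in native_parts or any(c in word for c in CONNECTORS):
            tag = "NATIVE"
        elif word == surname or word in ADDITIONAL_SURNAMES:
            tag = "SURNAME"
        else:
            tag = "O"  # hypothetical default tag for illustration
        tags.append((start, end, tag))
        pos = end
    return tags

print(create_ner_tags("mwamba wa kalombo jean", ["mwamba", "kalombo"], "jean"))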
@@ -3,7 +3,7 @@ from typing import Dict
import pandas as pd
from processing.ner.formats import BaseNameFormatter
from ners.processing.ner.formats import BaseNameFormatter
class ConnectorFormatter(BaseNameFormatter):
@@ -3,13 +3,15 @@ from typing import Dict
import pandas as pd
from processing.ner.formats import BaseNameFormatter
from ners.processing.ner.formats import BaseNameFormatter
class ExtendedSurnameFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row["probable_native"])
original_surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
original_surname = (
row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
)
# Add random additional surname
additional_surname = random.choice(self.additional_surnames)
@@ -22,7 +24,9 @@ class ExtendedSurnameFormatter(BaseNameFormatter):
"identified_name": row["probable_native"],
"probable_surname": combined_surname,
"identified_surname": combined_surname,
"ner_entities": str(self.create_ner_tags(full_name, native_parts, combined_surname)),
"ner_entities": str(
self.create_ner_tags(full_name, native_parts, combined_surname)
),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@@ -2,7 +2,7 @@ from typing import Dict
import pandas as pd
from processing.ner.formats import BaseNameFormatter
from ners.processing.ner.formats import BaseNameFormatter
class NativeOnlyFormatter(BaseNameFormatter):
@@ -2,7 +2,7 @@ from typing import Dict
import pandas as pd
from processing.ner.formats import BaseNameFormatter
from ners.processing.ner.formats import BaseNameFormatter
class OriginalFormatter(BaseNameFormatter):
@@ -2,7 +2,7 @@ from typing import Dict
import pandas as pd
from processing.ner.formats import BaseNameFormatter
from ners.processing.ner.formats import BaseNameFormatter
class PositionFlippedFormatter(BaseNameFormatter):
@@ -2,7 +2,7 @@ from typing import Dict
import pandas as pd
from processing.ner.formats import BaseNameFormatter
from ners.processing.ner.formats import BaseNameFormatter
class ReducedNativeFormatter(BaseNameFormatter):
@@ -11,7 +11,9 @@ class ReducedNativeFormatter(BaseNameFormatter):
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
# Keep only first native component + surname
reduced_native = native_parts[0] if len(native_parts) > 1 else row["probable_native"]
reduced_native = (
native_parts[0] if len(native_parts) > 1 else row["probable_native"]
)
full_name = f"{reduced_native} {surname}".strip()
return {
@@ -20,7 +22,9 @@ class ReducedNativeFormatter(BaseNameFormatter):
"identified_name": reduced_native,
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(self.create_ner_tags(full_name, [reduced_native], surname)),
"ner_entities": str(
self.create_ner_tags(full_name, [reduced_native], surname)
),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@@ -4,8 +4,8 @@ import logging
import spacy
from spacy.tokens import DocBin
from core.config import PipelineConfig
from core.utils.data_loader import DataLoader
from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from .name_tagger import NameTagger
@@ -20,7 +20,9 @@ class NameBuilder:
self.tagger = NameTagger()
def build(self) -> int:
filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
filepath = self.config.paths.get_data_path(
self.config.data.output_files["engineered"]
)
df = self.data_loader.load_csv_complete(filepath)
df = df[["name", "ner_tagged", "ner_entities"]]
@@ -38,7 +40,9 @@ class NameBuilder:
# Use NERNameTagger for parsing and validation
parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
validated_entities = self.tagger.validate_entities(ner_df["name"], parsed_entities)
validated_entities = self.tagger.validate_entities(
ner_df["name"], parsed_entities
)
# Drop rows with no valid entities
mask = validated_entities.map(bool)
@@ -51,22 +55,33 @@ class NameBuilder:
# Prepare training data
training_data = list(
zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
zip(
ner_df["name"].tolist(),
[{"entities": ents} for ents in validated_entities],
)
)
# Use NERNameTagger to create spaCy DocBin
docs = self.tagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
docs = self.tagger.create_docs(
nlp, ner_df["name"].tolist(), validated_entities.tolist()
)
doc_bin = DocBin(docs=docs)
# Save
json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
json_path = self.config.paths.get_data_path(
self.config.data.output_files["ner_data"]
)
spacy_path = self.config.paths.get_data_path(
self.config.data.output_files["ner_spacy"]
)
with open(json_path, "w", encoding="utf-8") as f:
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
doc_bin.to_disk(spacy_path)
logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
logging.info(
f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}"
)
logging.info(f"Saved NER JSON to {json_path}")
logging.info(f"Saved NER spacy to {spacy_path}")
return 0
@@ -6,14 +6,14 @@ import numpy as np
import pandas as pd
from tqdm import tqdm
from core.config import PipelineConfig
from core.utils.data_loader import DataLoader
from processing.ner.formats.connectors_format import ConnectorFormatter
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
from processing.ner.formats.native_only_format import NativeOnlyFormatter
from processing.ner.formats.original_format import OriginalFormatter
from processing.ner.formats.position_flipped_format import PositionFlippedFormatter
from processing.ner.formats.reduced_native_format import ReducedNativeFormatter
from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.ner.formats.connectors_format import ConnectorFormatter
from ners.processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
from ners.processing.ner.formats.native_only_format import NativeOnlyFormatter
from ners.processing.ner.formats.original_format import OriginalFormatter
from ners.processing.ner.formats.position_flipped_format import PositionFlippedFormatter
from ners.processing.ner.formats.reduced_native_format import ReducedNativeFormatter
class NameEngineering:
@@ -44,42 +44,60 @@ class NameEngineering:
# Initialize format classes
self.formatters = {
"original": OriginalFormatter(self.connectors, self.additional_surnames),
"native_only": NativeOnlyFormatter(self.connectors, self.additional_surnames),
"position_flipped": PositionFlippedFormatter(self.connectors, self.additional_surnames),
"reduced_native": ReducedNativeFormatter(self.connectors, self.additional_surnames),
"connector_added": ConnectorFormatter(self.connectors, self.additional_surnames),
"extended_surname": ExtendedSurnameFormatter(self.connectors, self.additional_surnames),
"native_only": NativeOnlyFormatter(
self.connectors, self.additional_surnames
),
"position_flipped": PositionFlippedFormatter(
self.connectors, self.additional_surnames
),
"reduced_native": ReducedNativeFormatter(
self.connectors, self.additional_surnames
),
"connector_added": ConnectorFormatter(
self.connectors, self.additional_surnames
),
"extended_surname": ExtendedSurnameFormatter(
self.connectors, self.additional_surnames
),
}
def load_data(self) -> pd.DataFrame:
"""Load and filter NER-tagged data from CSV file"""
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
df = self.data_loader.load_csv_complete(filepath)
# Filter only NER-tagged rows
ner_data = df[df["ner_tagged"] == 1].copy()
logging.info(f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records")
logging.info(
f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records"
)
return ner_data
def compute(self) -> None:
logging.info("Applying feature engineering transformations...")
input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
input_filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
output_filepath = self.config.paths.get_data_path(
self.config.data.output_files["engineered"]
)
df = self.data_loader.load_csv_complete(input_filepath)
ner_df = df[df["ner_tagged"] == 1].copy()
logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records")
logging.info(
f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records"
)
del df # No need to keep in memory
gc.collect()
ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
drop=True
)
ner_df = ner_df.sample(
frac=1, random_state=self.config.data.random_seed
).reset_index(drop=True)
total_rows = len(ner_df)
# Calculate split points
@@ -94,7 +112,11 @@ class NameEngineering:
(0, split_25_1, "original"), # First 25%: original format
(split_25_1, split_25_2, "native_only"), # Second 25%: remove surname
(split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions
(split_25_3, split_10_1, "reduced_native"), # Fourth 10%: reduce native components
(
split_25_3,
split_10_1,
"reduced_native",
), # Fourth 10%: reduce native components
(split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors
(split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames
]
@@ -11,7 +11,7 @@ from spacy.training import Example
from spacy.util import minibatch
from tqdm import tqdm
from core.config.pipeline_config import PipelineConfig
from ners.core.config.pipeline_config import PipelineConfig
class NameModel:
@@ -29,6 +29,15 @@ class NameModel:
"""Create a blank spaCy model with NER pipeline"""
logging.info(f"Creating blank {language} model for NER training")
# Prefer GPU for spaCy if available (falls back to CPU automatically)
try:
if spacy.prefer_gpu():
logging.info("spaCy GPU enabled (cupy) for NER training")
else:
logging.info("spaCy running on CPU")
except Exception as e:
logging.debug(f"spaCy GPU selection skipped: {e}")
# Create blank model - French tokenizer works well for DRC names
self.nlp = spacy.blank(language)
@@ -78,7 +87,9 @@ class NameModel:
# Handle different annotation formats from NERNameTagger
if not isinstance(annotations, dict) or "entities" not in annotations:
logging.warning(f"Skipping invalid annotations at index {i}: {annotations}")
logging.warning(
f"Skipping invalid annotations at index {i}: {annotations}"
)
skipped_count += 1
continue
@@ -115,7 +126,9 @@ class NameModel:
valid_entities = []
for entity in entities:
if not isinstance(entity, (list, tuple)) or len(entity) != 3:
logging.warning(f"Skipping invalid entity format in '{text}': {entity}")
logging.warning(
f"Skipping invalid entity format in '{text}': {entity}"
)
continue
start, end, label = entity
@@ -129,21 +142,30 @@ class NameModel:
or start < 0
or end > len(text)
):
logging.warning(f"Skipping invalid entity bounds in '{text}': {entity}")
logging.warning(
f"Skipping invalid entity bounds in '{text}': {entity}"
)
continue
# Check for overlaps with already validated entities
has_overlap = any(
start < v_end and end > v_start for v_start, v_end, _ in valid_entities
start < v_end and end > v_start
for v_start, v_end, _ in valid_entities
)
if has_overlap:
logging.warning(f"Skipping overlapping entity in '{text}': {entity}")
logging.warning(
f"Skipping overlapping entity in '{text}': {entity}"
)
continue
# Validate that the span doesn't contain spaces (matching tagger validation)
span_text = text[start:end]
if not span_text or span_text != span_text.strip() or " " in span_text:
if (
not span_text
or span_text != span_text.strip()
or " " in span_text
):
logging.warning(
f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
)
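
The overlap test in this hunk treats each entity as a half-open interval [start, end): two spans collide exactly when each one starts before the other ends. A small self-contained check using the same predicate (the example spans are invented):

def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool:
    """True if half-open intervals [a0, a1) and [b0, b1) intersect."""
    return a[0] < b[1] and a[1] > b[0]

# A span tagged at 0-7 collides with a candidate at 3-10,
# but not with one starting exactly where it ends.
assert overlaps((0, 7), (3, 10)) is True
assert overlaps((0, 7), (7, 12)) is False
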
@@ -152,7 +174,9 @@ class NameModel:
valid_entities.append((start, end, label))
if not valid_entities:
logging.warning(f"Skipping training example with no valid entities: '{text}'")
logging.warning(
f"Skipping training example with no valid entities: '{text}'"
)
skipped_count += 1
continue
@@ -210,7 +234,9 @@ class NameModel:
batches = minibatch(examples, size=batch_size)
for batch in batches:
batch_losses = {}
self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
self.nlp.update(
batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer
)
logging.info(
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
)
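
For orientation, a minimal sketch of the spaCy training pattern this hunk reformats: blank pipeline, Example objects, minibatch updates. The names, entity labels, and hyperparameters below are illustrative, not taken from the project's data:

import spacy
from spacy.training import Example
from spacy.util import minibatch

nlp = spacy.blank("fr")
ner = nlp.add_pipe("ner")
ner.add_label("NATIVE")
ner.add_label("SURNAME")

# (text, {"entities": [(start, end, label)]}) pairs; the data here is invented
train = [
    ("Mukendi Kabongo", {"entities": [(0, 7, "NATIVE"), (8, 15, "SURNAME")]}),
    ("Tshilombo Ilunga", {"entities": [(0, 9, "NATIVE"), (10, 16, "SURNAME")]}),
]
examples = [Example.from_dict(nlp.make_doc(text), ann) for text, ann in train]

optimizer = nlp.initialize(lambda: examples)
for epoch in range(5):
    losses = {}
    for batch in minibatch(examples, size=2):
        nlp.update(batch, drop=0.3, losses=losses, sgd=optimizer)
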
@@ -221,7 +247,7 @@ class NameModel:
del batches # free memory
losses_history.append(losses.get("ner", 0))
logging.info(f"Epoch {epoch+1}/{epochs}, Total Loss: {losses['ner']:.4f}")
logging.info(f"Epoch {epoch + 1}/{epochs}, Total Loss: {losses['ner']:.4f}")
# Store training statistics
self.training_stats = {
@@ -233,7 +259,9 @@ class NameModel:
"dropout_rate": dropout_rate,
}
logging.info(f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}")
logging.info(
f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}"
)
def evaluate(self, test_data: List[Tuple[str, Dict]]) -> Dict[str, Any]:
"""Evaluate the trained model on test data"""
@@ -282,10 +310,14 @@ class NameModel:
entity_stats[label]["fp"] += 1
# Calculate overall metrics
precision = correct_entities / predicted_entities if predicted_entities > 0 else 0
precision = (
correct_entities / predicted_entities if predicted_entities > 0 else 0
)
recall = correct_entities / actual_entities if actual_entities > 0 else 0
f1_score = (
2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
2 * (precision * recall) / (precision + recall)
if (precision + recall) > 0
else 0
)
# Calculate per-label metrics
@@ -295,7 +327,11 @@ class NameModel:
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
label_f1 = (
(2 * (label_precision * label_recall) / (label_precision + label_recall))
(
2
* (label_precision * label_recall)
/ (label_precision + label_recall)
)
if (label_precision + label_recall) > 0
else 0
)
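
Both metric hunks compute standard entity-level precision, recall and F1 with guards against division by zero. A worked example with invented counts shows the formulas in action:

correct_entities, predicted_entities, actual_entities = 8, 10, 12

precision = correct_entities / predicted_entities if predicted_entities > 0 else 0  # 0.80
recall = correct_entities / actual_entities if actual_entities > 0 else 0           # 0.666...
f1_score = (
    2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
)
print(round(precision, 3), round(recall, 3), round(f1_score, 3))  # 0.8 0.667 0.727
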
@@ -385,7 +421,9 @@ class NameModel:
"label": ent.label_,
"start": ent.start_char,
"end": ent.end_char,
"confidence": getattr(ent, "score", None), # If confidence scores are available
"confidence": getattr(
ent, "score", None
), # If confidence scores are available
}
)
@@ -48,7 +48,9 @@ class NameTagger:
# Find the first occurrence of this native word that doesn't overlap
start_pos = 0
while True:
pos = name_lower.find(native_word_lower, start_pos) # Case-insensitive search
pos = name_lower.find(
native_word_lower, start_pos
) # Case-insensitive search
if pos == -1:
break
@@ -78,7 +80,9 @@ class NameTagger:
# Find the first occurrence that doesn't overlap
start_pos = 0
while True:
pos = name_lower.find(surname_lower, start_pos) # Case-insensitive search
pos = name_lower.find(
surname_lower, start_pos
) # Case-insensitive search
if pos == -1:
break
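
Both search hunks walk successive case-insensitive matches of a word and skip candidates that would overlap spans already claimed. A compact sketch of that scan (the helper name, the sample name, and the taken span are all illustrative):

def find_non_overlapping(name: str, word: str, taken: list[tuple[int, int]]) -> int:
    """Start of the first case-insensitive match of word that avoids taken spans, else -1."""
    name_lower, word_lower = name.lower(), word.lower()
    start_pos = 0
    while True:
        pos = name_lower.find(word_lower, start_pos)
        if pos == -1:
            return -1
        end = pos + len(word)
        if not any(pos < t_end and end > t_start for t_start, t_end in taken):
            return pos
        start_pos = pos + 1

# The occurrence at 0-6 is already taken, so the second occurrence (index 7) is returned.
print(find_non_overlapping("Ilunga ilunga", "ilunga", [(0, 6)]))  # 7
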
@@ -120,8 +124,13 @@ class NameTagger:
continue
# Check for overlaps with already validated entities
if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities):
logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'")
if any(
start < v_end and end > v_start
for v_start, v_end, _ in validated_entities
):
logging.warning(
f"Overlapping span ({start}, {end}, '{label}') in '{name}'"
)
continue
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
@@ -200,10 +209,16 @@ class NameTagger:
elif entities_str.startswith("[[") and entities_str.endswith("]]"):
return [tuple(e) for e in ast.literal_eval(entities_str)]
elif entities_str.startswith("[{") and entities_str.endswith("}]"):
return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
return [
(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)
]
else:
parsed = ast.literal_eval(entities_str)
return [tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3]
return [
tuple(e)
for e in parsed
if isinstance(e, (list, tuple)) and len(e) == 3
]
except (ValueError, SyntaxError, json.JSONDecodeError):
return []
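
The branches above accept several serialized shapes for the same annotation and normalize them to (start, end, label) tuples. A small sketch of the two formats handled in this hunk (the literal strings are invented examples):

import ast
import json

list_style = "[[0, 7, 'NATIVE'], [8, 15, 'SURNAME']]"
dict_style = '[{"start": 0, "end": 7, "label": "NATIVE"}]'

as_tuples = [tuple(e) for e in ast.literal_eval(list_style)]
# [(0, 7, 'NATIVE'), (8, 15, 'SURNAME')]

from_dicts = [(e["start"], e["end"], e["label"]) for e in json.loads(dict_style)]
# [(0, 7, 'NATIVE')]
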
@@ -251,7 +266,9 @@ class NameTagger:
last_end = e
return filtered
def validate_entities(self, texts: pd.Series, entities_series: pd.Series) -> pd.Series:
def validate_entities(
self, texts: pd.Series, entities_series: pd.Series
) -> pd.Series:
"""Vectorized entity validation."""
return pd.Series(map(self.validate, texts, entities_series), index=texts.index)
@@ -4,9 +4,9 @@ from typing import Dict, Any
import pandas as pd
from processing.batch.batch_config import BatchConfig
from processing.batch.batch_processor import BatchProcessor
from processing.steps import PipelineStep
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.batch.batch_processor import BatchProcessor
from ners.processing.steps import PipelineStep
class Pipeline:
@@ -8,9 +8,9 @@ from typing import List, Optional
import pandas as pd
from pydantic import BaseModel
from core.config.pipeline_config import PipelineConfig
from core.utils.data_loader import DataLoader
from processing.batch.batch_config import BatchConfig
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.processing.batch.batch_config import BatchConfig
@dataclass
@@ -38,7 +38,10 @@ class PipelineStep(ABC):
"""Abstract base class for pipeline steps"""
def __init__(
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
self,
name: str,
pipeline_config: PipelineConfig,
batch_config: Optional[BatchConfig] = None,
):
self.name = name
self.pipeline_config = pipeline_config
@@ -2,9 +2,9 @@ import logging
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from core.utils.text_cleaner import TextCleaner
from processing.steps import PipelineStep
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.text_cleaner import TextCleaner
from ners.processing.steps import PipelineStep
class DataCleaningStep(PipelineStep):
@@ -2,8 +2,8 @@ import logging
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from processing.steps import PipelineStep
from ners.core.config.pipeline_config import PipelineConfig
from ners.processing.steps import PipelineStep
class DataSelectionStep(PipelineStep):
@@ -20,15 +20,23 @@ class DataSelectionStep(PipelineStep):
# Remove rows where region == "global" only for specific years
if "region" in batch.columns and "year" in batch.columns:
target_years = {2015, 2021, 2022}
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(target_years)
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(
target_years
)
removed = int(mask_remove.sum())
if removed:
batch = batch[~mask_remove]
logging.info(f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}")
logging.info(
f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}"
)
# Check which columns exist in the batch
available_columns = [col for col in self.selected_columns if col in batch.columns]
missing_columns = [col for col in self.selected_columns if col not in batch.columns]
available_columns = [
col for col in self.selected_columns if col in batch.columns
]
missing_columns = [
col for col in self.selected_columns if col not in batch.columns
]
if missing_columns:
logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}")
@@ -1,11 +1,11 @@
import numpy as np
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from core.utils.region_mapper import RegionMapper
from processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep
from processing.steps.feature_extraction_step import Gender
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.region_mapper import RegionMapper
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.steps import PipelineStep
from ners.processing.steps.feature_extraction_step import Gender
class DataSplittingStep(PipelineStep):
@@ -26,7 +26,9 @@ class DataSplittingStep(PipelineStep):
if self.eval_indices is None:
np.random.seed(self.pipeline_config.data.random_seed)
eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
self.eval_indices = set(np.random.choice(total_size, size=eval_size, replace=False))
self.eval_indices = set(
np.random.choice(total_size, size=eval_size, replace=False)
)
return self.eval_indices
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
@@ -45,7 +47,9 @@ class DataSplittingStep(PipelineStep):
df_evaluation = df[eval_mask]
df_featured = df[~eval_mask]
self.data_loader.save_csv(df_evaluation, data_dir / output_files["evaluation"])
self.data_loader.save_csv(
df_evaluation, data_dir / output_files["evaluation"]
)
self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
else:
self.data_loader.save_csv(df, data_dir / output_files["featured"])
@@ -53,7 +57,9 @@ class DataSplittingStep(PipelineStep):
if self.pipeline_config.data.split_by_province:
for province in RegionMapper.get_provinces():
df_region = df[df.province == province]
self.data_loader.save_csv(df_region, data_dir / "provinces" / f"{province}.csv")
self.data_loader.save_csv(
df_region, data_dir / "provinces" / f"{province}.csv"
)
if self.pipeline_config.data.split_by_gender:
df_males = df[df.sex == Gender.MALE.value]
@@ -5,10 +5,10 @@ from typing import Dict, Any
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from core.utils.region_mapper import RegionMapper
from processing.ner.name_tagger import NameTagger
from processing.steps import PipelineStep
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.region_mapper import RegionMapper
from ners.processing.ner.name_tagger import NameTagger
from ners.processing.steps import PipelineStep
class Gender(Enum):
@@ -64,10 +64,14 @@ class FeatureExtractionStep(PipelineStep):
self._assign_probable_names(result)
self._process_simple_names(result)
result["identified_category"] = self._assign_identified_category(result["words"])
result["identified_category"] = self._assign_identified_category(
result["words"]
)
if "year" in result.columns:
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype("Int16")
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype(
"Int16"
)
if "region" in result.columns:
result["province"] = self.region_mapper.map(result["region"]).str.lower()
@@ -7,12 +7,12 @@ import ollama
import pandas as pd
from pydantic import ValidationError
from core.config.pipeline_config import PipelineConfig
from core.utils.prompt_manager import PromptManager
from core.utils.rate_limiter import RateLimitConfig
from core.utils.rate_limiter import RateLimiter
from processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep, NameAnnotation
from ners.core.config.pipeline_config import PipelineConfig
from ners.core.utils.prompt_manager import PromptManager
from ners.core.utils.rate_limiter import RateLimitConfig
from ners.core.utils.rate_limiter import RateLimiter
from ners.processing.batch.batch_config import BatchConfig
from ners.processing.steps import PipelineStep, NameAnnotation
class LLMAnnotationStep(PipelineStep):
@@ -24,7 +24,8 @@ class LLMAnnotationStep(PipelineStep):
batch_config = BatchConfig(
batch_size=pipeline_config.processing.batch_size,
max_workers=min(
self.llm_config.max_concurrent_requests, pipeline_config.processing.max_workers
self.llm_config.max_concurrent_requests,
pipeline_config.processing.max_workers,
),
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
@@ -33,7 +34,9 @@ class LLMAnnotationStep(PipelineStep):
self.prompt = PromptManager(pipeline_config).load_prompt()
self.rate_limiter = (
self._create_rate_limiter() if self.llm_config.enable_rate_limiting else None
self._create_rate_limiter()
if self.llm_config.enable_rate_limiting
else None
)
# Statistics
@@ -76,7 +79,9 @@ class LLMAnnotationStep(PipelineStep):
f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout"
)
annotation = NameAnnotation.model_validate_json(response.message.content)
annotation = NameAnnotation.model_validate_json(
response.message.content
)
result = {
**annotation.model_dump(),
"annotated": 1,
@@ -119,7 +124,9 @@ class LLMAnnotationStep(PipelineStep):
logging.info(f"Batch {batch_id}: No entries to annotate")
return batch
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM")
logging.info(
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM"
)
batch = batch.copy()
client = ollama.Client()
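
This step validates the raw LLM reply straight into a pydantic model before merging it into the batch. A minimal sketch of that pattern, with the ollama call elided: the stand-in model mirrors the two fields (identified_name, identified_surname) visible later in this diff and may omit others, and the JSON payload is invented:

from typing import Optional

from pydantic import BaseModel, ValidationError


class NameAnnotation(BaseModel):
    # Stand-in with the two fields visible in this diff; the real model may define more.
    identified_name: Optional[str] = None
    identified_surname: Optional[str] = None


raw = '{"identified_name": "Mukendi", "identified_surname": "Kabongo"}'  # would come from response.message.content
try:
    annotation = NameAnnotation.model_validate_json(raw)
    result = {**annotation.model_dump(), "annotated": 1}
except ValidationError as e:
    result = {"annotated": 0, "error": str(e)}

print(result)  # {'identified_name': 'Mukendi', 'identified_surname': 'Kabongo', 'annotated': 1}
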
@@ -5,9 +5,9 @@ from typing import Dict
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from processing.ner.name_model import NameModel
from processing.steps import PipelineStep, NameAnnotation
from ners.core.config.pipeline_config import PipelineConfig
from ners.processing.ner.name_model import NameModel
from ners.processing.steps import PipelineStep, NameAnnotation
class NERAnnotationStep(PipelineStep):
@@ -39,7 +39,9 @@ class NERAnnotationStep(PipelineStep):
logging.info("NER model loaded successfully")
else:
logging.warning(f"NER model not found at {self.model_path}")
logging.warning("NER annotation will be skipped. Train the model first.")
logging.warning(
"NER annotation will be skipped. Train the model first."
)
self.name_model.nlp = None
except Exception as e:
logging.error(f"Failed to load NER model: {e}")
@@ -80,7 +82,9 @@ class NERAnnotationStep(PipelineStep):
# Create annotation result in same format as LLM step
annotation = NameAnnotation(
identified_name=" ".join(native_parts) if native_parts else None,
identified_surname=" ".join(surname_parts) if surname_parts else None,
identified_surname=" ".join(surname_parts)
if surname_parts
else None,
)
result = {
@@ -124,7 +128,9 @@ class NERAnnotationStep(PipelineStep):
logging.info(f"Batch {batch_id}: No entries to annotate")
return batch
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER")
logging.info(
f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER"
)
batch = batch.copy()
@@ -7,7 +7,7 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from research.experiment import ExperimentConfig
from ners.research.experiment import ExperimentConfig
class BaseModel(ABC):
@@ -103,16 +103,25 @@ class BaseModel(ABC):
feature_names = self._get_feature_names()
return dict(zip(feature_names, coefficients))
elif hasattr(self.model, "named_steps") and "classifier" in self.model.named_steps:
elif (
hasattr(self.model, "named_steps")
and "classifier" in self.model.named_steps
):
# For sklearn pipelines (like LogisticRegression with vectorizer)
classifier = self.model.named_steps["classifier"]
if hasattr(classifier, "coef_"):
coefficients = np.abs(classifier.coef_[0])
if hasattr(self.model.named_steps["vectorizer"], "get_feature_names_out"):
feature_names = self.model.named_steps["vectorizer"].get_feature_names_out()
if hasattr(
self.model.named_steps["vectorizer"], "get_feature_names_out"
):
feature_names = self.model.named_steps[
"vectorizer"
].get_feature_names_out()
# Take top features to avoid too many n-grams
top_indices = np.argsort(coefficients)[-20:]
return dict(zip(feature_names[top_indices], coefficients[top_indices]))
return dict(
zip(feature_names[top_indices], coefficients[top_indices])
)
return None
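
A compact sketch of the pipeline case handled above: a vectorizer plus logistic regression, with the largest absolute coefficients mapped back to feature names. A character n-gram TF-IDF vectorizer is assumed here, and the training strings, labels, and top-k are invented:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

model = Pipeline(
    [
        ("vectorizer", TfidfVectorizer(analyzer="char", ngram_range=(2, 3))),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)
names = ["mukendi", "kabongo", "marie", "josephine"]   # invented
labels = ["m", "m", "f", "f"]
model.fit(names, labels)

classifier = model.named_steps["classifier"]
coefficients = np.abs(classifier.coef_[0])
feature_names = model.named_steps["vectorizer"].get_feature_names_out()
top_indices = np.argsort(coefficients)[-5:]   # keep only the strongest n-grams
print(dict(zip(feature_names[top_indices], coefficients[top_indices])))
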
@@ -143,7 +152,7 @@ class BaseModel(ABC):
model_data = joblib.load(path)
# Recreate the model instance
from research.experiment import ExperimentConfig
from ners.research.experiment import ExperimentConfig
config = ExperimentConfig.from_dict(model_data["config"])
instance = cls(config)
@@ -221,7 +230,9 @@ class BaseModel(ABC):
if "accuracy" in self.training_history:
axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
if "val_accuracy" in self.training_history:
axes[0].plot(self.training_history["val_accuracy"], label="Validation Accuracy")
axes[0].plot(
self.training_history["val_accuracy"], label="Validation Accuracy"
)
axes[0].set_title("Model Accuracy")
axes[0].set_xlabel("Epoch")
axes[0].set_ylabel("Accuracy")
@@ -18,7 +18,9 @@ class ExperimentConfig:
tags: List[str] = field(default_factory=list)
# Model configuration
model_type: str = "logistic_regression" # logistic_regression, lstm, transformer, etc.
model_type: str = (
"logistic_regression" # logistic_regression, lstm, transformer, etc.
)
model_params: Dict[str, Any] = field(default_factory=dict)
# Feature configuration
@@ -26,7 +28,9 @@ class ExperimentConfig:
feature_params: Dict[str, Any] = field(default_factory=dict)
# Data configuration
train_data_filter: Optional[Dict[str, Any]] = None # Filter criteria for training data
train_data_filter: Optional[Dict[str, Any]] = (
None # Filter criteria for training data
)
test_data_filter: Optional[Dict[str, Any]] = None
target_column: str = "sex"
@@ -36,7 +40,9 @@ class ExperimentConfig:
cross_validation_folds: int = 5
# Evaluation configuration
metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"])
metrics: List[str] = field(
default_factory=lambda: ["accuracy", "precision", "recall", "f1"]
)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization"""
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional, Dict, List, Any
from research.experiment import ExperimentConfig, ExperimentStatus
from ners.research.experiment import ExperimentConfig, ExperimentStatus
@dataclass
@@ -51,6 +51,8 @@ class ExperimentResult:
"""Create from dictionary"""
data["config"] = ExperimentConfig.from_dict(data["config"])
data["start_time"] = datetime.fromisoformat(data["start_time"])
data["end_time"] = datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
data["end_time"] = (
datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
)
data["status"] = ExperimentStatus(data["status"])
return cls(**data)
@@ -3,9 +3,9 @@ from typing import List, Dict
import yaml
from core.config.pipeline_config import PipelineConfig
from research.experiment import ExperimentConfig
from research.experiment.feature_extractor import FeatureType
from ners.core.config.pipeline_config import PipelineConfig
from ners.research.experiment import ExperimentConfig
from ners.research.experiment.feature_extractor import FeatureType
class ExperimentBuilder:
@@ -27,7 +27,9 @@ class ExperimentBuilder:
raise
@classmethod
def find_template(cls, templates: dict, name: str, experiment_type: str = "baseline") -> dict:
def find_template(
cls, templates: dict, name: str, experiment_type: str = "baseline"
) -> dict:
"""Find experiment configuration by name and type"""
# Map type to section in templates
@@ -9,12 +9,16 @@ import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from core.config import PipelineConfig
from core.utils.data_loader import DataLoader
from research.base_model import BaseModel
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
from research.experiment.experiment_tracker import ExperimentTracker
from research.model_registry import create_model
from ners.core.config import PipelineConfig
from ners.core.utils.data_loader import DataLoader
from ners.research.base_model import BaseModel
from ners.research.experiment import (
ExperimentConfig,
ExperimentStatus,
calculate_metrics,
)
from ners.research.experiment.experiment_tracker import ExperimentTracker
from ners.research.model_registry import create_model
class ExperimentRunner:
@@ -32,10 +36,14 @@ class ExperimentRunner:
try:
logging.info(f"Starting experiment: {experiment_id}")
self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
self.tracker.update_experiment(
experiment_id, status=ExperimentStatus.RUNNING
)
# Load data
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
filepath = self.config.paths.get_data_path(
self.config.data.output_files["featured"]
)
df = self.data_loader.load_csv_complete(filepath)
# Apply data filters if specified
@@ -63,8 +71,12 @@ class ExperimentRunner:
test_pred = model.predict(X_test)
# Calculate metrics
train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics)
test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics)
train_metrics = calculate_metrics(
y_train, train_pred, experiment_config.metrics
)
test_metrics = calculate_metrics(
y_test, test_pred, experiment_config.metrics
)
# Cross-validation if requested
cv_metrics = {}
@@ -125,7 +137,9 @@ class ExperimentRunner:
experiment_ids = []
for i, config in enumerate(experiments):
logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}")
logging.info(
f"Running experiment {i + 1}/{len(experiments)}: {config.name}"
)
try:
exp_id = self.run_experiment(config)
experiment_ids.append(exp_id)
@@ -136,7 +150,9 @@ class ExperimentRunner:
return experiment_ids
@classmethod
def _apply_data_filters(cls, df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame:
def _apply_data_filters(
cls, df: pd.DataFrame, config: ExperimentConfig
) -> pd.DataFrame:
"""Apply data filters specified in experiment config"""
filtered_df = df.copy()
@@ -148,9 +164,13 @@ class ExperimentRunner:
filtered_df = filtered_df[filtered_df[column].isin(criteria)]
elif isinstance(criteria, dict):
if "min" in criteria:
filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
filtered_df = filtered_df[
filtered_df[column] >= criteria["min"]
]
if "max" in criteria:
filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
filtered_df = filtered_df[
filtered_df[column] <= criteria["max"]
]
else:
filtered_df = filtered_df[filtered_df[column] == criteria]
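
Putting the three branches together, a filter spec is a column-to-criteria mapping where a list means membership, a dict bounds a range, and a scalar means equality. A sketch with an invented spec and frame:

import pandas as pd

df = pd.DataFrame(
    {
        "province": ["kinshasa", "haut-katanga", "kinshasa", "ituri"],
        "year": [2014, 2018, 2021, 2022],
        "sex": ["m", "f", "f", "m"],
    }
)
train_data_filter = {
    "province": ["kinshasa", "ituri"],    # list -> isin
    "year": {"min": 2015, "max": 2022},   # dict -> bounded range
    "sex": "f",                           # scalar -> equality
}

filtered_df = df.copy()
for column, criteria in train_data_filter.items():
    if isinstance(criteria, list):
        filtered_df = filtered_df[filtered_df[column].isin(criteria)]
    elif isinstance(criteria, dict):
        if "min" in criteria:
            filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
        if "max" in criteria:
            filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
    else:
        filtered_df = filtered_df[filtered_df[column] == criteria]

print(filtered_df)   # only the kinshasa / 2021 / f row survives
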
@@ -231,7 +251,9 @@ class ExperimentRunner:
return model
except Exception as e:
logging.error(f"Failed to load model for experiment {experiment_id}: {e}")
logging.error(
f"Failed to load model for experiment {experiment_id}: {e}"
)
return None
return None

Some files were not shown because too many files have changed in this diff