chore(release): v1.0.0
@@ -10,8 +10,14 @@ million names from the Democratic Republic of Congo (DRC) annotated with gender
|
|||||||
|
|
||||||
### Installation & Setup
|
### Installation & Setup
|
||||||
|
|
||||||
|
> Download [the dataset](https://drive.google.com/file/d/1a5wQnOZdsRWBOeoMA_0lNtbneTvS9xqy/view?usp=drive_link). If you need access, please reach us at mlec.academia@gmail.com.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
||||||
|
|
||||||
|
mkdir -p drc-ners-nlp/data/dataset
|
||||||
|
cp names.csv drc-ners-nlp/data/dataset
|
||||||
|
|
||||||
cd drc-ners-nlp
|
cd drc-ners-nlp
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -27,13 +33,11 @@ uv sync
|
|||||||
docker compose build
|
docker compose build
|
||||||
docker compose exec app bash
|
docker compose exec app bash
|
||||||
```
|
```
|
||||||
s
|
|
||||||
## Data Processing
|
## Data Processing
|
||||||
|
|
||||||
This project includes a robust data processing pipeline designed to handle large datasets efficiently with batching,
|
This project includes a robust data processing pipeline designed to handle large datasets efficiently with batching,
|
||||||
checkpointing, and parallel processing capabilities.
|
checkpointing, and parallel processing capabilities.
|
||||||
Steps are defined in the `drc-ners-nlp/processing/steps` directory, and the configuration to enable them is managed through
|
|
||||||
the `drc-ners-nlp/config/pipeline.yaml` file.
|
|
||||||
|
|
||||||
**Pipeline Configuration**
|
**Pipeline Configuration**
|
||||||
|
|
||||||
@@ -54,8 +58,7 @@ uv run ners pipeline run --env="production"
|
|||||||
## Experiments
|
## Experiments
|
||||||
|
|
||||||
This project provides a modular experiment (model training and evaluation) framework for systematic model comparison and
|
This project provides a modular experiment (model training and evaluation) framework for systematic model comparison and
|
||||||
research iteration. Models are defined in the `drc-ners-nlp/research/models` directory.
|
research iteration. You can define model features, training parameters, and evaluation metrics in the `config/research_templates.yaml` file.
|
||||||
You can define model features, training parameters, and evaluation metrics in the `research_templates.yaml` file.
|
|
||||||
|
|
||||||
**Running Experiments**
|
**Running Experiments**
|
||||||
|
|
||||||
@@ -64,42 +67,58 @@ you can define model features, training parameters, and evaluation metrics in th
|
|||||||
uv run ners research train --name="bigru" --type="baseline" --env="production"
|
uv run ners research train --name="bigru" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="bigru_native" --type="baseline" --env="production"
|
uv run ners research train --name="bigru_native" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="bigru_surname" --type="baseline" --env="production"
|
uv run ners research train --name="bigru_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# cnn
|
# cnn
|
||||||
uv run ners research train --name="cnn" --type="baseline" --env="production"
|
uv run ners research train --name="cnn" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="cnn_native" --type="baseline" --env="production"
|
uv run ners research train --name="cnn_native" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="cnn_surname" --type="baseline" --env="production"
|
uv run ners research train --name="cnn_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# lightgbm
|
# lightgbm
|
||||||
uv run ners research train --name="lightgbm" --type="baseline" --env="production"
|
uv run ners research train --name="lightgbm" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="lightgbm_native" --type="baseline" --env="production"
|
uv run ners research train --name="lightgbm_native" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="lightgbm_surname" --type="baseline" --env="production"
|
uv run ners research train --name="lightgbm_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# logistic regression
|
# logistic regression
|
||||||
uv run ners research train --name="logistic_regression" --type="baseline" --env="production"
|
uv run ners research train --name="logistic_regression" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="logistic_regression_native" --type="baseline" --env="production"
|
uv run ners research train --name="logistic_regression_native" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="logistic_regression_surname" --type="baseline" --env="production"
|
uv run ners research train --name="logistic_regression_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# lstm
|
# lstm
|
||||||
uv run ners research train --name="lstm" --type="baseline" --env="production"
|
uv run ners research train --name="lstm" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="lstm_native" --type="baseline" --env="production"
|
uv run ners research train --name="lstm_native" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="lstm_surname" --type="baseline" --env="production"
|
uv run ners research train --name="lstm_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# random forest
|
# random forest
|
||||||
uv run ners research train --name="random_forest" --type="baseline" --env="production"
|
uv run ners research train --name="random_forest" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="random_forest_native" --type="baseline" --env="production"
|
uv run ners research train --name="random_forest_native" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="random_forest_surname" --type="baseline" --env="production"
|
uv run ners research train --name="random_forest_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# naive bayes
|
# naive bayes
|
||||||
uv run ners research train --name="naive_bayes" --type="baseline" --env="production"
|
uv run ners research train --name="naive_bayes" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="naive_bayes_native" --type="baseline" --env="production"
|
uv run ners research train --name="naive_bayes_native" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="naive_bayes_surname" --type="baseline" --env="production"
|
uv run ners research train --name="naive_bayes_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# transformer
|
# transformer
|
||||||
uv run ners research train --name="transformer" --type="baseline" --env="production"
|
uv run ners research train --name="transformer" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="transformer_native" --type="baseline" --env="production"
|
uv run ners research train --name="transformer_native" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="transformer_surname" --type="baseline" --env="production"
|
uv run ners research train --name="transformer_surname" --type="baseline" --env="production"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
# xgboost
|
# xgboost
|
||||||
uv run ners research train --name="xgboost" --type="baseline" --env="production"
|
uv run ners research train --name="xgboost" --type="baseline" --env="production"
|
||||||
uv run ners research train --name="xgboost_native" --type="baseline" --env="production"
|
uv run ners research train --name="xgboost_native" --type="baseline" --env="production"
|
||||||
@@ -113,6 +132,8 @@ experiments and make predictions without needing to understand the underlying co
|
|||||||
|
|
||||||
### Running the Web Interface
|
### Running the Web Interface
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
uv run ners web run --env="production"
|
uv run ners web run --env="production"
|
||||||
```
|
```
|
||||||
@@ -121,6 +142,8 @@ uv run ners web run --env="production"
|
|||||||
docker compose run --rm --service-ports app ners web run --env=production
|
docker compose run --rm --service-ports app ners web run --env=production
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Then open: http://localhost:8501/
|
||||||
|
|
||||||
## Contributors
|
## Contributors
|
||||||
|
|
||||||
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
|
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
<rdf:RDF xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
<rdf:RDF xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||||
<cc:Work>
|
<cc:Work>
|
||||||
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/>
|
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/>
|
||||||
<dc:date>2025-10-05T22:52:00.716620</dc:date>
|
<dc:date>2025-10-05T23:30:08.756879</dc:date>
|
||||||
<dc:format>image/svg+xml</dc:format>
|
<dc:format>image/svg+xml</dc:format>
|
||||||
<dc:creator>
|
<dc:creator>
|
||||||
<cc:Agent>
|
<cc:Agent>
|
||||||
|
|||||||
|
Before Width: | Height: | Size: 30 KiB After Width: | Height: | Size: 30 KiB |
|
Before Width: | Height: | Size: 33 KiB After Width: | Height: | Size: 37 KiB |
|
Before Width: | Height: | Size: 463 KiB After Width: | Height: | Size: 454 KiB |
|
Before Width: | Height: | Size: 34 KiB After Width: | Height: | Size: 38 KiB |
|
Before Width: | Height: | Size: 464 KiB After Width: | Height: | Size: 455 KiB |
@@ -1,3 +1,3 @@
|
|||||||
category,l2,kl_mf,kl_fm,jsd,permutation_p_value
|
category,l2,kl_mf,kl_fm,jsd,permutation_p_value
|
||||||
names,0.3189041485139616,0.04320097944655348,0.0215380760498496,0.03236952774820154,0.973
|
names,0.3189041485139616,0.04320097944655348,0.0215380760498496,0.03236952774820154,0.978
|
||||||
surnames,1.2770018925640299,0.2936188220992242,0.23989460296618093,0.26675671253270256,0.003
|
surnames,1.2770018925640299,0.2936188220992242,0.23989460296618093,0.26675671253270256,0.001
|
||||||
|
|||||||
|
|
Before Width: | Height: | Size: 45 KiB After Width: | Height: | Size: 48 KiB |
|
After Width: | Height: | Size: 158 KiB |
@@ -25,7 +25,7 @@ def plot_letter_frequencies(males, females, sort_values=False, title=None):
|
|||||||
|
|
||||||
# Combine into one DataFrame
|
# Combine into one DataFrame
|
||||||
df_plot = pd.DataFrame({"Male": L_m, "Female": L_f}).fillna(0).reset_index()
|
df_plot = pd.DataFrame({"Male": L_m, "Female": L_f}).fillna(0).reset_index()
|
||||||
df_plot.to_csv(f"../assets/{title}_letter_frequencies.csv", index=False)
|
df_plot.to_csv(f"../../assets/{title}_letter_frequencies.csv", index=False)
|
||||||
|
|
||||||
# Optional sorting
|
# Optional sorting
|
||||||
if sort_values:
|
if sort_values:
|
||||||
|
|||||||