feat: add osm data
This commit is contained in:
@@ -62,7 +62,7 @@ stages:
|
|||||||
**Running the Pipeline**
|
**Running the Pipeline**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python main.py --env development
|
python main.py --env production
|
||||||
```
|
```
|
||||||
|
|
||||||
## NER Processing (Optional)
|
## NER Processing (Optional)
|
||||||
@@ -72,7 +72,7 @@ Its main objective is to accurately identify and tag the different components of
|
|||||||
specifically distinguishing between the native part and the surname.
|
specifically distinguishing between the native part and the surname.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python ner.py --env development
|
python ner.py --env production
|
||||||
```
|
```
|
||||||
|
|
||||||
Once you've built and trained the NER model, you can use it to annotate **COMPOSE** names in the original dataset
|
Once you've built and trained the NER model, you can use it to annotate **COMPOSE** names in the original dataset
|
||||||
@@ -105,54 +105,54 @@ you can define model features, training parameters, and evaluation metrics in th
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# bigru
|
# bigru
|
||||||
python train.py --name="bigru" --type="baseline" --env="development"
|
python train.py --name="bigru" --type="baseline" --env="production"
|
||||||
python train.py --name="bigru_native" --type="baseline" --env="development"
|
python train.py --name="bigru_native" --type="baseline" --env="production"
|
||||||
python train.py --name="bigru_surname" --type="baseline" --env="development"
|
python train.py --name="bigru_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# cnn
|
# cnn
|
||||||
python train.py --name="cnn" --type="baseline" --env="development"
|
python train.py --name="cnn" --type="baseline" --env="production"
|
||||||
python train.py --name="cnn_native" --type="baseline" --env="development"
|
python train.py --name="cnn_native" --type="baseline" --env="production"
|
||||||
python train.py --name="cnn_surname" --type="baseline" --env="development"
|
python train.py --name="cnn_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# lightgbm
|
# lightgbm
|
||||||
python train.py --name="lightgbm" --type="baseline" --env="development"
|
python train.py --name="lightgbm" --type="baseline" --env="production"
|
||||||
python train.py --name="lightgbm_native" --type="baseline" --env="development"
|
python train.py --name="lightgbm_native" --type="baseline" --env="production"
|
||||||
python train.py --name="lightgbm_surname" --type="baseline" --env="development"
|
python train.py --name="lightgbm_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# logistic regression
|
# logistic regression
|
||||||
python train.py --name="logistic_regression" --type="baseline" --env="development"
|
python train.py --name="logistic_regression" --type="baseline" --env="production"
|
||||||
python train.py --name="logistic_regression_native" --type="baseline" --env="development"
|
python train.py --name="logistic_regression_native" --type="baseline" --env="production"
|
||||||
python train.py --name="logistic_regression_surname" --type="baseline" --env="development"
|
python train.py --name="logistic_regression_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# lstm
|
# lstm
|
||||||
python train.py --name="lstm" --type="baseline" --env="development"
|
python train.py --name="lstm" --type="baseline" --env="production"
|
||||||
python train.py --name="lstm_native" --type="baseline" --env="development"
|
python train.py --name="lstm_native" --type="baseline" --env="production"
|
||||||
python train.py --name="lstm_surname" --type="baseline" --env="development"
|
python train.py --name="lstm_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# random forest
|
# random forest
|
||||||
python train.py --name="random_forest" --type="baseline" --env="development"
|
python train.py --name="random_forest" --type="baseline" --env="production"
|
||||||
python train.py --name="random_forest_native" --type="baseline" --env="development"
|
python train.py --name="random_forest_native" --type="baseline" --env="production"
|
||||||
python train.py --name="random_forest_surname" --type="baseline" --env="development"
|
python train.py --name="random_forest_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# svm
|
# svm
|
||||||
python train.py --name="svm" --type="baseline" --env="development"
|
python train.py --name="svm" --type="baseline" --env="production"
|
||||||
python train.py --name="svm_native" --type="baseline" --env="development"
|
python train.py --name="svm_native" --type="baseline" --env="production"
|
||||||
python train.py --name="svm_surname" --type="baseline" --env="development"
|
python train.py --name="svm_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# naive bayes
|
# naive bayes
|
||||||
python train.py --name="naive_bayes" --type="baseline" --env="development"
|
python train.py --name="naive_bayes" --type="baseline" --env="production"
|
||||||
python train.py --name="naive_bayes_native" --type="baseline" --env="development"
|
python train.py --name="naive_bayes_native" --type="baseline" --env="production"
|
||||||
python train.py --name="naive_bayes_surname" --type="baseline" --env="development"
|
python train.py --name="naive_bayes_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# transformer
|
# transformer
|
||||||
python train.py --name="transformer" --type="baseline" --env="development"
|
python train.py --name="transformer" --type="baseline" --env="production"
|
||||||
python train.py --name="transformer_native" --type="baseline" --env="development"
|
python train.py --name="transformer_native" --type="baseline" --env="production"
|
||||||
python train.py --name="transformer_surname" --type="baseline" --env="development"
|
python train.py --name="transformer_surname" --type="baseline" --env="production"
|
||||||
|
|
||||||
# xgboost
|
# xgboost
|
||||||
python train.py --name="xgboost" --type="baseline" --env="development"
|
python train.py --name="xgboost" --type="baseline" --env="production"
|
||||||
python train.py --name="xgboost_native" --type="baseline" --env="development"
|
python train.py --name="xgboost_native" --type="baseline" --env="production"
|
||||||
python train.py --name="xgboost_surname" --type="baseline" --env="development"
|
python train.py --name="xgboost_surname" --type="baseline" --env="production"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Web Interface
|
## Web Interface
|
||||||
@@ -171,3 +171,6 @@ streamlit run web/app.py
|
|||||||
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
|
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
|
||||||
<img src="https://contrib.rocks/image?repo=bernard-ng/drc-ners-nlp" alt="contributors"/>
|
<img src="https://contrib.rocks/image?repo=bernard-ng/drc-ners-nlp" alt="contributors"/>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
## Acknowledgements
|
||||||
|
- Map Visualization: [https://data.humdata.org/dataset/anciennes-provinces-rdc-old-provinces-drc](https://data.humdata.org/dataset/anciennes-provinces-rdc-old-provinces-drc)
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ processing:
|
|||||||
# Pipeline stages
|
# Pipeline stages
|
||||||
stages:
|
stages:
|
||||||
- "data_cleaning"
|
- "data_cleaning"
|
||||||
|
- "data_selection"
|
||||||
- "feature_extraction"
|
- "feature_extraction"
|
||||||
#- "ner_annotation"
|
#- "ner_annotation"
|
||||||
#- "llm_annotation"
|
#- "llm_annotation"
|
||||||
|
|||||||
@@ -3,17 +3,18 @@ debug: false
|
|||||||
|
|
||||||
# Processing settings
|
# Processing settings
|
||||||
processing:
|
processing:
|
||||||
batch_size: 10_000
|
batch_size: 100_000
|
||||||
max_workers: 8
|
max_workers: 4
|
||||||
checkpoint_interval: 10
|
checkpoint_interval: 10
|
||||||
use_multiprocessing: true
|
use_multiprocessing: true
|
||||||
|
|
||||||
# Pipeline stages
|
# Pipeline stages
|
||||||
stages:
|
stages:
|
||||||
- "data_cleaning"
|
- "data_cleaning"
|
||||||
|
- "data_selection"
|
||||||
- "feature_extraction"
|
- "feature_extraction"
|
||||||
- "ner_annotation"
|
# - "ner_annotation"
|
||||||
- "llm_annotation"
|
# - "llm_annotation"
|
||||||
- "data_splitting"
|
- "data_splitting"
|
||||||
|
|
||||||
# Production LLM settings
|
# Production LLM settings
|
||||||
@@ -34,7 +35,7 @@ data:
|
|||||||
# Production logging (less verbose)
|
# Production logging (less verbose)
|
||||||
logging:
|
logging:
|
||||||
level: "INFO"
|
level: "INFO"
|
||||||
console_logging: false
|
console_logging: true
|
||||||
file_logging: true
|
file_logging: true
|
||||||
log_file: "pipeline.production.log"
|
log_file: "pipeline.production.log"
|
||||||
max_log_size: 52428800 # 50MB
|
max_log_size: 52428800 # 50MB
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ paths:
|
|||||||
# List of stages in the processing pipeline
|
# List of stages in the processing pipeline
|
||||||
stages:
|
stages:
|
||||||
- "data_cleaning" # Data cleaning stage
|
- "data_cleaning" # Data cleaning stage
|
||||||
|
- "data_selection" # Data selection stage - keep only required columns
|
||||||
- "feature_extraction" # Feature extraction stage
|
- "feature_extraction" # Feature extraction stage
|
||||||
- "ner_annotation" # NER-based annotation stage
|
- "ner_annotation" # NER-based annotation stage
|
||||||
- "llm_annotation" # LLM annotation stage (computational intensive)
|
- "llm_annotation" # LLM annotation stage (computational intensive)
|
||||||
@@ -64,6 +65,11 @@ data:
|
|||||||
females: "names_females.csv" # Output files for female names
|
females: "names_females.csv" # Output files for female names
|
||||||
ner_data: "names_ner.json" # Output file for NER annotated data
|
ner_data: "names_ner.json" # Output file for NER annotated data
|
||||||
ner_spacy: "names_ner.spacy" # Output file for NER annotated data using spaCy format
|
ner_spacy: "names_ner.spacy" # Output file for NER annotated data using spaCy format
|
||||||
|
selected_columns: # Required columns for processing
|
||||||
|
- name
|
||||||
|
- sex
|
||||||
|
- region
|
||||||
|
- year
|
||||||
split_evaluation: false # Should the dataset be split into training and evaluation sets ?
|
split_evaluation: false # Should the dataset be split into training and evaluation sets ?
|
||||||
split_by_gender: true # Should the dataset be split by gender ?
|
split_by_gender: true # Should the dataset be split by gender ?
|
||||||
split_by_province: true # Should the dataset be split by province ?
|
split_by_province: true # Should the dataset be split by province ?
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ class DataConfig(BaseModel):
|
|||||||
"ner_spacy": "names_ner.spacy",
|
"ner_spacy": "names_ner.spacy",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
selected_columns: list[str] = field(default=["name", "sex", "region"])
|
||||||
split_evaluation: bool = False
|
split_evaluation: bool = False
|
||||||
split_by_province: bool = True
|
split_by_province: bool = True
|
||||||
split_by_gender: bool = True
|
split_by_gender: bool = True
|
||||||
|
|||||||
@@ -8,12 +8,10 @@ class RegionMapper:
|
|||||||
|
|
||||||
def __init__(self, mapping: Optional[Dict] = None):
|
def __init__(self, mapping: Optional[Dict] = None):
|
||||||
self.mapping = mapping or REGION_MAPPING
|
self.mapping = mapping or REGION_MAPPING
|
||||||
|
self.mapping = {k.lower(): v[1].upper() for k, v in self.mapping.items()}
|
||||||
|
|
||||||
def map(self, series: pd.Series) -> pd.Series:
|
def map(self, series: pd.Series) -> pd.Series:
|
||||||
"""Vectorized region to province mapping"""
|
return series.str.lower().map(self.mapping).fillna("AUTRES")
|
||||||
return series.str.lower().map(
|
|
||||||
lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
|
|
||||||
)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_provinces():
|
def get_provinces():
|
||||||
|
|||||||
@@ -30,9 +30,8 @@ class TextCleaner:
|
|||||||
def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
|
def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""Clean all text columns in a DataFrame"""
|
"""Clean all text columns in a DataFrame"""
|
||||||
df = df.copy()
|
df = df.copy()
|
||||||
text_columns = df.select_dtypes(include="object").columns
|
columns = df.select_dtypes(include=["object", "string"]).columns
|
||||||
|
for col in columns:
|
||||||
for col in text_columns:
|
|
||||||
df[col] = self.clean_text_series(df[col])
|
df[col] = self.clean_text_series(df[col])
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|||||||
@@ -9,9 +9,9 @@ from core.utils.data_loader import DataLoader
|
|||||||
from processing.batch.batch_config import BatchConfig
|
from processing.batch.batch_config import BatchConfig
|
||||||
from processing.pipeline import Pipeline
|
from processing.pipeline import Pipeline
|
||||||
from processing.steps.data_cleaning_step import DataCleaningStep
|
from processing.steps.data_cleaning_step import DataCleaningStep
|
||||||
|
from processing.steps.data_selection_step import DataSelectionStep
|
||||||
from processing.steps.data_splitting_step import DataSplittingStep
|
from processing.steps.data_splitting_step import DataSplittingStep
|
||||||
from processing.steps.feature_extraction_step import FeatureExtractionStep
|
from processing.steps.feature_extraction_step import FeatureExtractionStep
|
||||||
from processing.steps.llm_annotation_step import LLMAnnotationStep
|
|
||||||
|
|
||||||
|
|
||||||
def create_pipeline(config) -> Pipeline:
|
def create_pipeline(config) -> Pipeline:
|
||||||
@@ -28,8 +28,9 @@ def create_pipeline(config) -> Pipeline:
|
|||||||
steps = [
|
steps = [
|
||||||
DataCleaningStep(config),
|
DataCleaningStep(config),
|
||||||
FeatureExtractionStep(config),
|
FeatureExtractionStep(config),
|
||||||
|
DataSelectionStep(config),
|
||||||
# NERAnnotationStep(config),
|
# NERAnnotationStep(config),
|
||||||
LLMAnnotationStep(config),
|
# LLMAnnotationStep(config),
|
||||||
]
|
]
|
||||||
|
|
||||||
for stage in config.stages:
|
for stage in config.stages:
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from processing.monitoring.pipeline_monitor import PipelineMonitor
|
|||||||
def main():
|
def main():
|
||||||
choices = [
|
choices = [
|
||||||
"data_cleaning",
|
"data_cleaning",
|
||||||
|
"data_selection",
|
||||||
"feature_extraction",
|
"feature_extraction",
|
||||||
"ner_annotation",
|
"ner_annotation",
|
||||||
"llm_annotation",
|
"llm_annotation",
|
||||||
|
|||||||
Vendored
+99
-93
File diff suppressed because one or more lines are too long
Vendored
+430
-98
File diff suppressed because one or more lines are too long
Vendored
+107
@@ -0,0 +1,107 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"metadata": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": "# Qualitative Analysis",
|
||||||
|
"id": "d20715dd63f57364"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"id": "c93a55c8",
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-09-21T13:34:50.973298Z",
|
||||||
|
"start_time": "2025-09-21T13:34:50.969142Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import geopandas as gpd\n",
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"import seaborn as sns\n",
|
||||||
|
"import sys\n",
|
||||||
|
"import os\n",
|
||||||
|
"\n",
|
||||||
|
"sys.path.append(os.path.abspath(\"..\"))\n",
|
||||||
|
"from core.utils.data_loader import DataLoader\n",
|
||||||
|
"from core.config.pipeline_config import PipelineConfig"
|
||||||
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"id": "c0b00261",
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-09-21T13:34:51.002610Z",
|
||||||
|
"start_time": "2025-09-21T13:34:50.998586Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"config = PipelineConfig(\n",
|
||||||
|
" paths={\n",
|
||||||
|
" \"root_dir\": \"../data\",\n",
|
||||||
|
" \"data_dir\": \"../data/dataset\",\n",
|
||||||
|
" \"models_dir\": \"../models\",\n",
|
||||||
|
" \"outputs_dir\": \"../data/processed\",\n",
|
||||||
|
" \"logs_dir\": \"../logs\",\n",
|
||||||
|
" \"configs_dir\": \"../configs\",\n",
|
||||||
|
" \"checkpoints_dir\": \"../checkpoints\"\n",
|
||||||
|
" }\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"loader = DataLoader(config)"
|
||||||
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-09-21T13:35:27.430639Z",
|
||||||
|
"start_time": "2025-09-21T13:34:51.013412Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 5,
|
||||||
|
"source": [
|
||||||
|
"gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
|
||||||
|
"gdf_proj = gdf.to_crs(epsg=32732)\n",
|
||||||
|
"gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
|
||||||
|
"\n",
|
||||||
|
"df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
|
||||||
|
],
|
||||||
|
"id": "b38394ce38864379"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": "## Exploration",
|
||||||
|
"id": "a1af5626d2a948d6"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.11"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Vendored
+107
@@ -0,0 +1,107 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"metadata": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": "# Quantitative Analysis",
|
||||||
|
"id": "a605c0f92056a825"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"id": "c93a55c8",
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-09-21T14:14:47.287549Z",
|
||||||
|
"start_time": "2025-09-21T14:14:47.279199Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import geopandas as gpd\n",
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"import seaborn as sns\n",
|
||||||
|
"import sys\n",
|
||||||
|
"import os\n",
|
||||||
|
"\n",
|
||||||
|
"sys.path.append(os.path.abspath(\"..\"))\n",
|
||||||
|
"from core.utils.data_loader import DataLoader\n",
|
||||||
|
"from core.config.pipeline_config import PipelineConfig"
|
||||||
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 30
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"id": "c0b00261",
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-09-21T14:14:47.315980Z",
|
||||||
|
"start_time": "2025-09-21T14:14:47.308376Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"config = PipelineConfig(\n",
|
||||||
|
" paths={\n",
|
||||||
|
" \"root_dir\": \"../data\",\n",
|
||||||
|
" \"data_dir\": \"../data/dataset\",\n",
|
||||||
|
" \"models_dir\": \"../models\",\n",
|
||||||
|
" \"outputs_dir\": \"../data/processed\",\n",
|
||||||
|
" \"logs_dir\": \"../logs\",\n",
|
||||||
|
" \"configs_dir\": \"../configs\",\n",
|
||||||
|
" \"checkpoints_dir\": \"../checkpoints\"\n",
|
||||||
|
" }\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"loader = DataLoader(config)"
|
||||||
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 31
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-09-21T14:15:47.899044Z",
|
||||||
|
"start_time": "2025-09-21T14:14:47.339266Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
|
||||||
|
"gdf_proj = gdf.to_crs(epsg=32732)\n",
|
||||||
|
"gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
|
||||||
|
"\n",
|
||||||
|
"df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
|
||||||
|
],
|
||||||
|
"id": "b38394ce38864379",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 32
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": "## Exploration",
|
||||||
|
"id": "a1af5626d2a948d6"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.11"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
UTF-8
|
||||||
Binary file not shown.
@@ -0,0 +1 @@
|
|||||||
|
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
|
||||||
Binary file not shown.
Binary file not shown.
@@ -211,7 +211,9 @@ class NameModel:
|
|||||||
for batch in batches:
|
for batch in batches:
|
||||||
batch_losses = {}
|
batch_losses = {}
|
||||||
self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
|
self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
|
||||||
logging.info(f"Training batch with {len(batch)} examples, current losses: {batch_losses}")
|
logging.info(
|
||||||
|
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
|
||||||
|
)
|
||||||
|
|
||||||
# Accumulate into total losses dict
|
# Accumulate into total losses dict
|
||||||
for k, v in batch_losses.items():
|
for k, v in batch_losses.items():
|
||||||
|
|||||||
@@ -49,6 +49,9 @@ class Pipeline:
|
|||||||
"processed_batches": step.state.processed_batches,
|
"processed_batches": step.state.processed_batches,
|
||||||
"total_batches": step.state.total_batches,
|
"total_batches": step.state.total_batches,
|
||||||
"failed_batches": len(step.state.failed_batches),
|
"failed_batches": len(step.state.failed_batches),
|
||||||
"completion_percentage": (step.state.processed_batches / max(1, step.state.total_batches)) * 100,
|
"completion_percentage": (
|
||||||
|
step.state.processed_batches / max(1, step.state.total_batches)
|
||||||
|
)
|
||||||
|
* 100,
|
||||||
}
|
}
|
||||||
return progress
|
return progress
|
||||||
|
|||||||
@@ -0,0 +1,43 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
|
class DataSelectionStep(PipelineStep):
    """Pipeline step that narrows each batch down to a configured set of columns.

    The columns to keep are read once, at construction time, from
    ``pipeline_config.data.selected_columns``.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("data_selection", pipeline_config)
        self.selected_columns = pipeline_config.data.selected_columns

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Return a copy of ``batch`` restricted to the configured columns.

        Configured columns that are absent from the batch are reported with a
        warning and otherwise ignored. If none of the configured columns is
        present, an error is logged and an empty ``DataFrame`` is returned.

        Args:
            batch: The rows to filter.
            batch_id: Identifier of the batch, used only in log messages.

        Returns:
            A new ``DataFrame`` holding the configured columns found in
            ``batch``, in configuration order, or an empty ``DataFrame``.
        """
        logging.info(f"Selecting columns for batch {batch_id} with {len(batch)} rows")

        # Partition the configured columns by whether the batch provides them.
        present = []
        absent = []
        for name in self.selected_columns:
            if name in batch.columns:
                present.append(name)
            else:
                absent.append(name)

        if absent:
            logging.warning(f"Missing columns in batch {batch_id}: {absent}")

        # Nothing usable in this batch: report it and hand back an empty frame.
        if not present:
            logging.error(f"No required columns found in batch {batch_id}")
            return pd.DataFrame()

        # Copy so the result is independent of the incoming batch.
        narrowed = batch[present]
        result = narrowed.copy()

        logging.info(
            f"Selected {len(present)} columns for batch {batch_id}: {present}"
        )

        return result

    @property
    def requires_batch_mutation(self) -> bool:
        """Always ``True``: this step changes the batch by dropping columns."""
        return True
|
||||||
@@ -72,7 +72,9 @@ class ExperimentBuilder:
|
|||||||
f"Available experiments: {available_experiments}"
|
f"Available experiments: {available_experiments}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_templates(self, templates_path: str = "research_templates.yaml") -> Dict[str, List[Dict]]:
|
def get_templates(
|
||||||
|
self, templates_path: str = "research_templates.yaml"
|
||||||
|
) -> Dict[str, List[Dict]]:
|
||||||
"""Get all available experiments from templates organized by type"""
|
"""Get all available experiments from templates organized by type"""
|
||||||
templates = self.load_templates(templates_path)
|
templates = self.load_templates(templates_path)
|
||||||
|
|
||||||
@@ -80,7 +82,7 @@ class ExperimentBuilder:
|
|||||||
"baseline": templates.get("baseline_experiments", []),
|
"baseline": templates.get("baseline_experiments", []),
|
||||||
"advanced": templates.get("advanced_experiments", []),
|
"advanced": templates.get("advanced_experiments", []),
|
||||||
"feature_study": templates.get("feature_studies", []),
|
"feature_study": templates.get("feature_studies", []),
|
||||||
"tuning": templates.get("hyperparameter_tuning", [])
|
"tuning": templates.get("hyperparameter_tuning", []),
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -104,5 +106,5 @@ class ExperimentBuilder:
|
|||||||
tags=template_config.get("tags", []),
|
tags=template_config.get("tags", []),
|
||||||
test_size=template_config.get("test_size", 0.2),
|
test_size=template_config.get("test_size", 0.2),
|
||||||
cross_validation_folds=template_config.get("cross_validation_folds", 5),
|
cross_validation_folds=template_config.get("cross_validation_folds", 5),
|
||||||
train_data_filter=template_config.get("train_data_filter")
|
train_data_filter=template_config.get("train_data_filter"),
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -82,7 +82,9 @@ class EnsembleModel(TraditionalModel):
|
|||||||
# Soft voting averages probabilities (preferred when members are calibrated);
|
# Soft voting averages probabilities (preferred when members are calibrated);
|
||||||
# hard voting uses majority class. Parallelize member predictions.
|
# hard voting uses majority class. Parallelize member predictions.
|
||||||
voting_type = params.get("voting", "soft") # 'hard' or 'soft'
|
voting_type = params.get("voting", "soft") # 'hard' or 'soft'
|
||||||
return VotingClassifier(estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1))
|
return VotingClassifier(
|
||||||
|
estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1)
|
||||||
|
)
|
||||||
|
|
||||||
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
||||||
text_features = []
|
text_features = []
|
||||||
|
|||||||
@@ -55,7 +55,9 @@ class RandomForestModel(TraditionalModel):
|
|||||||
encoder = self.label_encoders[feature_key]
|
encoder = self.label_encoders[feature_key]
|
||||||
column_clean = column.fillna("unknown").astype(str)
|
column_clean = column.fillna("unknown").astype(str)
|
||||||
known_classes = set(encoder.classes_)
|
known_classes = set(encoder.classes_)
|
||||||
default_class = "unknown" if "unknown" in known_classes else encoder.classes_[0]
|
default_class = (
|
||||||
|
"unknown" if "unknown" in known_classes else encoder.classes_[0]
|
||||||
|
)
|
||||||
column_mapped = column_clean.apply(
|
column_mapped = column_clean.apply(
|
||||||
lambda value: value if value in known_classes else default_class
|
lambda value: value if value in known_classes else default_class
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -36,9 +36,9 @@ class TransformerModel(NeuralNetworkModel):
|
|||||||
|
|
||||||
# Add positional encoding
|
# Add positional encoding
|
||||||
positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
|
positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
|
||||||
pos_embedding = Embedding(input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64))(
|
pos_embedding = Embedding(
|
||||||
positions
|
input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64)
|
||||||
)
|
)(positions)
|
||||||
x = x + pos_embedding
|
x = x + pos_embedding
|
||||||
|
|
||||||
x = self._transformer_encoder(x, params)
|
x = self._transformer_encoder(x, params)
|
||||||
|
|||||||
@@ -84,7 +84,9 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
|
def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
|
||||||
"""Combine configured textual features into one string per record."""
|
"""Combine configured textual features into one string per record."""
|
||||||
|
|
||||||
column_names = [feature.value for feature in self.config.features if feature.value in X.columns]
|
column_names = [
|
||||||
|
feature.value for feature in self.config.features if feature.value in X.columns
|
||||||
|
]
|
||||||
if not column_names:
|
if not column_names:
|
||||||
raise ValueError("No configured text features found in the provided DataFrame.")
|
raise ValueError("No configured text features found in the provided DataFrame.")
|
||||||
|
|
||||||
|
|||||||
+3
-1
@@ -50,7 +50,9 @@ class StreamlitApp:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def run(cls):
|
def run(cls):
|
||||||
st.title("🇨🇩 DRC NERS Platform")
|
st.title("🇨🇩 DRC NERS Platform")
|
||||||
st.markdown("A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference")
|
st.markdown(
|
||||||
|
"A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
|
||||||
|
)
|
||||||
st.markdown(
|
st.markdown(
|
||||||
"""
|
"""
|
||||||
## Overview
|
## Overview
|
||||||
|
|||||||
@@ -1,2 +1 @@
|
|||||||
from .ner_testing import NERTesting
|
from .ner_testing import NERTesting
|
||||||
|
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ class Experiments:
|
|||||||
self,
|
self,
|
||||||
config: PipelineConfig,
|
config: PipelineConfig,
|
||||||
experiment_tracker: ExperimentTracker,
|
experiment_tracker: ExperimentTracker,
|
||||||
experiment_runner: ExperimentRunner
|
experiment_runner: ExperimentRunner,
|
||||||
):
|
):
|
||||||
self.config = config
|
self.config = config
|
||||||
self.experiment_tracker = experiment_tracker
|
self.experiment_tracker = experiment_tracker
|
||||||
@@ -26,8 +26,7 @@ class Experiments:
|
|||||||
def index(self):
|
def index(self):
|
||||||
st.title("Experiments")
|
st.title("Experiments")
|
||||||
|
|
||||||
tab1, tab2, tab3 = st.tabs(
|
tab1, tab2, tab3 = st.tabs(["Templates", "Experiments", "Batch Experiments"])
|
||||||
["Templates", "Experiments", "Batch Experiments"])
|
|
||||||
|
|
||||||
with tab1:
|
with tab1:
|
||||||
self.show_template_experiments()
|
self.show_template_experiments()
|
||||||
@@ -56,14 +55,18 @@ class Experiments:
|
|||||||
self._show_experiments_by_type(available_experiments["advanced"], "advanced")
|
self._show_experiments_by_type(available_experiments["advanced"], "advanced")
|
||||||
|
|
||||||
with exp_tabs[2]:
|
with exp_tabs[2]:
|
||||||
self._show_experiments_by_type(available_experiments["feature_study"], "feature_study")
|
self._show_experiments_by_type(
|
||||||
|
available_experiments["feature_study"], "feature_study"
|
||||||
|
)
|
||||||
|
|
||||||
with exp_tabs[3]:
|
with exp_tabs[3]:
|
||||||
self._show_experiments_by_type(available_experiments["tuning"], "tuning")
|
self._show_experiments_by_type(available_experiments["tuning"], "tuning")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Error loading experiment templates: {e}")
|
st.error(f"Error loading experiment templates: {e}")
|
||||||
st.info("Make sure the research templates file exists at `config/research_templates.yaml`")
|
st.info(
|
||||||
|
"Make sure the research templates file exists at `config/research_templates.yaml`"
|
||||||
|
)
|
||||||
|
|
||||||
def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str):
|
def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str):
|
||||||
"""Show experiments for a specific type"""
|
"""Show experiments for a specific type"""
|
||||||
@@ -213,7 +216,7 @@ class Experiments:
|
|||||||
experiment_types = st.multiselect(
|
experiment_types = st.multiselect(
|
||||||
"Select Experiment Types",
|
"Select Experiment Types",
|
||||||
["baseline", "advanced", "feature_study", "tuning"],
|
["baseline", "advanced", "feature_study", "tuning"],
|
||||||
default=["baseline"]
|
default=["baseline"],
|
||||||
)
|
)
|
||||||
|
|
||||||
if experiment_types:
|
if experiment_types:
|
||||||
@@ -223,11 +226,11 @@ class Experiments:
|
|||||||
experiments = available_experiments.get(exp_type, [])
|
experiments = available_experiments.get(exp_type, [])
|
||||||
if experiments:
|
if experiments:
|
||||||
st.write(f"**{exp_type.title()} Experiments:**")
|
st.write(f"**{exp_type.title()} Experiments:**")
|
||||||
exp_names = [exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)]
|
exp_names = [
|
||||||
|
exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)
|
||||||
|
]
|
||||||
selected_names = st.multiselect(
|
selected_names = st.multiselect(
|
||||||
f"Select {exp_type} experiments",
|
f"Select {exp_type} experiments", exp_names, key=f"select_{exp_type}"
|
||||||
exp_names,
|
|
||||||
key=f"select_{exp_type}"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
for name in selected_names:
|
for name in selected_names:
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ class LogReader:
|
|||||||
|
|
||||||
# Parse log entries from the end
|
# Parse log entries from the end
|
||||||
entries = []
|
entries = []
|
||||||
for line in reversed(lines[-count * 2:]): # Read more lines in case some don't match
|
for line in reversed(lines[-count * 2 :]): # Read more lines in case some don't match
|
||||||
entry = self._parse_log_line(line.strip())
|
entry = self._parse_log_line(line.strip())
|
||||||
if entry:
|
if entry:
|
||||||
entries.append(entry)
|
entries.append(entry)
|
||||||
|
|||||||
@@ -33,7 +33,9 @@ class NERTesting:
|
|||||||
|
|
||||||
# Load model
|
# Load model
|
||||||
if not self.load_ner_model():
|
if not self.load_ner_model():
|
||||||
st.warning("NER model could not be loaded. Please ensure the model is trained and available.")
|
st.warning(
|
||||||
|
"NER model could not be loaded. Please ensure the model is trained and available."
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Display model information
|
# Display model information
|
||||||
@@ -53,9 +55,11 @@ class NERTesting:
|
|||||||
col1, col2, col3, col4 = st.columns(4)
|
col1, col2, col3, col4 = st.columns(4)
|
||||||
|
|
||||||
with col1:
|
with col1:
|
||||||
st.metric("Training Examples", f"{self.training_stats.get('training_examples', 0):,}")
|
st.metric(
|
||||||
|
"Training Examples", f"{self.training_stats.get('training_examples', 0):,}"
|
||||||
|
)
|
||||||
with col2:
|
with col2:
|
||||||
st.metric("Epochs", self.training_stats.get('epochs', 0))
|
st.metric("Epochs", self.training_stats.get("epochs", 0))
|
||||||
with col3:
|
with col3:
|
||||||
st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}")
|
st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}")
|
||||||
with col4:
|
with col4:
|
||||||
@@ -64,7 +68,7 @@ class NERTesting:
|
|||||||
def show_model_evaluation_info(self):
|
def show_model_evaluation_info(self):
|
||||||
if self.evaluation_stats:
|
if self.evaluation_stats:
|
||||||
col1, col2, col3 = st.columns(4)
|
col1, col2, col3 = st.columns(4)
|
||||||
overall = self.evaluation_stats.get('overall', {})
|
overall = self.evaluation_stats.get("overall", {})
|
||||||
|
|
||||||
with col1:
|
with col1:
|
||||||
st.metric("Overall Precision", f"{overall['precision']:.2f}")
|
st.metric("Overall Precision", f"{overall['precision']:.2f}")
|
||||||
@@ -79,7 +83,7 @@ class NERTesting:
|
|||||||
name_input = st.text_input(
|
name_input = st.text_input(
|
||||||
"Name:",
|
"Name:",
|
||||||
placeholder="e.g., Jean Baptiste Mukendi, Marie Kabamba Tshiala, Joseph Kasongo",
|
placeholder="e.g., Jean Baptiste Mukendi, Marie Kabamba Tshiala, Joseph Kasongo",
|
||||||
help="Enter a full name or multiple names separated by spaces"
|
help="Enter a full name or multiple names separated by spaces",
|
||||||
)
|
)
|
||||||
if name_input.strip():
|
if name_input.strip():
|
||||||
if st.button("Analyze Name", type="primary"):
|
if st.button("Analyze Name", type="primary"):
|
||||||
@@ -90,12 +94,12 @@ class NERTesting:
|
|||||||
"Names:",
|
"Names:",
|
||||||
placeholder="Jean Baptiste Mukendi\nMarie Kabamba Tshiala\nJoseph Kasongo\nGrace Mbuyi Kalala",
|
placeholder="Jean Baptiste Mukendi\nMarie Kabamba Tshiala\nJoseph Kasongo\nGrace Mbuyi Kalala",
|
||||||
height=150,
|
height=150,
|
||||||
help="Enter each name on a new line"
|
help="Enter each name on a new line",
|
||||||
)
|
)
|
||||||
|
|
||||||
if names_input.strip():
|
if names_input.strip():
|
||||||
if st.button("Analyze All Names", type="primary"):
|
if st.button("Analyze All Names", type="primary"):
|
||||||
names = [name.strip() for name in names_input.split('\n') if name.strip()]
|
names = [name.strip() for name in names_input.split("\n") if name.strip()]
|
||||||
for i, name in enumerate(names):
|
for i, name in enumerate(names):
|
||||||
st.markdown(f"**Name {i+1}: {name}**")
|
st.markdown(f"**Name {i+1}: {name}**")
|
||||||
self.analyze_and_display(name)
|
self.analyze_and_display(name)
|
||||||
@@ -106,12 +110,12 @@ class NERTesting:
|
|||||||
try:
|
try:
|
||||||
result = self.ner_model.predict(text)
|
result = self.ner_model.predict(text)
|
||||||
st.subheader("Analysis Results")
|
st.subheader("Analysis Results")
|
||||||
entities = result.get('entities', [])
|
entities = result.get("entities", [])
|
||||||
|
|
||||||
if entities:
|
if entities:
|
||||||
self.show_visual_entities(text, entities)
|
self.show_visual_entities(text, entities)
|
||||||
native_count = sum(1 for e in entities if e['label'] == 'NATIVE')
|
native_count = sum(1 for e in entities if e["label"] == "NATIVE")
|
||||||
surname_count = sum(1 for e in entities if e['label'] == 'SURNAME')
|
surname_count = sum(1 for e in entities if e["label"] == "SURNAME")
|
||||||
|
|
||||||
col1, col2, col3 = st.columns(3)
|
col1, col2, col3 = st.columns(3)
|
||||||
with col1:
|
with col1:
|
||||||
@@ -134,29 +138,17 @@ class NERTesting:
|
|||||||
# Convert our entities format to spaCy format for displacy
|
# Convert our entities format to spaCy format for displacy
|
||||||
ents = []
|
ents = []
|
||||||
for entity in entities:
|
for entity in entities:
|
||||||
ents.append({
|
ents.append(
|
||||||
"start": entity['start'],
|
{"start": entity["start"], "end": entity["end"], "label": entity["label"]}
|
||||||
"end": entity['end'],
|
)
|
||||||
"label": entity['label']
|
|
||||||
})
|
|
||||||
|
|
||||||
# Create doc-like structure for displacy
|
# Create doc-like structure for displacy
|
||||||
doc_data = {
|
doc_data = {"text": text, "ents": ents, "title": None}
|
||||||
"text": text,
|
|
||||||
"ents": ents,
|
|
||||||
"title": None
|
|
||||||
}
|
|
||||||
|
|
||||||
# Custom colors for our labels
|
# Custom colors for our labels
|
||||||
colors = {
|
colors = {"NATIVE": "#74C0FC", "SURNAME": "#69DB7C"} # Light blue # Light green
|
||||||
"NATIVE": "#74C0FC", # Light blue
|
|
||||||
"SURNAME": "#69DB7C" # Light green
|
|
||||||
}
|
|
||||||
|
|
||||||
options = {
|
options = {"colors": colors, "distance": 90}
|
||||||
"colors": colors,
|
|
||||||
"distance": 90
|
|
||||||
}
|
|
||||||
|
|
||||||
# Generate HTML visualization
|
# Generate HTML visualization
|
||||||
html = displacy.render(doc_data, style="ent", manual=True, options=options)
|
html = displacy.render(doc_data, style="ent", manual=True, options=options)
|
||||||
|
|||||||
Reference in New Issue
Block a user