feat: add osm data
This commit is contained in:
@@ -62,7 +62,7 @@ stages:
|
||||
**Running the Pipeline**
|
||||
|
||||
```bash
|
||||
python main.py --env development
|
||||
python main.py --env production
|
||||
```
|
||||
|
||||
## NER Processing (Optional)
|
||||
@@ -72,7 +72,7 @@ Its main objective is to accurately identify and tag the different components of
|
||||
specifically distinguishing between the native part and the surname.
|
||||
|
||||
```bash
|
||||
python ner.py --env development
|
||||
python ner.py --env production
|
||||
```
|
||||
|
||||
Once you've built and trained the NER model, you can use it to annotate **COMPOSE** names in the original dataset
|
||||
@@ -105,54 +105,54 @@ you can define model features, training parameters, and evaluation metrics in th
|
||||
|
||||
```bash
|
||||
# bigru
|
||||
python train.py --name="bigru" --type="baseline" --env="development"
|
||||
python train.py --name="bigru_native" --type="baseline" --env="development"
|
||||
python train.py --name="bigru_surname" --type="baseline" --env="development"
|
||||
python train.py --name="bigru" --type="baseline" --env="production"
|
||||
python train.py --name="bigru_native" --type="baseline" --env="production"
|
||||
python train.py --name="bigru_surname" --type="baseline" --env="production"
|
||||
|
||||
# cnn
|
||||
python train.py --name="cnn" --type="baseline" --env="development"
|
||||
python train.py --name="cnn_native" --type="baseline" --env="development"
|
||||
python train.py --name="cnn_surname" --type="baseline" --env="development"
|
||||
python train.py --name="cnn" --type="baseline" --env="production"
|
||||
python train.py --name="cnn_native" --type="baseline" --env="production"
|
||||
python train.py --name="cnn_surname" --type="baseline" --env="production"
|
||||
|
||||
# lightgbm
|
||||
python train.py --name="lightgbm" --type="baseline" --env="development"
|
||||
python train.py --name="lightgbm_native" --type="baseline" --env="development"
|
||||
python train.py --name="lightgbm_surname" --type="baseline" --env="development"
|
||||
python train.py --name="lightgbm" --type="baseline" --env="production"
|
||||
python train.py --name="lightgbm_native" --type="baseline" --env="production"
|
||||
python train.py --name="lightgbm_surname" --type="baseline" --env="production"
|
||||
|
||||
# logistic regression
|
||||
python train.py --name="logistic_regression" --type="baseline" --env="development"
|
||||
python train.py --name="logistic_regression_native" --type="baseline" --env="development"
|
||||
python train.py --name="logistic_regression_surname" --type="baseline" --env="development"
|
||||
python train.py --name="logistic_regression" --type="baseline" --env="production"
|
||||
python train.py --name="logistic_regression_native" --type="baseline" --env="production"
|
||||
python train.py --name="logistic_regression_surname" --type="baseline" --env="production"
|
||||
|
||||
# lstm
|
||||
python train.py --name="lstm" --type="baseline" --env="development"
|
||||
python train.py --name="lstm_native" --type="baseline" --env="development"
|
||||
python train.py --name="lstm_surname" --type="baseline" --env="development"
|
||||
python train.py --name="lstm" --type="baseline" --env="production"
|
||||
python train.py --name="lstm_native" --type="baseline" --env="production"
|
||||
python train.py --name="lstm_surname" --type="baseline" --env="production"
|
||||
|
||||
# random forest
|
||||
python train.py --name="random_forest" --type="baseline" --env="development"
|
||||
python train.py --name="random_forest_native" --type="baseline" --env="development"
|
||||
python train.py --name="random_forest_surname" --type="baseline" --env="development"
|
||||
python train.py --name="random_forest" --type="baseline" --env="production"
|
||||
python train.py --name="random_forest_native" --type="baseline" --env="production"
|
||||
python train.py --name="random_forest_surname" --type="baseline" --env="production"
|
||||
|
||||
# svm
|
||||
python train.py --name="svm" --type="baseline" --env="development"
|
||||
python train.py --name="svm_native" --type="baseline" --env="development"
|
||||
python train.py --name="svm_surname" --type="baseline" --env="development"
|
||||
python train.py --name="svm" --type="baseline" --env="production"
|
||||
python train.py --name="svm_native" --type="baseline" --env="production"
|
||||
python train.py --name="svm_surname" --type="baseline" --env="production"
|
||||
|
||||
# naive bayes
|
||||
python train.py --name="naive_bayes" --type="baseline" --env="development"
|
||||
python train.py --name="naive_bayes_native" --type="baseline" --env="development"
|
||||
python train.py --name="naive_bayes_surname" --type="baseline" --env="development"
|
||||
python train.py --name="naive_bayes" --type="baseline" --env="production"
|
||||
python train.py --name="naive_bayes_native" --type="baseline" --env="production"
|
||||
python train.py --name="naive_bayes_surname" --type="baseline" --env="production"
|
||||
|
||||
# transformer
|
||||
python train.py --name="transformer" --type="baseline" --env="development"
|
||||
python train.py --name="transformer_native" --type="baseline" --env="development"
|
||||
python train.py --name="transformer_surname" --type="baseline" --env="development"
|
||||
python train.py --name="transformer" --type="baseline" --env="production"
|
||||
python train.py --name="transformer_native" --type="baseline" --env="production"
|
||||
python train.py --name="transformer_surname" --type="baseline" --env="production"
|
||||
|
||||
# xgboost
|
||||
python train.py --name="xgboost" --type="baseline" --env="development"
|
||||
python train.py --name="xgboost_native" --type="baseline" --env="development"
|
||||
python train.py --name="xgboost_surname" --type="baseline" --env="development"
|
||||
python train.py --name="xgboost" --type="baseline" --env="production"
|
||||
python train.py --name="xgboost_native" --type="baseline" --env="production"
|
||||
python train.py --name="xgboost_surname" --type="baseline" --env="production"
|
||||
```
|
||||
|
||||
## Web Interface
|
||||
@@ -171,3 +171,6 @@ streamlit run web/app.py
|
||||
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
|
||||
<img src="https://contrib.rocks/image?repo=bernard-ng/drc-ners-nlp" alt="contributors"/>
|
||||
</a>
|
||||
|
||||
## Acknowledgements
|
||||
- Map Visualization: [https://data.humdata.org/dataset/anciennes-provinces-rdc-old-provinces-drc](https://data.humdata.org/dataset/anciennes-provinces-rdc-old-provinces-drc)
|
||||
|
||||
@@ -11,6 +11,7 @@ processing:
|
||||
# Pipeline stages
|
||||
stages:
|
||||
- "data_cleaning"
|
||||
- "data_selection"
|
||||
- "feature_extraction"
|
||||
#- "ner_annotation"
|
||||
#- "llm_annotation"
|
||||
|
||||
@@ -3,17 +3,18 @@ debug: false
|
||||
|
||||
# Processing settings
|
||||
processing:
|
||||
batch_size: 10_000
|
||||
max_workers: 8
|
||||
batch_size: 100_000
|
||||
max_workers: 4
|
||||
checkpoint_interval: 10
|
||||
use_multiprocessing: true
|
||||
|
||||
# Pipeline stages
|
||||
stages:
|
||||
- "data_cleaning"
|
||||
- "data_selection"
|
||||
- "feature_extraction"
|
||||
- "ner_annotation"
|
||||
- "llm_annotation"
|
||||
# - "ner_annotation"
|
||||
# - "llm_annotation"
|
||||
- "data_splitting"
|
||||
|
||||
# Production LLM settings
|
||||
@@ -34,7 +35,7 @@ data:
|
||||
# Production logging (less verbose)
|
||||
logging:
|
||||
level: "INFO"
|
||||
console_logging: false
|
||||
console_logging: true
|
||||
file_logging: true
|
||||
log_file: "pipeline.production.log"
|
||||
max_log_size: 52428800 # 50MB
|
||||
|
||||
@@ -21,6 +21,7 @@ paths:
|
||||
# List of stages in the processing pipeline
|
||||
stages:
|
||||
- "data_cleaning" # Data cleaning stage
|
||||
- "data_selection" # Data selection stage - keep only required columns
|
||||
- "feature_extraction" # Feature extraction stage
|
||||
- "ner_annotation" # NER-based annotation stage
|
||||
- "llm_annotation" # LLM annotation stage (computationally intensive)
|
||||
@@ -64,6 +65,11 @@ data:
|
||||
females: "names_females.csv" # Output files for female names
|
||||
ner_data: "names_ner.json" # Output file for NER annotated data
|
||||
ner_spacy: "names_ner.spacy" # Output file for NER annotated data using spaCy format
|
||||
selected_columns: # Required columns for processing
|
||||
- name
|
||||
- sex
|
||||
- region
|
||||
- year
|
||||
split_evaluation: false # Should the dataset be split into training and evaluation sets ?
|
||||
split_by_gender: true # Should the dataset be split by gender ?
|
||||
split_by_province: true # Should the dataset be split by province ?
|
||||
|
||||
@@ -19,6 +19,7 @@ class DataConfig(BaseModel):
|
||||
"ner_spacy": "names_ner.spacy",
|
||||
}
|
||||
)
|
||||
selected_columns: list[str] = field(default=["name", "sex", "region"])
|
||||
split_evaluation: bool = False
|
||||
split_by_province: bool = True
|
||||
split_by_gender: bool = True
|
||||
|
||||
@@ -8,12 +8,10 @@ class RegionMapper:
|
||||
|
||||
def __init__(self, mapping: Optional[Dict] = None):
|
||||
self.mapping = mapping or REGION_MAPPING
|
||||
self.mapping = {k.lower(): v[1].upper() for k, v in self.mapping.items()}
|
||||
|
||||
def map(self, series: pd.Series) -> pd.Series:
|
||||
"""Vectorized region to province mapping"""
|
||||
return series.str.lower().map(
|
||||
lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
|
||||
)
|
||||
return series.str.lower().map(self.mapping).fillna("AUTRES")
|
||||
|
||||
@staticmethod
|
||||
def get_provinces():
|
||||
|
||||
@@ -30,9 +30,8 @@ class TextCleaner:
|
||||
def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Clean all text columns in a DataFrame"""
|
||||
df = df.copy()
|
||||
text_columns = df.select_dtypes(include="object").columns
|
||||
|
||||
for col in text_columns:
|
||||
columns = df.select_dtypes(include=["object", "string"]).columns
|
||||
for col in columns:
|
||||
df[col] = self.clean_text_series(df[col])
|
||||
|
||||
return df
|
||||
|
||||
@@ -9,9 +9,9 @@ from core.utils.data_loader import DataLoader
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
from processing.pipeline import Pipeline
|
||||
from processing.steps.data_cleaning_step import DataCleaningStep
|
||||
from processing.steps.data_selection_step import DataSelectionStep
|
||||
from processing.steps.data_splitting_step import DataSplittingStep
|
||||
from processing.steps.feature_extraction_step import FeatureExtractionStep
|
||||
from processing.steps.llm_annotation_step import LLMAnnotationStep
|
||||
|
||||
|
||||
def create_pipeline(config) -> Pipeline:
|
||||
@@ -28,8 +28,9 @@ def create_pipeline(config) -> Pipeline:
|
||||
steps = [
|
||||
DataCleaningStep(config),
|
||||
FeatureExtractionStep(config),
|
||||
DataSelectionStep(config),
|
||||
# NERAnnotationStep(config),
|
||||
LLMAnnotationStep(config),
|
||||
# LLMAnnotationStep(config),
|
||||
]
|
||||
|
||||
for stage in config.stages:
|
||||
|
||||
@@ -11,6 +11,7 @@ from processing.monitoring.pipeline_monitor import PipelineMonitor
|
||||
def main():
|
||||
choices = [
|
||||
"data_cleaning",
|
||||
"data_selection",
|
||||
"feature_extraction",
|
||||
"ner_annotation",
|
||||
"llm_annotation",
|
||||
|
||||
Vendored
+99
-93
File diff suppressed because one or more lines are too long
Vendored
+431
-99
File diff suppressed because one or more lines are too long
Vendored
+107
@@ -0,0 +1,107 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"metadata": {},
|
||||
"cell_type": "markdown",
|
||||
"source": "# Qualitative Analysis",
|
||||
"id": "d20715dd63f57364"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "c93a55c8",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-09-21T13:34:50.973298Z",
|
||||
"start_time": "2025-09-21T13:34:50.969142Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import geopandas as gpd\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import seaborn as sns\n",
|
||||
"import sys\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"sys.path.append(os.path.abspath(\"..\"))\n",
|
||||
"from core.utils.data_loader import DataLoader\n",
|
||||
"from core.config.pipeline_config import PipelineConfig"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 3
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "c0b00261",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-09-21T13:34:51.002610Z",
|
||||
"start_time": "2025-09-21T13:34:50.998586Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"config = PipelineConfig(\n",
|
||||
" paths={\n",
|
||||
" \"root_dir\": \"../data\",\n",
|
||||
" \"data_dir\": \"../data/dataset\",\n",
|
||||
" \"models_dir\": \"../models\",\n",
|
||||
" \"outputs_dir\": \"../data/processed\",\n",
|
||||
" \"logs_dir\": \"../logs\",\n",
|
||||
" \"configs_dir\": \"../configs\",\n",
|
||||
" \"checkpoints_dir\": \"../checkpoints\"\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"loader = DataLoader(config)"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 4
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-09-21T13:35:27.430639Z",
|
||||
"start_time": "2025-09-21T13:34:51.013412Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"execution_count": 5,
|
||||
"source": [
|
||||
"gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
|
||||
"gdf_proj = gdf.to_crs(epsg=32732)\n",
|
||||
"gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
|
||||
"\n",
|
||||
"df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
|
||||
],
|
||||
"id": "b38394ce38864379"
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
"cell_type": "markdown",
|
||||
"source": "## Exploration",
|
||||
"id": "a1af5626d2a948d6"
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Vendored
+107
@@ -0,0 +1,107 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"metadata": {},
|
||||
"cell_type": "markdown",
|
||||
"source": "# Quantitative Analysis",
|
||||
"id": "a605c0f92056a825"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "c93a55c8",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-09-21T14:14:47.287549Z",
|
||||
"start_time": "2025-09-21T14:14:47.279199Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import geopandas as gpd\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import seaborn as sns\n",
|
||||
"import sys\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"sys.path.append(os.path.abspath(\"..\"))\n",
|
||||
"from core.utils.data_loader import DataLoader\n",
|
||||
"from core.config.pipeline_config import PipelineConfig"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 30
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "c0b00261",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-09-21T14:14:47.315980Z",
|
||||
"start_time": "2025-09-21T14:14:47.308376Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"config = PipelineConfig(\n",
|
||||
" paths={\n",
|
||||
" \"root_dir\": \"../data\",\n",
|
||||
" \"data_dir\": \"../data/dataset\",\n",
|
||||
" \"models_dir\": \"../models\",\n",
|
||||
" \"outputs_dir\": \"../data/processed\",\n",
|
||||
" \"logs_dir\": \"../logs\",\n",
|
||||
" \"configs_dir\": \"../configs\",\n",
|
||||
" \"checkpoints_dir\": \"../checkpoints\"\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"loader = DataLoader(config)"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 31
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-09-21T14:15:47.899044Z",
|
||||
"start_time": "2025-09-21T14:14:47.339266Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
|
||||
"gdf_proj = gdf.to_crs(epsg=32732)\n",
|
||||
"gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
|
||||
"\n",
|
||||
"df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
|
||||
],
|
||||
"id": "b38394ce38864379",
|
||||
"outputs": [],
|
||||
"execution_count": 32
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
"cell_type": "markdown",
|
||||
"source": "## Exploration",
|
||||
"id": "a1af5626d2a948d6"
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
UTF-8
|
||||
Binary file not shown.
@@ -0,0 +1 @@
|
||||
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
|
||||
Binary file not shown.
Binary file not shown.
@@ -211,7 +211,9 @@ class NameModel:
|
||||
for batch in batches:
|
||||
batch_losses = {}
|
||||
self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
|
||||
logging.info(f"Training batch with {len(batch)} examples, current losses: {batch_losses}")
|
||||
logging.info(
|
||||
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
|
||||
)
|
||||
|
||||
# Accumulate into total losses dict
|
||||
for k, v in batch_losses.items():
|
||||
|
||||
@@ -49,6 +49,9 @@ class Pipeline:
|
||||
"processed_batches": step.state.processed_batches,
|
||||
"total_batches": step.state.total_batches,
|
||||
"failed_batches": len(step.state.failed_batches),
|
||||
"completion_percentage": (step.state.processed_batches / max(1, step.state.total_batches)) * 100,
|
||||
"completion_percentage": (
|
||||
step.state.processed_batches / max(1, step.state.total_batches)
|
||||
)
|
||||
* 100,
|
||||
}
|
||||
return progress
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from processing.steps import PipelineStep
|
||||
|
||||
|
||||
class DataSelectionStep(PipelineStep):
    """Pipeline step that narrows each batch down to the configured columns.

    The list of columns to keep is read from
    ``pipeline_config.data.selected_columns`` when the step is constructed.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("data_selection", pipeline_config)
        self.selected_columns = pipeline_config.data.selected_columns

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Return a copy of ``batch`` restricted to the configured columns.

        Configured columns that are absent from the batch are reported with a
        warning and skipped. When none of the configured columns is present,
        an error is logged and an empty DataFrame is returned instead.

        Args:
            batch: The batch to filter.
            batch_id: Identifier of the batch, used only in log messages.

        Returns:
            A new DataFrame holding the configured columns found in ``batch``
            (in configured order), or an empty DataFrame if there are none.
        """
        logging.info(f"Selecting columns for batch {batch_id} with {len(batch)} rows")

        # Partition the configured columns by presence, keeping configured order.
        present = []
        absent = []
        for name in self.selected_columns:
            if name in batch.columns:
                present.append(name)
            else:
                absent.append(name)

        if absent:
            logging.warning(f"Missing columns in batch {batch_id}: {absent}")

        if not present:
            logging.error(f"No required columns found in batch {batch_id}")
            return pd.DataFrame()

        # Copy so the returned frame is independent of the incoming batch.
        narrowed = batch[present].copy()

        logging.info(
            f"Selected {len(present)} columns for batch {batch_id}: {present}"
        )

        return narrowed

    @property
    def requires_batch_mutation(self) -> bool:
        """Always ``True``: this step changes the batch by selecting columns."""
        return True
|
||||
@@ -41,14 +41,14 @@ class BaseModel(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def cross_validate(
|
||||
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
|
||||
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
|
||||
) -> Dict[str, float] | dict[str, np.floating[Any]]:
|
||||
"""Perform cross-validation and return average scores"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def generate_learning_curve(
|
||||
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
|
||||
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate learning curve data for the model"""
|
||||
pass
|
||||
|
||||
@@ -55,11 +55,11 @@ class ExperimentBuilder:
|
||||
# Check if this is the experiment we're looking for
|
||||
# Look for experiments that match the model type or contain the name
|
||||
if (
|
||||
experiment.get("model_type") == name
|
||||
or name.lower() in experiment.get("name", "").lower()
|
||||
or experiment.get("name") == name
|
||||
or f"baseline_{name}" == experiment.get("name")
|
||||
or f"advanced_{name}" == experiment.get("name")
|
||||
experiment.get("model_type") == name
|
||||
or name.lower() in experiment.get("name", "").lower()
|
||||
or experiment.get("name") == name
|
||||
or f"baseline_{name}" == experiment.get("name")
|
||||
or f"advanced_{name}" == experiment.get("name")
|
||||
):
|
||||
return experiment
|
||||
|
||||
@@ -72,7 +72,9 @@ class ExperimentBuilder:
|
||||
f"Available experiments: {available_experiments}"
|
||||
)
|
||||
|
||||
def get_templates(self, templates_path: str = "research_templates.yaml") -> Dict[str, List[Dict]]:
|
||||
def get_templates(
|
||||
self, templates_path: str = "research_templates.yaml"
|
||||
) -> Dict[str, List[Dict]]:
|
||||
"""Get all available experiments from templates organized by type"""
|
||||
templates = self.load_templates(templates_path)
|
||||
|
||||
@@ -80,7 +82,7 @@ class ExperimentBuilder:
|
||||
"baseline": templates.get("baseline_experiments", []),
|
||||
"advanced": templates.get("advanced_experiments", []),
|
||||
"feature_study": templates.get("feature_studies", []),
|
||||
"tuning": templates.get("hyperparameter_tuning", [])
|
||||
"tuning": templates.get("hyperparameter_tuning", []),
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -104,5 +106,5 @@ class ExperimentBuilder:
|
||||
tags=template_config.get("tags", []),
|
||||
test_size=template_config.get("test_size", 0.2),
|
||||
cross_validation_folds=template_config.get("cross_validation_folds", 5),
|
||||
train_data_filter=template_config.get("train_data_filter")
|
||||
train_data_filter=template_config.get("train_data_filter"),
|
||||
)
|
||||
|
||||
@@ -158,12 +158,12 @@ class ExperimentRunner:
|
||||
|
||||
@classmethod
|
||||
def _create_prediction_examples(
|
||||
cls,
|
||||
X_test: pd.DataFrame,
|
||||
y_test: pd.Series,
|
||||
predictions: np.ndarray,
|
||||
model: BaseModel,
|
||||
n_examples: int = 10,
|
||||
cls,
|
||||
X_test: pd.DataFrame,
|
||||
y_test: pd.Series,
|
||||
predictions: np.ndarray,
|
||||
model: BaseModel,
|
||||
n_examples: int = 10,
|
||||
) -> List[Dict]:
|
||||
"""Create prediction examples for analysis"""
|
||||
examples = []
|
||||
@@ -237,7 +237,7 @@ class ExperimentRunner:
|
||||
return None
|
||||
|
||||
def compare_experiments(
|
||||
self, experiment_ids: List[str], metric: str = "accuracy"
|
||||
self, experiment_ids: List[str], metric: str = "accuracy"
|
||||
) -> pd.DataFrame:
|
||||
"""Compare experiments and return analysis"""
|
||||
comparison_df = self.tracker.compare_experiments(experiment_ids)
|
||||
|
||||
@@ -77,10 +77,10 @@ class ExperimentTracker:
|
||||
return self._results.get(experiment_id)
|
||||
|
||||
def list_experiments(
|
||||
self,
|
||||
status: Optional[ExperimentStatus] = None,
|
||||
tags: Optional[List[str]] = None,
|
||||
model_type: Optional[str] = None,
|
||||
self,
|
||||
status: Optional[ExperimentStatus] = None,
|
||||
tags: Optional[List[str]] = None,
|
||||
model_type: Optional[str] = None,
|
||||
) -> List[ExperimentResult]:
|
||||
"""List experiments with optional filtering"""
|
||||
results = list(self._results.values())
|
||||
@@ -97,7 +97,7 @@ class ExperimentTracker:
|
||||
return sorted(results, key=lambda x: x.start_time, reverse=True)
|
||||
|
||||
def get_best_experiment(
|
||||
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
|
||||
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
|
||||
) -> Optional[ExperimentResult]:
|
||||
"""Get the best experiment based on a metric"""
|
||||
experiments = self.list_experiments()
|
||||
@@ -159,8 +159,8 @@ class ExperimentTracker:
|
||||
"""Export all results to CSV"""
|
||||
if output_path is None:
|
||||
output_path = (
|
||||
self.experiments_dir
|
||||
/ f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
|
||||
self.experiments_dir
|
||||
/ f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
|
||||
)
|
||||
|
||||
rows = []
|
||||
|
||||
@@ -43,7 +43,7 @@ class FeatureExtractor:
|
||||
return features_df
|
||||
|
||||
def _extract_single_feature(
|
||||
self, df: pd.DataFrame, feature_type: FeatureType
|
||||
self, df: pd.DataFrame, feature_type: FeatureType
|
||||
) -> Union[pd.Series, pd.DataFrame]:
|
||||
"""Extract a single type of feature"""
|
||||
if feature_type == FeatureType.FULL_NAME:
|
||||
|
||||
@@ -27,13 +27,13 @@ class ModelTrainer:
|
||||
self.models_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def train_single_model(
|
||||
self,
|
||||
model_name: str,
|
||||
model_type: str = "logistic_regression",
|
||||
features: List[str] = None,
|
||||
model_params: Dict[str, Any] = None,
|
||||
tags: List[str] = None,
|
||||
save_artifacts: bool = True,
|
||||
self,
|
||||
model_name: str,
|
||||
model_type: str = "logistic_regression",
|
||||
features: List[str] = None,
|
||||
model_params: Dict[str, Any] = None,
|
||||
tags: List[str] = None,
|
||||
save_artifacts: bool = True,
|
||||
) -> str:
|
||||
"""
|
||||
Train a single model and save its artifacts.
|
||||
@@ -75,7 +75,7 @@ class ModelTrainer:
|
||||
return experiment_id
|
||||
|
||||
def train_multiple_models(
|
||||
self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
|
||||
self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
|
||||
) -> List[str]:
|
||||
"""
|
||||
Train multiple models with different configurations.
|
||||
|
||||
@@ -82,7 +82,9 @@ class EnsembleModel(TraditionalModel):
|
||||
# Soft voting averages probabilities (preferred when members are calibrated);
|
||||
# hard voting uses majority class. Parallelize member predictions.
|
||||
voting_type = params.get("voting", "soft") # 'hard' or 'soft'
|
||||
return VotingClassifier(estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1))
|
||||
return VotingClassifier(
|
||||
estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1)
|
||||
)
|
||||
|
||||
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
||||
text_features = []
|
||||
|
||||
@@ -55,7 +55,9 @@ class RandomForestModel(TraditionalModel):
|
||||
encoder = self.label_encoders[feature_key]
|
||||
column_clean = column.fillna("unknown").astype(str)
|
||||
known_classes = set(encoder.classes_)
|
||||
default_class = "unknown" if "unknown" in known_classes else encoder.classes_[0]
|
||||
default_class = (
|
||||
"unknown" if "unknown" in known_classes else encoder.classes_[0]
|
||||
)
|
||||
column_mapped = column_clean.apply(
|
||||
lambda value: value if value in known_classes else default_class
|
||||
)
|
||||
|
||||
@@ -36,9 +36,9 @@ class TransformerModel(NeuralNetworkModel):
|
||||
|
||||
# Add positional encoding
|
||||
positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
|
||||
pos_embedding = Embedding(input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64))(
|
||||
positions
|
||||
)
|
||||
pos_embedding = Embedding(
|
||||
input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64)
|
||||
)(positions)
|
||||
x = x + pos_embedding
|
||||
|
||||
x = self._transformer_encoder(x, params)
|
||||
|
||||
@@ -84,7 +84,9 @@ class NeuralNetworkModel(BaseModel):
|
||||
def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
|
||||
"""Combine configured textual features into one string per record."""
|
||||
|
||||
column_names = [feature.value for feature in self.config.features if feature.value in X.columns]
|
||||
column_names = [
|
||||
feature.value for feature in self.config.features if feature.value in X.columns
|
||||
]
|
||||
if not column_names:
|
||||
raise ValueError("No configured text features found in the provided DataFrame.")
|
||||
|
||||
@@ -101,7 +103,7 @@ class NeuralNetworkModel(BaseModel):
|
||||
return combined_rows
|
||||
|
||||
def cross_validate(
|
||||
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
|
||||
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
|
||||
) -> dict[str, np.floating[Any]]:
|
||||
features_df = self.feature_extractor.extract_features(X)
|
||||
X_prepared = self.prepare_features(features_df)
|
||||
@@ -158,7 +160,7 @@ class NeuralNetworkModel(BaseModel):
|
||||
}
|
||||
|
||||
def generate_learning_curve(
|
||||
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
|
||||
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate learning curve data for the model"""
|
||||
logging.info(f"Generating learning curve for {self.__class__.__name__}")
|
||||
|
||||
@@ -103,7 +103,7 @@ class TraditionalModel(BaseModel):
|
||||
return results
|
||||
|
||||
def generate_learning_curve(
|
||||
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
|
||||
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate learning curve data for the model"""
|
||||
logging.info(f"Generating learning curve for {self.__class__.__name__}")
|
||||
|
||||
+3
-1
@@ -50,7 +50,9 @@ class StreamlitApp:
|
||||
@classmethod
|
||||
def run(cls):
|
||||
st.title("🇨🇩 DRC NERS Platform")
|
||||
st.markdown("A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference")
|
||||
st.markdown(
|
||||
"A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
|
||||
)
|
||||
st.markdown(
|
||||
"""
|
||||
## Overview
|
||||
|
||||
@@ -1,2 +1 @@
|
||||
from .ner_testing import NERTesting
|
||||
|
||||
|
||||
@@ -13,10 +13,10 @@ from research.model_registry import list_available_models
|
||||
|
||||
class Experiments:
|
||||
def __init__(
|
||||
self,
|
||||
config: PipelineConfig,
|
||||
experiment_tracker: ExperimentTracker,
|
||||
experiment_runner: ExperimentRunner
|
||||
self,
|
||||
config: PipelineConfig,
|
||||
experiment_tracker: ExperimentTracker,
|
||||
experiment_runner: ExperimentRunner,
|
||||
):
|
||||
self.config = config
|
||||
self.experiment_tracker = experiment_tracker
|
||||
@@ -26,8 +26,7 @@ class Experiments:
|
||||
def index(self):
|
||||
st.title("Experiments")
|
||||
|
||||
tab1, tab2, tab3 = st.tabs(
|
||||
["Templates", "Experiments", "Batch Experiments"])
|
||||
tab1, tab2, tab3 = st.tabs(["Templates", "Experiments", "Batch Experiments"])
|
||||
|
||||
with tab1:
|
||||
self.show_template_experiments()
|
||||
@@ -56,14 +55,18 @@ class Experiments:
|
||||
self._show_experiments_by_type(available_experiments["advanced"], "advanced")
|
||||
|
||||
with exp_tabs[2]:
|
||||
self._show_experiments_by_type(available_experiments["feature_study"], "feature_study")
|
||||
self._show_experiments_by_type(
|
||||
available_experiments["feature_study"], "feature_study"
|
||||
)
|
||||
|
||||
with exp_tabs[3]:
|
||||
self._show_experiments_by_type(available_experiments["tuning"], "tuning")
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"Error loading experiment templates: {e}")
|
||||
st.info("Make sure the research templates file exists at `config/research_templates.yaml`")
|
||||
st.info(
|
||||
"Make sure the research templates file exists at `config/research_templates.yaml`"
|
||||
)
|
||||
|
||||
def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str):
|
||||
"""Show experiments for a specific type"""
|
||||
@@ -142,7 +145,7 @@ class Experiments:
|
||||
# Display experiments
|
||||
for i, exp in enumerate(experiments):
|
||||
with st.expander(
|
||||
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
|
||||
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
|
||||
):
|
||||
self._display_experiment_details(exp, i)
|
||||
|
||||
@@ -213,7 +216,7 @@ class Experiments:
|
||||
experiment_types = st.multiselect(
|
||||
"Select Experiment Types",
|
||||
["baseline", "advanced", "feature_study", "tuning"],
|
||||
default=["baseline"]
|
||||
default=["baseline"],
|
||||
)
|
||||
|
||||
if experiment_types:
|
||||
@@ -223,11 +226,11 @@ class Experiments:
|
||||
experiments = available_experiments.get(exp_type, [])
|
||||
if experiments:
|
||||
st.write(f"**{exp_type.title()} Experiments:**")
|
||||
exp_names = [exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)]
|
||||
exp_names = [
|
||||
exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)
|
||||
]
|
||||
selected_names = st.multiselect(
|
||||
f"Select {exp_type} experiments",
|
||||
exp_names,
|
||||
key=f"select_{exp_type}"
|
||||
f"Select {exp_type} experiments", exp_names, key=f"select_{exp_type}"
|
||||
)
|
||||
|
||||
for name in selected_names:
|
||||
@@ -308,13 +311,13 @@ class Experiments:
|
||||
)
|
||||
|
||||
def run_batch_experiments(
|
||||
self,
|
||||
base_name: str,
|
||||
model_types: List[str],
|
||||
ngram_ranges: str,
|
||||
feature_combinations: List[str],
|
||||
test_sizes: str,
|
||||
tags: str,
|
||||
self,
|
||||
base_name: str,
|
||||
model_types: List[str],
|
||||
ngram_ranges: str,
|
||||
feature_combinations: List[str],
|
||||
test_sizes: str,
|
||||
tags: str,
|
||||
):
|
||||
"""Run batch experiments with parameter combinations"""
|
||||
with st.spinner("Running batch experiments..."):
|
||||
|
||||
@@ -38,7 +38,7 @@ class LogReader:
|
||||
|
||||
# Parse log entries from the end
|
||||
entries = []
|
||||
for line in reversed(lines[-count * 2:]): # Read more lines in case some don't match
|
||||
for line in reversed(lines[-count * 2 :]): # Read more lines in case some don't match
|
||||
entry = self._parse_log_line(line.strip())
|
||||
if entry:
|
||||
entries.append(entry)
|
||||
|
||||
@@ -33,7 +33,9 @@ class NERTesting:
|
||||
|
||||
# Load model
|
||||
if not self.load_ner_model():
|
||||
st.warning("NER model could not be loaded. Please ensure the model is trained and available.")
|
||||
st.warning(
|
||||
"NER model could not be loaded. Please ensure the model is trained and available."
|
||||
)
|
||||
return
|
||||
|
||||
# Display model information
|
||||
@@ -53,9 +55,11 @@ class NERTesting:
|
||||
col1, col2, col3, col4 = st.columns(4)
|
||||
|
||||
with col1:
|
||||
st.metric("Training Examples", f"{self.training_stats.get('training_examples', 0):,}")
|
||||
st.metric(
|
||||
"Training Examples", f"{self.training_stats.get('training_examples', 0):,}"
|
||||
)
|
||||
with col2:
|
||||
st.metric("Epochs", self.training_stats.get('epochs', 0))
|
||||
st.metric("Epochs", self.training_stats.get("epochs", 0))
|
||||
with col3:
|
||||
st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}")
|
||||
with col4:
|
||||
@@ -64,7 +68,7 @@ class NERTesting:
|
||||
def show_model_evaluation_info(self):
|
||||
if self.evaluation_stats:
|
||||
col1, col2, col3 = st.columns(4)
|
||||
overall = self.evaluation_stats.get('overall', {})
|
||||
overall = self.evaluation_stats.get("overall", {})
|
||||
|
||||
with col1:
|
||||
st.metric("Overall Precision", f"{overall['precision']:.2f}")
|
||||
@@ -79,7 +83,7 @@ class NERTesting:
|
||||
name_input = st.text_input(
|
||||
"Name:",
|
||||
placeholder="e.g., Jean Baptiste Mukendi, Marie Kabamba Tshiala, Joseph Kasongo",
|
||||
help="Enter a full name or multiple names separated by spaces"
|
||||
help="Enter a full name or multiple names separated by spaces",
|
||||
)
|
||||
if name_input.strip():
|
||||
if st.button("Analyze Name", type="primary"):
|
||||
@@ -90,12 +94,12 @@ class NERTesting:
|
||||
"Names:",
|
||||
placeholder="Jean Baptiste Mukendi\nMarie Kabamba Tshiala\nJoseph Kasongo\nGrace Mbuyi Kalala",
|
||||
height=150,
|
||||
help="Enter each name on a new line"
|
||||
help="Enter each name on a new line",
|
||||
)
|
||||
|
||||
if names_input.strip():
|
||||
if st.button("Analyze All Names", type="primary"):
|
||||
names = [name.strip() for name in names_input.split('\n') if name.strip()]
|
||||
names = [name.strip() for name in names_input.split("\n") if name.strip()]
|
||||
for i, name in enumerate(names):
|
||||
st.markdown(f"**Name {i+1}: {name}**")
|
||||
self.analyze_and_display(name)
|
||||
@@ -106,12 +110,12 @@ class NERTesting:
|
||||
try:
|
||||
result = self.ner_model.predict(text)
|
||||
st.subheader("Analysis Results")
|
||||
entities = result.get('entities', [])
|
||||
entities = result.get("entities", [])
|
||||
|
||||
if entities:
|
||||
self.show_visual_entities(text, entities)
|
||||
native_count = sum(1 for e in entities if e['label'] == 'NATIVE')
|
||||
surname_count = sum(1 for e in entities if e['label'] == 'SURNAME')
|
||||
native_count = sum(1 for e in entities if e["label"] == "NATIVE")
|
||||
surname_count = sum(1 for e in entities if e["label"] == "SURNAME")
|
||||
|
||||
col1, col2, col3 = st.columns(3)
|
||||
with col1:
|
||||
@@ -134,29 +138,17 @@ class NERTesting:
|
||||
# Convert our entities format to spaCy format for displacy
|
||||
ents = []
|
||||
for entity in entities:
|
||||
ents.append({
|
||||
"start": entity['start'],
|
||||
"end": entity['end'],
|
||||
"label": entity['label']
|
||||
})
|
||||
ents.append(
|
||||
{"start": entity["start"], "end": entity["end"], "label": entity["label"]}
|
||||
)
|
||||
|
||||
# Create doc-like structure for displacy
|
||||
doc_data = {
|
||||
"text": text,
|
||||
"ents": ents,
|
||||
"title": None
|
||||
}
|
||||
doc_data = {"text": text, "ents": ents, "title": None}
|
||||
|
||||
# Custom colors for our labels
|
||||
colors = {
|
||||
"NATIVE": "#74C0FC", # Light blue
|
||||
"SURNAME": "#69DB7C" # Light green
|
||||
}
|
||||
colors = {"NATIVE": "#74C0FC", "SURNAME": "#69DB7C"} # Light blue # Light green
|
||||
|
||||
options = {
|
||||
"colors": colors,
|
||||
"distance": 90
|
||||
}
|
||||
options = {"colors": colors, "distance": 90}
|
||||
|
||||
# Generate HTML visualization
|
||||
html = displacy.render(doc_data, style="ent", manual=True, options=options)
|
||||
|
||||
@@ -13,7 +13,7 @@ from research.experiment.experiment_tracker import ExperimentTracker
|
||||
|
||||
class Predictions:
|
||||
def __init__(
|
||||
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
||||
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
||||
):
|
||||
self.config = config
|
||||
self.experiment_tracker = experiment_tracker
|
||||
@@ -111,7 +111,7 @@ class Predictions:
|
||||
return None
|
||||
|
||||
def _display_single_prediction_results(
|
||||
self, prediction: str, confidence: Optional[float], experiment, name_input: str
|
||||
self, prediction: str, confidence: Optional[float], experiment, name_input: str
|
||||
):
|
||||
"""Display single prediction results"""
|
||||
col1, col2 = st.columns(2)
|
||||
@@ -288,7 +288,7 @@ class Predictions:
|
||||
return pd.DataFrame()
|
||||
|
||||
def _run_dataset_prediction(
|
||||
self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
|
||||
self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
|
||||
):
|
||||
"""Run dataset prediction and display results"""
|
||||
with st.spinner("Running predictions..."):
|
||||
|
||||
@@ -12,7 +12,7 @@ from research.experiment.experiment_tracker import ExperimentTracker
|
||||
|
||||
class ResultsAnalysis:
|
||||
def __init__(
|
||||
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
||||
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
||||
):
|
||||
self.config = config
|
||||
self.experiment_tracker = experiment_tracker
|
||||
|
||||
Reference in New Issue
Block a user