feat: add osm data

This commit is contained in:
2025-09-21 16:23:44 +02:00
parent 63e23d6600
commit c1b502c878
39 changed files with 955 additions and 338 deletions
+35 -32
View File
@@ -62,7 +62,7 @@ stages:
**Running the Pipeline**
```bash
python main.py --env development
python main.py --env production
```
## NER Processing (Optional)
@@ -72,7 +72,7 @@ Its main objective is to accurately identify and tag the different components of
specifically distinguishing between the native part and the surname.
```bash
python ner.py --env development
python ner.py --env production
```
Once you've built and trained the NER model you can use it to annotate **COMPOSE** name in the original dataset
@@ -105,54 +105,54 @@ you can define model features, training parameters, and evaluation metrics in th
```bash
# bigru
python train.py --name="bigru" --type="baseline" --env="development"
python train.py --name="bigru_native" --type="baseline" --env="development"
python train.py --name="bigru_surname" --type="baseline" --env="development"
python train.py --name="bigru" --type="baseline" --env="production"
python train.py --name="bigru_native" --type="baseline" --env="production"
python train.py --name="bigru_surname" --type="baseline" --env="production"
# cnn
python train.py --name="cnn" --type="baseline" --env="development"
python train.py --name="cnn_native" --type="baseline" --env="development"
python train.py --name="cnn_surname" --type="baseline" --env="development"
python train.py --name="cnn" --type="baseline" --env="production"
python train.py --name="cnn_native" --type="baseline" --env="production"
python train.py --name="cnn_surname" --type="baseline" --env="production"
# lightgbm
python train.py --name="lightgbm" --type="baseline" --env="development"
python train.py --name="lightgbm_native" --type="baseline" --env="development"
python train.py --name="lightgbm_surname" --type="baseline" --env="development"
python train.py --name="lightgbm" --type="baseline" --env="production"
python train.py --name="lightgbm_native" --type="baseline" --env="production"
python train.py --name="lightgbm_surname" --type="baseline" --env="production"
# logistic regression
python train.py --name="logistic_regression" --type="baseline" --env="development"
python train.py --name="logistic_regression_native" --type="baseline" --env="development"
python train.py --name="logistic_regression_surname" --type="baseline" --env="development"
python train.py --name="logistic_regression" --type="baseline" --env="production"
python train.py --name="logistic_regression_native" --type="baseline" --env="production"
python train.py --name="logistic_regression_surname" --type="baseline" --env="production"
# lstm
python train.py --name="lstm" --type="baseline" --env="development"
python train.py --name="lstm_native" --type="baseline" --env="development"
python train.py --name="lstm_surname" --type="baseline" --env="development"
python train.py --name="lstm" --type="baseline" --env="production"
python train.py --name="lstm_native" --type="baseline" --env="production"
python train.py --name="lstm_surname" --type="baseline" --env="production"
# random forest
python train.py --name="random_forest" --type="baseline" --env="development"
python train.py --name="random_forest_native" --type="baseline" --env="development"
python train.py --name="random_forest_surname" --type="baseline" --env="development"
python train.py --name="random_forest" --type="baseline" --env="production"
python train.py --name="random_forest_native" --type="baseline" --env="production"
python train.py --name="random_forest_surname" --type="baseline" --env="production"
# svm
python train.py --name="svm" --type="baseline" --env="development"
python train.py --name="svm_native" --type="baseline" --env="development"
python train.py --name="svm_surname" --type="baseline" --env="development"
python train.py --name="svm" --type="baseline" --env="production"
python train.py --name="svm_native" --type="baseline" --env="production"
python train.py --name="svm_surname" --type="baseline" --env="production"
# naive bayes
python train.py --name="naive_bayes" --type="baseline" --env="development"
python train.py --name="naive_bayes_native" --type="baseline" --env="development"
python train.py --name="naive_bayes_surname" --type="baseline" --env="development"
python train.py --name="naive_bayes" --type="baseline" --env="production"
python train.py --name="naive_bayes_native" --type="baseline" --env="production"
python train.py --name="naive_bayes_surname" --type="baseline" --env="production"
# transformer
python train.py --name="transformer" --type="baseline" --env="development"
python train.py --name="transformer_native" --type="baseline" --env="development"
python train.py --name="transformer_surname" --type="baseline" --env="development"
python train.py --name="transformer" --type="baseline" --env="production"
python train.py --name="transformer_native" --type="baseline" --env="production"
python train.py --name="transformer_surname" --type="baseline" --env="production"
# xgboost
python train.py --name="xgboost" --type="baseline" --env="development"
python train.py --name="xgboost_native" --type="baseline" --env="development"
python train.py --name="xgboost_surname" --type="baseline" --env="development"
python train.py --name="xgboost" --type="baseline" --env="production"
python train.py --name="xgboost_native" --type="baseline" --env="production"
python train.py --name="xgboost_surname" --type="baseline" --env="production"
```
## Web Interface
@@ -171,3 +171,6 @@ streamlit run web/app.py
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
<img src="https://contrib.rocks/image?repo=bernard-ng/drc-ners-nlp" alt="contributors"/>
</a>
## Acknowledgements
- Map Visualization: [https://data.humdata.org/dataset/anciennes-provinces-rdc-old-provinces-drc](https://data.humdata.org/dataset/anciennes-provinces-rdc-old-provinces-drc)
+1
View File
@@ -11,6 +11,7 @@ processing:
# Pipeline stages
stages:
- "data_cleaning"
- "data_selection"
- "feature_extraction"
#- "ner_annotation"
#- "llm_annotation"
+6 -5
View File
@@ -3,17 +3,18 @@ debug: false
# Processing settings
processing:
batch_size: 10_000
max_workers: 8
batch_size: 100_000
max_workers: 4
checkpoint_interval: 10
use_multiprocessing: true
# Pipeline stages
stages:
- "data_cleaning"
- "data_selection"
- "feature_extraction"
- "ner_annotation"
- "llm_annotation"
# - "ner_annotation"
# - "llm_annotation"
- "data_splitting"
# Production LLM settings
@@ -34,7 +35,7 @@ data:
# Production logging (less verbose)
logging:
level: "INFO"
console_logging: false
console_logging: true
file_logging: true
log_file: "pipeline.production.log"
max_log_size: 52428800 # 50MB
+6
View File
@@ -21,6 +21,7 @@ paths:
# List of stages in the processing pipeline
stages:
- "data_cleaning" # Data cleaning stage
- "data_selection" # Data selection stage - keep only required columns
- "feature_extraction" # Feature extraction stage
- "ner_annotation" # NER-based annotation stage
- "llm_annotation" # LLM annotation stage (computational intensive)
@@ -64,6 +65,11 @@ data:
females: "names_females.csv" # Output files for female names
ner_data: "names_ner.json" # Output file for NER annotated data
ner_spacy: "names_ner.spacy" # Output file for NER annotated data using spaCy format
selected_columns: # Required columns for processing
- name
- sex
- region
- year
split_evaluation: false # Should the dataset be split into training and evaluation sets ?
split_by_gender: true # Should the dataset be split by gender ?
split_by_province: true # Should the dataset be split by province ?
+1
View File
@@ -19,6 +19,7 @@ class DataConfig(BaseModel):
"ner_spacy": "names_ner.spacy",
}
)
selected_columns: list[str] = field(default=["name", "sex", "region"])
split_evaluation: bool = False
split_by_province: bool = True
split_by_gender: bool = True
+2 -4
View File
@@ -8,12 +8,10 @@ class RegionMapper:
def __init__(self, mapping: Optional[Dict] = None):
self.mapping = mapping or REGION_MAPPING
self.mapping = {k.lower(): v[1].upper() for k, v in self.mapping.items()}
def map(self, series: pd.Series) -> pd.Series:
"""Vectorized region to province mapping"""
return series.str.lower().map(
lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
)
return series.str.lower().map(self.mapping).fillna("AUTRES")
@staticmethod
def get_provinces():
+2 -3
View File
@@ -30,9 +30,8 @@ class TextCleaner:
def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
"""Clean all text columns in a DataFrame"""
df = df.copy()
text_columns = df.select_dtypes(include="object").columns
for col in text_columns:
columns = df.select_dtypes(include=["object", "string"]).columns
for col in columns:
df[col] = self.clean_text_series(df[col])
return df
+3 -2
View File
@@ -9,9 +9,9 @@ from core.utils.data_loader import DataLoader
from processing.batch.batch_config import BatchConfig
from processing.pipeline import Pipeline
from processing.steps.data_cleaning_step import DataCleaningStep
from processing.steps.data_selection_step import DataSelectionStep
from processing.steps.data_splitting_step import DataSplittingStep
from processing.steps.feature_extraction_step import FeatureExtractionStep
from processing.steps.llm_annotation_step import LLMAnnotationStep
def create_pipeline(config) -> Pipeline:
@@ -28,8 +28,9 @@ def create_pipeline(config) -> Pipeline:
steps = [
DataCleaningStep(config),
FeatureExtractionStep(config),
DataSelectionStep(config),
# NERAnnotationStep(config),
LLMAnnotationStep(config),
# LLMAnnotationStep(config),
]
for stage in config.stages:
+1
View File
@@ -11,6 +11,7 @@ from processing.monitoring.pipeline_monitor import PipelineMonitor
def main():
choices = [
"data_cleaning",
"data_selection",
"feature_extraction",
"ner_annotation",
"llm_annotation",
+99 -93
View File
File diff suppressed because one or more lines are too long
+431 -99
View File
File diff suppressed because one or more lines are too long
+107
View File
@@ -0,0 +1,107 @@
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Qualitative Analysis",
"id": "d20715dd63f57364"
},
{
"cell_type": "code",
"id": "c93a55c8",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T13:34:50.973298Z",
"start_time": "2025-09-21T13:34:50.969142Z"
}
},
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import sys\n",
"import os\n",
"\n",
"sys.path.append(os.path.abspath(\"..\"))\n",
"from core.utils.data_loader import DataLoader\n",
"from core.config.pipeline_config import PipelineConfig"
],
"outputs": [],
"execution_count": 3
},
{
"cell_type": "code",
"id": "c0b00261",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T13:34:51.002610Z",
"start_time": "2025-09-21T13:34:50.998586Z"
}
},
"source": [
"config = PipelineConfig(\n",
" paths={\n",
" \"root_dir\": \"../data\",\n",
" \"data_dir\": \"../data/dataset\",\n",
" \"models_dir\": \"../models\",\n",
" \"outputs_dir\": \"../data/processed\",\n",
" \"logs_dir\": \"../logs\",\n",
" \"configs_dir\": \"../configs\",\n",
" \"checkpoints_dir\": \"../checkpoints\"\n",
" }\n",
")\n",
"\n",
"loader = DataLoader(config)"
],
"outputs": [],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T13:35:27.430639Z",
"start_time": "2025-09-21T13:34:51.013412Z"
}
},
"cell_type": "code",
"outputs": [],
"execution_count": 5,
"source": [
"gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
"gdf_proj = gdf.to_crs(epsg=32732)\n",
"gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
"\n",
"df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
],
"id": "b38394ce38864379"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Exploration",
"id": "a1af5626d2a948d6"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+107
View File
@@ -0,0 +1,107 @@
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Quantitative Analysis",
"id": "a605c0f92056a825"
},
{
"cell_type": "code",
"id": "c93a55c8",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T14:14:47.287549Z",
"start_time": "2025-09-21T14:14:47.279199Z"
}
},
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import sys\n",
"import os\n",
"\n",
"sys.path.append(os.path.abspath(\"..\"))\n",
"from core.utils.data_loader import DataLoader\n",
"from core.config.pipeline_config import PipelineConfig"
],
"outputs": [],
"execution_count": 30
},
{
"cell_type": "code",
"id": "c0b00261",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T14:14:47.315980Z",
"start_time": "2025-09-21T14:14:47.308376Z"
}
},
"source": [
"config = PipelineConfig(\n",
" paths={\n",
" \"root_dir\": \"../data\",\n",
" \"data_dir\": \"../data/dataset\",\n",
" \"models_dir\": \"../models\",\n",
" \"outputs_dir\": \"../data/processed\",\n",
" \"logs_dir\": \"../logs\",\n",
" \"configs_dir\": \"../configs\",\n",
" \"checkpoints_dir\": \"../checkpoints\"\n",
" }\n",
")\n",
"\n",
"loader = DataLoader(config)"
],
"outputs": [],
"execution_count": 31
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-21T14:15:47.899044Z",
"start_time": "2025-09-21T14:14:47.339266Z"
}
},
"cell_type": "code",
"source": [
"gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
"gdf_proj = gdf.to_crs(epsg=32732)\n",
"gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
"\n",
"df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
],
"id": "b38394ce38864379",
"outputs": [],
"execution_count": 32
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Exploration",
"id": "a1af5626d2a948d6"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+1
View File
@@ -0,0 +1 @@
UTF-8
BIN
View File
Binary file not shown.
+1
View File
@@ -0,0 +1 @@
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
+3 -1
View File
@@ -211,7 +211,9 @@ class NameModel:
for batch in batches:
batch_losses = {}
self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
logging.info(f"Training batch with {len(batch)} examples, current losses: {batch_losses}")
logging.info(
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
)
# Accumulate into total losses dict
for k, v in batch_losses.items():
+4 -1
View File
@@ -49,6 +49,9 @@ class Pipeline:
"processed_batches": step.state.processed_batches,
"total_batches": step.state.total_batches,
"failed_batches": len(step.state.failed_batches),
"completion_percentage": (step.state.processed_batches / max(1, step.state.total_batches)) * 100,
"completion_percentage": (
step.state.processed_batches / max(1, step.state.total_batches)
)
* 100,
}
return progress
+43
View File
@@ -0,0 +1,43 @@
import logging
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from processing.steps import PipelineStep
class DataSelectionStep(PipelineStep):
    """Pipeline step that trims each batch down to the configured column set.

    The columns to retain come from ``pipeline_config.data.selected_columns``;
    anything else in the incoming batch is dropped.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("data_selection", pipeline_config)
        # Columns the pipeline configuration asks this step to keep.
        self.selected_columns = pipeline_config.data.selected_columns

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Return a copy of *batch* restricted to the configured columns.

        Columns configured but absent from the batch are logged as warnings;
        if none of the configured columns exist, an empty DataFrame is
        returned so downstream steps still receive a well-formed batch.
        """
        logging.info(f"Selecting columns for batch {batch_id} with {len(batch)} rows")

        # Partition the configured columns into present vs. absent,
        # preserving the configured order.
        present, absent = [], []
        for column in self.selected_columns:
            (present if column in batch.columns else absent).append(column)

        if absent:
            logging.warning(f"Missing columns in batch {batch_id}: {absent}")

        if not present:
            logging.error(f"No required columns found in batch {batch_id}")
            return pd.DataFrame()  # nothing to keep — hand back an empty frame

        subset = batch[present].copy()
        logging.info(
            f"Selected {len(present)} columns for batch {batch_id}: {present}"
        )
        return subset

    @property
    def requires_batch_mutation(self) -> bool:
        """This step modifies the batch data by selecting columns"""
        return True
+2 -2
View File
@@ -41,14 +41,14 @@ class BaseModel(ABC):
@abstractmethod
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> Dict[str, float] | dict[str, np.floating[Any]]:
"""Perform cross-validation and return average scores"""
pass
@abstractmethod
def generate_learning_curve(
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
pass
+10 -8
View File
@@ -55,11 +55,11 @@ class ExperimentBuilder:
# Check if this is the experiment we're looking for
# Look for experiments that match the model type or contain the name
if (
experiment.get("model_type") == name
or name.lower() in experiment.get("name", "").lower()
or experiment.get("name") == name
or f"baseline_{name}" == experiment.get("name")
or f"advanced_{name}" == experiment.get("name")
experiment.get("model_type") == name
or name.lower() in experiment.get("name", "").lower()
or experiment.get("name") == name
or f"baseline_{name}" == experiment.get("name")
or f"advanced_{name}" == experiment.get("name")
):
return experiment
@@ -72,7 +72,9 @@ class ExperimentBuilder:
f"Available experiments: {available_experiments}"
)
def get_templates(self, templates_path: str = "research_templates.yaml") -> Dict[str, List[Dict]]:
def get_templates(
self, templates_path: str = "research_templates.yaml"
) -> Dict[str, List[Dict]]:
"""Get all available experiments from templates organized by type"""
templates = self.load_templates(templates_path)
@@ -80,7 +82,7 @@ class ExperimentBuilder:
"baseline": templates.get("baseline_experiments", []),
"advanced": templates.get("advanced_experiments", []),
"feature_study": templates.get("feature_studies", []),
"tuning": templates.get("hyperparameter_tuning", [])
"tuning": templates.get("hyperparameter_tuning", []),
}
@classmethod
@@ -104,5 +106,5 @@ class ExperimentBuilder:
tags=template_config.get("tags", []),
test_size=template_config.get("test_size", 0.2),
cross_validation_folds=template_config.get("cross_validation_folds", 5),
train_data_filter=template_config.get("train_data_filter")
train_data_filter=template_config.get("train_data_filter"),
)
+7 -7
View File
@@ -158,12 +158,12 @@ class ExperimentRunner:
@classmethod
def _create_prediction_examples(
cls,
X_test: pd.DataFrame,
y_test: pd.Series,
predictions: np.ndarray,
model: BaseModel,
n_examples: int = 10,
cls,
X_test: pd.DataFrame,
y_test: pd.Series,
predictions: np.ndarray,
model: BaseModel,
n_examples: int = 10,
) -> List[Dict]:
"""Create prediction examples for analysis"""
examples = []
@@ -237,7 +237,7 @@ class ExperimentRunner:
return None
def compare_experiments(
self, experiment_ids: List[str], metric: str = "accuracy"
self, experiment_ids: List[str], metric: str = "accuracy"
) -> pd.DataFrame:
"""Compare experiments and return analysis"""
comparison_df = self.tracker.compare_experiments(experiment_ids)
+7 -7
View File
@@ -77,10 +77,10 @@ class ExperimentTracker:
return self._results.get(experiment_id)
def list_experiments(
self,
status: Optional[ExperimentStatus] = None,
tags: Optional[List[str]] = None,
model_type: Optional[str] = None,
self,
status: Optional[ExperimentStatus] = None,
tags: Optional[List[str]] = None,
model_type: Optional[str] = None,
) -> List[ExperimentResult]:
"""List experiments with optional filtering"""
results = list(self._results.values())
@@ -97,7 +97,7 @@ class ExperimentTracker:
return sorted(results, key=lambda x: x.start_time, reverse=True)
def get_best_experiment(
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
) -> Optional[ExperimentResult]:
"""Get the best experiment based on a metric"""
experiments = self.list_experiments()
@@ -159,8 +159,8 @@ class ExperimentTracker:
"""Export all results to CSV"""
if output_path is None:
output_path = (
self.experiments_dir
/ f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
self.experiments_dir
/ f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
)
rows = []
+1 -1
View File
@@ -43,7 +43,7 @@ class FeatureExtractor:
return features_df
def _extract_single_feature(
self, df: pd.DataFrame, feature_type: FeatureType
self, df: pd.DataFrame, feature_type: FeatureType
) -> Union[pd.Series, pd.DataFrame]:
"""Extract a single type of feature"""
if feature_type == FeatureType.FULL_NAME:
+8 -8
View File
@@ -27,13 +27,13 @@ class ModelTrainer:
self.models_dir.mkdir(parents=True, exist_ok=True)
def train_single_model(
self,
model_name: str,
model_type: str = "logistic_regression",
features: List[str] = None,
model_params: Dict[str, Any] = None,
tags: List[str] = None,
save_artifacts: bool = True,
self,
model_name: str,
model_type: str = "logistic_regression",
features: List[str] = None,
model_params: Dict[str, Any] = None,
tags: List[str] = None,
save_artifacts: bool = True,
) -> str:
"""
Train a single model and save its artifacts.
@@ -75,7 +75,7 @@ class ModelTrainer:
return experiment_id
def train_multiple_models(
self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
) -> List[str]:
"""
Train multiple models with different configurations.
+3 -1
View File
@@ -82,7 +82,9 @@ class EnsembleModel(TraditionalModel):
# Soft voting averages probabilities (preferred when members are calibrated);
# hard voting uses majority class. Parallelize member predictions.
voting_type = params.get("voting", "soft") # 'hard' or 'soft'
return VotingClassifier(estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1))
return VotingClassifier(
estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1)
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
+3 -1
View File
@@ -55,7 +55,9 @@ class RandomForestModel(TraditionalModel):
encoder = self.label_encoders[feature_key]
column_clean = column.fillna("unknown").astype(str)
known_classes = set(encoder.classes_)
default_class = "unknown" if "unknown" in known_classes else encoder.classes_[0]
default_class = (
"unknown" if "unknown" in known_classes else encoder.classes_[0]
)
column_mapped = column_clean.apply(
lambda value: value if value in known_classes else default_class
)
+3 -3
View File
@@ -36,9 +36,9 @@ class TransformerModel(NeuralNetworkModel):
# Add positional encoding
positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
pos_embedding = Embedding(input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64))(
positions
)
pos_embedding = Embedding(
input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64)
)(positions)
x = x + pos_embedding
x = self._transformer_encoder(x, params)
+5 -3
View File
@@ -84,7 +84,9 @@ class NeuralNetworkModel(BaseModel):
def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
"""Combine configured textual features into one string per record."""
column_names = [feature.value for feature in self.config.features if feature.value in X.columns]
column_names = [
feature.value for feature in self.config.features if feature.value in X.columns
]
if not column_names:
raise ValueError("No configured text features found in the provided DataFrame.")
@@ -101,7 +103,7 @@ class NeuralNetworkModel(BaseModel):
return combined_rows
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> dict[str, np.floating[Any]]:
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
@@ -158,7 +160,7 @@ class NeuralNetworkModel(BaseModel):
}
def generate_learning_curve(
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}")
+1 -1
View File
@@ -103,7 +103,7 @@ class TraditionalModel(BaseModel):
return results
def generate_learning_curve(
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}")
+3 -1
View File
@@ -50,7 +50,9 @@ class StreamlitApp:
@classmethod
def run(cls):
st.title("🇨🇩 DRC NERS Platform")
st.markdown("A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference")
st.markdown(
"A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
)
st.markdown(
"""
## Overview
-1
View File
@@ -1,2 +1 @@
from .ner_testing import NERTesting
+24 -21
View File
@@ -13,10 +13,10 @@ from research.model_registry import list_available_models
class Experiments:
def __init__(
self,
config: PipelineConfig,
experiment_tracker: ExperimentTracker,
experiment_runner: ExperimentRunner
self,
config: PipelineConfig,
experiment_tracker: ExperimentTracker,
experiment_runner: ExperimentRunner,
):
self.config = config
self.experiment_tracker = experiment_tracker
@@ -26,8 +26,7 @@ class Experiments:
def index(self):
st.title("Experiments")
tab1, tab2, tab3 = st.tabs(
["Templates", "Experiments", "Batch Experiments"])
tab1, tab2, tab3 = st.tabs(["Templates", "Experiments", "Batch Experiments"])
with tab1:
self.show_template_experiments()
@@ -56,14 +55,18 @@ class Experiments:
self._show_experiments_by_type(available_experiments["advanced"], "advanced")
with exp_tabs[2]:
self._show_experiments_by_type(available_experiments["feature_study"], "feature_study")
self._show_experiments_by_type(
available_experiments["feature_study"], "feature_study"
)
with exp_tabs[3]:
self._show_experiments_by_type(available_experiments["tuning"], "tuning")
except Exception as e:
st.error(f"Error loading experiment templates: {e}")
st.info("Make sure the research templates file exists at `config/research_templates.yaml`")
st.info(
"Make sure the research templates file exists at `config/research_templates.yaml`"
)
def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str):
"""Show experiments for a specific type"""
@@ -142,7 +145,7 @@ class Experiments:
# Display experiments
for i, exp in enumerate(experiments):
with st.expander(
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
):
self._display_experiment_details(exp, i)
@@ -213,7 +216,7 @@ class Experiments:
experiment_types = st.multiselect(
"Select Experiment Types",
["baseline", "advanced", "feature_study", "tuning"],
default=["baseline"]
default=["baseline"],
)
if experiment_types:
@@ -223,11 +226,11 @@ class Experiments:
experiments = available_experiments.get(exp_type, [])
if experiments:
st.write(f"**{exp_type.title()} Experiments:**")
exp_names = [exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)]
exp_names = [
exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)
]
selected_names = st.multiselect(
f"Select {exp_type} experiments",
exp_names,
key=f"select_{exp_type}"
f"Select {exp_type} experiments", exp_names, key=f"select_{exp_type}"
)
for name in selected_names:
@@ -308,13 +311,13 @@ class Experiments:
)
def run_batch_experiments(
self,
base_name: str,
model_types: List[str],
ngram_ranges: str,
feature_combinations: List[str],
test_sizes: str,
tags: str,
self,
base_name: str,
model_types: List[str],
ngram_ranges: str,
feature_combinations: List[str],
test_sizes: str,
tags: str,
):
"""Run batch experiments with parameter combinations"""
with st.spinner("Running batch experiments..."):
+1 -1
View File
@@ -38,7 +38,7 @@ class LogReader:
# Parse log entries from the end
entries = []
for line in reversed(lines[-count * 2:]): # Read more lines in case some don't match
for line in reversed(lines[-count * 2 :]): # Read more lines in case some don't match
entry = self._parse_log_line(line.strip())
if entry:
entries.append(entry)
+20 -28
View File
@@ -33,7 +33,9 @@ class NERTesting:
# Load model
if not self.load_ner_model():
st.warning("NER model could not be loaded. Please ensure the model is trained and available.")
st.warning(
"NER model could not be loaded. Please ensure the model is trained and available."
)
return
# Display model information
@@ -53,9 +55,11 @@ class NERTesting:
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Training Examples", f"{self.training_stats.get('training_examples', 0):,}")
st.metric(
"Training Examples", f"{self.training_stats.get('training_examples', 0):,}"
)
with col2:
st.metric("Epochs", self.training_stats.get('epochs', 0))
st.metric("Epochs", self.training_stats.get("epochs", 0))
with col3:
st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}")
with col4:
@@ -64,7 +68,7 @@ class NERTesting:
def show_model_evaluation_info(self):
if self.evaluation_stats:
col1, col2, col3 = st.columns(4)
overall = self.evaluation_stats.get('overall', {})
overall = self.evaluation_stats.get("overall", {})
with col1:
st.metric("Overall Precision", f"{overall['precision']:.2f}")
@@ -79,7 +83,7 @@ class NERTesting:
name_input = st.text_input(
"Name:",
placeholder="e.g., Jean Baptiste Mukendi, Marie Kabamba Tshiala, Joseph Kasongo",
help="Enter a full name or multiple names separated by spaces"
help="Enter a full name or multiple names separated by spaces",
)
if name_input.strip():
if st.button("Analyze Name", type="primary"):
@@ -90,12 +94,12 @@ class NERTesting:
"Names:",
placeholder="Jean Baptiste Mukendi\nMarie Kabamba Tshiala\nJoseph Kasongo\nGrace Mbuyi Kalala",
height=150,
help="Enter each name on a new line"
help="Enter each name on a new line",
)
if names_input.strip():
if st.button("Analyze All Names", type="primary"):
names = [name.strip() for name in names_input.split('\n') if name.strip()]
names = [name.strip() for name in names_input.split("\n") if name.strip()]
for i, name in enumerate(names):
st.markdown(f"**Name {i+1}: {name}**")
self.analyze_and_display(name)
@@ -106,12 +110,12 @@ class NERTesting:
try:
result = self.ner_model.predict(text)
st.subheader("Analysis Results")
entities = result.get('entities', [])
entities = result.get("entities", [])
if entities:
self.show_visual_entities(text, entities)
native_count = sum(1 for e in entities if e['label'] == 'NATIVE')
surname_count = sum(1 for e in entities if e['label'] == 'SURNAME')
native_count = sum(1 for e in entities if e["label"] == "NATIVE")
surname_count = sum(1 for e in entities if e["label"] == "SURNAME")
col1, col2, col3 = st.columns(3)
with col1:
@@ -134,29 +138,17 @@ class NERTesting:
# Convert our entities format to spaCy format for displacy
ents = []
for entity in entities:
ents.append({
"start": entity['start'],
"end": entity['end'],
"label": entity['label']
})
ents.append(
{"start": entity["start"], "end": entity["end"], "label": entity["label"]}
)
# Create doc-like structure for displacy
doc_data = {
"text": text,
"ents": ents,
"title": None
}
doc_data = {"text": text, "ents": ents, "title": None}
# Custom colors for our labels
colors = {
"NATIVE": "#74C0FC", # Light blue
"SURNAME": "#69DB7C" # Light green
}
colors = {"NATIVE": "#74C0FC", "SURNAME": "#69DB7C"} # Light blue # Light green
options = {
"colors": colors,
"distance": 90
}
options = {"colors": colors, "distance": 90}
# Generate HTML visualization
html = displacy.render(doc_data, style="ent", manual=True, options=options)
+3 -3
View File
@@ -13,7 +13,7 @@ from research.experiment.experiment_tracker import ExperimentTracker
class Predictions:
def __init__(
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
):
self.config = config
self.experiment_tracker = experiment_tracker
@@ -111,7 +111,7 @@ class Predictions:
return None
def _display_single_prediction_results(
self, prediction: str, confidence: Optional[float], experiment, name_input: str
self, prediction: str, confidence: Optional[float], experiment, name_input: str
):
"""Display single prediction results"""
col1, col2 = st.columns(2)
@@ -288,7 +288,7 @@ class Predictions:
return pd.DataFrame()
def _run_dataset_prediction(
self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
):
"""Run dataset prediction and display results"""
with st.spinner("Running predictions..."):
+1 -1
View File
@@ -12,7 +12,7 @@ from research.experiment.experiment_tracker import ExperimentTracker
class ResultsAnalysis:
def __init__(
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
):
self.config = config
self.experiment_tracker = experiment_tracker