feat: add osm data

2025-09-21 16:23:44 +02:00
parent 63e23d6600
commit c1b502c878
39 changed files with 955 additions and 338 deletions
@@ -62,7 +62,7 @@ stages:
 **Running the Pipeline**

 ```bash
-python main.py --env development
+python main.py --env production
 ```

 ## NER Processing (Optional)
@@ -72,7 +72,7 @@ Its main objective is to accurately identify and tag the different components of
 specifically distinguishing between the native part and the surname.

 ```bash
-python ner.py --env development
+python ner.py --env production
 ```

 Once you've built and trained the NER model, you can use it to annotate **COMPOSE** names in the original dataset 
@@ -105,54 +105,54 @@ you can define model features, training parameters, and evaluation metrics in th

 ```bash
 # bigru
-python train.py --name="bigru" --type="baseline" --env="development"
-python train.py --name="bigru_native" --type="baseline" --env="development"
-python train.py --name="bigru_surname" --type="baseline" --env="development"
+python train.py --name="bigru" --type="baseline" --env="production"
+python train.py --name="bigru_native" --type="baseline" --env="production"
+python train.py --name="bigru_surname" --type="baseline" --env="production"

 # cnn
-python train.py --name="cnn" --type="baseline" --env="development"
-python train.py --name="cnn_native" --type="baseline" --env="development"
-python train.py --name="cnn_surname" --type="baseline" --env="development"
+python train.py --name="cnn" --type="baseline" --env="production"
+python train.py --name="cnn_native" --type="baseline" --env="production"
+python train.py --name="cnn_surname" --type="baseline" --env="production"

 # lightgbm
-python train.py --name="lightgbm" --type="baseline" --env="development"
-python train.py --name="lightgbm_native" --type="baseline" --env="development"
-python train.py --name="lightgbm_surname" --type="baseline" --env="development"
+python train.py --name="lightgbm" --type="baseline" --env="production"
+python train.py --name="lightgbm_native" --type="baseline" --env="production"
+python train.py --name="lightgbm_surname" --type="baseline" --env="production"

 # logistic regression
-python train.py --name="logistic_regression" --type="baseline" --env="development"
-python train.py --name="logistic_regression_native" --type="baseline" --env="development"
-python train.py --name="logistic_regression_surname" --type="baseline" --env="development"
+python train.py --name="logistic_regression" --type="baseline" --env="production"
+python train.py --name="logistic_regression_native" --type="baseline" --env="production"
+python train.py --name="logistic_regression_surname" --type="baseline" --env="production"

 # lstm
-python train.py --name="lstm" --type="baseline" --env="development"
-python train.py --name="lstm_native" --type="baseline" --env="development"
-python train.py --name="lstm_surname" --type="baseline" --env="development"
+python train.py --name="lstm" --type="baseline" --env="production"
+python train.py --name="lstm_native" --type="baseline" --env="production"
+python train.py --name="lstm_surname" --type="baseline" --env="production"

 # random forest
-python train.py --name="random_forest" --type="baseline" --env="development"
-python train.py --name="random_forest_native" --type="baseline" --env="development"
-python train.py --name="random_forest_surname" --type="baseline" --env="development"
+python train.py --name="random_forest" --type="baseline" --env="production"
+python train.py --name="random_forest_native" --type="baseline" --env="production"
+python train.py --name="random_forest_surname" --type="baseline" --env="production"

 # svm
-python train.py --name="svm" --type="baseline" --env="development"
-python train.py --name="svm_native" --type="baseline" --env="development"
-python train.py --name="svm_surname" --type="baseline" --env="development"
+python train.py --name="svm" --type="baseline" --env="production"
+python train.py --name="svm_native" --type="baseline" --env="production"
+python train.py --name="svm_surname" --type="baseline" --env="production"

 # naive bayes
-python train.py --name="naive_bayes" --type="baseline" --env="development"
-python train.py --name="naive_bayes_native" --type="baseline" --env="development"
-python train.py --name="naive_bayes_surname" --type="baseline" --env="development"
+python train.py --name="naive_bayes" --type="baseline" --env="production"
+python train.py --name="naive_bayes_native" --type="baseline" --env="production"
+python train.py --name="naive_bayes_surname" --type="baseline" --env="production"

 # transformer
-python train.py --name="transformer" --type="baseline" --env="development"
-python train.py --name="transformer_native" --type="baseline" --env="development"
-python train.py --name="transformer_surname" --type="baseline" --env="development"
+python train.py --name="transformer" --type="baseline" --env="production"
+python train.py --name="transformer_native" --type="baseline" --env="production"
+python train.py --name="transformer_surname" --type="baseline" --env="production"

 # xgboost
-python train.py --name="xgboost" --type="baseline" --env="development"
-python train.py --name="xgboost_native" --type="baseline" --env="development"
-python train.py --name="xgboost_surname" --type="baseline" --env="development"
+python train.py --name="xgboost" --type="baseline" --env="production"
+python train.py --name="xgboost_native" --type="baseline" --env="production"
+python train.py --name="xgboost_surname" --type="baseline" --env="production"
 ```

 ## Web Interface
@@ -171,3 +171,6 @@ streamlit run web/app.py
 <a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
  <img src="https://contrib.rocks/image?repo=bernard-ng/drc-ners-nlp" alt="contributors"/>
 </a>
+
+## Acknowledgements
+- Map Visualization: [https://data.humdata.org/dataset/anciennes-provinces-rdc-old-provinces-drc](https://data.humdata.org/dataset/anciennes-provinces-rdc-old-provinces-drc)
@@ -11,6 +11,7 @@ processing:
 # Pipeline stages
 stages:
  - "data_cleaning"
+  - "data_selection"
  - "feature_extraction"
  #- "ner_annotation"
  #- "llm_annotation"
@@ -3,17 +3,18 @@ debug: false

 # Processing settings
 processing:
-  batch_size: 10_000
-  max_workers: 8
+  batch_size: 100_000
+  max_workers: 4
  checkpoint_interval: 10
  use_multiprocessing: true

 # Pipeline stages
 stages:
  - "data_cleaning"
+  - "data_selection"
  - "feature_extraction"
-  - "ner_annotation"
-  - "llm_annotation"
+  # - "ner_annotation"
+  # - "llm_annotation"
  - "data_splitting"

 # Production LLM settings
@@ -34,7 +35,7 @@ data:
 # Production logging (less verbose)
 logging:
  level: "INFO"
-  console_logging: false
+  console_logging: true
  file_logging: true
  log_file: "pipeline.production.log"
  max_log_size: 52428800  # 50MB
@@ -21,6 +21,7 @@ paths:
 # List of stages in the processing pipeline
 stages:
  - "data_cleaning"                        # Data cleaning stage
+  - "data_selection"                       # Data selection stage - keep only required columns
  - "feature_extraction"                   # Feature extraction stage
  - "ner_annotation"                       # NER-based annotation stage
   - "llm_annotation"                       # LLM annotation stage (computationally intensive)
@@ -64,6 +65,11 @@ data:
    females: "names_females.csv"            # Output files for female names
    ner_data: "names_ner.json"              # Output file for NER annotated data
    ner_spacy: "names_ner.spacy"            # Output file for NER annotated data using spaCy format
+  selected_columns:                         # Required columns for processing
+    - name
+    - sex
+    - region
+    - year
   split_evaluation: false                   # Should the dataset be split into training and evaluation sets?
   split_by_gender: true                     # Should the dataset be split by gender?
   split_by_province: true                   # Should the dataset be split by province?
@@ -19,6 +19,7 @@ class DataConfig(BaseModel):
            "ner_spacy": "names_ner.spacy",
        }
    )
+    selected_columns: list[str] = field(default=["name", "sex", "region"])
    split_evaluation: bool = False
    split_by_province: bool = True
    split_by_gender: bool = True
@@ -8,12 +8,10 @@ class RegionMapper:

    def __init__(self, mapping: Optional[Dict] = None):
        self.mapping = mapping or REGION_MAPPING
+        self.mapping = {k.lower(): v[1].upper() for k, v in self.mapping.items()}

    def map(self, series: pd.Series) -> pd.Series:
-        """Vectorized region to province mapping"""
-        return series.str.lower().map(
-            lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
-        )
+        return series.str.lower().map(self.mapping).fillna("AUTRES")

    @staticmethod
    def get_provinces():
@@ -30,9 +30,8 @@ class TextCleaner:
    def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean all text columns in a DataFrame"""
        df = df.copy()
-        text_columns = df.select_dtypes(include="object").columns
-
-        for col in text_columns:
+        columns = df.select_dtypes(include=["object", "string"]).columns
+        for col in columns:
            df[col] = self.clean_text_series(df[col])

        return df
@@ -9,9 +9,9 @@ from core.utils.data_loader import DataLoader
 from processing.batch.batch_config import BatchConfig
 from processing.pipeline import Pipeline
 from processing.steps.data_cleaning_step import DataCleaningStep
+from processing.steps.data_selection_step import DataSelectionStep
 from processing.steps.data_splitting_step import DataSplittingStep
 from processing.steps.feature_extraction_step import FeatureExtractionStep
-from processing.steps.llm_annotation_step import LLMAnnotationStep


 def create_pipeline(config) -> Pipeline:
@@ -28,8 +28,9 @@ def create_pipeline(config) -> Pipeline:
    steps = [
        DataCleaningStep(config),
        FeatureExtractionStep(config),
+        DataSelectionStep(config),
        # NERAnnotationStep(config),
-        LLMAnnotationStep(config),
+        # LLMAnnotationStep(config),
    ]

    for stage in config.stages:
@@ -11,6 +11,7 @@ from processing.monitoring.pipeline_monitor import PipelineMonitor
 def main():
    choices = [
        "data_cleaning",
+        "data_selection",
        "feature_extraction",
        "ner_annotation",
        "llm_annotation",
@@ -0,0 +1,107 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "# Qualitative Analysis",
+   "id": "d20715dd63f57364"
+  },
+  {
+   "cell_type": "code",
+   "id": "c93a55c8",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-09-21T13:34:50.973298Z",
+     "start_time": "2025-09-21T13:34:50.969142Z"
+    }
+   },
+   "source": [
+    "import pandas as pd\n",
+    "import geopandas as gpd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "sys.path.append(os.path.abspath(\"..\"))\n",
+    "from core.utils.data_loader import DataLoader\n",
+    "from core.config.pipeline_config import PipelineConfig"
+   ],
+   "outputs": [],
+   "execution_count": 3
+  },
+  {
+   "cell_type": "code",
+   "id": "c0b00261",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-09-21T13:34:51.002610Z",
+     "start_time": "2025-09-21T13:34:50.998586Z"
+    }
+   },
+   "source": [
+    "config = PipelineConfig(\n",
+    "    paths={\n",
+    "        \"root_dir\": \"../data\",\n",
+    "        \"data_dir\": \"../data/dataset\",\n",
+    "        \"models_dir\": \"../models\",\n",
+    "        \"outputs_dir\": \"../data/processed\",\n",
+    "        \"logs_dir\": \"../logs\",\n",
+    "        \"configs_dir\": \"../configs\",\n",
+    "        \"checkpoints_dir\": \"../checkpoints\"\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "loader = DataLoader(config)"
+   ],
+   "outputs": [],
+   "execution_count": 4
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-09-21T13:35:27.430639Z",
+     "start_time": "2025-09-21T13:34:51.013412Z"
+    }
+   },
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": 5,
+   "source": [
+    "gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
+    "gdf_proj = gdf.to_crs(epsg=32732)\n",
+    "gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
+    "\n",
+    "df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
+   ],
+   "id": "b38394ce38864379"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Exploration",
+   "id": "a1af5626d2a948d6"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -0,0 +1,107 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "# Quantitative Analysis",
+   "id": "a605c0f92056a825"
+  },
+  {
+   "cell_type": "code",
+   "id": "c93a55c8",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-09-21T14:14:47.287549Z",
+     "start_time": "2025-09-21T14:14:47.279199Z"
+    }
+   },
+   "source": [
+    "import pandas as pd\n",
+    "import geopandas as gpd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "sys.path.append(os.path.abspath(\"..\"))\n",
+    "from core.utils.data_loader import DataLoader\n",
+    "from core.config.pipeline_config import PipelineConfig"
+   ],
+   "outputs": [],
+   "execution_count": 30
+  },
+  {
+   "cell_type": "code",
+   "id": "c0b00261",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-09-21T14:14:47.315980Z",
+     "start_time": "2025-09-21T14:14:47.308376Z"
+    }
+   },
+   "source": [
+    "config = PipelineConfig(\n",
+    "    paths={\n",
+    "        \"root_dir\": \"../data\",\n",
+    "        \"data_dir\": \"../data/dataset\",\n",
+    "        \"models_dir\": \"../models\",\n",
+    "        \"outputs_dir\": \"../data/processed\",\n",
+    "        \"logs_dir\": \"../logs\",\n",
+    "        \"configs_dir\": \"../configs\",\n",
+    "        \"checkpoints_dir\": \"../checkpoints\"\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "loader = DataLoader(config)"
+   ],
+   "outputs": [],
+   "execution_count": 31
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-09-21T14:15:47.899044Z",
+     "start_time": "2025-09-21T14:14:47.339266Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
+    "gdf_proj = gdf.to_crs(epsg=32732)\n",
+    "gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
+    "\n",
+    "df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
+   ],
+   "id": "b38394ce38864379",
+   "outputs": [],
+   "execution_count": 32
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Exploration",
+   "id": "a1af5626d2a948d6"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -0,0 +1 @@
+UTF-8
@@ -0,0 +1 @@
+GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
@@ -211,7 +211,9 @@ class NameModel:
            for batch in batches:
                batch_losses = {}
                self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
-                logging.info(f"Training batch with {len(batch)} examples, current losses: {batch_losses}")
+                logging.info(
+                    f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
+                )

                # Accumulate into total losses dict
                for k, v in batch_losses.items():
@@ -49,6 +49,9 @@ class Pipeline:
                "processed_batches": step.state.processed_batches,
                "total_batches": step.state.total_batches,
                "failed_batches": len(step.state.failed_batches),
-                "completion_percentage": (step.state.processed_batches / max(1, step.state.total_batches)) * 100,
+                "completion_percentage": (
+                    step.state.processed_batches / max(1, step.state.total_batches)
+                )
+                * 100,
            }
        return progress
@@ -0,0 +1,43 @@
+import logging
+
+import pandas as pd
+
+from core.config.pipeline_config import PipelineConfig
+from processing.steps import PipelineStep
+
+
+class DataSelectionStep(PipelineStep):
+    """Configuration-driven data selection step to keep only specified columns.
+
+    The list of columns to keep is read once, at construction time, from
+    ``pipeline_config.data.selected_columns``.
+    """
+
+    def __init__(self, pipeline_config: PipelineConfig):
+        # "data_selection" is handed to the base step — presumably its stage
+        # name as listed in the pipeline config; confirm against PipelineStep.
+        super().__init__("data_selection", pipeline_config)
+        self.selected_columns = pipeline_config.data.selected_columns
+
+    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
+        """Return a copy of ``batch`` restricted to the configured columns.
+
+        Configured columns that are absent from the batch are reported with a
+        warning and skipped. When none of the configured columns are present,
+        an error is logged and an empty DataFrame is returned.
+
+        Args:
+            batch: The batch to filter.
+            batch_id: Identifier of the batch; used here only in log messages.
+
+        Returns:
+            A new DataFrame holding the configured columns found in ``batch``,
+            in the order they appear in ``selected_columns``.
+        """
+        logging.info(f"Selecting columns for batch {batch_id} with {len(batch)} rows")
+
+        # Check which columns exist in the batch
+        available_columns = [col for col in self.selected_columns if col in batch.columns]
+        missing_columns = [col for col in self.selected_columns if col not in batch.columns]
+
+        # A partially matching batch is still processed; only the gap is reported.
+        if missing_columns:
+            logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}")
+
+        if not available_columns:
+            logging.error(f"No required columns found in batch {batch_id}")
+            return pd.DataFrame()  # Return empty DataFrame if no required columns exist
+
+        # Select only the available required columns; .copy() detaches the
+        # result from the input batch.
+        selected_batch = batch[available_columns].copy()
+
+        logging.info(
+            f"Selected {len(available_columns)} columns for batch {batch_id}: {available_columns}"
+        )
+
+        return selected_batch
+
+    @property
+    def requires_batch_mutation(self) -> bool:
+        """This step modifies the batch data by selecting columns"""
+        return True
@@ -41,14 +41,14 @@ class BaseModel(ABC):

    @abstractmethod
    def cross_validate(
-            self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
+        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> Dict[str, float] | dict[str, np.floating[Any]]:
        """Perform cross-validation and return average scores"""
        pass

    @abstractmethod
    def generate_learning_curve(
-            self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
+        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        pass
@@ -55,11 +55,11 @@ class ExperimentBuilder:
            # Check if this is the experiment we're looking for
            # Look for experiments that match the model type or contain the name
            if (
-                    experiment.get("model_type") == name
-                    or name.lower() in experiment.get("name", "").lower()
-                    or experiment.get("name") == name
-                    or f"baseline_{name}" == experiment.get("name")
-                    or f"advanced_{name}" == experiment.get("name")
+                experiment.get("model_type") == name
+                or name.lower() in experiment.get("name", "").lower()
+                or experiment.get("name") == name
+                or f"baseline_{name}" == experiment.get("name")
+                or f"advanced_{name}" == experiment.get("name")
            ):
                return experiment

@@ -72,7 +72,9 @@ class ExperimentBuilder:
            f"Available experiments: {available_experiments}"
        )

-    def get_templates(self, templates_path: str = "research_templates.yaml") -> Dict[str, List[Dict]]:
+    def get_templates(
+        self, templates_path: str = "research_templates.yaml"
+    ) -> Dict[str, List[Dict]]:
        """Get all available experiments from templates organized by type"""
        templates = self.load_templates(templates_path)

@@ -80,7 +82,7 @@ class ExperimentBuilder:
            "baseline": templates.get("baseline_experiments", []),
            "advanced": templates.get("advanced_experiments", []),
            "feature_study": templates.get("feature_studies", []),
-            "tuning": templates.get("hyperparameter_tuning", [])
+            "tuning": templates.get("hyperparameter_tuning", []),
        }

    @classmethod
@@ -104,5 +106,5 @@ class ExperimentBuilder:
            tags=template_config.get("tags", []),
            test_size=template_config.get("test_size", 0.2),
            cross_validation_folds=template_config.get("cross_validation_folds", 5),
-            train_data_filter=template_config.get("train_data_filter")
+            train_data_filter=template_config.get("train_data_filter"),
        )
@@ -158,12 +158,12 @@ class ExperimentRunner:

    @classmethod
    def _create_prediction_examples(
-            cls,
-            X_test: pd.DataFrame,
-            y_test: pd.Series,
-            predictions: np.ndarray,
-            model: BaseModel,
-            n_examples: int = 10,
+        cls,
+        X_test: pd.DataFrame,
+        y_test: pd.Series,
+        predictions: np.ndarray,
+        model: BaseModel,
+        n_examples: int = 10,
    ) -> List[Dict]:
        """Create prediction examples for analysis"""
        examples = []
@@ -237,7 +237,7 @@ class ExperimentRunner:
        return None

    def compare_experiments(
-            self, experiment_ids: List[str], metric: str = "accuracy"
+        self, experiment_ids: List[str], metric: str = "accuracy"
    ) -> pd.DataFrame:
        """Compare experiments and return analysis"""
        comparison_df = self.tracker.compare_experiments(experiment_ids)
@@ -77,10 +77,10 @@ class ExperimentTracker:
        return self._results.get(experiment_id)

    def list_experiments(
-            self,
-            status: Optional[ExperimentStatus] = None,
-            tags: Optional[List[str]] = None,
-            model_type: Optional[str] = None,
+        self,
+        status: Optional[ExperimentStatus] = None,
+        tags: Optional[List[str]] = None,
+        model_type: Optional[str] = None,
    ) -> List[ExperimentResult]:
        """List experiments with optional filtering"""
        results = list(self._results.values())
@@ -97,7 +97,7 @@ class ExperimentTracker:
        return sorted(results, key=lambda x: x.start_time, reverse=True)

    def get_best_experiment(
-            self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
+        self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
    ) -> Optional[ExperimentResult]:
        """Get the best experiment based on a metric"""
        experiments = self.list_experiments()
@@ -159,8 +159,8 @@ class ExperimentTracker:
        """Export all results to CSV"""
        if output_path is None:
            output_path = (
-                    self.experiments_dir
-                    / f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+                self.experiments_dir
+                / f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            )

        rows = []
@@ -43,7 +43,7 @@ class FeatureExtractor:
        return features_df

    def _extract_single_feature(
-            self, df: pd.DataFrame, feature_type: FeatureType
+        self, df: pd.DataFrame, feature_type: FeatureType
    ) -> Union[pd.Series, pd.DataFrame]:
        """Extract a single type of feature"""
        if feature_type == FeatureType.FULL_NAME:
@@ -27,13 +27,13 @@ class ModelTrainer:
        self.models_dir.mkdir(parents=True, exist_ok=True)

    def train_single_model(
-            self,
-            model_name: str,
-            model_type: str = "logistic_regression",
-            features: List[str] = None,
-            model_params: Dict[str, Any] = None,
-            tags: List[str] = None,
-            save_artifacts: bool = True,
+        self,
+        model_name: str,
+        model_type: str = "logistic_regression",
+        features: List[str] = None,
+        model_params: Dict[str, Any] = None,
+        tags: List[str] = None,
+        save_artifacts: bool = True,
    ) -> str:
        """
        Train a single model and save its artifacts.
@@ -75,7 +75,7 @@ class ModelTrainer:
        return experiment_id

    def train_multiple_models(
-            self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
+        self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
    ) -> List[str]:
        """
        Train multiple models with different configurations.
@@ -82,7 +82,9 @@ class EnsembleModel(TraditionalModel):
        # Soft voting averages probabilities (preferred when members are calibrated);
        # hard voting uses majority class. Parallelize member predictions.
        voting_type = params.get("voting", "soft")  # 'hard' or 'soft'
-        return VotingClassifier(estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1))
+        return VotingClassifier(
+            estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1)
+        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_features = []
@@ -55,7 +55,9 @@ class RandomForestModel(TraditionalModel):
                        encoder = self.label_encoders[feature_key]
                        column_clean = column.fillna("unknown").astype(str)
                        known_classes = set(encoder.classes_)
-                        default_class = "unknown" if "unknown" in known_classes else encoder.classes_[0]
+                        default_class = (
+                            "unknown" if "unknown" in known_classes else encoder.classes_[0]
+                        )
                        column_mapped = column_clean.apply(
                            lambda value: value if value in known_classes else default_class
                        )
@@ -36,9 +36,9 @@ class TransformerModel(NeuralNetworkModel):

        # Add positional encoding
        positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
-        pos_embedding = Embedding(input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64))(
-            positions
-        )
+        pos_embedding = Embedding(
+            input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64)
+        )(positions)
        x = x + pos_embedding

        x = self._transformer_encoder(x, params)
@@ -84,7 +84,9 @@ class NeuralNetworkModel(BaseModel):
    def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
        """Combine configured textual features into one string per record."""

-        column_names = [feature.value for feature in self.config.features if feature.value in X.columns]
+        column_names = [
+            feature.value for feature in self.config.features if feature.value in X.columns
+        ]
        if not column_names:
            raise ValueError("No configured text features found in the provided DataFrame.")

@@ -101,7 +103,7 @@ class NeuralNetworkModel(BaseModel):
        return combined_rows

    def cross_validate(
-            self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
+        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> dict[str, np.floating[Any]]:
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
@@ -158,7 +160,7 @@ class NeuralNetworkModel(BaseModel):
        }

    def generate_learning_curve(
-            self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
+        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        logging.info(f"Generating learning curve for {self.__class__.__name__}")
@@ -103,7 +103,7 @@ class TraditionalModel(BaseModel):
        return results

    def generate_learning_curve(
-            self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
+        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        logging.info(f"Generating learning curve for {self.__class__.__name__}")
@@ -50,7 +50,9 @@ class StreamlitApp:
    @classmethod
    def run(cls):
        st.title("🇨🇩 DRC NERS Platform")
-        st.markdown("A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference")
+        st.markdown(
+            "A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
+        )
        st.markdown(
            """
            ## Overview
@@ -1,2 +1 @@
 from .ner_testing import NERTesting
-
@@ -13,10 +13,10 @@ from research.model_registry import list_available_models

 class Experiments:
    def __init__(
-            self,
-            config: PipelineConfig,
-            experiment_tracker: ExperimentTracker,
-            experiment_runner: ExperimentRunner
+        self,
+        config: PipelineConfig,
+        experiment_tracker: ExperimentTracker,
+        experiment_runner: ExperimentRunner,
    ):
        self.config = config
        self.experiment_tracker = experiment_tracker
@@ -26,8 +26,7 @@ class Experiments:
    def index(self):
        st.title("Experiments")

-        tab1, tab2, tab3 = st.tabs(
-            ["Templates", "Experiments", "Batch Experiments"])
+        tab1, tab2, tab3 = st.tabs(["Templates", "Experiments", "Batch Experiments"])

        with tab1:
            self.show_template_experiments()
@@ -56,14 +55,18 @@ class Experiments:
                self._show_experiments_by_type(available_experiments["advanced"], "advanced")

            with exp_tabs[2]:
-                self._show_experiments_by_type(available_experiments["feature_study"], "feature_study")
+                self._show_experiments_by_type(
+                    available_experiments["feature_study"], "feature_study"
+                )

            with exp_tabs[3]:
                self._show_experiments_by_type(available_experiments["tuning"], "tuning")

        except Exception as e:
            st.error(f"Error loading experiment templates: {e}")
-            st.info("Make sure the research templates file exists at `config/research_templates.yaml`")
+            st.info(
+                "Make sure the research templates file exists at `config/research_templates.yaml`"
+            )

    def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str):
        """Show experiments for a specific type"""
@@ -142,7 +145,7 @@ class Experiments:
        # Display experiments
        for i, exp in enumerate(experiments):
            with st.expander(
-                    f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
+                f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
            ):
                self._display_experiment_details(exp, i)

@@ -213,7 +216,7 @@ class Experiments:
            experiment_types = st.multiselect(
                "Select Experiment Types",
                ["baseline", "advanced", "feature_study", "tuning"],
-                default=["baseline"]
+                default=["baseline"],
            )

            if experiment_types:
@@ -223,11 +226,11 @@ class Experiments:
                    experiments = available_experiments.get(exp_type, [])
                    if experiments:
                        st.write(f"**{exp_type.title()} Experiments:**")
-                        exp_names = [exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)]
+                        exp_names = [
+                            exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)
+                        ]
                        selected_names = st.multiselect(
-                            f"Select {exp_type} experiments",
-                            exp_names,
-                            key=f"select_{exp_type}"
+                            f"Select {exp_type} experiments", exp_names, key=f"select_{exp_type}"
                        )

                        for name in selected_names:
@@ -308,13 +311,13 @@ class Experiments:
                )

    def run_batch_experiments(
-            self,
-            base_name: str,
-            model_types: List[str],
-            ngram_ranges: str,
-            feature_combinations: List[str],
-            test_sizes: str,
-            tags: str,
+        self,
+        base_name: str,
+        model_types: List[str],
+        ngram_ranges: str,
+        feature_combinations: List[str],
+        test_sizes: str,
+        tags: str,
    ):
        """Run batch experiments with parameter combinations"""
        with st.spinner("Running batch experiments..."):
@@ -38,7 +38,7 @@ class LogReader:

            # Parse log entries from the end
            entries = []
-            for line in reversed(lines[-count * 2:]):  # Read more lines in case some don't match
+            for line in reversed(lines[-count * 2 :]):  # Read more lines in case some don't match
                entry = self._parse_log_line(line.strip())
                if entry:
                    entries.append(entry)
@@ -33,7 +33,9 @@ class NERTesting:

        # Load model
        if not self.load_ner_model():
-            st.warning("NER model could not be loaded. Please ensure the model is trained and available.")
+            st.warning(
+                "NER model could not be loaded. Please ensure the model is trained and available."
+            )
            return

        # Display model information
@@ -53,9 +55,11 @@ class NERTesting:
            col1, col2, col3, col4 = st.columns(4)

            with col1:
-                st.metric("Training Examples", f"{self.training_stats.get('training_examples', 0):,}")
+                st.metric(
+                    "Training Examples", f"{self.training_stats.get('training_examples', 0):,}"
+                )
            with col2:
-                st.metric("Epochs", self.training_stats.get('epochs', 0))
+                st.metric("Epochs", self.training_stats.get("epochs", 0))
            with col3:
                st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}")
            with col4:
@@ -64,7 +68,7 @@ class NERTesting:
    def show_model_evaluation_info(self):
        if self.evaluation_stats:
            col1, col2, col3 = st.columns(4)
-            overall = self.evaluation_stats.get('overall', {})
+            overall = self.evaluation_stats.get("overall", {})

            with col1:
                st.metric("Overall Precision", f"{overall['precision']:.2f}")
@@ -79,7 +83,7 @@ class NERTesting:
        name_input = st.text_input(
            "Name:",
            placeholder="e.g., Jean Baptiste Mukendi, Marie Kabamba Tshiala, Joseph Kasongo",
-            help="Enter a full name or multiple names separated by spaces"
+            help="Enter a full name or multiple names separated by spaces",
        )
        if name_input.strip():
            if st.button("Analyze Name", type="primary"):
@@ -90,12 +94,12 @@ class NERTesting:
            "Names:",
            placeholder="Jean Baptiste Mukendi\nMarie Kabamba Tshiala\nJoseph Kasongo\nGrace Mbuyi Kalala",
            height=150,
-            help="Enter each name on a new line"
+            help="Enter each name on a new line",
        )

        if names_input.strip():
            if st.button("Analyze All Names", type="primary"):
-                names = [name.strip() for name in names_input.split('\n') if name.strip()]
+                names = [name.strip() for name in names_input.split("\n") if name.strip()]
                for i, name in enumerate(names):
                    st.markdown(f"**Name {i+1}: {name}**")
                    self.analyze_and_display(name)
@@ -106,12 +110,12 @@ class NERTesting:
        try:
            result = self.ner_model.predict(text)
            st.subheader("Analysis Results")
-            entities = result.get('entities', [])
+            entities = result.get("entities", [])

            if entities:
                self.show_visual_entities(text, entities)
-                native_count = sum(1 for e in entities if e['label'] == 'NATIVE')
-                surname_count = sum(1 for e in entities if e['label'] == 'SURNAME')
+                native_count = sum(1 for e in entities if e["label"] == "NATIVE")
+                surname_count = sum(1 for e in entities if e["label"] == "SURNAME")

                col1, col2, col3 = st.columns(3)
                with col1:
@@ -134,29 +138,17 @@ class NERTesting:
            # Convert our entities format to spaCy format for displacy
            ents = []
            for entity in entities:
-                ents.append({
-                    "start": entity['start'],
-                    "end": entity['end'],
-                    "label": entity['label']
-                })
+                ents.append(
+                    {"start": entity["start"], "end": entity["end"], "label": entity["label"]}
+                )

            # Create doc-like structure for displacy
-            doc_data = {
-                "text": text,
-                "ents": ents,
-                "title": None
-            }
+            doc_data = {"text": text, "ents": ents, "title": None}

            # Custom colors for our labels
-            colors = {
-                "NATIVE": "#74C0FC",  # Light blue
-                "SURNAME": "#69DB7C"  # Light green
-            }
+            colors = {"NATIVE": "#74C0FC", "SURNAME": "#69DB7C"}  # light blue / light green

-            options = {
-                "colors": colors,
-                "distance": 90
-            }
+            options = {"colors": colors, "distance": 90}

            # Generate HTML visualization
            html = displacy.render(doc_data, style="ent", manual=True, options=options)
@@ -13,7 +13,7 @@ from research.experiment.experiment_tracker import ExperimentTracker

 class Predictions:
    def __init__(
-            self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
+        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
        self.config = config
        self.experiment_tracker = experiment_tracker
@@ -111,7 +111,7 @@ class Predictions:
            return None

    def _display_single_prediction_results(
-            self, prediction: str, confidence: Optional[float], experiment, name_input: str
+        self, prediction: str, confidence: Optional[float], experiment, name_input: str
    ):
        """Display single prediction results"""
        col1, col2 = st.columns(2)
@@ -288,7 +288,7 @@ class Predictions:
            return pd.DataFrame()

    def _run_dataset_prediction(
-            self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
+        self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
    ):
        """Run dataset prediction and display results"""
        with st.spinner("Running predictions..."):
@@ -12,7 +12,7 @@ from research.experiment.experiment_tracker import ExperimentTracker

 class ResultsAnalysis:
    def __init__(
-            self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
+        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
        self.config = config
        self.experiment_tracker = experiment_tracker
				`@@ -0,0 +1 @@`
				`GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]`