feat: add osm data

2025-09-21 16:23:44 +02:00
parent 63e23d6600
commit c1b502c878
39 changed files with 955 additions and 338 deletions
@@ -62,7 +62,7 @@ stages:
 **Running the Pipeline**
 ```bash
-python main.py --env development
+python main.py --env production
 ```
 ## NER Processing (Optional)
@@ -72,7 +72,7 @@ Its main objective is to accurately identify and tag the different components of
 specifically distinguishing between the native part and the surname.
 ```bash
-python ner.py --env development
+python ner.py --env production
 ```
 Once you've built and trained the NER model, you can use it to annotate **COMPOSE** names in the original dataset
@@ -105,54 +105,54 @@ you can define model features, training parameters, and evaluation metrics in th
 ```bash
 # bigru
-python train.py --name="bigru" --type="baseline" --env="development"
+python train.py --name="bigru" --type="baseline" --env="production"
-python train.py --name="bigru_native" --type="baseline" --env="development"
+python train.py --name="bigru_native" --type="baseline" --env="production"
-python train.py --name="bigru_surname" --type="baseline" --env="development"
+python train.py --name="bigru_surname" --type="baseline" --env="production"
 # cnn
-python train.py --name="cnn" --type="baseline" --env="development"
+python train.py --name="cnn" --type="baseline" --env="production"
-python train.py --name="cnn_native" --type="baseline" --env="development"
+python train.py --name="cnn_native" --type="baseline" --env="production"
-python train.py --name="cnn_surname" --type="baseline" --env="development"
+python train.py --name="cnn_surname" --type="baseline" --env="production"
 # lightgbm
-python train.py --name="lightgbm" --type="baseline" --env="development"
+python train.py --name="lightgbm" --type="baseline" --env="production"
-python train.py --name="lightgbm_native" --type="baseline" --env="development"
+python train.py --name="lightgbm_native" --type="baseline" --env="production"
-python train.py --name="lightgbm_surname" --type="baseline" --env="development"
+python train.py --name="lightgbm_surname" --type="baseline" --env="production"
 # logistic regression
-python train.py --name="logistic_regression" --type="baseline" --env="development"
+python train.py --name="logistic_regression" --type="baseline" --env="production"
-python train.py --name="logistic_regression_native" --type="baseline" --env="development"
+python train.py --name="logistic_regression_native" --type="baseline" --env="production"
-python train.py --name="logistic_regression_surname" --type="baseline" --env="development"
+python train.py --name="logistic_regression_surname" --type="baseline" --env="production"
 # lstm
-python train.py --name="lstm" --type="baseline" --env="development"
+python train.py --name="lstm" --type="baseline" --env="production"
-python train.py --name="lstm_native" --type="baseline" --env="development"
+python train.py --name="lstm_native" --type="baseline" --env="production"
-python train.py --name="lstm_surname" --type="baseline" --env="development"
+python train.py --name="lstm_surname" --type="baseline" --env="production"
 # random forest
-python train.py --name="random_forest" --type="baseline" --env="development"
+python train.py --name="random_forest" --type="baseline" --env="production"
-python train.py --name="random_forest_native" --type="baseline" --env="development"
+python train.py --name="random_forest_native" --type="baseline" --env="production"
-python train.py --name="random_forest_surname" --type="baseline" --env="development"
+python train.py --name="random_forest_surname" --type="baseline" --env="production"
 # svm
-python train.py --name="svm" --type="baseline" --env="development"
+python train.py --name="svm" --type="baseline" --env="production"
-python train.py --name="svm_native" --type="baseline" --env="development"
+python train.py --name="svm_native" --type="baseline" --env="production"
-python train.py --name="svm_surname" --type="baseline" --env="development"
+python train.py --name="svm_surname" --type="baseline" --env="production"
 # naive bayes
-python train.py --name="naive_bayes" --type="baseline" --env="development"
+python train.py --name="naive_bayes" --type="baseline" --env="production"
-python train.py --name="naive_bayes_native" --type="baseline" --env="development"
+python train.py --name="naive_bayes_native" --type="baseline" --env="production"
-python train.py --name="naive_bayes_surname" --type="baseline" --env="development"
+python train.py --name="naive_bayes_surname" --type="baseline" --env="production"
 # transformer
-python train.py --name="transformer" --type="baseline" --env="development"
+python train.py --name="transformer" --type="baseline" --env="production"
-python train.py --name="transformer_native" --type="baseline" --env="development"
+python train.py --name="transformer_native" --type="baseline" --env="production"
-python train.py --name="transformer_surname" --type="baseline" --env="development"
+python train.py --name="transformer_surname" --type="baseline" --env="production"
 # xgboost
-python train.py --name="xgboost" --type="baseline" --env="development"
+python train.py --name="xgboost" --type="baseline" --env="production"
-python train.py --name="xgboost_native" --type="baseline" --env="development"
+python train.py --name="xgboost_native" --type="baseline" --env="production"
-python train.py --name="xgboost_surname" --type="baseline" --env="development"
+python train.py --name="xgboost_surname" --type="baseline" --env="production"
 ```
 ## Web Interface
@@ -171,3 +171,6 @@ streamlit run web/app.py
 <a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
  <img src="https://contrib.rocks/image?repo=bernard-ng/drc-ners-nlp" alt="contributors"/>
 </a>
 ## Acknowledgements
 - Map Visualization: [https://data.humdata.org/dataset/anciennes-provinces-rdc-old-provinces-drc](https://data.humdata.org/dataset/anciennes-provinces-rdc-old-provinces-drc)
@@ -11,6 +11,7 @@ processing:
 # Pipeline stages
 stages:
  - "data_cleaning"
  - "data_selection"
  - "feature_extraction"
  #- "ner_annotation"
  #- "llm_annotation"
@@ -3,17 +3,18 @@ debug: false
 # Processing settings
 processing:
-  batch_size: 10_000
+  batch_size: 100_000
-  max_workers: 8
+  max_workers: 4
  checkpoint_interval: 10
  use_multiprocessing: true
 # Pipeline stages
 stages:
  - "data_cleaning"
  - "data_selection"
  - "feature_extraction"
-  - "ner_annotation"
+  # - "ner_annotation"
-  - "llm_annotation"
+  # - "llm_annotation"
  - "data_splitting"
 # Production LLM settings
@@ -34,7 +35,7 @@ data:
 # Production logging (less verbose)
 logging:
  level: "INFO"
-  console_logging: false
+  console_logging: true
  file_logging: true
  log_file: "pipeline.production.log"
  max_log_size: 52428800  # 50MB
@@ -21,6 +21,7 @@ paths:
 # List of stages in the processing pipeline
 stages:
  - "data_cleaning"                        # Data cleaning stage
  - "data_selection"                       # Data selection stage - keep only required columns
  - "feature_extraction"                   # Feature extraction stage
  - "ner_annotation"                       # NER-based annotation stage
  - "llm_annotation"                       # LLM annotation stage (computationally intensive)
@@ -64,6 +65,11 @@ data:
    females: "names_females.csv"            # Output files for female names
    ner_data: "names_ner.json"              # Output file for NER annotated data
    ner_spacy: "names_ner.spacy"            # Output file for NER annotated data using spaCy format
  selected_columns:                         # Required columns for processing
    - name
    - sex
    - region
    - year
  split_evaluation: false                   # Should the dataset be split into training and evaluation sets?
  split_by_gender: true                     # Should the dataset be split by gender?
  split_by_province: true                   # Should the dataset be split by province?
@@ -19,6 +19,7 @@ class DataConfig(BaseModel):
            "ner_spacy": "names_ner.spacy",
        }
    )
    selected_columns: list[str] = field(default=["name", "sex", "region"])
    split_evaluation: bool = False
    split_by_province: bool = True
    split_by_gender: bool = True
@@ -8,12 +8,10 @@ class RegionMapper:
    def __init__(self, mapping: Optional[Dict] = None):
        self.mapping = mapping or REGION_MAPPING
        self.mapping = {k.lower(): v[1].upper() for k, v in self.mapping.items()}
    def map(self, series: pd.Series) -> pd.Series:
-        """Vectorized region to province mapping"""
+        return series.str.lower().map(self.mapping).fillna("AUTRES")
        return series.str.lower().map(
            lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
        )
    @staticmethod
    def get_provinces():
@@ -30,9 +30,8 @@ class TextCleaner:
    def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean all text columns in a DataFrame"""
        df = df.copy()
-        text_columns = df.select_dtypes(include="object").columns
+        columns = df.select_dtypes(include=["object", "string"]).columns
-
+        for col in columns:
        for col in text_columns:
            df[col] = self.clean_text_series(df[col])
        return df
@@ -9,9 +9,9 @@ from core.utils.data_loader import DataLoader
 from processing.batch.batch_config import BatchConfig
 from processing.pipeline import Pipeline
 from processing.steps.data_cleaning_step import DataCleaningStep
 from processing.steps.data_selection_step import DataSelectionStep
 from processing.steps.data_splitting_step import DataSplittingStep
 from processing.steps.feature_extraction_step import FeatureExtractionStep
 from processing.steps.llm_annotation_step import LLMAnnotationStep
 def create_pipeline(config) -> Pipeline:
@@ -28,8 +28,9 @@ def create_pipeline(config) -> Pipeline:
    steps = [
        DataCleaningStep(config),
        FeatureExtractionStep(config),
        DataSelectionStep(config),
        # NERAnnotationStep(config),
-        LLMAnnotationStep(config),
+        # LLMAnnotationStep(config),
    ]
    for stage in config.stages:
@@ -11,6 +11,7 @@ from processing.monitoring.pipeline_monitor import PipelineMonitor
 def main():
    choices = [
        "data_cleaning",
        "data_selection",
        "feature_extraction",
        "ner_annotation",
        "llm_annotation",
@@ -0,0 +1,107 @@
 {
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Qualitative Analysis",
   "id": "d20715dd63f57364"
  },
  {
   "cell_type": "code",
   "id": "c93a55c8",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-21T13:34:50.973298Z",
     "start_time": "2025-09-21T13:34:50.969142Z"
    }
   },
   "source": [
    "import pandas as pd\n",
    "import geopandas as gpd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import sys\n",
    "import os\n",
    "\n",
    "sys.path.append(os.path.abspath(\"..\"))\n",
    "from core.utils.data_loader import DataLoader\n",
    "from core.config.pipeline_config import PipelineConfig"
   ],
   "outputs": [],
   "execution_count": 3
  },
  {
   "cell_type": "code",
   "id": "c0b00261",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-21T13:34:51.002610Z",
     "start_time": "2025-09-21T13:34:50.998586Z"
    }
   },
   "source": [
    "config = PipelineConfig(\n",
    "    paths={\n",
    "        \"root_dir\": \"../data\",\n",
    "        \"data_dir\": \"../data/dataset\",\n",
    "        \"models_dir\": \"../models\",\n",
    "        \"outputs_dir\": \"../data/processed\",\n",
    "        \"logs_dir\": \"../logs\",\n",
    "        \"configs_dir\": \"../configs\",\n",
    "        \"checkpoints_dir\": \"../checkpoints\"\n",
    "    }\n",
    ")\n",
    "\n",
    "loader = DataLoader(config)"
   ],
   "outputs": [],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-21T13:35:27.430639Z",
     "start_time": "2025-09-21T13:34:51.013412Z"
    }
   },
   "cell_type": "code",
   "outputs": [],
   "execution_count": 5,
   "source": [
    "gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
    "gdf_proj = gdf.to_crs(epsg=32732)\n",
    "gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
    "\n",
    "df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
   ],
   "id": "b38394ce38864379"
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Exploration",
   "id": "a1af5626d2a948d6"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
@@ -0,0 +1,107 @@
 {
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Quantitative Analysis",
   "id": "a605c0f92056a825"
  },
  {
   "cell_type": "code",
   "id": "c93a55c8",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-21T14:14:47.287549Z",
     "start_time": "2025-09-21T14:14:47.279199Z"
    }
   },
   "source": [
    "import pandas as pd\n",
    "import geopandas as gpd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import sys\n",
    "import os\n",
    "\n",
    "sys.path.append(os.path.abspath(\"..\"))\n",
    "from core.utils.data_loader import DataLoader\n",
    "from core.config.pipeline_config import PipelineConfig"
   ],
   "outputs": [],
   "execution_count": 30
  },
  {
   "cell_type": "code",
   "id": "c0b00261",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-21T14:14:47.315980Z",
     "start_time": "2025-09-21T14:14:47.308376Z"
    }
   },
   "source": [
    "config = PipelineConfig(\n",
    "    paths={\n",
    "        \"root_dir\": \"../data\",\n",
    "        \"data_dir\": \"../data/dataset\",\n",
    "        \"models_dir\": \"../models\",\n",
    "        \"outputs_dir\": \"../data/processed\",\n",
    "        \"logs_dir\": \"../logs\",\n",
    "        \"configs_dir\": \"../configs\",\n",
    "        \"checkpoints_dir\": \"../checkpoints\"\n",
    "    }\n",
    ")\n",
    "\n",
    "loader = DataLoader(config)"
   ],
   "outputs": [],
   "execution_count": 31
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-21T14:15:47.899044Z",
     "start_time": "2025-09-21T14:14:47.339266Z"
    }
   },
   "cell_type": "code",
   "source": [
    "gdf = gpd.read_file(\"../osm/provinces.shp\")\n",
    "gdf_proj = gdf.to_crs(epsg=32732)\n",
    "gdf['centroid'] = gdf_proj.geometry.centroid.to_crs(gdf.crs)\n",
    "\n",
    "df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")"
   ],
   "id": "b38394ce38864379",
   "outputs": [],
   "execution_count": 32
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Exploration",
   "id": "a1af5626d2a948d6"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
@@ -0,0 +1 @@
 UTF-8
@@ -0,0 +1 @@
 GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
@@ -211,7 +211,9 @@ class NameModel:
            for batch in batches:
                batch_losses = {}
                self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
-                logging.info(f"Training batch with {len(batch)} examples, current losses: {batch_losses}")
+                logging.info(
                    f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
                )
                # Accumulate into total losses dict
                for k, v in batch_losses.items():
@@ -49,6 +49,9 @@ class Pipeline:
                "processed_batches": step.state.processed_batches,
                "total_batches": step.state.total_batches,
                "failed_batches": len(step.state.failed_batches),
-                "completion_percentage": (step.state.processed_batches / max(1, step.state.total_batches)) * 100,
+                "completion_percentage": (
                    step.state.processed_batches / max(1, step.state.total_batches)
                )
                * 100,
            }
        return progress
@@ -0,0 +1,43 @@
 import logging
 import pandas as pd
 from core.config.pipeline_config import PipelineConfig
 from processing.steps import PipelineStep
 class DataSelectionStep(PipelineStep):
    """Pipeline step that narrows every batch down to the configured columns.

    The list of columns to keep is read from
    ``pipeline_config.data.selected_columns`` when the step is constructed.
    """
    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("data_selection", pipeline_config)
        self.selected_columns = pipeline_config.data.selected_columns
    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Return a copy of ``batch`` restricted to the configured columns.

        Configured columns absent from the batch are reported with a warning
        and skipped. If none of the configured columns is present, an error is
        logged and an empty DataFrame is returned.
        """
        logging.info(f"Selecting columns for batch {batch_id} with {len(batch)} rows")
        # Split the configured columns into those present in the batch and those absent
        present = []
        absent = []
        for name in self.selected_columns:
            if name in batch.columns:
                present.append(name)
            else:
                absent.append(name)
        if absent:
            logging.warning(f"Missing columns in batch {batch_id}: {absent}")
        if len(present) == 0:
            logging.error(f"No required columns found in batch {batch_id}")
            # Nothing usable in this batch: hand back an empty frame
            return pd.DataFrame()
        # Keep only the configured columns that actually exist, as an independent copy
        narrowed = batch[present].copy()
        logging.info(
            f"Selected {len(present)} columns for batch {batch_id}: {present}"
        )
        return narrowed
    @property
    def requires_batch_mutation(self) -> bool:
        """Always ``True``: this step changes the batch by selecting columns."""
        return True
@@ -72,7 +72,9 @@ class ExperimentBuilder:
            f"Available experiments: {available_experiments}"
        )
-    def get_templates(self, templates_path: str = "research_templates.yaml") -> Dict[str, List[Dict]]:
+    def get_templates(
        self, templates_path: str = "research_templates.yaml"
    ) -> Dict[str, List[Dict]]:
        """Get all available experiments from templates organized by type"""
        templates = self.load_templates(templates_path)
@@ -80,7 +82,7 @@ class ExperimentBuilder:
            "baseline": templates.get("baseline_experiments", []),
            "advanced": templates.get("advanced_experiments", []),
            "feature_study": templates.get("feature_studies", []),
-            "tuning": templates.get("hyperparameter_tuning", [])
+            "tuning": templates.get("hyperparameter_tuning", []),
        }
    @classmethod
@@ -104,5 +106,5 @@ class ExperimentBuilder:
            tags=template_config.get("tags", []),
            test_size=template_config.get("test_size", 0.2),
            cross_validation_folds=template_config.get("cross_validation_folds", 5),
-            train_data_filter=template_config.get("train_data_filter")
+            train_data_filter=template_config.get("train_data_filter"),
        )
@@ -82,7 +82,9 @@ class EnsembleModel(TraditionalModel):
        # Soft voting averages probabilities (preferred when members are calibrated);
        # hard voting uses majority class. Parallelize member predictions.
        voting_type = params.get("voting", "soft")  # 'hard' or 'soft'
-        return VotingClassifier(estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1))
+        return VotingClassifier(
            estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1)
        )
    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_features = []
@@ -55,7 +55,9 @@ class RandomForestModel(TraditionalModel):
                        encoder = self.label_encoders[feature_key]
                        column_clean = column.fillna("unknown").astype(str)
                        known_classes = set(encoder.classes_)
-                        default_class = "unknown" if "unknown" in known_classes else encoder.classes_[0]
+                        default_class = (
                            "unknown" if "unknown" in known_classes else encoder.classes_[0]
                        )
                        column_mapped = column_clean.apply(
                            lambda value: value if value in known_classes else default_class
                        )
@@ -36,9 +36,9 @@ class TransformerModel(NeuralNetworkModel):
        # Add positional encoding
        positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
-        pos_embedding = Embedding(input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64))(
+        pos_embedding = Embedding(
-            positions
+            input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64)
-        )
+        )(positions)
        x = x + pos_embedding
        x = self._transformer_encoder(x, params)
@@ -84,7 +84,9 @@ class NeuralNetworkModel(BaseModel):
    def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
        """Combine configured textual features into one string per record."""
-        column_names = [feature.value for feature in self.config.features if feature.value in X.columns]
+        column_names = [
            feature.value for feature in self.config.features if feature.value in X.columns
        ]
        if not column_names:
            raise ValueError("No configured text features found in the provided DataFrame.")
@@ -50,7 +50,9 @@ class StreamlitApp:
    @classmethod
    def run(cls):
        st.title("🇨🇩 DRC NERS Platform")
-        st.markdown("A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference")
+        st.markdown(
            "A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
        )
        st.markdown(
            """
            ## Overview
@@ -1,2 +1 @@
 from .ner_testing import NERTesting
@@ -16,7 +16,7 @@ class Experiments:
        self,
        config: PipelineConfig,
        experiment_tracker: ExperimentTracker,
-            experiment_runner: ExperimentRunner
+        experiment_runner: ExperimentRunner,
    ):
        self.config = config
        self.experiment_tracker = experiment_tracker
@@ -26,8 +26,7 @@ class Experiments:
    def index(self):
        st.title("Experiments")
-        tab1, tab2, tab3 = st.tabs(
+        tab1, tab2, tab3 = st.tabs(["Templates", "Experiments", "Batch Experiments"])
            ["Templates", "Experiments", "Batch Experiments"])
        with tab1:
            self.show_template_experiments()
@@ -56,14 +55,18 @@ class Experiments:
                self._show_experiments_by_type(available_experiments["advanced"], "advanced")
            with exp_tabs[2]:
-                self._show_experiments_by_type(available_experiments["feature_study"], "feature_study")
+                self._show_experiments_by_type(
                    available_experiments["feature_study"], "feature_study"
                )
            with exp_tabs[3]:
                self._show_experiments_by_type(available_experiments["tuning"], "tuning")
        except Exception as e:
            st.error(f"Error loading experiment templates: {e}")
-            st.info("Make sure the research templates file exists at `config/research_templates.yaml`")
+            st.info(
                "Make sure the research templates file exists at `config/research_templates.yaml`"
            )
    def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str):
        """Show experiments for a specific type"""
@@ -213,7 +216,7 @@ class Experiments:
            experiment_types = st.multiselect(
                "Select Experiment Types",
                ["baseline", "advanced", "feature_study", "tuning"],
-                default=["baseline"]
+                default=["baseline"],
            )
            if experiment_types:
@@ -223,11 +226,11 @@ class Experiments:
                    experiments = available_experiments.get(exp_type, [])
                    if experiments:
                        st.write(f"**{exp_type.title()} Experiments:**")
-                        exp_names = [exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)]
+                        exp_names = [
                            exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)
                        ]
                        selected_names = st.multiselect(
-                            f"Select {exp_type} experiments",
+                            f"Select {exp_type} experiments", exp_names, key=f"select_{exp_type}"
                            exp_names,
                            key=f"select_{exp_type}"
                        )
                        for name in selected_names:
@@ -38,7 +38,7 @@ class LogReader:
            # Parse log entries from the end
            entries = []
-            for line in reversed(lines[-count * 2:]):  # Read more lines in case some don't match
+            for line in reversed(lines[-count * 2 :]):  # Read more lines in case some don't match
                entry = self._parse_log_line(line.strip())
                if entry:
                    entries.append(entry)
@@ -33,7 +33,9 @@ class NERTesting:
        # Load model
        if not self.load_ner_model():
-            st.warning("NER model could not be loaded. Please ensure the model is trained and available.")
+            st.warning(
                "NER model could not be loaded. Please ensure the model is trained and available."
            )
            return
        # Display model information
@@ -53,9 +55,11 @@ class NERTesting:
            col1, col2, col3, col4 = st.columns(4)
            with col1:
-                st.metric("Training Examples", f"{self.training_stats.get('training_examples', 0):,}")
+                st.metric(
                    "Training Examples", f"{self.training_stats.get('training_examples', 0):,}"
                )
            with col2:
-                st.metric("Epochs", self.training_stats.get('epochs', 0))
+                st.metric("Epochs", self.training_stats.get("epochs", 0))
            with col3:
                st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}")
            with col4:
@@ -64,7 +68,7 @@ class NERTesting:
    def show_model_evaluation_info(self):
        if self.evaluation_stats:
            col1, col2, col3 = st.columns(4)
-            overall = self.evaluation_stats.get('overall', {})
+            overall = self.evaluation_stats.get("overall", {})
            with col1:
                st.metric("Overall Precision", f"{overall['precision']:.2f}")
@@ -79,7 +83,7 @@ class NERTesting:
        name_input = st.text_input(
            "Name:",
            placeholder="e.g., Jean Baptiste Mukendi, Marie Kabamba Tshiala, Joseph Kasongo",
-            help="Enter a full name or multiple names separated by spaces"
+            help="Enter a full name or multiple names separated by spaces",
        )
        if name_input.strip():
            if st.button("Analyze Name", type="primary"):
@@ -90,12 +94,12 @@ class NERTesting:
            "Names:",
            placeholder="Jean Baptiste Mukendi\nMarie Kabamba Tshiala\nJoseph Kasongo\nGrace Mbuyi Kalala",
            height=150,
-            help="Enter each name on a new line"
+            help="Enter each name on a new line",
        )
        if names_input.strip():
            if st.button("Analyze All Names", type="primary"):
-                names = [name.strip() for name in names_input.split('\n') if name.strip()]
+                names = [name.strip() for name in names_input.split("\n") if name.strip()]
                for i, name in enumerate(names):
                    st.markdown(f"**Name {i+1}: {name}**")
                    self.analyze_and_display(name)
@@ -106,12 +110,12 @@ class NERTesting:
        try:
            result = self.ner_model.predict(text)
            st.subheader("Analysis Results")
-            entities = result.get('entities', [])
+            entities = result.get("entities", [])
            if entities:
                self.show_visual_entities(text, entities)
-                native_count = sum(1 for e in entities if e['label'] == 'NATIVE')
+                native_count = sum(1 for e in entities if e["label"] == "NATIVE")
-                surname_count = sum(1 for e in entities if e['label'] == 'SURNAME')
+                surname_count = sum(1 for e in entities if e["label"] == "SURNAME")
                col1, col2, col3 = st.columns(3)
                with col1:
@@ -134,29 +138,17 @@ class NERTesting:
            # Convert our entities format to spaCy format for displacy
            ents = []
            for entity in entities:
-                ents.append({
+                ents.append(
-                    "start": entity['start'],
+                    {"start": entity["start"], "end": entity["end"], "label": entity["label"]}
-                    "end": entity['end'],
+                )
                    "label": entity['label']
                })
            # Create doc-like structure for displacy
-            doc_data = {
+            doc_data = {"text": text, "ents": ents, "title": None}
                "text": text,
                "ents": ents,
                "title": None
            }
            # Custom colors for our labels
-            colors = {
+            colors = {"NATIVE": "#74C0FC", "SURNAME": "#69DB7C"}  # Light blue  # Light green
                "NATIVE": "#74C0FC",  # Light blue
                "SURNAME": "#69DB7C"  # Light green
            }
-            options = {
+            options = {"colors": colors, "distance": 90}
                "colors": colors,
                "distance": 90
            }
            # Generate HTML visualization
            html = displacy.render(doc_data, style="ent", manual=True, options=options)
		`@@ -0,0 +1 @@`
							`GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]`
`@@ -1,2 +1 @@`
	`from .ner_testing import NERTesting`	`from .ner_testing import NERTesting`