feat: web application multipage support
@@ -1,101 +0,0 @@
#!.venv/bin/python3
import argparse

import streamlit as st

from core.config import setup_config, PipelineConfig
from core.utils.data_loader import DataLoader
from interface.configuration import Configuration
from interface.dashboard import Dashboard
from interface.data_overview import DataOverview
from interface.data_processing import DataProcessing
from interface.experiments import Experiments
from interface.predictions import Predictions
from interface.results_analysis import ResultsAnalysis
from processing.monitoring.pipeline_monitor import PipelineMonitor
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker

# Page configuration
st.set_page_config(
    page_title="DRC Names NLP Pipeline",
    page_icon="🇨🇩",
    layout="wide",
    initial_sidebar_state="expanded",
)


class StreamlitApp:
    """Main Streamlit application class"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.data_loader = DataLoader(self.config)
        self.experiment_tracker = ExperimentTracker(self.config)
        self.experiment_runner = ExperimentRunner(self.config)
        self.pipeline_monitor = PipelineMonitor()

        # Initialize interface components
        self.dashboard = Dashboard(self.config, self.experiment_tracker, self.experiment_runner)
        self.data_overview = DataOverview(self.config)
        self.data_processing = DataProcessing(self.config, self.pipeline_monitor)
        self.experiments = Experiments(self.config, self.experiment_tracker, self.experiment_runner)
        self.results_analysis = ResultsAnalysis(
            self.config, self.experiment_tracker, self.experiment_runner
        )
        self.predictions = Predictions(self.config, self.experiment_tracker, self.experiment_runner)
        self.configuration = Configuration(self.config)

        # Initialize session state
        if "current_experiment" not in st.session_state:
            st.session_state.current_experiment = None
        if "experiment_results" not in st.session_state:
            st.session_state.experiment_results = {}

    def run(self):
        st.title("🇨🇩 DRC NERS Pipeline")
        st.markdown("A comprehensive tool for Congolese name analysis and gender prediction")

        # Sidebar navigation
        page = st.sidebar.selectbox(
            "Navigation",
            [
                "Dashboard",
                "Dataset Overview",
                "Data Processing",
                "Experiments",
                "Results & Analysis",
                "Predictions",
                "Configuration",
            ],
        )

        # Route to appropriate page
        page_map = {
            "Dashboard": self.dashboard.index,
            "Dataset Overview": self.data_overview.index,
            "Data Processing": self.data_processing.index,
            "Experiments": self.experiments.index,
            "Results & Analysis": self.results_analysis.index,
            "Predictions": self.predictions.index,
            "Configuration": self.configuration.index,
        }
        page_map.get(page, lambda: None)()


def main():
    parser = argparse.ArgumentParser(
        description="DRC NERS Platform",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--config", type=str, help="Path to configuration file")
    parser.add_argument("--env", type=str, default="development", help="Environment name")
    args = parser.parse_args()

    config = setup_config(args.config, env=args.env)
    app = StreamlitApp(config)
    app.run()


if __name__ == "__main__":
    main()
@@ -18,7 +18,8 @@ paths:
   checkpoints_dir: "./data/checkpoints" # Directory for model checkpoints

   # Pipeline stages
-  stages: # List of stages in the processing pipeline
+  # List of stages in the processing pipeline
+  stages:
     - "data_cleaning" # Data cleaning stage
     - "feature_extraction" # Feature extraction stage
     - "ner_annotation" # NER-based annotation stage
@@ -36,6 +37,7 @@ processing:
     - "utf-16"
     - "latin1"
   chunk_size: 100_000 # Size of data chunks to process in parallel
+  epochs: 2 # Number of Epochs for training

   # Annotation settings
   annotation:
@@ -72,8 +74,9 @@ data:
   balance_by_sex: false # Should the dataset be balanced by sex when limiting the dataset size?

 # Logging configuration
+# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
 logging:
-  level: "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+  level: "INFO"
   format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
   file_logging: true # Enable logging to file
   console_logging: true # Enable logging to console
@@ -7,7 +7,7 @@ baseline_experiments:
     max_len: 20
     embedding_dim: 64
     gru_units: 32
-    epochs: 10
+    epochs: 2
     batch_size: 32
     tags: [ "baseline", "neural", "bigru" ]

@@ -21,7 +21,7 @@ baseline_experiments:
     filters: 64
     kernel_size: 3
     dropout: 0.5
-    epochs: 10
+    epochs: 2
     batch_size: 32
     tags: [ "baseline", "neural", "cnn" ]

@@ -79,7 +79,7 @@ baseline_experiments:
   model_params:
     embedding_dim: 128
     lstm_units: 64
-    epochs: 10
+    epochs: 2
     batch_size: 64
     tags: [ "baseline", "neural", "lstm" ]

@@ -121,7 +121,7 @@ baseline_experiments:
     embedding_dim: 128
     num_heads: 4
     num_layers: 2
-    epochs: 10
+    epochs: 2
     batch_size: 64
     tags: [ "baseline", "neural", "transformer" ]
@@ -0,0 +1,145 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 42

[nlp]
lang = "fr"
pipeline = ["tok2vec","ner"]
batch_size = 100000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,1000,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null
before_update = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
@@ -12,3 +12,4 @@ class ProcessingConfig(BaseModel):
     use_multiprocessing: bool = False
     encoding_options: list = field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
     chunk_size: int = 100_000
+    epochs: int = 2
@@ -7,24 +7,24 @@ import traceback
 from pathlib import Path

 from core.config import setup_config, PipelineConfig
-from processing.ner.ner_data_builder import NERDataBuilder
-from processing.ner.ner_engineering import NEREngineering
-from processing.ner.ner_name_model import NERNameModel
+from processing.ner.name_builder import NameBuilder
+from processing.ner.name_engineering import NameEngineering
+from processing.ner.name_model import NameModel


 def feature(config: PipelineConfig):
     """Apply feature engineering to create position-independent NER dataset."""
-    NEREngineering(config).compute()
+    NameEngineering(config).compute()


 def build(config: PipelineConfig):
     """Build NER dataset using NERDataBuilder."""
-    NERDataBuilder(config).build()
+    NameBuilder(config).build()


 def train(config: PipelineConfig):
     """Train the NER model."""
-    trainer = NERNameModel(config)
+    trainer = NameModel(config)

     data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
     if not data_path.exists():
@@ -39,7 +39,10 @@ def train(config: PipelineConfig):

     logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
     trainer.train(
-        data=train_data, epochs=1, batch_size=config.processing.batch_size, dropout_rate=0.3
+        data=train_data,
+        epochs=config.processing.epochs,
+        batch_size=config.processing.batch_size,
+        dropout_rate=0.3,
     )
     trainer.evaluate(eval_data)

@@ -48,13 +51,17 @@ def train(config: PipelineConfig):


 def run_pipeline(config: PipelineConfig, reset: bool = False):
-    if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["engineered"])):
+    if not reset and os.path.exists(
+        config.paths.get_data_path(config.data.output_files["engineered"])
+    ):
         logging.info("Step 1: Feature engineering already done.")
     else:
         logging.info("Step 1: Running feature engineering")
         feature(config)

-    if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["ner_data"])):
+    if not reset and os.path.exists(
+        config.paths.get_data_path(config.data.output_files["ner_data"])
+    ):
         logging.info("Step 2: NER dataset already built.")
     else:
         logging.info("Step 2: Building NER dataset")
@@ -0,0 +1,68 @@
import json
import logging

import spacy
from spacy.tokens import DocBin

from core.config import PipelineConfig
from core.utils.data_loader import DataLoader
from .name_tagger import NameTagger


class NameBuilder:
    def __init__(self, config: PipelineConfig):
        self.config = config
        self.data_loader = DataLoader(config)
        self.tagger = NameTagger()

    def build(self) -> int:
        filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
        df = self.data_loader.load_csv_complete(filepath)
        df = df[["name", "ner_tagged", "ner_entities"]]

        # Filter early
        ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
        if ner_df.empty:
            logging.error("No NER tagged data found")
            return 1

        total_rows = len(df)
        del df  # No need to keep in memory

        logging.info(f"Found {len(ner_df)} NER tagged entries")
        nlp = spacy.blank("fr")

        # Use NERNameTagger for parsing and validation
        parsed_entities = NameTagger.parse_entities(ner_df["ner_entities"])
        validated_entities = NameTagger.validate_entities(ner_df["name"], parsed_entities)

        # Drop rows with no valid entities
        mask = validated_entities.map(bool)
        ner_df = ner_df.loc[mask]
        validated_entities = validated_entities.loc[mask]

        if ner_df.empty:
            logging.error("No valid training examples after validation")
            return 1

        # Prepare training data
        training_data = list(
            zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
        )

        # Use NERNameTagger to create spaCy DocBin
        docs = NameTagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
        doc_bin = DocBin(docs=docs)

        # Save
        json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
        spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
        doc_bin.to_disk(spacy_path)

        logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
        logging.info(f"Saved NER JSON to {json_path}")
        logging.info(f"Saved NER spacy to {spacy_path}")
        return 0
@@ -1,5 +1,5 @@
+import gc
 import random
-from typing import List
 import logging

 import numpy as np
@@ -7,7 +7,7 @@ import pandas as pd
 from tqdm import tqdm

 from core.config import PipelineConfig
-from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
+from core.utils.data_loader import DataLoader
 from processing.ner.formats.connectors_format import ConnectorFormatter
 from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
 from processing.ner.formats.native_only_format import NativeOnlyFormatter
@@ -16,7 +16,7 @@ from processing.ner.formats.position_flipped_format import PositionFlippedFormat
 from processing.ner.formats.reduced_native_format import ReducedNativeFormatter


-class NEREngineering:
+class NameEngineering:
     """
     Feature engineering for NER dataset to prevent position-based learning
     and encourage sequence characteristic learning.
@@ -66,13 +66,16 @@ class NEREngineering:
     def compute(self) -> None:
         logging.info("Applying feature engineering transformations...")
         input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
-        output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
+        output_filepath = self.config.paths.get_data_path(
+            self.config.data.output_files["engineered"]
+        )

         df = self.data_loader.load_csv_complete(input_filepath)
         ner_df = df[df["ner_tagged"] == 1].copy()
         logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records")

         del df  # No need to keep in memory
+        gc.collect()

         ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
             drop=True
@@ -1,3 +1,4 @@
+import ast
 import json
 import logging
 import os
@@ -11,7 +12,7 @@ from spacy.util import minibatch
 from core.config.pipeline_config import PipelineConfig


-class NERNameModel:
+class NameModel:
     """NER model trainer using spaCy for DRC names entity recognition"""

     def __init__(self, config: PipelineConfig):
@@ -84,8 +85,6 @@ class NERNameModel:
         if isinstance(entities_raw, str):
             # String format from tagger: "[(0, 6, 'NATIVE'), ...]"
             try:
-                import ast
-
                 entities = ast.literal_eval(entities_raw)
                 if not isinstance(entities, list):
                     logging.warning(
@@ -175,9 +174,9 @@ class NERNameModel:
     def train(
         self,
         data: List[Tuple[str, Dict]],
-        epochs: int = 5,
-        batch_size: int = 16,
-        dropout_rate: float = 0.2,
+        epochs: int = 1,
+        batch_size: int = 10_000,
+        dropout_rate: float = 0.3,
     ) -> None:
         """Train the NER model"""
         logging.info(f"Starting NER training with {len(data)} examples")
@@ -204,7 +203,7 @@ class NERNameModel:
                 example = Example.from_dict(doc, annotations)
                 examples.append(example)
                 logging.info(
-                    f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}"
+                    f"Training example: {text[:30]} with entities {annotations.get('entities', [])}"
                 )

             # Train in batches
@@ -215,6 +214,7 @@ class NERNameModel:
                 )
                 logging.info(f"Training batch with {len(batch)} examples, current losses: {losses}")

+            del batches  # free memory
             epoch_loss = losses.get("ner", 0)
             losses_history.append(epoch_loss)
             logging.info(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}")
@@ -0,0 +1,273 @@
from typing import Union, Dict, Any, List
import ast
import json
import logging

import pandas as pd
from spacy.util import filter_spans


class NameTagger:
    def tag_name(
        self, name: str, probable_native: str, probable_surname: str
    ) -> Union[Dict[str, Any], None]:
        """Create a single NER training example using probable_native and probable_surname"""
        if not name or not probable_native or not probable_surname:
            return None

        name = name.strip()
        probable_native = probable_native.strip()
        probable_surname = probable_surname.strip()

        entities = []
        used_spans = []  # Track used character spans to prevent overlaps

        # Helper function to check if a span overlaps with any existing span
        def has_overlap(start, end):
            for used_start, used_end in used_spans:
                if not (end <= used_start or start >= used_end):
                    return True
            return False

        # Find positions of native names in the full name
        native_words = probable_native.split()
        name_lower = name.lower()  # Use lowercase for consistent searching
        processed_native_words = set()

        for native_word in native_words:
            native_word = native_word.strip()
            if len(native_word) < 2:  # Skip very short words
                continue

            native_word_lower = native_word.lower()

            # Skip if we've already processed this exact word
            if native_word_lower in processed_native_words:
                continue
            processed_native_words.add(native_word_lower)

            # Find the first occurrence of this native word that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(native_word_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position - make sure we only include the word itself
                end_pos = pos + len(native_word_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != native_word_lower:
                    start_pos = pos + 1
                    continue

                # Check if this is a word boundary match and doesn't overlap
                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "NATIVE"))
                    used_spans.append((pos, end_pos))
                    break  # Only take the first non-overlapping occurrence

                start_pos = pos + 1

        # Find position of surname in the full name
        if probable_surname and len(probable_surname.strip()) >= 2:
            surname_lower = probable_surname.lower()

            # Find the first occurrence that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(surname_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position correctly - exact match only
                end_pos = pos + len(surname_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != surname_lower:
                    start_pos = pos + 1
                    continue

                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "SURNAME"))
                    used_spans.append((pos, end_pos))
                    break

                start_pos = pos + 1

        if not entities:
            logging.warning(
                f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
            )
            return None

        # Sort entities by position and validate
        entities.sort(key=lambda x: x[0])

        # Final validation - ensure no overlaps and valid spans
        validated_entities = []
        for start, end, label in entities:
            # Check bounds
            if not (0 <= start < end <= len(name)):
                logging.warning(
                    f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
                )
                continue

            # Check for overlaps with already validated entities
            if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities):
                logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'")
                continue

            # CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
            span_text = name[start:end]
            if not span_text or span_text != span_text.strip() or " " in span_text:
                logging.warning(
                    f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
                )
                continue

            validated_entities.append((start, end, label))

        if not validated_entities:
            logging.warning(f"No valid entities after validation for: '{name}'")
            return None

        # Convert to string format that matches the dataset
        entities_str = str(validated_entities)

        return {
            "entities": entities_str,
            "spans": validated_entities,  # Keep the original tuples for internal use
        }

    @classmethod
    def _is_word_boundary_match(cls, text: str, start: int, end: int) -> bool:
        """Check if the match is at word boundaries"""
        # Check character before start position
        if start > 0:
            prev_char = text[start - 1]
            if prev_char.isalnum():
                return False

        # Check character after end position
        if end < len(text):
            next_char = text[end]
            if next_char.isalnum():
                return False

        return True

    @classmethod
    def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
        """Extract the actual text for each entity type"""
        result = {"NATIVE": [], "SURNAME": []}

        try:
            entities = ast.literal_eval(entities_str)

            for start, end, label in entities:
                if 0 <= start < end <= len(name):
                    span_text = name[start:end]
                    if label in result:
                        result[label].append(span_text)

        except (ValueError, SyntaxError, TypeError):
            pass

        return result

    @classmethod
    def parse(cls, entities_str: str) -> List[tuple]:
        """Parse entity strings from various formats.

        Supports formats:
        - [(start, end, label), ...]
        - [[start, end, label], ...]
        - [{"start": start, "end": end, "label": label}, ...]
        """
        if not entities_str or entities_str in ["[]", "", "nan"]:
            return []
        entities_str = str(entities_str).strip()
        try:
            if entities_str.startswith("[(") and entities_str.endswith(")]"):
                return ast.literal_eval(entities_str)
            elif entities_str.startswith("[[") and entities_str.endswith("]]"):
                return [tuple(e) for e in ast.literal_eval(entities_str)]
            elif entities_str.startswith("[{") and entities_str.endswith("}]"):
                return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
            else:
                parsed = ast.literal_eval(entities_str)
                return [tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3]
        except (ValueError, SyntaxError, json.JSONDecodeError):
            return []

    def parse_entities(self, series: pd.Series) -> pd.Series:
        """Vectorized parse of entity strings."""
        return series.map(self.parse)

    @classmethod
    def validate(cls, text: str, entities: List[tuple]) -> List[tuple]:
        """Advanced entity validation with overlap removal.

        This is more comprehensive than the basic validate_entities method.
        """
        if not entities or not text:
            return []
        text = str(text).strip()
        valid = []

        for ent in entities:
            if not isinstance(ent, (list, tuple)) or len(ent) != 3:
                continue
            start, end, label = ent
            try:
                start, end = int(start), int(end)
            except (ValueError, TypeError):
                continue
            if not isinstance(label, str):
                continue
            if not (0 <= start < end <= len(text)):
                continue
            if not text[start:end].strip():
                continue
            valid.append((start, end, label))

        if not valid:
            return []

        valid.sort(key=lambda x: (x[0], x[1]))

        # Remove overlaps
        filtered, last_end = [], -1
        for s, e, l in valid:
            if s >= last_end:
                filtered.append((s, e, l))
                last_end = e
        return filtered

    def validate_entities(self, texts: pd.Series, entities_series: pd.Series) -> pd.Series:
        """Vectorized entity validation."""
        return pd.Series(map(self.validate, texts, entities_series), index=texts.index)

    @classmethod
    def create_docs(cls, nlp, texts: List[str], entities: List[List[tuple]]) -> List:
        """Batch create spaCy Docs from texts and entities."""
        docs = []
        for text, ents in zip(texts, entities):
            doc = nlp(text)
            spans = []
            for start, end, label in ents:
                span = doc.char_span(
                    start, end, label=label, alignment_mode="contract"
                ) or doc.char_span(start, end, label=label, alignment_mode="strict")
                if span:
                    spans.append(span)
            doc.ents = filter_spans(spans)
            docs.append(doc)
        return docs
@@ -1,149 +0,0 @@
|
|||||||
import ast
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import spacy
|
|
||||||
from spacy.tokens import DocBin
|
|
||||||
from spacy.util import filter_spans
|
|
||||||
|
|
||||||
from core.config import PipelineConfig
|
|
||||||
from core.utils.data_loader import DataLoader
|
|
||||||
|
|
||||||
|
|
||||||
class NERDataBuilder:
|
|
||||||
def __init__(self, config: PipelineConfig):
|
|
||||||
self.config = config
|
|
||||||
self.data_loader = DataLoader(config)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _parse_entities(series: pd.Series) -> pd.Series:
|
|
||||||
"""Vectorized parse of entity strings."""
|
|
||||||
|
|
||||||
def _parse(entities_str):
|
|
||||||
if not entities_str or entities_str in ["[]", "", "nan"]:
|
|
||||||
return []
|
|
||||||
entities_str = str(entities_str).strip()
|
|
||||||
try:
|
|
||||||
if entities_str.startswith("[(") and entities_str.endswith(")]"):
|
|
||||||
return ast.literal_eval(entities_str)
|
|
||||||
elif entities_str.startswith("[[") and entities_str.endswith("]]"):
|
|
||||||
return [tuple(e) for e in ast.literal_eval(entities_str)]
|
|
||||||
elif entities_str.startswith("[{") and entities_str.endswith("}]"):
|
|
||||||
return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
|
|
||||||
else:
|
|
||||||
parsed = ast.literal_eval(entities_str)
|
|
||||||
return [
|
|
||||||
tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3
|
|
||||||
]
|
|
||||||
except (ValueError, SyntaxError, json.JSONDecodeError):
|
|
||||||
return []
|
|
||||||
|
|
||||||
return series.map(_parse)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _validate_entities(texts: pd.Series, entities_series: pd.Series) -> pd.Series:
|
|
||||||
"""Vectorized entity validation."""
|
|
||||||
|
|
||||||
def _validate(text, entities):
|
|
||||||
if not entities or not text:
|
|
||||||
return []
|
|
||||||
text = str(text).strip()
|
|
||||||
valid = []
|
|
||||||
for ent in entities:
|
|
||||||
if not isinstance(ent, (list, tuple)) or len(ent) != 3:
|
|
||||||
continue
|
|
||||||
start, end, label = ent
|
|
||||||
try:
|
|
||||||
start, end = int(start), int(end)
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
continue
|
|
||||||
if not isinstance(label, str):
|
|
||||||
continue
|
|
||||||
if not (0 <= start < end <= len(text)):
|
|
||||||
continue
|
|
||||||
if not text[start:end].strip():
|
|
||||||
continue
|
|
||||||
valid.append((start, end, label))
|
|
||||||
if not valid:
|
|
||||||
return []
|
|
||||||
valid.sort(key=lambda x: (x[0], x[1]))
|
|
||||||
# remove overlaps
|
|
||||||
filtered, last_end = [], -1
|
|
||||||
for s, e, l in valid:
|
|
||||||
if s >= last_end:
|
|
||||||
filtered.append((s, e, l))
|
|
||||||
last_end = e
|
|
||||||
return filtered
|
|
||||||
|
|
||||||
return pd.Series(map(_validate, texts, entities_series), index=texts.index)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _create_docs(nlp, texts, entities):
|
|
||||||
"""Batch create spaCy Docs."""
|
|
||||||
docs = []
|
|
||||||
for text, ents in zip(texts, entities):
|
|
||||||
doc = nlp(text)
|
|
||||||
spans = []
|
|
||||||
for start, end, label in ents:
|
|
||||||
span = doc.char_span(
|
|
||||||
start, end, label=label, alignment_mode="contract"
|
|
||||||
) or doc.char_span(start, end, label=label, alignment_mode="strict")
|
|
||||||
if span:
|
|
||||||
spans.append(span)
|
|
||||||
doc.ents = filter_spans(spans)
|
|
||||||
docs.append(doc)
|
|
||||||
return docs
|
|
||||||
|
|
||||||
def build(self) -> int:
|
|
||||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
|
|
||||||
df = self.data_loader.load_csv_complete(filepath)
|
|
||||||
df = df[["name", "ner_tagged", "ner_entities"]]
|
|
||||||
|
|
||||||
# Filter early
|
|
||||||
ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
|
|
||||||
if ner_df.empty:
|
|
||||||
logging.error("No NER tagged data found")
|
|
||||||
return 1
|
|
||||||
|
|
||||||
total_rows = len(df)
|
|
||||||
del df # No need to keep in memory
|
|
||||||
|
|
||||||
logging.info(f"Found {len(ner_df)} NER tagged entries")
|
|
||||||
nlp = spacy.blank("fr")
|
|
||||||
|
|
||||||
# Vectorized parsing + validation
|
|
||||||
parsed_entities = self._parse_entities(ner_df["ner_entities"])
|
|
||||||
validated_entities = self._validate_entities(ner_df["name"], parsed_entities)
|
|
||||||
|
|
||||||
# Drop rows with no valid entities
|
|
||||||
mask = validated_entities.map(bool)
|
|
||||||
ner_df = ner_df.loc[mask]
|
|
||||||
validated_entities = validated_entities.loc[mask]
|
|
||||||
|
|
||||||
if ner_df.empty:
|
|
||||||
logging.error("No valid training examples after validation")
|
|
||||||
return 1
|
|
||||||
|
|
||||||
# Prepare training data
|
|
||||||
training_data = list(
|
|
||||||
zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create spaCy DocBin in batch
|
|
||||||
docs = self._create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
|
|
||||||
doc_bin = DocBin(docs=docs)
|
|
||||||
|
|
||||||
# Save
|
|
||||||
json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
|
|
||||||
spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
|
|
||||||
|
|
||||||
with open(json_path, "w", encoding="utf-8") as f:
|
|
||||||
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
|
|
||||||
doc_bin.to_disk(spacy_path)
|
|
||||||
|
|
||||||
logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
|
|
||||||
logging.info(f"Saved NER JSON to {json_path}")
|
|
||||||
logging.info(f"Saved NER spacy to {spacy_path}")
|
|
||||||
return 0
|
|
||||||
|
|||||||
@@ -1,212 +0,0 @@
from typing import Union, Dict, Any, List
import logging


class NERNameTagger:
    def tag_name(
        self, name: str, probable_native: str, probable_surname: str
    ) -> Union[Dict[str, Any], None]:
        """Create a single NER training example using probable_native and probable_surname"""
        if not name or not probable_native or not probable_surname:
            return None

        name = name.strip()
        probable_native = probable_native.strip()
        probable_surname = probable_surname.strip()

        entities = []
        used_spans = []  # Track used character spans to prevent overlaps

        # Helper function to check if a span overlaps with any existing span
        def has_overlap(start, end):
            for used_start, used_end in used_spans:
                if not (end <= used_start or start >= used_end):
                    return True
            return False

        # Find positions of native names in the full name
        native_words = probable_native.split()
        name_lower = name.lower()  # Use lowercase for consistent searching
        processed_native_words = set()

        for native_word in native_words:
            native_word = native_word.strip()
            if len(native_word) < 2:  # Skip very short words
                continue

            native_word_lower = native_word.lower()

            # Skip if we've already processed this exact word
            if native_word_lower in processed_native_words:
                continue
            processed_native_words.add(native_word_lower)

            # Find the first occurrence of this native word that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(native_word_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position - make sure we only include the word itself
                end_pos = pos + len(native_word_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != native_word_lower:
                    start_pos = pos + 1
                    continue

                # Check if this is a word boundary match and doesn't overlap
                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "NATIVE"))
                    used_spans.append((pos, end_pos))
                    break  # Only take the first non-overlapping occurrence

                start_pos = pos + 1

        # Find position of surname in the full name
        if probable_surname and len(probable_surname.strip()) >= 2:
            surname_lower = probable_surname.lower()

            # Find the first occurrence that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(surname_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position correctly - exact match only
                end_pos = pos + len(surname_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != surname_lower:
                    start_pos = pos + 1
                    continue

                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "SURNAME"))
                    used_spans.append((pos, end_pos))
                    break

                start_pos = pos + 1

        if not entities:
            logging.warning(
                f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
            )
            return None

        # Sort entities by position and validate
        entities.sort(key=lambda x: x[0])

        # Final validation - ensure no overlaps and valid spans
        validated_entities = []
        for start, end, label in entities:
            # Check bounds
            if not (0 <= start < end <= len(name)):
                logging.warning(
                    f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
                )
                continue

            # Check for overlaps with already validated entities
            if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities):
                logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'")
                continue

            # CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
            span_text = name[start:end]
            if not span_text or span_text != span_text.strip() or " " in span_text:
                logging.warning(
                    f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
                )
                continue

            validated_entities.append((start, end, label))

        if not validated_entities:
            logging.warning(f"No valid entities after validation for: '{name}'")
            return None

        # Convert to string format that matches the dataset
        entities_str = str(validated_entities)

        return {
            "entities": entities_str,
            "spans": validated_entities,  # Keep the original tuples for internal use
        }

    @classmethod
    def _is_word_boundary_match(cls, text: str, start: int, end: int) -> bool:
        """Check if the match is at word boundaries"""
        # Check character before start position
        if start > 0:
            prev_char = text[start - 1]
            if prev_char.isalnum():
                return False

        # Check character after end position
        if end < len(text):
            next_char = text[end]
            if next_char.isalnum():
                return False

        return True

    @classmethod
    def validate_entities(cls, name: str, entities_str: str) -> bool:
        """Validate that entity annotations are correct for a given name"""
        try:
            import ast

            entities = ast.literal_eval(entities_str)

            # Check for overlaps and valid bounds
            sorted_entities = sorted(entities, key=lambda x: x[0])

            for i, (start, end, label) in enumerate(sorted_entities):
                # Check bounds
                if not (0 <= start < end <= len(name)):
                    return False

                # Check for overlaps with next entity
                if i < len(sorted_entities) - 1:
                    next_start = sorted_entities[i + 1][0]
                    if end > next_start:
                        return False

                # Extract the text span and validate it's not empty
                span_text = name[start:end]
                if not span_text.strip():
                    return False

            return True
        except (ValueError, SyntaxError, TypeError):
            return False

    @classmethod
    def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
        """Extract the actual text for each entity type"""
        result = {"NATIVE": [], "SURNAME": []}

        try:
            import ast

            entities = ast.literal_eval(entities_str)

            for start, end, label in entities:
                if 0 <= start < end <= len(name):
                    span_text = name[start:end]
                    if label in result:
                        result[label].append(span_text)

        except (ValueError, SyntaxError, TypeError):
            pass

        return result
@@ -0,0 +1,89 @@
#!.venv/bin/python3
import argparse
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent
sys.path.insert(0, str(parent_dir))

from core.config import setup_config, PipelineConfig
from core.utils.data_loader import DataLoader
from processing.monitoring.pipeline_monitor import PipelineMonitor
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
from web.interfaces.configuration import Configuration
from web.interfaces.dashboard import Dashboard
from web.interfaces.data_overview import DataOverview
from web.interfaces.data_processing import DataProcessing
from web.interfaces.experiments import Experiments
from web.interfaces.predictions import Predictions
from web.interfaces.results_analysis import ResultsAnalysis

# Page configuration
st.set_page_config(
    page_title="DRC NERS Platform",
    page_icon="🇨🇩",
    layout="wide",
    initial_sidebar_state="expanded",
)


def initialize_session_state(config: PipelineConfig):
    """Initialize session state variables"""
    if "config" not in st.session_state:
        st.session_state.config = config
    if "data_loader" not in st.session_state:
        st.session_state.data_loader = DataLoader(config)
    if "experiment_tracker" not in st.session_state:
        st.session_state.experiment_tracker = ExperimentTracker(config)
    if "experiment_runner" not in st.session_state:
        st.session_state.experiment_runner = ExperimentRunner(config)
    if "pipeline_monitor" not in st.session_state:
        st.session_state.pipeline_monitor = PipelineMonitor()
    if "current_experiment" not in st.session_state:
        st.session_state.current_experiment = None
    if "experiment_results" not in st.session_state:
        st.session_state.experiment_results = {}


class StreamlitApp:
    def __init__(self, config: PipelineConfig):
        self.config = config
        initialize_session_state(config)

    def run(self):
        st.title("🇨🇩 DRC NERS Pipeline")
        st.markdown(
            "A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
        )

        st.markdown(
            """
            ## Overview
            Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often
            underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training
            data.
            This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5
            million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
            """
        )


def main():
    parser = argparse.ArgumentParser(
        description="DRC NERS Platform",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--config", type=str, help="Path to configuration file")
    parser.add_argument("--env", type=str, default="development", help="Environment name")
    args = parser.parse_args()

    config = setup_config(args.config, env=args.env)
    app = StreamlitApp(config)
    app.run()


if __name__ == "__main__":
    main()
@@ -2,11 +2,9 @@ import streamlit as st
|
|||||||
|
|
||||||
|
|
||||||
class Configuration:
|
class Configuration:
|
||||||
"""Handles configuration display and management"""
|
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
def index(self):
|
def index(self):
|
||||||
st.header("Current Configuration")
|
st.title("Configuration")
|
||||||
st.json(self.config.model_dump())
|
st.json(self.config.model_dump())
|
||||||
@@ -20,7 +20,7 @@ class Dashboard:
|
|||||||
self.experiment_runner = experiment_runner
|
self.experiment_runner = experiment_runner
|
||||||
|
|
||||||
def index(self):
|
def index(self):
|
||||||
st.header("Dashboard")
|
st.title("Dashboard")
|
||||||
col1, col2, col3, col4 = st.columns(4)
|
col1, col2, col3, col4 = st.columns(4)
|
||||||
|
|
||||||
# Load basic statistics
|
# Load basic statistics
|
||||||
@@ -21,7 +21,7 @@ class DataOverview:
|
|||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
def index(self):
|
def index(self):
|
||||||
st.header("Data Overview")
|
st.title("Data Overview")
|
||||||
data_files = {
|
data_files = {
|
||||||
"Names": self.config.data.input_file,
|
"Names": self.config.data.input_file,
|
||||||
"Featured Dataset": self.config.data.output_files["featured"],
|
"Featured Dataset": self.config.data.output_files["featured"],
|
||||||
@@ -3,7 +3,7 @@ import plotly.express as px
|
|||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||||
from interface.log_reader import LogReader
|
from web.interfaces.log_reader import LogReader
|
||||||
|
|
||||||
|
|
||||||
@st.cache_data
|
@st.cache_data
|
||||||
@@ -21,7 +21,7 @@ class DataProcessing:
|
|||||||
self.pipeline_monitor = pipeline_monitor
|
self.pipeline_monitor = pipeline_monitor
|
||||||
|
|
||||||
def index(self):
|
def index(self):
|
||||||
st.header("Data Processing Pipeline")
|
st.title("Data Processing")
|
||||||
status = self.pipeline_monitor.get_pipeline_status()
|
status = self.pipeline_monitor.get_pipeline_status()
|
||||||
|
|
||||||
# Overall progress
|
# Overall progress
|
||||||
@@ -12,8 +12,6 @@ from research.model_registry import list_available_models
|
|||||||
|
|
||||||
|
|
||||||
class Experiments:
|
class Experiments:
|
||||||
"""Handles experiment management interface"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
||||||
):
|
):
|
||||||
@@ -22,8 +20,7 @@ class Experiments:
        self.experiment_runner = experiment_runner

    def index(self):
-        """Main experiments page"""
-        st.header("Experiment Management")
+        st.title("Experiments")
        tab1, tab2, tab3 = st.tabs(["New Experiment", "Experiment List", "Batch Experiments"])

        with tab1:
@@ -12,8 +12,6 @@ from research.experiment.experiment_tracker import ExperimentTracker


class Predictions:
-    """Handles prediction interface"""
-
    def __init__(
        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
@@ -22,8 +20,7 @@ class Predictions:
        self.experiment_runner = experiment_runner

    def index(self):
-        """Main predictions page"""
-        st.header("Make Predictions")
+        st.title("Predictions")

        # Load available models
        experiments = self.experiment_tracker.list_experiments()
@@ -11,8 +11,6 @@ from research.experiment.experiment_tracker import ExperimentTracker


class ResultsAnalysis:
-    """Handles experiment results and analysis interface"""
-
    def __init__(
        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
@@ -21,8 +19,7 @@ class ResultsAnalysis:
        self.experiment_runner = experiment_runner

    def index(self):
-        """Main results analysis page"""
-        st.header("Results & Analysis")
+        st.title("Results & Analysis")
        tab1, tab2, tab3 = st.tabs(
            ["Experiment Comparison", "Performance Analysis", "Model Analysis"]
        )
@@ -0,0 +1,22 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.dashboard import Dashboard

st.set_page_config(page_title="Dashboard", page_icon="📊", layout="wide")

if "config" in st.session_state:
    dashboard = Dashboard(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    dashboard.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,18 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.data_overview import DataOverview

st.set_page_config(page_title="Data Overview", page_icon="📋", layout="wide")

if "config" in st.session_state:
    data_overview = DataOverview(st.session_state.config)
    data_overview.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,18 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.data_processing import DataProcessing

st.set_page_config(page_title="Data Processing", page_icon="⚙️", layout="wide")

if "config" in st.session_state:
    data_processing = DataProcessing(st.session_state.config, st.session_state.pipeline_monitor)
    data_processing.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,22 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.experiments import Experiments

st.set_page_config(page_title="Experiments", page_icon="🧪", layout="wide")

if "config" in st.session_state:
    experiments = Experiments(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    experiments.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,22 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.results_analysis import ResultsAnalysis

st.set_page_config(page_title="Results & Analysis", page_icon="📈", layout="wide")

if "config" in st.session_state:
    results_analysis = ResultsAnalysis(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    results_analysis.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,22 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.predictions import Predictions

st.set_page_config(page_title="Predictions", page_icon="🔮", layout="wide")

if "config" in st.session_state:
    predictions = Predictions(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    predictions.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,18 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.configuration import Configuration

st.set_page_config(page_title="Configuration", page_icon="⚙️", layout="wide")

if "config" in st.session_state:
    configuration = Configuration(st.session_state.config)
    configuration.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
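Note: every page script above reads its dependencies from st.session_state, so the main entry script must seed that state before any page is opened. The diff does not show that entry script; the following is only a minimal sketch of how it could look, with the file name, config path, and constructor wiring assumed rather than taken from this commit.

# Hypothetical entry-point sketch (not part of this diff): seed st.session_state
# so the pages above find their shared objects. Import paths and constructor
# arguments are assumptions based on the names visible in the hunks.
import streamlit as st

from core.config import setup_config  # import path assumed
from research.experiment.experiment_runner import ExperimentRunner  # path assumed
from research.experiment.experiment_tracker import ExperimentTracker  # path assumed

st.set_page_config(page_title="Home", layout="wide")

if "config" not in st.session_state:
    # setup_config(path, env=...) matches the call in the removed main() above;
    # the concrete path and environment here are placeholders.
    config = setup_config("config/pipeline.yaml", env="development")
    st.session_state.config = config
    st.session_state.experiment_tracker = ExperimentTracker(config)
    st.session_state.experiment_runner = ExperimentRunner(config)
    # st.session_state.pipeline_monitor would be seeded the same way for the
    # Data Processing page.

st.title("DRC NERS Pipeline")
st.markdown("Use the sidebar to open a page.")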