hotfixes

2025-08-16 20:34:45 +02:00
parent 84f7d41a84
commit cf1cbac1a8
36 changed files with 95 additions and 152 deletions
@@ -0,0 +1,8 @@
+notebooks/* linguist-vendored
+
+linguist-detectable=false
+
+*.ipynb linguist-detectable=false
+
+# Enforce Unix newlines
+*.py   text eol=lf
@@ -24,29 +24,29 @@ def build(config: PipelineConfig):

 def train(config: PipelineConfig):
    """Train the NER model."""
-    trainer = NameModel(config)
+    name_model = NameModel(config)

    data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
    if not data_path.exists():
        logging.info("NER data not found. Building dataset first...")
        build(config)

-    trainer.create_blank_model("fr")
-    data = trainer.load_data(str(data_path))
+    name_model.create_blank_model("fr")
+    data = name_model.load_data(str(data_path))

    split_idx = int(len(data) * 0.9)
    train_data, eval_data = data[:split_idx], data[split_idx:]

    logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
-    trainer.train(
+    name_model.train(
        data=train_data,
        epochs=config.processing.epochs,
        batch_size=config.processing.batch_size,
        dropout_rate=0.3,
    )
-    trainer.evaluate(eval_data)
+    name_model.evaluate(eval_data)

-    model_path = trainer.save()
+    model_path = name_model.save()
    logging.info(f"Model saved to: {model_path}")


@@ -1,8 +1,8 @@
 import logging
+import time
+from typing import Dict, Any

 import pandas as pd
-from typing import Dict, Any
-import time

 from processing.batch.batch_config import BatchConfig
 from processing.batch.batch_processor import BatchProcessor
@@ -49,9 +49,6 @@ class Pipeline:
                "processed_batches": step.state.processed_batches,
                "total_batches": step.state.total_batches,
                "failed_batches": len(step.state.failed_batches),
-                "completion_percentage": (
-                    step.state.processed_batches / max(1, step.state.total_batches)
-                )
-                * 100,
+                "completion_percentage": (step.state.processed_batches / max(1, step.state.total_batches)) * 100,
            }
        return progress
@@ -7,7 +7,7 @@ import pandas as pd

 from core.config.pipeline_config import PipelineConfig
 from core.utils.region_mapper import RegionMapper
-from processing.ner.ner_name_tagger import NERNameTagger
+from processing.ner.name_tagger import NameTagger
 from processing.steps import PipelineStep


@@ -27,7 +27,7 @@ class FeatureExtractionStep(PipelineStep):
    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("feature_extraction", pipeline_config)
        self.region_mapper = RegionMapper()
-        self.name_tagger = NERNameTagger()
+        self.name_tagger = NameTagger()

    @classmethod
    def requires_batch_mutation(cls) -> bool:
@@ -6,7 +6,7 @@ from typing import Dict
 import pandas as pd

 from core.config.pipeline_config import PipelineConfig
-from processing.ner.ner_name_model import NERNameModel
+from processing.ner.name_model import NameModel
 from processing.steps import PipelineStep, NameAnnotation


@@ -19,7 +19,7 @@ class NERAnnotationStep(PipelineStep):

        self.model_name = "drc_ner_model"
        self.model_path = pipeline_config.paths.models_dir / "drc_ner_model"
-        self.ner_trainer = NERNameModel(pipeline_config)
+        self.name_model = NameModel(pipeline_config)
        self.ner_config = pipeline_config.annotation.ner

        # Statistics
@@ -35,19 +35,19 @@ class NERAnnotationStep(PipelineStep):
        try:
            if self.model_path.exists():
                logging.info(f"Loading NER model from {self.model_path}")
-                self.ner_trainer.load(str(self.model_path))
+                self.name_model.load(str(self.model_path))
                logging.info("NER model loaded successfully")
            else:
                logging.warning(f"NER model not found at {self.model_path}")
                logging.warning("NER annotation will be skipped. Train the model first.")
-                self.ner_trainer.nlp = None
+                self.name_model.nlp = None
        except Exception as e:
            logging.error(f"Failed to load NER model: {e}")
-            self.ner_trainer.nlp = None
+            self.name_model.nlp = None

    def analyze_name(self, name: str) -> Dict:
        """Analyze a name with retry logic"""
-        if self.ner_trainer.nlp is None:
+        if self.name_model.nlp is None:
            return {
                "identified_name": None,
                "identified_surname": None,
@@ -62,7 +62,7 @@ class NERAnnotationStep(PipelineStep):
                start_time = time.time()

                # Get NER predictions
-                prediction = self.ner_trainer.predict(name.lower())
+                prediction = self.name_model.predict(name.lower())
                entities = prediction.get("entities", [])

                elapsed_time = time.time() - start_time
@@ -7,7 +7,6 @@ from typing import Optional, Dict, List
 import pandas as pd

 from core.config import PipelineConfig, get_config
-
 from research.experiment import ExperimentConfig, ExperimentStatus
 from research.experiment.experiement_result import ExperimentResult

@@ -2,6 +2,7 @@
 import argparse
 import sys
 from pathlib import Path
+
 import streamlit as st

 # Add parent directory to Python path to access core modules
@@ -13,13 +14,6 @@ from core.utils.data_loader import DataLoader
 from processing.monitoring.pipeline_monitor import PipelineMonitor
 from research.experiment.experiment_runner import ExperimentRunner
 from research.experiment.experiment_tracker import ExperimentTracker
-from web.interfaces.configuration import Configuration
-from web.interfaces.dashboard import Dashboard
-from web.interfaces.data_overview import DataOverview
-from web.interfaces.data_processing import DataProcessing
-from web.interfaces.experiments import Experiments
-from web.interfaces.predictions import Predictions
-from web.interfaces.results_analysis import ResultsAnalysis

 # Page configuration
 st.set_page_config(
@@ -53,12 +47,10 @@ class StreamlitApp:
        self.config = config
        initialize_session_state(config)

-    def run(self):
-        st.title("🇨🇩 DRC NERS Pipeline")
-        st.markdown(
-            "A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
-        )
-
+    @classmethod
+    def run(cls):
+        st.title("🇨🇩 DRC NERS Platform")
+        st.markdown("A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference")
        st.markdown(
            """
            ## Overview
@@ -230,7 +230,8 @@ class Experiments:

        return experiments

-    def _display_experiment_details(self, exp, index: int):
+    @classmethod
+    def _display_experiment_details(cls, exp, index: int):
        """Display details for a single experiment"""
        col1, col2, col3 = st.columns(3)

@@ -368,64 +369,3 @@ class Experiments:

            except Exception as e:
                st.error(f"Error running batch experiments: {e}")
-
-    def run_baseline_experiments(self):
-        """Run baseline experiments"""
-        with st.spinner("Running baseline experiments..."):
-            try:
-                builder = ExperimentBuilder()
-                experiments = builder.create_baseline_experiments()
-                experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
-
-                st.success(f"Completed {len(experiment_ids)} baseline experiments")
-
-                # Show quick comparison
-                if experiment_ids:
-                    comparison = self.experiment_runner.compare_experiments(experiment_ids)
-                    st.write("**Results Summary:**")
-                    st.dataframe(
-                        comparison[["name", "model_type", "test_accuracy"]],
-                        use_container_width=True,
-                    )
-
-            except Exception as e:
-                st.error(f"Error running baseline experiments: {e}")
-
-    def run_ablation_study(self):
-        """Run feature ablation study"""
-        with st.spinner("Running ablation study..."):
-            try:
-                builder = ExperimentBuilder()
-                experiments = builder.create_feature_ablation_study()
-                experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
-
-                st.success(f"Completed {len(experiment_ids)} ablation experiments")
-
-            except Exception as e:
-                st.error(f"Error running ablation study: {e}")
-
-    def run_component_study(self):
-        """Run name component study"""
-        with st.spinner("Running component study..."):
-            try:
-                builder = ExperimentBuilder()
-                experiments = builder.create_name_component_study()
-                experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
-
-                st.success(f"Completed {len(experiment_ids)} component experiments")
-
-            except Exception as e:
-                st.error(f"Error running component study: {e}")
-
-    def run_province_study(self):
-        """Run province-specific study"""
-        with st.spinner("Running province study..."):
-            try:
-                builder = ExperimentBuilder()
-                experiments = builder.create_province_specific_study()
-                experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
-
-                st.success(f"Completed {len(experiment_ids)} province experiments")
-
-            except Exception as e:
-                st.error(f"Error running province study: {e}")
@@ -38,7 +38,7 @@ class LogReader:

            # Parse log entries from the end
            entries = []
-            for line in reversed(lines[-count * 2 :]):  # Read more lines in case some don't match
+            for line in reversed(lines[-count * 2:]):  # Read more lines in case some don't match
                entry = self._parse_log_line(line.strip())
                if entry:
                    entries.append(entry)
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
+
 import streamlit as st

 # Add parent directory to Python path to access core modules
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
+
 import streamlit as st

 # Add parent directory to Python path to access core modules
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
+
 import streamlit as st

 # Add parent directory to Python path to access core modules
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
+
 import streamlit as st

 # Add parent directory to Python path to access core modules
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
+
 import streamlit as st

 # Add parent directory to Python path to access core modules
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
+
 import streamlit as st

 # Add parent directory to Python path to access core modules
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
+
 import streamlit as st

 # Add parent directory to Python path to access core modules