feat: add osm data

2025-09-21 16:23:44 +02:00
parent 63e23d6600
commit c1b502c878
39 changed files with 955 additions and 338 deletions
@@ -50,7 +50,9 @@ class StreamlitApp:
    @classmethod
    def run(cls):
        st.title("🇨🇩 DRC NERS Platform")
-        st.markdown("A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference")
+        st.markdown(
+            "A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
+        )
        st.markdown(
            """
            ## Overview
@@ -1,2 +1 @@
 from .ner_testing import NERTesting
-
@@ -13,10 +13,10 @@ from research.model_registry import list_available_models

 class Experiments:
    def __init__(
-            self,
-            config: PipelineConfig,
-            experiment_tracker: ExperimentTracker,
-            experiment_runner: ExperimentRunner
+        self,
+        config: PipelineConfig,
+        experiment_tracker: ExperimentTracker,
+        experiment_runner: ExperimentRunner,
    ):
        self.config = config
        self.experiment_tracker = experiment_tracker
@@ -26,8 +26,7 @@ class Experiments:
    def index(self):
        st.title("Experiments")

-        tab1, tab2, tab3 = st.tabs(
-            ["Templates", "Experiments", "Batch Experiments"])
+        tab1, tab2, tab3 = st.tabs(["Templates", "Experiments", "Batch Experiments"])

        with tab1:
            self.show_template_experiments()
@@ -56,14 +55,18 @@ class Experiments:
                self._show_experiments_by_type(available_experiments["advanced"], "advanced")

            with exp_tabs[2]:
-                self._show_experiments_by_type(available_experiments["feature_study"], "feature_study")
+                self._show_experiments_by_type(
+                    available_experiments["feature_study"], "feature_study"
+                )

            with exp_tabs[3]:
                self._show_experiments_by_type(available_experiments["tuning"], "tuning")

        except Exception as e:
            st.error(f"Error loading experiment templates: {e}")
-            st.info("Make sure the research templates file exists at `config/research_templates.yaml`")
+            st.info(
+                "Make sure the research templates file exists at `config/research_templates.yaml`"
+            )

    def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str):
        """Show experiments for a specific type"""
@@ -142,7 +145,7 @@ class Experiments:
        # Display experiments
        for i, exp in enumerate(experiments):
            with st.expander(
-                    f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
+                f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
            ):
                self._display_experiment_details(exp, i)

@@ -213,7 +216,7 @@ class Experiments:
            experiment_types = st.multiselect(
                "Select Experiment Types",
                ["baseline", "advanced", "feature_study", "tuning"],
-                default=["baseline"]
+                default=["baseline"],
            )

            if experiment_types:
@@ -223,11 +226,11 @@ class Experiments:
                    experiments = available_experiments.get(exp_type, [])
                    if experiments:
                        st.write(f"**{exp_type.title()} Experiments:**")
-                        exp_names = [exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)]
+                        exp_names = [
+                            exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)
+                        ]
                        selected_names = st.multiselect(
-                            f"Select {exp_type} experiments",
-                            exp_names,
-                            key=f"select_{exp_type}"
+                            f"Select {exp_type} experiments", exp_names, key=f"select_{exp_type}"
                        )

                        for name in selected_names:
@@ -308,13 +311,13 @@ class Experiments:
                )

    def run_batch_experiments(
-            self,
-            base_name: str,
-            model_types: List[str],
-            ngram_ranges: str,
-            feature_combinations: List[str],
-            test_sizes: str,
-            tags: str,
+        self,
+        base_name: str,
+        model_types: List[str],
+        ngram_ranges: str,
+        feature_combinations: List[str],
+        test_sizes: str,
+        tags: str,
    ):
        """Run batch experiments with parameter combinations"""
        with st.spinner("Running batch experiments..."):
@@ -38,7 +38,7 @@ class LogReader:

            # Parse log entries from the end
            entries = []
-            for line in reversed(lines[-count * 2:]):  # Read more lines in case some don't match
+            for line in reversed(lines[-count * 2 :]):  # Read more lines in case some don't match
                entry = self._parse_log_line(line.strip())
                if entry:
                    entries.append(entry)
@@ -33,7 +33,9 @@ class NERTesting:

        # Load model
        if not self.load_ner_model():
-            st.warning("NER model could not be loaded. Please ensure the model is trained and available.")
+            st.warning(
+                "NER model could not be loaded. Please ensure the model is trained and available."
+            )
            return

        # Display model information
@@ -53,9 +55,11 @@ class NERTesting:
            col1, col2, col3, col4 = st.columns(4)

            with col1:
-                st.metric("Training Examples", f"{self.training_stats.get('training_examples', 0):,}")
+                st.metric(
+                    "Training Examples", f"{self.training_stats.get('training_examples', 0):,}"
+                )
            with col2:
-                st.metric("Epochs", self.training_stats.get('epochs', 0))
+                st.metric("Epochs", self.training_stats.get("epochs", 0))
            with col3:
                st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}")
            with col4:
@@ -64,7 +68,7 @@ class NERTesting:
    def show_model_evaluation_info(self):
        if self.evaluation_stats:
            col1, col2, col3 = st.columns(4)
-            overall = self.evaluation_stats.get('overall', {})
+            overall = self.evaluation_stats.get("overall", {})

            with col1:
                st.metric("Overall Precision", f"{overall['precision']:.2f}")
@@ -79,7 +83,7 @@ class NERTesting:
        name_input = st.text_input(
            "Name:",
            placeholder="e.g., Jean Baptiste Mukendi, Marie Kabamba Tshiala, Joseph Kasongo",
-            help="Enter a full name or multiple names separated by spaces"
+            help="Enter a full name or multiple names separated by spaces",
        )
        if name_input.strip():
            if st.button("Analyze Name", type="primary"):
@@ -90,12 +94,12 @@ class NERTesting:
            "Names:",
            placeholder="Jean Baptiste Mukendi\nMarie Kabamba Tshiala\nJoseph Kasongo\nGrace Mbuyi Kalala",
            height=150,
-            help="Enter each name on a new line"
+            help="Enter each name on a new line",
        )

        if names_input.strip():
            if st.button("Analyze All Names", type="primary"):
-                names = [name.strip() for name in names_input.split('\n') if name.strip()]
+                names = [name.strip() for name in names_input.split("\n") if name.strip()]
                for i, name in enumerate(names):
                    st.markdown(f"**Name {i+1}: {name}**")
                    self.analyze_and_display(name)
@@ -106,12 +110,12 @@ class NERTesting:
        try:
            result = self.ner_model.predict(text)
            st.subheader("Analysis Results")
-            entities = result.get('entities', [])
+            entities = result.get("entities", [])

            if entities:
                self.show_visual_entities(text, entities)
-                native_count = sum(1 for e in entities if e['label'] == 'NATIVE')
-                surname_count = sum(1 for e in entities if e['label'] == 'SURNAME')
+                native_count = sum(1 for e in entities if e["label"] == "NATIVE")
+                surname_count = sum(1 for e in entities if e["label"] == "SURNAME")

                col1, col2, col3 = st.columns(3)
                with col1:
@@ -134,29 +138,17 @@ class NERTesting:
            # Convert our entities format to spaCy format for displacy
            ents = []
            for entity in entities:
-                ents.append({
-                    "start": entity['start'],
-                    "end": entity['end'],
-                    "label": entity['label']
-                })
+                ents.append(
+                    {"start": entity["start"], "end": entity["end"], "label": entity["label"]}
+                )

            # Create doc-like structure for displacy
-            doc_data = {
-                "text": text,
-                "ents": ents,
-                "title": None
-            }
+            doc_data = {"text": text, "ents": ents, "title": None}

            # Custom colors for our labels
-            colors = {
-                "NATIVE": "#74C0FC",  # Light blue
-                "SURNAME": "#69DB7C"  # Light green
-            }
+            colors = {"NATIVE": "#74C0FC", "SURNAME": "#69DB7C"}  # NATIVE: light blue, SURNAME: light green

-            options = {
-                "colors": colors,
-                "distance": 90
-            }
+            options = {"colors": colors, "distance": 90}

            # Generate HTML visualization
            html = displacy.render(doc_data, style="ent", manual=True, options=options)
@@ -13,7 +13,7 @@ from research.experiment.experiment_tracker import ExperimentTracker

 class Predictions:
    def __init__(
-            self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
+        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
        self.config = config
        self.experiment_tracker = experiment_tracker
@@ -111,7 +111,7 @@ class Predictions:
            return None

    def _display_single_prediction_results(
-            self, prediction: str, confidence: Optional[float], experiment, name_input: str
+        self, prediction: str, confidence: Optional[float], experiment, name_input: str
    ):
        """Display single prediction results"""
        col1, col2 = st.columns(2)
@@ -288,7 +288,7 @@ class Predictions:
            return pd.DataFrame()

    def _run_dataset_prediction(
-            self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
+        self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
    ):
        """Run dataset prediction and display results"""
        with st.spinner("Running predictions..."):
@@ -12,7 +12,7 @@ from research.experiment.experiment_tracker import ExperimentTracker

 class ResultsAnalysis:
    def __init__(
-            self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
+        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
        self.config = config
        self.experiment_tracker = experiment_tracker