refactor: clean up imports and improve gender normalization method

2025-09-20 22:55:24 +02:00
parent 0816207a2c
commit dd2a9f2711
5 changed files with 226 additions and 82 deletions
@@ -23,7 +23,7 @@ class RegionMapper:
            "bandundu",
            "katanga",
            "equateur",
-            "province-orientale",
+            "orientale",
            "maniema",
            "nord-kivu",
            "sud-kivu",
@@ -70,6 +70,11 @@ REGION_MAPPING: Dict[str, Tuple[str, str]] = {
    "mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
    "mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
    # Katanga → HAUT-KATANGA, HAUT-LOMAMI, LUALABA, TANGANYIKA
+    "katanga": ("KATANGA", "KATANGA"),
+    "katanga-1": ("KATANGA", "KATANGA"),
+    "katanga-2": ("KATANGA", "KATANGA"),
+    "katanga-3": ("KATANGA", "KATANGA"),
+    "katanga-4": ("KATANGA", "KATANGA"),
    "haut-katanga": ("HAUT-KATANGA", "KATANGA"),
    "haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
    "haut-katanga-2": ("HAUT-KATANGA", "KATANGA"),
@@ -103,23 +108,23 @@ REGION_MAPPING: Dict[str, Tuple[str, str]] = {
    "tshuapa-1": ("TSHUAPA", "EQUATEUR"),
    "tshuapa-2": ("TSHUAPA", "EQUATEUR"),
    # Province-Orientale
-    "province-orientale": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
-    "province-orientale-1": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
-    "province-orientale-2": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
-    "province-orientale-3": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
-    "province-orientale-4": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
-    "haut-uele": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
-    "haut-uele-1": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
-    "haut-uele-2": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
-    "bas-uele": ("BAS-UELE", "PROVINCE-ORIENTALE"),
-    "bas-uele-1": ("BAS-UELE", "PROVINCE-ORIENTALE"),
-    "bas-uele-2": ("BAS-UELE", "PROVINCE-ORIENTALE"),
-    "ituri": ("ITURI", "PROVINCE-ORIENTALE"),
-    "ituri-1": ("ITURI", "PROVINCE-ORIENTALE"),
-    "ituri-2": ("ITURI", "PROVINCE-ORIENTALE"),
-    "tshopo": ("TSHOPO", "PROVINCE-ORIENTALE"),
-    "tshopo-1": ("TSHOPO", "PROVINCE-ORIENTALE"),
-    "tshopo-2": ("TSHOPO", "PROVINCE-ORIENTALE"),
+    "province-orientale": ("ORIENTALE", "ORIENTALE"),
+    "province-orientale-1": ("ORIENTALE", "ORIENTALE"),
+    "province-orientale-2": ("ORIENTALE", "ORIENTALE"),
+    "province-orientale-3": ("ORIENTALE", "ORIENTALE"),
+    "province-orientale-4": ("ORIENTALE", "ORIENTALE"),
+    "haut-uele": ("HAUT-UELE", "ORIENTALE"),
+    "haut-uele-1": ("HAUT-UELE", "ORIENTALE"),
+    "haut-uele-2": ("HAUT-UELE", "ORIENTALE"),
+    "bas-uele": ("BAS-UELE", "ORIENTALE"),
+    "bas-uele-1": ("BAS-UELE", "ORIENTALE"),
+    "bas-uele-2": ("BAS-UELE", "ORIENTALE"),
+    "ituri": ("ITURI", "ORIENTALE"),
+    "ituri-1": ("ITURI", "ORIENTALE"),
+    "ituri-2": ("ITURI", "ORIENTALE"),
+    "tshopo": ("TSHOPO", "ORIENTALE"),
+    "tshopo-1": ("TSHOPO", "ORIENTALE"),
+    "tshopo-2": ("TSHOPO", "ORIENTALE"),
    # Maniema
    "maniema": ("MANIEMA", "MANIEMA"),
    "maniema-1": ("MANIEMA", "MANIEMA"),
@@ -9,7 +9,7 @@ import pandas as pd
 from pydantic import BaseModel

 from core.config.pipeline_config import PipelineConfig
-from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
+from core.utils.data_loader import DataLoader
 from processing.batch.batch_config import BatchConfig


@@ -150,7 +150,8 @@ class FeatureExtractionStep(PipelineStep):
            except Exception as e:
                logging.warning(f"NER tagging failed for row {idx}: {e}")

-    def _normalize_gender(self, series: pd.Series) -> pd.Series:
+    @classmethod
+    def _normalize_gender(cls, series: pd.Series) -> pd.Series:
        gender_mapping = {
            "m": "m",
            "male": "m",
@@ -21,7 +21,7 @@ class Dashboard:

    def index(self):
        st.title("Dashboard")
-        col1, col2, col3, col4 = st.columns(4)
+        col1, col2, col3, col4, col5 = st.columns(5)

        # Load basic statistics
        try:
@@ -44,7 +44,12 @@ class Dashboard:
                    if "sex" in df.columns:
                        gender_dist = df["sex"].value_counts()
                        ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
-                        st.metric("F/M Ratio", f"{ratio:.2f}")
+                        st.metric("F/M Rate", f"{ratio:.2%}")
+                with col5:
+                    if "annotated" in df.columns:
+                        annotated = (df.get("annotated", 0) == 1).sum()
+                        ratio = annotated / len(df) if len(df) > 0 else 0
+                        st.metric("Annotation Rate", f"{ratio:.2%}")
            else:
                st.warning("No processed data found. Please run data processing first.")