refactor: clean up imports, make gender normalization a classmethod, rename Province-Orientale to Orientale, and add an annotation-rate dashboard metric

This commit is contained in:
2025-09-20 22:55:24 +02:00
parent 0816207a2c
commit dd2a9f2711
5 changed files with 226 additions and 82 deletions
+23 -18
View File
@@ -23,7 +23,7 @@ class RegionMapper:
"bandundu",
"katanga",
"equateur",
"province-orientale",
"orientale",
"maniema",
"nord-kivu",
"sud-kivu",
@@ -70,6 +70,11 @@ REGION_MAPPING: Dict[str, Tuple[str, str]] = {
"mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
# Katanga → HAUT-KATANGA, HAUT-LOMAMI, LUALABA, TANGANYIKA
"katanga": ("KATANGA", "KATANGA"),
"katanga-1": ("KATANGA", "KATANGA"),
"katanga-2": ("KATANGA", "KATANGA"),
"katanga-3": ("KATANGA", "KATANGA"),
"katanga-4": ("KATANGA", "KATANGA"),
"haut-katanga": ("HAUT-KATANGA", "KATANGA"),
"haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
"haut-katanga-2": ("HAUT-KATANGA", "KATANGA"),
@@ -103,23 +108,23 @@ REGION_MAPPING: Dict[str, Tuple[str, str]] = {
"tshuapa-1": ("TSHUAPA", "EQUATEUR"),
"tshuapa-2": ("TSHUAPA", "EQUATEUR"),
# Province-Orientale
"province-orientale": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
"province-orientale-1": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
"province-orientale-2": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
"province-orientale-3": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
"province-orientale-4": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
"haut-uele": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
"haut-uele-1": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
"haut-uele-2": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
"bas-uele": ("BAS-UELE", "PROVINCE-ORIENTALE"),
"bas-uele-1": ("BAS-UELE", "PROVINCE-ORIENTALE"),
"bas-uele-2": ("BAS-UELE", "PROVINCE-ORIENTALE"),
"ituri": ("ITURI", "PROVINCE-ORIENTALE"),
"ituri-1": ("ITURI", "PROVINCE-ORIENTALE"),
"ituri-2": ("ITURI", "PROVINCE-ORIENTALE"),
"tshopo": ("TSHOPO", "PROVINCE-ORIENTALE"),
"tshopo-1": ("TSHOPO", "PROVINCE-ORIENTALE"),
"tshopo-2": ("TSHOPO", "PROVINCE-ORIENTALE"),
"province-orientale": ("ORIENTALE", "ORIENTALE"),
"province-orientale-1": ("ORIENTALE", "ORIENTALE"),
"province-orientale-2": ("ORIENTALE", "ORIENTALE"),
"province-orientale-3": ("ORIENTALE", "ORIENTALE"),
"province-orientale-4": ("ORIENTALE", "ORIENTALE"),
"haut-uele": ("HAUT-UELE", "ORIENTALE"),
"haut-uele-1": ("HAUT-UELE", "ORIENTALE"),
"haut-uele-2": ("HAUT-UELE", "ORIENTALE"),
"bas-uele": ("BAS-UELE", "ORIENTALE"),
"bas-uele-1": ("BAS-UELE", "ORIENTALE"),
"bas-uele-2": ("BAS-UELE", "ORIENTALE"),
"ituri": ("ITURI", "ORIENTALE"),
"ituri-1": ("ITURI", "ORIENTALE"),
"ituri-2": ("ITURI", "ORIENTALE"),
"tshopo": ("TSHOPO", "ORIENTALE"),
"tshopo-1": ("TSHOPO", "ORIENTALE"),
"tshopo-2": ("TSHOPO", "ORIENTALE"),
# Maniema
"maniema": ("MANIEMA", "MANIEMA"),
"maniema-1": ("MANIEMA", "MANIEMA"),
+193 -60
View File
File diff suppressed because one or more lines are too long
+1 -1
View File
@@ -9,7 +9,7 @@ import pandas as pd
from pydantic import BaseModel
from core.config.pipeline_config import PipelineConfig
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
from core.utils.data_loader import DataLoader
from processing.batch.batch_config import BatchConfig
+2 -1
View File
@@ -150,7 +150,8 @@ class FeatureExtractionStep(PipelineStep):
except Exception as e:
logging.warning(f"NER tagging failed for row {idx}: {e}")
def _normalize_gender(self, series: pd.Series) -> pd.Series:
@classmethod
def _normalize_gender(cls, series: pd.Series) -> pd.Series:
gender_mapping = {
"m": "m",
"male": "m",
+7 -2
View File
@@ -21,7 +21,7 @@ class Dashboard:
def index(self):
st.title("Dashboard")
col1, col2, col3, col4 = st.columns(4)
col1, col2, col3, col4, col5 = st.columns(5)
# Load basic statistics
try:
@@ -44,7 +44,12 @@ class Dashboard:
if "sex" in df.columns:
gender_dist = df["sex"].value_counts()
ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
st.metric("F/M Ratio", f"{ratio:.2f}")
st.metric("F/M Rate", f"{ratio:.2%}")
with col5:
if "annotated" in df.columns:
annotated = (df.get("annotated", 0) == 1).sum()
ratio = annotated / len(df) if len(df) > 0 else 0
st.metric("Annotation Rate", f"{ratio:.2%}")
else:
st.warning("No processed data found. Please run data processing first.")