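refactor: clean up imports and improve gender normalization method

Also renames the PROVINCE-ORIENTALE region to ORIENTALE in the region mapping, adds plain "katanga" / "katanga-1".."katanga-4" keys mapped to KATANGA, converts `_normalize_gender` to a classmethod, and extends the dashboard with a fifth column showing an "Annotation Rate" metric (the F/M metric is now labelled "F/M Rate" and formatted as a percentage).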

This commit is contained in:
2025-09-20 22:55:24 +02:00
parent 0816207a2c
commit dd2a9f2711
5 changed files with 226 additions and 82 deletions
+23 -18
View File
@@ -23,7 +23,7 @@ class RegionMapper:
"bandundu", "bandundu",
"katanga", "katanga",
"equateur", "equateur",
"province-orientale", "orientale",
"maniema", "maniema",
"nord-kivu", "nord-kivu",
"sud-kivu", "sud-kivu",
@@ -70,6 +70,11 @@ REGION_MAPPING: Dict[str, Tuple[str, str]] = {
"mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"), "mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"), "mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
# Katanga → HAUT-KATANGA, HAUT-LOMAMI, LUALABA, TANGANYIKA # Katanga → HAUT-KATANGA, HAUT-LOMAMI, LUALABA, TANGANYIKA
"katanga": ("KATANGA", "KATANGA"),
"katanga-1": ("KATANGA", "KATANGA"),
"katanga-2": ("KATANGA", "KATANGA"),
"katanga-3": ("KATANGA", "KATANGA"),
"katanga-4": ("KATANGA", "KATANGA"),
"haut-katanga": ("HAUT-KATANGA", "KATANGA"), "haut-katanga": ("HAUT-KATANGA", "KATANGA"),
"haut-katanga-1": ("HAUT-KATANGA", "KATANGA"), "haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
"haut-katanga-2": ("HAUT-KATANGA", "KATANGA"), "haut-katanga-2": ("HAUT-KATANGA", "KATANGA"),
@@ -103,23 +108,23 @@ REGION_MAPPING: Dict[str, Tuple[str, str]] = {
"tshuapa-1": ("TSHUAPA", "EQUATEUR"), "tshuapa-1": ("TSHUAPA", "EQUATEUR"),
"tshuapa-2": ("TSHUAPA", "EQUATEUR"), "tshuapa-2": ("TSHUAPA", "EQUATEUR"),
# Province-Orientale # Province-Orientale
"province-orientale": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"), "province-orientale": ("ORIENTALE", "ORIENTALE"),
"province-orientale-1": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"), "province-orientale-1": ("ORIENTALE", "ORIENTALE"),
"province-orientale-2": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"), "province-orientale-2": ("ORIENTALE", "ORIENTALE"),
"province-orientale-3": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"), "province-orientale-3": ("ORIENTALE", "ORIENTALE"),
"province-orientale-4": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"), "province-orientale-4": ("ORIENTALE", "ORIENTALE"),
"haut-uele": ("HAUT-UELE", "PROVINCE-ORIENTALE"), "haut-uele": ("HAUT-UELE", "ORIENTALE"),
"haut-uele-1": ("HAUT-UELE", "PROVINCE-ORIENTALE"), "haut-uele-1": ("HAUT-UELE", "ORIENTALE"),
"haut-uele-2": ("HAUT-UELE", "PROVINCE-ORIENTALE"), "haut-uele-2": ("HAUT-UELE", "ORIENTALE"),
"bas-uele": ("BAS-UELE", "PROVINCE-ORIENTALE"), "bas-uele": ("BAS-UELE", "ORIENTALE"),
"bas-uele-1": ("BAS-UELE", "PROVINCE-ORIENTALE"), "bas-uele-1": ("BAS-UELE", "ORIENTALE"),
"bas-uele-2": ("BAS-UELE", "PROVINCE-ORIENTALE"), "bas-uele-2": ("BAS-UELE", "ORIENTALE"),
"ituri": ("ITURI", "PROVINCE-ORIENTALE"), "ituri": ("ITURI", "ORIENTALE"),
"ituri-1": ("ITURI", "PROVINCE-ORIENTALE"), "ituri-1": ("ITURI", "ORIENTALE"),
"ituri-2": ("ITURI", "PROVINCE-ORIENTALE"), "ituri-2": ("ITURI", "ORIENTALE"),
"tshopo": ("TSHOPO", "PROVINCE-ORIENTALE"), "tshopo": ("TSHOPO", "ORIENTALE"),
"tshopo-1": ("TSHOPO", "PROVINCE-ORIENTALE"), "tshopo-1": ("TSHOPO", "ORIENTALE"),
"tshopo-2": ("TSHOPO", "PROVINCE-ORIENTALE"), "tshopo-2": ("TSHOPO", "ORIENTALE"),
# Maniema # Maniema
"maniema": ("MANIEMA", "MANIEMA"), "maniema": ("MANIEMA", "MANIEMA"),
"maniema-1": ("MANIEMA", "MANIEMA"), "maniema-1": ("MANIEMA", "MANIEMA"),
+192 -59
View File
File diff suppressed because one or more lines are too long
+1 -1
View File
@@ -9,7 +9,7 @@ import pandas as pd
from pydantic import BaseModel from pydantic import BaseModel
from core.config.pipeline_config import PipelineConfig from core.config.pipeline_config import PipelineConfig
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader from core.utils.data_loader import DataLoader
from processing.batch.batch_config import BatchConfig from processing.batch.batch_config import BatchConfig
+2 -1
View File
@@ -150,7 +150,8 @@ class FeatureExtractionStep(PipelineStep):
except Exception as e: except Exception as e:
logging.warning(f"NER tagging failed for row {idx}: {e}") logging.warning(f"NER tagging failed for row {idx}: {e}")
def _normalize_gender(self, series: pd.Series) -> pd.Series: @classmethod
def _normalize_gender(cls, series: pd.Series) -> pd.Series:
gender_mapping = { gender_mapping = {
"m": "m", "m": "m",
"male": "m", "male": "m",
+7 -2
View File
@@ -21,7 +21,7 @@ class Dashboard:
def index(self): def index(self):
st.title("Dashboard") st.title("Dashboard")
col1, col2, col3, col4 = st.columns(4) col1, col2, col3, col4, col5 = st.columns(5)
# Load basic statistics # Load basic statistics
try: try:
@@ -44,7 +44,12 @@ class Dashboard:
if "sex" in df.columns: if "sex" in df.columns:
gender_dist = df["sex"].value_counts() gender_dist = df["sex"].value_counts()
ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1) ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
st.metric("F/M Ratio", f"{ratio:.2f}") st.metric("F/M Rate", f"{ratio:.2%}")
with col5:
if "annotated" in df.columns:
annotated = (df.get("annotated", 0) == 1).sum()
ratio = annotated / len(df) if len(df) > 0 else 0
st.metric("Annotation Rate", f"{ratio:.2%}")
else: else:
st.warning("No processed data found. Please run data processing first.") st.warning("No processed data found. Please run data processing first.")