fix: add missing regions in region_mapper

This commit is contained in:
2025-09-23 00:05:35 +02:00
parent c1b502c878
commit dda83510ac
5 changed files with 939 additions and 101 deletions
+96 -100
View File
@@ -33,129 +33,125 @@ class RegionMapper:
# DRC Region to Province Mapping
REGION_MAPPING: Dict[str, Tuple[str, str]] = {
# Kinshasa
"kinshasa": ("KINSHASA", "KINSHASA"),
"kinshasa-centre": ("KINSHASA", "KINSHASA"),
"kinshasa-est": ("KINSHASA", "KINSHASA"),
"kinshasa-funa": ("KINSHASA", "KINSHASA"),
"kinshasa-lukunga": ("KINSHASA", "KINSHASA"),
"kinshasa-mont-amba": ("KINSHASA", "KINSHASA"),
"kinshasa-ouest": ("KINSHASA", "KINSHASA"),
"kinshasa-plateau": ("KINSHASA", "KINSHASA"),
"kinshasa-tshangu": ("KINSHASA", "KINSHASA"),
# Bas-Congo → Kongo-Central → BAS-CONGO
"bas-congo": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-congo-1": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-congo-2": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-1": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-2": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-3": ("KONGO-CENTRAL", "BAS-CONGO"),
# Kwilu, Kwango, Mai-Ndombe → BANDUNDU
"bandundu": ("BANDUNDU", "BANDUNDU"),
"bandundu-1": ("BANDUNDU", "BANDUNDU"),
"bandundu-2": ("BANDUNDU", "BANDUNDU"),
"bandundu-3": ("BANDUNDU", "BANDUNDU"),
"kwilu": ("KWILU", "BANDUNDU"),
"kwilu-1": ("KWILU", "BANDUNDU"),
"kwilu-2": ("KWILU", "BANDUNDU"),
"kwilu-3": ("KWILU", "BANDUNDU"),
"kwango": ("KWANGO", "BANDUNDU"),
"kwango-1": ("KWANGO", "BANDUNDU"),
"kwango-2": ("KWANGO", "BANDUNDU"),
"mai-ndombe": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-1": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
# Katanga → HAUT-KATANGA, HAUT-LOMAMI, LUALABA, TANGANYIKA
"katanga": ("KATANGA", "KATANGA"),
"katanga-1": ("KATANGA", "KATANGA"),
"katanga-2": ("KATANGA", "KATANGA"),
"katanga-3": ("KATANGA", "KATANGA"),
"katanga-4": ("KATANGA", "KATANGA"),
"haut-katanga": ("HAUT-KATANGA", "KATANGA"),
"haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
"haut-katanga-2": ("HAUT-KATANGA", "KATANGA"),
"haut-lomami": ("HAUT-LOMAMI", "KATANGA"),
"haut-lomami-1": ("HAUT-LOMAMI", "KATANGA"),
"haut-lomami-2": ("HAUT-LOMAMI", "KATANGA"),
"lualaba": ("LUALABA", "KATANGA"),
"lualaba-1": ("LUALABA", "KATANGA"),
"lualaba-2": ("LUALABA", "KATANGA"),
"lualaba-74-corrige-922a": ("LUALABA", "KATANGA"),
"tanganyika": ("TANGANYIKA", "KATANGA"),
"tanganyika-1": ("TANGANYIKA", "KATANGA"),
"tanganyika-2": ("TANGANYIKA", "KATANGA"),
# Equateur → MONGALA, NORD-UBANGI, SUD-UBANGI, TSHUAPA
"bas-congo": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-congo-1": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-congo-2": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-fleuve": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-uele": ("BAS-UELE", "ORIENTALE"),
"bas-uele-1": ("BAS-UELE", "ORIENTALE"),
"bas-uele-2": ("BAS-UELE", "ORIENTALE"),
"cataractes": ("KONGO-CENTRAL", "BAS-CONGO"),
"equateur": ("EQUATEUR", "EQUATEUR"),
"equateur-1": ("EQUATEUR", "EQUATEUR"),
"equateur-2": ("EQUATEUR", "EQUATEUR"),
"equateur-3": ("EQUATEUR", "EQUATEUR"),
"equateur-4": ("EQUATEUR", "EQUATEUR"),
"equateur-5": ("EQUATEUR", "EQUATEUR"),
"haut-katanga": ("HAUT-KATANGA", "KATANGA"),
"haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
"haut-katanga-2": ("HAUT-KATANGA", "KATANGA"),
"haut-lomami": ("HAUT-LOMAMI", "KATANGA"),
"haut-lomami-1": ("HAUT-LOMAMI", "KATANGA"),
"haut-lomami-2": ("HAUT-LOMAMI", "KATANGA"),
"haut-uele": ("HAUT-UELE", "ORIENTALE"),
"haut-uele-1": ("HAUT-UELE", "ORIENTALE"),
"haut-uele-2": ("HAUT-UELE", "ORIENTALE"),
"ituri": ("ITURI", "ORIENTALE"),
"ituri-1": ("ITURI", "ORIENTALE"),
"ituri-2": ("ITURI", "ORIENTALE"),
"ituri-3": ("ITURI", "ORIENTALE"),
"kasai": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-1": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-2": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-ce": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-central": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-central-1": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-central-2": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-occidental": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai-occidental-1": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai-occidental-2": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai-oriental": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-1": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-2": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-3": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-orientale": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"katanga": ("KATANGA", "KATANGA"),
"katanga-1": ("KATANGA", "KATANGA"),
"katanga-2": ("KATANGA", "KATANGA"),
"katanga-3": ("KATANGA", "KATANGA"),
"katanga-4": ("KATANGA", "KATANGA"),
"kinshasa": ("KINSHASA", "KINSHASA"),
"kinshasa-centre": ("KINSHASA", "KINSHASA"),
"kinshasa-est": ("KINSHASA", "KINSHASA"),
"kinshasa-funa": ("KINSHASA", "KINSHASA"),
"kinshasa-global": ("KINSHASA", "KINSHASA"),
"kinshasa-lukunga": ("KINSHASA", "KINSHASA"),
"kinshasa-mont-amba": ("KINSHASA", "KINSHASA"),
"kinshasa-ouest": ("KINSHASA", "KINSHASA"),
"kinshasa-plateau": ("KINSHASA", "KINSHASA"),
"kinshasa-tshangu": ("KINSHASA", "KINSHASA"),
"kongo-central": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-1": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-2": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-3": ("KONGO-CENTRAL", "BAS-CONGO"),
"kwango": ("KWANGO", "BANDUNDU"),
"kwango-1": ("KWANGO", "BANDUNDU"),
"kwango-2": ("KWANGO", "BANDUNDU"),
"kwilu": ("KWILU", "BANDUNDU"),
"kwilu-1": ("KWILU", "BANDUNDU"),
"kwilu-2": ("KWILU", "BANDUNDU"),
"kwilu-3": ("KWILU", "BANDUNDU"),
"lomami": ("LOMAMI", "KASAI-ORIENTAL"),
"lomami-1": ("LOMAMI", "KASAI-ORIENTAL"),
"lomami-2": ("LOMAMI", "KASAI-ORIENTAL"),
"lualaba": ("LUALABA", "KATANGA"),
"lualaba-1": ("LUALABA", "KATANGA"),
"lualaba-2": ("LUALABA", "KATANGA"),
"lualaba-74-corrige-922a": ("LUALABA", "KATANGA"),
"lukaya": ("KONGO-CENTRAL", "BAS-CONGO"),
"mai-ndombe": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-1": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
"maniema": ("MANIEMA", "MANIEMA"),
"maniema-1": ("MANIEMA", "MANIEMA"),
"maniema-2": ("MANIEMA", "MANIEMA"),
"mongala": ("MONGALA", "EQUATEUR"),
"mongala-1": ("MONGALA", "EQUATEUR"),
"mongala-2": ("MONGALA", "EQUATEUR"),
"nord-kivu": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-1": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-2": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-3": ("NORD-KIVU", "NORD-KIVU"),
"nord-ubangi": ("NORD-UBANGI", "EQUATEUR"),
"nord-ubangi-1": ("NORD-UBANGI", "EQUATEUR"),
"nord-ubangi-2": ("NORD-UBANGI", "EQUATEUR"),
"sud-ubangi": ("SUD-UBANGI", "EQUATEUR"),
"sud-ubangi-1": ("SUD-UBANGI", "EQUATEUR"),
"sud-ubangi-2": ("SUD-UBANGI", "EQUATEUR"),
"tshuapa": ("TSHUAPA", "EQUATEUR"),
"tshuapa-1": ("TSHUAPA", "EQUATEUR"),
"tshuapa-2": ("TSHUAPA", "EQUATEUR"),
# Province-Orientale
"province-orientale": ("ORIENTALE", "ORIENTALE"),
"province-orientale-1": ("ORIENTALE", "ORIENTALE"),
"province-orientale-2": ("ORIENTALE", "ORIENTALE"),
"province-orientale-3": ("ORIENTALE", "ORIENTALE"),
"province-orientale-4": ("ORIENTALE", "ORIENTALE"),
"haut-uele": ("HAUT-UELE", "ORIENTALE"),
"haut-uele-1": ("HAUT-UELE", "ORIENTALE"),
"haut-uele-2": ("HAUT-UELE", "ORIENTALE"),
"bas-uele": ("BAS-UELE", "ORIENTALE"),
"bas-uele-1": ("BAS-UELE", "ORIENTALE"),
"bas-uele-2": ("BAS-UELE", "ORIENTALE"),
"ituri": ("ITURI", "ORIENTALE"),
"ituri-1": ("ITURI", "ORIENTALE"),
"ituri-2": ("ITURI", "ORIENTALE"),
"tshopo": ("TSHOPO", "ORIENTALE"),
"tshopo-1": ("TSHOPO", "ORIENTALE"),
"tshopo-2": ("TSHOPO", "ORIENTALE"),
# Maniema
"maniema": ("MANIEMA", "MANIEMA"),
"maniema-1": ("MANIEMA", "MANIEMA"),
"maniema-2": ("MANIEMA", "MANIEMA"),
# Nord-Kivu
"nord-kivu": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-1": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-2": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-3": ("NORD-KIVU", "NORD-KIVU"),
# Sud-Kivu
"sankuru": ("SANKURU", "KASAI-ORIENTAL"),
"sankuru-1": ("SANKURU", "KASAI-ORIENTAL"),
"sankuru-2": ("SANKURU", "KASAI-ORIENTAL"),
"sud-kivu": ("SUD-KIVU", "SUD-KIVU"),
"sud-kivu-1": ("SUD-KIVU", "SUD-KIVU"),
"sud-kivu-2": ("SUD-KIVU", "SUD-KIVU"),
"sud-kivu-3": ("SUD-KIVU", "SUD-KIVU"),
# Kasai-Occidental → KASAI, KASAI-CENTRAL
"kasai-occidental": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai-occidental-1": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai-occidental-2": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-1": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-2": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-central": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-central-1": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-central-2": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
# Kasai-Oriental → LOMAMI, SANKURU
"kasai-oriental": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-1": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-2": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-3": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"lomami": ("LOMAMI", "KASAI-ORIENTAL"),
"lomami-1": ("LOMAMI", "KASAI-ORIENTAL"),
"lomami-2": ("LOMAMI", "KASAI-ORIENTAL"),
"sankuru": ("SANKURU", "KASAI-ORIENTAL"),
"sankuru-1": ("SANKURU", "KASAI-ORIENTAL"),
"sankuru-2": ("SANKURU", "KASAI-ORIENTAL"),
"sud-ubangi": ("SUD-UBANGI", "EQUATEUR"),
"sud-ubangi-1": ("SUD-UBANGI", "EQUATEUR"),
"sud-ubangi-2": ("SUD-UBANGI", "EQUATEUR"),
"tanganyika": ("TANGANYIKA", "KATANGA"),
"tanganyika-1": ("TANGANYIKA", "KATANGA"),
"tanganyika-2": ("TANGANYIKA", "KATANGA"),
"tshopo": ("TSHOPO", "ORIENTALE"),
"tshopo-1": ("TSHOPO", "ORIENTALE"),
"tshopo-2": ("TSHOPO", "ORIENTALE"),
"tshuapa": ("TSHUAPA", "EQUATEUR"),
"tshuapa-1": ("TSHUAPA", "EQUATEUR"),
"tshuapa-2": ("TSHUAPA", "EQUATEUR"),
}
+832
View File
File diff suppressed because one or more lines are too long
@@ -21,6 +21,7 @@ class PipelineMonitor:
self.checkpoint_dir = paths.checkpoints_dir
self.steps = [
"data_cleaning",
"data_selection",
"feature_extraction",
"ner_annotation",
"llm_annotation",
+9
View File
@@ -17,6 +17,15 @@ class DataSelectionStep(PipelineStep):
"""Process a single batch for data selection"""
logging.info(f"Selecting columns for batch {batch_id} with {len(batch)} rows")
# Remove rows where region == "global" only for specific years
if "region" in batch.columns and "year" in batch.columns:
target_years = {2015, 2021, 2022}
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(target_years)
removed = int(mask_remove.sum())
if removed:
batch = batch[~mask_remove]
logging.info(f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}")
# Check which columns exist in the batch
available_columns = [col for col in self.selected_columns if col in batch.columns]
missing_columns = [col for col in self.selected_columns if col not in batch.columns]
+1 -1
View File
@@ -70,7 +70,7 @@ class FeatureExtractionStep(PipelineStep):
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype("Int16")
if "region" in result.columns:
result["province"] = self.region_mapper.map(result["region"])
result["province"] = self.region_mapper.map(result["region"]).str.lower()
result["province"] = result["province"].astype("category")
if "sex" in result.columns: