fix: add missing regions in region_mapper
This commit is contained in:
@@ -17,6 +17,15 @@ class DataSelectionStep(PipelineStep):
|
||||
"""Process a single batch for data selection"""
|
||||
logging.info(f"Selecting columns for batch {batch_id} with {len(batch)} rows")
|
||||
|
||||
# Remove rows where region == "global" only for specific years
|
||||
if "region" in batch.columns and "year" in batch.columns:
|
||||
target_years = {2015, 2021, 2022}
|
||||
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(target_years)
|
||||
removed = int(mask_remove.sum())
|
||||
if removed:
|
||||
batch = batch[~mask_remove]
|
||||
logging.info(f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}")
|
||||
|
||||
# Check which columns exist in the batch
|
||||
available_columns = [col for col in self.selected_columns if col in batch.columns]
|
||||
missing_columns = [col for col in self.selected_columns if col not in batch.columns]
|
||||
|
||||
@@ -70,7 +70,7 @@ class FeatureExtractionStep(PipelineStep):
|
||||
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype("Int16")
|
||||
|
||||
if "region" in result.columns:
|
||||
- result["province"] = self.region_mapper.map(result["region"])
|
||||
+ result["province"] = self.region_mapper.map(result["region"]).str.lower()
|
||||
result["province"] = result["province"].astype("category")
|
||||
|
||||
if "sex" in result.columns:
|
||||
|
||||
Reference in New Issue
Block a user