feat: add osm data

2025-09-21 16:23:44 +02:00
parent 63e23d6600
commit c1b502c878
39 changed files with 955 additions and 338 deletions
@@ -211,7 +211,9 @@ class NameModel:
            for batch in batches:
                batch_losses = {}
                self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
-                logging.info(f"Training batch with {len(batch)} examples, current losses: {batch_losses}")
+                logging.info(
+                    f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
+                )

                # Accumulate into total losses dict
                for k, v in batch_losses.items():
@@ -49,6 +49,9 @@ class Pipeline:
                "processed_batches": step.state.processed_batches,
                "total_batches": step.state.total_batches,
                "failed_batches": len(step.state.failed_batches),
-                "completion_percentage": (step.state.processed_batches / max(1, step.state.total_batches)) * 100,
+                "completion_percentage": (
+                    step.state.processed_batches / max(1, step.state.total_batches)
+                )
+                * 100,
            }
        return progress
@@ -0,0 +1,43 @@
+import logging
+
+import pandas as pd
+
+from core.config.pipeline_config import PipelineConfig
+from processing.steps import PipelineStep
+
+
+class DataSelectionStep(PipelineStep):
+    """Pipeline step that keeps only the columns named in the pipeline configuration.
+
+    The list of columns to keep is read once, at construction time, from
+    ``pipeline_config.data.selected_columns``.
+    """
+
+    def __init__(self, pipeline_config: PipelineConfig):
+        """Register the step under the name "data_selection" and store the column list."""
+        super().__init__("data_selection", pipeline_config)
+        self.selected_columns = pipeline_config.data.selected_columns
+
+    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
+        """Return a copy of ``batch`` restricted to the configured columns.
+
+        Configured columns that are absent from ``batch`` are reported with a
+        warning and otherwise ignored. The returned columns follow the order of
+        ``self.selected_columns``, not the order in which they appear in ``batch``.
+
+        Args:
+            batch: The batch to filter.
+            batch_id: Identifier of the batch; used only in log messages.
+
+        Returns:
+            A new DataFrame holding the configured columns that exist in
+            ``batch``. If none of them exist, a brand-new empty ``pd.DataFrame()``
+            is returned, which carries neither the rows nor the index of ``batch``.
+        """
+        logging.info(f"Selecting columns for batch {batch_id} with {len(batch)} rows")
+
+        # Split the configured columns into those present in the batch and those missing
+        available_columns = [col for col in self.selected_columns if col in batch.columns]
+        missing_columns = [col for col in self.selected_columns if col not in batch.columns]
+
+        # A partially missing column set is tolerated: warn and continue with what exists
+        if missing_columns:
+            logging.warning(f"Missing columns in batch {batch_id}: {missing_columns}")
+
+        # Nothing to select at all: log an error and hand back an empty frame instead of raising
+        if not available_columns:
+            logging.error(f"No required columns found in batch {batch_id}")
+            return pd.DataFrame()  # Return empty DataFrame if no required columns exist
+
+        # Select only the available required columns; .copy() detaches the result from `batch`
+        selected_batch = batch[available_columns].copy()
+
+        logging.info(
+            f"Selected {len(available_columns)} columns for batch {batch_id}: {available_columns}"
+        )
+
+        return selected_batch
+
+    @property
+    def requires_batch_mutation(self) -> bool:
+        """Always ``True``: this step changes the batch by dropping unselected columns.
+
+        NOTE(review): how the pipeline framework consumes this flag is not
+        visible here -- confirm against ``PipelineStep``.
+        """
+        return True