feat: add osm data
This commit is contained in:
@@ -211,7 +211,9 @@ class NameModel:
|
||||
for batch in batches:
|
||||
batch_losses = {}
|
||||
self.nlp.update(batch, losses=batch_losses, drop=dropout_rate, sgd=optimizer)
|
||||
logging.info(f"Training batch with {len(batch)} examples, current losses: {batch_losses}")
|
||||
logging.info(
|
||||
f"Training batch with {len(batch)} examples, current losses: {batch_losses}"
|
||||
)
|
||||
|
||||
# Accumulate into total losses dict
|
||||
for k, v in batch_losses.items():
|
||||
|
||||
@@ -49,6 +49,9 @@ class Pipeline:
|
||||
"processed_batches": step.state.processed_batches,
|
||||
"total_batches": step.state.total_batches,
|
||||
"failed_batches": len(step.state.failed_batches),
|
||||
"completion_percentage": (step.state.processed_batches / max(1, step.state.total_batches)) * 100,
|
||||
"completion_percentage": (
|
||||
step.state.processed_batches / max(1, step.state.total_batches)
|
||||
)
|
||||
* 100,
|
||||
}
|
||||
return progress
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from processing.steps import PipelineStep
|
||||
|
||||
|
||||
class DataSelectionStep(PipelineStep):
    """Configuration-driven data selection step to keep only specified columns.

    The column names to keep are read once, at construction time, from
    ``pipeline_config.data.selected_columns``.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        """Register the step as ``"data_selection"`` and store the configured columns."""
        super().__init__("data_selection", pipeline_config)
        self.selected_columns = pipeline_config.data.selected_columns

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Return a copy of ``batch`` restricted to the configured columns.

        Configured columns that are absent from ``batch`` are reported with a
        warning and skipped. If none of the configured columns are present
        (including when no columns are configured at all), an error is logged
        and an empty ``DataFrame`` is returned.

        Args:
            batch: The rows to filter.
            batch_id: Identifier of the batch, used only in log messages.

        Returns:
            A new ``DataFrame`` holding the configured columns found in
            ``batch``, in configuration order, or an empty ``DataFrame`` when
            none were found.
        """
        logging.info(
            "Selecting columns for batch %s with %s rows", batch_id, len(batch)
        )

        # An unset selection is handled like an empty one so that it reaches
        # the "no required columns" path below instead of raising TypeError.
        requested = [] if self.selected_columns is None else self.selected_columns

        # Split the configured columns into present / absent in a single pass.
        available_columns = []
        missing_columns = []
        for col in requested:
            if col in batch.columns:
                available_columns.append(col)
            else:
                missing_columns.append(col)

        if missing_columns:
            logging.warning(
                "Missing columns in batch %s: %s", batch_id, missing_columns
            )

        if not available_columns:
            logging.error("No required columns found in batch %s", batch_id)
            return pd.DataFrame()  # Return empty DataFrame if no required columns exist

        # Copy so that later steps do not write through to the caller's frame.
        selected_batch = batch[available_columns].copy()

        logging.info(
            "Selected %s columns for batch %s: %s",
            len(available_columns),
            batch_id,
            available_columns,
        )

        return selected_batch

    @property
    def requires_batch_mutation(self) -> bool:
        """This step modifies the batch data by selecting columns."""
        return True
|
||||
Reference in New Issue
Block a user