Files
drc-ners-nlp/processing/steps/data_selection_step.py
T
2025-09-21 16:23:44 +02:00

44 lines
1.6 KiB
Python

import logging
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from processing.steps import PipelineStep
class DataSelectionStep(PipelineStep):
    """Pipeline step that narrows each batch down to a configured set of columns.

    The list of columns to keep is read once, at construction time, from
    ``pipeline_config.data.selected_columns``.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        """Register the step as ``"data_selection"`` and store the column list."""
        super().__init__("data_selection", pipeline_config)
        self.selected_columns = pipeline_config.data.selected_columns

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Return a copy of ``batch`` restricted to the configured columns.

        Configured columns that are absent from ``batch`` are reported with a
        warning and skipped. If none of the configured columns is present, an
        error is logged and a brand-new empty ``DataFrame`` is returned.

        Args:
            batch: The batch to filter.
            batch_id: Identifier of the batch; only used in log messages.

        Returns:
            A copy holding only the configured columns found in ``batch``, in
            the order they are configured, or an empty ``DataFrame`` when none
            of them exists in ``batch``.
        """
        logging.info(f"Selecting columns for batch {batch_id} with {len(batch)} rows")

        wanted = self.selected_columns
        batch_columns = batch.columns

        # Split the configured names into those the batch has and those it lacks.
        present = [name for name in wanted if name in batch_columns]
        absent = [name for name in wanted if name not in batch_columns]

        if absent:
            logging.warning(f"Missing columns in batch {batch_id}: {absent}")

        # Nothing usable in this batch: report it and hand back an empty frame.
        if not present:
            logging.error(f"No required columns found in batch {batch_id}")
            return pd.DataFrame()

        # Copy so the result is independent of the incoming batch.
        subset = batch[present]
        result = subset.copy()

        summary = f"Selected {len(present)} columns for batch {batch_id}: {present}"
        logging.info(summary)
        return result

    @property
    def requires_batch_mutation(self) -> bool:
        """Always ``True``: this step changes the batch by dropping columns."""
        return True