# drc-ners-nlp/processing/steps/data_cleaning_step.py

import logging

import pandas as pd

from core.config.pipeline_config import PipelineConfig
from core.utils.text_cleaner import TextCleaner
from processing.steps import PipelineStep


class DataCleaningStep(PipelineStep):
    """Pipeline step that cleans one batch of records at a time.

    Each batch goes through three stages, in order: rows with a missing
    value in any required column are discarded, text columns are cleaned by
    ``TextCleaner``, and rows that repeat the same required-column values
    are collapsed.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        """Register the step as ``"data_cleaning"`` and set up its helpers.

        Args:
            pipeline_config: Pipeline configuration, passed through to the
                ``PipelineStep`` base initializer.
        """
        super().__init__("data_cleaning", pipeline_config)
        # Columns that must be populated; they also form the de-duplication key.
        self.required_columns = ["name", "sex", "region"]
        self.text_cleaner = TextCleaner()

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Clean a single batch and return the resulting frame.

        Args:
            batch: Rows to clean; expected to contain every column listed in
                ``self.required_columns``.
            batch_id: Identifier of the batch, used only in the log message.

        Returns:
            The batch after dropping incomplete rows, cleaning text, and
            removing duplicates.
        """
        logging.info(f"Cleaning batch {batch_id} with {len(batch)} rows")

        # Stage 1: discard rows where any required column is missing.
        complete_rows = batch.dropna(subset=self.required_columns)

        # Stage 2: text cleaning is delegated entirely to TextCleaner.
        cleaned_rows = self.text_cleaner.clean_dataframe_text_columns(complete_rows)

        # Stage 3: drop_duplicates keeps the first row for each key by default.
        return cleaned_rows.drop_duplicates(subset=self.required_columns)