refactoring: add initial pipeline configuration and model classes

This commit is contained in:
2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
+28
View File
@@ -0,0 +1,28 @@
import logging
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from core.utils.text_cleaner import TextCleaner
from processing.steps import PipelineStep
class DataCleaningStep(PipelineStep):
    """Pipeline step that removes incomplete rows and cleans text columns.

    The step registers itself with the base ``PipelineStep`` under the
    name ``"data_cleaning"``.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        """Set up the step from the shared pipeline configuration.

        Args:
            pipeline_config: Configuration object forwarded unchanged to
                the ``PipelineStep`` base initializer.
        """
        super().__init__("data_cleaning", pipeline_config)
        # A row is discarded when any of these columns holds a missing value.
        self.required_columns = ["name", "sex", "region"]
        self.text_cleaner = TextCleaner()

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Clean one batch of rows.

        Args:
            batch: Rows to clean. The input frame itself is not modified by
                the row filtering; ``dropna`` returns a new frame.
            batch_id: Identifier of the batch, used only in the log message.

        Returns:
            Whatever ``TextCleaner.clean_dataframe_text_columns`` returns
            for the rows that have all required columns populated.

        Raises:
            KeyError: Propagated from pandas when a required column is
                absent from ``batch``.
        """
        logging.info(f"Cleaning batch {batch_id} with {len(batch)} rows")

        # Keep only rows where every required column has a value.
        complete_rows = batch.dropna(subset=self.required_columns)

        # Text normalisation is delegated entirely to the shared cleaner.
        return self.text_cleaner.clean_dataframe_text_columns(complete_rows)