feat: enhance logging and memory management across modules

2025-08-13 23:09:05 +02:00
parent 47e52d130c
commit 9601c5e44d
48 changed files with 1004 additions and 773 deletions
@@ -9,6 +9,7 @@ import pandas as pd
 from pydantic import BaseModel

 from core.config.pipeline_config import PipelineConfig
+from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
 from processing.batch.batch_config import BatchConfig


@@ -37,10 +38,11 @@ class PipelineStep(ABC):
    """Abstract base class for pipeline steps"""

    def __init__(
-            self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
+        self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
    ):
        self.name = name
        self.pipeline_config = pipeline_config
+        self.data_loader = DataLoader(pipeline_config)

        # Use provided batch_config or create default from pipeline config
        if batch_config is None:
@@ -53,6 +55,11 @@ class PipelineStep(ABC):
        self.batch_config = batch_config
        self.state = PipelineState()

+    @property
+    def requires_batch_mutation(self) -> bool:
+        """Indicates if this step modifies the batch data"""
+        return False
+
    @abstractmethod
    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process a single batch of data"""
@@ -108,12 +115,12 @@ class PipelineStep(ABC):
    def save_batch(self, batch: pd.DataFrame, batch_id: int):
        """Save processed batch to checkpoint"""
        checkpoint_path = self.get_checkpoint_path(batch_id)
-        batch.to_csv(checkpoint_path, index=False)
+        self.data_loader.save_csv(batch, checkpoint_path)
        logging.info(f"Saved batch {batch_id} to {checkpoint_path}")

    def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
        """Load processed batch from checkpoint"""
        checkpoint_path = self.get_checkpoint_path(batch_id)
        if os.path.exists(checkpoint_path):
-            return pd.read_csv(checkpoint_path)
+            return self.data_loader.load_csv_complete(checkpoint_path)
        return None