feat: enhance logging and memory management across modules

2025-08-13 23:09:05 +02:00
parent 47e52d130c
commit 9601c5e44d
48 changed files with 1004 additions and 773 deletions
@@ -5,6 +5,7 @@ from typing import Iterator
 import pandas as pd

 from processing.batch.batch_config import BatchConfig
+from processing.batch.memory_monitor import MemoryMonitor
 from processing.steps import PipelineStep


@@ -13,28 +14,36 @@ class BatchProcessor:

    def __init__(self, config: BatchConfig):
        self.config = config
+        self.memory_monitor = MemoryMonitor()

    def create_batches(self, df: pd.DataFrame) -> Iterator[tuple[pd.DataFrame, int]]:
-        """Create batches from DataFrame"""
+        """Create batches from DataFrame without unnecessary copies"""
        total_rows = len(df)
        batch_size = self.config.batch_size

        for i in range(0, total_rows, batch_size):
-            batch = df.iloc[i : i + batch_size].copy()
+            batch = df.iloc[i : i + batch_size]
            batch_id = i // batch_size
            yield batch, batch_id

    def process_sequential(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
-        """Process batches sequentially"""
+        """Memory-optimized sequential processing"""
        results = []
+        memory_threshold_mb = 1000  # Clean memory when usage exceeds 1 GB

-        for batch, batch_id in self.create_batches(df):
+        for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
            if step.batch_exists(batch_id):
                logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
                processed_batch = step.load_batch(batch_id)
            else:
                try:
-                    processed_batch = step.process_batch(batch, batch_id)
+                    # Only copy if the processing step requires mutation
+                    if step.requires_batch_mutation:
+                        batch_copy = batch.copy()
+                        processed_batch = step.process_batch(batch_copy, batch_id)
+                    else:
+                        processed_batch = step.process_batch(batch, batch_id)
+
                    step.save_batch(processed_batch, batch_id)
                    step.state.processed_batches += 1
                except Exception as e:
@@ -44,14 +53,32 @@ class BatchProcessor:

            results.append(processed_batch)

+            # Memory management
+            if batch_num % self.config.checkpoint_interval == 0:
+                current_memory = self.memory_monitor.get_memory_usage_mb()
+                if current_memory > memory_threshold_mb:
+                    logging.info(f"Memory cleanup triggered at {current_memory:.1f} MB")
+                    self.memory_monitor.cleanup_memory()
+
            # Save state periodically
            if batch_id % self.config.checkpoint_interval == 0:
                step.save_state()

-        return pd.concat(results, ignore_index=True) if results else pd.DataFrame()
+        # Final memory cleanup before concatenation
+        self.memory_monitor.cleanup_memory()
+        self.memory_monitor.log_memory_usage("before_concat")
+
+        result = self._safe_concat(results) if results else pd.DataFrame()
+
+        # Final cleanup
+        del results
+        self.memory_monitor.cleanup_memory()
+        self.memory_monitor.log_memory_usage("sequential_complete")
+
+        return result

    def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
-        """Process batches concurrently"""
+        """Memory-optimized concurrent processing"""
        executor_class = (
            ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
        )
@@ -65,7 +92,9 @@ class BatchProcessor:
                    logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
                    results[batch_id] = step.load_batch(batch_id)
                else:
-                    future = executor.submit(step.process_batch, batch, batch_id)
+                    # Only copy if necessary for concurrent processing
+                    batch_copy = batch.copy() if step.requires_batch_mutation else batch
+                    future = executor.submit(step.process_batch, batch_copy, batch_id)
                    future_to_batch[future] = (batch_id, batch)

            # Collect results as they complete
@@ -81,13 +110,24 @@ class BatchProcessor:
                    logging.error(f"Failed to process batch {batch_id}: {e}")
                    step.state.failed_batches.append(batch_id)

-        # Reassemble results in order
+        # Memory-efficient reassembly
        ordered_results = []
        for batch_id in sorted(results.keys()):
            ordered_results.append(results[batch_id])

        step.save_state()
-        return pd.concat(ordered_results, ignore_index=True) if ordered_results else pd.DataFrame()
+
+        # Cleanup before concat
+        del results
+        self.memory_monitor.cleanup_memory()
+
+        result = self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
+
+        # Final cleanup
+        del ordered_results
+        self.memory_monitor.cleanup_memory()
+
+        return result

    def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
        """Process data using the configured strategy"""
@@ -95,8 +135,29 @@ class BatchProcessor:
        step.load_state()

        logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
+        self.memory_monitor.log_memory_usage("process_start")

        if self.config.max_workers == 1:
-            return self.process_sequential(step, df)
+            result = self.process_sequential(step, df)
        else:
-            return self.process_concurrent(step, df)
+            result = self.process_concurrent(step, df)
+
+        self.memory_monitor.log_memory_usage("process_complete")
+        return result
+
+    def _safe_concat(self, dfs: list) -> pd.DataFrame:
+        """Memory-safe concatenation with monitoring"""
+        if not dfs:
+            return pd.DataFrame()
+
+        memory = self.memory_monitor.get_memory_usage_mb()
+        logging.info(f"Starting concat of {len(dfs)} DataFrames at {memory:.1f} MB")
+
+        # Use copy=False to avoid unnecessary copying during concat
+        result = pd.concat(dfs, ignore_index=True, copy=False)
+
+        # Monitor memory after concat
+        memory = self.memory_monitor.get_memory_usage_mb()
+        logging.info(f"Concat complete. Memory: {memory:.1f} MB")
+
+        return result
@@ -0,0 +1,25 @@
+import gc
+import logging
+
+import psutil
+
+
+class MemoryMonitor:
+    """Monitor and manage memory usage during batch processing"""
+
+    @staticmethod
+    def get_memory_usage_mb() -> float:
+        """Get current memory usage in MB"""
+        process = psutil.Process()
+        return process.memory_info().rss / 1024 / 1024
+
+    @staticmethod
+    def cleanup_memory():
+        """Force garbage collection"""
+        gc.collect()
+
+    @staticmethod
+    def log_memory_usage(step_name: str):
+        """Log current memory usage"""
+        memory_mb = MemoryMonitor.get_memory_usage_mb()
+        logging.info(f"Memory usage after {step_name}: {memory_mb:.1f} MB")