feat: enhance logging and memory management across modules
This commit is contained in:
@@ -5,6 +5,7 @@ from typing import Iterator
|
||||
import pandas as pd
|
||||
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
from processing.batch.memory_monitor import MemoryMonitor
|
||||
from processing.steps import PipelineStep
|
||||
|
||||
|
||||
@@ -13,28 +14,36 @@ class BatchProcessor:
|
||||
|
||||
def __init__(self, config: BatchConfig):
|
||||
self.config = config
|
||||
self.memory_monitor = MemoryMonitor()
|
||||
|
||||
def create_batches(self, df: pd.DataFrame) -> Iterator[tuple[pd.DataFrame, int]]:
|
||||
"""Create batches from DataFrame"""
|
||||
"""Create batches from DataFrame without unnecessary copies"""
|
||||
total_rows = len(df)
|
||||
batch_size = self.config.batch_size
|
||||
|
||||
for i in range(0, total_rows, batch_size):
|
||||
batch = df.iloc[i : i + batch_size].copy()
|
||||
batch = df.iloc[i : i + batch_size]
|
||||
batch_id = i // batch_size
|
||||
yield batch, batch_id
|
||||
|
||||
def process_sequential(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process batches sequentially"""
|
||||
"""Memory-optimized sequential processing"""
|
||||
results = []
|
||||
memory_threshold_mb = 1000 # Clean memory when usage exceeds 1 GB
|
||||
|
||||
for batch, batch_id in self.create_batches(df):
|
||||
for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
|
||||
if step.batch_exists(batch_id):
|
||||
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
|
||||
processed_batch = step.load_batch(batch_id)
|
||||
else:
|
||||
try:
|
||||
processed_batch = step.process_batch(batch, batch_id)
|
||||
# Only copy if the processing step requires mutation
|
||||
if step.requires_batch_mutation:
|
||||
batch_copy = batch.copy()
|
||||
processed_batch = step.process_batch(batch_copy, batch_id)
|
||||
else:
|
||||
processed_batch = step.process_batch(batch, batch_id)
|
||||
|
||||
step.save_batch(processed_batch, batch_id)
|
||||
step.state.processed_batches += 1
|
||||
except Exception as e:
|
||||
@@ -44,14 +53,32 @@ class BatchProcessor:
|
||||
|
||||
results.append(processed_batch)
|
||||
|
||||
# Memory management
|
||||
if batch_num % self.config.checkpoint_interval == 0:
|
||||
current_memory = self.memory_monitor.get_memory_usage_mb()
|
||||
if current_memory > memory_threshold_mb:
|
||||
logging.info(f"Memory cleanup triggered at {current_memory:.1f} MB")
|
||||
self.memory_monitor.cleanup_memory()
|
||||
|
||||
# Save state periodically
|
||||
if batch_id % self.config.checkpoint_interval == 0:
|
||||
step.save_state()
|
||||
|
||||
return pd.concat(results, ignore_index=True) if results else pd.DataFrame()
|
||||
# Final memory cleanup before concatenation
|
||||
self.memory_monitor.cleanup_memory()
|
||||
self.memory_monitor.log_memory_usage("before_concat")
|
||||
|
||||
result = self._safe_concat(results) if results else pd.DataFrame()
|
||||
|
||||
# Final cleanup
|
||||
del results
|
||||
self.memory_monitor.cleanup_memory()
|
||||
self.memory_monitor.log_memory_usage("sequential_complete")
|
||||
|
||||
return result
|
||||
|
||||
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process batches concurrently"""
|
||||
"""Memory-optimized concurrent processing"""
|
||||
executor_class = (
|
||||
ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
|
||||
)
|
||||
@@ -65,7 +92,9 @@ class BatchProcessor:
|
||||
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
|
||||
results[batch_id] = step.load_batch(batch_id)
|
||||
else:
|
||||
future = executor.submit(step.process_batch, batch, batch_id)
|
||||
# Only copy if necessary for concurrent processing
|
||||
batch_copy = batch.copy() if step.requires_batch_mutation else batch
|
||||
future = executor.submit(step.process_batch, batch_copy, batch_id)
|
||||
future_to_batch[future] = (batch_id, batch)
|
||||
|
||||
# Collect results as they complete
|
||||
@@ -81,13 +110,24 @@ class BatchProcessor:
|
||||
logging.error(f"Failed to process batch {batch_id}: {e}")
|
||||
step.state.failed_batches.append(batch_id)
|
||||
|
||||
# Reassemble results in order
|
||||
# Memory-efficient reassembly
|
||||
ordered_results = []
|
||||
for batch_id in sorted(results.keys()):
|
||||
ordered_results.append(results[batch_id])
|
||||
|
||||
step.save_state()
|
||||
return pd.concat(ordered_results, ignore_index=True) if ordered_results else pd.DataFrame()
|
||||
|
||||
# Cleanup before concat
|
||||
del results
|
||||
self.memory_monitor.cleanup_memory()
|
||||
|
||||
result = self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
|
||||
|
||||
# Final cleanup
|
||||
del ordered_results
|
||||
self.memory_monitor.cleanup_memory()
|
||||
|
||||
return result
|
||||
|
||||
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process data using the configured strategy"""
|
||||
@@ -95,8 +135,29 @@ class BatchProcessor:
|
||||
step.load_state()
|
||||
|
||||
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
|
||||
self.memory_monitor.log_memory_usage("process_start")
|
||||
|
||||
if self.config.max_workers == 1:
|
||||
return self.process_sequential(step, df)
|
||||
result = self.process_sequential(step, df)
|
||||
else:
|
||||
return self.process_concurrent(step, df)
|
||||
result = self.process_concurrent(step, df)
|
||||
|
||||
self.memory_monitor.log_memory_usage("process_complete")
|
||||
return result
|
||||
|
||||
def _safe_concat(self, dfs: list) -> pd.DataFrame:
|
||||
"""Memory-safe concatenation with monitoring"""
|
||||
if not dfs:
|
||||
return pd.DataFrame()
|
||||
|
||||
memory = self.memory_monitor.get_memory_usage_mb()
|
||||
logging.info(f"Starting concat of {len(dfs)} DataFrames at {memory:.1f} MB")
|
||||
|
||||
# Use copy=False to avoid unnecessary copying during concat
|
||||
result = pd.concat(dfs, ignore_index=True, copy=False)
|
||||
|
||||
# Monitor memory after concat
|
||||
memory = self.memory_monitor.get_memory_usage_mb()
|
||||
logging.info(f"Concat complete. Memory: {memory:.1f} MB")
|
||||
|
||||
return result
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
import gc
|
||||
import logging
|
||||
|
||||
import psutil
|
||||
|
||||
|
||||
class MemoryMonitor:
|
||||
"""Monitor and manage memory usage during batch processing"""
|
||||
|
||||
@staticmethod
|
||||
def get_memory_usage_mb() -> float:
|
||||
"""Get current memory usage in MB"""
|
||||
process = psutil.Process()
|
||||
return process.memory_info().rss / 1024 / 1024
|
||||
|
||||
@staticmethod
|
||||
def cleanup_memory():
|
||||
"""Force garbage collection"""
|
||||
gc.collect()
|
||||
|
||||
@staticmethod
|
||||
def log_memory_usage(step_name: str):
|
||||
"""Log current memory usage"""
|
||||
memory_mb = MemoryMonitor.get_memory_usage_mb()
|
||||
logging.info(f"Memory usage after {step_name}: {memory_mb:.1f} MB")
|
||||
Reference in New Issue
Block a user