refactoring: add initial pipeline configuration and model classes
This commit is contained in:
@@ -0,0 +1,57 @@
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
from typing import Dict, Any
|
||||
import time
|
||||
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
from processing.batch.batch_processor import BatchProcessor
|
||||
from processing.steps import PipelineStep
|
||||
|
||||
|
||||
class Pipeline:
|
||||
"""Main pipeline orchestrator"""
|
||||
|
||||
def __init__(self, config: BatchConfig):
|
||||
self.config = config
|
||||
self.processor = BatchProcessor(config)
|
||||
self.steps = []
|
||||
|
||||
def add_step(self, step: PipelineStep):
|
||||
"""Add a processing step to the pipeline"""
|
||||
self.steps.append(step)
|
||||
|
||||
def run(self, input_data: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Run the complete pipeline"""
|
||||
current_data = input_data.copy()
|
||||
|
||||
for step in self.steps:
|
||||
logging.info(f"Running pipeline step: {step.name}")
|
||||
start_time = time.time()
|
||||
|
||||
current_data = self.processor.process(step, current_data)
|
||||
|
||||
elapsed_time = time.time() - start_time
|
||||
logging.info(f"Completed {step.name} in {elapsed_time:.2f} seconds")
|
||||
|
||||
if step.state.failed_batches:
|
||||
logging.warning(
|
||||
f"Step {step.name} had {len(step.state.failed_batches)} failed batches"
|
||||
)
|
||||
|
||||
return current_data
|
||||
|
||||
def get_progress(self) -> Dict[str, Any]:
|
||||
"""Get progress information for all steps"""
|
||||
progress = {}
|
||||
for step in self.steps:
|
||||
progress[step.name] = {
|
||||
"processed_batches": step.state.processed_batches,
|
||||
"total_batches": step.state.total_batches,
|
||||
"failed_batches": len(step.state.failed_batches),
|
||||
"completion_percentage": (
|
||||
step.state.processed_batches / max(1, step.state.total_batches)
|
||||
)
|
||||
* 100,
|
||||
}
|
||||
return progress
|
||||
Reference in New Issue
Block a user