refactoring: add initial pipeline configuration and model classes
This commit is contained in:
@@ -0,0 +1,179 @@
|
||||
import json
|
||||
import logging
|
||||
import shutil
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict
|
||||
|
||||
from core.config.config_manager import ConfigManager
|
||||
from core.config.project_paths import ProjectPaths
|
||||
|
||||
|
||||
class PipelineMonitor:
    """Monitor and manage pipeline execution.

    The monitor inspects a checkpoint directory that contains one
    sub-directory per pipeline step.  Each step directory may hold a
    ``pipeline_state.json`` file (progress counters) and ``*.csv`` checkpoint
    files.  All reporting methods are read-only; ``clean_step_checkpoints``
    and ``reset_step`` delete files.
    """

    # Step states that mean "this step has processed all of its batches".
    _FINISHED_STATES = ("completed", "completed_with_errors")

    def __init__(self, paths: Optional["ProjectPaths"] = None):
        """Create a monitor.

        Args:
            paths: Object exposing a ``checkpoints_dir`` path.  When ``None``,
                ``ConfigManager().default_paths`` is used.
        """
        if paths is None:
            # Use default configuration if none provided
            paths = ConfigManager().default_paths

        self.paths = paths
        self.checkpoint_dir = paths.checkpoints_dir
        self.steps = [
            "data_cleaning",
            "feature_extraction",
            "llm_annotation",
            "data_splitting",
        ]

    @staticmethod
    def _blank_status(step_name: str, status: str) -> Dict:
        """Return a step-status dict with all progress counters zeroed."""
        return {
            "step": step_name,
            "status": status,
            "processed_batches": 0,
            "total_batches": 0,
            "failed_batches": 0,
            "completion_percentage": 0.0,
        }

    @staticmethod
    def _batch_sort_key(path):
        """Sort key ordering ``batch_<n>...csv`` files by their numeric index.

        Plain path sorting is lexicographic and would place ``batch_10``
        before ``batch_2``.  Files without a leading integer after the
        ``batch_`` prefix get index ``-1`` (sorted first) and fall back to
        name order among themselves.
        """
        head = path.stem[len("batch_"):].split("_", 1)[0]
        index = int(head) if head.isascii() and head.isdigit() else -1
        return (index, path.name)

    def get_step_status(self, step_name: str) -> Dict:
        """Get status of a specific pipeline step.

        Reads ``<checkpoint_dir>/<step_name>/pipeline_state.json``.

        Args:
            step_name: Name of the step (its checkpoint sub-directory).

        Returns:
            A dict with at least ``step``, ``status``, ``processed_batches``,
            ``total_batches``, ``failed_batches`` (a count) and
            ``completion_percentage``.  ``status`` is one of ``not_started``,
            ``in_progress``, ``completed``, ``completed_with_errors`` or
            ``error``.  Successfully parsed states additionally carry
            ``last_checkpoint`` and ``failed_batch_ids``; unreadable states
            carry ``error`` (the exception text).
        """
        state_file = self.checkpoint_dir / step_name / "pipeline_state.json"

        try:
            with open(state_file, "r", encoding="utf-8") as f:
                state = json.load(f)

            processed = state.get("processed_batches", 0)
            total = state.get("total_batches", 0)
            failed_ids = state.get("failed_batches", [])
            failed = len(failed_ids)

            if total == 0:
                completion = 0.0
                status = "not_started"
            elif processed >= total:
                completion = 100.0
                status = "completed" if failed == 0 else "completed_with_errors"
            else:
                completion = (processed / total) * 100
                status = "in_progress"

            return {
                "step": step_name,
                "status": status,
                "processed_batches": processed,
                "total_batches": total,
                "failed_batches": failed,
                "completion_percentage": completion,
                "last_checkpoint": state.get("last_checkpoint"),
                "failed_batch_ids": failed_ids,
            }
        except (FileNotFoundError, NotADirectoryError):
            # No state file yet: the step simply has not run.
            return self._blank_status(step_name, "not_started")
        except Exception as e:
            # Deliberately broad: status reporting is best-effort and must not
            # crash on a corrupt or half-written state file.
            logging.error(f"Error reading state for {step_name}: {e}")
            result = self._blank_status(step_name, "error")
            result["error"] = str(e)
            return result

    def get_pipeline_status(self) -> Dict:
        """Get overall pipeline status.

        The overall status is derived independently of step order:

        * ``error`` if any step is in ``error``;
        * ``completed`` / ``completed_with_errors`` if every step finished;
        * ``in_progress`` if at least one step has started;
        * ``not_started`` otherwise.

        Returns:
            A dict with ``overall_status``, ``overall_completion`` (mean of
            the per-step completion percentages, ``0.0`` when there are no
            steps), ``steps`` (per-step status dicts keyed by step name) and
            ``timestamp`` (ISO-8601 string of the local time of the call).
        """
        step_statuses = {}
        states = []
        total_completion = 0.0

        for step in self.steps:
            status = self.get_step_status(step)
            step_statuses[step] = status
            states.append(status["status"])
            total_completion += status.get("completion_percentage", 0)

        # Guard against an empty step list instead of dividing by zero.
        avg_completion = total_completion / len(states) if states else 0.0

        if "error" in states:
            overall_status = "error"
        elif states and all(s in self._FINISHED_STATES for s in states):
            if "completed_with_errors" in states:
                overall_status = "completed_with_errors"
            else:
                overall_status = "completed"
        elif any(s != "not_started" for s in states):
            overall_status = "in_progress"
        else:
            overall_status = "not_started"

        return {
            "overall_status": overall_status,
            "overall_completion": avg_completion,
            "steps": step_statuses,
            "timestamp": datetime.now().isoformat(),
        }

    def print_status(self, detailed: bool = False):
        """Print pipeline status in a human-readable format.

        Args:
            detailed: When true, also list the ids of failed batches.
        """
        status = self.get_pipeline_status()

        print("\n=== Pipeline Status ===")
        print(f"Overall Status: {status['overall_status'].upper()}")
        print(f"Overall Completion: {status['overall_completion']:.1f}%")
        print(f"Last Updated: {status['timestamp']}")
        print()

        for step_name, step_status in status["steps"].items():
            # Use .get() with defaults so a step in "error" state (which may
            # lack progress counters) cannot raise KeyError here.
            progress = step_status.get("completion_percentage", 0.0)
            processed = step_status.get("processed_batches", 0)
            total = step_status.get("total_batches", 0)
            failed = step_status.get("failed_batches", 0)

            print(f"{step_name.replace('_', ' ').title()}:")
            print(f" Status: {step_status['status']}")
            print(f" Progress: {progress:.1f}%")
            print(f" Batches: {processed}/{total}")

            if "error" in step_status:
                print(f" Error: {step_status['error']}")

            if failed > 0:
                print(f" Failed Batches: {failed}")

                if detailed and "failed_batch_ids" in step_status:
                    print(f" Failed Batch IDs: {step_status['failed_batch_ids']}")

            print()

    def count_checkpoint_files(self) -> Dict:
        """Count checkpoint files for each step.

        Returns:
            A dict mapping each step name to ``{"files": <number of *.csv
            files>, "size_mb": <their combined size in MiB>}``, plus a
            ``total_size_mb`` entry with the combined size across all steps.
        """
        bytes_per_mb = 1024 * 1024
        counts = {}
        total_size = 0

        for step in self.steps:
            step_dir = self.checkpoint_dir / step
            if not step_dir.exists():
                counts[step] = {"files": 0, "size_mb": 0}
                continue

            csv_files = list(step_dir.glob("*.csv"))
            step_size = sum(f.stat().st_size for f in csv_files)
            counts[step] = {
                "files": len(csv_files),
                "size_mb": step_size / bytes_per_mb,
            }
            total_size += step_size

        counts["total_size_mb"] = total_size / bytes_per_mb
        return counts

    def clean_step_checkpoints(self, step_name: str, keep_last: int = 1):
        """Clean checkpoint files for a specific step.

        Deletes ``batch_*.csv`` files in the step directory, keeping the
        ``keep_last`` files with the highest batch index.  Deletion failures
        are logged and do not stop the remaining deletions.

        Args:
            step_name: Name of the step whose checkpoints are cleaned.
            keep_last: Number of most recent batch files to keep; values
                ``<= 0`` delete all batch files.
        """
        step_dir = self.checkpoint_dir / step_name

        if not step_dir.exists():
            logging.info(f"No checkpoints found for {step_name}")
            return

        # Numeric-aware ordering so that batch_10 counts as newer than batch_2.
        csv_files = sorted(step_dir.glob("batch_*.csv"), key=self._batch_sort_key)

        if len(csv_files) <= keep_last:
            logging.info(f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all")
            return

        files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files

        for file_path in files_to_delete:
            try:
                file_path.unlink()
                logging.info(f"Deleted {file_path}")
            except Exception as e:
                logging.error(f"Failed to delete {file_path}: {e}")

    def reset_step(self, step_name: str):
        """Reset a pipeline step by removing its checkpoints and state.

        Removes the whole step directory.  Failures are logged, not raised.

        Args:
            step_name: Name of the step to reset.
        """
        step_dir = self.checkpoint_dir / step_name

        if not step_dir.exists():
            logging.info(f"Step {step_name} has no checkpoints to reset")
            return

        try:
            shutil.rmtree(step_dir)
            logging.info(f"Reset step: {step_name}")
        except Exception as e:
            logging.error(f"Failed to reset {step_name}: {e}")
|
||||
Reference in New Issue
Block a user