refactoring: add initial pipeline configuration and model classes

2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
@@ -0,0 +1,179 @@
+import json
+import logging
+import shutil
+from datetime import datetime
+from typing import Optional, Dict
+
+from core.config.config_manager import ConfigManager
+from core.config.project_paths import ProjectPaths
+
+
+class PipelineMonitor:
+    """Monitor and manage pipeline execution"""
+
+    def __init__(self, paths: Optional[ProjectPaths] = None):
+        if paths is None:
+            # Use default configuration if none provided
+            config_manager = ConfigManager()
+            paths = config_manager.default_paths
+
+        self.paths = paths
+        self.checkpoint_dir = paths.checkpoints_dir
+        self.steps = ["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"]
+
+    def get_step_status(self, step_name: str) -> Dict:
+        """Get status of a specific pipeline step"""
+        step_dir = self.checkpoint_dir / step_name
+        state_file = step_dir / "pipeline_state.json"
+
+        if not state_file.exists():
+            return {
+                "step": step_name,
+                "status": "not_started",
+                "processed_batches": 0,
+                "total_batches": 0,
+                "failed_batches": 0,
+                "completion_percentage": 0.0,
+            }
+
+        try:
+            with open(state_file, "r") as f:
+                state = json.load(f)
+
+            processed = state.get("processed_batches", 0)
+            total = state.get("total_batches", 0)
+            failed = len(state.get("failed_batches", []))
+
+            if total == 0:
+                completion = 0.0
+                status = "not_started"
+            elif processed >= total:
+                completion = 100.0
+                status = "completed" if failed == 0 else "completed_with_errors"
+            else:
+                completion = (processed / total) * 100
+                status = "in_progress"
+
+            return {
+                "step": step_name,
+                "status": status,
+                "processed_batches": processed,
+                "total_batches": total,
+                "failed_batches": failed,
+                "completion_percentage": completion,
+                "last_checkpoint": state.get("last_checkpoint"),
+                "failed_batch_ids": state.get("failed_batches", []),
+            }
+
+        except Exception as e:
+            logging.error(f"Error reading state for {step_name}: {e}")
+            return {"step": step_name, "status": "error", "error": str(e)}
+
+    def get_pipeline_status(self) -> Dict:
+        """Get overall pipeline status"""
+        step_statuses = {}
+        overall_status = "not_started"
+        total_completion = 0.0
+
+        for step in self.steps:
+            status = self.get_step_status(step)
+            step_statuses[step] = status
+
+            if status["status"] == "error":
+                overall_status = "error"
+            elif status["status"] in ["in_progress"]:
+                overall_status = "in_progress"
+            elif status["status"] == "completed_with_errors":
+                overall_status = "completed_with_errors"
+
+            total_completion += status.get("completion_percentage", 0)
+
+        avg_completion = total_completion / len(self.steps)
+
+        if avg_completion >= 100 and overall_status not in ["error", "completed_with_errors"]:
+            overall_status = "completed"
+
+        return {
+            "overall_status": overall_status,
+            "overall_completion": avg_completion,
+            "steps": step_statuses,
+            "timestamp": datetime.now().isoformat(),
+        }
+
+    def print_status(self, detailed: bool = False):
+        """Print pipeline status in a human-readable format"""
+        status = self.get_pipeline_status()
+
+        print("\n=== Pipeline Status ===")
+        print(f"Overall Status: {status['overall_status'].upper()}")
+        print(f"Overall Completion: {status['overall_completion']:.1f}%")
+        print(f"Last Updated: {status['timestamp']}")
+        print()
+
+        for step_name, step_status in status["steps"].items():
+            print(f"{step_name.replace('_', ' ').title()}:")
+            print(f"  Status: {step_status['status']}")
+            print(f"  Progress: {step_status['completion_percentage']:.1f}%")
+            print(f"  Batches: {step_status['processed_batches']}/{step_status['total_batches']}")
+
+            if step_status["failed_batches"] > 0:
+                print(f"  Failed Batches: {step_status['failed_batches']}")
+
+                if detailed and "failed_batch_ids" in step_status:
+                    print(f"  Failed Batch IDs: {step_status['failed_batch_ids']}")
+
+            print()
+
+    def count_checkpoint_files(self) -> Dict:
+        """Count checkpoint files for each step"""
+        counts = {}
+        total_size = 0
+
+        for step in self.steps:
+            step_dir = self.checkpoint_dir / step
+            if step_dir.exists():
+                csv_files = list(step_dir.glob("*.csv"))
+                step_size = sum(f.stat().st_size for f in csv_files)
+                counts[step] = {"files": len(csv_files), "size_mb": step_size / (1024 * 1024)}
+                total_size += step_size
+            else:
+                counts[step] = {"files": 0, "size_mb": 0}
+
+        counts["total_size_mb"] = total_size / (1024 * 1024)
+        return counts
+
+    def clean_step_checkpoints(self, step_name: str, keep_last: int = 1):
+        """Clean checkpoint files for a specific step"""
+        step_dir = self.checkpoint_dir / step_name
+
+        if not step_dir.exists():
+            logging.info(f"No checkpoints found for {step_name}")
+            return
+
+        csv_files = sorted(step_dir.glob("batch_*.csv"))
+
+        if len(csv_files) <= keep_last:
+            logging.info(f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all")
+            return
+
+        files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files
+
+        for file_path in files_to_delete:
+            try:
+                file_path.unlink()
+                logging.info(f"Deleted {file_path}")
+            except Exception as e:
+                logging.error(f"Failed to delete {file_path}: {e}")
+
+    def reset_step(self, step_name: str):
+        """Reset a pipeline step by removing its checkpoints and state"""
+        step_dir = self.checkpoint_dir / step_name
+
+        if step_dir.exists():
+            try:
+                shutil.rmtree(step_dir)
+                logging.info(f"Reset step: {step_name}")
+            except Exception as e:
+                logging.error(f"Failed to reset {step_name}: {e}")
+        else:
+            logging.info(f"Step {step_name} has no checkpoints to reset")