refactoring: add initial pipeline configuration and model classes
@@ -0,0 +1,154 @@
#!/usr/bin/env python3
import argparse
import logging
import sys
from pathlib import Path
from typing import Optional

from core.config import ConfigManager, setup_logging
from core.utils import ensure_directories, get_data_file_path
from core.utils.data_loader import DataLoader

from processing.batch.batch_config import BatchConfig
from processing.pipeline import Pipeline
from processing.steps.data_cleaning_step import DataCleaningStep
from processing.steps.data_splitting_step import DataSplittingStep
from processing.steps.feature_extraction_step import FeatureExtractionStep
from processing.steps.llm_annotation_step import LLMAnnotationStep

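# Configuration attributes this script relies on (inferred from usage below;
# the authoritative schema presumably lives alongside ConfigManager in core.config):
#   config.name, config.version, config.environment
#   config.data.input_file
#   config.processing.batch_size / max_workers / checkpoint_interval / use_multiprocessing
#   config.stages  -- ordered list of step names to run
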
def create_pipeline_from_config(config_path: Optional[Path] = None) -> Pipeline:
    """Create a pipeline from a configuration file."""
    config = ConfigManager(config_path).load_config()

    # Set up logging and make sure output directories exist before any work starts
    setup_logging(config)
    ensure_directories(config)

    batch_config = BatchConfig(
        batch_size=config.processing.batch_size,
        max_workers=config.processing.max_workers,
        checkpoint_interval=config.processing.checkpoint_interval,
        use_multiprocessing=config.processing.use_multiprocessing,
    )

    # Instantiate all known steps, then add them in the order given by
    # config.stages; steps whose name is not listed there are skipped
    pipeline = Pipeline(batch_config)
    steps = [
        DataCleaningStep(config),
        FeatureExtractionStep(config),
        LLMAnnotationStep(config),
        DataSplittingStep(config),
    ]

    for stage in config.stages:
        for step in steps:
            if step.name == stage:
                pipeline.add_step(step)

    return pipeline
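
# Example (hypothetical, programmatic use): build a pipeline from an
# environment-specific config, mirroring the --env convention in main() below:
#
#     pipeline = create_pipeline_from_config(Path("config/pipeline.development.yaml"))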


def run_pipeline(config_path: Optional[Path] = None, resume: bool = False) -> int:
    """Run the complete pipeline and return a process exit code (0 on success)."""
    try:
        config = ConfigManager(config_path).load_config()

        logging.info(f"Starting pipeline: {config.name} v{config.version}")
        logging.info(f"Environment: {config.environment}")
        if resume:
            # --resume is surfaced here for visibility; checkpoint reuse itself
            # is expected to be handled by the batch/checkpoint layer
            logging.info("Resume requested: reusing existing checkpoints")

        # Load input data
        input_file_path = get_data_file_path(config.data.input_file, config)
        if not input_file_path.exists():
            logging.error(f"Input file not found: {input_file_path}")
            return 1

        data_loader = DataLoader(config)
        logging.info(f"Loading data from {input_file_path}")
        df = data_loader.load_csv_complete(input_file_path)
        logging.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")

        # Create and run the pipeline
        pipeline = create_pipeline_from_config(config_path)

        logging.info("Starting pipeline execution")
        result_df = pipeline.run(df)

        # Save results via the splitting step, if it is the final configured stage
        splitting_step = pipeline.steps[-1]
        if isinstance(splitting_step, DataSplittingStep):
            splitting_step.save_splits(result_df)

        # Show completion statistics
        progress = pipeline.get_progress()
        logging.info("=== Pipeline Completion Summary ===")
        for step_name, stats in progress.items():
            logging.info(
                f"{step_name}: {stats['completion_percentage']:.1f}% "
                f"({stats['processed_batches']}/{stats['total_batches']} batches)"
            )
            if stats["failed_batches"] > 0:
                logging.warning(f"  {stats['failed_batches']} failed batches")

        logging.info("Pipeline completed successfully")
        return 0

    except Exception as e:
        logging.error(f"Pipeline failed: {e}", exc_info=True)
        return 1


def main():
    """Main entry point with a minimal command-line interface."""
    parser = argparse.ArgumentParser(
        description="DRC Names Processing Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Configuration File Examples:
  config/pipeline.yaml              - Main configuration
  config/pipeline.development.yaml  - Development environment
  config/pipeline.production.yaml   - Production environment

Usage Examples:
  python processing/main.py                                # Use default config
  python processing/main.py --config config/pipeline.yaml  # Use specific config
  python processing/main.py --env development              # Use environment config
  python processing/main.py --resume                       # Resume from checkpoints
""",
    )

    parser.add_argument("--config", type=Path, help="Path to configuration file")
    parser.add_argument(
        "--env", type=str, help="Environment name (loads config/pipeline.{env}.yaml)"
    )
    parser.add_argument(
        "--resume", action="store_true", help="Resume pipeline from existing checkpoints"
    )
    parser.add_argument(
        "--validate-config", action="store_true", help="Validate configuration file and exit"
    )
    args = parser.parse_args()

    # Determine the config path: --config takes precedence over --env;
    # with neither, ConfigManager falls back to its default
    config_path = None
    if args.config:
        config_path = args.config
    elif args.env:
        config_path = Path("config") / f"pipeline.{args.env}.yaml"

    if args.validate_config:
        try:
            config = ConfigManager(config_path).load_config()
            print(f"Configuration is valid: {config.name} v{config.version}")
            return 0
        except Exception as e:
            print(f"Configuration validation failed: {e}")
            return 1

    # Run the pipeline
    return run_pipeline(config_path, args.resume)


if __name__ == "__main__":
    sys.exit(main())
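
# For reference, a sketch (an assumption, not the actual definition) of the
# BatchConfig shape this script relies on; the real class lives in
# processing/batch/batch_config.py and may differ:
#
#     @dataclass
#     class BatchConfig:
#         batch_size: int
#         max_workers: int
#         checkpoint_interval: int
#         use_multiprocessing: bool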