feat: add NER annotation step and integrate into pipeline

2025-08-11 07:13:09 +02:00
parent 6d39c3afc1
commit d5a4aaaf4a
23 changed files with 1108 additions and 160 deletions
@@ -5,65 +5,31 @@ import traceback
 from pathlib import Path

 from core.config import setup_config
-from processing.monitoring.data_analyzer import DatasetAnalyzer
 from processing.monitoring.pipeline_monitor import PipelineMonitor


 def main():
-    parser = argparse.ArgumentParser(
-        description="Monitor and manage the DRC names processing pipeline"
-    )
-    parser.add_argument("--config", type=Path, help="Path to configuration file")
-    parser.add_argument(
-        "--env", type=str, default="development",
-        help="Environment name (default: development)"
-    )
+    choices = ["data_cleaning", "feature_extraction", "ner_annotation", "llm_annotation", "data_splitting"]

+    parser = argparse.ArgumentParser(description="Monitor and manage the DRC names processing pipeline")
+    parser.add_argument("--config", type=Path, help="Path to configuration file")
+    parser.add_argument("--env", type=str, default="development", help="Environment")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Status command
-    status_parser = subparsers.add_parser("status", help="Show pipeline status")
-    status_parser.add_argument(
-        "--detailed",
-        action="store_true",
-        help="Show detailed information including failed batch IDs",
-    )
+    subparsers.add_parser("status", help="Show pipeline status")

    # Clean command
    clean_parser = subparsers.add_parser("clean", help="Clean checkpoint files")
-    clean_parser.add_argument(
-        "--step",
-        type=str,
-        choices=["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"],
-        help="Clean specific step (default: all)",
-    )
-    clean_parser.add_argument(
-        "--keep-last", type=int, default=1, help="Number of recent checkpoints to keep (default: 1)"
-    )
+    clean_parser.add_argument("--step", type=str, choices=choices, help="Specific step (default: all)")
+    clean_parser.add_argument("--keep-last", type=int, default=1, help="Checkpoints to keep (default: 1)")
    clean_parser.add_argument("--force", action="store_true", help="Clean without confirmation")

    # Reset command
    reset_parser = subparsers.add_parser("reset", help="Reset pipeline step")
-    reset_parser.add_argument(
-        "step",
-        type=str,
-        choices=["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"],
-        help="Step to reset",
-    )
+    reset_parser.add_argument("--step", type=str, choices=choices, help="Specific step (default: all)")
+    reset_parser.add_argument("--all", action="store_true", help="Reset all steps")
    reset_parser.add_argument("--force", action="store_true", help="Reset without confirmation")
-
-    # Analyze command
-    analyze_parser = subparsers.add_parser("analyze", help="Analyze dataset")
-    analyze_parser.add_argument(
-        "--file",
-        type=str,
-        default="names_featured.csv",
-        help="Dataset file to analyze (default: names_featured.csv)",
-    )
-
-    # Checkpoint info command
-    info_parser = subparsers.add_parser("info", help="Show checkpoint information")
-
    args = parser.parse_args()

    if not args.command:
@@ -71,13 +37,11 @@ def main():
        return 1

    try:
-        # Load configuration and setup logging
-        config = setup_config(config_path=args.config, env=args.env)
-
+        setup_config(config_path=args.config, env=args.env)
        monitor = PipelineMonitor()

        if args.command == "status":
-            monitor.print_status(detailed=args.detailed)
+            monitor.print_status(detailed=True)

        elif args.command == "clean":
            checkpoint_info = monitor.count_checkpoint_files()
@@ -106,49 +70,13 @@ def main():
                    print("Cancelled")
                    return 0

-            monitor.reset_step(args.step)
-            print(f"Reset completed for {args.step}")
+            if args.step:
+                monitor.reset_step(args.step)
+            else:
+                for step in monitor.steps:
+                    monitor.reset_step(step)

-        elif args.command == "analyze":
-            # Use configured data directory
-            data_dir = config.paths.data_dir
-            filepath = data_dir / args.file
-
-            if not filepath.exists():
-                print(f"File not found: {filepath}")
-                return 1
-
-            analyzer = DatasetAnalyzer(str(filepath))
-
-            if not analyzer.load_data():
-                return 1
-
-            completion_stats = analyzer.analyze_completion()
-
-            print(f"\n=== Dataset Analysis: {args.file} ===")
-            print(f"Total rows: {completion_stats['total_rows']:,}")
-            print(
-                f"Annotated: {completion_stats['annotated_rows']:,} ({completion_stats['annotation_percentage']:.1f}%)")
-            print(f"Unannotated: {completion_stats['unannotated_rows']:,}")
-            print(
-                f"Complete names: {completion_stats['complete_names']:,} ({completion_stats['completeness_percentage']:.1f}%)"
-            )
-
-        elif args.command == "info":
-            checkpoint_info = monitor.count_checkpoint_files()
-
-            print(f"\n=== Checkpoint Information ===")
-            print(f"Total storage: {checkpoint_info['total_size_mb']:.1f} MB")
-            print()
-
-            for step in monitor.steps:
-                step_info = checkpoint_info[step]
-                print(f"{step.replace('_', ' ').title()}:")
-                print(f"  Files: {step_info['files']}")
-                print(f"  Size: {step_info['size_mb']:.1f} MB")
-                print()
-
-        return 0
+            print(f"Reset completed")

    except Exception as e:
        print(f"Monitoring failed: {e}")