feat: add NER annotation step and integrate into pipeline

This commit is contained in:
2025-08-11 07:13:09 +02:00
parent 6d39c3afc1
commit d5a4aaaf4a
23 changed files with 1108 additions and 160 deletions
+17 -89
View File
@@ -5,65 +5,31 @@ import traceback
from pathlib import Path
from core.config import setup_config
from processing.monitoring.data_analyzer import DatasetAnalyzer
from processing.monitoring.pipeline_monitor import PipelineMonitor
def main():
parser = argparse.ArgumentParser(
description="Monitor and manage the DRC names processing pipeline"
)
parser.add_argument("--config", type=Path, help="Path to configuration file")
parser.add_argument(
"--env", type=str, default="development",
help="Environment name (default: development)"
)
choices = ["data_cleaning", "feature_extraction", "ner_annotation", "llm_annotation", "data_splitting"]
parser = argparse.ArgumentParser(description="Monitor and manage the DRC names processing pipeline")
parser.add_argument("--config", type=Path, help="Path to configuration file")
parser.add_argument("--env", type=str, default="development", help="Environment")
subparsers = parser.add_subparsers(dest="command", help="Available commands")
# Status command
status_parser = subparsers.add_parser("status", help="Show pipeline status")
status_parser.add_argument(
"--detailed",
action="store_true",
help="Show detailed information including failed batch IDs",
)
subparsers.add_parser("status", help="Show pipeline status")
# Clean command
clean_parser = subparsers.add_parser("clean", help="Clean checkpoint files")
clean_parser.add_argument(
"--step",
type=str,
choices=["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"],
help="Clean specific step (default: all)",
)
clean_parser.add_argument(
"--keep-last", type=int, default=1, help="Number of recent checkpoints to keep (default: 1)"
)
clean_parser.add_argument("--step", type=str, choices=choices, help="Specific step (default: all)")
clean_parser.add_argument("--keep-last", type=int, default=1, help="Checkpoints to keep (default: 1)")
clean_parser.add_argument("--force", action="store_true", help="Clean without confirmation")
# Reset command
reset_parser = subparsers.add_parser("reset", help="Reset pipeline step")
reset_parser.add_argument(
"step",
type=str,
choices=["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"],
help="Step to reset",
)
reset_parser.add_argument("--step", type=str, choices=choices, help="Specific step (default: all)")
reset_parser.add_argument("--all", action="store_true", help="Reset all steps")
reset_parser.add_argument("--force", action="store_true", help="Reset without confirmation")
# Analyze command
analyze_parser = subparsers.add_parser("analyze", help="Analyze dataset")
analyze_parser.add_argument(
"--file",
type=str,
default="names_featured.csv",
help="Dataset file to analyze (default: names_featured.csv)",
)
# Checkpoint info command
info_parser = subparsers.add_parser("info", help="Show checkpoint information")
args = parser.parse_args()
if not args.command:
@@ -71,13 +37,11 @@ def main():
return 1
try:
# Load configuration and setup logging
config = setup_config(config_path=args.config, env=args.env)
setup_config(config_path=args.config, env=args.env)
monitor = PipelineMonitor()
if args.command == "status":
monitor.print_status(detailed=args.detailed)
monitor.print_status(detailed=True)
elif args.command == "clean":
checkpoint_info = monitor.count_checkpoint_files()
@@ -106,49 +70,13 @@ def main():
print("Cancelled")
return 0
monitor.reset_step(args.step)
print(f"Reset completed for {args.step}")
if args.step:
monitor.reset_step(args.step)
else:
for step in monitor.steps:
monitor.reset_step(step)
elif args.command == "analyze":
# Use configured data directory
data_dir = config.paths.data_dir
filepath = data_dir / args.file
if not filepath.exists():
print(f"File not found: {filepath}")
return 1
analyzer = DatasetAnalyzer(str(filepath))
if not analyzer.load_data():
return 1
completion_stats = analyzer.analyze_completion()
print(f"\n=== Dataset Analysis: {args.file} ===")
print(f"Total rows: {completion_stats['total_rows']:,}")
print(
f"Annotated: {completion_stats['annotated_rows']:,} ({completion_stats['annotation_percentage']:.1f}%)")
print(f"Unannotated: {completion_stats['unannotated_rows']:,}")
print(
f"Complete names: {completion_stats['complete_names']:,} ({completion_stats['completeness_percentage']:.1f}%)"
)
elif args.command == "info":
checkpoint_info = monitor.count_checkpoint_files()
print(f"\n=== Checkpoint Information ===")
print(f"Total storage: {checkpoint_info['total_size_mb']:.1f} MB")
print()
for step in monitor.steps:
step_info = checkpoint_info[step]
print(f"{step.replace('_', ' ').title()}:")
print(f" Files: {step_info['files']}")
print(f" Size: {step_info['size_mb']:.1f} MB")
print()
return 0
print(f"Reset completed")
except Exception as e:
print(f"Monitoring failed: {e}")