feat: add NER annotation step and integrate into pipeline

This commit is contained in:
2025-08-11 07:13:09 +02:00
parent 6d39c3afc1
commit d5a4aaaf4a
23 changed files with 1108 additions and 160 deletions
+3 -1
View File
@@ -12,6 +12,7 @@ processing:
stages:
- "data_cleaning"
- "feature_extraction"
#- "ner_annotation"
#- "llm_annotation"
- "data_splitting"
@@ -27,7 +28,8 @@ llm:
# Data handling configuration
data:
max_dataset_size: 100_000
split_evaluation: false
max_dataset_size: null
balance_by_sex: true
# Enhanced logging for development
+1
View File
@@ -12,6 +12,7 @@ processing:
stages:
- "data_cleaning"
- "feature_extraction"
- "ner_annotation"
- "llm_annotation"
- "data_splitting"
+20 -11
View File
@@ -18,9 +18,10 @@ paths:
checkpoints_dir: "./data/checkpoints" # Directory for model checkpoints
# Pipeline stages
stages: # List of stages in the processing pipeline
stages: # List of stages in the processing pipeline
- "data_cleaning" # Data cleaning stage
- "feature_extraction" # Feature extraction stage
- "ner_annotation" # NER-based annotation stage
  - "llm_annotation" # LLM annotation stage (computationally intensive)
- "data_splitting" # Data splitting stage
@@ -36,15 +37,20 @@ processing:
- "latin1"
chunk_size: 100_000 # Size of data chunks to process in parallel
# LLM annotation settings
llm:
model_name: "mistral:7b" # Name of the LLM model to use
requests_per_minute: 60 # Requests per minute to the LLM service
requests_per_second: 2 # Requests per second to the LLM service
retry_attempts: 3 # Number of retry attempts for LLM requests
timeout_seconds: 600 # Timeout for LLM requests
max_concurrent_requests: 2 # Maximum concurrent requests to the LLM service
enable_rate_limiting: true # Enable rate limiting to avoid overloading the LLM service
# Annotation settings
annotation:
llm:
model_name: "mistral:7b" # Name of the LLM model to use
requests_per_minute: 60 # Requests per minute to the LLM service
requests_per_second: 2 # Requests per second to the LLM service
retry_attempts: 3 # Number of retry attempts for LLM requests
timeout_seconds: 600 # Timeout for LLM requests
max_concurrent_requests: 2 # Maximum concurrent requests to the LLM service
enable_rate_limiting: true # Enable rate limiting to avoid overloading the LLM service
ner:
model_name: "drc_names_ner" # Name of the NER model to use
retry_attempts: 3 # Number of retry attempts for NER requests
# Data handling configuration
data:
@@ -54,8 +60,11 @@ data:
evaluation: "names_evaluation.csv" # Output file for evaluation set
males: "names_males.csv" # Output files for male names
females: "names_females.csv" # Output files for female names
split_evaluation: true # Should the dataset be split into training and evaluation sets?
ner_data: "names_ner.json" # Output file for NER annotated data
ner_spacy: "names_ner.spacy" # Output file for NER annotated data using spaCy format
split_evaluation: false # Should the dataset be split into training and evaluation sets?
split_by_gender: true # Should the dataset be split by gender ?
split_ner_data: true # Should the NER data be extracted and saved?
evaluation_fraction: 0.2 # Fraction of data to use for evaluation
random_seed: 42 # Random seed for reproducibility
max_dataset_size: null # Maximum size of the dataset to process, set to null for no