Files
drc-ners-nlp/config/pipeline.production.yaml
2025-09-21 16:23:44 +02:00

43 lines
821 B
YAML

# Deployment profile selected by this file.
environment: 'production'
# Debug mode is switched off in this profile.
debug: false
# Processing settings
# The keys below are nested under `processing`; without the indentation
# `processing` parses as null and its settings become top-level keys.
processing:
  # Plain digits, no `_` separator: `100_000` is an integer only under
  # YAML 1.1 rules; YAML 1.2 core-schema parsers read it as a string.
  batch_size: 100000
  max_workers: 4
  checkpoint_interval: 10
  use_multiprocessing: true
# Pipeline stages
# Entries that are commented out are kept for reference only; they are not
# part of the parsed list.
stages:
  - 'data_cleaning'
  - 'data_selection'
  - 'feature_extraction'
  # - 'ner_annotation'
  # - 'llm_annotation'
  - 'data_splitting'
# Production LLM settings
# The keys below are nested under `llm`; without the indentation `llm`
# parses as null and its settings become top-level keys.
llm:
  # Quoted because the value contains a colon.
  model_name: 'mistral:7b'
  # NOTE(review): 360 requests/minute equals 6 requests/second, which does
  # not match requests_per_second below — confirm which limit is intended.
  requests_per_minute: 360
  requests_per_second: 3
  retry_attempts: 3
  timeout_seconds: 45
  max_concurrent_requests: 4
  enable_rate_limiting: true
# Data handling configuration
# The keys below are nested under `data`; without the indentation `data`
# parses as null and its settings become top-level keys.
data:
  # Explicit YAML null; presumably means "no size cap" — confirm against
  # the code that reads this key.
  max_dataset_size: null
  balance_by_sex: false
# Production logging (less verbose)
# The keys below are nested under `logging`; without the indentation
# `logging` parses as null and its settings become top-level keys.
logging:
  level: 'INFO'
  console_logging: true
  file_logging: true
  log_file: 'pipeline.production.log'
  max_log_size: 52428800  # 50 MiB (50 * 1024 * 1024 bytes)
  backup_count: 10