Files
drc-ners-nlp/config/pipeline.production.yaml
T

51 lines
1.1 KiB
YAML

# Pipeline configuration: production environment profile.
# Identity and mode flags that apply to the whole deployment.
name: 'drc_names_pipeline'
version: '1.0.0'  # kept quoted so it always loads as a string
environment: 'production'
debug: false
# Processing settings for the production profile (optimized for performance).
# The keys below belong to the `processing` mapping and must be indented
# beneath it: at column 0 they load as unrelated top-level keys and
# `processing` itself loads as null.
processing:
  # Written with plain digits: `10_000` is an integer only under YAML 1.1
  # resolvers; YAML 1.2 core-schema parsers load it as the string "10_000".
  batch_size: 10000
  max_workers: 8
  # NOTE(review): the unit (batches? records?) is not visible in this file;
  # confirm against the code that reads it.
  checkpoint_interval: 10
  use_multiprocessing: true  # enable multiprocessing for CPU-bound tasks
# Pipeline stages, given as a sequence of stage names.
# NOTE(review): presumably executed in the order listed; confirm in the runner.
stages:
  - 'data_cleaning'
  - 'feature_extraction'
  - 'llm_annotation'
  - 'data_splitting'
# LLM settings for the production profile.
# The keys below are children of `llm` and must be indented beneath it;
# at column 0 they load as top-level keys and `llm` loads as null.
llm:
  model_name: "mistral:7b"
  # NOTE(review): these two limits disagree. 360 requests/minute is
  # 6 requests/second, while requests_per_second is 3 (= 180/minute).
  # Confirm which one the rate limiter enforces and align the other.
  requests_per_minute: 360
  requests_per_second: 3
  retry_attempts: 3
  timeout_seconds: 45
  max_concurrent_requests: 4
  enable_rate_limiting: true
# Data settings for the production profile.
# The keys below are children of `data` and must be indented beneath it;
# at column 0 they load as top-level keys and `data` loads as null.
data:
  split_evaluation: true
  split_by_gender: true
  evaluation_fraction: 0.2
  random_seed: 42
  # Explicit null: no value configured.
  # NOTE(review): presumably means "no size cap"; confirm in the loader.
  max_dataset_size: null
  # NOTE(review): this key says "sex" while split_by_gender says "gender";
  # the names are part of the config schema, so they are left as they are.
  balance_by_sex: false
# Logging settings for the production profile (less verbose).
# The keys below are children of `logging` and must be indented beneath it;
# at column 0 they load as top-level keys and `logging` loads as null.
logging:
  level: "INFO"
  console_logging: false  # disable console output in production
  file_logging: true
  log_file: "pipeline.production.log"
  max_log_size: 52428800  # 50 MiB (50 * 1024 * 1024)
  backup_count: 10