---
# Pipeline configuration for the "production" environment.
# Top-level sections: processing, stages, llm, data, logging.
environment: "production"
debug: false

# Processing settings
processing:
  # Written without a digit separator: `100_000` is an integer only under
  # YAML 1.1 rules; YAML 1.2 core-schema loaders read it as a string.
  batch_size: 100000
  max_workers: 4
  # NOTE(review): unit is not stated here (batches? records? minutes?) —
  # confirm against the consumer.
  checkpoint_interval: 10
  use_multiprocessing: true

# Pipeline stages
# The two annotation stages below are commented out, so they are not part of
# the stage list in this profile.
stages:
  - "data_cleaning"
  - "data_selection"
  - "feature_extraction"
  # - "ner_annotation"
  # - "llm_annotation"
  - "data_splitting"

# Production LLM settings
llm:
  model_name: "mistral:7b"
  # NOTE(review): the two limits below disagree — 360 per minute averages
  # 6 per second, while 3 per second is 180 per minute. Confirm which one the
  # rate limiter enforces (or whether both apply) and align the other.
  requests_per_minute: 360
  requests_per_second: 3
  retry_attempts: 3
  timeout_seconds: 45
  max_concurrent_requests: 4
  enable_rate_limiting: true

# Data handling configuration
data:
  # Explicit null; presumably means "no size cap" — confirm against consumer.
  max_dataset_size: null
  balance_by_sex: false

# Production logging (less verbose)
logging:
  level: "INFO"
  console_logging: true
  file_logging: true
  log_file: "pipeline.production.log"
  max_log_size: 52428800  # 50 MiB (50 * 1024 * 1024 bytes)
  backup_count: 10