# DRC Names Processing Pipeline Configuration
# Main configuration file with default settings

# Pipeline identity and run mode (top-level scalar settings).
name: 'drc_names_pipeline'
# Quoted so the version is always loaded as a string, never as a number.
version: '1.0.0'
description: 'DRC Names NLP Processing Pipeline'
environment: 'development'
debug: false

# Project directory layout. Every entry is a relative path; all data-related
# directories live under ./data.
paths:
  root_dir: '.'
  configs_dir: './config'
  data_dir: './data/dataset'
  models_dir: './data/models'
  outputs_dir: './data/outputs'
  logs_dir: './data/logs'
  checkpoints_dir: './data/checkpoints'

# Pipeline stages, listed as an ordered sequence of stage names.
stages:
  - 'data_cleaning'
  - 'feature_extraction'
  - 'llm_annotation'
  - 'data_splitting'

# Data processing configuration
processing:
  # Integers are written without `_` digit separators: `1_000` is an int only
  # under YAML 1.1 resolution; YAML 1.2 core-schema parsers load it as the
  # string "1_000".
  batch_size: 1000
  max_workers: 4
  # NOTE(review): the unit of this interval (batches? chunks?) is not visible
  # in this file - confirm against the consumer.
  checkpoint_interval: 5
  use_multiprocessing: false
  # Candidate text encodings, kept in their original order.
  encoding_options:
    - "utf-8"
    - "utf-16"
    - "latin1"
  chunk_size: 100000

# LLM annotation settings
llm:
  # Kept quoted: the value contains a colon.
  model_name: 'mistral:7b'
  # NOTE(review): 60 requests/minute and 2 requests/second describe different
  # rates (2/s = 120/min) - confirm which limit the consumer enforces.
  requests_per_minute: 60
  requests_per_second: 2
  retry_attempts: 3
  timeout_seconds: 600
  max_concurrent_requests: 2
  enable_rate_limiting: true

# Data handling configuration
data:
  input_file: 'names.csv'
  # One output file name per derived dataset.
  output_files:
    featured: 'names_featured.csv'
    evaluation: 'names_evaluation.csv'
    males: 'names_males.csv'
    females: 'names_females.csv'
  split_evaluation: true
  split_by_gender: true
  evaluation_fraction: 0.2
  random_seed: 42
  # Explicit null, i.e. no value is set for the dataset-size cap.
  max_dataset_size: null
  balance_by_sex: false

# Logging configuration
logging:
  level: 'INFO'
  # Must stay quoted: a plain YAML scalar cannot begin with `%`.
  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  file_logging: true
  console_logging: true
  log_file: 'pipeline.log'
  max_log_size: 10485760  # 10 MiB (10 * 1024 * 1024)
  backup_count: 5