# NOTE(review): the lines below are file-viewer residue (not YAML) that was
# pasted above the document; commented out so the file parses. Original text
# preserved verbatim:
# Files
# drc-ners-nlp/config/pipeline.yaml
# T
#
# 73 lines
# 4.2 KiB
# YAML
# DRC Names Processing Pipeline Configuration
# Main configuration file with default settings
---
name: "drc_ners_pipeline"  # Name of the pipeline
version: "1.0.0"  # Version of the pipeline (quoted so "1.0.0" stays a string)
description: "DRC NERS NLP Processing"  # Description of the pipeline
environment: "development"  # Environment type (development, production, etc.)
debug: false  # Enable debug mode for detailed logging and error reporting
# Project directory structure.
# NOTE: the children must be indented under `paths:` — flattened to column 0
# they parse as independent top-level keys and `paths` itself becomes null.
paths:
  root_dir: "."  # Root directory of the project
  configs_dir: "./config"  # Directory for configuration files
  data_dir: "./data/dataset"  # Directory for dataset files
  models_dir: "./data/models"  # Directory for model files
  outputs_dir: "./data/outputs"  # Directory for output files
  logs_dir: "./data/logs"  # Directory for log files
  checkpoints_dir: "./data/checkpoints"  # Directory for model checkpoints
# Pipeline stages, executed in the order listed
stages:
  - "data_cleaning"  # Data cleaning stage
  - "feature_extraction"  # Feature extraction stage
  - "llm_annotation"  # LLM annotation stage (computationally intensive)
  - "data_splitting"  # Data splitting stage
# Data processing configuration
processing:
  # 1_000-style underscore separators are YAML 1.1-only; a YAML 1.2 parser
  # reads "1_000" as a string, so plain integers are used here.
  batch_size: 1000  # Size of data batches to process at once
  max_workers: 4  # Number of worker threads for parallel processing
  checkpoint_interval: 5  # Interval for saving checkpoints during processing
  use_multiprocessing: false  # Enable multiprocessing for CPU-bound tasks
  encoding_options:  # Encodings to try, in order, when reading files
    - "utf-8"
    - "utf-16"
    - "latin1"
  chunk_size: 100000  # Size of data chunks to process in parallel
# LLM annotation settings
llm:
  model_name: "mistral:7b"  # Name of the LLM model to use (quoted: "7b" after ":" must stay literal)
  requests_per_minute: 60  # Requests per minute to the LLM service
  requests_per_second: 2  # Requests per second to the LLM service
  retry_attempts: 3  # Number of retry attempts for LLM requests
  timeout_seconds: 600  # Timeout for LLM requests
  max_concurrent_requests: 2  # Maximum concurrent requests to the LLM service
  enable_rate_limiting: true  # Enable rate limiting to avoid overloading the LLM service
# Data handling configuration
data:
  input_file: "names.csv"  # Input file containing names data
  output_files:
    featured: "names_featured.csv"  # Output file for featured data
    evaluation: "names_evaluation.csv"  # Output file for the evaluation set
    males: "names_males.csv"  # Output file for male names
    females: "names_females.csv"  # Output file for female names
  split_evaluation: true  # Should the dataset be split into training and evaluation sets?
  split_by_gender: true  # Should the dataset be split by gender?
  evaluation_fraction: 0.2  # Fraction of data to use for evaluation
  random_seed: 42  # Random seed for reproducibility
  max_dataset_size: null  # Maximum size of the dataset to process; null means no limit
  balance_by_sex: false  # Should the dataset be balanced by sex when limiting the dataset size?
# Logging configuration
logging:
  level: "INFO"  # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
  # Python logging-style format string; single quotes would also work since
  # no escape sequences are needed, kept double-quoted to match the file.
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  file_logging: true  # Enable logging to file
  console_logging: true  # Enable logging to console
  log_file: "pipeline.log"  # Log file name
  max_log_size: 10485760  # Maximum size of log file before rotation (10 MiB)
  backup_count: 5  # Number of backup log files to keep