---
# DRC Names Processing Pipeline Configuration
# Main configuration file with default settings

name: "drc_ners_pipeline"  # Name of the pipeline
version: "1.0.0"  # Version of the pipeline (quoted: bare 1.0 would parse as a float)
description: "DRC NERS NLP Processing"  # Description of the pipeline
environment: "development"  # Environment type (development, production, etc.)
debug: false  # Enable debug mode for detailed logging and error reporting

# Project directory structure
paths:
  root_dir: "."  # Root directory of the project
  configs_dir: "./config"  # Directory for configuration files
  data_dir: "./data/dataset"  # Directory for dataset files
  models_dir: "./data/models"  # Directory for model files
  outputs_dir: "./data/outputs"  # Directory for output files
  logs_dir: "./data/logs"  # Directory for log files
  checkpoints_dir: "./data/checkpoints"  # Directory for model checkpoints

# Pipeline stages
# List of stages in the processing pipeline, executed in order
stages:
  - "data_cleaning"  # Data cleaning stage
  - "feature_extraction"  # Feature extraction stage
  - "ner_annotation"  # NER-based annotation stage
  - "llm_annotation"  # LLM annotation stage (computationally intensive)
  - "data_splitting"  # Data splitting stage

# Data processing configuration
processing:
  # NOTE: plain integers only — "1_000"-style underscore separators are
  # YAML 1.1-specific and are read as strings by YAML 1.2 parsers.
  batch_size: 1000  # Size of data batches to process at once
  max_workers: 4  # Number of worker threads for parallel processing
  checkpoint_interval: 5  # Interval for saving checkpoints during processing
  use_multiprocessing: false  # Enable multiprocessing for CPU-bound tasks
  encoding_options:  # List of encodings to try, in order, when reading files
    - "utf-8"
    - "utf-16"
    - "latin1"
  chunk_size: 100000  # Size of data chunks to process in parallel
  epochs: 2  # Number of epochs for training

# Annotation settings
annotation:
  llm:
    model_name: "mistral:7b"  # Name of the LLM model to use
    requests_per_minute: 60  # Requests per minute to the LLM service
    requests_per_second: 2  # Requests per second to the LLM service
    retry_attempts: 3  # Number of retry attempts for LLM requests
    timeout_seconds: 600  # Timeout for LLM requests
    max_concurrent_requests: 2  # Maximum concurrent requests to the LLM service
    enable_rate_limiting: true  # Enable rate limiting to avoid overloading the LLM service

  ner:
    model_name: "drc_names_ner"  # Name of the NER model to use
    retry_attempts: 3  # Number of retry attempts for NER requests

# Data handling configuration
data:
  input_file: "names.csv"  # Input file containing names data
  output_files:
    featured: "names_featured.csv"  # Output file for featured data
    evaluation: "names_evaluation.csv"  # Output file for the evaluation set
    males: "names_males.csv"  # Output file for male names
    females: "names_females.csv"  # Output file for female names
    ner_data: "names_ner.json"  # Output file for NER-annotated data
    ner_spacy: "names_ner.spacy"  # Output file for NER-annotated data in spaCy format
  split_evaluation: false  # Should the dataset be split into training and evaluation sets?
  split_by_gender: true  # Should the dataset be split by gender?
  split_by_province: true  # Should the dataset be split by province?
  split_ner_data: true  # Should the NER data be extracted and saved?
  evaluation_fraction: 0.2  # Fraction of data to use for evaluation
  random_seed: 42  # Random seed for reproducibility
  max_dataset_size: null  # Maximum size of the dataset to process; null means no limit
  balance_by_sex: false  # Should the dataset be balanced by sex when limiting the dataset size?

# Logging configuration
logging:
  level: "INFO"  # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"  # Python logging format string
  file_logging: true  # Enable logging to file
  console_logging: true  # Enable logging to console
  log_file: "pipeline.log"  # Log file name
  max_log_size: 10485760  # Maximum size of the log file before rotation (10 MB)
  backup_count: 5  # Number of backup log files to keep