---
# DRC Names Processing Pipeline Configuration
# Main configuration file with default settings

name: "drc_ners_pipeline"  # Name of the pipeline
version: "1.0.0"  # Version of the pipeline (quoted: bare 1.0 would parse as a float)
description: "DRC NERS NLP Processing"  # Description of the pipeline
environment: "development"  # Environment type (development, production, etc.)
debug: false  # Enable debug mode for detailed logging and error reporting

# Project directory structure
paths:
  root_dir: "."  # Root directory of the project
  configs_dir: "./config"  # Directory for configuration files
  data_dir: "./data/dataset"  # Directory for dataset files
  models_dir: "./data/models"  # Directory for model files
  outputs_dir: "./data/outputs"  # Directory for output files
  logs_dir: "./data/logs"  # Directory for log files
  checkpoints_dir: "./data/checkpoints"  # Directory for model checkpoints

# Pipeline stages
# List of stages in the processing pipeline, executed in order
stages:
  - "data_cleaning"  # Data cleaning stage
  - "feature_extraction"  # Feature extraction stage
  - "ner_annotation"  # NER-based annotation stage
  - "llm_annotation"  # LLM annotation stage (computationally intensive)
  - "data_splitting"  # Data splitting stage

# Data processing configuration
processing:
  # NOTE: plain integers only — "1_000"-style underscore separators are
  # YAML 1.1-specific and are read as strings by YAML 1.2 parsers.
  batch_size: 1000  # Size of data batches to process at once
  max_workers: 4  # Number of worker threads for parallel processing
  checkpoint_interval: 5  # Interval for saving checkpoints during processing
  use_multiprocessing: false  # Enable multiprocessing for CPU-bound tasks
  encoding_options:  # List of encodings to try, in order, when reading files
    - "utf-8"
    - "utf-16"
    - "latin1"
  chunk_size: 100000  # Size of data chunks to process in parallel
  epochs: 2  # Number of epochs for training

# Annotation settings
annotation:
  llm:
    model_name: "mistral:7b"  # Name of the LLM model to use
    requests_per_minute: 60  # Requests per minute to the LLM service
    requests_per_second: 2  # Requests per second to the LLM service
    retry_attempts: 3  # Number of retry attempts for LLM requests
    timeout_seconds: 600  # Timeout for LLM requests
    max_concurrent_requests: 2  # Maximum concurrent requests to the LLM service
    enable_rate_limiting: true  # Enable rate limiting to avoid overloading the LLM service

  ner:
    model_name: "drc_names_ner"  # Name of the NER model to use
    retry_attempts: 3  # Number of retry attempts for NER requests

# Data handling configuration
data:
  input_file: "names.csv"  # Input file containing names data
  output_files:
    featured: "names_featured.csv"  # Output file for featured data
    evaluation: "names_evaluation.csv"  # Output file for the evaluation set
    males: "names_males.csv"  # Output file for male names
    females: "names_females.csv"  # Output file for female names
    ner_data: "names_ner.json"  # Output file for NER-annotated data
    ner_spacy: "names_ner.spacy"  # Output file for NER-annotated data in spaCy format
  split_evaluation: false  # Should the dataset be split into training and evaluation sets?
  split_by_gender: true  # Should the dataset be split by gender?
  split_by_province: true  # Should the dataset be split by province?
  split_ner_data: true  # Should the NER data be extracted and saved?
  evaluation_fraction: 0.2  # Fraction of data to use for evaluation
  random_seed: 42  # Random seed for reproducibility
  max_dataset_size: null  # Maximum size of the dataset to process; null means no limit
  balance_by_sex: false  # Should the dataset be balanced by sex when limiting the dataset size?

# Logging configuration
logging:
  level: "INFO"  # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"  # Python logging format string
  file_logging: true  # Enable logging to file
  console_logging: true  # Enable logging to console
  log_file: "pipeline.log"  # Log file name
  max_log_size: 10485760  # Maximum size of the log file before rotation (10 MB)
  backup_count: 5  # Number of backup log files to keep