---
# DRC Names Processing Pipeline Configuration
# Main configuration file with default settings
name: "drc_ners_pipeline"  # Name of the pipeline
version: "1.0.0"  # Version of the pipeline
description: "DRC NERS NLP Processing"  # Description of the pipeline
environment: "development"  # Environment type (development, production, etc.)
debug: false  # Enable debug mode for detailed logging and error reporting

# Project directory structure
paths:
  root_dir: "."  # Root directory of the project
  configs_dir: "./config"  # Directory for configuration files
  data_dir: "./data/dataset"  # Directory for dataset files
  models_dir: "./data/models"  # Directory for model files
  outputs_dir: "./data/outputs"  # Directory for output files
  logs_dir: "./data/logs"  # Directory for log files
  checkpoints_dir: "./data/checkpoints"  # Directory for model checkpoints

# Pipeline stages
# List of stages in the processing pipeline
stages:
  - "data_cleaning"  # Data cleaning stage
  - "data_selection"  # Data selection stage - keep only required columns
  - "feature_extraction"  # Feature extraction stage
  - "ner_annotation"  # NER-based annotation stage
  - "llm_annotation"  # LLM annotation stage (computationally intensive)
  - "data_splitting"  # Data splitting stage

# Data processing configuration
processing:
  # Note: plain integers (no 1_000-style underscores) — underscore digit
  # separators are YAML 1.1-only and parse as strings under strict YAML 1.2.
  batch_size: 1000  # Size of data batches to process at once
  max_workers: 4  # Number of worker threads for parallel processing
  checkpoint_interval: 5  # Interval for saving checkpoints during processing
  use_multiprocessing: false  # Enable multiprocessing for CPU-bound tasks
  encoding_options:  # List of encodings to try when reading files
    - "utf-8"
    - "utf-16"
    - "latin1"
  chunk_size: 100000  # Size of data chunks to process in parallel
  epochs: 2  # Number of epochs for training

# Annotation settings
annotation:
  llm:
    model_name: "mistral:7b"  # Name of the LLM model to use
    requests_per_minute: 60  # Requests per minute to the LLM service
    requests_per_second: 2  # Requests per second to the LLM service
    retry_attempts: 3  # Number of retry attempts for LLM requests
    timeout_seconds: 600  # Timeout for LLM requests
    max_concurrent_requests: 2  # Maximum concurrent requests to the LLM service
    enable_rate_limiting: true  # Enable rate limiting to avoid overloading the LLM service
  ner:
    model_name: "drc_names_ner"  # Name of the NER model to use
    retry_attempts: 3  # Number of retry attempts for NER requests

# Data handling configuration
data:
  input_file: "names.csv"  # Input file containing names data
  output_files:
    featured: "names_featured.csv"  # Output file for featured data
    evaluation: "names_evaluation.csv"  # Output file for evaluation set
    males: "names_males.csv"  # Output file for male names
    females: "names_females.csv"  # Output file for female names
    ner_data: "names_ner.json"  # Output file for NER annotated data
    ner_spacy: "names_ner.spacy"  # Output file for NER annotated data using spaCy format
  selected_columns:  # Required columns for processing
    - name
    - sex
    - region
    - year
  split_evaluation: false  # Should the dataset be split into training and evaluation sets?
  split_by_gender: true  # Should the dataset be split by gender?
  split_by_province: true  # Should the dataset be split by province?
  split_ner_data: true  # Should the NER data be extracted and saved?
  evaluation_fraction: 0.2  # Fraction of data to use for evaluation
  random_seed: 42  # Random seed for reproducibility
  max_dataset_size: null  # Maximum size of the dataset to process; set to null for no limit
  balance_by_sex: false  # Should the dataset be balanced by sex when limiting the dataset size?

# Logging configuration
logging:
  level: "INFO"  # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  file_logging: true  # Enable logging to file
  console_logging: true  # Enable logging to console
  log_file: "pipeline.log"  # Log file name
  max_log_size: 10485760  # Maximum size of log file before rotation (10MB)
  backup_count: 5  # Number of backup log files to keep