feat: enhance training pipeline with research templates and experiment configuration

2025-08-08 23:48:55 +02:00
parent 96291b4ad0
commit 6d39c3afc1
9 changed files with 341 additions and 755 deletions
@@ -1,72 +1,72 @@
 # DRC Names Processing Pipeline Configuration
 # Main configuration file with default settings

-name: "drc_names_pipeline"
-version: "1.0.0"
-description: "DRC Names NLP Processing Pipeline"
-environment: "development"
-debug: false
+name: "drc_ners_pipeline"                 # Name of the pipeline
+version: "1.0.0"                          # Version of the pipeline
+description: "DRC NERS NLP Processing"    # Description of the pipeline
+environment: "development"                # Environment type (development, production, etc.)
+debug: false                              # Enable debug mode for detailed logging and error reporting

 # Project directory structure
 paths:
-  root_dir: "."
-  configs_dir: "./config"
-  data_dir: "./data/dataset"
-  models_dir: "./data/models"
-  outputs_dir: "./data/outputs"
-  logs_dir: "./data/logs"
-  checkpoints_dir: "./data/checkpoints"
+  root_dir: "."                           # Root directory of the project
+  configs_dir: "./config"                 # Directory for configuration files
+  data_dir: "./data/dataset"              # Directory for dataset files
+  models_dir: "./data/models"             # Directory for model files
+  outputs_dir: "./data/outputs"           # Directory for output files
+  logs_dir: "./data/logs"                 # Directory for log files
+  checkpoints_dir: "./data/checkpoints"   # Directory for model checkpoints

 # Pipeline stages
-stages:
-  - "data_cleaning"
-  - "feature_extraction"
-  - "llm_annotation"
-  - "data_splitting"
+stages:                                    # List of stages in the processing pipeline
+  - "data_cleaning"                        # Data cleaning stage
+  - "feature_extraction"                   # Feature extraction stage
+  - "llm_annotation"                       # LLM annotation stage (computationally intensive)
+  - "data_splitting"                       # Data splitting stage

 # Data processing configuration
 processing:
-  batch_size: 1_000
-  max_workers: 4
-  checkpoint_interval: 5
-  use_multiprocessing: false
-  encoding_options:
+  batch_size: 1_000                        # Size of data batches to process at once
+  max_workers: 4                           # Maximum number of workers for parallel processing
+  checkpoint_interval: 5                   # Interval for saving checkpoints during processing
+  use_multiprocessing: false               # Enable multiprocessing for CPU-bound tasks
+  encoding_options:                        # List of encodings to try when reading files
    - "utf-8"
    - "utf-16"
    - "latin1"
-  chunk_size: 100_000
+  chunk_size: 100_000                      # Size of data chunks to process in parallel

 # LLM annotation settings
 llm:
-  model_name: "mistral:7b"
-  requests_per_minute: 60
-  requests_per_second: 2
-  retry_attempts: 3
-  timeout_seconds: 600
-  max_concurrent_requests: 2
-  enable_rate_limiting: true
+  model_name: "mistral:7b"                 # Name of the LLM model to use
+  requests_per_minute: 60                  # Requests per minute to the LLM service
+  requests_per_second: 2                   # Requests per second to the LLM service
+  retry_attempts: 3                        # Number of retry attempts for LLM requests
+  timeout_seconds: 600                     # Timeout for LLM requests, in seconds
+  max_concurrent_requests: 2               # Maximum concurrent requests to the LLM service
+  enable_rate_limiting: true               # Enable rate limiting to avoid overloading the LLM service

 # Data handling configuration
 data:
-  input_file: "names.csv"
+  input_file: "names.csv"                   # Input file containing names data
  output_files:
-    featured: "names_featured.csv"
-    evaluation: "names_evaluation.csv"
-    males: "names_males.csv"
-    females: "names_females.csv"
-  split_evaluation: true
-  split_by_gender: true
-  evaluation_fraction: 0.2
-  random_seed: 42
-  max_dataset_size: null
-  balance_by_sex: false
+    featured: "names_featured.csv"          # Output file for featured data
+    evaluation: "names_evaluation.csv"      # Output file for evaluation set
+    males: "names_males.csv"                # Output file for male names
+    females: "names_females.csv"            # Output file for female names
+  split_evaluation: true                    # Should the dataset be split into training and evaluation sets?
+  split_by_gender: true                     # Should the dataset be split by gender?
+  evaluation_fraction: 0.2                  # Fraction of data to use for evaluation
+  random_seed: 42                           # Random seed for reproducibility
+  max_dataset_size: null                    # Maximum size of the dataset to process; set to null for no limit
+  balance_by_sex: false                     # Should the dataset be balanced by sex when limiting the dataset size?

 # Logging configuration
 logging:
-  level: "INFO"
+  level: "INFO"                            # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-  file_logging: true
-  console_logging: true
-  log_file: "pipeline.log"
-  max_log_size: 10485760  # 10MB
-  backup_count: 5
+  file_logging: true                       # Enable logging to file
+  console_logging: true                    # Enable logging to console
+  log_file: "pipeline.log"                 # Log file name
+  max_log_size: 10485760                   # Maximum size of log file before rotation (10MB)
+  backup_count: 5                          # Number of backup log files to keep