feat: enhance training pipeline with research templates and experiment configuration

2025-08-08 23:48:55 +02:00
parent 96291b4ad0
commit 6d39c3afc1
9 changed files with 341 additions and 755 deletions
@@ -1,17 +1,12 @@
-# Production Environment Configuration
-# Optimized settings for production deployment
-
-name: "drc_names_pipeline"
-version: "1.0.0"
 environment: "development"
 debug: true

 # Processing settings
 processing:
-  batch_size: 100_000
+  batch_size: 10_000
  max_workers: 8
  checkpoint_interval: 10
-  use_multiprocessing: true  # Enable multiprocessing for CPU-bound tasks
+  use_multiprocessing: true

 # Pipeline stages
 stages:
@@ -20,7 +15,6 @@ stages:
  #- "llm_annotation"
  - "data_splitting"

-
 # LLM settings
 llm:
  model_name: "mistral:7b"
@@ -31,14 +25,10 @@ llm:
  max_concurrent_requests: 4
  enable_rate_limiting: true

-# Development data settings - limited dataset for faster testing
+# Data handling configuration
 data:
-  split_evaluation: true
-  split_by_gender: true
-  evaluation_fraction: 0.2
-  random_seed: 42
-  max_dataset_size: ~  # Limit to 10k records for development/testing
-  balance_by_sex: false     # Balance male/female samples when limiting
+  max_dataset_size: 100_000
+  balance_by_sex: true

 # Enhanced logging for development
 logging:
@@ -1,17 +1,12 @@
-# Production Environment Configuration
-# Optimized settings for production deployment
-
-name: "drc_names_pipeline"
-version: "1.0.0"
 environment: "production"
 debug: false

-# Production processing settings (optimized for performance)
+# Processing settings
 processing:
  batch_size: 10_000
  max_workers: 8
  checkpoint_interval: 10
-  use_multiprocessing: true  # Enable multiprocessing for CPU-bound tasks
+  use_multiprocessing: true

 # Pipeline stages
 stages:
@@ -20,7 +15,6 @@ stages:
  - "llm_annotation"
  - "data_splitting"

-
 # Production LLM settings
 llm:
  model_name: "mistral:7b"
@@ -31,19 +25,15 @@ llm:
  max_concurrent_requests: 4
  enable_rate_limiting: true

-# Production data settings
+# Data handling configuration
 data:
-  split_evaluation: true
-  split_by_gender: true
-  evaluation_fraction: 0.2
-  random_seed: 42
  max_dataset_size: null
  balance_by_sex: false

 # Production logging (less verbose)
 logging:
  level: "INFO"
-  console_logging: false  # Disable console in production
+  console_logging: false
  file_logging: true
  log_file: "pipeline.production.log"
  max_log_size: 52428800  # 50MB
@@ -1,72 +1,72 @@
 # DRC Names Processing Pipeline Configuration
 # Main configuration file with default settings

-name: "drc_names_pipeline"
-version: "1.0.0"
-description: "DRC Names NLP Processing Pipeline"
-environment: "development"
-debug: false
+name: "drc_ners_pipeline"                 # Name of the pipeline
+version: "1.0.0"                          # Version of the pipeline
+description: "DRC NERS NLP Processing"    # Description of the pipeline
+environment: "development"                # Environment type (development, production, etc.)
+debug: false                              # Enable debug mode for detailed logging and error reporting

 # Project directory structure
 paths:
-  root_dir: "."
-  configs_dir: "./config"
-  data_dir: "./data/dataset"
-  models_dir: "./data/models"
-  outputs_dir: "./data/outputs"
-  logs_dir: "./data/logs"
-  checkpoints_dir: "./data/checkpoints"
+  root_dir: "."                           # Root directory of the project
+  configs_dir: "./config"                 # Directory for configuration files
+  data_dir: "./data/dataset"              # Directory for dataset files
+  models_dir: "./data/models"             # Directory for model files
+  outputs_dir: "./data/outputs"           # Directory for output files
+  logs_dir: "./data/logs"                 # Directory for log files
+  checkpoints_dir: "./data/checkpoints"   # Directory for model checkpoints

 # Pipeline stages
-stages:
-  - "data_cleaning"
-  - "feature_extraction"
-  - "llm_annotation"
-  - "data_splitting"
+stages:                                    # List of stages in the processing pipeline
+  - "data_cleaning"                        # Data cleaning stage
+  - "feature_extraction"                   # Feature extraction stage
+  - "llm_annotation"                       # LLM annotation stage (computationally intensive)
+  - "data_splitting"                       # Data splitting stage

 # Data processing configuration
 processing:
-  batch_size: 1_000
-  max_workers: 4
-  checkpoint_interval: 5
-  use_multiprocessing: false
-  encoding_options:
+  batch_size: 1_000                        # Size of data batches to process at once
+  max_workers: 4                           # Maximum number of workers for parallel processing
+  checkpoint_interval: 5                   # Interval for saving checkpoints during processing
+  use_multiprocessing: false               # Enable multiprocessing for CPU-bound tasks
+  encoding_options:                        # List of encodings to try when reading files
    - "utf-8"
    - "utf-16"
    - "latin1"
-  chunk_size: 100_000
+  chunk_size: 100_000                      # Size of data chunks to process in parallel

 # LLM annotation settings
 llm:
-  model_name: "mistral:7b"
-  requests_per_minute: 60
-  requests_per_second: 2
-  retry_attempts: 3
-  timeout_seconds: 600
-  max_concurrent_requests: 2
-  enable_rate_limiting: true
+  model_name: "mistral:7b"                 # Name of the LLM model to use
+  requests_per_minute: 60                  # Requests per minute to the LLM service
+  requests_per_second: 2                   # Requests per second to the LLM service
+  retry_attempts: 3                        # Number of retry attempts for LLM requests
+  timeout_seconds: 600                     # Timeout for LLM requests, in seconds
+  max_concurrent_requests: 2               # Maximum concurrent requests to the LLM service
+  enable_rate_limiting: true               # Enable rate limiting to avoid overloading the LLM service

 # Data handling configuration
 data:
-  input_file: "names.csv"
+  input_file: "names.csv"                   # Input file containing names data
  output_files:
-    featured: "names_featured.csv"
-    evaluation: "names_evaluation.csv"
-    males: "names_males.csv"
-    females: "names_females.csv"
-  split_evaluation: true
-  split_by_gender: true
-  evaluation_fraction: 0.2
-  random_seed: 42
-  max_dataset_size: null
-  balance_by_sex: false
+    featured: "names_featured.csv"          # Output file for featured data
+    evaluation: "names_evaluation.csv"      # Output file for evaluation set
+    males: "names_males.csv"                # Output file for male names
+    females: "names_females.csv"            # Output file for female names
+  split_evaluation: true                    # Should the dataset be split into training and evaluation sets?
+  split_by_gender: true                     # Should the dataset be split by gender?
+  evaluation_fraction: 0.2                  # Fraction of data to use for evaluation
+  random_seed: 42                           # Random seed for reproducibility
+  max_dataset_size: null                    # Maximum size of the dataset to process, set to null for no limit
+  balance_by_sex: false                     # Should the dataset be balanced by sex when limiting the dataset size?

 # Logging configuration
 logging:
-  level: "INFO"
+  level: "INFO"                            # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-  file_logging: true
-  console_logging: true
-  log_file: "pipeline.log"
-  max_log_size: 10485760  # 10MB
-  backup_count: 5
+  file_logging: true                       # Enable logging to file
+  console_logging: true                    # Enable logging to console
+  log_file: "pipeline.log"                 # Log file name
+  max_log_size: 10485760                   # Maximum size of log file before rotation (10MB)
+  backup_count: 5                          # Number of backup log files to keep
@@ -1,128 +1,148 @@
-# Research Experiment Configuration Templates
-# These configurations can be used as starting points for different types of experiments
-
-# Baseline Experiments Configuration
 baseline_experiments:
-  - name: "baseline_logistic_regression_fullname"
+  - name: "bigru"
+    description: "Baseline BiGRU with full name features"
+    model_type: "bigru"
+    features: [ "full_name" ]
+    model_params:
+      max_len: 20
+      embedding_dim: 64
+      gru_units: 32
+      epochs: 10
+      batch_size: 32
+    tags: [ "baseline", "neural", "bigru" ]
+
+  - name: "cnn"
+    description: "Baseline CNN with character patterns"
+    model_type: "cnn"
+    features: [ "full_name" ]
+    model_params:
+      max_len: 20
+      embedding_dim: 64
+      filters: 64
+      kernel_size: 3
+      dropout: 0.5
+      epochs: 10
+      batch_size: 32
+    tags: [ "baseline", "neural", "cnn" ]
+
+  - name: "ensemble"
+    description: "Baseline Ensemble with multiple models"
+    model_type: "ensemble"
+    features: [ "full_name", "name_length", "word_count" ]
+    model_params:
+      base_models: [ "logistic_regression", "random_forest", "xgboost" ]
+      voting: "soft"
+      cv_folds: 5
+    tags: [ "baseline", "ensemble" ]
+
+  - name: "lightgbm"
+    description: "Baseline LightGBM with engineered features"
+    model_type: "lightgbm"
+    features: [ "full_name", "name_length", "word_count" ]
+    model_params:
+      n_estimators: 100
+      max_depth: -1
+      learning_rate: 0.1
+      num_leaves: 31
+      subsample: 0.8
+      colsample_bytree: 0.8
+    tags: [ "baseline", "lightgbm" ]
+
+  - name: "logistic_regression_fullname"
    description: "Baseline logistic regression with full name"
    model_type: "logistic_regression"
-    features: ["full_name"]
+    features: [ "full_name" ]
    model_params:
-      ngram_range: [2, 5]
      max_features: 10000
-      max_iter: 1000
-    tags: ["baseline", "fullname"]
+    tags: [ "baseline", "logistic_regression", "fullname" ]

-  - name: "baseline_logistic_regression_native"
+  - name: "logistic_regression_native"
    description: "Logistic regression with native name only"
    model_type: "logistic_regression"
-    features: ["native_name"]
+    features: [ "native_name" ]
    model_params:
-      ngram_range: [2, 4]
      max_features: 5000
-    tags: ["baseline", "native"]
+    tags: [ "baseline", "logistic_regression", "native" ]

-  - name: "baseline_rf_engineered"
-    description: "Random Forest with engineered features"
+  - name: "logistic_regression_surname"
+    description: "Logistic regression with surname name only"
+    model_type: "logistic_regression"
+    features: [ "surname" ]
+    model_params:
+      max_features: 5000
+    tags: [ "baseline", "logistic_regression", "surname" ]
+
+  - name: "lstm"
+    description: "Baseline LSTM with full name features"
+    model_type: "lstm"
+    features: [ "full_name" ]
+    model_params:
+      embedding_dim: 128
+      lstm_units: 64
+      epochs: 10
+      batch_size: 64
+    tags: [ "baseline", "neural", "lstm" ]
+
+  - name: "naive_bayes"
+    description: "Baseline Naive Bayes with full name features"
+    model_type: "naive_bayes"
+    features: [ "full_name" ]
+    model_params:
+      max_features: 5000
+    tags: [ "baseline", "naive_bayes" ]
+
+  - name: "random_forest"
+    description: "Baseline Random Forest with engineered features"
    model_type: "random_forest"
-    features: ["name_length", "word_count", "province"]
+    features: [ "name_length", "word_count", "province" ]
    model_params:
      n_estimators: 100
      max_depth: 10
-    tags: ["baseline", "engineered"]
+      min_samples_split: 2
+      min_samples_leaf: 1
+    tags: [ "baseline", "random_forest", "engineered" ]
+
+  - name: "svm"
+    description: "Baseline SVM with full name features"
+    model_type: "svm"
+    features: [ "full_name" ]
+    model_params:
+      C: 1.0
+      kernel: "rbf"
+      ngram_range: [ 2, 4 ]
+      max_features: 5000
+    tags: [ "baseline", "svm" ]
+
+  - name: "transformer"
+    description: "Baseline Transformer with attention mechanism"
+    model_type: "transformer"
+    features: [ "full_name" ]
+    model_params:
+      embedding_dim: 128
+      num_heads: 4
+      num_layers: 2
+      epochs: 10
+      batch_size: 64
+    tags: [ "baseline", "neural", "transformer" ]
+
+  - name: "xgboost"
+    description: "Baseline XGBoost with engineered features"
+    model_type: "xgboost"
+    features: [ "full_name", "name_length", "word_count" ]
+    model_params:
+      n_estimators: 100
+      max_depth: 6
+      learning_rate: 0.1
+      subsample: 0.8
+      colsample_bytree: 0.8
+    tags: [ "baseline", "xgboost" ]
+
+
+# Advanced Experiments Configuration
+advanced_experiments:

 # Feature Study Configurations
 feature_studies:
-  - name: "native_vs_surname"
-    description: "Compare native name vs surname effectiveness"
-    experiments:
-      - model_type: "logistic_regression"
-        features: ["native_name"]
-        tags: ["feature_study", "native"]
-      - model_type: "logistic_regression"
-        features: ["surname"]
-        tags: ["feature_study", "surname"]

-  - name: "name_parts_analysis"
-    description: "Analyze effectiveness of different name parts"
-    experiments:
-      - features: ["first_word"]
-        tags: ["name_parts", "first"]
-      - features: ["last_word"]
-        tags: ["name_parts", "last"]
-      - features: ["name_beginnings"]
-        feature_params:
-          beginning_length: 3
-        tags: ["name_parts", "beginnings"]
-      - features: ["name_endings"]
-        feature_params:
-          ending_length: 3
-        tags: ["name_parts", "endings"]
-
-# Province-Specific Studies
-province_studies:
-  - name: "kinshasa_study"
-    description: "Gender prediction for Kinshasa province"
-    model_type: "logistic_regression"
-    features: ["full_name"]
-    train_data_filter:
-      province: "kinshasa"
-    tags: ["province_study", "kinshasa"]
-
-  - name: "cross_province_generalization"
-    description: "Train on one province, test on another"
-    experiments:
-      - train_filter: {"province": "kinshasa"}
-        test_filter: {"province": "bas-congo"}
-        tags: ["generalization", "kinshasa_to_bas-congo"]
-
-# Model Comparison Studies
-model_comparisons:
-  - name: "model_comparison_fullname"
-    description: "Compare different models with full name"
-    base_config:
-      features: ["full_name"]
-      tags: ["model_comparison"]
-    models:
-      - model_type: "logistic_regression"
-        model_params:
-          ngram_range: [2, 5]
-      - model_type: "random_forest"
-        # Note: RF will need different feature preparation
-        features: ["name_length", "word_count", "province"]
-
-# Advanced Feature Combinations
-advanced_features:
-  - name: "multi_feature_combination"
-    description: "Test various feature combinations"
-    experiments:
-      - features: ["full_name", "name_length"]
-        tags: ["combination", "name_plus_length"]
-      - features: ["native_name", "surname", "province"]
-        tags: ["combination", "semantic_features"]
-      - features: ["name_beginnings", "name_endings", "word_count"]
-        tags: ["combination", "structural_features"]
-
-# Hyperparameter Studies
-hyperparameter_studies:
-  - name: "ngram_range_study"
-    description: "Study effect of different n-gram ranges"
-    base_config:
-      model_type: "logistic_regression"
-      features: ["full_name"]
-      tags: ["hyperparameter", "ngram"]
-    variants:
-      - model_params: {"ngram_range": [1, 3]}
-      - model_params: {"ngram_range": [2, 4]}
-      - model_params: {"ngram_range": [2, 5]}
-      - model_params: {"ngram_range": [3, 6]}
-
-# Data Size Studies
-data_studies:
-  - name: "learning_curve_study"
-    description: "Study performance vs training data size"
-    base_config:
-      model_type: "logistic_regression"
-      features: ["full_name"]
-      tags: ["learning_curve"]
-    data_sizes: [0.1, 0.25, 0.5, 0.75, 1.0]  # Fractions of training data to use
+# Hyperparameter Tuning Configurations
+hyperparameter_tuning: