refactoring: add initial pipeline configuration and model classes

This commit is contained in:
2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
+46
View File
@@ -0,0 +1,46 @@
# Development Environment Configuration
# Settings for local development and debugging
name: "drc_names_pipeline"
version: "1.0.0"
environment: "development"
debug: true
# Processing settings
processing:
batch_size: 100000  # plain integer: underscore separators are not valid YAML 1.2 and parse as a string
max_workers: 8
checkpoint_interval: 10
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
# Pipeline stages
stages:
- "data_cleaning"
- "feature_extraction"
#- "llm_annotation"
- "data_splitting"
# Development LLM settings
llm:
model_name: "mistral:7b"
requests_per_minute: 120
requests_per_second: 3
retry_attempts: 3
timeout_seconds: 45
max_concurrent_requests: 4
enable_rate_limiting: true
# Development data settings
data:
split_evaluation: true
split_by_gender: true
evaluation_fraction: 0.2
random_seed: 42
# Enhanced logging for development
logging:
level: "INFO"
console_logging: true
file_logging: true
log_file: "pipeline.development.log"
+48
View File
@@ -0,0 +1,48 @@
# Production Environment Configuration
# Optimized settings for production deployment
name: "drc_names_pipeline"
version: "1.0.0"
environment: "production"
debug: false
# Production processing settings (optimized for performance)
processing:
batch_size: 10000  # plain integer: underscore separators are not valid YAML 1.2 and parse as a string
max_workers: 8
checkpoint_interval: 10
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
# Pipeline stages
stages:
- "data_cleaning"
- "feature_extraction"
- "llm_annotation"
- "data_splitting"
# Production LLM settings
llm:
model_name: "mistral:7b"
requests_per_minute: 360
requests_per_second: 3
retry_attempts: 3
timeout_seconds: 45
max_concurrent_requests: 4
enable_rate_limiting: true
# Production data settings
data:
split_evaluation: true
split_by_gender: true
evaluation_fraction: 0.2
random_seed: 42
# Production logging (less verbose)
logging:
level: "INFO"
console_logging: false # Disable console in production
file_logging: true
log_file: "pipeline.production.log"
max_log_size: 52428800 # 50MB
backup_count: 10
+70
View File
@@ -0,0 +1,70 @@
# DRC Names Processing Pipeline Configuration
# Main configuration file with default settings
name: "drc_names_pipeline"
version: "1.0.0"
description: "DRC Names NLP Processing Pipeline"
environment: "development"
debug: false
# Project directory structure
paths:
root_dir: "."
configs_dir: "./config"
data_dir: "./data/dataset"
models_dir: "./data/models"
outputs_dir: "./data/outputs"
logs_dir: "./data/logs"
checkpoints_dir: "./data/checkpoints"
# Pipeline stages
stages:
- "data_cleaning"
- "feature_extraction"
- "llm_annotation"
- "data_splitting"
# Data processing configuration
processing:
batch_size: 1000  # plain integer: underscore separators are not valid YAML 1.2 and parse as a string
max_workers: 4
checkpoint_interval: 5
use_multiprocessing: false
encoding_options:
- "utf-8"
- "utf-16"
- "latin1"
chunk_size: 100000  # plain integer: underscore separators are not valid YAML 1.2 and parse as a string
# LLM annotation settings
llm:
model_name: "mistral:7b"
requests_per_minute: 60
requests_per_second: 2
retry_attempts: 3
timeout_seconds: 600
max_concurrent_requests: 2
enable_rate_limiting: true
# Data handling configuration
data:
input_file: "names.csv"
output_files:
featured: "names_featured.csv"
evaluation: "names_evaluation.csv"
males: "names_males.csv"
females: "names_females.csv"
split_evaluation: true
split_by_gender: true
evaluation_fraction: 0.2
random_seed: 42
# Logging configuration
logging:
level: "INFO"
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
file_logging: true
console_logging: true
log_file: "pipeline.log"
max_log_size: 10485760 # 10MB
backup_count: 5
+36
View File
@@ -0,0 +1,36 @@
## Instructions:
Identify the identified_name (native Congolese part) and identified_surname (non-native, French or English part) from the provided full name.
Return null for any part that cannot be identified. Do not alter the original name: do not change its case or add any additional information.
## Examples:
```
"tshabu ngandu"
{
"identified_name": "tshabu ngandu",
"identified_surname": null
}
"bapite marie"
{
"identified_name": "bapite",
"identified_surname": "marie"
}
"tshisekedi mulumba jean claude"
{
"identified_name": "tshisekedi mulumba",
"identified_surname": "jean claude"
}
"ilunga wa makuta jean-marie"
{
"identified_name": "ilunga wa makuta",
"identified_surname": "jean-marie"
}
"ntumba wasokadio marie france"
{
"identified_name": "ntumba wasokadio",
"identified_surname": "marie france"
}
```
+128
View File
@@ -0,0 +1,128 @@
# Research Experiment Configuration Templates
# These configurations can be used as starting points for different types of experiments
# Baseline Experiments Configuration
baseline_experiments:
- name: "baseline_logistic_regression_fullname"
description: "Baseline logistic regression with full name"
model_type: "logistic_regression"
features: ["full_name"]
model_params:
ngram_range: [2, 5]
max_features: 10000
max_iter: 1000
tags: ["baseline", "fullname"]
- name: "baseline_logistic_regression_native"
description: "Logistic regression with native name only"
model_type: "logistic_regression"
features: ["native_name"]
model_params:
ngram_range: [2, 4]
max_features: 5000
tags: ["baseline", "native"]
- name: "baseline_rf_engineered"
description: "Random Forest with engineered features"
model_type: "random_forest"
features: ["name_length", "word_count", "province"]
model_params:
n_estimators: 100
max_depth: 10
tags: ["baseline", "engineered"]
# Feature Study Configurations
feature_studies:
- name: "native_vs_surname"
description: "Compare native name vs surname effectiveness"
experiments:
- model_type: "logistic_regression"
features: ["native_name"]
tags: ["feature_study", "native"]
- model_type: "logistic_regression"
features: ["surname"]
tags: ["feature_study", "surname"]
- name: "name_parts_analysis"
description: "Analyze effectiveness of different name parts"
experiments:
- features: ["first_word"]
tags: ["name_parts", "first"]
- features: ["last_word"]
tags: ["name_parts", "last"]
- features: ["name_beginnings"]
feature_params:
beginning_length: 3
tags: ["name_parts", "beginnings"]
- features: ["name_endings"]
feature_params:
ending_length: 3
tags: ["name_parts", "endings"]
# Province-Specific Studies
province_studies:
- name: "kinshasa_study"
description: "Gender prediction for Kinshasa province"
model_type: "logistic_regression"
features: ["full_name"]
train_data_filter:
province: "kinshasa"
tags: ["province_study", "kinshasa"]
- name: "cross_province_generalization"
description: "Train on one province, test on another"
experiments:
- train_filter: {"province": "kinshasa"}
test_filter: {"province": "bas-congo"}
tags: ["generalization", "kinshasa_to_bas-congo"]
# Model Comparison Studies
model_comparisons:
- name: "model_comparison_fullname"
description: "Compare different models with full name"
base_config:
features: ["full_name"]
tags: ["model_comparison"]
models:
- model_type: "logistic_regression"
model_params:
ngram_range: [2, 5]
- model_type: "random_forest"
# Note: RF will need different feature preparation
features: ["name_length", "word_count", "province"]
# Advanced Feature Combinations
advanced_features:
- name: "multi_feature_combination"
description: "Test various feature combinations"
experiments:
- features: ["full_name", "name_length"]
tags: ["combination", "name_plus_length"]
- features: ["native_name", "surname", "province"]
tags: ["combination", "semantic_features"]
- features: ["name_beginnings", "name_endings", "word_count"]
tags: ["combination", "structural_features"]
# Hyperparameter Studies
hyperparameter_studies:
- name: "ngram_range_study"
description: "Study effect of different n-gram ranges"
base_config:
model_type: "logistic_regression"
features: ["full_name"]
tags: ["hyperparameter", "ngram"]
variants:
- model_params: {"ngram_range": [1, 3]}
- model_params: {"ngram_range": [2, 4]}
- model_params: {"ngram_range": [2, 5]}
- model_params: {"ngram_range": [3, 6]}
# Data Size Studies
data_studies:
- name: "learning_curve_study"
description: "Study performance vs training data size"
base_config:
model_type: "logistic_regression"
features: ["full_name"]
tags: ["learning_curve"]
data_sizes: [0.1, 0.25, 0.5, 0.75, 1.0] # Fractions of training data to use