feat: enhance training pipeline with research templates and experiment configuration
This commit is contained in:
@@ -1,17 +1,12 @@
|
||||
# Development Environment Configuration
|
||||
# Settings for local development and testing
|
||||
|
||||
name: "drc_names_pipeline"
|
||||
version: "1.0.0"
|
||||
environment: "development"
|
||||
debug: true
|
||||
|
||||
# Processing settings
|
||||
processing:
|
||||
batch_size: 100_000
|
||||
batch_size: 10_000
|
||||
max_workers: 8
|
||||
checkpoint_interval: 10
|
||||
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
|
||||
use_multiprocessing: true
|
||||
|
||||
# Pipeline stages
|
||||
stages:
|
||||
@@ -20,7 +15,6 @@ stages:
|
||||
#- "llm_annotation"
|
||||
- "data_splitting"
|
||||
|
||||
|
||||
# Development LLM settings
|
||||
llm:
|
||||
model_name: "mistral:7b"
|
||||
@@ -31,14 +25,10 @@ llm:
|
||||
max_concurrent_requests: 4
|
||||
enable_rate_limiting: true
|
||||
|
||||
# Development data settings - limited dataset for faster testing
|
||||
# Data handling configuration
|
||||
data:
|
||||
split_evaluation: true
|
||||
split_by_gender: true
|
||||
evaluation_fraction: 0.2
|
||||
random_seed: 42
|
||||
max_dataset_size: ~ # No limit (null); set e.g. 10_000 to cap records for development/testing
|
||||
balance_by_sex: false # Balance male/female samples when limiting
|
||||
max_dataset_size: 100_000
|
||||
balance_by_sex: true
|
||||
|
||||
# Enhanced logging for development
|
||||
logging:
|
||||
|
||||
@@ -1,17 +1,12 @@
|
||||
# Production Environment Configuration
|
||||
# Optimized settings for production deployment
|
||||
|
||||
name: "drc_names_pipeline"
|
||||
version: "1.0.0"
|
||||
environment: "production"
|
||||
debug: false
|
||||
|
||||
# Production processing settings (optimized for performance)
|
||||
# Processing settings
|
||||
processing:
|
||||
batch_size: 10_000
|
||||
max_workers: 8
|
||||
checkpoint_interval: 10
|
||||
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
|
||||
use_multiprocessing: true
|
||||
|
||||
# Pipeline stages
|
||||
stages:
|
||||
@@ -20,7 +15,6 @@ stages:
|
||||
- "llm_annotation"
|
||||
- "data_splitting"
|
||||
|
||||
|
||||
# Production LLM settings
|
||||
llm:
|
||||
model_name: "mistral:7b"
|
||||
@@ -31,19 +25,15 @@ llm:
|
||||
max_concurrent_requests: 4
|
||||
enable_rate_limiting: true
|
||||
|
||||
# Production data settings
|
||||
# Data handling configuration
|
||||
data:
|
||||
split_evaluation: true
|
||||
split_by_gender: true
|
||||
evaluation_fraction: 0.2
|
||||
random_seed: 42
|
||||
max_dataset_size: null
|
||||
balance_by_sex: false
|
||||
|
||||
# Production logging (less verbose)
|
||||
logging:
|
||||
level: "INFO"
|
||||
console_logging: false # Disable console in production
|
||||
console_logging: false
|
||||
file_logging: true
|
||||
log_file: "pipeline.production.log"
|
||||
max_log_size: 52428800 # 50MB
|
||||
|
||||
+47
-47
@@ -1,72 +1,72 @@
|
||||
# DRC Names Processing Pipeline Configuration
|
||||
# Main configuration file with default settings
|
||||
|
||||
name: "drc_names_pipeline"
|
||||
version: "1.0.0"
|
||||
description: "DRC Names NLP Processing Pipeline"
|
||||
environment: "development"
|
||||
debug: false
|
||||
name: "drc_ners_pipeline" # Name of the pipeline
|
||||
version: "1.0.0" # Version of the pipeline
|
||||
description: "DRC NERS NLP Processing" # Description of the pipeline
|
||||
environment: "development" # Environment type (development, production, etc.)
|
||||
debug: false # Enable debug mode for detailed logging and error reporting
|
||||
|
||||
# Project directory structure
|
||||
paths:
|
||||
root_dir: "."
|
||||
configs_dir: "./config"
|
||||
data_dir: "./data/dataset"
|
||||
models_dir: "./data/models"
|
||||
outputs_dir: "./data/outputs"
|
||||
logs_dir: "./data/logs"
|
||||
checkpoints_dir: "./data/checkpoints"
|
||||
root_dir: "." # Root directory of the project
|
||||
configs_dir: "./config" # Directory for configuration files
|
||||
data_dir: "./data/dataset" # Directory for dataset files
|
||||
models_dir: "./data/models" # Directory for model files
|
||||
outputs_dir: "./data/outputs" # Directory for output files
|
||||
logs_dir: "./data/logs" # Directory for log files
|
||||
checkpoints_dir: "./data/checkpoints" # Directory for model checkpoints
|
||||
|
||||
# Pipeline stages
|
||||
stages:
|
||||
- "data_cleaning"
|
||||
- "feature_extraction"
|
||||
- "llm_annotation"
|
||||
- "data_splitting"
|
||||
stages: # List of stages in the processing pipeline
|
||||
- "data_cleaning" # Data cleaning stage
|
||||
- "feature_extraction" # Feature extraction stage
|
||||
- "llm_annotation" # LLM annotation stage (computationally intensive)
|
||||
- "data_splitting" # Data splitting stage
|
||||
|
||||
# Data processing configuration
|
||||
processing:
|
||||
batch_size: 1_000
|
||||
max_workers: 4
|
||||
checkpoint_interval: 5
|
||||
use_multiprocessing: false
|
||||
encoding_options:
|
||||
batch_size: 1_000 # Size of data batches to process at once
|
||||
max_workers: 4 # Number of worker threads for parallel processing
|
||||
checkpoint_interval: 5 # Interval for saving checkpoints during processing
|
||||
use_multiprocessing: false # Enable multiprocessing for CPU-bound tasks
|
||||
encoding_options: # List of encodings to try when reading files
|
||||
- "utf-8"
|
||||
- "utf-16"
|
||||
- "latin1"
|
||||
chunk_size: 100_000
|
||||
chunk_size: 100_000 # Size of data chunks to process in parallel
|
||||
|
||||
# LLM annotation settings
|
||||
llm:
|
||||
model_name: "mistral:7b"
|
||||
requests_per_minute: 60
|
||||
requests_per_second: 2
|
||||
retry_attempts: 3
|
||||
timeout_seconds: 600
|
||||
max_concurrent_requests: 2
|
||||
enable_rate_limiting: true
|
||||
model_name: "mistral:7b" # Name of the LLM model to use
|
||||
requests_per_minute: 60 # Requests per minute to the LLM service
|
||||
requests_per_second: 2 # Requests per second to the LLM service
|
||||
retry_attempts: 3 # Number of retry attempts for LLM requests
|
||||
timeout_seconds: 600 # Timeout for LLM requests
|
||||
max_concurrent_requests: 2 # Maximum concurrent requests to the LLM service
|
||||
enable_rate_limiting: true # Enable rate limiting to avoid overloading the LLM service
|
||||
|
||||
# Data handling configuration
|
||||
data:
|
||||
input_file: "names.csv"
|
||||
input_file: "names.csv" # Input file containing names data
|
||||
output_files:
|
||||
featured: "names_featured.csv"
|
||||
evaluation: "names_evaluation.csv"
|
||||
males: "names_males.csv"
|
||||
females: "names_females.csv"
|
||||
split_evaluation: true
|
||||
split_by_gender: true
|
||||
evaluation_fraction: 0.2
|
||||
random_seed: 42
|
||||
max_dataset_size: null
|
||||
balance_by_sex: false
|
||||
featured: "names_featured.csv" # Output file for featured data
|
||||
evaluation: "names_evaluation.csv" # Output file for evaluation set
|
||||
males: "names_males.csv" # Output file for male names
|
||||
females: "names_females.csv" # Output file for female names
|
||||
split_evaluation: true # Should the dataset be split into training and evaluation sets?
|
||||
split_by_gender: true # Should the dataset be split by gender?
|
||||
evaluation_fraction: 0.2 # Fraction of data to use for evaluation
|
||||
random_seed: 42 # Random seed for reproducibility
|
||||
max_dataset_size: null # Maximum size of the dataset to process, set to null for no limit
|
||||
balance_by_sex: false # Should the dataset be balanced by sex when limiting the dataset size?
|
||||
|
||||
# Logging configuration
|
||||
logging:
|
||||
level: "INFO"
|
||||
level: "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
|
||||
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
file_logging: true
|
||||
console_logging: true
|
||||
log_file: "pipeline.log"
|
||||
max_log_size: 10485760 # 10MB
|
||||
backup_count: 5
|
||||
file_logging: true # Enable logging to file
|
||||
console_logging: true # Enable logging to console
|
||||
log_file: "pipeline.log" # Log file name
|
||||
max_log_size: 10485760 # Maximum size of log file before rotation (10MB)
|
||||
backup_count: 5 # Number of backup log files to keep
|
||||
|
||||
+129
-109
@@ -1,128 +1,148 @@
|
||||
# Research Experiment Configuration Templates
|
||||
# These configurations can be used as starting points for different types of experiments
|
||||
|
||||
# Baseline Experiments Configuration
|
||||
baseline_experiments:
|
||||
- name: "baseline_logistic_regression_fullname"
|
||||
- name: "bigru"
|
||||
description: "Baseline BiGRU with full name features"
|
||||
model_type: "bigru"
|
||||
features: [ "full_name" ]
|
||||
model_params:
|
||||
max_len: 20
|
||||
embedding_dim: 64
|
||||
gru_units: 32
|
||||
epochs: 10
|
||||
batch_size: 32
|
||||
tags: [ "baseline", "neural", "bigru" ]
|
||||
|
||||
- name: "cnn"
|
||||
description: "Baseline CNN with character patterns"
|
||||
model_type: "cnn"
|
||||
features: [ "full_name" ]
|
||||
model_params:
|
||||
max_len: 20
|
||||
embedding_dim: 64
|
||||
filters: 64
|
||||
kernel_size: 3
|
||||
dropout: 0.5
|
||||
epochs: 10
|
||||
batch_size: 32
|
||||
tags: [ "baseline", "neural", "cnn" ]
|
||||
|
||||
- name: "ensemble"
|
||||
description: "Baseline Ensemble with multiple models"
|
||||
model_type: "ensemble"
|
||||
features: [ "full_name", "name_length", "word_count" ]
|
||||
model_params:
|
||||
base_models: [ "logistic_regression", "random_forest", "xgboost" ]
|
||||
voting: "soft"
|
||||
cv_folds: 5
|
||||
tags: [ "baseline", "ensemble" ]
|
||||
|
||||
- name: "lightgbm"
|
||||
description: "Baseline LightGBM with engineered features"
|
||||
model_type: "lightgbm"
|
||||
features: [ "full_name", "name_length", "word_count" ]
|
||||
model_params:
|
||||
n_estimators: 100
|
||||
max_depth: -1
|
||||
learning_rate: 0.1
|
||||
num_leaves: 31
|
||||
subsample: 0.8
|
||||
colsample_bytree: 0.8
|
||||
tags: [ "baseline", "lightgbm" ]
|
||||
|
||||
- name: "logistic_regression_fullname"
|
||||
description: "Baseline logistic regression with full name"
|
||||
model_type: "logistic_regression"
|
||||
features: ["full_name"]
|
||||
features: [ "full_name" ]
|
||||
model_params:
|
||||
ngram_range: [2, 5]
|
||||
max_features: 10000
|
||||
max_iter: 1000
|
||||
tags: ["baseline", "fullname"]
|
||||
tags: [ "baseline", "logistic_regression", "fullname" ]
|
||||
|
||||
- name: "baseline_logistic_regression_native"
|
||||
- name: "logistic_regression_native"
|
||||
description: "Logistic regression with native name only"
|
||||
model_type: "logistic_regression"
|
||||
features: ["native_name"]
|
||||
features: [ "native_name" ]
|
||||
model_params:
|
||||
ngram_range: [2, 4]
|
||||
max_features: 5000
|
||||
tags: ["baseline", "native"]
|
||||
tags: [ "baseline", "logistic_regression", "native" ]
|
||||
|
||||
- name: "baseline_rf_engineered"
|
||||
description: "Random Forest with engineered features"
|
||||
- name: "logistic_regression_surname"
|
||||
description: "Logistic regression with surname only"
|
||||
model_type: "logistic_regression"
|
||||
features: [ "surname" ]
|
||||
model_params:
|
||||
max_features: 5000
|
||||
tags: [ "baseline", "logistic_regression", "surname" ]
|
||||
|
||||
- name: "lstm"
|
||||
description: "Baseline LSTM with full name features"
|
||||
model_type: "lstm"
|
||||
features: [ "full_name" ]
|
||||
model_params:
|
||||
embedding_dim: 128
|
||||
lstm_units: 64
|
||||
epochs: 10
|
||||
batch_size: 64
|
||||
tags: [ "baseline", "neural", "lstm" ]
|
||||
|
||||
- name: "naive_bayes"
|
||||
description: "Baseline Naive Bayes with full name features"
|
||||
model_type: "naive_bayes"
|
||||
features: [ "full_name" ]
|
||||
model_params:
|
||||
max_features: 5000
|
||||
tags: [ "baseline", "naive_bayes" ]
|
||||
|
||||
- name: "random_forest"
|
||||
description: "Baseline Random Forest with engineered features"
|
||||
model_type: "random_forest"
|
||||
features: ["name_length", "word_count", "province"]
|
||||
features: [ "name_length", "word_count", "province" ]
|
||||
model_params:
|
||||
n_estimators: 100
|
||||
max_depth: 10
|
||||
tags: ["baseline", "engineered"]
|
||||
min_samples_split: 2
|
||||
min_samples_leaf: 1
|
||||
tags: [ "baseline", "random_forest", "engineered" ]
|
||||
|
||||
- name: "svm"
|
||||
description: "Baseline SVM with full name features"
|
||||
model_type: "svm"
|
||||
features: [ "full_name" ]
|
||||
model_params:
|
||||
C: 1.0
|
||||
kernel: "rbf"
|
||||
ngram_range: [ 2, 4 ]
|
||||
max_features: 5000
|
||||
tags: [ "baseline", "svm" ]
|
||||
|
||||
- name: "transformer"
|
||||
description: "Baseline Transformer with attention mechanism"
|
||||
model_type: "transformer"
|
||||
features: [ "full_name" ]
|
||||
model_params:
|
||||
embedding_dim: 128
|
||||
num_heads: 4
|
||||
num_layers: 2
|
||||
epochs: 10
|
||||
batch_size: 64
|
||||
tags: [ "baseline", "neural", "transformer" ]
|
||||
|
||||
- name: "xgboost"
|
||||
description: "Baseline XGBoost with engineered features"
|
||||
model_type: "xgboost"
|
||||
features: [ "full_name", "name_length", "word_count" ]
|
||||
model_params:
|
||||
n_estimators: 100
|
||||
max_depth: 6
|
||||
learning_rate: 0.1
|
||||
subsample: 0.8
|
||||
colsample_bytree: 0.8
|
||||
tags: [ "baseline", "xgboost" ]
|
||||
|
||||
|
||||
# Advanced Experiments Configuration
|
||||
advanced_experiments:
|
||||
|
||||
# Feature Study Configurations
|
||||
feature_studies:
|
||||
- name: "native_vs_surname"
|
||||
description: "Compare native name vs surname effectiveness"
|
||||
experiments:
|
||||
- model_type: "logistic_regression"
|
||||
features: ["native_name"]
|
||||
tags: ["feature_study", "native"]
|
||||
- model_type: "logistic_regression"
|
||||
features: ["surname"]
|
||||
tags: ["feature_study", "surname"]
|
||||
|
||||
- name: "name_parts_analysis"
|
||||
description: "Analyze effectiveness of different name parts"
|
||||
experiments:
|
||||
- features: ["first_word"]
|
||||
tags: ["name_parts", "first"]
|
||||
- features: ["last_word"]
|
||||
tags: ["name_parts", "last"]
|
||||
- features: ["name_beginnings"]
|
||||
feature_params:
|
||||
beginning_length: 3
|
||||
tags: ["name_parts", "beginnings"]
|
||||
- features: ["name_endings"]
|
||||
feature_params:
|
||||
ending_length: 3
|
||||
tags: ["name_parts", "endings"]
|
||||
|
||||
# Province-Specific Studies
|
||||
province_studies:
|
||||
- name: "kinshasa_study"
|
||||
description: "Gender prediction for Kinshasa province"
|
||||
model_type: "logistic_regression"
|
||||
features: ["full_name"]
|
||||
train_data_filter:
|
||||
province: "kinshasa"
|
||||
tags: ["province_study", "kinshasa"]
|
||||
|
||||
- name: "cross_province_generalization"
|
||||
description: "Train on one province, test on another"
|
||||
experiments:
|
||||
- train_filter: {"province": "kinshasa"}
|
||||
test_filter: {"province": "bas-congo"}
|
||||
tags: ["generalization", "kinshasa_to_bas-congo"]
|
||||
|
||||
# Model Comparison Studies
|
||||
model_comparisons:
|
||||
- name: "model_comparison_fullname"
|
||||
description: "Compare different models with full name"
|
||||
base_config:
|
||||
features: ["full_name"]
|
||||
tags: ["model_comparison"]
|
||||
models:
|
||||
- model_type: "logistic_regression"
|
||||
model_params:
|
||||
ngram_range: [2, 5]
|
||||
- model_type: "random_forest"
|
||||
# Note: RF will need different feature preparation
|
||||
features: ["name_length", "word_count", "province"]
|
||||
|
||||
# Advanced Feature Combinations
|
||||
advanced_features:
|
||||
- name: "multi_feature_combination"
|
||||
description: "Test various feature combinations"
|
||||
experiments:
|
||||
- features: ["full_name", "name_length"]
|
||||
tags: ["combination", "name_plus_length"]
|
||||
- features: ["native_name", "surname", "province"]
|
||||
tags: ["combination", "semantic_features"]
|
||||
- features: ["name_beginnings", "name_endings", "word_count"]
|
||||
tags: ["combination", "structural_features"]
|
||||
|
||||
# Hyperparameter Studies
|
||||
hyperparameter_studies:
|
||||
- name: "ngram_range_study"
|
||||
description: "Study effect of different n-gram ranges"
|
||||
base_config:
|
||||
model_type: "logistic_regression"
|
||||
features: ["full_name"]
|
||||
tags: ["hyperparameter", "ngram"]
|
||||
variants:
|
||||
- model_params: {"ngram_range": [1, 3]}
|
||||
- model_params: {"ngram_range": [2, 4]}
|
||||
- model_params: {"ngram_range": [2, 5]}
|
||||
- model_params: {"ngram_range": [3, 6]}
|
||||
|
||||
# Data Size Studies
|
||||
data_studies:
|
||||
- name: "learning_curve_study"
|
||||
description: "Study performance vs training data size"
|
||||
base_config:
|
||||
model_type: "logistic_regression"
|
||||
features: ["full_name"]
|
||||
tags: ["learning_curve"]
|
||||
data_sizes: [0.1, 0.25, 0.5, 0.75, 1.0] # Fractions of training data to use
|
||||
# Hyperparameter Tuning Configurations
|
||||
hyperparameter_tuning:
|
||||
|
||||
Reference in New Issue
Block a user