refactoring: add initial pipeline configuration and model classes

This commit is contained in:
2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
+46
View File
@@ -0,0 +1,46 @@
# Development Environment Configuration
# Settings for local development and debugging
name: "drc_names_pipeline"
version: "1.0.0"
environment: "development"
debug: true
# Processing settings
processing:
batch_size: 100000  # plain integer: underscore separators are not valid YAML 1.2 and parse as a string
max_workers: 8
checkpoint_interval: 10
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
# Pipeline stages
stages:
- "data_cleaning"
- "feature_extraction"
#- "llm_annotation"
- "data_splitting"
# Development LLM settings
llm:
model_name: "mistral:7b"
requests_per_minute: 120
requests_per_second: 3
retry_attempts: 3
timeout_seconds: 45
max_concurrent_requests: 4
enable_rate_limiting: true
# Development data settings
data:
split_evaluation: true
split_by_gender: true
evaluation_fraction: 0.2
random_seed: 42
# Enhanced logging for development
logging:
level: "INFO"
console_logging: true
file_logging: true
log_file: "pipeline.development.log"
+48
View File
@@ -0,0 +1,48 @@
# Production Environment Configuration
# Optimized settings for production deployment
name: "drc_names_pipeline"
version: "1.0.0"
environment: "production"
debug: false
# Production processing settings (optimized for performance)
processing:
batch_size: 10000  # plain integer: underscore separators are not valid YAML 1.2 and parse as a string
max_workers: 8
checkpoint_interval: 10
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
# Pipeline stages
stages:
- "data_cleaning"
- "feature_extraction"
- "llm_annotation"
- "data_splitting"
# Production LLM settings
llm:
model_name: "mistral:7b"
requests_per_minute: 360
requests_per_second: 3
retry_attempts: 3
timeout_seconds: 45
max_concurrent_requests: 4
enable_rate_limiting: true
# Production data settings
data:
split_evaluation: true
split_by_gender: true
evaluation_fraction: 0.2
random_seed: 42
# Production logging (less verbose)
logging:
level: "INFO"
console_logging: false # Disable console in production
file_logging: true
log_file: "pipeline.production.log"
max_log_size: 52428800 # 50MB
backup_count: 10
+70
View File
@@ -0,0 +1,70 @@
# DRC Names Processing Pipeline Configuration
# Main configuration file with default settings
name: "drc_names_pipeline"
version: "1.0.0"
description: "DRC Names NLP Processing Pipeline"
environment: "development"
debug: false
# Project directory structure
paths:
root_dir: "."
configs_dir: "./config"
data_dir: "./data/dataset"
models_dir: "./data/models"
outputs_dir: "./data/outputs"
logs_dir: "./data/logs"
checkpoints_dir: "./data/checkpoints"
# Pipeline stages
stages:
- "data_cleaning"
- "feature_extraction"
- "llm_annotation"
- "data_splitting"
# Data processing configuration
processing:
batch_size: 1000  # plain integer: underscore separators are not valid YAML 1.2 and parse as a string
max_workers: 4
checkpoint_interval: 5
use_multiprocessing: false
encoding_options:
- "utf-8"
- "utf-16"
- "latin1"
chunk_size: 100000  # plain integer: underscore separators are not valid YAML 1.2 and parse as a string
# LLM annotation settings
llm:
model_name: "mistral:7b"
requests_per_minute: 60
requests_per_second: 2
retry_attempts: 3
timeout_seconds: 600
max_concurrent_requests: 2
enable_rate_limiting: true
# Data handling configuration
data:
input_file: "names.csv"
output_files:
featured: "names_featured.csv"
evaluation: "names_evaluation.csv"
males: "names_males.csv"
females: "names_females.csv"
split_evaluation: true
split_by_gender: true
evaluation_fraction: 0.2
random_seed: 42
# Logging configuration
logging:
level: "INFO"
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
file_logging: true
console_logging: true
log_file: "pipeline.log"
max_log_size: 10485760 # 10MB
backup_count: 5
+36
View File
@@ -0,0 +1,36 @@
## Instructions:
Identify the identified_name (native Congolese part) and identified_surname (non-native, French or English part) from the provided full name.
Return null for any part that cannot be identified. Do not alter the original name: do not change its case or add any additional information.
## Examples:
```
"tshabu ngandu"
{
"identified_name": "tshabu ngandu",
"identified_surname": null
}
"bapite marie"
{
"identified_name": "bapite",
"identified_surname": "marie"
}
"tshisekedi mulumba jean claude"
{
"identified_name": "tshisekedi mulumba",
"identified_surname": "jean claude"
}
"ilunga wa makuta jean-marie"
{
"identified_name": "ilunga wa makuta",
"identified_surname": "jean-marie"
}
"ntumba wasokadio marie france"
{
"identified_name": "ntumba wasokadio",
"identified_surname": "marie france"
}
```
+128
View File
@@ -0,0 +1,128 @@
# Research Experiment Configuration Templates
# These configurations can be used as starting points for different types of experiments
# Baseline Experiments Configuration
baseline_experiments:
- name: "baseline_logistic_regression_fullname"
description: "Baseline logistic regression with full name"
model_type: "logistic_regression"
features: ["full_name"]
model_params:
ngram_range: [2, 5]
max_features: 10000
max_iter: 1000
tags: ["baseline", "fullname"]
- name: "baseline_logistic_regression_native"
description: "Logistic regression with native name only"
model_type: "logistic_regression"
features: ["native_name"]
model_params:
ngram_range: [2, 4]
max_features: 5000
tags: ["baseline", "native"]
- name: "baseline_rf_engineered"
description: "Random Forest with engineered features"
model_type: "random_forest"
features: ["name_length", "word_count", "province"]
model_params:
n_estimators: 100
max_depth: 10
tags: ["baseline", "engineered"]
# Feature Study Configurations
feature_studies:
- name: "native_vs_surname"
description: "Compare native name vs surname effectiveness"
experiments:
- model_type: "logistic_regression"
features: ["native_name"]
tags: ["feature_study", "native"]
- model_type: "logistic_regression"
features: ["surname"]
tags: ["feature_study", "surname"]
- name: "name_parts_analysis"
description: "Analyze effectiveness of different name parts"
experiments:
- features: ["first_word"]
tags: ["name_parts", "first"]
- features: ["last_word"]
tags: ["name_parts", "last"]
- features: ["name_beginnings"]
feature_params:
beginning_length: 3
tags: ["name_parts", "beginnings"]
- features: ["name_endings"]
feature_params:
ending_length: 3
tags: ["name_parts", "endings"]
# Province-Specific Studies
province_studies:
- name: "kinshasa_study"
description: "Gender prediction for Kinshasa province"
model_type: "logistic_regression"
features: ["full_name"]
train_data_filter:
province: "kinshasa"
tags: ["province_study", "kinshasa"]
- name: "cross_province_generalization"
description: "Train on one province, test on another"
experiments:
- train_filter: {"province": "kinshasa"}
test_filter: {"province": "bas-congo"}
tags: ["generalization", "kinshasa_to_bas-congo"]
# Model Comparison Studies
model_comparisons:
- name: "model_comparison_fullname"
description: "Compare different models with full name"
base_config:
features: ["full_name"]
tags: ["model_comparison"]
models:
- model_type: "logistic_regression"
model_params:
ngram_range: [2, 5]
- model_type: "random_forest"
# Note: RF will need different feature preparation
features: ["name_length", "word_count", "province"]
# Advanced Feature Combinations
advanced_features:
- name: "multi_feature_combination"
description: "Test various feature combinations"
experiments:
- features: ["full_name", "name_length"]
tags: ["combination", "name_plus_length"]
- features: ["native_name", "surname", "province"]
tags: ["combination", "semantic_features"]
- features: ["name_beginnings", "name_endings", "word_count"]
tags: ["combination", "structural_features"]
# Hyperparameter Studies
hyperparameter_studies:
- name: "ngram_range_study"
description: "Study effect of different n-gram ranges"
base_config:
model_type: "logistic_regression"
features: ["full_name"]
tags: ["hyperparameter", "ngram"]
variants:
- model_params: {"ngram_range": [1, 3]}
- model_params: {"ngram_range": [2, 4]}
- model_params: {"ngram_range": [2, 5]}
- model_params: {"ngram_range": [3, 6]}
# Data Size Studies
data_studies:
- name: "learning_curve_study"
description: "Study performance vs training data size"
base_config:
model_type: "logistic_regression"
features: ["full_name"]
tags: ["learning_curve"]
data_sizes: [0.1, 0.25, 0.5, 0.75, 1.0] # Fractions of training data to use