# Research Experiment Configuration Templates
# These configurations can be used as starting points for different types of
# experiments.

# Baseline Experiments Configuration
# Each entry is one experiment: a model type, the input features it uses,
# the model's hyperparameters, and tags for grouping.
baseline_experiments:
  - name: "baseline_logistic_regression_fullname"
    description: "Baseline logistic regression with full name"
    model_type: "logistic_regression"
    features: ["full_name"]
    model_params:
      ngram_range: [2, 5]
      max_features: 10000
      max_iter: 1000
    tags: ["baseline", "fullname"]

  - name: "baseline_logistic_regression_native"
    description: "Logistic regression with native name only"
    model_type: "logistic_regression"
    features: ["native_name"]
    model_params:
      ngram_range: [2, 4]
      max_features: 5000
    tags: ["baseline", "native"]

  - name: "baseline_rf_engineered"
    description: "Random Forest with engineered features"
    model_type: "random_forest"
    features: ["name_length", "word_count", "province"]
    model_params:
      n_estimators: 100
      max_depth: 10
    tags: ["baseline", "engineered"]

# Feature Study Configurations
# Each study groups several experiments that differ in the features used.
feature_studies:
  - name: "native_vs_surname"
    description: "Compare native name vs surname effectiveness"
    experiments:
      - model_type: "logistic_regression"
        features: ["native_name"]
        tags: ["feature_study", "native"]
      - model_type: "logistic_regression"
        features: ["surname"]
        tags: ["feature_study", "surname"]

  - name: "name_parts_analysis"
    description: "Analyze effectiveness of different name parts"
    # These experiments set no model_type of their own.
    experiments:
      - features: ["first_word"]
        tags: ["name_parts", "first"]
      - features: ["last_word"]
        tags: ["name_parts", "last"]
      - features: ["name_beginnings"]
        feature_params:
          beginning_length: 3
        tags: ["name_parts", "beginnings"]
      - features: ["name_endings"]
        feature_params:
          ending_length: 3
        tags: ["name_parts", "endings"]

# Province-Specific Studies
province_studies:
  # Single experiment restricted to one province via train_data_filter.
  - name: "kinshasa_study"
    description: "Gender prediction for Kinshasa province"
    model_type: "logistic_regression"
    features: ["full_name"]
    train_data_filter:
      province: "kinshasa"
    tags: ["province_study", "kinshasa"]

  # Train and test sets are selected by separate province filters.
  - name: "cross_province_generalization"
    description: "Train on one province, test on another"
    experiments:
      - train_filter: {"province": "kinshasa"}
        test_filter: {"province": "bas-congo"}
        tags: ["generalization", "kinshasa_to_bas-congo"]

# Model Comparison Studies
model_comparisons:
  - name: "model_comparison_fullname"
    description: "Compare different models with full name"
    # NOTE(review): `tags` is assumed to belong to base_config alongside
    # `features` - confirm against the code that loads this file.
    base_config:
      features: ["full_name"]
      tags: ["model_comparison"]
    models:
      - model_type: "logistic_regression"
        model_params:
          ngram_range: [2, 5]
      - model_type: "random_forest"
        # Note: RF will need different feature preparation
        features: ["name_length", "word_count", "province"]

# Advanced Feature Combinations
# Each experiment lists the set of features to combine and its tags.
advanced_features:
  - name: "multi_feature_combination"
    description: "Test various feature combinations"
    experiments:
      - features: ["full_name", "name_length"]
        tags: ["combination", "name_plus_length"]
      - features: ["native_name", "surname", "province"]
        tags: ["combination", "semantic_features"]
      - features: ["name_beginnings", "name_endings", "word_count"]
        tags: ["combination", "structural_features"]

# Hyperparameter Studies
hyperparameter_studies:
  - name: "ngram_range_study"
    description: "Study effect of different n-gram ranges"
    # NOTE(review): `tags` is assumed to belong to base_config - confirm
    # against the code that loads this file.
    base_config:
      model_type: "logistic_regression"
      features: ["full_name"]
      tags: ["hyperparameter", "ngram"]
    # One variant per n-gram range to evaluate.
    variants:
      - model_params: {"ngram_range": [1, 3]}
      - model_params: {"ngram_range": [2, 4]}
      - model_params: {"ngram_range": [2, 5]}
      - model_params: {"ngram_range": [3, 6]}

# Data Size Studies
data_studies:
  - name: "learning_curve_study"
    description: "Study performance vs training data size"
    # NOTE(review): `tags` is assumed to belong to base_config and
    # `data_sizes` to the study itself - confirm against the loader.
    base_config:
      model_type: "logistic_regression"
      features: ["full_name"]
      tags: ["learning_curve"]
    data_sizes: [0.1, 0.25, 0.5, 0.75, 1.0]  # Fractions of training data to use