---
baseline_experiments:
# BiGRU Models
# Three runs that share one set of hyperparameters; only the input name field
# (full_name / native_name / surname) and the tags differ between them.
# NOTE(review): epochs is 2 in every run — presumably a quick-baseline
# setting; confirm before comparing against longer-trained models.
- name: "bigru"
  description: "Baseline BiGRU with full name features"
  model_type: "bigru"
  features: [ "full_name" ]
  model_params:
    embedding_dim: 64
    gru_units: 32
    epochs: 2
    batch_size: 32
  tags: [ "baseline", "neural", "bigru" ]

- name: "bigru_native"
  description: "Baseline BiGRU with native name features"
  model_type: "bigru"
  features: [ "native_name" ]
  model_params:
    embedding_dim: 64
    gru_units: 32
    epochs: 2
    batch_size: 32
  tags: [ "baseline", "neural", "bigru", "native" ]

- name: "bigru_surname"
  description: "Baseline BiGRU with surname features"
  model_type: "bigru"
  features: [ "surname" ]
  model_params:
    embedding_dim: 64
    gru_units: 32
    epochs: 2
    batch_size: 32
  tags: [ "baseline", "neural", "bigru", "surname" ]

# CNN Models
# Three runs with identical hyperparameters (64 filters, kernel size 3,
# dropout 0.5); only the input name field and the tags differ.
# NOTE(review): epochs is 2 in every run — presumably a quick-baseline
# setting; confirm before comparing against longer-trained models.
- name: "cnn"
  description: "Baseline CNN with character patterns"
  model_type: "cnn"
  features: [ "full_name" ]
  model_params:
    embedding_dim: 64
    filters: 64
    kernel_size: 3
    dropout: 0.5
    epochs: 2
    batch_size: 32
  tags: [ "baseline", "neural", "cnn" ]

- name: "cnn_native"
  description: "Baseline CNN with native name character patterns"
  model_type: "cnn"
  features: [ "native_name" ]
  model_params:
    embedding_dim: 64
    filters: 64
    kernel_size: 3
    dropout: 0.5
    epochs: 2
    batch_size: 32
  tags: [ "baseline", "neural", "cnn", "native" ]

- name: "cnn_surname"
  description: "Baseline CNN with surname character patterns"
  model_type: "cnn"
  features: [ "surname" ]
  model_params:
    embedding_dim: 64
    filters: 64
    kernel_size: 3
    dropout: 0.5
    epochs: 2
    batch_size: 32
  tags: [ "baseline", "neural", "cnn", "surname" ]

# LightGBM Models
# Three runs with identical hyperparameters; only the input name field and
# the tags differ.
# NOTE(review): max_depth -1 is LightGBM's "no depth limit" value, assuming
# model_params is forwarded to LightGBM unchanged — confirm in the trainer.
- name: "lightgbm"
  description: "Baseline LightGBM with engineered features"
  model_type: "lightgbm"
  features: [ "full_name" ]
  model_params:
    n_estimators: 100
    max_depth: -1
    learning_rate: 0.1
    num_leaves: 31
    subsample: 0.8
    colsample_bytree: 0.8
  tags: [ "baseline", "lightgbm" ]

- name: "lightgbm_native"
  description: "Baseline LightGBM with native name features"
  model_type: "lightgbm"
  features: [ "native_name" ]
  model_params:
    n_estimators: 100
    max_depth: -1
    learning_rate: 0.1
    num_leaves: 31
    subsample: 0.8
    colsample_bytree: 0.8
  tags: [ "baseline", "lightgbm", "native" ]

- name: "lightgbm_surname"
  description: "Baseline LightGBM with surname features"
  model_type: "lightgbm"
  features: [ "surname" ]
  model_params:
    n_estimators: 100
    max_depth: -1
    learning_rate: 0.1
    num_leaves: 31
    subsample: 0.8
    colsample_bytree: 0.8
  tags: [ "baseline", "lightgbm", "surname" ]

# Logistic Regression Models
# One run per input name field. The full-name run sets max_features to 10000;
# the native-name and surname runs use 5000.
# NOTE(review): confirm the max_features difference is intentional.
- name: "logistic_regression"
  description: "Baseline logistic regression with full name"
  model_type: "logistic_regression"
  features: [ "full_name" ]
  model_params:
    max_features: 10000
  tags: [ "baseline", "logistic_regression", "fullname" ]

- name: "logistic_regression_native"
  description: "Logistic regression with native name only"
  model_type: "logistic_regression"
  features: [ "native_name" ]
  model_params:
    max_features: 5000
  tags: [ "baseline", "logistic_regression", "native" ]

# NOTE(review): the description below reads "surname name only"; left as-is
# because the string may be used at runtime — reword if it is display-only.
- name: "logistic_regression_surname"
  description: "Logistic regression with surname name only"
  model_type: "logistic_regression"
  features: [ "surname" ]
  model_params:
    max_features: 5000
  tags: [ "baseline", "logistic_regression", "surname" ]

# LSTM Models
# Three runs with identical hyperparameters (embedding 128, 64 LSTM units,
# batch size 64); only the input name field and the tags differ.
# NOTE(review): epochs is 2 in every run — presumably a quick-baseline
# setting; confirm before comparing against longer-trained models.
- name: "lstm"
  description: "Baseline LSTM with full name features"
  model_type: "lstm"
  features: [ "full_name" ]
  model_params:
    embedding_dim: 128
    lstm_units: 64
    epochs: 2
    batch_size: 64
  tags: [ "baseline", "neural", "lstm" ]

- name: "lstm_native"
  description: "Baseline LSTM with native name features"
  model_type: "lstm"
  features: [ "native_name" ]
  model_params:
    embedding_dim: 128
    lstm_units: 64
    epochs: 2
    batch_size: 64
  tags: [ "baseline", "neural", "lstm", "native" ]

- name: "lstm_surname"
  description: "Baseline LSTM with surname features"
  model_type: "lstm"
  features: [ "surname" ]
  model_params:
    embedding_dim: 128
    lstm_units: 64
    epochs: 2
    batch_size: 64
  tags: [ "baseline", "neural", "lstm", "surname" ]

# Naive Bayes Models
# Three runs that all set max_features to 5000; only the input name field
# and the tags differ.
- name: "naive_bayes"
  description: "Baseline Naive Bayes with full name features"
  model_type: "naive_bayes"
  features: [ "full_name" ]
  model_params:
    max_features: 5000
  tags: [ "baseline", "naive_bayes" ]

- name: "naive_bayes_native"
  description: "Baseline Naive Bayes with native name features"
  model_type: "naive_bayes"
  features: [ "native_name" ]
  model_params:
    max_features: 5000
  tags: [ "baseline", "naive_bayes", "native" ]

- name: "naive_bayes_surname"
  description: "Baseline Naive Bayes with surname features"
  model_type: "naive_bayes"
  features: [ "surname" ]
  model_params:
    max_features: 5000
  tags: [ "baseline", "naive_bayes", "surname" ]

# Random Forest Models
# Three runs with identical hyperparameters (100 trees, max depth 10); only
# the input name field and the tags differ.
- name: "random_forest"
  description: "Baseline Random Forest with engineered features"
  model_type: "random_forest"
  features: [ "full_name" ]
  model_params:
    n_estimators: 100
    max_depth: 10
    min_samples_split: 2
    min_samples_leaf: 1
  tags: [ "baseline", "random_forest", "engineered" ]

- name: "random_forest_native"
  description: "Baseline Random Forest with native name engineered features"
  model_type: "random_forest"
  features: [ "native_name" ]
  model_params:
    n_estimators: 100
    max_depth: 10
    min_samples_split: 2
    min_samples_leaf: 1
  tags: [ "baseline", "random_forest", "engineered", "native" ]

- name: "random_forest_surname"
  description: "Baseline Random Forest with surname engineered features"
  model_type: "random_forest"
  features: [ "surname" ]
  model_params:
    n_estimators: 100
    max_depth: 10
    min_samples_split: 2
    min_samples_leaf: 1
  tags: [ "baseline", "random_forest", "engineered", "surname" ]

# Transformer Models
# Three runs with identical hyperparameters (embedding 128, 4 heads,
# 2 layers, batch size 64); only the input name field and the tags differ.
# NOTE(review): epochs is 2 in every run — presumably a quick-baseline
# setting; confirm before comparing against longer-trained models.
- name: "transformer"
  description: "Baseline Transformer with attention mechanism"
  model_type: "transformer"
  features: [ "full_name" ]
  model_params:
    embedding_dim: 128
    num_heads: 4
    num_layers: 2
    epochs: 2
    batch_size: 64
  tags: [ "baseline", "neural", "transformer" ]

- name: "transformer_native"
  description: "Baseline Transformer with native name attention mechanism"
  model_type: "transformer"
  features: [ "native_name" ]
  model_params:
    embedding_dim: 128
    num_heads: 4
    num_layers: 2
    epochs: 2
    batch_size: 64
  tags: [ "baseline", "neural", "transformer", "native" ]

- name: "transformer_surname"
  description: "Baseline Transformer with surname attention mechanism"
  model_type: "transformer"
  features: [ "surname" ]
  model_params:
    embedding_dim: 128
    num_heads: 4
    num_layers: 2
    epochs: 2
    batch_size: 64
  tags: [ "baseline", "neural", "transformer", "surname" ]

# XGBoost Models
# Three runs with identical hyperparameters (100 estimators, max depth 6,
# learning rate 0.1); only the input name field and the tags differ.
- name: "xgboost"
  description: "Baseline XGBoost with engineered features"
  model_type: "xgboost"
  features: [ "full_name" ]
  model_params:
    n_estimators: 100
    max_depth: 6
    learning_rate: 0.1
    subsample: 0.8
    colsample_bytree: 0.8
  tags: [ "baseline", "xgboost" ]

- name: "xgboost_native"
  description: "Baseline XGBoost with native name engineered features"
  model_type: "xgboost"
  features: [ "native_name" ]
  model_params:
    n_estimators: 100
    max_depth: 6
    learning_rate: 0.1
    subsample: 0.8
    colsample_bytree: 0.8
  tags: [ "baseline", "xgboost", "native" ]

- name: "xgboost_surname"
  description: "Baseline XGBoost with surname engineered features"
  model_type: "xgboost"
  features: [ "surname" ]
  model_params:
    n_estimators: 100
    max_depth: 6
    learning_rate: 0.1
    subsample: 0.8
    colsample_bytree: 0.8
  tags: [ "baseline", "xgboost", "surname" ]

# Advanced Experiments Configuration
# NOTE(review): no value is given, so this key parses as null. It is assumed
# to be a top-level sibling of the next key — confirm the nesting and that
# the loader tolerates null here.
advanced_experiments:

# Feature Study Configurations
# NOTE(review): no value is given, so this key parses as null — confirm the
# loader tolerates null here.
feature_studies:

# Hyperparameter Tuning Configurations
|
|
hyperparameter_tuning:
|