124 lines
4.5 KiB
Python
124 lines
4.5 KiB
Python
from typing import List, Optional

from research.experiment import ExperimentConfig
from research.experiment.feature_extractor import FeatureType
|
|
|
|
|
|
class ExperimentBuilder:
    """Build predefined lists of ``ExperimentConfig`` objects.

    Every method is a static method that constructs and returns a new list on
    each call; nothing is cached or shared between calls.
    """

    # Provinces used by create_province_specific_study() when none are given.
    DEFAULT_PROVINCES = ("kinshasa", "bas-congo", "bandundu", "katanga")

    @staticmethod
    def create_baseline_experiments() -> List[ExperimentConfig]:
        """Create a set of baseline experiments for comparison.

        Returns:
            Four configurations, in this order: logistic regression on the
            full name, on the native name only, on the surname only, and a
            random forest on the engineered features (name length, word
            count, province).
        """
        return [
            # Full name experiments
            ExperimentConfig(
                name="baseline_logistic_regression_fullname",
                description="Logistic regression with full name",
                model_type="logistic_regression",
                features=[FeatureType.FULL_NAME],
                tags=["baseline", "fullname"],
            ),
            # Native name only
            ExperimentConfig(
                name="baseline_logistic_regression_native",
                description="Logistic regression with native name only",
                model_type="logistic_regression",
                features=[FeatureType.NATIVE_NAME],
                tags=["baseline", "native"],
            ),
            # Surname only
            ExperimentConfig(
                name="baseline_logistic_regression_surname",
                description="Logistic regression with surname only",
                model_type="logistic_regression",
                features=[FeatureType.SURNAME],
                tags=["baseline", "surname"],
            ),
            # Random Forest with engineered features
            ExperimentConfig(
                name="baseline_rf_engineered",
                description="Random Forest with engineered features",
                model_type="random_forest",
                features=[
                    FeatureType.NAME_LENGTH,
                    FeatureType.WORD_COUNT,
                    FeatureType.PROVINCE,
                ],
                tags=["baseline", "engineered"],
            ),
        ]

    @staticmethod
    def create_feature_ablation_study() -> List[ExperimentConfig]:
        """Create experiments for a leave-one-out feature ablation study.

        Starting from four base features (full name, name length, word count,
        province), one logistic-regression experiment is produced per base
        feature, each trained on the other three.

        Returns:
            One configuration per base feature, in base-feature order. Each
            is named ``ablation_remove_<value>`` where ``<value>`` is the
            ``.value`` of the removed feature.
        """
        base_features = [
            FeatureType.FULL_NAME,
            FeatureType.NAME_LENGTH,
            FeatureType.WORD_COUNT,
            FeatureType.PROVINCE,
        ]

        experiments = []

        # Test removing each feature one by one
        for feature_to_remove in base_features:
            remaining_features = [f for f in base_features if f != feature_to_remove]
            # NOTE(review): assumes FeatureType members expose a string-like
            # ``.value`` (it is also passed as a tag) - confirm against the enum.
            removed_label = feature_to_remove.value

            experiments.append(
                ExperimentConfig(
                    name=f"ablation_remove_{removed_label}",
                    description=f"Ablation study: removed {removed_label}",
                    model_type="logistic_regression",
                    features=remaining_features,
                    tags=["ablation", removed_label],
                )
            )

        return experiments

    @staticmethod
    def create_name_component_study() -> List[ExperimentConfig]:
        """Create experiments to study different name components.

        Each experiment trains a logistic regression on exactly one
        name-derived feature.

        Returns:
            One configuration per component, in this order: first word, last
            word, native name, surname, name beginnings, name endings. Each
            is named ``component_study_<label>``.
        """
        # (feature, label) pairs; the label is used in the experiment name,
        # description and tags.
        name_components = [
            (FeatureType.FIRST_WORD, "first_word"),
            (FeatureType.LAST_WORD, "last_word"),
            (FeatureType.NATIVE_NAME, "native_name"),
            (FeatureType.SURNAME, "surname"),
            (FeatureType.NAME_BEGINNINGS, "name_beginnings"),
            (FeatureType.NAME_ENDINGS, "name_endings"),
        ]

        return [
            ExperimentConfig(
                name=f"component_study_{name}",
                description=f"Study of {name} for gender prediction",
                model_type="logistic_regression",
                features=[feature],
                tags=["component_study", name],
            )
            for feature, name in name_components
        ]

    @staticmethod
    def create_province_specific_study(
        provinces: Optional[List[str]] = None,
    ) -> List[ExperimentConfig]:
        """Create experiments for province-specific analysis.

        Args:
            provinces: Province identifiers to build one experiment each for.
                When ``None`` (the default), ``DEFAULT_PROVINCES`` is used.
                An empty list yields an empty result.

        Returns:
            One logistic-regression configuration on the full name per
            province, in input order, named ``province_study_<province>``.
            Each one sets ``train_data_filter={"province": province}``.
        """
        if provinces is None:
            provinces = ExperimentBuilder.DEFAULT_PROVINCES

        # NOTE(review): how ``train_data_filter`` is applied is defined by
        # ExperimentConfig and its consumers - confirm that the province
        # strings match the values stored in the data.
        return [
            ExperimentConfig(
                name=f"province_study_{province}",
                description=f"Gender prediction for {province} province only",
                model_type="logistic_regression",
                features=[FeatureType.FULL_NAME],
                train_data_filter={"province": province},
                tags=["province_study", province],
            )
            for province in provinces
        ]
|