"""Build ExperimentConfig objects from YAML research templates."""

import logging
from typing import Dict, List

import yaml

from core.config.pipeline_config import PipelineConfig
from research.experiment import ExperimentConfig
from research.experiment.feature_extractor import FeatureType


class ExperimentBuilder:
    """Helper class to build experiment configurations from research templates."""

    # Single source of truth mapping the public experiment-type names to the
    # section keys used inside research_templates.yaml. Previously this dict
    # was duplicated in find_template() and get_templates(), which invited
    # drift between the two.
    _SECTION_BY_TYPE: Dict[str, str] = {
        "baseline": "baseline_experiments",
        "advanced": "advanced_experiments",
        "feature_study": "feature_studies",
        "tuning": "hyperparameter_tuning",
    }

    def __init__(self, config: PipelineConfig):
        # Pipeline configuration; only config.paths.configs_dir is read here.
        self.config = config

    def load_templates(self, templates: str = "research_templates.yaml") -> dict:
        """Load research templates from a YAML file.

        Args:
            templates: File name, resolved against ``config.paths.configs_dir``.

        Returns:
            The parsed YAML document as a dict.

        Raises:
            FileNotFoundError: If the templates file does not exist.
            yaml.YAMLError: If the file is not valid YAML.
        """
        try:
            # Explicit encoding so parsing does not depend on the platform default.
            with open(self.config.paths.configs_dir / templates, "r", encoding="utf-8") as file:
                return yaml.safe_load(file)
        except FileNotFoundError:
            logging.error(f"Templates file not found: {templates}")
            raise
        except yaml.YAMLError as e:
            logging.error(f"Error parsing templates file: {e}")
            raise

    @classmethod
    def find_template(cls, templates: dict, name: str, experiment_type: str = "baseline") -> dict:
        """Find an experiment configuration by name and type.

        Args:
            templates: Parsed templates document (as returned by load_templates).
            name: Model type, experiment name, or a substring of the name.
            experiment_type: One of ``baseline``, ``advanced``, ``feature_study``,
                ``tuning``.

        Returns:
            The matching experiment template dict.

        Raises:
            ValueError: If the type, section, or experiment cannot be found.
        """
        section_name = cls._SECTION_BY_TYPE.get(experiment_type)
        if not section_name:
            available_types = list(cls._SECTION_BY_TYPE.keys())
            raise ValueError(
                f"Unknown experiment type '{experiment_type}'. Available types: {available_types}"
            )

        if section_name not in templates:
            raise ValueError(f"Section '{section_name}' not found in templates")

        experiments = templates[section_name]

        # Match on the model type, a case-insensitive substring of the name
        # (which also covers an exact name match, so no separate equality test
        # is needed), or the conventional "<type>_<name>" template names.
        for experiment in experiments:
            if (
                experiment.get("model_type") == name
                or name.lower() in experiment.get("name", "").lower()
                or f"baseline_{name}" == experiment.get("name")
                or f"advanced_{name}" == experiment.get("name")
            ):
                return experiment

        # Not found: surface what IS available to make the error actionable.
        available_experiments = [
            exp.get("name", exp.get("model_type", "unknown")) for exp in experiments
        ]
        raise ValueError(
            f"Experiment '{name}' not found in '{experiment_type}' section. "
            f"Available experiments: {available_experiments}"
        )

    def get_templates(self, templates_path: str = "research_templates.yaml") -> Dict[str, List[Dict]]:
        """Return all available experiments from the templates, keyed by type.

        Missing sections come back as empty lists so callers can iterate safely.
        """
        templates = self.load_templates(templates_path)
        # Key order follows _SECTION_BY_TYPE: baseline, advanced,
        # feature_study, tuning — same order the previous literal used.
        return {
            exp_type: templates.get(section, [])
            for exp_type, section in self._SECTION_BY_TYPE.items()
        }

    @classmethod
    def from_template(cls, template_config: dict) -> ExperimentConfig:
        """Create an ExperimentConfig from a template dict.

        Unknown feature strings are logged and skipped rather than failing the
        whole experiment.
        """
        features = []
        for feature_str in template_config.get("features", []):
            try:
                features.append(FeatureType(feature_str))
            except ValueError:
                logging.warning(f"Unknown feature type: {feature_str}")

        return ExperimentConfig(
            name=template_config.get("name"),
            description=template_config.get("description"),
            model_type=template_config.get("model_type"),
            features=features,
            model_params=template_config.get("model_params", {}),
            tags=template_config.get("tags", []),
            test_size=template_config.get("test_size", 0.2),
            cross_validation_folds=template_config.get("cross_validation_folds", 5),
            train_data_filter=template_config.get("train_data_filter"),
        )
Available types: {available_types}" - ) - - if section_name not in templates: - raise ValueError(f"Section '{section_name}' not found in templates") - - experiments = templates[section_name] - - # Search for experiment by model name - for experiment in experiments: - # Check if this is the experiment we're looking for - # Look for experiments that match the model type or contain the name - if ( - experiment.get("model_type") == name - or name.lower() in experiment.get("name", "").lower() - or f"baseline_{name}" == experiment.get("name") - or f"advanced_{name}" == experiment.get("name") - ): - return experiment - - # If not found, list available experiments - available_experiments = [ - exp.get("name", exp.get("model_type", "unknown")) for exp in experiments - ] - raise ValueError( - f"Experiment '{name}' not found in '{experiment_type}' section. " - f"Available experiments: {available_experiments}" - ) - - def main(): parser = argparse.ArgumentParser(description="Train DRC Names Models using Research Templates") parser.add_argument("--name", type=str, required=True, help="Model name to train") @@ -79,14 +21,15 @@ def main(): try: # Setup pipeline configuration config = setup_config(config_path=args.config, env=args.env) + experiment_builder = ExperimentBuilder(config) # Load research templates logging.info(f"Loading research templates from: {args.templates}") - templates = load_research_templates(args.templates) + templates = experiment_builder.load_templates(args.templates) # Find the specific experiment configuration logging.info(f"Looking for experiment: name='{args.name}', type='{args.type}'") - experiment_config = find_experiment_config(templates, args.name, args.type) + experiment_config = experiment_builder.find_template(templates, args.name, args.type) logging.info(f"Found experiment: {experiment_config.get('name')}") logging.info(f"Description: {experiment_config.get('description')}") diff --git a/web/interfaces/experiments.py 
b/web/interfaces/experiments.py index 5278124..3a4afa5 100644 --- a/web/interfaces/experiments.py +++ b/web/interfaces/experiments.py @@ -1,8 +1,8 @@ -from typing import List, Dict, Any +from typing import List, Dict import streamlit as st -from core.utils.region_mapper import RegionMapper +from core.config.pipeline_config import PipelineConfig from research.experiment import ExperimentConfig, ExperimentStatus from research.experiment.experiment_builder import ExperimentBuilder from research.experiment.experiment_runner import ExperimentRunner @@ -13,18 +13,20 @@ from research.model_registry import list_available_models class Experiments: def __init__( - self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner + self, config: PipelineConfig, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner ): self.config = config self.experiment_tracker = experiment_tracker self.experiment_runner = experiment_runner + self.experiment_builder = ExperimentBuilder(config) def index(self): st.title("Experiments") - tab1, tab2, tab3 = st.tabs(["New Experiment", "Experiment List", "Batch Experiments"]) + tab1, tab2, tab3 = st.tabs( + ["Templates", "Experiments", "Batch Experiments"]) with tab1: - self.show_experiment_creation() + self.show_template_experiments() with tab2: self.show_experiment_list() @@ -32,151 +34,78 @@ class Experiments: with tab3: self.show_batch_experiments() - def show_experiment_creation(self): - """Show interface for creating new experiments""" - st.subheader("Create New Experiment") - - with st.form("new_experiment"): - col1, col2 = st.columns(2) - - with col1: - exp_name = st.text_input( - "Experiment Name", placeholder="e.g., native_name_gender_prediction" - ) - description = st.text_area( - "Description", placeholder="Brief description of the experiment" - ) - model_type = st.selectbox("Model Type", list_available_models()) - - # Feature selection - feature_options = [f.value for f in FeatureType] - 
selected_features = st.multiselect( - "Features to Use", feature_options, default=["full_name"] - ) - - with col2: - # Model parameters - st.write("**Model Parameters**") - model_params = {} - if model_type == "logistic_regression": - ngram_min = st.number_input("N-gram Min", 1, 5, 2) - ngram_max = st.number_input("N-gram Max", 2, 8, 5) - max_features = st.number_input("Max Features", 1000, 50000, 10000) - model_params = { - "ngram_range": [ngram_min, ngram_max], - "max_features": max_features, - } - elif model_type == "random_forest": - n_estimators = st.number_input("Number of Trees", 10, 500, 100) - max_depth = st.number_input("Max Depth", 1, 20, 10) - model_params = { - "n_estimators": n_estimators, - "max_depth": max_depth if max_depth > 0 else None, - } - - # Training parameters - st.write("**Training Parameters**") - test_size = st.slider("Test Set Size", 0.1, 0.5, 0.2) - cv_folds = st.number_input("Cross-Validation Folds", 3, 10, 5) - - tags = st.text_input( - "Tags (comma-separated)", placeholder="e.g., baseline, feature_study" - ) - - # Advanced options - with st.expander("Advanced Options"): - # Data filters - st.write("**Data Filters**") - filter_province = st.selectbox( - "Filter by Province (optional)", - ["None"] + RegionMapper().get_provinces(), - ) - - min_words = st.number_input("Minimum Word Count", 0, 10, 0) - max_words = st.number_input("Maximum Word Count (0 = no limit)", 0, 20, 0) - - submitted = st.form_submit_button("Create and Run Experiment", type="primary") - - if submitted: - self._handle_experiment_submission( - exp_name, - description, - model_type, - selected_features, - model_params, - test_size, - cv_folds, - tags, - filter_province, - min_words, - max_words, - ) - - def _handle_experiment_submission( - self, - exp_name: str, - description: str, - model_type: str, - selected_features: List[str], - model_params: Dict[str, Any], - test_size: float, - cv_folds: int, - tags: str, - filter_province: str, - min_words: int, - max_words: 
int, - ): - """Handle experiment form submission""" - if not exp_name: - st.error("Please provide an experiment name") - return - - if not selected_features: - st.error("Please select at least one feature") - return + def show_template_experiments(self): + """Show interface for running predefined template experiments""" + st.subheader("Template Experiments") + st.write("Run predefined experiments based on research templates.") try: - # Prepare data filters - train_filter = {} - if filter_province != "None": - train_filter["province"] = filter_province - if min_words > 0: - train_filter["words"] = {"min": min_words} - if max_words > 0: - if "words" in train_filter: - train_filter["words"]["max"] = max_words - else: - train_filter["words"] = {"max": max_words} + available_experiments = self.experiment_builder.get_templates() - # Create experiment config - features = [FeatureType(f) for f in selected_features] - tag_list = [tag.strip() for tag in tags.split(",") if tag.strip()] + # Create tabs for different experiment types + exp_tabs = st.tabs(["Baseline", "Advanced", "Feature Studies", "Hyperparameter Tuning"]) - config = ExperimentConfig( - name=exp_name, - description=description, - tags=tag_list, - model_type=model_type, - model_params=model_params, - features=features, - train_data_filter=train_filter if train_filter else None, - test_size=test_size, - cross_validation_folds=cv_folds, - ) + with exp_tabs[0]: + self._show_experiments_by_type(available_experiments["baseline"], "baseline") - # Run experiment - with st.spinner("Running experiment..."): - experiment_id = self.experiment_runner.run_experiment(config) + with exp_tabs[1]: + self._show_experiments_by_type(available_experiments["advanced"], "advanced") - st.success(f"Experiment completed successfully!") - st.info(f"Experiment ID: `{experiment_id}`") + with exp_tabs[2]: + self._show_experiments_by_type(available_experiments["feature_study"], "feature_study") - # Show results - experiment = 
self.experiment_tracker.get_experiment(experiment_id) - if experiment and experiment.test_metrics: - st.write("**Results:**") - for metric, value in experiment.test_metrics.items(): - st.metric(metric.title(), f"{value:.4f}") + with exp_tabs[3]: + self._show_experiments_by_type(available_experiments["tuning"], "tuning") + + except Exception as e: + st.error(f"Error loading experiment templates: {e}") + st.info("Make sure the research templates file exists at `config/research_templates.yaml`") + + def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str): + """Show experiments for a specific type""" + if not experiments: + st.info(f"No {experiment_type} experiments available in templates.") + return + + st.write(f"**{experiment_type.title()} Experiments**") + + # Show available experiments + for i, exp_template in enumerate(experiments): + exp_name = exp_template.get("name", f"Experiment {i + 1}") + exp_description = exp_template.get("description", "No description available") + + with st.expander(f"📊 {exp_name} - {exp_description}"): + col1, col2 = st.columns([2, 1]) + + with col1: + st.json(exp_template) + + with col2: + if st.button(f"🚀 Run Experiment", key=f"run_{experiment_type}_{i}"): + self._run_template_experiment(exp_template) + + def _run_template_experiment(self, exp_template: Dict): + """Run a template experiment""" + try: + with st.spinner(f"Running {exp_template.get('name')}..."): + # Create experiment config from template + experiment_config = self.experiment_builder.from_template(exp_template) + + # Run the experiment + experiment_id = self.experiment_runner.run_experiment(experiment_config) + st.success(f"Experiment '{experiment_config.name}' completed successfully!") + st.info(f"Experiment ID: `{experiment_id}`") + + # Show results + experiment = self.experiment_tracker.get_experiment(experiment_id) + if experiment and experiment.test_metrics: + st.write("**Results:**") + col1, col2, col3 = st.columns(3) + + metrics = 
list(experiment.test_metrics.items()) + for i, (metric, value) in enumerate(metrics): + with [col1, col2, col3][i % 3]: + st.metric(metric.title(), f"{value:.4f}") except Exception as e: st.error(f"Error running experiment: {e}") @@ -261,6 +190,85 @@ class Experiments: st.subheader("Batch Experiments") st.write("Run multiple experiments with different parameter combinations.") + # Add option to run template batch experiments + batch_type = st.radio("Batch Type", ["Template Batch", "Custom Parameter Sweep"]) + + if batch_type == "Template Batch": + self._show_template_batch_experiments() + else: + self._show_custom_batch_experiments() + + def _show_template_batch_experiments(self): + """Show interface for running batch experiments from templates""" + st.write("**Run Multiple Template Experiments**") + + try: + available_experiments = self.experiment_builder.get_templates() + + # Select experiment types to run + experiment_types = st.multiselect( + "Select Experiment Types", + ["baseline", "advanced", "feature_study", "tuning"], + default=["baseline"] + ) + + if experiment_types: + selected_experiments = [] + + for exp_type in experiment_types: + experiments = available_experiments.get(exp_type, []) + if experiments: + st.write(f"**{exp_type.title()} Experiments:**") + exp_names = [exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)] + selected_names = st.multiselect( + f"Select {exp_type} experiments", + exp_names, + key=f"select_{exp_type}" + ) + + for name in selected_names: + for exp in experiments: + if exp.get("name") == name: + selected_experiments.append(exp) + + if st.button("🚀 Run Selected Template Experiments"): + self._run_template_batch_experiments(selected_experiments) + + except Exception as e: + st.error(f"Error loading templates for batch experiments: {e}") + + def _run_template_batch_experiments(self, selected_experiments: List[Dict]): + """Run batch experiments from templates""" + if not selected_experiments: + st.warning("No 
experiments selected") + return + + with st.spinner(f"Running {len(selected_experiments)} template experiments..."): + try: + experiment_configs = [] + for exp_template in selected_experiments: + config = self.experiment_builder.from_template(exp_template) + experiment_configs.append(config) + + # Run batch experiments + experiment_ids = self.experiment_runner.run_experiment_batch(experiment_configs) + + st.success(f"Completed {len(experiment_ids)} template experiments!") + + # Show summary + if experiment_ids: + comparison = self.experiment_runner.compare_experiments(experiment_ids) + st.write("**Template Batch Results:**") + st.dataframe( + comparison[["name", "model_type", "test_accuracy"]], + use_container_width=True, + ) + + except Exception as e: + st.error(f"Error running template batch experiments: {e}") + + def _show_custom_batch_experiments(self): + """Show interface for custom parameter sweep experiments""" # Parameter sweep configuration with st.form("batch_experiments"): st.write("**Parameter Sweep Configuration**") @@ -290,7 +298,7 @@ class Experiments: tags = st.text_input("Common Tags", "parameter_sweep,batch") - if st.form_submit_button("🚀 Run Batch Experiments"): + if st.form_submit_button("🚀 Run Parameter Sweep"): self.run_batch_experiments( base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags )