feat: enhance training pipeline with research templates and experiment configuration

2025-08-08 23:48:55 +02:00
parent 96291b4ad0
commit 6d39c3afc1
9 changed files with 341 additions and 755 deletions
@@ -12,6 +12,7 @@ help: ## Show this help message
 # Creates the project virtual environment and installs the pinned dependencies.
 # Every recipe line runs in its own shell, so activating the venv inside the
 # recipe cannot persist to later lines (and `source` is not available in a
 # POSIX sh); the venv's own pip is therefore invoked by path instead.
 .PHONY: setup
 setup: ## Setup virtual environment and install dependencies
 	python -m venv .venv
 	.venv/bin/pip install --upgrade pip
 	.venv/bin/pip install -r requirements.txt
@@ -20,79 +21,6 @@ install: ## Install/update dependencies
 	pip install --upgrade pip
 	pip install -r requirements.txt
 .PHONY: install-dev
 install-dev: ## Install development dependencies
 	pip install -r requirements.txt
 	pip install jupyter notebook ipykernel pytest black flake8 mypy
 .PHONY: activate
 activate: ## Show activation command
 	@echo "Run: source .venv/bin/activate"
 # =============================================================================
 # MODEL TRAINING & ARTIFACTS
 # =============================================================================
 .PHONY: train-baseline
 train-baseline: ## Train all baseline models and save artifacts
 	python research/train.py --mode baseline
 .PHONY: train-neural
 train-neural: ## Train neural network models (LSTM, CNN, Transformer)
 	python research/train.py --mode neural
 # Trains a single model type and stores it under the given artifact name.
 # MODEL and NAME must both be supplied on the command line; the guards below
 # stop early with a clear message instead of passing empty option values on to
 # the training script.
 .PHONY: train-model
 train-model: ## Train specific model (use: make train-model MODEL=logistic_regression NAME=my_model)
 	$(if $(strip $(MODEL)),,$(error MODEL is required - usage: make train-model MODEL=logistic_regression NAME=my_model))
 	$(if $(strip $(NAME)),,$(error NAME is required - usage: make train-model MODEL=logistic_regression NAME=my_model))
 	python research/train.py --model-type $(MODEL) --name $(NAME)
 .PHONY: list-models
 list-models: ## List all saved model artifacts
 	python research/train.py --mode list
 # =============================================================================
 # RESEARCH & EXPERIMENTS
 # =============================================================================
 .PHONY: experiment
 experiment: ## Create sample experiment configuration
 	python research/cli.py run --name "sample_experiment" --features full_name --model-type logistic_regression
 .PHONY: baseline
 baseline: ## Run baseline experiments
 	python research/cli.py baseline
 .PHONY: ablation
 ablation: ## Run feature ablation study
 	python research/cli.py ablation
 .PHONY: components
 components: ## Run name component analysis
 	python research/cli.py components
 .PHONY: list-experiments
 list-experiments: ## List all experiments
 	python research/cli.py list
 .PHONY: list-completed
 list-completed: ## List completed experiments only
 	python research/cli.py list --status completed
 .PHONY: export-results
 export-results: ## Export all experiment results to CSV
 	python research/cli.py export --output results_$(shell date +%Y%m%d_%H%M%S).csv
 .PHONY: best-model
 best-model: ## Show best performing model
 	python research/cli.py list --status completed | head -5
 # =============================================================================
 # WEB INTERFACE
 # =============================================================================
 .PHONY: web
 web: ## Launch Streamlit web interface
 	streamlit run web/app.py --server.runOnSave true --server.port 8501
 # =============================================================================
 # DEVELOPMENT & CODE QUALITY
 # =============================================================================
@@ -113,10 +41,6 @@ type-check: ## Type check with mypy
 notebook: ## Start Jupyter notebook
 	jupyter notebook notebooks/
 .PHONY: lab
 lab: ## Start Jupyter lab
 	jupyter lab notebooks/
 # =============================================================================
 # DEPLOYMENT & PRODUCTION
 # =============================================================================
@@ -1,69 +1,20 @@
-# DRC Names Gender Prediction Pipeline: A Culturally-Aware NLP System for Congolese Name Analysis
+# A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference
-A comprehensive, research-friendly pipeline for analyzing Congolese names and predicting gender using culturally-aware machine learning models. 
+Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often
-This system provides advanced data processing, experiment management, and an intuitive web interface for non-technical users.
+underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training
 data.
 This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5
 million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
-## Overview
+## Getting Started
-Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training data. 
+### Installation & Setup
 This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 7 million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
-Our approach involves:
+The instructions and command-line snippets below are provided to help you set up the project environment quickly and
 efficiently,
 assuming you have Python 3.11 and Git installed and are working on a Unix-like system (Linux, macOS, etc.).
- **(1) Advanced data processing pipeline** with batching, checkpointing, and parallel processing
+**Using Makefile (Recommended)**
 - **(2) Modular experiment framework** for systematic model comparison and research iteration  
 - **(3) Multiple feature extraction strategies** leveraging name components, linguistic patterns, and demographic data
 - **(4) Culturally-aware gender prediction models** trained specifically on Congolese naming patterns
 - **(5) User-friendly web interface** enabling non-technical users to run experiments and make predictions
 - **(6) Comprehensive research tools** for reproducible experimentation and result analysis
 ## Key Features
 ### **Advanced Data Processing**
 - **Batched processing** with configurable batch sizes and parallel execution
 - **Automatic checkpointing** and resume capability for large datasets
 - **LLM-powered annotation** with rate limiting and retry logic
 - **Memory-efficient** chunked data loading for datasets of any size
 ### **Research-Friendly Experiment Framework**
 - **Modular model architecture** - easily add new models and features
 - **Systematic experiment tracking** with automatic result storage
 - **Feature ablation studies** and component analysis tools
 - **Cross-validation** and statistical significance testing
 - **Automated baseline comparisons** and performance analysis
 ### **Intuitive Web Interface**
 - **No-code experiment creation** with visual parameter selection
 - **Real-time monitoring** of data processing and training progress
 - **Interactive result visualization** with charts and comparisons
 - **Batch prediction capabilities** for CSV file upload and processing
 - **Model comparison tools** with automatic performance rankings
 ### **Comprehensive Analytics**
 - **Feature importance analysis** showing which name components matter most
 - **Province-specific studies** examining regional naming patterns
 - **Learning curve analysis** for understanding data requirements
 - **Prediction confidence scoring** and error analysis tools
 ## Quick Start
 ### Using Make Commands (Recommended)
 ```bash
 # Complete setup and basic processing
 make quick-start
 # Launch web interface
 make web
 # Run research workflow  
 make research-flow
 # Show all available commands
 make help
 ```
 ### Manual Installation
 ```bash
 git clone https://github.com/bernard-ng/drc-ners-nlp.git
@@ -71,246 +22,88 @@ cd drc-ners-nlp
 # Setup environment
 make setup
-make process
+make activate
 # Launch web interface
 make web
 ```
-## Usage
+**Manual Setup**
 ### Web Interface (Recommended for Non-Technical Users)
 Launch the Streamlit web application:
 ```bash
 make web
 ```
 The interface provides:
 - **Dashboard**: Overview of datasets and recent experiments
 - **Data Overview**: Interactive data exploration and statistics  
 - **Data Processing**: Monitor and control the processing pipeline
 - **Experiments**: Create and manage machine learning experiments
 - **Results & Analysis**: Compare models and analyze performance
 - **Predictions**: Make predictions on new names or upload CSV files
 - **Settings**: Configure the system and manage data
 ### Research & Experiments
 #### Quick Research Studies
 ```bash
 # Compare different approaches (full name vs native vs surname)
 make baseline
 # Analyze which name components are most effective
 make components  
 # Test feature importance through ablation study
 make ablation
 # View all experiment results
 make list-experiments
 # Export results for publication
 make export-results
 ```
 #### Custom Experiments
 ```bash
 # Run specific experiment via command line
 python research/cli.py run \
  --name "native_name_study" \
  --features native_name \
  --model-type logistic_regression \
  --description "Test native name effectiveness"
 # Compare multiple experiments
 python research/cli.py compare <exp_id_1> <exp_id_2>
 # View detailed results
 python research/cli.py show <experiment_id>
 ```
 ### Data Processing Pipeline
 #### Basic Processing (No LLM)
 ```bash
 make process-basic    # Fast processing without LLM annotation
 ```
 #### Complete Processing (With LLM)
 ```bash
 make process         # Full pipeline including LLM annotation
 make process-dev     # Development mode with smaller batches
 ```
 #### Monitor Progress
 ```bash
 make monitoring         # Show current pipeline status
 make status          # Show overall system status
 ```
 #### Resume Interrupted Processing
 ```bash
 make process-resume  # Resume from last checkpoint
 ```
 ### Available Models and Features
 #### Models
 - **Logistic Regression**: Character n-gram based classification
 - **Random Forest**: Engineered feature-based classification
 - **LSTM**: Sequential neural network (planned)
 - **Transformer**: Attention-based model (planned)
 #### Features
 - **Full Name**: Complete name as given
 - **Native Name**: Identified native/given name component  
 - **Surname**: Family name component
 - **Name Length**: Character count features
 - **Word Count**: Number of words in name
 - **Province**: Geographic/demographic features
 - **Name Beginnings/Endings**: Prefix/suffix patterns
 - **Character N-grams**: Linguistic pattern features
 ## Configuration
 ### Environment Configurations
 ```bash
-# Switch to development configuration (smaller batches, more logging)
+git clone https://github.com/bernard-ng/drc-ners-nlp.git
-make config-dev
+cd drc-ners-nlp
-# Switch to production configuration (optimized for performance) 
+# Setup environment
-make config-prod
+python -m venv .venv
 .venv/bin/pip install --upgrade pip
 .venv/bin/pip install -r requirements.txt
-# View current configuration
+pip install --upgrade pip
-make show-config
+pip install -r requirements.txt
 pip install jupyter notebook ipykernel pytest black flake8 mypy
 source .venv/bin/activate
 ```
-### Custom Configuration
+## Data Processing
-Edit configuration files in `config/`:
+This project includes a robust data processing pipeline designed to handle large datasets efficiently with batching,
- `pipeline.yaml` - Main configuration
+checkpointing, and parallel processing capabilities.
- `pipeline.development.yaml` - Development overrides  
+Steps are defined in the `drc-ners-nlp/processing/steps` directory, and the configuration that enables them is managed through
- `pipeline.production.yaml` - Production settings
+the `drc-ners-nlp/config/pipeline.yaml` file.
 **Pipeline Configuration**
 Example configuration:
 ```yaml
-processing:
+stages:
-  batch_size: 1000
+  - "data_cleaning"
-  max_workers: 4
+  - "feature_extraction"
-  
+  - "llm_annotation"
-llm:
+  - "data_splitting"
  model_name: "mistral:7b"
  requests_per_minute: 60
 data:
  split_evaluation: true
  split_by_gender: true
 ```
-## Research Capabilities
+**Running the Pipeline**
 ### Systematic Experimentation
 The framework supports systematic research through:
 1. **Baseline Studies**: Compare fundamental approaches
 2. **Feature Studies**: Test individual name components  
 3. **Ablation Studies**: Identify most important features
 4. **Cross-Province Analysis**: Test generalization across regions
 5. **Hyperparameter Optimization**: Systematic parameter tuning
 ### Reproducible Research
 - **Experiment Tracking**: All experiments automatically logged with full configuration
 - **Result Export**: CSV export for publication and further analysis
 - **Statistical Testing**: Cross-validation and confidence intervals
 - **Version Control**: Configuration-based approach enables easy replication
 ### Publication-Ready Output
 ```bash
-# Generate comprehensive results for publication
+python main.py --env development
 make research-flow
 make export-results
 # Get best models for each approach  
 make list-completed
 python research/cli.py list --status completed | head -10
 ```
-## Development
+## Experiments
 This project provides a modular experiment (model training and evaluation) framework for systematic model comparison and
 research iteration. Models are defined in the `drc-ners-nlp/research/models` directory.
 You can define model features, training parameters, and evaluation metrics in the `research_templates.yaml` file.
 **Running Experiments**
 ### Code Quality and Testing
 ```bash
-make format          # Format code with black
+python train.py --name="bigru" --type="baseline" --env="development"
-make lint           # Lint with flake8  
+python train.py --name="cnn" --type="baseline" --env="development"
-make check-deps     # Verify dependencies
+python train.py --name="lightgbm" --type="baseline" --env="development"
 python train.py --name="logistic_regression_fullname" --type="baseline" --env="development"
 python train.py --name="logistic_regression_native" --type="baseline" --env="development"
 python train.py --name="logistic_regression_surname" --type="baseline" --env="development"
 python train.py --name="lstm" --type="baseline" --env="development"
 python train.py --name="random_forest" --type="baseline" --env="development"
 python train.py --name="svm" --type="baseline" --env="development"
 python train.py --name="naive_bayes" --type="baseline" --env="development"
 python train.py --name="transformer" --type="baseline" --env="development"
 python train.py --name="xgboost" --type="baseline" --env="development"
 ```
-### Development Workflow
+## Web Interface
 This project includes a user-friendly web interface built with Streamlit, allowing non-technical users to run
 experiments and make predictions without needing to understand the underlying code.
 ### Running the Web Interface
 ```bash
-make daily-work     # Daily development setup
+streamlit run app.py
 make notebook       # Launch Jupyter for analysis
 make web-dev        # Launch web interface with auto-reload
 ```
-### Data Management
+## Contributors
 ```bash
 make check-data     # Verify all data files
 make data-stats     # Show dataset statistics
 make backup-data    # Create timestamped backup
 make clean-checkpoints  # Clean processing checkpoints
 ```
-## Project Structure
+<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
-
+  <img src="https://contrib.rocks/image?repo=bernard-ng/drc-ners-nlp" alt="contributors"/>
-```
+</a>
 ├── Makefile                    # All command shortcuts
 ├── streamlit_app.py           # Web interface application
 ├── config/                    # Configuration files
 │   ├── pipeline.yaml         # Main configuration
 │   ├── pipeline.development.yaml  # Dev settings
 │   └── pipeline.production.yaml   # Prod settings
 ├── core/                      # Core framework
 │   ├── config.py             # Configuration management
 │   ├── domain.py             # Domain-specific data
 │   └── utils.py              # Reusable utilities
 ├── processing/                # Data processing pipeline
 │   ├── main.py               # Main pipeline script
 │   ├── pipeline.py           # Pipeline framework
 │   ├── steps_config.py       # Configurable processing steps
 │   └── monitor.py            # Monitoring utilities
 ├── research/                  # Research and experiments
 │   ├── cli.py                # Command-line interface
 │   ├── experiment.py         # Experiment management
 │   ├── models.py             # Model implementations
 │   └── runner.py             # Experiment execution
 └── dataset/                   # Data files
    └── names.csv             # Raw dataset
 ```
 ## Citation
 If you use this pipeline in your research, please cite:
 ```bibtex
@software{drc_names_pipeline,
  title={DRC Names Gender Prediction Pipeline: A Culturally-Aware NLP System},
  author={Your Name},
  year={2025},
  url={https://github.com/bernard-ng/drc-ners-nlp}
 }
 ```
 ## License
 This project is licensed under the MIT License - see the LICENSE file for details.
 ## Acknowledgments
 - Democratic Republic of Congo population data contributors
 - Open source NLP and machine learning communities
 - Cultural linguistics research communities
@@ -1,205 +0,0 @@
 #!.venv/bin/python3
 import argparse
 import logging
 import sys
 from pathlib import Path
 import pandas as pd
 from core.config import setup_config
 from research.experiment.experiment_runner import ExperimentRunner
 from research.experiment.experiment_tracker import ExperimentTracker
 def list_experiments(args):
    """List tracked experiments, optionally filtered, and log a summary table.

    Filters are built from the ``status``, ``model_type`` and ``tags``
    attributes of ``args``. An attribute that is missing or falsy is simply
    not used as a filter, so the handler does not crash when the calling
    parser omits one of the options.

    Args:
        args: Parsed command-line namespace.

    Returns:
        None. The table (or a "nothing found" message) is emitted via logging.
    """
    tracker = ExperimentTracker()

    # Apply filters; getattr keeps the handler usable when an option was never
    # declared on the parser that produced ``args``.
    filters = {}
    status = getattr(args, "status", None)
    if status:
        from research.experiment import ExperimentStatus

        filters["status"] = ExperimentStatus(status)
    model_type = getattr(args, "model_type", None)
    if model_type:
        filters["model_type"] = model_type
    tags = getattr(args, "tags", None)
    if tags:
        filters["tags"] = tags

    experiments = tracker.list_experiments(**filters)
    if not experiments:
        logging.info("No experiments found matching criteria")
        return

    # Create summary table: one row per experiment, IDs shortened for display
    rows = []
    for exp in experiments:
        row = {
            "ID": exp.experiment_id[:12] + "...",
            "Name": exp.config.name,
            "Model": exp.config.model_type,
            "Status": exp.status.value,
            "Test Acc": f"{exp.test_metrics.get('accuracy', 0):.4f}" if exp.test_metrics else "N/A",
            "Start Time": exp.start_time.strftime("%Y-%m-%d %H:%M"),
        }
        rows.append(row)
    df = pd.DataFrame(rows)
    logging.info(df.to_string(index=False))
 def show_experiment_details(args):
    """Log the stored details of a single experiment.

    Looks the experiment up by ``args.experiment_id`` and logs its
    configuration summary, followed by whichever of these sections are
    present: test metrics, cross-validation metrics (mean ± std), the ten
    largest feature importances, and the first five prediction examples.

    Args:
        args: Parsed command-line namespace providing ``experiment_id``.
    """
    log = logging.info
    experiment = ExperimentTracker().get_experiment(args.experiment_id)
    if not experiment:
        logging.error(f"Experiment not found: {args.experiment_id}")
        return

    log("=== Experiment Details ===")
    log(f"ID: {experiment.experiment_id}")
    log(f"Name: {experiment.config.name}")
    log(f"Description: {experiment.config.description}")
    log(f"Model Type: {experiment.config.model_type}")
    log(f"Features: {', '.join(feat.value for feat in experiment.config.features)}")
    log(f"Status: {experiment.status.value}")
    log(f"Start Time: {experiment.start_time}")
    log(f"End Time: {experiment.end_time}")

    if experiment.test_metrics:
        log("=== Test Metrics ===")
        for name, score in experiment.test_metrics.items():
            log(f"{name}: {score:.4f}")

    if experiment.cv_metrics:
        log("=== Cross-Validation Metrics ===")
        for name, mean in experiment.cv_metrics.items():
            # "*_std" entries are printed next to their mean, not on their own
            if name.endswith("_std"):
                continue
            spread = experiment.cv_metrics.get(f"{name}_std", 0)
            log(f"{name}: {mean:.4f} ± {spread:.4f}")

    if experiment.feature_importance:
        log("=== Top 10 Feature Importances ===")
        ranked = sorted(
            experiment.feature_importance.items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        for feat_name, weight in ranked[:10]:
            log(f"{feat_name}: {weight:.4f}")

    if experiment.prediction_examples:
        log("=== Prediction Examples ===")
        for position, sample in enumerate(experiment.prediction_examples[:5], start=1):
            if sample["correct"]:
                mark = "✓"
            else:
                mark = "✗"
            log(
                f"{position}. {sample['name']} -> True: {sample['true_label']}, "
                f"Pred: {sample['predicted_label']} {mark}"
            )
 def compare_experiments_cmd(args):
    """Compare multiple experiments and log a table of their key columns.

    The runner is built from the configuration selected by the global
    ``--config`` / ``--env`` options when they are present on ``args``;
    otherwise it falls back to the "development" environment.

    Args:
        args: Parsed command-line namespace providing ``experiment_ids``.
    """
    config = setup_config(
        config_path=getattr(args, "config", None),
        env=getattr(args, "env", "development"),
    )
    runner = ExperimentRunner(config)
    comparison = runner.compare_experiments(args.experiment_ids)
    if comparison.empty:
        logging.info("No experiments found for comparison")
        return
    logging.info("=== Experiment Comparison ===")
    # Show key columns, skipping any that the comparison frame does not contain
    key_columns = ["name", "model_type", "features", "test_accuracy", "test_f1"]
    available_columns = [col for col in key_columns if col in comparison.columns]
    logging.info(comparison[available_columns].to_string(index=False))
 def export_results(args):
    """Export experiment results through the tracker and log the destination.

    Args:
        args: Parsed command-line namespace; ``output`` is the optional target
            path. When it is empty, ``None`` is handed to the tracker.
    """
    if args.output:
        destination = Path(args.output)
    else:
        destination = None
    written_to = ExperimentTracker().export_results(destination)
    logging.info(f"Results exported to: {written_to}")
 def main():
    """Main CLI entry point with unified configuration loading.

    Parses the global options and one sub-command (``list``, ``show``,
    ``compare`` or ``export``), loads the configuration, then dispatches to
    the matching handler.

    Returns:
        int: 0 when the command ran, 1 when no command was given or the
        command raised an exception.
    """
    parser = argparse.ArgumentParser(
        description="DRC Names Research Experiment Manager",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # Global arguments
    parser.add_argument("--config", type=Path, help="Path to configuration file")
    parser.add_argument(
        "--env", type=str, default="development",
        help="Environment name (default: development)"
    )
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose logging")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")
    # List experiments
    list_parser = subparsers.add_parser("list", help="List experiments")
    list_parser.add_argument("--status", choices=["pending", "running", "completed", "failed"])
    # The list handler filters on args.model_type, so the option must exist here
    list_parser.add_argument("--model-type", dest="model_type", help="Filter by model type")
    list_parser.add_argument("--tags", nargs="+", help="Filter by tags")
    # Show experiment details
    detail_parser = subparsers.add_parser("show", help="Show experiment details")
    detail_parser.add_argument("experiment_id", help="Experiment ID")
    # Compare experiments
    compare_parser = subparsers.add_parser("compare", help="Compare experiments")
    compare_parser.add_argument("experiment_ids", nargs="+", help="Experiment IDs to compare")
    # Export results
    export_parser = subparsers.add_parser("export", help="Export results to CSV")
    export_parser.add_argument("--output", help="Output file path")
    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        return 1
    try:
        # Load configuration and setup logging; the returned config object is
        # not needed here, the handlers build what they require themselves.
        setup_config(config_path=args.config, env=args.env)
        # Override log level if verbose requested
        if args.verbose:
            logging.getLogger().setLevel(logging.DEBUG)
        # Execute command
        command_map = {
            "list": list_experiments,
            "show": show_experiment_details,
            "compare": compare_experiments_cmd,
            "export": export_results,
        }
        handler = command_map.get(args.command)
        if handler:
            handler(args)
        return 0
    except Exception as e:
        logging.error(f"Command failed: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        return 1
 if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())
@@ -1,17 +1,12 @@
 # Development Environment Configuration
 # Settings for local development and testing
 name: "drc_names_pipeline"
 version: "1.0.0"
 environment: "development"
 debug: true
 # Processing settings
 processing:
-  batch_size: 100_000
+  batch_size: 10_000
  max_workers: 8
  checkpoint_interval: 10
-  use_multiprocessing: true  # Enable multiprocessing for CPU-bound tasks
+  use_multiprocessing: true
 # Pipeline stages
 stages:
@@ -20,7 +15,6 @@ stages:
  #- "llm_annotation"
  - "data_splitting"
 # LLM settings
 llm:
  model_name: "mistral:7b"
@@ -31,14 +25,10 @@ llm:
  max_concurrent_requests: 4
  enable_rate_limiting: true
-# Development data settings - limited dataset for faster testing
+# Data handling configuration
 data:
-  split_evaluation: true
+  max_dataset_size: 100_000
-  split_by_gender: true
+  balance_by_sex: true
  evaluation_fraction: 0.2
  random_seed: 42
  max_dataset_size: ~  # Limit to 10k records for development/testing
  balance_by_sex: false     # Balance male/female samples when limiting
 # Enhanced logging for development
 logging:
@@ -1,17 +1,12 @@
 # Production Environment Configuration
 # Optimized settings for production deployment
 name: "drc_names_pipeline"
 version: "1.0.0"
 environment: "production"
 debug: false
-# Production processing settings (optimized for performance)
+# Processing settings
 processing:
  batch_size: 10_000
  max_workers: 8
  checkpoint_interval: 10
-  use_multiprocessing: true  # Enable multiprocessing for CPU-bound tasks
+  use_multiprocessing: true
 # Pipeline stages
 stages:
@@ -20,7 +15,6 @@ stages:
  - "llm_annotation"
  - "data_splitting"
 # Production LLM settings
 llm:
  model_name: "mistral:7b"
@@ -31,19 +25,15 @@ llm:
  max_concurrent_requests: 4
  enable_rate_limiting: true
-# Production data settings
+# Data handling configuration
 data:
  split_evaluation: true
  split_by_gender: true
  evaluation_fraction: 0.2
  random_seed: 42
  max_dataset_size: null
  balance_by_sex: false
 # Production logging (less verbose)
 logging:
  level: "INFO"
-  console_logging: false  # Disable console in production
+  console_logging: false
  file_logging: true
  log_file: "pipeline.production.log"
  max_log_size: 52428800  # 50MB
@@ -1,72 +1,72 @@
 # DRC Names Processing Pipeline Configuration
 # Main configuration file with default settings
-name: "drc_names_pipeline"
+name: "drc_ners_pipeline"                 # Name of the pipeline
-version: "1.0.0"
+version: "1.0.0"                          # Version of the pipeline
-description: "DRC Names NLP Processing Pipeline"
+description: "DRC NERS NLP Processing"    # Description of the pipeline
-environment: "development"
+environment: "development"                # Environment type (development, production, etc.)
-debug: false
+debug: false                              # Enable debug mode for detailed logging and error reporting
 # Project directory structure
 paths:
-  root_dir: "."
+  root_dir: "."                           # Root directory of the project
-  configs_dir: "./config"
+  configs_dir: "./config"                 # Directory for configuration files
-  data_dir: "./data/dataset"
+  data_dir: "./data/dataset"              # Directory for dataset files
-  models_dir: "./data/models"
+  models_dir: "./data/models"             # Directory for model files
-  outputs_dir: "./data/outputs"
+  outputs_dir: "./data/outputs"           # Directory for output files
-  logs_dir: "./data/logs"
+  logs_dir: "./data/logs"                 # Directory for log files
-  checkpoints_dir: "./data/checkpoints"
+  checkpoints_dir: "./data/checkpoints"   # Directory for model checkpoints
 # Pipeline stages
-stages:
+stages:                                    # List of stages in the processing pipeline
-  - "data_cleaning"
+  - "data_cleaning"                        # Data cleaning stage
-  - "feature_extraction"
+  - "feature_extraction"                   # Feature extraction stage
-  - "llm_annotation"
+  - "llm_annotation"                       # LLM annotation stage (computationally intensive)
-  - "data_splitting"
+  - "data_splitting"                       # Data splitting stage
 # Data processing configuration
 processing:
-  batch_size: 1_000
+  batch_size: 1_000                        # Size of data batches to process at once
-  max_workers: 4
+  max_workers: 4                           # Number of worker threads for parallel processing
-  checkpoint_interval: 5
+  checkpoint_interval: 5                   # Interval for saving checkpoints during processing
-  use_multiprocessing: false
+  use_multiprocessing: false               # Enable multiprocessing for CPU-bound tasks
-  encoding_options:
+  encoding_options:                        # List of encodings to try when reading files
    - "utf-8"
    - "utf-16"
    - "latin1"
-  chunk_size: 100_000
+  chunk_size: 100_000                      # Size of data chunks to process in parallel
 # LLM annotation settings
 llm:
-  model_name: "mistral:7b"
+  model_name: "mistral:7b"                 # Name of the LLM model to use
-  requests_per_minute: 60
+  requests_per_minute: 60                  # Requests per minute to the LLM service
-  requests_per_second: 2
+  requests_per_second: 2                   # Requests per second to the LLM service
-  retry_attempts: 3
+  retry_attempts: 3                        # Number of retry attempts for LLM requests
-  timeout_seconds: 600
+  timeout_seconds: 600                     # Timeout for LLM requests
-  max_concurrent_requests: 2
+  max_concurrent_requests: 2               # Maximum concurrent requests to the LLM service
-  enable_rate_limiting: true
+  enable_rate_limiting: true               # Enable rate limiting to avoid overloading the LLM service
 # Data handling configuration
 data:
-  input_file: "names.csv"
+  input_file: "names.csv"                   # Input file containing names data
  output_files:
-    featured: "names_featured.csv"
+    featured: "names_featured.csv"          # Output file for featured data
-    evaluation: "names_evaluation.csv"
+    evaluation: "names_evaluation.csv"      # Output file for evaluation set
-    males: "names_males.csv"
+    males: "names_males.csv"                # Output files for male names
-    females: "names_females.csv"
+    females: "names_females.csv"            # Output files for female names
-  split_evaluation: true
+  split_evaluation: true                    # Should the dataset be split into training and evaluation sets ?
-  split_by_gender: true
+  split_by_gender: true                     # Should the dataset be split by gender ?
-  evaluation_fraction: 0.2
+  evaluation_fraction: 0.2                  # Fraction of data to use for evaluation
-  random_seed: 42
+  random_seed: 42                           # Random seed for reproducibility
-  max_dataset_size: null
+  max_dataset_size: null                    # Maximum size of the dataset to process; set to null for no limit
-  balance_by_sex: false
+  balance_by_sex: false                     # Should the dataset be balanced by sex when limiting the dataset size?
 # Logging configuration
 logging:
-  level: "INFO"
+  level: "INFO"                            # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-  file_logging: true
+  file_logging: true                       # Enable logging to file
-  console_logging: true
+  console_logging: true                    # Enable logging to console
-  log_file: "pipeline.log"
+  log_file: "pipeline.log"                 # Log file name
-  max_log_size: 10485760  # 10MB
+  max_log_size: 10485760                   # Maximum size of log file before rotation (10MB)
-  backup_count: 5
+  backup_count: 5                          # Number of backup log files to keep
@@ -1,128 +1,148 @@
 # Research Experiment Configuration Templates
 # These configurations can be used as starting points for different types of experiments
 # Baseline Experiments Configuration
 baseline_experiments:
-  - name: "baseline_logistic_regression_fullname"
+  - name: "bigru"
    description: "Baseline BiGRU with full name features"
    model_type: "bigru"
    features: [ "full_name" ]
    model_params:
      max_len: 20
      embedding_dim: 64
      gru_units: 32
      epochs: 10
      batch_size: 32
    tags: [ "baseline", "neural", "bigru" ]
  - name: "cnn"
    description: "Baseline CNN with character patterns"
    model_type: "cnn"
    features: [ "full_name" ]
    model_params:
      max_len: 20
      embedding_dim: 64
      filters: 64
      kernel_size: 3
      dropout: 0.5
      epochs: 10
      batch_size: 32
    tags: [ "baseline", "neural", "cnn" ]
  - name: "ensemble"
    description: "Baseline Ensemble with multiple models"
    model_type: "ensemble"
    features: [ "full_name", "name_length", "word_count" ]
    model_params:
      base_models: [ "logistic_regression", "random_forest", "xgboost" ]
      voting: "soft"
      cv_folds: 5
    tags: [ "baseline", "ensemble" ]
  - name: "lightgbm"
    description: "Baseline LightGBM with engineered features"
    model_type: "lightgbm"
    features: [ "full_name", "name_length", "word_count" ]
    model_params:
      n_estimators: 100
      max_depth: -1
      learning_rate: 0.1
      num_leaves: 31
      subsample: 0.8
      colsample_bytree: 0.8
    tags: [ "baseline", "lightgbm" ]
  - name: "logistic_regression_fullname"
    description: "Baseline logistic regression with full name"
    model_type: "logistic_regression"
    features: [ "full_name" ]
    model_params:
      ngram_range: [2, 5]
      max_features: 10000
-      max_iter: 1000
+    tags: [ "baseline", "logistic_regression", "fullname" ]
    tags: ["baseline", "fullname"]
-  - name: "baseline_logistic_regression_native"
+  - name: "logistic_regression_native"
    description: "Logistic regression with native name only"
    model_type: "logistic_regression"
    features: [ "native_name" ]
    model_params:
      ngram_range: [2, 4]
      max_features: 5000
-    tags: ["baseline", "native"]
+    tags: [ "baseline", "logistic_regression", "native" ]
-  - name: "baseline_rf_engineered"
+  - name: "logistic_regression_surname"
-    description: "Random Forest with engineered features"
+    description: "Logistic regression with surname name only"
    model_type: "logistic_regression"
    features: [ "surname" ]
    model_params:
      max_features: 5000
    tags: [ "baseline", "logistic_regression", "surname" ]
  - name: "lstm"
    description: "Baseline LSTM with full name features"
    model_type: "lstm"
    features: [ "full_name" ]
    model_params:
      embedding_dim: 128
      lstm_units: 64
      epochs: 10
      batch_size: 64
    tags: [ "baseline", "neural", "lstm" ]
  - name: "naive_bayes"
    description: "Baseline Naive Bayes with full name features"
    model_type: "naive_bayes"
    features: [ "full_name" ]
    model_params:
      max_features: 5000
    tags: [ "baseline", "naive_bayes" ]
  - name: "random_forest"
    description: "Baseline Random Forest with engineered features"
    model_type: "random_forest"
    features: [ "name_length", "word_count", "province" ]
    model_params:
      n_estimators: 100
      max_depth: 10
-    tags: ["baseline", "engineered"]
+      min_samples_split: 2
      min_samples_leaf: 1
    tags: [ "baseline", "random_forest", "engineered" ]
  - name: "svm"
    description: "Baseline SVM with full name features"
    model_type: "svm"
    features: [ "full_name" ]
    model_params:
      C: 1.0
      kernel: "rbf"
      ngram_range: [ 2, 4 ]
      max_features: 5000
    tags: [ "baseline", "svm" ]
  - name: "transformer"
    description: "Baseline Transformer with attention mechanism"
    model_type: "transformer"
    features: [ "full_name" ]
    model_params:
      embedding_dim: 128
      num_heads: 4
      num_layers: 2
      epochs: 10
      batch_size: 64
    tags: [ "baseline", "neural", "transformer" ]
  - name: "xgboost"
    description: "Baseline XGBoost with engineered features"
    model_type: "xgboost"
    features: [ "full_name", "name_length", "word_count" ]
    model_params:
      n_estimators: 100
      max_depth: 6
      learning_rate: 0.1
      subsample: 0.8
      colsample_bytree: 0.8
    tags: [ "baseline", "xgboost" ]
 # Advanced Experiments Configuration
 advanced_experiments:
 # Feature Study Configurations
 feature_studies:
  - name: "native_vs_surname"
    description: "Compare native name vs surname effectiveness"
    experiments:
      - model_type: "logistic_regression"
        features: ["native_name"]
        tags: ["feature_study", "native"]
      - model_type: "logistic_regression"
        features: ["surname"]
        tags: ["feature_study", "surname"]
-  - name: "name_parts_analysis"
+# Hyperparameter Tuning Configurations
-    description: "Analyze effectiveness of different name parts"
+hyperparameter_tuning:
    experiments:
      - features: ["first_word"]
        tags: ["name_parts", "first"]
      - features: ["last_word"]
        tags: ["name_parts", "last"]
      - features: ["name_beginnings"]
        feature_params:
          beginning_length: 3
        tags: ["name_parts", "beginnings"]
      - features: ["name_endings"]
        feature_params:
          ending_length: 3
        tags: ["name_parts", "endings"]
 # Province-Specific Studies
 province_studies:
  - name: "kinshasa_study"
    description: "Gender prediction for Kinshasa province"
    model_type: "logistic_regression"
    features: ["full_name"]
    train_data_filter:
      province: "kinshasa"
    tags: ["province_study", "kinshasa"]
  - name: "cross_province_generalization"
    description: "Train on one province, test on another"
    experiments:
      - train_filter: {"province": "kinshasa"}
        test_filter: {"province": "bas-congo"}
        tags: ["generalization", "kinshasa_to_bas-congo"]
 # Model Comparison Studies
 model_comparisons:
  - name: "model_comparison_fullname"
    description: "Compare different models with full name"
    base_config:
      features: ["full_name"]
      tags: ["model_comparison"]
    models:
      - model_type: "logistic_regression"
        model_params:
          ngram_range: [2, 5]
      - model_type: "random_forest"
        # Note: RF will need different feature preparation
        features: ["name_length", "word_count", "province"]
 # Advanced Feature Combinations
 advanced_features:
  - name: "multi_feature_combination"
    description: "Test various feature combinations"
    experiments:
      - features: ["full_name", "name_length"]
        tags: ["combination", "name_plus_length"]
      - features: ["native_name", "surname", "province"]
        tags: ["combination", "semantic_features"]
      - features: ["name_beginnings", "name_endings", "word_count"]
        tags: ["combination", "structural_features"]
 # Hyperparameter Studies
 hyperparameter_studies:
  - name: "ngram_range_study"
    description: "Study effect of different n-gram ranges"
    base_config:
      model_type: "logistic_regression"
      features: ["full_name"]
      tags: ["hyperparameter", "ngram"]
    variants:
      - model_params: {"ngram_range": [1, 3]}
      - model_params: {"ngram_range": [2, 4]}
      - model_params: {"ngram_range": [2, 5]}
      - model_params: {"ngram_range": [3, 6]}
 # Data Size Studies
 data_studies:
  - name: "learning_curve_study"
    description: "Study performance vs training data size"
    base_config:
      model_type: "logistic_regression"
      features: ["full_name"]
      tags: ["learning_curve"]
    data_sizes: [0.1, 0.25, 0.5, 0.75, 1.0]  # Fractions of training data to use
@@ -33,6 +33,7 @@ class ModelTrainer:
            model_type: str = "logistic_regression",
            features: List[str] = None,
            model_params: Dict[str, Any] = None,
            tags: List[str] = None,
            save_artifacts: bool = True,
    ) -> str:
        """
@@ -45,6 +46,10 @@ class ModelTrainer:
            features = ["full_name"]
        feature_types = [FeatureType(f) for f in features]
        # Prepare tags - combine default tags with template tags
        default_tags = ["training", model_type]
        experiment_tags = default_tags + (tags or [])
        # Create experiment configuration
        config = ExperimentConfig(
            name=model_name,
@@ -52,7 +57,7 @@ class ModelTrainer:
            model_type=model_type,
            features=feature_types,
            model_params=model_params or {},
-            tags=["training", model_type],
+            tags=experiment_tags,
        )
        # Run experiment
@@ -3,29 +3,98 @@ import argparse
 import logging
 import sys
 import traceback
 import yaml
 from pathlib import Path
 from core.config import setup_config
 from research.model_trainer import ModelTrainer
 def load_research_templates(templates_path: str = "config/research_templates.yaml") -> dict:
    """Load research experiment templates from a YAML file.

    Args:
        templates_path: Path to the YAML templates file.

    Returns:
        The parsed top-level content of the file. An empty file (or one that
        contains only comments) yields an empty dict instead of ``None`` so
        callers can safely test for section names.

    Raises:
        FileNotFoundError: If ``templates_path`` does not exist.
        yaml.YAMLError: If the file content is not valid YAML.
    """
    try:
        # Read as UTF-8 explicitly rather than relying on the platform default encoding.
        with open(templates_path, 'r', encoding="utf-8") as file:
            templates = yaml.safe_load(file)
    except FileNotFoundError:
        logging.error(f"Templates file not found: {templates_path}")
        raise
    except yaml.YAMLError as e:
        logging.error(f"Error parsing templates file: {e}")
        raise
    # safe_load returns None for an empty document; normalize to the annotated dict type.
    return {} if templates is None else templates
 def find_experiment_config(templates: dict, name: str, experiment_type: str) -> dict:
    """Find an experiment configuration by name and experiment type.

    ``experiment_type`` selects a section of ``templates``; ``name`` is then
    matched against the entries of that section in two passes:

    1. An entry whose ``name`` equals ``name`` (case-insensitively) wins.
    2. Otherwise the first entry, in file order, is returned whose
       ``model_type`` equals ``name``, whose ``name`` contains ``name``
       (case-insensitively), or whose ``name`` is ``baseline_<name>`` or
       ``advanced_<name>``.

    Args:
        templates: Parsed templates, mapping section names to lists of
            experiment dicts.
        name: Experiment name or model type to look for.
        experiment_type: One of "baseline", "advanced", "feature_study",
            "tuning".

    Returns:
        The matching experiment dict from the templates.

    Raises:
        ValueError: If ``experiment_type`` is unknown, the section is absent
            from ``templates``, or no entry matches (including when the
            section is declared but empty).
    """
    # Map type to section in templates
    type_mapping = {
        "baseline": "baseline_experiments",
        "advanced": "advanced_experiments",
        "feature_study": "feature_studies",
        "tuning": "hyperparameter_tuning"
    }

    section_name = type_mapping.get(experiment_type)
    if not section_name:
        available_types = list(type_mapping.keys())
        raise ValueError(f"Unknown experiment type '{experiment_type}'. Available types: {available_types}")

    if section_name not in templates:
        raise ValueError(f"Section '{section_name}' not found in templates")

    # A section declared without entries (a bare "advanced_experiments:") is
    # parsed as None; treat it, and any non-list value, as an empty section so
    # the caller gets the ValueError below instead of a TypeError.
    section = templates[section_name]
    if not isinstance(section, (list, tuple)):
        section = []
    # Ignore malformed entries that are not mappings.
    experiments = [exp for exp in section if isinstance(exp, dict)]

    needle = name.lower()

    def entry_name(experiment: dict) -> str:
        # Tolerate a missing or null "name" key.
        return str(experiment.get("name") or "")

    # Pass 1: an exact name match must not be shadowed by an earlier entry
    # whose name merely contains the requested name.
    for experiment in experiments:
        if entry_name(experiment).lower() == needle:
            return experiment

    # Pass 2: looser matching on model type, substring, or known prefixes.
    for experiment in experiments:
        exp_name = entry_name(experiment)
        if (experiment.get("model_type") == name or
            # An empty needle is a substring of everything; never let it match.
            (needle and needle in exp_name.lower()) or
            f"baseline_{name}" == exp_name or
            f"advanced_{name}" == exp_name):
            return experiment

    # If not found, list available experiments
    available_experiments = [exp.get("name", exp.get("model_type", "unknown")) for exp in experiments]
    raise ValueError(f"Experiment '{name}' not found in '{experiment_type}' section. "
                    f"Available experiments: {available_experiments}")
 def main():
-    parser = argparse.ArgumentParser(description="Train DRC Names Models")
+    parser = argparse.ArgumentParser(description="Train DRC Names Models using Research Templates")
-    parser.add_argument("--type", type=str, help="Specific model type to train")
+    parser.add_argument("--name", type=str, required=True, help="Model name to train")
-    parser.add_argument("--name", type=str, help="Model name")
+    parser.add_argument("--type", type=str, required=True, help="Experiment type")
    parser.add_argument("--config", type=str, help="Path to configuration file")
    parser.add_argument("--env", type=str, default="development", help="Environment name")
    parser.add_argument("--templates", type=str, default="config/research_templates.yaml")
    args = parser.parse_args()
    try:
        # Setup pipeline configuration
        config = setup_config(config_path=args.config, env=args.env)
        trainer = ModelTrainer(config)
-        # Train specific model
+        # Load research templates
        logging.info(f"Loading research templates from: {args.templates}")
        templates = load_research_templates(args.templates)
        # Find the specific experiment configuration
        logging.info(f"Looking for experiment: name='{args.name}', type='{args.type}'")
        experiment_config = find_experiment_config(templates, args.name, args.type)
        logging.info(f"Found experiment: {experiment_config.get('name')}")
        logging.info(f"Description: {experiment_config.get('description')}")
        logging.info(f"Features: {experiment_config.get('features')}")
        # Train the model using template configuration
        trainer = ModelTrainer(config)
        trainer.train_single_model(
-            model_name=args.name,
+            model_name=experiment_config.get("name"),
-            model_type=args.type,
+            model_type=experiment_config.get("model_type"),
-            features=["full_name"]
+            features=experiment_config.get("features"),
            model_params=experiment_config.get("model_params", {}),
            tags=experiment_config.get("tags", [])
        )
        logging.info("Training completed successfully!")
        return 0
    except Exception as e: