From 6d39c3afc17af3f4fd020b85343278711f74bca9 Mon Sep 17 00:00:00 2001 From: bernard-ng Date: Fri, 8 Aug 2025 23:48:55 +0200 Subject: [PATCH] feat: enhance training pipeline with research templates and experiment configuration --- Makefile | 78 +------ README.md | 351 +++++++------------------------ cli.py | 205 ------------------ config/pipeline.development.yaml | 20 +- config/pipeline.production.yaml | 18 +- config/pipeline.yaml | 94 ++++----- config/research_templates.yaml | 238 +++++++++++---------- research/model_trainer.py | 7 +- train.py | 85 +++++++- 9 files changed, 341 insertions(+), 755 deletions(-) delete mode 100755 cli.py diff --git a/Makefile b/Makefile index bf98a27..bda886b 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,7 @@ help: ## Show this help message .PHONY: setup setup: ## Setup virtual environment and install dependencies python -m venv .venv + source .venv/bin/activate .venv/bin/pip install --upgrade pip .venv/bin/pip install -r requirements.txt @@ -20,79 +21,6 @@ install: ## Install/update dependencies pip install --upgrade pip pip install -r requirements.txt -.PHONY: install-dev -install-dev: ## Install development dependencies - pip install -r requirements.txt - pip install jupyter notebook ipykernel pytest black flake8 mypy - -.PHONY: activate -activate: ## Show activation command - @echo "Run: source .venv/bin/activate" - -# ============================================================================= -# MODEL TRAINING & ARTIFACTS -# ============================================================================= - -.PHONY: train-baseline -train-baseline: ## Train all baseline models and save artifacts - python research/train.py --mode baseline - -.PHONY: train-neural -train-neural: ## Train neural network models (LSTM, CNN, Transformer) - python research/train.py --mode neural - -.PHONY: train-model -train-model: ## Train specific model (use: make train-model MODEL=logistic_regression NAME=my_model) - python research/train.py 
--model-type $(MODEL) --name $(NAME) - -.PHONY: list-models -list-models: ## List all saved model artifacts - python research/train.py --mode list - -# ============================================================================= -# RESEARCH & EXPERIMENTS -# ============================================================================= - -.PHONY: experiment -experiment: ## Create sample experiment configuration - python research/cli.py run --name "sample_experiment" --features full_name --model-type logistic_regression - -.PHONY: baseline -baseline: ## Run baseline experiments - python research/cli.py baseline - -.PHONY: ablation -ablation: ## Run feature ablation study - python research/cli.py ablation - -.PHONY: components -components: ## Run name component analysis - python research/cli.py components - -.PHONY: list-experiments -list-experiments: ## List all experiments - python research/cli.py list - -.PHONY: list-completed -list-completed: ## List completed experiments only - python research/cli.py list --status completed - -.PHONY: export-results -export-results: ## Export all experiment results to CSV - python research/cli.py export --output results_$(shell date +%Y%m%d_%H%M%S).csv - -.PHONY: best-model -best-model: ## Show best performing model - python research/cli.py list --status completed | head -5 - -# ============================================================================= -# WEB INTERFACE -# ============================================================================= - -.PHONY: web -web: ## Launch Streamlit web interface - streamlit run web/app.py --server.runOnSave true --server.port 8501 - # ============================================================================= # DEVELOPMENT & CODE QUALITY # ============================================================================= @@ -113,10 +41,6 @@ type-check: ## Type check with mypy notebook: ## Start Jupyter notebook jupyter notebook notebooks/ -.PHONY: lab -lab: ## Start Jupyter lab - jupyter 
lab notebooks/ - # ============================================================================= # DEPLOYMENT & PRODUCTION # ============================================================================= diff --git a/README.md b/README.md index d0e826b..60bc12b 100644 --- a/README.md +++ b/README.md @@ -1,69 +1,20 @@ -# DRC Names Gender Prediction Pipeline: A Culturally-Aware NLP System for Congolese Name Analysis +# A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference -A comprehensive, research-friendly pipeline for analyzing Congolese names and predicting gender using culturally-aware machine learning models. -This system provides advanced data processing, experiment management, and an intuitive web interface for non-technical users. +Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often +underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training +data. +This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5 +million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata. -## Overview +## Getting Started -Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training data. -This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 7 million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata. +### Installation & Setup -Our approach involves: +Instructions and command line snippets below are provided to help you set up the project environment quickly and +efficiently. 
+They assume you have Python 3.11 and Git installed and are working on a Unix-like system (Linux, macOS, etc.). -- **(1) Advanced data processing pipeline** with batching, checkpointing, and parallel processing -- **(2) Modular experiment framework** for systematic model comparison and research iteration -- **(3) Multiple feature extraction strategies** leveraging name components, linguistic patterns, and demographic data -- **(4) Culturally-aware gender prediction models** trained specifically on Congolese naming patterns -- **(5) User-friendly web interface** enabling non-technical users to run experiments and make predictions -- **(6) Comprehensive research tools** for reproducible experimentation and result analysis - -## Key Features - -### **Advanced Data Processing** -- **Batched processing** with configurable batch sizes and parallel execution -- **Automatic checkpointing** and resume capability for large datasets -- **LLM-powered annotation** with rate limiting and retry logic -- **Memory-efficient** chunked data loading for datasets of any size - -### **Research-Friendly Experiment Framework** -- **Modular model architecture** - easily add new models and features -- **Systematic experiment tracking** with automatic result storage -- **Feature ablation studies** and component analysis tools -- **Cross-validation** and statistical significance testing -- **Automated baseline comparisons** and performance analysis - -### **Intuitive Web Interface** -- **No-code experiment creation** with visual parameter selection -- **Real-time monitoring** of data processing and training progress -- **Interactive result visualization** with charts and comparisons -- **Batch prediction capabilities** for CSV file upload and processing -- **Model comparison tools** with automatic performance rankings - -### **Comprehensive Analytics** -- **Feature importance analysis** showing which name components matter most -- **Province-specific studies** examining regional naming patterns -- 
**Learning curve analysis** for understanding data requirements -- **Prediction confidence scoring** and error analysis tools - -## Quick Start - -### Using Make Commands (Recommended) - -```bash -# Complete setup and basic processing -make quick-start - -# Launch web interface -make web - -# Run research workflow -make research-flow - -# Show all available commands -make help -``` - -### Manual Installation +**Using Makefile (Recommended)** ```bash git clone https://github.com/bernard-ng/drc-ners-nlp.git @@ -71,246 +22,88 @@ cd drc-ners-nlp # Setup environment make setup -make process - -# Launch web interface -make web +source .venv/bin/activate ``` -## Usage - -### Web Interface (Recommended for Non-Technical Users) - -Launch the Streamlit web application: -```bash -make web -``` - -The interface provides: -- **Dashboard**: Overview of datasets and recent experiments -- **Data Overview**: Interactive data exploration and statistics -- **Data Processing**: Monitor and control the processing pipeline -- **Experiments**: Create and manage machine learning experiments -- **Results & Analysis**: Compare models and analyze performance -- **Predictions**: Make predictions on new names or upload CSV files -- **Settings**: Configure the system and manage data - -### Research & Experiments - -#### Quick Research Studies -```bash -# Compare different approaches (full name vs native vs surname) -make baseline - -# Analyze which name components are most effective -make components - -# Test feature importance through ablation study -make ablation - -# View all experiment results -make list-experiments - -# Export results for publication -make export-results -``` - -#### Custom Experiments -```bash -# Run specific experiment via command line -python research/cli.py run \ - --name "native_name_study" \ - --features native_name \ - --model-type logistic_regression \ - --description "Test native name effectiveness" - -# Compare multiple experiments -python research/cli.py compare - -# View 
detailed results -python research/cli.py show -``` - -### Data Processing Pipeline - -#### Basic Processing (No LLM) -```bash -make process-basic # Fast processing without LLM annotation -``` - -#### Complete Processing (With LLM) -```bash -make process # Full pipeline including LLM annotation -make process-dev # Development mode with smaller batches -``` - -#### Monitor Progress -```bash -make monitoring # Show current pipeline status -make status # Show overall system status -``` - -#### Resume Interrupted Processing -```bash -make process-resume # Resume from last checkpoint -``` - -### Available Models and Features - -#### Models -- **Logistic Regression**: Character n-gram based classification -- **Random Forest**: Engineered feature-based classification -- **LSTM**: Sequential neural network (planned) -- **Transformer**: Attention-based model (planned) - -#### Features -- **Full Name**: Complete name as given -- **Native Name**: Identified native/given name component -- **Surname**: Family name component -- **Name Length**: Character count features -- **Word Count**: Number of words in name -- **Province**: Geographic/demographic features -- **Name Beginnings/Endings**: Prefix/suffix patterns -- **Character N-grams**: Linguistic pattern features - -## Configuration - -### Environment Configurations +**Manual Setup** ```bash -# Switch to development configuration (smaller batches, more logging) -make config-dev +git clone https://github.com/bernard-ng/drc-ners-nlp.git +cd drc-ners-nlp -# Switch to production configuration (optimized for performance) -make config-prod +# Setup environment +python -m venv .venv +.venv/bin/pip install --upgrade pip +.venv/bin/pip install -r requirements.txt -# View current configuration -make show-config +pip install --upgrade pip +pip install -r requirements.txt +pip install jupyter notebook ipykernel pytest black flake8 mypy + +source .venv/bin/activate ``` -### Custom Configuration +## Data Processing -Edit configuration files 
in `config/`: -- `pipeline.yaml` - Main configuration -- `pipeline.development.yaml` - Development overrides -- `pipeline.production.yaml` - Production settings +This project includes a robust data processing pipeline designed to handle large datasets efficiently with batching, +checkpointing, and parallel processing capabilities. +Steps are defined in the `drc-ners-nlp/processing/steps` directory, and the configuration to enable them is managed through +the `drc-ners-nlp/config/pipeline.yaml` file. + +**Pipeline Configuration** -Example configuration: ```yaml -processing: - batch_size: 1000 - max_workers: 4 - -llm: - model_name: "mistral:7b" - requests_per_minute: 60 - -data: - split_evaluation: true - split_by_gender: true +stages: + - "data_cleaning" + - "feature_extraction" + - "llm_annotation" + - "data_splitting" ``` -## Research Capabilities - -### Systematic Experimentation - -The framework supports systematic research through: - -1. **Baseline Studies**: Compare fundamental approaches -2. **Feature Studies**: Test individual name components -3. **Ablation Studies**: Identify most important features -4. **Cross-Province Analysis**: Test generalization across regions -5. 
**Hyperparameter Optimization**: Systematic parameter tuning - -### Reproducible Research - -- **Experiment Tracking**: All experiments automatically logged with full configuration -- **Result Export**: CSV export for publication and further analysis -- **Statistical Testing**: Cross-validation and confidence intervals -- **Version Control**: Configuration-based approach enables easy replication - -### Publication-Ready Output +**Running the Pipeline** ```bash -# Generate comprehensive results for publication -make research-flow -make export-results - -# Get best models for each approach -make list-completed -python research/cli.py list --status completed | head -10 +python main.py --env development ``` -## Development +## Experiments + +This project provides a modular experiment (model training and evaluation) framework for systematic model comparison and +research iteration. Models are defined in the `drc-ners-nlp/research/models` directory. +You can define model features, training parameters, and evaluation metrics in the `research_templates.yaml` file. 
+ +**Running Experiments** -### Code Quality and Testing ```bash -make format # Format code with black -make lint # Lint with flake8 -make check-deps # Verify dependencies +python train.py --name="bigru" --type="baseline" --env="development" +python train.py --name="cnn" --type="baseline" --env="development" +python train.py --name="lightgbm" --type="baseline" --env="development" + +python train.py --name="logistic_regression_fullname" --type="baseline" --env="development" +python train.py --name="logistic_regression_native" --type="baseline" --env="development" +python train.py --name="logistic_regression_surname" --type="baseline" --env="development" + +python train.py --name="lstm" --type="baseline" --env="development" +python train.py --name="random_forest" --type="baseline" --env="development" +python train.py --name="svm" --type="baseline" --env="development" +python train.py --name="naive_bayes" --type="baseline" --env="development" +python train.py --name="transformer" --type="baseline" --env="development" +python train.py --name="xgboost" --type="baseline" --env="development" ``` -### Development Workflow +## Web Interface + +This project includes a user-friendly web interface built with Streamlit, allowing non-technical users to run +experiments and make predictions without needing to understand the underlying code. 
+ +### Running the Web Interface + ```bash -make daily-work # Daily development setup -make notebook # Launch Jupyter for analysis -make web-dev # Launch web interface with auto-reload +streamlit run app.py ``` -### Data Management -```bash -make check-data # Verify all data files -make data-stats # Show dataset statistics -make backup-data # Create timestamped backup -make clean-checkpoints # Clean processing checkpoints -``` +## Contributors -## Project Structure - -``` -├── Makefile # All command shortcuts -├── streamlit_app.py # Web interface application -├── config/ # Configuration files -│ ├── pipeline.yaml # Main configuration -│ ├── pipeline.development.yaml # Dev settings -│ └── pipeline.production.yaml # Prod settings -├── core/ # Core framework -│ ├── config.py # Configuration management -│ ├── domain.py # Domain-specific data -│ └── utils.py # Reusable utilities -├── processing/ # Data processing pipeline -│ ├── main.py # Main pipeline script -│ ├── pipeline.py # Pipeline framework -│ ├── steps_config.py # Configurable processing steps -│ └── monitor.py # Monitoring utilities -├── research/ # Research and experiments -│ ├── cli.py # Command-line interface -│ ├── experiment.py # Experiment management -│ ├── models.py # Model implementations -│ └── runner.py # Experiment execution -└── dataset/ # Data files - └── names.csv # Raw dataset -``` - -## Citation - -If you use this pipeline in your research, please cite: - -```bibtex -@software{drc_names_pipeline, - title={DRC Names Gender Prediction Pipeline: A Culturally-Aware NLP System}, - author={Your Name}, - year={2025}, - url={https://github.com/bernard-ng/drc-ners-nlp} -} -``` - -## License - -This project is licensed under the MIT License - see the LICENSE file for details. 
- -## Acknowledgments - -- Democratic Republic of Congo population data contributors -- Open source NLP and machine learning communities -- Cultural linguistics research communities + + contributors + diff --git a/cli.py b/cli.py deleted file mode 100755 index 8bb30c4..0000000 --- a/cli.py +++ /dev/null @@ -1,205 +0,0 @@ -#!.venv/bin/python3 -import argparse -import logging -import sys -from pathlib import Path - -import pandas as pd - -from core.config import setup_config -from research.experiment.experiment_runner import ExperimentRunner -from research.experiment.experiment_tracker import ExperimentTracker - - -def list_experiments(args): - """List experiments with optional filtering""" - - tracker = ExperimentTracker() - - # Apply filters - filters = {} - if args.status: - from research.experiment import ExperimentStatus - - filters["status"] = ExperimentStatus(args.status) - if args.model_type: - filters["model_type"] = args.model_type - if args.tags: - filters["tags"] = args.tags - - experiments = tracker.list_experiments(**filters) - - if not experiments: - logging.info("No experiments found matching criteria") - return - - # Create summary table - rows = [] - for exp in experiments: - row = { - "ID": exp.experiment_id[:12] + "...", - "Name": exp.config.name, - "Model": exp.config.model_type, - "Status": exp.status.value, - "Test Acc": f"{exp.test_metrics.get('accuracy', 0):.4f}" if exp.test_metrics else "N/A", - "Start Time": exp.start_time.strftime("%Y-%m-%d %H:%M"), - } - rows.append(row) - - df = pd.DataFrame(rows) - logging.info(df.to_string(index=False)) - - -def show_experiment_details(args): - """Show detailed results for an experiment""" - - tracker = ExperimentTracker() - experiment = tracker.get_experiment(args.experiment_id) - - if not experiment: - logging.error(f"Experiment not found: {args.experiment_id}") - return - - logging.info("=== Experiment Details ===") - logging.info(f"ID: {experiment.experiment_id}") - logging.info(f"Name: 
{experiment.config.name}") - logging.info(f"Description: {experiment.config.description}") - logging.info(f"Model Type: {experiment.config.model_type}") - logging.info(f"Features: {', '.join([f.value for f in experiment.config.features])}") - logging.info(f"Status: {experiment.status.value}") - logging.info(f"Start Time: {experiment.start_time}") - logging.info(f"End Time: {experiment.end_time}") - - if experiment.test_metrics: - logging.info("=== Test Metrics ===") - for metric, value in experiment.test_metrics.items(): - logging.info(f"{metric}: {value:.4f}") - - if experiment.cv_metrics: - logging.info("=== Cross-Validation Metrics ===") - for metric, value in experiment.cv_metrics.items(): - if not metric.endswith("_std"): - std_key = f"{metric}_std" - std_val = experiment.cv_metrics.get(std_key, 0) - logging.info(f"{metric}: {value:.4f} ± {std_val:.4f}") - - if experiment.feature_importance: - logging.info("=== Top 10 Feature Importances ===") - sorted_features = sorted( - experiment.feature_importance.items(), key=lambda x: x[1], reverse=True - ) - for feature, importance in sorted_features[:10]: - logging.info(f"{feature}: {importance:.4f}") - - if experiment.prediction_examples: - logging.info("=== Prediction Examples ===") - for i, example in enumerate(experiment.prediction_examples[:5]): - correct = "✓" if example["correct"] else "✗" - logging.info( - f"{i + 1}. 
{example['name']} -> True: {example['true_label']}, " - f"Pred: {example['predicted_label']} {correct}" - ) - - -def compare_experiments_cmd(args): - """Compare multiple experiments""" - - config = setup_config(env="development") - runner = ExperimentRunner(config) - comparison = runner.compare_experiments(args.experiment_ids) - - if comparison.empty: - logging.info("No experiments found for comparison") - return - - logging.info("=== Experiment Comparison ===") - - # Show key columns - key_columns = ["name", "model_type", "features", "test_accuracy", "test_f1"] - available_columns = [col for col in key_columns if col in comparison.columns] - - logging.info(comparison[available_columns].to_string(index=False)) - - -def export_results(args): - """Export experiment results""" - - tracker = ExperimentTracker() - output_path = tracker.export_results(Path(args.output) if args.output else None) - - logging.info(f"Results exported to: {output_path}") - - -def main(): - """Main CLI entry point with unified configuration loading""" - parser = argparse.ArgumentParser( - description="DRC Names Research Experiment Manager", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - - # Global arguments - parser.add_argument("--config", type=Path, help="Path to configuration file") - parser.add_argument( - "--env", type=str, default="development", - help="Environment name (default: development)" - ) - parser.add_argument("--verbose", "-v", action="store_true", help="Verbose logging") - - subparsers = parser.add_subparsers(dest="command", help="Available commands") - - # List experiments - list_parser = subparsers.add_parser("list", help="List experiments") - list_parser.add_argument("--status", choices=["pending", "running", "completed", "failed"]) - list_parser.add_argument("--tags", nargs="+", help="Filter by tags") - - # Show experiment details - detail_parser = subparsers.add_parser("show", help="Show experiment details") - detail_parser.add_argument("experiment_id", 
help="Experiment ID") - - # Compare experiments - compare_parser = subparsers.add_parser("compare", help="Compare experiments") - compare_parser.add_argument("experiment_ids", nargs="+", help="Experiment IDs to compare") - - # Export results - export_parser = subparsers.add_parser("export", help="Export results to CSV") - export_parser.add_argument("--output", help="Output file path") - - args = parser.parse_args() - - if not args.command: - parser.print_help() - return 1 - - try: - # Load configuration and setup logging - config = setup_config(config_path=args.config, env=args.env) - - # Override log level if verbose requested - if args.verbose: - logging.getLogger().setLevel(logging.DEBUG) - - # Execute command - command_map = { - "list": list_experiments, - "show": show_experiment_details, - "compare": compare_experiments_cmd, - "export": export_results, - } - handler = command_map.get(args.command) - if handler: - handler(args) - - return 0 - - except Exception as e: - logging.error(f"Command failed: {e}") - if args.verbose: - import traceback - - traceback.print_exc() - return 1 - - -if __name__ == "__main__": - exit_code = main() - sys.exit(exit_code) diff --git a/config/pipeline.development.yaml b/config/pipeline.development.yaml index e2d1437..bccc0ae 100644 --- a/config/pipeline.development.yaml +++ b/config/pipeline.development.yaml @@ -1,17 +1,12 @@ -# Production Environment Configuration -# Optimized settings for production deployment - -name: "drc_names_pipeline" -version: "1.0.0" environment: "development" debug: true # Processing settings processing: - batch_size: 100_000 + batch_size: 10_000 max_workers: 8 checkpoint_interval: 10 - use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks + use_multiprocessing: true # Pipeline stages stages: @@ -20,7 +15,6 @@ stages: #- "llm_annotation" - "data_splitting" - # Production LLM settings llm: model_name: "mistral:7b" @@ -31,14 +25,10 @@ llm: max_concurrent_requests: 4 enable_rate_limiting: 
true -# Development data settings - limited dataset for faster testing +# Data handling configuration data: - split_evaluation: true - split_by_gender: true - evaluation_fraction: 0.2 - random_seed: 42 - max_dataset_size: ~ # Limit to 10k records for development/testing - balance_by_sex: false # Balance male/female samples when limiting + max_dataset_size: 100_000 + balance_by_sex: true # Enhanced logging for development logging: diff --git a/config/pipeline.production.yaml b/config/pipeline.production.yaml index 447b0d7..ace19e0 100644 --- a/config/pipeline.production.yaml +++ b/config/pipeline.production.yaml @@ -1,17 +1,12 @@ -# Production Environment Configuration -# Optimized settings for production deployment - -name: "drc_names_pipeline" -version: "1.0.0" environment: "production" debug: false -# Production processing settings (optimized for performance) +# Processing settings processing: batch_size: 10_000 max_workers: 8 checkpoint_interval: 10 - use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks + use_multiprocessing: true # Pipeline stages stages: @@ -20,7 +15,6 @@ stages: - "llm_annotation" - "data_splitting" - # Production LLM settings llm: model_name: "mistral:7b" @@ -31,19 +25,15 @@ llm: max_concurrent_requests: 4 enable_rate_limiting: true -# Production data settings +# Data handling configuration data: - split_evaluation: true - split_by_gender: true - evaluation_fraction: 0.2 - random_seed: 42 max_dataset_size: null balance_by_sex: false # Production logging (less verbose) logging: level: "INFO" - console_logging: false # Disable console in production + console_logging: false file_logging: true log_file: "pipeline.production.log" max_log_size: 52428800 # 50MB diff --git a/config/pipeline.yaml b/config/pipeline.yaml index 93565d8..c316842 100644 --- a/config/pipeline.yaml +++ b/config/pipeline.yaml @@ -1,72 +1,72 @@ # DRC Names Processing Pipeline Configuration # Main configuration file with default settings -name: 
"drc_names_pipeline" -version: "1.0.0" -description: "DRC Names NLP Processing Pipeline" -environment: "development" -debug: false +name: "drc_ners_pipeline" # Name of the pipeline +version: "1.0.0" # Version of the pipeline +description: "DRC NERS NLP Processing" # Description of the pipeline +environment: "development" # Environment type (development, production, etc.) +debug: false # Enable debug mode for detailed logging and error reporting # Project directory structure paths: - root_dir: "." - configs_dir: "./config" - data_dir: "./data/dataset" - models_dir: "./data/models" - outputs_dir: "./data/outputs" - logs_dir: "./data/logs" - checkpoints_dir: "./data/checkpoints" + root_dir: "." # Root directory of the project + configs_dir: "./config" # Directory for configuration files + data_dir: "./data/dataset" # Directory for dataset files + models_dir: "./data/models" # Directory for model files + outputs_dir: "./data/outputs" # Directory for output files + logs_dir: "./data/logs" # Directory for log files + checkpoints_dir: "./data/checkpoints" # Directory for model checkpoints # Pipeline stages -stages: - - "data_cleaning" - - "feature_extraction" - - "llm_annotation" - - "data_splitting" +stages: # List of stages in the processing pipeline + - "data_cleaning" # Data cleaning stage + - "feature_extraction" # Feature extraction stage + - "llm_annotation" # LLM annotation stage (computational intensive) + - "data_splitting" # Data splitting stage # Data processing configuration processing: - batch_size: 1_000 - max_workers: 4 - checkpoint_interval: 5 - use_multiprocessing: false - encoding_options: + batch_size: 1_000 # Size of data batches to process at once + max_workers: 4 # Number of worker threads for parallel processing + checkpoint_interval: 5 # Interval for saving checkpoints during processing + use_multiprocessing: false # Enable multiprocessing for CPU-bound tasks + encoding_options: # List of encodings to try when reading files - "utf-8" - "utf-16" - 
"latin1" - chunk_size: 100_000 + chunk_size: 100_000 # Size of data chunks to process in parallel # LLM annotation settings llm: - model_name: "mistral:7b" - requests_per_minute: 60 - requests_per_second: 2 - retry_attempts: 3 - timeout_seconds: 600 - max_concurrent_requests: 2 - enable_rate_limiting: true + model_name: "mistral:7b" # Name of the LLM model to use + requests_per_minute: 60 # Requests per minute to the LLM service + requests_per_second: 2 # Requests per second to the LLM service + retry_attempts: 3 # Number of retry attempts for LLM requests + timeout_seconds: 600 # Timeout for LLM requests + max_concurrent_requests: 2 # Maximum concurrent requests to the LLM service + enable_rate_limiting: true # Enable rate limiting to avoid overloading the LLM service # Data handling configuration data: - input_file: "names.csv" + input_file: "names.csv" # Input file containing names data output_files: - featured: "names_featured.csv" - evaluation: "names_evaluation.csv" - males: "names_males.csv" - females: "names_females.csv" - split_evaluation: true - split_by_gender: true - evaluation_fraction: 0.2 - random_seed: 42 - max_dataset_size: null - balance_by_sex: false + featured: "names_featured.csv" # Output file for featured data + evaluation: "names_evaluation.csv" # Output file for evaluation set + males: "names_males.csv" # Output files for male names + females: "names_females.csv" # Output files for female names + split_evaluation: true # Should the dataset be split into training and evaluation sets ? + split_by_gender: true # Should the dataset be split by gender ? + evaluation_fraction: 0.2 # Fraction of data to use for evaluation + random_seed: 42 # Random seed for reproducibility + max_dataset_size: null # Maximum size of the dataset to process, set to null for no + balance_by_sex: false # Should the dataset be balanced by sex when limiting the dataset size? 
# Logging configuration logging: - level: "INFO" + level: "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - file_logging: true - console_logging: true - log_file: "pipeline.log" - max_log_size: 10485760 # 10MB - backup_count: 5 + file_logging: true # Enable logging to file + console_logging: true # Enable logging to console + log_file: "pipeline.log" # Log file name + max_log_size: 10485760 # Maximum size of log file before rotation (10MB) + backup_count: 5 # Number of backup log files to keep diff --git a/config/research_templates.yaml b/config/research_templates.yaml index 544aab8..a968aec 100644 --- a/config/research_templates.yaml +++ b/config/research_templates.yaml @@ -1,128 +1,148 @@ -# Research Experiment Configuration Templates -# These configurations can be used as starting points for different types of experiments - -# Baseline Experiments Configuration baseline_experiments: - - name: "baseline_logistic_regression_fullname" + - name: "bigru" + description: "Baseline BiGRU with full name features" + model_type: "bigru" + features: [ "full_name" ] + model_params: + max_len: 20 + embedding_dim: 64 + gru_units: 32 + epochs: 10 + batch_size: 32 + tags: [ "baseline", "neural", "bigru" ] + + - name: "cnn" + description: "Baseline CNN with character patterns" + model_type: "cnn" + features: [ "full_name" ] + model_params: + max_len: 20 + embedding_dim: 64 + filters: 64 + kernel_size: 3 + dropout: 0.5 + epochs: 10 + batch_size: 32 + tags: [ "baseline", "neural", "cnn" ] + + - name: "ensemble" + description: "Baseline Ensemble with multiple models" + model_type: "ensemble" + features: [ "full_name", "name_length", "word_count" ] + model_params: + base_models: [ "logistic_regression", "random_forest", "xgboost" ] + voting: "soft" + cv_folds: 5 + tags: [ "baseline", "ensemble" ] + + - name: "lightgbm" + description: "Baseline LightGBM with engineered features" + model_type: "lightgbm" + 
features: [ "full_name", "name_length", "word_count" ] + model_params: + n_estimators: 100 + max_depth: -1 + learning_rate: 0.1 + num_leaves: 31 + subsample: 0.8 + colsample_bytree: 0.8 + tags: [ "baseline", "lightgbm" ] + + - name: "logistic_regression_fullname" description: "Baseline logistic regression with full name" model_type: "logistic_regression" - features: ["full_name"] + features: [ "full_name" ] model_params: - ngram_range: [2, 5] max_features: 10000 - max_iter: 1000 - tags: ["baseline", "fullname"] + tags: [ "baseline", "logistic_regression", "fullname" ] - - name: "baseline_logistic_regression_native" + - name: "logistic_regression_native" description: "Logistic regression with native name only" model_type: "logistic_regression" - features: ["native_name"] + features: [ "native_name" ] model_params: - ngram_range: [2, 4] max_features: 5000 - tags: ["baseline", "native"] + tags: [ "baseline", "logistic_regression", "native" ] - - name: "baseline_rf_engineered" - description: "Random Forest with engineered features" + - name: "logistic_regression_surname" + description: "Logistic regression with surname name only" + model_type: "logistic_regression" + features: [ "surname" ] + model_params: + max_features: 5000 + tags: [ "baseline", "logistic_regression", "surname" ] + + - name: "lstm" + description: "Baseline LSTM with full name features" + model_type: "lstm" + features: [ "full_name" ] + model_params: + embedding_dim: 128 + lstm_units: 64 + epochs: 10 + batch_size: 64 + tags: [ "baseline", "neural", "lstm" ] + + - name: "naive_bayes" + description: "Baseline Naive Bayes with full name features" + model_type: "naive_bayes" + features: [ "full_name" ] + model_params: + max_features: 5000 + tags: [ "baseline", "naive_bayes" ] + + - name: "random_forest" + description: "Baseline Random Forest with engineered features" model_type: "random_forest" - features: ["name_length", "word_count", "province"] + features: [ "name_length", "word_count", "province" ] 
model_params: n_estimators: 100 max_depth: 10 - tags: ["baseline", "engineered"] + min_samples_split: 2 + min_samples_leaf: 1 + tags: [ "baseline", "random_forest", "engineered" ] + + - name: "svm" + description: "Baseline SVM with full name features" + model_type: "svm" + features: [ "full_name" ] + model_params: + C: 1.0 + kernel: "rbf" + ngram_range: [ 2, 4 ] + max_features: 5000 + tags: [ "baseline", "svm" ] + + - name: "transformer" + description: "Baseline Transformer with attention mechanism" + model_type: "transformer" + features: [ "full_name" ] + model_params: + embedding_dim: 128 + num_heads: 4 + num_layers: 2 + epochs: 10 + batch_size: 64 + tags: [ "baseline", "neural", "transformer" ] + + - name: "xgboost" + description: "Baseline XGBoost with engineered features" + model_type: "xgboost" + features: [ "full_name", "name_length", "word_count" ] + model_params: + n_estimators: 100 + max_depth: 6 + learning_rate: 0.1 + subsample: 0.8 + colsample_bytree: 0.8 + tags: [ "baseline", "xgboost" ] + + +# Advanced Experiments Configuration +advanced_experiments: # Feature Study Configurations feature_studies: - - name: "native_vs_surname" - description: "Compare native name vs surname effectiveness" - experiments: - - model_type: "logistic_regression" - features: ["native_name"] - tags: ["feature_study", "native"] - - model_type: "logistic_regression" - features: ["surname"] - tags: ["feature_study", "surname"] - - name: "name_parts_analysis" - description: "Analyze effectiveness of different name parts" - experiments: - - features: ["first_word"] - tags: ["name_parts", "first"] - - features: ["last_word"] - tags: ["name_parts", "last"] - - features: ["name_beginnings"] - feature_params: - beginning_length: 3 - tags: ["name_parts", "beginnings"] - - features: ["name_endings"] - feature_params: - ending_length: 3 - tags: ["name_parts", "endings"] - -# Province-Specific Studies -province_studies: - - name: "kinshasa_study" - description: "Gender prediction for 
Kinshasa province" - model_type: "logistic_regression" - features: ["full_name"] - train_data_filter: - province: "kinshasa" - tags: ["province_study", "kinshasa"] - - - name: "cross_province_generalization" - description: "Train on one province, test on another" - experiments: - - train_filter: {"province": "kinshasa"} - test_filter: {"province": "bas-congo"} - tags: ["generalization", "kinshasa_to_bas-congo"] - -# Model Comparison Studies -model_comparisons: - - name: "model_comparison_fullname" - description: "Compare different models with full name" - base_config: - features: ["full_name"] - tags: ["model_comparison"] - models: - - model_type: "logistic_regression" - model_params: - ngram_range: [2, 5] - - model_type: "random_forest" - # Note: RF will need different feature preparation - features: ["name_length", "word_count", "province"] - -# Advanced Feature Combinations -advanced_features: - - name: "multi_feature_combination" - description: "Test various feature combinations" - experiments: - - features: ["full_name", "name_length"] - tags: ["combination", "name_plus_length"] - - features: ["native_name", "surname", "province"] - tags: ["combination", "semantic_features"] - - features: ["name_beginnings", "name_endings", "word_count"] - tags: ["combination", "structural_features"] - -# Hyperparameter Studies -hyperparameter_studies: - - name: "ngram_range_study" - description: "Study effect of different n-gram ranges" - base_config: - model_type: "logistic_regression" - features: ["full_name"] - tags: ["hyperparameter", "ngram"] - variants: - - model_params: {"ngram_range": [1, 3]} - - model_params: {"ngram_range": [2, 4]} - - model_params: {"ngram_range": [2, 5]} - - model_params: {"ngram_range": [3, 6]} - -# Data Size Studies -data_studies: - - name: "learning_curve_study" - description: "Study performance vs training data size" - base_config: - model_type: "logistic_regression" - features: ["full_name"] - tags: ["learning_curve"] - data_sizes: [0.1, 
0.25, 0.5, 0.75, 1.0] # Fractions of training data to use +# Hyperparameter Tuning Configurations +hyperparameter_tuning: diff --git a/research/model_trainer.py b/research/model_trainer.py index cf50e22..5e10089 100644 --- a/research/model_trainer.py +++ b/research/model_trainer.py @@ -33,6 +33,7 @@ class ModelTrainer: model_type: str = "logistic_regression", features: List[str] = None, model_params: Dict[str, Any] = None, + tags: List[str] = None, save_artifacts: bool = True, ) -> str: """ @@ -45,6 +46,10 @@ class ModelTrainer: features = ["full_name"] feature_types = [FeatureType(f) for f in features] + # Prepare tags - combine default tags with template tags + default_tags = ["training", model_type] + experiment_tags = default_tags + (tags or []) + # Create experiment configuration config = ExperimentConfig( name=model_name, @@ -52,7 +57,7 @@ class ModelTrainer: model_type=model_type, features=feature_types, model_params=model_params or {}, - tags=["training", model_type], + tags=experiment_tags, ) # Run experiment diff --git a/train.py b/train.py index 849b8b1..f796b35 100755 --- a/train.py +++ b/train.py @@ -3,29 +3,98 @@ import argparse import logging import sys import traceback +import yaml +from pathlib import Path from core.config import setup_config from research.model_trainer import ModelTrainer +def load_research_templates(templates_path: str = "config/research_templates.yaml") -> dict: + """Load research templates from YAML file""" + try: + with open(templates_path, 'r') as file: + return yaml.safe_load(file) + except FileNotFoundError: + logging.error(f"Templates file not found: {templates_path}") + raise + except yaml.YAMLError as e: + logging.error(f"Error parsing templates file: {e}") + raise + + +def find_experiment_config(templates: dict, name: str, experiment_type: str) -> dict: + """Find experiment configuration by name and type""" + # Map type to section in templates + type_mapping = { + "baseline": "baseline_experiments", + "advanced": 
"advanced_experiments", + "feature_study": "feature_studies", + "tuning": "hyperparameter_tuning" + } + + section_name = type_mapping.get(experiment_type) + if not section_name: + available_types = list(type_mapping.keys()) + raise ValueError(f"Unknown experiment type '{experiment_type}'. Available types: {available_types}") + + if section_name not in templates: + raise ValueError(f"Section '{section_name}' not found in templates") + + experiments = templates[section_name] + + # Search for experiment by model name + for experiment in experiments: + # Check if this is the experiment we're looking for + # Look for experiments that match the model type or contain the name + if (experiment.get("model_type") == name or + name.lower() in experiment.get("name", "").lower() or + f"baseline_{name}" == experiment.get("name") or + f"advanced_{name}" == experiment.get("name")): + return experiment + + # If not found, list available experiments + available_experiments = [exp.get("name", exp.get("model_type", "unknown")) for exp in experiments] + raise ValueError(f"Experiment '{name}' not found in '{experiment_type}' section. 
" + f"Available experiments: {available_experiments}") + + def main(): - parser = argparse.ArgumentParser(description="Train DRC Names Models") - parser.add_argument("--type", type=str, help="Specific model type to train") - parser.add_argument("--name", type=str, help="Model name") + parser = argparse.ArgumentParser(description="Train DRC Names Models using Research Templates") + parser.add_argument("--name", type=str, required=True, help="Model name to train") + parser.add_argument("--type", type=str, required=True, help="Experiment type") parser.add_argument("--config", type=str, help="Path to configuration file") parser.add_argument("--env", type=str, default="development", help="Environment name") + parser.add_argument("--templates", type=str, default="config/research_templates.yaml") args = parser.parse_args() try: + # Setup pipeline configuration config = setup_config(config_path=args.config, env=args.env) - trainer = ModelTrainer(config) - # Train specific model + # Load research templates + logging.info(f"Loading research templates from: {args.templates}") + templates = load_research_templates(args.templates) + + # Find the specific experiment configuration + logging.info(f"Looking for experiment: name='{args.name}', type='{args.type}'") + experiment_config = find_experiment_config(templates, args.name, args.type) + + logging.info(f"Found experiment: {experiment_config.get('name')}") + logging.info(f"Description: {experiment_config.get('description')}") + logging.info(f"Features: {experiment_config.get('features')}") + + # Train the model using template configuration + trainer = ModelTrainer(config) trainer.train_single_model( - model_name=args.name, - model_type=args.type, - features=["full_name"] + model_name=experiment_config.get("name"), + model_type=experiment_config.get("model_type"), + features=experiment_config.get("features"), + model_params=experiment_config.get("model_params", {}), + tags=experiment_config.get("tags", []) ) + + 
logging.info("Training completed successfully!") return 0 except Exception as e: