feat: enhance training pipeline with research templates and experiment configuration
This commit is contained in:
@@ -12,6 +12,7 @@ help: ## Show this help message
|
|||||||
.PHONY: setup
|
.PHONY: setup
|
||||||
setup: ## Setup virtual environment and install dependencies
|
setup: ## Setup virtual environment and install dependencies
|
||||||
python -m venv .venv
|
python -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
.venv/bin/pip install --upgrade pip
|
.venv/bin/pip install --upgrade pip
|
||||||
.venv/bin/pip install -r requirements.txt
|
.venv/bin/pip install -r requirements.txt
|
||||||
|
|
||||||
@@ -20,79 +21,6 @@ install: ## Install/update dependencies
|
|||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
|
|
||||||
.PHONY: install-dev
|
|
||||||
install-dev: ## Install development dependencies
|
|
||||||
pip install -r requirements.txt
|
|
||||||
pip install jupyter notebook ipykernel pytest black flake8 mypy
|
|
||||||
|
|
||||||
.PHONY: activate
|
|
||||||
activate: ## Show activation command
|
|
||||||
@echo "Run: source .venv/bin/activate"
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# MODEL TRAINING & ARTIFACTS
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
.PHONY: train-baseline
|
|
||||||
train-baseline: ## Train all baseline models and save artifacts
|
|
||||||
python research/train.py --mode baseline
|
|
||||||
|
|
||||||
.PHONY: train-neural
|
|
||||||
train-neural: ## Train neural network models (LSTM, CNN, Transformer)
|
|
||||||
python research/train.py --mode neural
|
|
||||||
|
|
||||||
.PHONY: train-model
|
|
||||||
train-model: ## Train specific model (use: make train-model MODEL=logistic_regression NAME=my_model)
|
|
||||||
python research/train.py --model-type $(MODEL) --name $(NAME)
|
|
||||||
|
|
||||||
.PHONY: list-models
|
|
||||||
list-models: ## List all saved model artifacts
|
|
||||||
python research/train.py --mode list
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# RESEARCH & EXPERIMENTS
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
.PHONY: experiment
|
|
||||||
experiment: ## Create sample experiment configuration
|
|
||||||
python research/cli.py run --name "sample_experiment" --features full_name --model-type logistic_regression
|
|
||||||
|
|
||||||
.PHONY: baseline
|
|
||||||
baseline: ## Run baseline experiments
|
|
||||||
python research/cli.py baseline
|
|
||||||
|
|
||||||
.PHONY: ablation
|
|
||||||
ablation: ## Run feature ablation study
|
|
||||||
python research/cli.py ablation
|
|
||||||
|
|
||||||
.PHONY: components
|
|
||||||
components: ## Run name component analysis
|
|
||||||
python research/cli.py components
|
|
||||||
|
|
||||||
.PHONY: list-experiments
|
|
||||||
list-experiments: ## List all experiments
|
|
||||||
python research/cli.py list
|
|
||||||
|
|
||||||
.PHONY: list-completed
|
|
||||||
list-completed: ## List completed experiments only
|
|
||||||
python research/cli.py list --status completed
|
|
||||||
|
|
||||||
.PHONY: export-results
|
|
||||||
export-results: ## Export all experiment results to CSV
|
|
||||||
python research/cli.py export --output results_$(shell date +%Y%m%d_%H%M%S).csv
|
|
||||||
|
|
||||||
.PHONY: best-model
|
|
||||||
best-model: ## Show best performing model
|
|
||||||
python research/cli.py list --status completed | head -5
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# WEB INTERFACE
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
.PHONY: web
|
|
||||||
web: ## Launch Streamlit web interface
|
|
||||||
streamlit run web/app.py --server.runOnSave true --server.port 8501
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# DEVELOPMENT & CODE QUALITY
|
# DEVELOPMENT & CODE QUALITY
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -113,10 +41,6 @@ type-check: ## Type check with mypy
|
|||||||
notebook: ## Start Jupyter notebook
|
notebook: ## Start Jupyter notebook
|
||||||
jupyter notebook notebooks/
|
jupyter notebook notebooks/
|
||||||
|
|
||||||
.PHONY: lab
|
|
||||||
lab: ## Start Jupyter lab
|
|
||||||
jupyter lab notebooks/
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# DEPLOYMENT & PRODUCTION
|
# DEPLOYMENT & PRODUCTION
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -1,69 +1,20 @@
|
|||||||
# DRC Names Gender Prediction Pipeline: A Culturally-Aware NLP System for Congolese Name Analysis
|
# A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference
|
||||||
|
|
||||||
A comprehensive, research-friendly pipeline for analyzing Congolese names and predicting gender using culturally-aware machine learning models.
|
Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often
|
||||||
This system provides advanced data processing, experiment management, and an intuitive web interface for non-technical users.
|
underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training
|
||||||
|
data.
|
||||||
|
This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5
|
||||||
|
million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
|
||||||
|
|
||||||
## Overview
|
## Getting Started
|
||||||
|
|
||||||
Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training data.
|
### Installation & Setup
|
||||||
This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 7 million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
|
|
||||||
|
|
||||||
Our approach involves:
|
The instructions and command-line snippets below are provided to help you set up the project environment quickly and
|
||||||
|
efficiently.
|
||||||
|
They assume you have Python 3.11 and Git installed and are working on a Unix-like system (Linux, macOS, etc.).
|
||||||
|
|
||||||
- **(1) Advanced data processing pipeline** with batching, checkpointing, and parallel processing
|
**Using Makefile (Recommended)**
|
||||||
- **(2) Modular experiment framework** for systematic model comparison and research iteration
|
|
||||||
- **(3) Multiple feature extraction strategies** leveraging name components, linguistic patterns, and demographic data
|
|
||||||
- **(4) Culturally-aware gender prediction models** trained specifically on Congolese naming patterns
|
|
||||||
- **(5) User-friendly web interface** enabling non-technical users to run experiments and make predictions
|
|
||||||
- **(6) Comprehensive research tools** for reproducible experimentation and result analysis
|
|
||||||
|
|
||||||
## Key Features
|
|
||||||
|
|
||||||
### **Advanced Data Processing**
|
|
||||||
- **Batched processing** with configurable batch sizes and parallel execution
|
|
||||||
- **Automatic checkpointing** and resume capability for large datasets
|
|
||||||
- **LLM-powered annotation** with rate limiting and retry logic
|
|
||||||
- **Memory-efficient** chunked data loading for datasets of any size
|
|
||||||
|
|
||||||
### **Research-Friendly Experiment Framework**
|
|
||||||
- **Modular model architecture** - easily add new models and features
|
|
||||||
- **Systematic experiment tracking** with automatic result storage
|
|
||||||
- **Feature ablation studies** and component analysis tools
|
|
||||||
- **Cross-validation** and statistical significance testing
|
|
||||||
- **Automated baseline comparisons** and performance analysis
|
|
||||||
|
|
||||||
### **Intuitive Web Interface**
|
|
||||||
- **No-code experiment creation** with visual parameter selection
|
|
||||||
- **Real-time monitoring** of data processing and training progress
|
|
||||||
- **Interactive result visualization** with charts and comparisons
|
|
||||||
- **Batch prediction capabilities** for CSV file upload and processing
|
|
||||||
- **Model comparison tools** with automatic performance rankings
|
|
||||||
|
|
||||||
### **Comprehensive Analytics**
|
|
||||||
- **Feature importance analysis** showing which name components matter most
|
|
||||||
- **Province-specific studies** examining regional naming patterns
|
|
||||||
- **Learning curve analysis** for understanding data requirements
|
|
||||||
- **Prediction confidence scoring** and error analysis tools
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
### Using Make Commands (Recommended)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Complete setup and basic processing
|
|
||||||
make quick-start
|
|
||||||
|
|
||||||
# Launch web interface
|
|
||||||
make web
|
|
||||||
|
|
||||||
# Run research workflow
|
|
||||||
make research-flow
|
|
||||||
|
|
||||||
# Show all available commands
|
|
||||||
make help
|
|
||||||
```
|
|
||||||
|
|
||||||
### Manual Installation
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
||||||
@@ -71,246 +22,88 @@ cd drc-ners-nlp
|
|||||||
|
|
||||||
# Setup environment
|
# Setup environment
|
||||||
make setup
|
make setup
|
||||||
make process
|
make activate
|
||||||
|
|
||||||
# Launch web interface
|
|
||||||
make web
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Usage
|
**Manual Setup**
|
||||||
|
|
||||||
### Web Interface (Recommended for Non-Technical Users)
|
|
||||||
|
|
||||||
Launch the Streamlit web application:
|
|
||||||
```bash
|
|
||||||
make web
|
|
||||||
```
|
|
||||||
|
|
||||||
The interface provides:
|
|
||||||
- **Dashboard**: Overview of datasets and recent experiments
|
|
||||||
- **Data Overview**: Interactive data exploration and statistics
|
|
||||||
- **Data Processing**: Monitor and control the processing pipeline
|
|
||||||
- **Experiments**: Create and manage machine learning experiments
|
|
||||||
- **Results & Analysis**: Compare models and analyze performance
|
|
||||||
- **Predictions**: Make predictions on new names or upload CSV files
|
|
||||||
- **Settings**: Configure the system and manage data
|
|
||||||
|
|
||||||
### Research & Experiments
|
|
||||||
|
|
||||||
#### Quick Research Studies
|
|
||||||
```bash
|
|
||||||
# Compare different approaches (full name vs native vs surname)
|
|
||||||
make baseline
|
|
||||||
|
|
||||||
# Analyze which name components are most effective
|
|
||||||
make components
|
|
||||||
|
|
||||||
# Test feature importance through ablation study
|
|
||||||
make ablation
|
|
||||||
|
|
||||||
# View all experiment results
|
|
||||||
make list-experiments
|
|
||||||
|
|
||||||
# Export results for publication
|
|
||||||
make export-results
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Custom Experiments
|
|
||||||
```bash
|
|
||||||
# Run specific experiment via command line
|
|
||||||
python research/cli.py run \
|
|
||||||
--name "native_name_study" \
|
|
||||||
--features native_name \
|
|
||||||
--model-type logistic_regression \
|
|
||||||
--description "Test native name effectiveness"
|
|
||||||
|
|
||||||
# Compare multiple experiments
|
|
||||||
python research/cli.py compare <exp_id_1> <exp_id_2>
|
|
||||||
|
|
||||||
# View detailed results
|
|
||||||
python research/cli.py show <experiment_id>
|
|
||||||
```
|
|
||||||
|
|
||||||
### Data Processing Pipeline
|
|
||||||
|
|
||||||
#### Basic Processing (No LLM)
|
|
||||||
```bash
|
|
||||||
make process-basic # Fast processing without LLM annotation
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Complete Processing (With LLM)
|
|
||||||
```bash
|
|
||||||
make process # Full pipeline including LLM annotation
|
|
||||||
make process-dev # Development mode with smaller batches
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Monitor Progress
|
|
||||||
```bash
|
|
||||||
make monitoring # Show current pipeline status
|
|
||||||
make status # Show overall system status
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Resume Interrupted Processing
|
|
||||||
```bash
|
|
||||||
make process-resume # Resume from last checkpoint
|
|
||||||
```
|
|
||||||
|
|
||||||
### Available Models and Features
|
|
||||||
|
|
||||||
#### Models
|
|
||||||
- **Logistic Regression**: Character n-gram based classification
|
|
||||||
- **Random Forest**: Engineered feature-based classification
|
|
||||||
- **LSTM**: Sequential neural network (planned)
|
|
||||||
- **Transformer**: Attention-based model (planned)
|
|
||||||
|
|
||||||
#### Features
|
|
||||||
- **Full Name**: Complete name as given
|
|
||||||
- **Native Name**: Identified native/given name component
|
|
||||||
- **Surname**: Family name component
|
|
||||||
- **Name Length**: Character count features
|
|
||||||
- **Word Count**: Number of words in name
|
|
||||||
- **Province**: Geographic/demographic features
|
|
||||||
- **Name Beginnings/Endings**: Prefix/suffix patterns
|
|
||||||
- **Character N-grams**: Linguistic pattern features
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
|
|
||||||
### Environment Configurations
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Switch to development configuration (smaller batches, more logging)
|
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
||||||
make config-dev
|
cd drc-ners-nlp
|
||||||
|
|
||||||
# Switch to production configuration (optimized for performance)
|
# Setup environment
|
||||||
make config-prod
|
python -m venv .venv
|
||||||
|
.venv/bin/pip install --upgrade pip
|
||||||
|
.venv/bin/pip install -r requirements.txt
|
||||||
|
|
||||||
# View current configuration
|
pip install --upgrade pip
|
||||||
make show-config
|
pip install -r requirements.txt
|
||||||
|
pip install jupyter notebook ipykernel pytest black flake8 mypy
|
||||||
|
|
||||||
|
source .venv/bin/activate
|
||||||
```
|
```
|
||||||
|
|
||||||
### Custom Configuration
|
## Data Processing
|
||||||
|
|
||||||
Edit configuration files in `config/`:
|
This project includes a robust data processing pipeline designed to handle large datasets efficiently with batching,
|
||||||
- `pipeline.yaml` - Main configuration
|
checkpointing, and parallel processing capabilities.
|
||||||
- `pipeline.development.yaml` - Development overrides
|
Steps are defined in the `drc-ners-nlp/processing/steps` directory, and the configuration to enable them is managed through
|
||||||
- `pipeline.production.yaml` - Production settings
|
the `drc-ners-nlp/config/pipeline.yaml` file.
|
||||||
|
|
||||||
|
**Pipeline Configuration**
|
||||||
|
|
||||||
Example configuration:
|
|
||||||
```yaml
|
```yaml
|
||||||
processing:
|
stages:
|
||||||
batch_size: 1000
|
- "data_cleaning"
|
||||||
max_workers: 4
|
- "feature_extraction"
|
||||||
|
- "llm_annotation"
|
||||||
llm:
|
- "data_splitting"
|
||||||
model_name: "mistral:7b"
|
|
||||||
requests_per_minute: 60
|
|
||||||
|
|
||||||
data:
|
|
||||||
split_evaluation: true
|
|
||||||
split_by_gender: true
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Research Capabilities
|
**Running the Pipeline**
|
||||||
|
|
||||||
### Systematic Experimentation
|
|
||||||
|
|
||||||
The framework supports systematic research through:
|
|
||||||
|
|
||||||
1. **Baseline Studies**: Compare fundamental approaches
|
|
||||||
2. **Feature Studies**: Test individual name components
|
|
||||||
3. **Ablation Studies**: Identify most important features
|
|
||||||
4. **Cross-Province Analysis**: Test generalization across regions
|
|
||||||
5. **Hyperparameter Optimization**: Systematic parameter tuning
|
|
||||||
|
|
||||||
### Reproducible Research
|
|
||||||
|
|
||||||
- **Experiment Tracking**: All experiments automatically logged with full configuration
|
|
||||||
- **Result Export**: CSV export for publication and further analysis
|
|
||||||
- **Statistical Testing**: Cross-validation and confidence intervals
|
|
||||||
- **Version Control**: Configuration-based approach enables easy replication
|
|
||||||
|
|
||||||
### Publication-Ready Output
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Generate comprehensive results for publication
|
python main.py --env development
|
||||||
make research-flow
|
|
||||||
make export-results
|
|
||||||
|
|
||||||
# Get best models for each approach
|
|
||||||
make list-completed
|
|
||||||
python research/cli.py list --status completed | head -10
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Development
|
## Experiments
|
||||||
|
|
||||||
|
This project provides a modular experiment (model training and evaluation) framework for systematic model comparison and
|
||||||
|
research iteration. Models are defined in the `drc-ners-nlp/research/models` directory.
|
||||||
|
You can define model features, training parameters, and evaluation metrics in the `research_templates.yaml` file.
|
||||||
|
|
||||||
|
**Running Experiments**
|
||||||
|
|
||||||
### Code Quality and Testing
|
|
||||||
```bash
|
```bash
|
||||||
make format # Format code with black
|
python train.py --name="bigru" --type="baseline" --env="development"
|
||||||
make lint # Lint with flake8
|
python train.py --name="cnn" --type="baseline" --env="development"
|
||||||
make check-deps # Verify dependencies
|
python train.py --name="lightgbm" --type="baseline" --env="development"
|
||||||
|
|
||||||
|
python train.py --name="logistic_regression_fullname" --type="baseline" --env="development"
|
||||||
|
python train.py --name="logistic_regression_native" --type="baseline" --env="development"
|
||||||
|
python train.py --name="logistic_regression_surname" --type="baseline" --env="development"
|
||||||
|
|
||||||
|
python train.py --name="lstm" --type="baseline" --env="development"
|
||||||
|
python train.py --name="random_forest" --type="baseline" --env="development"
|
||||||
|
python train.py --name="svm" --type="baseline" --env="development"
|
||||||
|
python train.py --name="naive_bayes" --type="baseline" --env="development"
|
||||||
|
python train.py --name="transformer" --type="baseline" --env="development"
|
||||||
|
python train.py --name="xgboost" --type="baseline" --env="development"
|
||||||
```
|
```
|
||||||
|
|
||||||
### Development Workflow
|
## Web Interface
|
||||||
|
|
||||||
|
This project includes a user-friendly web interface built with Streamlit, allowing non-technical users to run
|
||||||
|
experiments and make predictions without needing to understand the underlying code.
|
||||||
|
|
||||||
|
### Running the Web Interface
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
make daily-work # Daily development setup
|
streamlit run app.py
|
||||||
make notebook # Launch Jupyter for analysis
|
|
||||||
make web-dev # Launch web interface with auto-reload
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Data Management
|
## Contributors
|
||||||
```bash
|
|
||||||
make check-data # Verify all data files
|
|
||||||
make data-stats # Show dataset statistics
|
|
||||||
make backup-data # Create timestamped backup
|
|
||||||
make clean-checkpoints # Clean processing checkpoints
|
|
||||||
```
|
|
||||||
|
|
||||||
## Project Structure
|
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
|
||||||
|
<img src="https://contrib.rocks/image?repo=bernard-ng/drc-ners-nlp" alt="contributors"/>
|
||||||
```
|
</a>
|
||||||
├── Makefile # All command shortcuts
|
|
||||||
├── streamlit_app.py # Web interface application
|
|
||||||
├── config/ # Configuration files
|
|
||||||
│ ├── pipeline.yaml # Main configuration
|
|
||||||
│ ├── pipeline.development.yaml # Dev settings
|
|
||||||
│ └── pipeline.production.yaml # Prod settings
|
|
||||||
├── core/ # Core framework
|
|
||||||
│ ├── config.py # Configuration management
|
|
||||||
│ ├── domain.py # Domain-specific data
|
|
||||||
│ └── utils.py # Reusable utilities
|
|
||||||
├── processing/ # Data processing pipeline
|
|
||||||
│ ├── main.py # Main pipeline script
|
|
||||||
│ ├── pipeline.py # Pipeline framework
|
|
||||||
│ ├── steps_config.py # Configurable processing steps
|
|
||||||
│ └── monitor.py # Monitoring utilities
|
|
||||||
├── research/ # Research and experiments
|
|
||||||
│ ├── cli.py # Command-line interface
|
|
||||||
│ ├── experiment.py # Experiment management
|
|
||||||
│ ├── models.py # Model implementations
|
|
||||||
│ └── runner.py # Experiment execution
|
|
||||||
└── dataset/ # Data files
|
|
||||||
└── names.csv # Raw dataset
|
|
||||||
```
|
|
||||||
|
|
||||||
## Citation
|
|
||||||
|
|
||||||
If you use this pipeline in your research, please cite:
|
|
||||||
|
|
||||||
```bibtex
|
|
||||||
@software{drc_names_pipeline,
|
|
||||||
title={DRC Names Gender Prediction Pipeline: A Culturally-Aware NLP System},
|
|
||||||
author={Your Name},
|
|
||||||
year={2025},
|
|
||||||
url={https://github.com/bernard-ng/drc-ners-nlp}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## License
|
|
||||||
|
|
||||||
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
||||||
|
|
||||||
## Acknowledgments
|
|
||||||
|
|
||||||
- Democratic Republic of Congo population data contributors
|
|
||||||
- Open source NLP and machine learning communities
|
|
||||||
- Cultural linguistics research communities
|
|
||||||
|
|||||||
@@ -1,205 +0,0 @@
|
|||||||
#!.venv/bin/python3
|
|
||||||
import argparse
|
|
||||||
import logging
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
from core.config import setup_config
|
|
||||||
from research.experiment.experiment_runner import ExperimentRunner
|
|
||||||
from research.experiment.experiment_tracker import ExperimentTracker
|
|
||||||
|
|
||||||
|
|
||||||
def list_experiments(args):
    """Log a summary table of tracked experiments, optionally filtered.

    Args:
        args: Parsed CLI namespace. The optional attributes ``status``,
            ``model_type`` and ``tags`` are applied as filters when they are
            present and truthy.

    Returns:
        None. The table (or a "nothing found" notice) is emitted through
        ``logging.info``.
    """
    tracker = ExperimentTracker()

    # Read every filter defensively: the sub-parser that dispatches here may
    # not register each of these options, and a plain ``args.model_type``
    # raises AttributeError when the attribute was never defined.
    status = getattr(args, "status", None)
    model_type = getattr(args, "model_type", None)
    tags = getattr(args, "tags", None)

    filters = {}
    if status:
        # Imported lazily so the enum is only loaded when a status filter is used.
        from research.experiment import ExperimentStatus

        filters["status"] = ExperimentStatus(status)
    if model_type:
        filters["model_type"] = model_type
    if tags:
        filters["tags"] = tags

    experiments = tracker.list_experiments(**filters)

    if not experiments:
        logging.info("No experiments found matching criteria")
        return

    # One row per experiment; the ID is truncated to keep the table narrow.
    rows = [
        {
            "ID": exp.experiment_id[:12] + "...",
            "Name": exp.config.name,
            "Model": exp.config.model_type,
            "Status": exp.status.value,
            "Test Acc": f"{exp.test_metrics.get('accuracy', 0):.4f}" if exp.test_metrics else "N/A",
            "Start Time": exp.start_time.strftime("%Y-%m-%d %H:%M"),
        }
        for exp in experiments
    ]

    df = pd.DataFrame(rows)
    logging.info(df.to_string(index=False))
|
|
||||||
|
|
||||||
|
|
||||||
def show_experiment_details(args):
    """Log the stored details of a single experiment.

    Args:
        args: Parsed CLI namespace; ``args.experiment_id`` selects the
            experiment to look up in the tracker.

    Returns:
        None. Output goes through ``logging``; an error is logged when the
        tracker returns nothing for the requested ID.
    """
    experiment = ExperimentTracker().get_experiment(args.experiment_id)

    if not experiment:
        logging.error(f"Experiment not found: {args.experiment_id}")
        return

    cfg = experiment.config

    # Header section: identity, configuration and timing.
    logging.info("=== Experiment Details ===")
    logging.info(f"ID: {experiment.experiment_id}")
    logging.info(f"Name: {cfg.name}")
    logging.info(f"Description: {cfg.description}")
    logging.info(f"Model Type: {cfg.model_type}")
    feature_names = ", ".join([feat.value for feat in cfg.features])
    logging.info(f"Features: {feature_names}")
    logging.info(f"Status: {experiment.status.value}")
    logging.info(f"Start Time: {experiment.start_time}")
    logging.info(f"End Time: {experiment.end_time}")

    test_metrics = experiment.test_metrics
    if test_metrics:
        logging.info("=== Test Metrics ===")
        for name, score in test_metrics.items():
            logging.info(f"{name}: {score:.4f}")

    cv_metrics = experiment.cv_metrics
    if cv_metrics:
        logging.info("=== Cross-Validation Metrics ===")
        for name, score in cv_metrics.items():
            # "*_std" entries are printed next to their base metric, not on their own.
            if name.endswith("_std"):
                continue
            spread = cv_metrics.get(f"{name}_std", 0)
            logging.info(f"{name}: {score:.4f} ± {spread:.4f}")

    importances = experiment.feature_importance
    if importances:
        logging.info("=== Top 10 Feature Importances ===")
        ranked = sorted(importances.items(), key=lambda pair: pair[1], reverse=True)
        for feat_name, weight in ranked[:10]:
            logging.info(f"{feat_name}: {weight:.4f}")

    examples = experiment.prediction_examples
    if examples:
        logging.info("=== Prediction Examples ===")
        for position, example in enumerate(examples[:5], start=1):
            if example["correct"]:
                mark = "✓"
            else:
                mark = "✗"
            logging.info(
                f"{position}. {example['name']} -> True: {example['true_label']}, "
                f"Pred: {example['predicted_label']} {mark}"
            )
|
|
||||||
|
|
||||||
|
|
||||||
def compare_experiments_cmd(args):
    """Log a side-by-side comparison of several experiments.

    Args:
        args: Parsed CLI namespace. ``args.experiment_ids`` lists the
            experiments to compare. The optional ``config`` and ``env``
            attributes select the configuration; they fall back to no explicit
            config file and ``"development"`` when absent.

    Returns:
        None. The comparison table is emitted through ``logging.info``.
    """
    # Honour the configuration options parsed from the command line instead of
    # always forcing the development environment.
    config = setup_config(
        config_path=getattr(args, "config", None),
        env=getattr(args, "env", None) or "development",
    )
    runner = ExperimentRunner(config)
    comparison = runner.compare_experiments(args.experiment_ids)

    if comparison.empty:
        logging.info("No experiments found for comparison")
        return

    logging.info("=== Experiment Comparison ===")

    # Show only the key columns that the comparison frame actually contains.
    key_columns = ["name", "model_type", "features", "test_accuracy", "test_f1"]
    available_columns = [col for col in key_columns if col in comparison.columns]

    logging.info(comparison[available_columns].to_string(index=False))
|
|
||||||
|
|
||||||
|
|
||||||
def export_results(args):
    """Export tracked experiment results and log where they were written.

    Args:
        args: Parsed CLI namespace; ``args.output`` is an optional output file
            path. When it is falsy, ``None`` is passed to the tracker.
    """
    if args.output:
        destination = Path(args.output)
    else:
        destination = None

    output_path = ExperimentTracker().export_results(destination)
    logging.info(f"Results exported to: {output_path}")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Parse the command line and dispatch to the matching sub-command handler.

    Returns:
        int: ``0`` when the command ran, ``1`` when no command was given or the
        handler (or configuration loading) raised an exception.
    """
    parser = argparse.ArgumentParser(
        description="DRC Names Research Experiment Manager",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Global arguments
    parser.add_argument("--config", type=Path, help="Path to configuration file")
    parser.add_argument(
        "--env", type=str, default="development",
        help="Environment name (default: development)"
    )
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose logging")

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # List experiments
    list_parser = subparsers.add_parser("list", help="List experiments")
    list_parser.add_argument("--status", choices=["pending", "running", "completed", "failed"])
    # The list handler filters on ``args.model_type``; registering the option
    # here guarantees the attribute exists (defaulting to None).
    list_parser.add_argument("--model-type", dest="model_type", help="Filter by model type")
    list_parser.add_argument("--tags", nargs="+", help="Filter by tags")

    # Show experiment details
    detail_parser = subparsers.add_parser("show", help="Show experiment details")
    detail_parser.add_argument("experiment_id", help="Experiment ID")

    # Compare experiments
    compare_parser = subparsers.add_parser("compare", help="Compare experiments")
    compare_parser.add_argument("experiment_ids", nargs="+", help="Experiment IDs to compare")

    # Export results
    export_parser = subparsers.add_parser("export", help="Export results to CSV")
    export_parser.add_argument("--output", help="Output file path")

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return 1

    try:
        # Load configuration and set up logging. The returned object is not
        # needed here, so it is not bound to a name.
        setup_config(config_path=args.config, env=args.env)

        # Override log level if verbose requested
        if args.verbose:
            logging.getLogger().setLevel(logging.DEBUG)

        # Execute command
        command_map = {
            "list": list_experiments,
            "show": show_experiment_details,
            "compare": compare_experiments_cmd,
            "export": export_results,
        }
        handler = command_map.get(args.command)
        if handler:
            handler(args)

        return 0

    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code.
        logging.error(f"Command failed: {e}")
        if args.verbose:
            import traceback

            traceback.print_exc()
        return 1
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|
||||||
@@ -1,17 +1,12 @@
|
|||||||
# Production Environment Configuration
|
|
||||||
# Optimized settings for production deployment
|
|
||||||
|
|
||||||
name: "drc_names_pipeline"
|
|
||||||
version: "1.0.0"
|
|
||||||
environment: "development"
|
environment: "development"
|
||||||
debug: true
|
debug: true
|
||||||
|
|
||||||
# Processing settings
|
# Processing settings
|
||||||
processing:
|
processing:
|
||||||
batch_size: 100_000
|
batch_size: 10_000
|
||||||
max_workers: 8
|
max_workers: 8
|
||||||
checkpoint_interval: 10
|
checkpoint_interval: 10
|
||||||
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
|
use_multiprocessing: true
|
||||||
|
|
||||||
# Pipeline stages
|
# Pipeline stages
|
||||||
stages:
|
stages:
|
||||||
@@ -20,7 +15,6 @@ stages:
|
|||||||
#- "llm_annotation"
|
#- "llm_annotation"
|
||||||
- "data_splitting"
|
- "data_splitting"
|
||||||
|
|
||||||
|
|
||||||
# Production LLM settings
|
# Production LLM settings
|
||||||
llm:
|
llm:
|
||||||
model_name: "mistral:7b"
|
model_name: "mistral:7b"
|
||||||
@@ -31,14 +25,10 @@ llm:
|
|||||||
max_concurrent_requests: 4
|
max_concurrent_requests: 4
|
||||||
enable_rate_limiting: true
|
enable_rate_limiting: true
|
||||||
|
|
||||||
# Development data settings - limited dataset for faster testing
|
# Data handling configuration
|
||||||
data:
|
data:
|
||||||
split_evaluation: true
|
max_dataset_size: 100_000
|
||||||
split_by_gender: true
|
balance_by_sex: true
|
||||||
evaluation_fraction: 0.2
|
|
||||||
random_seed: 42
|
|
||||||
max_dataset_size: ~ # Limit to 10k records for development/testing
|
|
||||||
balance_by_sex: false # Balance male/female samples when limiting
|
|
||||||
|
|
||||||
# Enhanced logging for development
|
# Enhanced logging for development
|
||||||
logging:
|
logging:
|
||||||
|
|||||||
@@ -1,17 +1,12 @@
|
|||||||
# Production Environment Configuration
|
|
||||||
# Optimized settings for production deployment
|
|
||||||
|
|
||||||
name: "drc_names_pipeline"
|
|
||||||
version: "1.0.0"
|
|
||||||
environment: "production"
|
environment: "production"
|
||||||
debug: false
|
debug: false
|
||||||
|
|
||||||
# Production processing settings (optimized for performance)
|
# Processing settings
|
||||||
processing:
|
processing:
|
||||||
batch_size: 10_000
|
batch_size: 10_000
|
||||||
max_workers: 8
|
max_workers: 8
|
||||||
checkpoint_interval: 10
|
checkpoint_interval: 10
|
||||||
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
|
use_multiprocessing: true
|
||||||
|
|
||||||
# Pipeline stages
|
# Pipeline stages
|
||||||
stages:
|
stages:
|
||||||
@@ -20,7 +15,6 @@ stages:
|
|||||||
- "llm_annotation"
|
- "llm_annotation"
|
||||||
- "data_splitting"
|
- "data_splitting"
|
||||||
|
|
||||||
|
|
||||||
# Production LLM settings
|
# Production LLM settings
|
||||||
llm:
|
llm:
|
||||||
model_name: "mistral:7b"
|
model_name: "mistral:7b"
|
||||||
@@ -31,19 +25,15 @@ llm:
|
|||||||
max_concurrent_requests: 4
|
max_concurrent_requests: 4
|
||||||
enable_rate_limiting: true
|
enable_rate_limiting: true
|
||||||
|
|
||||||
# Production data settings
|
# Data handling configuration
|
||||||
data:
|
data:
|
||||||
split_evaluation: true
|
|
||||||
split_by_gender: true
|
|
||||||
evaluation_fraction: 0.2
|
|
||||||
random_seed: 42
|
|
||||||
max_dataset_size: null
|
max_dataset_size: null
|
||||||
balance_by_sex: false
|
balance_by_sex: false
|
||||||
|
|
||||||
# Production logging (less verbose)
|
# Production logging (less verbose)
|
||||||
logging:
|
logging:
|
||||||
level: "INFO"
|
level: "INFO"
|
||||||
console_logging: false # Disable console in production
|
console_logging: false
|
||||||
file_logging: true
|
file_logging: true
|
||||||
log_file: "pipeline.production.log"
|
log_file: "pipeline.production.log"
|
||||||
max_log_size: 52428800 # 50MB
|
max_log_size: 52428800 # 50MB
|
||||||
|
|||||||
+47
-47
@@ -1,72 +1,72 @@
|
|||||||
# DRC Names Processing Pipeline Configuration
|
# DRC Names Processing Pipeline Configuration
|
||||||
# Main configuration file with default settings
|
# Main configuration file with default settings
|
||||||
|
|
||||||
name: "drc_names_pipeline"
|
name: "drc_ners_pipeline" # Name of the pipeline
|
||||||
version: "1.0.0"
|
version: "1.0.0" # Version of the pipeline
|
||||||
description: "DRC Names NLP Processing Pipeline"
|
description: "DRC NERS NLP Processing" # Description of the pipeline
|
||||||
environment: "development"
|
environment: "development" # Environment type (development, production, etc.)
|
||||||
debug: false
|
debug: false # Enable debug mode for detailed logging and error reporting
|
||||||
|
|
||||||
# Project directory structure
|
# Project directory structure
|
||||||
paths:
|
paths:
|
||||||
root_dir: "."
|
root_dir: "." # Root directory of the project
|
||||||
configs_dir: "./config"
|
configs_dir: "./config" # Directory for configuration files
|
||||||
data_dir: "./data/dataset"
|
data_dir: "./data/dataset" # Directory for dataset files
|
||||||
models_dir: "./data/models"
|
models_dir: "./data/models" # Directory for model files
|
||||||
outputs_dir: "./data/outputs"
|
outputs_dir: "./data/outputs" # Directory for output files
|
||||||
logs_dir: "./data/logs"
|
logs_dir: "./data/logs" # Directory for log files
|
||||||
checkpoints_dir: "./data/checkpoints"
|
checkpoints_dir: "./data/checkpoints" # Directory for model checkpoints
|
||||||
|
|
||||||
# Pipeline stages
|
# Pipeline stages
|
||||||
stages:
|
stages: # List of stages in the processing pipeline
|
||||||
- "data_cleaning"
|
- "data_cleaning" # Data cleaning stage
|
||||||
- "feature_extraction"
|
- "feature_extraction" # Feature extraction stage
|
||||||
- "llm_annotation"
|
- "llm_annotation" # LLM annotation stage (computationally intensive)
|
||||||
- "data_splitting"
|
- "data_splitting" # Data splitting stage
|
||||||
|
|
||||||
# Data processing configuration
|
# Data processing configuration
|
||||||
processing:
|
processing:
|
||||||
batch_size: 1_000
|
batch_size: 1_000 # Size of data batches to process at once
|
||||||
max_workers: 4
|
max_workers: 4 # Number of worker threads for parallel processing
|
||||||
checkpoint_interval: 5
|
checkpoint_interval: 5 # Interval for saving checkpoints during processing
|
||||||
use_multiprocessing: false
|
use_multiprocessing: false # Enable multiprocessing for CPU-bound tasks
|
||||||
encoding_options:
|
encoding_options: # List of encodings to try when reading files
|
||||||
- "utf-8"
|
- "utf-8"
|
||||||
- "utf-16"
|
- "utf-16"
|
||||||
- "latin1"
|
- "latin1"
|
||||||
chunk_size: 100_000
|
chunk_size: 100_000 # Size of data chunks to process in parallel
|
||||||
|
|
||||||
# LLM annotation settings
|
# LLM annotation settings
|
||||||
llm:
|
llm:
|
||||||
model_name: "mistral:7b"
|
model_name: "mistral:7b" # Name of the LLM model to use
|
||||||
requests_per_minute: 60
|
requests_per_minute: 60 # Requests per minute to the LLM service
|
||||||
requests_per_second: 2
|
requests_per_second: 2 # Requests per second to the LLM service
|
||||||
retry_attempts: 3
|
retry_attempts: 3 # Number of retry attempts for LLM requests
|
||||||
timeout_seconds: 600
|
timeout_seconds: 600 # Timeout for LLM requests
|
||||||
max_concurrent_requests: 2
|
max_concurrent_requests: 2 # Maximum concurrent requests to the LLM service
|
||||||
enable_rate_limiting: true
|
enable_rate_limiting: true # Enable rate limiting to avoid overloading the LLM service
|
||||||
|
|
||||||
# Data handling configuration
|
# Data handling configuration
|
||||||
data:
|
data:
|
||||||
input_file: "names.csv"
|
input_file: "names.csv" # Input file containing names data
|
||||||
output_files:
|
output_files:
|
||||||
featured: "names_featured.csv"
|
featured: "names_featured.csv" # Output file for featured data
|
||||||
evaluation: "names_evaluation.csv"
|
evaluation: "names_evaluation.csv" # Output file for evaluation set
|
||||||
males: "names_males.csv"
|
males: "names_males.csv" # Output file for male names
|
||||||
females: "names_females.csv"
|
females: "names_females.csv" # Output file for female names
|
||||||
split_evaluation: true
|
split_evaluation: true # Should the dataset be split into training and evaluation sets?
|
||||||
split_by_gender: true
|
split_by_gender: true # Should the dataset be split by gender?
|
||||||
evaluation_fraction: 0.2
|
evaluation_fraction: 0.2 # Fraction of data to use for evaluation
|
||||||
random_seed: 42
|
random_seed: 42 # Random seed for reproducibility
|
||||||
max_dataset_size: null
|
max_dataset_size: null # Maximum size of the dataset to process, set to null for no limit
|
||||||
balance_by_sex: false
|
balance_by_sex: false # Should the dataset be balanced by sex when limiting the dataset size?
|
||||||
|
|
||||||
# Logging configuration
|
# Logging configuration
|
||||||
logging:
|
logging:
|
||||||
level: "INFO"
|
level: "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
|
||||||
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||||
file_logging: true
|
file_logging: true # Enable logging to file
|
||||||
console_logging: true
|
console_logging: true # Enable logging to console
|
||||||
log_file: "pipeline.log"
|
log_file: "pipeline.log" # Log file name
|
||||||
max_log_size: 10485760 # 10MB
|
max_log_size: 10485760 # Maximum size of log file before rotation (10MB)
|
||||||
backup_count: 5
|
backup_count: 5 # Number of backup log files to keep
|
||||||
|
|||||||
+126
-106
@@ -1,128 +1,148 @@
|
|||||||
# Research Experiment Configuration Templates
|
|
||||||
# These configurations can be used as starting points for different types of experiments
|
|
||||||
|
|
||||||
# Baseline Experiments Configuration
|
|
||||||
baseline_experiments:
|
baseline_experiments:
|
||||||
- name: "baseline_logistic_regression_fullname"
|
- name: "bigru"
|
||||||
|
description: "Baseline BiGRU with full name features"
|
||||||
|
model_type: "bigru"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
max_len: 20
|
||||||
|
embedding_dim: 64
|
||||||
|
gru_units: 32
|
||||||
|
epochs: 10
|
||||||
|
batch_size: 32
|
||||||
|
tags: [ "baseline", "neural", "bigru" ]
|
||||||
|
|
||||||
|
- name: "cnn"
|
||||||
|
description: "Baseline CNN with character patterns"
|
||||||
|
model_type: "cnn"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
max_len: 20
|
||||||
|
embedding_dim: 64
|
||||||
|
filters: 64
|
||||||
|
kernel_size: 3
|
||||||
|
dropout: 0.5
|
||||||
|
epochs: 10
|
||||||
|
batch_size: 32
|
||||||
|
tags: [ "baseline", "neural", "cnn" ]
|
||||||
|
|
||||||
|
- name: "ensemble"
|
||||||
|
description: "Baseline Ensemble with multiple models"
|
||||||
|
model_type: "ensemble"
|
||||||
|
features: [ "full_name", "name_length", "word_count" ]
|
||||||
|
model_params:
|
||||||
|
base_models: [ "logistic_regression", "random_forest", "xgboost" ]
|
||||||
|
voting: "soft"
|
||||||
|
cv_folds: 5
|
||||||
|
tags: [ "baseline", "ensemble" ]
|
||||||
|
|
||||||
|
- name: "lightgbm"
|
||||||
|
description: "Baseline LightGBM with engineered features"
|
||||||
|
model_type: "lightgbm"
|
||||||
|
features: [ "full_name", "name_length", "word_count" ]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 100
|
||||||
|
max_depth: -1
|
||||||
|
learning_rate: 0.1
|
||||||
|
num_leaves: 31
|
||||||
|
subsample: 0.8
|
||||||
|
colsample_bytree: 0.8
|
||||||
|
tags: [ "baseline", "lightgbm" ]
|
||||||
|
|
||||||
|
- name: "logistic_regression_fullname"
|
||||||
description: "Baseline logistic regression with full name"
|
description: "Baseline logistic regression with full name"
|
||||||
model_type: "logistic_regression"
|
model_type: "logistic_regression"
|
||||||
features: [ "full_name" ]
|
features: [ "full_name" ]
|
||||||
model_params:
|
model_params:
|
||||||
ngram_range: [2, 5]
|
|
||||||
max_features: 10000
|
max_features: 10000
|
||||||
max_iter: 1000
|
tags: [ "baseline", "logistic_regression", "fullname" ]
|
||||||
tags: ["baseline", "fullname"]
|
|
||||||
|
|
||||||
- name: "baseline_logistic_regression_native"
|
- name: "logistic_regression_native"
|
||||||
description: "Logistic regression with native name only"
|
description: "Logistic regression with native name only"
|
||||||
model_type: "logistic_regression"
|
model_type: "logistic_regression"
|
||||||
features: [ "native_name" ]
|
features: [ "native_name" ]
|
||||||
model_params:
|
model_params:
|
||||||
ngram_range: [2, 4]
|
|
||||||
max_features: 5000
|
max_features: 5000
|
||||||
tags: ["baseline", "native"]
|
tags: [ "baseline", "logistic_regression", "native" ]
|
||||||
|
|
||||||
- name: "baseline_rf_engineered"
|
- name: "logistic_regression_surname"
|
||||||
description: "Random Forest with engineered features"
|
description: "Logistic regression with surname only"
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: [ "surname" ]
|
||||||
|
model_params:
|
||||||
|
max_features: 5000
|
||||||
|
tags: [ "baseline", "logistic_regression", "surname" ]
|
||||||
|
|
||||||
|
- name: "lstm"
|
||||||
|
description: "Baseline LSTM with full name features"
|
||||||
|
model_type: "lstm"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 128
|
||||||
|
lstm_units: 64
|
||||||
|
epochs: 10
|
||||||
|
batch_size: 64
|
||||||
|
tags: [ "baseline", "neural", "lstm" ]
|
||||||
|
|
||||||
|
- name: "naive_bayes"
|
||||||
|
description: "Baseline Naive Bayes with full name features"
|
||||||
|
model_type: "naive_bayes"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
max_features: 5000
|
||||||
|
tags: [ "baseline", "naive_bayes" ]
|
||||||
|
|
||||||
|
- name: "random_forest"
|
||||||
|
description: "Baseline Random Forest with engineered features"
|
||||||
model_type: "random_forest"
|
model_type: "random_forest"
|
||||||
features: [ "name_length", "word_count", "province" ]
|
features: [ "name_length", "word_count", "province" ]
|
||||||
model_params:
|
model_params:
|
||||||
n_estimators: 100
|
n_estimators: 100
|
||||||
max_depth: 10
|
max_depth: 10
|
||||||
tags: ["baseline", "engineered"]
|
min_samples_split: 2
|
||||||
|
min_samples_leaf: 1
|
||||||
|
tags: [ "baseline", "random_forest", "engineered" ]
|
||||||
|
|
||||||
|
- name: "svm"
|
||||||
|
description: "Baseline SVM with full name features"
|
||||||
|
model_type: "svm"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
C: 1.0
|
||||||
|
kernel: "rbf"
|
||||||
|
ngram_range: [ 2, 4 ]
|
||||||
|
max_features: 5000
|
||||||
|
tags: [ "baseline", "svm" ]
|
||||||
|
|
||||||
|
- name: "transformer"
|
||||||
|
description: "Baseline Transformer with attention mechanism"
|
||||||
|
model_type: "transformer"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 128
|
||||||
|
num_heads: 4
|
||||||
|
num_layers: 2
|
||||||
|
epochs: 10
|
||||||
|
batch_size: 64
|
||||||
|
tags: [ "baseline", "neural", "transformer" ]
|
||||||
|
|
||||||
|
- name: "xgboost"
|
||||||
|
description: "Baseline XGBoost with engineered features"
|
||||||
|
model_type: "xgboost"
|
||||||
|
features: [ "full_name", "name_length", "word_count" ]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 100
|
||||||
|
max_depth: 6
|
||||||
|
learning_rate: 0.1
|
||||||
|
subsample: 0.8
|
||||||
|
colsample_bytree: 0.8
|
||||||
|
tags: [ "baseline", "xgboost" ]
|
||||||
|
|
||||||
|
|
||||||
|
# Advanced Experiments Configuration
|
||||||
|
advanced_experiments:
|
||||||
|
|
||||||
# Feature Study Configurations
|
# Feature Study Configurations
|
||||||
feature_studies:
|
feature_studies:
|
||||||
- name: "native_vs_surname"
|
|
||||||
description: "Compare native name vs surname effectiveness"
|
|
||||||
experiments:
|
|
||||||
- model_type: "logistic_regression"
|
|
||||||
features: ["native_name"]
|
|
||||||
tags: ["feature_study", "native"]
|
|
||||||
- model_type: "logistic_regression"
|
|
||||||
features: ["surname"]
|
|
||||||
tags: ["feature_study", "surname"]
|
|
||||||
|
|
||||||
- name: "name_parts_analysis"
|
# Hyperparameter Tuning Configurations
|
||||||
description: "Analyze effectiveness of different name parts"
|
hyperparameter_tuning:
|
||||||
experiments:
|
|
||||||
- features: ["first_word"]
|
|
||||||
tags: ["name_parts", "first"]
|
|
||||||
- features: ["last_word"]
|
|
||||||
tags: ["name_parts", "last"]
|
|
||||||
- features: ["name_beginnings"]
|
|
||||||
feature_params:
|
|
||||||
beginning_length: 3
|
|
||||||
tags: ["name_parts", "beginnings"]
|
|
||||||
- features: ["name_endings"]
|
|
||||||
feature_params:
|
|
||||||
ending_length: 3
|
|
||||||
tags: ["name_parts", "endings"]
|
|
||||||
|
|
||||||
# Province-Specific Studies
|
|
||||||
province_studies:
|
|
||||||
- name: "kinshasa_study"
|
|
||||||
description: "Gender prediction for Kinshasa province"
|
|
||||||
model_type: "logistic_regression"
|
|
||||||
features: ["full_name"]
|
|
||||||
train_data_filter:
|
|
||||||
province: "kinshasa"
|
|
||||||
tags: ["province_study", "kinshasa"]
|
|
||||||
|
|
||||||
- name: "cross_province_generalization"
|
|
||||||
description: "Train on one province, test on another"
|
|
||||||
experiments:
|
|
||||||
- train_filter: {"province": "kinshasa"}
|
|
||||||
test_filter: {"province": "bas-congo"}
|
|
||||||
tags: ["generalization", "kinshasa_to_bas-congo"]
|
|
||||||
|
|
||||||
# Model Comparison Studies
|
|
||||||
model_comparisons:
|
|
||||||
- name: "model_comparison_fullname"
|
|
||||||
description: "Compare different models with full name"
|
|
||||||
base_config:
|
|
||||||
features: ["full_name"]
|
|
||||||
tags: ["model_comparison"]
|
|
||||||
models:
|
|
||||||
- model_type: "logistic_regression"
|
|
||||||
model_params:
|
|
||||||
ngram_range: [2, 5]
|
|
||||||
- model_type: "random_forest"
|
|
||||||
# Note: RF will need different feature preparation
|
|
||||||
features: ["name_length", "word_count", "province"]
|
|
||||||
|
|
||||||
# Advanced Feature Combinations
|
|
||||||
advanced_features:
|
|
||||||
- name: "multi_feature_combination"
|
|
||||||
description: "Test various feature combinations"
|
|
||||||
experiments:
|
|
||||||
- features: ["full_name", "name_length"]
|
|
||||||
tags: ["combination", "name_plus_length"]
|
|
||||||
- features: ["native_name", "surname", "province"]
|
|
||||||
tags: ["combination", "semantic_features"]
|
|
||||||
- features: ["name_beginnings", "name_endings", "word_count"]
|
|
||||||
tags: ["combination", "structural_features"]
|
|
||||||
|
|
||||||
# Hyperparameter Studies
|
|
||||||
hyperparameter_studies:
|
|
||||||
- name: "ngram_range_study"
|
|
||||||
description: "Study effect of different n-gram ranges"
|
|
||||||
base_config:
|
|
||||||
model_type: "logistic_regression"
|
|
||||||
features: ["full_name"]
|
|
||||||
tags: ["hyperparameter", "ngram"]
|
|
||||||
variants:
|
|
||||||
- model_params: {"ngram_range": [1, 3]}
|
|
||||||
- model_params: {"ngram_range": [2, 4]}
|
|
||||||
- model_params: {"ngram_range": [2, 5]}
|
|
||||||
- model_params: {"ngram_range": [3, 6]}
|
|
||||||
|
|
||||||
# Data Size Studies
|
|
||||||
data_studies:
|
|
||||||
- name: "learning_curve_study"
|
|
||||||
description: "Study performance vs training data size"
|
|
||||||
base_config:
|
|
||||||
model_type: "logistic_regression"
|
|
||||||
features: ["full_name"]
|
|
||||||
tags: ["learning_curve"]
|
|
||||||
data_sizes: [0.1, 0.25, 0.5, 0.75, 1.0] # Fractions of training data to use
|
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ class ModelTrainer:
|
|||||||
model_type: str = "logistic_regression",
|
model_type: str = "logistic_regression",
|
||||||
features: List[str] = None,
|
features: List[str] = None,
|
||||||
model_params: Dict[str, Any] = None,
|
model_params: Dict[str, Any] = None,
|
||||||
|
tags: List[str] = None,
|
||||||
save_artifacts: bool = True,
|
save_artifacts: bool = True,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
@@ -45,6 +46,10 @@ class ModelTrainer:
|
|||||||
features = ["full_name"]
|
features = ["full_name"]
|
||||||
feature_types = [FeatureType(f) for f in features]
|
feature_types = [FeatureType(f) for f in features]
|
||||||
|
|
||||||
|
# Prepare tags - combine default tags with template tags
|
||||||
|
default_tags = ["training", model_type]
|
||||||
|
experiment_tags = default_tags + (tags or [])
|
||||||
|
|
||||||
# Create experiment configuration
|
# Create experiment configuration
|
||||||
config = ExperimentConfig(
|
config = ExperimentConfig(
|
||||||
name=model_name,
|
name=model_name,
|
||||||
@@ -52,7 +57,7 @@ class ModelTrainer:
|
|||||||
model_type=model_type,
|
model_type=model_type,
|
||||||
features=feature_types,
|
features=feature_types,
|
||||||
model_params=model_params or {},
|
model_params=model_params or {},
|
||||||
tags=["training", model_type],
|
tags=experiment_tags,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Run experiment
|
# Run experiment
|
||||||
|
|||||||
@@ -3,29 +3,98 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
|
import yaml
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from core.config import setup_config
|
from core.config import setup_config
|
||||||
from research.model_trainer import ModelTrainer
|
from research.model_trainer import ModelTrainer
|
||||||
|
|
||||||
|
|
||||||
|
def load_research_templates(templates_path: str = "config/research_templates.yaml") -> dict:
    """Load research experiment templates from a YAML file.

    Args:
        templates_path: Path to the templates YAML file.

    Returns:
        The parsed top-level YAML document, or an empty dict when the file
        contains no document at all.

    Raises:
        FileNotFoundError: If ``templates_path`` does not exist.
        yaml.YAMLError: If the file cannot be parsed as YAML.
    """
    try:
        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's default locale encoding.
        with open(templates_path, "r", encoding="utf-8") as file:
            templates = yaml.safe_load(file)
    except FileNotFoundError:
        logging.error(f"Templates file not found: {templates_path}")
        raise
    except yaml.YAMLError as e:
        logging.error(f"Error parsing templates file: {e}")
        raise

    # safe_load() yields None for an empty file; normalise to an empty mapping
    # so callers can safely use ``section in templates``.
    return templates if templates is not None else {}
|
||||||
|
|
||||||
|
|
||||||
|
def find_experiment_config(templates: dict, name: str, experiment_type: str) -> dict:
    """Find an experiment configuration by name within a template section.

    Matching is attempted in decreasing order of strictness, and the first
    experiment satisfying the strictest rule that matches anything wins:

    1. the experiment ``name`` equals ``name``, ``baseline_<name>`` or
       ``advanced_<name>``;
    2. the experiment ``model_type`` equals ``name``;
    3. ``name`` is a case-insensitive substring of the experiment ``name``.

    Args:
        templates: Parsed templates document (section name -> list of
            experiment mappings).
        name: Experiment name or model type to look up.
        experiment_type: One of ``baseline``, ``advanced``, ``feature_study``
            or ``tuning``.

    Returns:
        The matching experiment mapping.

    Raises:
        ValueError: If the experiment type is unknown, the section is missing
            from ``templates``, or no experiment matches ``name``.
    """
    # Map the CLI-facing type to the section key used in the templates file.
    type_mapping = {
        "baseline": "baseline_experiments",
        "advanced": "advanced_experiments",
        "feature_study": "feature_studies",
        "tuning": "hyperparameter_tuning",
    }

    section_name = type_mapping.get(experiment_type)
    if not section_name:
        available_types = list(type_mapping.keys())
        raise ValueError(f"Unknown experiment type '{experiment_type}'. Available types: {available_types}")

    if not templates or section_name not in templates:
        raise ValueError(f"Section '{section_name}' not found in templates")

    # A section key with no entries parses to None in YAML; treat it as empty
    # instead of failing with a TypeError. Non-mapping entries are ignored.
    experiments = [exp for exp in (templates[section_name] or []) if isinstance(exp, dict)]

    exact_names = (name, f"baseline_{name}", f"advanced_{name}")
    wanted = (name or "").lower()

    def _name_of(exp: dict) -> str:
        # Tolerate entries whose ``name`` is missing or explicitly null.
        return str(exp.get("name") or "")

    # Strictest rule first, so an exact name is never shadowed by an earlier
    # entry that merely contains the requested text.
    matchers = (
        lambda exp: exp.get("name") in exact_names,
        lambda exp: exp.get("model_type") == name,
        # An empty search string would be a substring of everything; skip it.
        lambda exp: bool(wanted) and wanted in _name_of(exp).lower(),
    )
    for matches in matchers:
        for experiment in experiments:
            if matches(experiment):
                return experiment

    # Nothing matched: report what is available in this section.
    available_experiments = [exp.get("name", exp.get("model_type", "unknown")) for exp in experiments]
    raise ValueError(f"Experiment '{name}' not found in '{experiment_type}' section. "
                     f"Available experiments: {available_experiments}")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Train DRC Names Models")
|
parser = argparse.ArgumentParser(description="Train DRC Names Models using Research Templates")
|
||||||
parser.add_argument("--type", type=str, help="Specific model type to train")
|
parser.add_argument("--name", type=str, required=True, help="Model name to train")
|
||||||
parser.add_argument("--name", type=str, help="Model name")
|
parser.add_argument("--type", type=str, required=True, help="Experiment type")
|
||||||
parser.add_argument("--config", type=str, help="Path to configuration file")
|
parser.add_argument("--config", type=str, help="Path to configuration file")
|
||||||
parser.add_argument("--env", type=str, default="development", help="Environment name")
|
parser.add_argument("--env", type=str, default="development", help="Environment name")
|
||||||
|
parser.add_argument("--templates", type=str, default="config/research_templates.yaml")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Setup pipeline configuration
|
||||||
config = setup_config(config_path=args.config, env=args.env)
|
config = setup_config(config_path=args.config, env=args.env)
|
||||||
trainer = ModelTrainer(config)
|
|
||||||
|
|
||||||
# Train specific model
|
# Load research templates
|
||||||
|
logging.info(f"Loading research templates from: {args.templates}")
|
||||||
|
templates = load_research_templates(args.templates)
|
||||||
|
|
||||||
|
# Find the specific experiment configuration
|
||||||
|
logging.info(f"Looking for experiment: name='{args.name}', type='{args.type}'")
|
||||||
|
experiment_config = find_experiment_config(templates, args.name, args.type)
|
||||||
|
|
||||||
|
logging.info(f"Found experiment: {experiment_config.get('name')}")
|
||||||
|
logging.info(f"Description: {experiment_config.get('description')}")
|
||||||
|
logging.info(f"Features: {experiment_config.get('features')}")
|
||||||
|
|
||||||
|
# Train the model using template configuration
|
||||||
|
trainer = ModelTrainer(config)
|
||||||
trainer.train_single_model(
|
trainer.train_single_model(
|
||||||
model_name=args.name,
|
model_name=experiment_config.get("name"),
|
||||||
model_type=args.type,
|
model_type=experiment_config.get("model_type"),
|
||||||
features=["full_name"]
|
features=experiment_config.get("features"),
|
||||||
|
model_params=experiment_config.get("model_params", {}),
|
||||||
|
tags=experiment_config.get("tags", [])
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logging.info("Training completed successfully!")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Reference in New Issue
Block a user