refactoring: add initial pipeline configuration and model classes
This commit is contained in:
+3
-2
@@ -4,8 +4,9 @@
|
|||||||
__pycache__/
|
__pycache__/
|
||||||
.ipynb_checkpoints/
|
.ipynb_checkpoints/
|
||||||
*.pyc
|
*.pyc
|
||||||
/models/
|
|
||||||
.env.local
|
.env.local
|
||||||
var/
|
var/
|
||||||
/dataset/
|
/data/dataset/
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
/data/
|
||||||
|
/backups
|
||||||
|
|||||||
@@ -2,24 +2,127 @@
|
|||||||
default: help
|
default: help
|
||||||
|
|
||||||
.PHONY: help
|
.PHONY: help
|
||||||
help:
|
help: ## Show this help message
|
||||||
@echo Tasks:
|
|
||||||
@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
|
@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
|
||||||
|
|
||||||
.PHONY: download
|
# =============================================================================
|
||||||
download:
|
# ENVIRONMENT SETUP
|
||||||
@if [ ! -f dataset/names.csv ]; then \
|
# =============================================================================
|
||||||
set -a; [ -f .env.local ] && . .env.local; set +a; \
|
|
||||||
[ -z "$$DATASET_URL" ] && . .env; \
|
|
||||||
mkdir -p dataset; \
|
|
||||||
curl -L "$${DATASET_URL}" -o dataset/names.csv; \
|
|
||||||
else \
|
|
||||||
echo "dataset/names.csv already exists. Skipping download."; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
.PHONY: clean
|
.PHONY: setup
|
||||||
clean:
|
setup: ## Setup virtual environment and install dependencies
|
||||||
rm -rf ./models
|
python -m venv .venv
|
||||||
rm -rf ./results
|
.venv/bin/pip install --upgrade pip
|
||||||
rm -rf ./dataset/spacy/train.spacy
|
.venv/bin/pip install -r requirements.txt
|
||||||
rm -rf ./dataset/spacy/dev.spacy
|
|
||||||
|
.PHONY: install
|
||||||
|
install: ## Install/update dependencies
|
||||||
|
pip install --upgrade pip
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
.PHONY: install-dev
|
||||||
|
install-dev: ## Install development dependencies
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install jupyter notebook ipykernel pytest black flake8 mypy
|
||||||
|
|
||||||
|
.PHONY: activate
|
||||||
|
activate: ## Show activation command
|
||||||
|
@echo "Run: source .venv/bin/activate"
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# MODEL TRAINING & ARTIFACTS
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
.PHONY: train-baseline
|
||||||
|
train-baseline: ## Train all baseline models and save artifacts
|
||||||
|
python research/train.py --mode baseline
|
||||||
|
|
||||||
|
.PHONY: train-neural
|
||||||
|
train-neural: ## Train neural network models (LSTM, CNN, Transformer)
|
||||||
|
python research/train.py --mode neural
|
||||||
|
|
||||||
|
.PHONY: train-model
|
||||||
|
train-model: ## Train specific model (use: make train-model MODEL=logistic_regression NAME=my_model)
|
||||||
|
python research/train.py --model-type $(MODEL) --name $(NAME)
|
||||||
|
|
||||||
|
.PHONY: list-models
|
||||||
|
list-models: ## List all saved model artifacts
|
||||||
|
python research/train.py --mode list
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# RESEARCH & EXPERIMENTS
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
.PHONY: experiment
|
||||||
|
experiment: ## Create sample experiment configuration
|
||||||
|
python research/cli.py run --name "sample_experiment" --features full_name --model-type logistic_regression
|
||||||
|
|
||||||
|
.PHONY: baseline
|
||||||
|
baseline: ## Run baseline experiments
|
||||||
|
python research/cli.py baseline
|
||||||
|
|
||||||
|
.PHONY: ablation
|
||||||
|
ablation: ## Run feature ablation study
|
||||||
|
python research/cli.py ablation
|
||||||
|
|
||||||
|
.PHONY: components
|
||||||
|
components: ## Run name component analysis
|
||||||
|
python research/cli.py components
|
||||||
|
|
||||||
|
.PHONY: list-experiments
|
||||||
|
list-experiments: ## List all experiments
|
||||||
|
python research/cli.py list
|
||||||
|
|
||||||
|
.PHONY: list-completed
|
||||||
|
list-completed: ## List completed experiments only
|
||||||
|
python research/cli.py list --status completed
|
||||||
|
|
||||||
|
.PHONY: export-results
|
||||||
|
export-results: ## Export all experiment results to CSV
|
||||||
|
python research/cli.py export --output results_$(shell date +%Y%m%d_%H%M%S).csv
|
||||||
|
|
||||||
|
.PHONY: best-model
|
||||||
|
best-model: ## Show best performing model
|
||||||
|
python research/cli.py list --status completed | head -5
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# WEB INTERFACE
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
.PHONY: web
|
||||||
|
web: ## Launch Streamlit web interface
|
||||||
|
streamlit run web/app.py --server.runOnSave true --server.port 8501
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# DEVELOPMENT & CODE QUALITY
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
.PHONY: format
|
||||||
|
format: ## Format code with black
|
||||||
|
black . --line-length 100
|
||||||
|
|
||||||
|
.PHONY: lint
|
||||||
|
lint: ## Lint code with flake8
|
||||||
|
flake8 . --max-line-length=100 --ignore=E203,W503 --exclude=.venv
|
||||||
|
|
||||||
|
.PHONY: type-check
|
||||||
|
type-check: ## Type check with mypy
|
||||||
|
mypy . --ignore-missing-imports
|
||||||
|
|
||||||
|
.PHONY: notebook
|
||||||
|
notebook: ## Start Jupyter notebook
|
||||||
|
jupyter notebook notebooks/
|
||||||
|
|
||||||
|
.PHONY: lab
|
||||||
|
lab: ## Start Jupyter lab
|
||||||
|
jupyter lab notebooks/
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# DEPLOYMENT & PRODUCTION
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
.PHONY: backup
|
||||||
|
backup: ## Backup datasets and results
|
||||||
|
@mkdir -p backups/$(shell date +%Y%m%d_%H%M%S)
|
||||||
|
@cp -r data/ backups/$(shell date +%Y%m%d_%H%M%S)/data/
|
||||||
|
@echo "Backup created in backups/$(shell date +%Y%m%d_%H%M%S)/"
|
||||||
|
|||||||
@@ -1,110 +1,316 @@
|
|||||||
# NERS-NLP: A Culturally-Aware Natural Language Processing System with Named Entity Recognition and Gender Inference Models
|
# DRC Names Gender Prediction Pipeline: A Culturally-Aware NLP System for Congolese Name Analysis
|
||||||
|
|
||||||
Despite the growing success of Named Entity Recognition (NER) systems and gender inference models in Natural Language Processing (NLP), these tools often underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training data. In this paper, we propose NERS-NLP, a culturally-aware NLP system with Named Entity Recognition and Gender Inference Models. This study introduces a large-scale dataset of over 7 million names of the population of the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata, including geographical distribution. We explore the linguistic and sociocultural features embedded in these names and examine their impact on two key NLP tasks, namely, entity recognition and gender classification.
|
A comprehensive, research-friendly pipeline for analyzing Congolese names and predicting gender using culturally-aware machine learning models.
|
||||||
Our approach involves:
|
This system provides advanced data processing, experiment management, and an intuitive web interface for non-technical users.
|
||||||
|
|
||||||
- (1) a statistical and feature analysis of Congolese name structures,
|
## Overview
|
||||||
- (2) the development of supervised gender prediction models leveraging name components and demographic patterns,
|
|
||||||
- (3) the integration of the curated name lexicon into NER pipelines to improve recognition accuracy for Congolese entities.
|
|
||||||
|
|
||||||
|
Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training data.
|
||||||
|
This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 7 million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
|
||||||
|
|
||||||
Experiments conducted on custom evaluation sets, including multilingual and code-switched Congolese texts, show that our culturally-aware methods significantly outperform state-of-the-art multilingual baselines.
|
Our approach involves:
|
||||||
This work demonstrates the importance of culturally grounded resources in reducing bias and improving performance in NLP systems applied to underrepresented regions. Our findings open new directions for inclusive language technologies in African contexts and contribute a valuable resource for future research in regional linguistics, onomastics, and identity-aware artificial intelligence.
|
|
||||||
|
|
||||||
|
- **(1) Advanced data processing pipeline** with batching, checkpointing, and parallel processing
|
||||||
|
- **(2) Modular experiment framework** for systematic model comparison and research iteration
|
||||||
|
- **(3) Multiple feature extraction strategies** leveraging name components, linguistic patterns, and demographic data
|
||||||
|
- **(4) Culturally-aware gender prediction models** trained specifically on Congolese naming patterns
|
||||||
|
- **(5) User-friendly web interface** enabling non-technical users to run experiments and make predictions
|
||||||
|
- **(6) Comprehensive research tools** for reproducible experimentation and result analysis
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
### **Advanced Data Processing**
|
||||||
|
- **Batched processing** with configurable batch sizes and parallel execution
|
||||||
|
- **Automatic checkpointing** and resume capability for large datasets
|
||||||
|
- **LLM-powered annotation** with rate limiting and retry logic
|
||||||
|
- **Memory-efficient** chunked data loading for datasets of any size
|
||||||
|
|
||||||
|
### **Research-Friendly Experiment Framework**
|
||||||
|
- **Modular model architecture** - easily add new models and features
|
||||||
|
- **Systematic experiment tracking** with automatic result storage
|
||||||
|
- **Feature ablation studies** and component analysis tools
|
||||||
|
- **Cross-validation** and statistical significance testing
|
||||||
|
- **Automated baseline comparisons** and performance analysis
|
||||||
|
|
||||||
|
### **Intuitive Web Interface**
|
||||||
|
- **No-code experiment creation** with visual parameter selection
|
||||||
|
- **Real-time monitoring** of data processing and training progress
|
||||||
|
- **Interactive result visualization** with charts and comparisons
|
||||||
|
- **Batch prediction capabilities** for CSV file upload and processing
|
||||||
|
- **Model comparison tools** with automatic performance rankings
|
||||||
|
|
||||||
|
### **Comprehensive Analytics**
|
||||||
|
- **Feature importance analysis** showing which name components matter most
|
||||||
|
- **Province-specific studies** examining regional naming patterns
|
||||||
|
- **Learning curve analysis** for understanding data requirements
|
||||||
|
- **Prediction confidence scoring** and error analysis tools
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### Using Make Commands (Recommended)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Complete setup and basic processing
|
||||||
|
make quick-start
|
||||||
|
|
||||||
|
# Launch web interface
|
||||||
|
make web
|
||||||
|
|
||||||
|
# Run research workflow
|
||||||
|
make research-flow
|
||||||
|
|
||||||
|
# Show all available commands
|
||||||
|
make help
|
||||||
|
```
|
||||||
|
|
||||||
|
### Manual Installation
|
||||||
|
|
||||||
## Installation
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
||||||
cd drc-ners-nlp
|
cd drc-ners-nlp
|
||||||
|
|
||||||
python3 -m venv .venv
|
# Setup environment
|
||||||
source .venv/bin/activate
|
make setup
|
||||||
|
make process
|
||||||
|
|
||||||
pip install -r requirements.txt
|
# Launch web interface
|
||||||
|
make web
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
## Dataset
|
### Web Interface (Recommended for Non-Technical Users)
|
||||||
### Preparation
|
|
||||||
| Name | Description | Default |
|
Launch the Streamlit web application:
|
||||||
|------------------|--------------------------------------------------------------------|---------|
|
```bash
|
||||||
| --split_eval | Split into evaluation and featured datasets | True |
|
make web
|
||||||
| --no-split_eval | Do not split into evaluation and featured datasets | |
|
```
|
||||||
| --split_by_sex | Split by sex into male/female datasets | True |
|
|
||||||
| --no-split_by_sex| Do not split by sex into male/female datasets | |
|
The interface provides:
|
||||||
|
- **Dashboard**: Overview of datasets and recent experiments
|
||||||
|
- **Data Overview**: Interactive data exploration and statistics
|
||||||
|
- **Data Processing**: Monitor and control the processing pipeline
|
||||||
|
- **Experiments**: Create and manage machine learning experiments
|
||||||
|
- **Results & Analysis**: Compare models and analyze performance
|
||||||
|
- **Predictions**: Make predictions on new names or upload CSV files
|
||||||
|
- **Settings**: Configure the system and manage data
|
||||||
|
|
||||||
|
### Research & Experiments
|
||||||
|
|
||||||
|
#### Quick Research Studies
|
||||||
|
```bash
|
||||||
|
# Compare different approaches (full name vs native vs surname)
|
||||||
|
make baseline
|
||||||
|
|
||||||
|
# Analyze which name components are most effective
|
||||||
|
make components
|
||||||
|
|
||||||
|
# Test feature importance through ablation study
|
||||||
|
make ablation
|
||||||
|
|
||||||
|
# View all experiment results
|
||||||
|
make list-experiments
|
||||||
|
|
||||||
|
# Export results for publication
|
||||||
|
make export-results
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Custom Experiments
|
||||||
|
```bash
|
||||||
|
# Run specific experiment via command line
|
||||||
|
python research/cli.py run \
|
||||||
|
--name "native_name_study" \
|
||||||
|
--features native_name \
|
||||||
|
--model-type logistic_regression \
|
||||||
|
--description "Test native name effectiveness"
|
||||||
|
|
||||||
|
# Compare multiple experiments
|
||||||
|
python research/cli.py compare <exp_id_1> <exp_id_2>
|
||||||
|
|
||||||
|
# View detailed results
|
||||||
|
python research/cli.py show <experiment_id>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Data Processing Pipeline
|
||||||
|
|
||||||
|
#### Basic Processing (No LLM)
|
||||||
|
```bash
|
||||||
|
make process-basic # Fast processing without LLM annotation
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Complete Processing (With LLM)
|
||||||
|
```bash
|
||||||
|
make process # Full pipeline including LLM annotation
|
||||||
|
make process-dev # Development mode with smaller batches
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Monitor Progress
|
||||||
|
```bash
|
||||||
|
make monitoring # Show current pipeline status
|
||||||
|
make status # Show overall system status
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Resume Interrupted Processing
|
||||||
|
```bash
|
||||||
|
make process-resume # Resume from last checkpoint
|
||||||
|
```
|
||||||
|
|
||||||
|
### Available Models and Features
|
||||||
|
|
||||||
|
#### Models
|
||||||
|
- **Logistic Regression**: Character n-gram based classification
|
||||||
|
- **Random Forest**: Engineered feature-based classification
|
||||||
|
- **LSTM**: Sequential neural network (planned)
|
||||||
|
- **Transformer**: Attention-based model (planned)
|
||||||
|
|
||||||
|
#### Features
|
||||||
|
- **Full Name**: Complete name as given
|
||||||
|
- **Native Name**: Identified native/given name component
|
||||||
|
- **Surname**: Family name component
|
||||||
|
- **Name Length**: Character count features
|
||||||
|
- **Word Count**: Number of words in name
|
||||||
|
- **Province**: Geographic/demographic features
|
||||||
|
- **Name Beginnings/Endings**: Prefix/suffix patterns
|
||||||
|
- **Character N-grams**: Linguistic pattern features
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Environment Configurations
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m processing.prepare --split_eval --split_by_sex
|
# Switch to development configuration (smaller batches, more logging)
|
||||||
|
make config-dev
|
||||||
|
|
||||||
|
# Switch to production configuration (optimized for performance)
|
||||||
|
make config-prod
|
||||||
|
|
||||||
|
# View current configuration
|
||||||
|
make show-config
|
||||||
```
|
```
|
||||||
|
|
||||||
### Annotation
|
### Custom Configuration
|
||||||
| Name | Description | Default |
|
|
||||||
|-------------|-----------------------------------------------------|----------------|
|
|
||||||
| --llm_model | Ollama model name to use | mistral:7b |
|
|
||||||
|
|
||||||
Example:
|
Edit configuration files in `config/`:
|
||||||
|
- `pipeline.yaml` - Main configuration
|
||||||
|
- `pipeline.development.yaml` - Development overrides
|
||||||
|
- `pipeline.production.yaml` - Production settings
|
||||||
|
|
||||||
|
Example configuration:
|
||||||
|
```yaml
|
||||||
|
processing:
|
||||||
|
batch_size: 1000
|
||||||
|
max_workers: 4
|
||||||
|
|
||||||
|
llm:
|
||||||
|
model_name: "mistral:7b"
|
||||||
|
requests_per_minute: 60
|
||||||
|
|
||||||
|
data:
|
||||||
|
split_evaluation: true
|
||||||
|
split_by_gender: true
|
||||||
|
```
|
||||||
|
|
||||||
|
## Research Capabilities
|
||||||
|
|
||||||
|
### Systematic Experimentation
|
||||||
|
|
||||||
|
The framework supports systematic research through:
|
||||||
|
|
||||||
|
1. **Baseline Studies**: Compare fundamental approaches
|
||||||
|
2. **Feature Studies**: Test individual name components
|
||||||
|
3. **Ablation Studies**: Identify most important features
|
||||||
|
4. **Cross-Province Analysis**: Test generalization across regions
|
||||||
|
5. **Hyperparameter Optimization**: Systematic parameter tuning
|
||||||
|
|
||||||
|
### Reproducible Research
|
||||||
|
|
||||||
|
- **Experiment Tracking**: All experiments automatically logged with full configuration
|
||||||
|
- **Result Export**: CSV export for publication and further analysis
|
||||||
|
- **Statistical Testing**: Cross-validation and confidence intervals
|
||||||
|
- **Version Control**: Configuration-based approach enables easy replication
|
||||||
|
|
||||||
|
### Publication-Ready Output
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m processing.annotate --llm_model=mistral:7b
|
# Generate comprehensive results for publication
|
||||||
|
make research-flow
|
||||||
|
make export-results
|
||||||
|
|
||||||
|
# Get best models for each approach
|
||||||
|
make list-completed
|
||||||
|
python research/cli.py list --status completed | head -10
|
||||||
```
|
```
|
||||||
|
|
||||||
## Experiments
|
## Development
|
||||||
### Training
|
|
||||||
| Name | Description | Default |
|
|
||||||
|----------------|--------------------------------------------------|--------------------|
|
|
||||||
| --dataset | Path to the dataset file | names_featured.csv |
|
|
||||||
| --size | Number of samples to use (None for full dataset) | None |
|
|
||||||
| --threshold | Probability threshold for gender classification | 0.5 |
|
|
||||||
| --cv | Number of cross-validation folds | None |
|
|
||||||
| --save | Whether to save the trained model | False |
|
|
||||||
| --balanced | Whether to balance the dataset | False |
|
|
||||||
| --epochs | Number of training epochs | 10 |
|
|
||||||
| --test_size | Proportion of data to use as test set | 0.2 |
|
|
||||||
| --random_state | Random seed for reproducibility | 42 |
|
|
||||||
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
|
|
||||||
|
### Code Quality and Testing
|
||||||
```bash
|
```bash
|
||||||
python -m pipeline.gender.models.lstm --size 1000000 --save
|
make format # Format code with black
|
||||||
python -m pipeline.gender.models.logreg --size 1000000 --save
|
make lint # Lint with flake8
|
||||||
python -m pipeline.gender.models.transformer --size 1000000 --save
|
make check-deps # Verify dependencies
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Development Workflow
|
||||||
```bash
|
```bash
|
||||||
python -m pipeline.gender.models.lstm --size 1000000 --balanced --save
|
make daily-work # Daily development setup
|
||||||
python -m pipeline.gender.models.logreg --size 1000000 --balanced --save
|
make notebook # Launch Jupyter for analysis
|
||||||
python -m pipeline.gender.models.transformer --size 1000000 --balanced --save
|
make web-dev # Launch web interface with auto-reload
|
||||||
```
|
```
|
||||||
|
|
||||||
### Evaluation
|
### Data Management
|
||||||
| Name | Description | Default |
|
|
||||||
|------------|-----------------------------------------------|----------------------|
|
|
||||||
| --model | Model type: logreg, lstm, or transformer | (required) |
|
|
||||||
| --dataset | Path to the dataset CSV file | names_featured.csv |
|
|
||||||
| --size | Number of rows to load from the dataset | None |
|
|
||||||
| --balanced | Load balanced dataset | False |
|
|
||||||
| --threshold| Probability threshold for classification | 0.5 |
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m pipeline.gender.eval --dataset names_evaluations.csv --model logreg
|
make check-data # Verify all data files
|
||||||
python -m pipeline.gender.eval --dataset names_evaluations.csv --model lstm
|
make data-stats # Show dataset statistics
|
||||||
python -m pipeline.gender.eval --dataset names_evaluations.csv --model transformer
|
make backup-data # Create timestamped backup
|
||||||
|
make clean-checkpoints # Clean processing checkpoints
|
||||||
```
|
```
|
||||||
|
|
||||||
### Inference
|
## Project Structure
|
||||||
| Name | Description | Default |
|
|
||||||
|-------------|------------------------------------------|-----------|
|
|
||||||
| --model | Model type: logreg, lstm, or transformer | (required)|
|
|
||||||
| --names | One or more names | (required)|
|
|
||||||
| --threshold | Threshold for classification | 0.5 |
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python -m pipeline.gender.predict --model logreg --names "Tshisekedi"
|
|
||||||
python -m pipeline.gender.predict --model lstm --names "Ilunga Ngandu"
|
|
||||||
python -m pipeline.gender.predict --model transformer --names "musenga wa musenga"
|
|
||||||
```
|
```
|
||||||
|
├── Makefile # All command shortcuts
|
||||||
|
├── streamlit_app.py # Web interface application
|
||||||
|
├── config/ # Configuration files
|
||||||
|
│ ├── pipeline.yaml # Main configuration
|
||||||
|
│ ├── pipeline.development.yaml # Dev settings
|
||||||
|
│ └── pipeline.production.yaml # Prod settings
|
||||||
|
├── core/ # Core framework
|
||||||
|
│ ├── config.py # Configuration management
|
||||||
|
│ ├── domain.py # Domain-specific data
|
||||||
|
│ └── utils.py # Reusable utilities
|
||||||
|
├── processing/ # Data processing pipeline
|
||||||
|
│ ├── main.py # Main pipeline script
|
||||||
|
│ ├── pipeline.py # Pipeline framework
|
||||||
|
│ ├── steps_config.py # Configurable processing steps
|
||||||
|
│ └── monitor.py # Monitoring utilities
|
||||||
|
├── research/ # Research and experiments
|
||||||
|
│ ├── cli.py # Command-line interface
|
||||||
|
│ ├── experiment.py # Experiment management
|
||||||
|
│ ├── models.py # Model implementations
|
||||||
|
│ └── runner.py # Experiment execution
|
||||||
|
└── dataset/ # Data files
|
||||||
|
└── names.csv # Raw dataset
|
||||||
|
```
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
|
||||||
|
If you use this pipeline in your research, please cite:
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
@software{drc_names_pipeline,
|
||||||
|
title={DRC Names Gender Prediction Pipeline: A Culturally-Aware NLP System},
|
||||||
|
author={Your Name},
|
||||||
|
year={2025},
|
||||||
|
url={https://github.com/bernard-ng/drc-ners-nlp}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
This project is licensed under the MIT License - see the LICENSE file for details.
|
||||||
|
|
||||||
|
## Acknowledgments
|
||||||
|
|
||||||
|
- Democratic Republic of Congo population data contributors
|
||||||
|
- Open source NLP and machine learning communities
|
||||||
|
- Cultural linguistics research communities
|
||||||
|
|||||||
@@ -0,0 +1,383 @@
|
|||||||
|
#!.venv/bin/python3
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
import pandas as pd
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from core.config import get_config, setup_logging
|
||||||
|
from research.experiment import ExperimentConfig
|
||||||
|
from research.experiment.experiment_tracker import ExperimentTracker
|
||||||
|
from research.experiment.feature_extractor import FeatureType
|
||||||
|
from research.experiment.experiment_builder import ExperimentBuilder
|
||||||
|
from research.experiment.experiment_runner import ExperimentRunner
|
||||||
|
from research.model_registry import list_available_models
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_json_arg(raw, warn_msg):
    """Parse a JSON string taken from a CLI argument.

    Returns the decoded value, or None when *raw* is empty/None or is not
    valid JSON. On a decode error the caller-supplied *warn_msg* is logged
    so the run continues with defaults instead of aborting.
    """
    if not raw:
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        logging.warning(warn_msg)
        return None


def create_experiment_from_args(args) -> ExperimentConfig:
    """Create experiment configuration from command line arguments.

    Unknown feature names are skipped with a warning; if none remain,
    FULL_NAME is used as the default feature. Malformed JSON in
    model/feature parameters or the train filter is logged and ignored
    rather than raising.
    """
    # Collect recognized feature types, skipping anything FeatureType rejects.
    features = []
    for feature_name in args.features or []:
        try:
            features.append(FeatureType(feature_name))
        except ValueError:
            logging.warning(f"Unknown feature type '{feature_name}', skipping")

    if not features:
        features = [FeatureType.FULL_NAME]  # Default

    # Optional JSON-valued arguments share one parse-or-warn path.
    parsed_model = _parse_json_arg(
        args.model_params, "Invalid JSON for model parameters, using defaults"
    )
    model_params = parsed_model if parsed_model is not None else {}

    parsed_features = _parse_json_arg(
        args.feature_params, "Invalid JSON for feature parameters, using defaults"
    )
    feature_params = parsed_features if parsed_features is not None else {}

    # Train filter stays None when absent or unparseable.
    train_filter = _parse_json_arg(args.train_filter, "Invalid JSON for train filter, ignoring")

    return ExperimentConfig(
        name=args.name,
        description=args.description or "",
        tags=args.tags or [],
        model_type=args.model_type,
        model_params=model_params,
        features=features,
        feature_params=feature_params,
        train_data_filter=train_filter,
        target_column=args.target,
        test_size=args.test_size,
        random_seed=args.seed,
        cross_validation_folds=args.cv_folds,
        metrics=args.metrics or ["accuracy", "precision", "recall", "f1"],
    )
|
||||||
|
|
||||||
|
|
||||||
|
def run_single_experiment(args):
    """Build one experiment from CLI arguments, run it, and log its metrics."""
    experiment_config = create_experiment_from_args(args)
    experiment_runner = ExperimentRunner()
    exp_id = experiment_runner.run_experiment(experiment_config)

    logging.info(f"Experiment completed: {exp_id}")

    record = experiment_runner.tracker.get_experiment(exp_id)
    if not record:
        return

    # Held-out test-set metrics.
    logging.info("Results:")
    for name, score in record.test_metrics.items():
        logging.info(f" Test {name}: {score:.4f}")

    # Cross-validation metrics, each reported alongside its "_std" companion.
    if record.cv_metrics:
        logging.info("Cross-validation:")
        for name, score in record.cv_metrics.items():
            if name.endswith("_std"):
                continue
            spread = record.cv_metrics.get(f"{name}_std", 0)
            logging.info(f" CV {name}: {score:.4f} ± {spread:.4f}")
|
||||||
|
|
||||||
|
|
||||||
|
def run_baseline_experiments(args):
    """Run the suite of baseline experiments and log a comparison table.

    *args* is accepted for CLI-dispatch uniformity but is not read here.
    """
    # NOTE: the original assigned `logger = logging.getLogger(__name__)` and
    # never used it (all calls go through the logging module directly) —
    # dead local removed.
    builder = ExperimentBuilder()
    experiments = builder.create_baseline_experiments()

    runner = ExperimentRunner()
    experiment_ids = runner.run_experiment_batch(experiments)

    logging.info(f"Completed {len(experiment_ids)} baseline experiments")

    # Show comparison
    if experiment_ids:
        comparison = runner.compare_experiments(experiment_ids)
        logging.info("Baseline Results Comparison:")
        logging.info(
            comparison[["name", "model_type", "features", "test_accuracy"]].to_string(index=False)
        )
|
||||||
|
|
||||||
|
|
||||||
|
def run_ablation_study(args):
    """Run the feature ablation study and log accuracy/F1 per experiment."""
    ablation_experiments = ExperimentBuilder().create_feature_ablation_study()

    runner = ExperimentRunner()
    completed_ids = runner.run_experiment_batch(ablation_experiments)

    logging.info(f"Completed {len(completed_ids)} ablation experiments")

    if not completed_ids:
        return

    # Summarize accuracy and F1 across all ablation runs.
    summary = runner.compare_experiments(completed_ids)
    logging.info("Ablation Study Results:")
    logging.info(summary[["name", "test_accuracy", "test_f1"]].to_string(index=False))
|
||||||
|
|
||||||
|
|
||||||
|
def run_component_study(args):
    """Run the name-component study and log accuracy/precision/recall."""
    component_experiments = ExperimentBuilder().create_name_component_study()

    runner = ExperimentRunner()
    completed_ids = runner.run_experiment_batch(component_experiments)

    logging.info(f"Completed {len(completed_ids)} component study experiments")

    if not completed_ids:
        return

    # Compare the component experiments on the headline test metrics.
    summary = runner.compare_experiments(completed_ids)
    logging.info("Name Component Study Results:")
    logging.info(
        summary[["name", "test_accuracy", "test_precision", "test_recall"]].to_string(
            index=False
        )
    )
|
||||||
|
|
||||||
|
|
||||||
|
def list_experiments(args):
    """List experiments, optionally filtered by status, model type, or tags."""
    tracker = ExperimentTracker()

    # Build keyword filters only from the CLI options actually provided.
    query = {}
    if args.status:
        from research.experiment import ExperimentStatus

        query["status"] = ExperimentStatus(args.status)
    if args.model_type:
        query["model_type"] = args.model_type
    if args.tags:
        query["tags"] = args.tags

    matches = tracker.list_experiments(**query)

    if not matches:
        logging.info("No experiments found matching criteria")
        return

    # Render a compact per-experiment summary table via pandas.
    summary_rows = [
        {
            "ID": exp.experiment_id[:12] + "...",
            "Name": exp.config.name,
            "Model": exp.config.model_type,
            "Status": exp.status.value,
            "Test Acc": f"{exp.test_metrics.get('accuracy', 0):.4f}" if exp.test_metrics else "N/A",
            "Start Time": exp.start_time.strftime("%Y-%m-%d %H:%M"),
        }
        for exp in matches
    ]
    logging.info(pd.DataFrame(summary_rows).to_string(index=False))
|
||||||
|
|
||||||
|
|
||||||
|
def show_experiment_details(args):
    """Show detailed results for an experiment"""

    tracker = ExperimentTracker()
    experiment = tracker.get_experiment(args.experiment_id)

    if not experiment:
        logging.error(f"Experiment not found: {args.experiment_id}")
        return

    # Core metadata common to every experiment
    logging.info("=== Experiment Details ===")
    logging.info(f"ID: {experiment.experiment_id}")
    logging.info(f"Name: {experiment.config.name}")
    logging.info(f"Description: {experiment.config.description}")
    logging.info(f"Model Type: {experiment.config.model_type}")
    logging.info(f"Features: {', '.join([f.value for f in experiment.config.features])}")
    logging.info(f"Status: {experiment.status.value}")
    logging.info(f"Start Time: {experiment.start_time}")
    logging.info(f"End Time: {experiment.end_time}")

    # Held-out test metrics, when the experiment recorded them
    if experiment.test_metrics:
        logging.info("=== Test Metrics ===")
        for metric, value in experiment.test_metrics.items():
            logging.info(f"{metric}: {value:.4f}")

    # cv_metrics stores mean values plus companion "<metric>_std" keys;
    # report each mean together with its standard deviation
    if experiment.cv_metrics:
        logging.info("=== Cross-Validation Metrics ===")
        for metric, value in experiment.cv_metrics.items():
            if not metric.endswith("_std"):
                std_key = f"{metric}_std"
                std_val = experiment.cv_metrics.get(std_key, 0)
                logging.info(f"{metric}: {value:.4f} ± {std_val:.4f}")

    # Top feature importances, highest first
    if experiment.feature_importance:
        logging.info("=== Top 10 Feature Importances ===")
        sorted_features = sorted(
            experiment.feature_importance.items(), key=lambda x: x[1], reverse=True
        )
        for feature, importance in sorted_features[:10]:
            logging.info(f"{feature}: {importance:.4f}")

    # A handful of sample predictions with correctness markers
    if experiment.prediction_examples:
        logging.info("=== Prediction Examples ===")
        for i, example in enumerate(experiment.prediction_examples[:5]):
            correct = "✓" if example["correct"] else "✗"
            logging.info(
                f"{i + 1}. {example['name']} -> True: {example['true_label']}, "
                f"Pred: {example['predicted_label']} {correct}"
            )
def compare_experiments_cmd(args):
    """Compare multiple experiments"""
    comparison = ExperimentRunner().compare_experiments(args.experiment_ids)

    if comparison.empty:
        logging.info("No experiments found for comparison")
        return

    logging.info("=== Experiment Comparison ===")

    # Restrict the table to whichever key columns are actually present
    key_columns = ["name", "model_type", "features", "test_accuracy", "test_f1"]
    present_columns = [column for column in key_columns if column in comparison.columns]

    logging.info(comparison[present_columns].to_string(index=False))
def export_results(args):
    """Export experiment results"""
    # No --output means the tracker picks its own default destination
    destination = Path(args.output) if args.output else None
    output_path = ExperimentTracker().export_results(destination)
    logging.info(f"Results exported to: {output_path}")
def main():
    """Main CLI entry point.

    Builds the argparse CLI, configures logging, dispatches to the selected
    subcommand handler, and returns a process exit code (0 success, 1 failure).
    """
    parser = argparse.ArgumentParser(
        description="DRC Names Research Experiment Manager",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Global options
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose logging")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Single experiment command
    exp_parser = subparsers.add_parser("run", help="Run a single experiment")
    exp_parser.add_argument("--name", required=True, help="Experiment name")
    exp_parser.add_argument("--description", help="Experiment description")
    exp_parser.add_argument(
        "--model-type",
        default="logistic_regression",
        choices=list_available_models(),
        help="Model type",
    )
    exp_parser.add_argument(
        "--features", nargs="+", choices=[f.value for f in FeatureType], help="Features to use"
    )
    # JSON-valued options are parsed downstream by the run handler
    exp_parser.add_argument("--model-params", help="Model parameters as JSON")
    exp_parser.add_argument("--feature-params", help="Feature parameters as JSON")
    exp_parser.add_argument("--train-filter", help="Training data filter as JSON")
    exp_parser.add_argument("--target", default="sex", help="Target column")
    exp_parser.add_argument("--test-size", type=float, default=0.2, help="Test set size")
    exp_parser.add_argument("--seed", type=int, default=42, help="Random seed")
    exp_parser.add_argument("--cv-folds", type=int, default=5, help="CV folds")
    exp_parser.add_argument(
        "--metrics",
        nargs="+",
        choices=["accuracy", "precision", "recall", "f1"],
        help="Metrics to calculate",
    )
    exp_parser.add_argument("--tags", nargs="+", help="Experiment tags")

    # Batch experiment commands (no extra options)
    subparsers.add_parser("baseline", help="Run baseline experiments")
    subparsers.add_parser("ablation", help="Run feature ablation study")
    subparsers.add_parser("components", help="Run name component study")

    # List experiments
    list_parser = subparsers.add_parser("list", help="List experiments")
    list_parser.add_argument("--status", choices=["pending", "running", "completed", "failed"])
    list_parser.add_argument("--model-type", choices=list_available_models())
    list_parser.add_argument("--tags", nargs="+", help="Filter by tags")

    # Show experiment details
    detail_parser = subparsers.add_parser("show", help="Show experiment details")
    detail_parser.add_argument("experiment_id", help="Experiment ID")

    # Compare experiments
    compare_parser = subparsers.add_parser("compare", help="Compare experiments")
    compare_parser.add_argument("experiment_ids", nargs="+", help="Experiment IDs to compare")

    # Export results
    export_parser = subparsers.add_parser("export", help="Export results to CSV")
    export_parser.add_argument("--output", help="Output file path")

    args = parser.parse_args()

    # No subcommand given: show usage and signal failure
    if not args.command:
        parser.print_help()
        return 1

    # Setup logging; --verbose overrides the configured level
    config = get_config()
    if args.verbose:
        config.logging.level = "DEBUG"
    setup_logging(config)

    # Execute command
    try:
        if args.command == "run":
            run_single_experiment(args)
        elif args.command == "baseline":
            run_baseline_experiments(args)
        elif args.command == "ablation":
            run_ablation_study(args)
        elif args.command == "components":
            run_component_study(args)
        elif args.command == "list":
            list_experiments(args)
        elif args.command == "show":
            show_experiment_details(args)
        elif args.command == "compare":
            compare_experiments_cmd(args)
        elif args.command == "export":
            export_results(args)

        return 0

    except Exception as e:
        # Top-level boundary: log the failure and report a nonzero exit code
        logging.error(f"Command failed: {e}")
        if args.verbose:
            import traceback

            traceback.print_exc()
        return 1
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status
    sys.exit(main())
|
||||||
@@ -0,0 +1,46 @@
|
|||||||
|
# Development Environment Configuration
|
||||||
|
# Settings tuned for local development
|
||||||
|
|
||||||
|
name: "drc_names_pipeline"
|
||||||
|
version: "1.0.0"
|
||||||
|
environment: "development"
|
||||||
|
debug: true
|
||||||
|
|
||||||
|
# Processing settings
|
||||||
|
processing:
|
||||||
|
batch_size: 100_000
|
||||||
|
max_workers: 8
|
||||||
|
checkpoint_interval: 10
|
||||||
|
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
|
||||||
|
|
||||||
|
# Pipeline stages
|
||||||
|
stages:
|
||||||
|
- "data_cleaning"
|
||||||
|
- "feature_extraction"
|
||||||
|
#- "llm_annotation"
|
||||||
|
- "data_splitting"
|
||||||
|
|
||||||
|
|
||||||
|
# Development LLM settings
|
||||||
|
llm:
|
||||||
|
model_name: "mistral:7b"
|
||||||
|
requests_per_minute: 120
|
||||||
|
requests_per_second: 3
|
||||||
|
retry_attempts: 3
|
||||||
|
timeout_seconds: 45
|
||||||
|
max_concurrent_requests: 4
|
||||||
|
enable_rate_limiting: true
|
||||||
|
|
||||||
|
# Development data settings
|
||||||
|
data:
|
||||||
|
split_evaluation: true
|
||||||
|
split_by_gender: true
|
||||||
|
evaluation_fraction: 0.2
|
||||||
|
random_seed: 42
|
||||||
|
|
||||||
|
# Enhanced logging for development
|
||||||
|
logging:
|
||||||
|
level: "INFO"
|
||||||
|
console_logging: true
|
||||||
|
file_logging: true
|
||||||
|
log_file: "pipeline.development.log"
|
||||||
@@ -0,0 +1,48 @@
|
|||||||
|
# Production Environment Configuration
|
||||||
|
# Optimized settings for production deployment
|
||||||
|
|
||||||
|
name: "drc_names_pipeline"
|
||||||
|
version: "1.0.0"
|
||||||
|
environment: "production"
|
||||||
|
debug: false
|
||||||
|
|
||||||
|
# Production processing settings (optimized for performance)
|
||||||
|
processing:
|
||||||
|
batch_size: 10_000
|
||||||
|
max_workers: 8
|
||||||
|
checkpoint_interval: 10
|
||||||
|
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
|
||||||
|
|
||||||
|
# Pipeline stages
|
||||||
|
stages:
|
||||||
|
- "data_cleaning"
|
||||||
|
- "feature_extraction"
|
||||||
|
- "llm_annotation"
|
||||||
|
- "data_splitting"
|
||||||
|
|
||||||
|
|
||||||
|
# Production LLM settings
|
||||||
|
llm:
|
||||||
|
model_name: "mistral:7b"
|
||||||
|
requests_per_minute: 360
|
||||||
|
requests_per_second: 3
|
||||||
|
retry_attempts: 3
|
||||||
|
timeout_seconds: 45
|
||||||
|
max_concurrent_requests: 4
|
||||||
|
enable_rate_limiting: true
|
||||||
|
|
||||||
|
# Production data settings
|
||||||
|
data:
|
||||||
|
split_evaluation: true
|
||||||
|
split_by_gender: true
|
||||||
|
evaluation_fraction: 0.2
|
||||||
|
random_seed: 42
|
||||||
|
|
||||||
|
# Production logging (less verbose)
|
||||||
|
logging:
|
||||||
|
level: "INFO"
|
||||||
|
console_logging: false # Disable console in production
|
||||||
|
file_logging: true
|
||||||
|
log_file: "pipeline.production.log"
|
||||||
|
max_log_size: 52428800 # 50MB
|
||||||
|
backup_count: 10
|
||||||
@@ -0,0 +1,70 @@
|
|||||||
|
# DRC Names Processing Pipeline Configuration
|
||||||
|
# Main configuration file with default settings
|
||||||
|
|
||||||
|
name: "drc_names_pipeline"
|
||||||
|
version: "1.0.0"
|
||||||
|
description: "DRC Names NLP Processing Pipeline"
|
||||||
|
environment: "development"
|
||||||
|
debug: false
|
||||||
|
|
||||||
|
# Project directory structure
|
||||||
|
paths:
|
||||||
|
root_dir: "."
|
||||||
|
configs_dir: "./config"
|
||||||
|
data_dir: "./data/dataset"
|
||||||
|
models_dir: "./data/models"
|
||||||
|
outputs_dir: "./data/outputs"
|
||||||
|
logs_dir: "./data/logs"
|
||||||
|
checkpoints_dir: "./data/checkpoints"
|
||||||
|
|
||||||
|
# Pipeline stages
|
||||||
|
stages:
|
||||||
|
- "data_cleaning"
|
||||||
|
- "feature_extraction"
|
||||||
|
- "llm_annotation"
|
||||||
|
- "data_splitting"
|
||||||
|
|
||||||
|
# Data processing configuration
|
||||||
|
processing:
|
||||||
|
batch_size: 1_000
|
||||||
|
max_workers: 4
|
||||||
|
checkpoint_interval: 5
|
||||||
|
use_multiprocessing: false
|
||||||
|
encoding_options:
|
||||||
|
- "utf-8"
|
||||||
|
- "utf-16"
|
||||||
|
- "latin1"
|
||||||
|
chunk_size: 100_000
|
||||||
|
|
||||||
|
# LLM annotation settings
|
||||||
|
llm:
|
||||||
|
model_name: "mistral:7b"
|
||||||
|
requests_per_minute: 60
|
||||||
|
requests_per_second: 2
|
||||||
|
retry_attempts: 3
|
||||||
|
timeout_seconds: 600
|
||||||
|
max_concurrent_requests: 2
|
||||||
|
enable_rate_limiting: true
|
||||||
|
|
||||||
|
# Data handling configuration
|
||||||
|
data:
|
||||||
|
input_file: "names.csv"
|
||||||
|
output_files:
|
||||||
|
featured: "names_featured.csv"
|
||||||
|
evaluation: "names_evaluation.csv"
|
||||||
|
males: "names_males.csv"
|
||||||
|
females: "names_females.csv"
|
||||||
|
split_evaluation: true
|
||||||
|
split_by_gender: true
|
||||||
|
evaluation_fraction: 0.2
|
||||||
|
random_seed: 42
|
||||||
|
|
||||||
|
# Logging configuration
|
||||||
|
logging:
|
||||||
|
level: "INFO"
|
||||||
|
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||||
|
file_logging: true
|
||||||
|
console_logging: true
|
||||||
|
log_file: "pipeline.log"
|
||||||
|
max_log_size: 10485760 # 10MB
|
||||||
|
backup_count: 5
|
||||||
@@ -0,0 +1,128 @@
|
|||||||
|
# Research Experiment Configuration Templates
|
||||||
|
# These configurations can be used as starting points for different types of experiments
|
||||||
|
|
||||||
|
# Baseline Experiments Configuration
|
||||||
|
baseline_experiments:
|
||||||
|
- name: "baseline_logistic_regression_fullname"
|
||||||
|
description: "Baseline logistic regression with full name"
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: ["full_name"]
|
||||||
|
model_params:
|
||||||
|
ngram_range: [2, 5]
|
||||||
|
max_features: 10000
|
||||||
|
max_iter: 1000
|
||||||
|
tags: ["baseline", "fullname"]
|
||||||
|
|
||||||
|
- name: "baseline_logistic_regression_native"
|
||||||
|
description: "Logistic regression with native name only"
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: ["native_name"]
|
||||||
|
model_params:
|
||||||
|
ngram_range: [2, 4]
|
||||||
|
max_features: 5000
|
||||||
|
tags: ["baseline", "native"]
|
||||||
|
|
||||||
|
- name: "baseline_rf_engineered"
|
||||||
|
description: "Random Forest with engineered features"
|
||||||
|
model_type: "random_forest"
|
||||||
|
features: ["name_length", "word_count", "province"]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 100
|
||||||
|
max_depth: 10
|
||||||
|
tags: ["baseline", "engineered"]
|
||||||
|
|
||||||
|
# Feature Study Configurations
|
||||||
|
feature_studies:
|
||||||
|
- name: "native_vs_surname"
|
||||||
|
description: "Compare native name vs surname effectiveness"
|
||||||
|
experiments:
|
||||||
|
- model_type: "logistic_regression"
|
||||||
|
features: ["native_name"]
|
||||||
|
tags: ["feature_study", "native"]
|
||||||
|
- model_type: "logistic_regression"
|
||||||
|
features: ["surname"]
|
||||||
|
tags: ["feature_study", "surname"]
|
||||||
|
|
||||||
|
- name: "name_parts_analysis"
|
||||||
|
description: "Analyze effectiveness of different name parts"
|
||||||
|
experiments:
|
||||||
|
- features: ["first_word"]
|
||||||
|
tags: ["name_parts", "first"]
|
||||||
|
- features: ["last_word"]
|
||||||
|
tags: ["name_parts", "last"]
|
||||||
|
- features: ["name_beginnings"]
|
||||||
|
feature_params:
|
||||||
|
beginning_length: 3
|
||||||
|
tags: ["name_parts", "beginnings"]
|
||||||
|
- features: ["name_endings"]
|
||||||
|
feature_params:
|
||||||
|
ending_length: 3
|
||||||
|
tags: ["name_parts", "endings"]
|
||||||
|
|
||||||
|
# Province-Specific Studies
|
||||||
|
province_studies:
|
||||||
|
- name: "kinshasa_study"
|
||||||
|
description: "Gender prediction for Kinshasa province"
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: ["full_name"]
|
||||||
|
train_data_filter:
|
||||||
|
province: "kinshasa"
|
||||||
|
tags: ["province_study", "kinshasa"]
|
||||||
|
|
||||||
|
- name: "cross_province_generalization"
|
||||||
|
description: "Train on one province, test on another"
|
||||||
|
experiments:
|
||||||
|
- train_filter: {"province": "kinshasa"}
|
||||||
|
test_filter: {"province": "bas-congo"}
|
||||||
|
tags: ["generalization", "kinshasa_to_bas-congo"]
|
||||||
|
|
||||||
|
# Model Comparison Studies
|
||||||
|
model_comparisons:
|
||||||
|
- name: "model_comparison_fullname"
|
||||||
|
description: "Compare different models with full name"
|
||||||
|
base_config:
|
||||||
|
features: ["full_name"]
|
||||||
|
tags: ["model_comparison"]
|
||||||
|
models:
|
||||||
|
- model_type: "logistic_regression"
|
||||||
|
model_params:
|
||||||
|
ngram_range: [2, 5]
|
||||||
|
- model_type: "random_forest"
|
||||||
|
# Note: RF will need different feature preparation
|
||||||
|
features: ["name_length", "word_count", "province"]
|
||||||
|
|
||||||
|
# Advanced Feature Combinations
|
||||||
|
advanced_features:
|
||||||
|
- name: "multi_feature_combination"
|
||||||
|
description: "Test various feature combinations"
|
||||||
|
experiments:
|
||||||
|
- features: ["full_name", "name_length"]
|
||||||
|
tags: ["combination", "name_plus_length"]
|
||||||
|
- features: ["native_name", "surname", "province"]
|
||||||
|
tags: ["combination", "semantic_features"]
|
||||||
|
- features: ["name_beginnings", "name_endings", "word_count"]
|
||||||
|
tags: ["combination", "structural_features"]
|
||||||
|
|
||||||
|
# Hyperparameter Studies
|
||||||
|
hyperparameter_studies:
|
||||||
|
- name: "ngram_range_study"
|
||||||
|
description: "Study effect of different n-gram ranges"
|
||||||
|
base_config:
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: ["full_name"]
|
||||||
|
tags: ["hyperparameter", "ngram"]
|
||||||
|
variants:
|
||||||
|
- model_params: {"ngram_range": [1, 3]}
|
||||||
|
- model_params: {"ngram_range": [2, 4]}
|
||||||
|
- model_params: {"ngram_range": [2, 5]}
|
||||||
|
- model_params: {"ngram_range": [3, 6]}
|
||||||
|
|
||||||
|
# Data Size Studies
|
||||||
|
data_studies:
|
||||||
|
- name: "learning_curve_study"
|
||||||
|
description: "Study performance vs training data size"
|
||||||
|
base_config:
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: ["full_name"]
|
||||||
|
tags: ["learning_curve"]
|
||||||
|
data_sizes: [0.1, 0.25, 0.5, 0.75, 1.0] # Fractions of training data to use
|
||||||
@@ -0,0 +1,61 @@
|
|||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from core.config.config_manager import ConfigManager
|
||||||
|
from core.config.logging_config import LoggingConfig
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
config_manager = ConfigManager()
|
||||||
|
|
||||||
|
|
||||||
|
def get_config() -> PipelineConfig:
    """Get the global configuration instance.

    Delegates to the module-level ConfigManager, which lazily loads the
    configuration file on first access and caches it afterwards.
    """
    return config_manager.get_config()
||||||
|
|
||||||
|
|
||||||
|
def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig:
    """Load configuration from specified path"""
    # Without an explicit path, fall back to the cached/global configuration
    if not config_path:
        return config_manager.get_config()
    return config_manager.load_config(Path(config_path))
||||||
|
|
||||||
|
|
||||||
|
def setup_logging(config: PipelineConfig):
    """Setup logging based on configuration"""
    log_settings = config.logging

    # Make sure the log directory exists before any file handler opens it
    log_dir = config.paths.logs_dir
    log_dir.mkdir(parents=True, exist_ok=True)

    # Resolve the configured level name; unknown names fall back to INFO
    level = getattr(logging, log_settings.level.upper(), logging.INFO)
    formatter = logging.Formatter(log_settings.format)

    # Reconfigure the root logger from scratch so repeated calls don't
    # accumulate duplicate handlers
    root = logging.getLogger()
    root.setLevel(level)
    root.handlers.clear()

    if log_settings.console_logging:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        root.addHandler(stream_handler)

    if log_settings.file_logging:
        from logging.handlers import RotatingFileHandler

        rotating_handler = RotatingFileHandler(
            log_dir / log_settings.log_file,
            maxBytes=log_settings.max_log_size,
            backupCount=log_settings.backup_count,
        )
        rotating_handler.setFormatter(formatter)
        root.addHandler(rotating_handler)
||||||
@@ -0,0 +1,145 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Union, Dict, Any
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
from core.config.project_paths import ProjectPaths
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigManager:
    """Centralized configuration management.

    Loads a ``PipelineConfig`` from a YAML or JSON file, caches it, and
    provides helpers to save it, apply in-memory updates, and layer
    environment-specific overrides on top of the base configuration.
    """

    def __init__(self, config_path: Optional[Union[str, Path]] = None):
        # Fall back to searching the standard locations when no explicit
        # path is provided.
        self.config_path = config_path or self._find_config_file()
        self._config: Optional[PipelineConfig] = None
        self._setup_default_paths()

    @classmethod
    def _find_config_file(cls) -> Path:
        """Find configuration file in standard locations."""
        possible_paths = [
            Path.cwd() / "config" / "pipeline.yaml",
            Path.cwd() / "config" / "pipeline.yml",
            Path.cwd() / "pipeline.yaml",
            Path(__file__).parent.parent.parent / "config" / "pipeline.yaml",
        ]

        for path in possible_paths:
            if path.exists():
                return path

        # Return the conventional default path even if it does not exist;
        # load_config() handles the missing-file case with defaults.
        return Path.cwd() / "config" / "pipeline.yaml"

    def _setup_default_paths(self):
        """Setup default project paths relative to the package root."""
        root_dir = Path(__file__).parent.parent.parent
        self.default_paths = ProjectPaths(
            root_dir=root_dir,
            configs_dir=root_dir / "config",
            data_dir=root_dir / "data" / "dataset",
            models_dir=root_dir / "data" / "models",
            outputs_dir=root_dir / "data" / "outputs",
            logs_dir=root_dir / "data" / "logs",
            checkpoints_dir=root_dir / "data" / "checkpoints",
        )

    def load_config(self, config_path: Optional[Path] = None) -> PipelineConfig:
        """Load configuration from file.

        Missing or unparseable files degrade to the default configuration
        rather than aborting the pipeline.
        """
        if config_path:
            self.config_path = config_path

        if not self.config_path.exists():
            logging.warning(f"Config file not found: {self.config_path}. Using defaults.")
            return self._create_default_config()

        try:
            with open(self.config_path, "r") as f:
                # File extension decides the parser
                if self.config_path.suffix.lower() in [".yaml", ".yml"]:
                    config_data = yaml.safe_load(f)
                else:
                    config_data = json.load(f)

            # Ensure paths are properly set when the file omits them.
            # model_dump() is the pydantic v2 API (dict() is deprecated),
            # matching the usage elsewhere in this class.
            if "paths" not in config_data:
                config_data["paths"] = self.default_paths.model_dump()

            self._config = PipelineConfig(**config_data)
            return self._config

        except Exception as e:
            # Deliberate best-effort: log and fall back to defaults
            logging.error(f"Failed to load config from {self.config_path}: {e}")
            return self._create_default_config()

    def _create_default_config(self) -> PipelineConfig:
        """Create default configuration"""
        return PipelineConfig(paths=self.default_paths)

    def save_config(self, config: PipelineConfig, path: Optional[Path] = None):
        """Save configuration to file (YAML or JSON by extension)."""
        save_path = path or self.config_path
        save_path.parent.mkdir(parents=True, exist_ok=True)

        config_dict = config.model_dump()

        # Convert Path objects to strings for serialization
        if "paths" in config_dict:
            for key, value in config_dict["paths"].items():
                if isinstance(value, Path):
                    config_dict["paths"][key] = str(value)

        try:
            with open(save_path, "w") as f:
                if save_path.suffix.lower() in [".yaml", ".yml"]:
                    yaml.dump(config_dict, f, default_flow_style=False, indent=2)
                else:
                    json.dump(config_dict, f, indent=2)

            logging.info(f"Configuration saved to {save_path}")

        except Exception as e:
            logging.error(f"Failed to save config to {save_path}: {e}")

    def get_config(self) -> PipelineConfig:
        """Get current configuration, loading if necessary"""
        if self._config is None:
            self._config = self.load_config()
        return self._config

    def update_config(self, updates: Dict[str, Any]):
        """Update configuration with new values (deep merge)."""
        config = self.get_config()

        # Deep update a plain-dict copy, then re-validate through pydantic
        config_dict = config.model_dump()
        self._deep_update(config_dict, updates)

        self._config = PipelineConfig(**config_dict)

    def _deep_update(self, base_dict: Dict, update_dict: Dict):
        """Recursively update nested dictionaries in place."""
        for key, value in update_dict.items():
            if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
                self._deep_update(base_dict[key], value)
            else:
                base_dict[key] = value

    def get_environment_config(self, env: str) -> PipelineConfig:
        """Load environment-specific configuration.

        Looks for ``pipeline.<env>.yaml`` next to the main config file and
        deep-merges it over the base configuration.

        NOTE(review): loading the env file goes through load_config(), which
        repoints self.config_path/_config at the env file as a side effect —
        confirm callers do not rely on the base config staying cached.
        """
        env_config_path = self.config_path.parent / f"pipeline.{env}.yaml"

        if env_config_path.exists():
            base_config = self.load_config()
            env_config = self.load_config(env_config_path)

            # Merge configurations; model_dump() replaces the deprecated
            # pydantic v1 .dict() used previously, for consistency with the
            # rest of this class.
            base_dict = base_config.model_dump()
            env_dict = env_config.model_dump()
            self._deep_update(base_dict, env_dict)

            return PipelineConfig(**base_dict)

        return self.get_config()
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
from dataclasses import field
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class DataConfig(BaseModel):
    """Data handling configuration"""

    # Input CSV filename (resolved against the configured data directory)
    input_file: str = "names.csv"
    # Stage output filenames keyed by role. A plain dict default is the
    # idiomatic pydantic form: pydantic deep-copies mutable defaults per
    # instance, so dataclasses.field(default_factory=...) — which pydantic
    # BaseModel does not interpret — is unnecessary here.
    output_files: Dict[str, str] = {
        "featured": "names_featured.csv",
        "evaluation": "names_evaluation.csv",
        "males": "names_males.csv",
        "females": "names_females.csv",
    }
    # Whether to hold out an evaluation split
    split_evaluation: bool = True
    # Whether to additionally split outputs by gender
    split_by_gender: bool = True
    # Fraction of rows reserved for evaluation
    evaluation_fraction: float = 0.2
    # Seed for reproducible splits
    random_seed: int = 42
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class LLMConfig(BaseModel):
    """LLM annotation configuration"""

    # Model identifier used for annotation requests (e.g. "mistral:7b")
    model_name: str = "mistral:7b"
    # Rate limits applied when enable_rate_limiting is True
    requests_per_minute: int = 60
    requests_per_second: int = 2
    retry_attempts: int = 3  # retries per failed request
    timeout_seconds: int = 30  # per-request timeout
    max_concurrent_requests: int = 2  # concurrency cap for in-flight requests
    # Off by default; config files (e.g. pipeline.yaml) turn this on
    enable_rate_limiting: bool = False
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class LoggingConfig(BaseModel):
    """Logging configuration"""

    # Level name resolved via the logging module (e.g. "INFO", "DEBUG")
    level: str = "INFO"
    # Format string passed to logging.Formatter
    format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    file_logging: bool = True  # attach a rotating file handler
    console_logging: bool = True  # attach a console stream handler
    log_file: str = "pipeline.log"  # filename inside the configured logs dir
    max_log_size: int = 10 * 1024 * 1024  # 10MB rotation threshold
    backup_count: int = 5  # rotated log files to keep
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from core.config.logging_config import LoggingConfig
|
||||||
|
from core.config.data_config import DataConfig
|
||||||
|
from core.config.llm_config import LLMConfig
|
||||||
|
from core.config.processing_config import ProcessingConfig
|
||||||
|
from core.config.project_paths import ProjectPaths
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineConfig(BaseModel):
    """Main pipeline configuration"""

    name: str = "drc_names_pipeline"
    version: str = "1.0.0"
    description: str = "DRC Names NLP Processing Pipeline"

    # Required: the directory layout has no safe universal default
    paths: ProjectPaths
    # Ordered names of pipeline stages to execute
    stages: list[str] = []
    # Sub-configurations; pydantic copies these defaults per instance
    processing: ProcessingConfig = ProcessingConfig()
    llm: LLMConfig = LLMConfig()
    data: DataConfig = DataConfig()
    logging: LoggingConfig = LoggingConfig()

    # Environment-specific settings
    environment: str = "development"
    debug: bool = True

    # NOTE(review): pydantic v1-style inner Config; v2 prefers
    # model_config = ConfigDict(arbitrary_types_allowed=True)
    class Config:
        arbitrary_types_allowed = True
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
from dataclasses import field
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class ProcessingConfig(BaseModel):
    """Data processing pipeline configuration"""

    batch_size: int = 1000  # rows processed per batch
    max_workers: int = 4  # worker count for parallel stages
    checkpoint_interval: int = 5  # batches between checkpoints
    use_multiprocessing: bool = False  # process pool instead of threads
    # Encodings tried in order when reading input files. A plain list
    # default is the idiomatic pydantic form: pydantic deep-copies mutable
    # defaults per instance, so dataclasses.field(default_factory=...) —
    # which pydantic BaseModel does not interpret — is unnecessary here.
    encoding_options: list = ["utf-8", "utf-16", "latin1"]
    chunk_size: int = 100_000  # rows per streamed read chunk
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pydantic import BaseModel, field_validator
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectPaths(BaseModel):
    """Project directory structure configuration"""

    root_dir: Path
    data_dir: Path
    models_dir: Path
    outputs_dir: Path
    logs_dir: Path
    configs_dir: Path
    checkpoints_dir: Path

    class Config:
        arbitrary_types_allowed = True

    # Decorator order fixed: pydantic v2 requires @field_validator to be the
    # outermost decorator (with @classmethod beneath it). The previous order
    # (@classmethod on top) wraps the validator proxy and prevents pydantic
    # from registering it.
    @field_validator("*", mode="before")
    @classmethod
    def convert_to_path(cls, v):
        """Coerce incoming values (e.g. strings from YAML) to Path objects."""
        return Path(v) if not isinstance(v, Path) else v
||||||
@@ -0,0 +1,57 @@
|
|||||||
|
import logging
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from core.config import get_config, PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def temporary_config_override(**overrides):
    """Temporarily patch attributes on the global config object.

    Only keys that already exist as attributes on the config are applied;
    unknown keys are silently ignored. Original values are restored when
    the context exits, even if the body raises.
    """
    config = get_config()
    # Keep only overrides that correspond to real config attributes.
    applicable = {k: v for k, v in overrides.items() if hasattr(config, k)}
    saved = {k: getattr(config, k) for k in applicable}

    for key, value in applicable.items():
        setattr(config, key, value)

    try:
        yield config
    finally:
        # Restore the pre-override values unconditionally.
        for key, value in saved.items():
            setattr(config, key, value)
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_directories(config: PipelineConfig) -> None:
    """Ensure all required directories exist"""
    paths = config.paths
    for directory in (
        paths.data_dir,
        paths.models_dir,
        paths.outputs_dir,
        paths.logs_dir,
        paths.configs_dir,
        paths.checkpoints_dir,
    ):
        # mkdir is idempotent with exist_ok; parents are created as needed.
        Path(directory).mkdir(parents=True, exist_ok=True)

    logging.info("Ensured all required directories exist")
|
||||||
|
|
||||||
|
|
||||||
|
def get_data_file_path(filename: str, config: PipelineConfig) -> Path:
    """Get full path for a data file"""
    data_dir = config.paths.data_dir
    return data_dir / filename
|
||||||
|
|
||||||
|
|
||||||
|
def get_model_file_path(filename: str, config: PipelineConfig) -> Path:
    """Get full path for a model file"""
    models_dir = config.paths.models_dir
    return models_dir / filename
|
||||||
|
|
||||||
|
|
||||||
|
def get_output_file_path(filename: str, config: PipelineConfig) -> Path:
    """Get full path for an output file"""
    outputs_dir = config.paths.outputs_dir
    return outputs_dir / filename
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Union, Iterator
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
|
class DataLoader:
    """Reusable data loading utilities"""

    def __init__(self, config: "PipelineConfig"):
        # Annotation is a string to avoid a hard import-time dependency on
        # the config module.
        self.config = config

    def load_csv_chunked(
        self, filepath: Union[str, Path], chunk_size: Optional[int] = None
    ) -> Iterator[pd.DataFrame]:
        """Load CSV file in chunks for memory efficiency.

        Tries each configured encoding in order. BUGFIX: the original
        implementation fell back to the next encoding even after chunks
        had already been yielded, which re-yielded the same rows from the
        start (duplicates downstream). Once any chunk has reached the
        caller, a mid-stream failure is now re-raised instead.

        Args:
            filepath: CSV file to read.
            chunk_size: Rows per chunk; defaults to config.processing.chunk_size.

        Yields:
            pd.DataFrame chunks of at most ``chunk_size`` rows.

        Raises:
            ValueError: if no configured encoding can decode the file.
        """
        chunk_size = chunk_size or self.config.processing.chunk_size
        encodings = self.config.processing.encoding_options

        filepath = Path(filepath)

        for encoding in encodings:
            yielded_any = False
            try:
                logging.info(f"Attempting to read {filepath} with encoding: {encoding}")

                chunk_iter = pd.read_csv(
                    filepath, encoding=encoding, chunksize=chunk_size, on_bad_lines="skip"
                )

                for i, chunk in enumerate(chunk_iter):
                    logging.debug(f"Processing chunk {i+1}")
                    yielded_any = True
                    yield chunk

                logging.info(f"Successfully read {filepath} with encoding: {encoding}")
                return

            except Exception as e:
                if yielded_any:
                    # Data already reached the caller; retrying another
                    # encoding would duplicate it. Surface the error.
                    raise
                logging.warning(f"Failed with encoding {encoding}: {e}")
                continue

        raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")

    def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
        """Load complete CSV file into memory.

        Returns an empty DataFrame if the file yields no chunks.
        """
        chunks = list(self.load_csv_chunked(filepath))
        return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

    @classmethod
    def save_csv(
        cls, df: pd.DataFrame, filepath: Union[str, Path], create_dirs: bool = True
    ) -> None:
        """Save DataFrame to CSV with proper handling.

        Args:
            df: Frame to write (index is not written).
            filepath: Destination path.
            create_dirs: Create missing parent directories when True.
        """
        filepath = Path(filepath)

        if create_dirs:
            filepath.parent.mkdir(parents=True, exist_ok=True)

        df.to_csv(filepath, index=False, encoding="utf-8")
        logging.info(f"Saved {len(df)} rows to {filepath}")
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
|
class PromptManager:
    """Manage prompts for LLM operations"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.prompts_dir = self.config.paths.configs_dir / "prompts"

    def load_prompt(self, prompt_name: str = "default") -> str:
        """Load a prompt template.

        Looks for <configs_dir>/prompts/<name>.txt first, then falls back
        to a legacy prompt.txt at the project root.

        Raises:
            FileNotFoundError: if neither location has the prompt.
        """
        candidate = self.prompts_dir / f"{prompt_name}.txt"

        if not candidate.exists():
            # Fallback to root directory
            legacy = self.config.paths.root_dir / "prompt.txt"
            if not legacy.exists():
                raise FileNotFoundError(f"Prompt file not found: {candidate}")
            candidate = legacy

        return candidate.read_text(encoding="utf-8").strip()
|
||||||
@@ -0,0 +1,56 @@
|
|||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from queue import Queue
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RateLimitConfig:
    """Configuration for rate limiting LLM requests"""

    # Maximum requests allowed within any rolling 60-second window.
    requests_per_minute: int = 60
    # Maximum sustained request rate; enforced as a minimum interval
    # between consecutive requests.
    requests_per_second: int = 2
    # NOTE(review): not consulted by RateLimiter.wait_if_needed in this
    # file — presumably reserved for future burst handling; confirm.
    burst_limit: int = 5
|
||||||
|
|
||||||
|
|
||||||
|
class RateLimiter:
    """Thread-safe rate limiter for LLM requests"""

    def __init__(self, config: RateLimitConfig):
        self.config = config
        # Timestamps of requests made in the last minute, oldest first.
        self.request_times = Queue()
        # Serializes all accounting (and the sleeps) across threads.
        self.lock = threading.Lock()
        # time.time() of the most recent request; 0 means "no request yet".
        self.last_request_time = 0

    def wait_if_needed(self):
        """Wait if necessary to respect rate limits"""
        # NOTE: sleeping while holding the lock means concurrent callers are
        # queued behind the sleeper — this serializes request pacing globally.
        with self.lock:
            current_time = time.time()

            # Check requests per second limit
            time_since_last = current_time - self.last_request_time
            min_interval = 1.0 / self.config.requests_per_second

            if time_since_last < min_interval:
                sleep_time = min_interval - time_since_last
                time.sleep(sleep_time)
                current_time = time.time()

            # Clean old request times (older than 1 minute)
            # Peeks at the Queue's backing deque (.queue[0]) to see the
            # oldest timestamp without removing it.
            while not self.request_times.empty():
                if current_time - self.request_times.queue[0] > 60:
                    self.request_times.get()
                else:
                    break

            # Check requests per minute limit
            if self.request_times.qsize() >= self.config.requests_per_minute:
                oldest_request = self.request_times.queue[0]
                # Sleep until the oldest tracked request ages out of the
                # 60-second window.
                wait_time = 60 - (current_time - oldest_request)
                if wait_time > 0:
                    time.sleep(wait_time)
                    current_time = time.time()

            # Record this request
            self.request_times.put(current_time)
            self.last_request_time = current_time
|
||||||
@@ -1,23 +1,44 @@
|
|||||||
import csv
|
from typing import Optional, Dict, Tuple
|
||||||
import io
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import pickle
|
|
||||||
from typing import List, Dict
|
|
||||||
|
|
||||||
# Paths
|
import pandas as pd
|
||||||
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
||||||
DATA_DIR = os.path.join(ROOT_DIR, 'dataset')
|
|
||||||
|
|
||||||
MODELS_DIR = os.path.join(ROOT_DIR, 'models')
|
|
||||||
GENDER_MODELS_DIR = os.path.join(MODELS_DIR, 'gender')
|
|
||||||
GENDER_RESULT_DIR = os.path.join(ROOT_DIR, 'gender', 'results')
|
|
||||||
|
|
||||||
NER_MODELS_DIR = os.path.join(MODELS_DIR, 'ner')
|
class RegionMapper:
    """Reusable region mapping utilities"""

    def __init__(self, mapping: Optional[Dict] = None):
        # Fall back to the module-level REGION_MAPPING when no explicit
        # mapping is supplied.
        self.mapping = mapping or REGION_MAPPING

    def map_region_to_province(self, region: str) -> str:
        """Map a region to its province"""
        key = str(region).lower().strip()
        _, province = self.mapping.get(key, ("AUTRES", "AUTRES"))
        return province.lower()

    def map_regions_vectorized(self, regions: pd.Series) -> pd.Series:
        """Vectorized region to province mapping"""
        def _to_province(key):
            return self.mapping.get(key, ("AUTRES", "AUTRES"))[1].lower()

        return regions.str.lower().map(_to_province)

    @staticmethod
    def get_provinces():
        return [
            "kinshasa",
            "bas-congo",
            "bandundu",
            "katanga",
            "equateur",
            "province-orientale",
            "maniema",
            "nord-kivu",
            "sud-kivu",
            "kasai-occidental",
            "kasai-oriental",
        ]
|
||||||
|
|
||||||
|
|
||||||
|
# DRC Region to Province Mapping
|
||||||
|
REGION_MAPPING: Dict[str, Tuple[str, str]] = {
|
||||||
# Kinshasa
|
# Kinshasa
|
||||||
"kinshasa": ("KINSHASA", "KINSHASA"),
|
"kinshasa": ("KINSHASA", "KINSHASA"),
|
||||||
"kinshasa-centre": ("KINSHASA", "KINSHASA"),
|
"kinshasa-centre": ("KINSHASA", "KINSHASA"),
|
||||||
@@ -28,7 +49,6 @@ REGION_MAPPING = {
|
|||||||
"kinshasa-ouest": ("KINSHASA", "KINSHASA"),
|
"kinshasa-ouest": ("KINSHASA", "KINSHASA"),
|
||||||
"kinshasa-plateau": ("KINSHASA", "KINSHASA"),
|
"kinshasa-plateau": ("KINSHASA", "KINSHASA"),
|
||||||
"kinshasa-tshangu": ("KINSHASA", "KINSHASA"),
|
"kinshasa-tshangu": ("KINSHASA", "KINSHASA"),
|
||||||
|
|
||||||
# Bas-Congo → Kongo-Central → BAS-CONGO
|
# Bas-Congo → Kongo-Central → BAS-CONGO
|
||||||
"bas-congo": ("KONGO-CENTRAL", "BAS-CONGO"),
|
"bas-congo": ("KONGO-CENTRAL", "BAS-CONGO"),
|
||||||
"bas-congo-1": ("KONGO-CENTRAL", "BAS-CONGO"),
|
"bas-congo-1": ("KONGO-CENTRAL", "BAS-CONGO"),
|
||||||
@@ -37,7 +57,6 @@ REGION_MAPPING = {
|
|||||||
"kongo-central-1": ("KONGO-CENTRAL", "BAS-CONGO"),
|
"kongo-central-1": ("KONGO-CENTRAL", "BAS-CONGO"),
|
||||||
"kongo-central-2": ("KONGO-CENTRAL", "BAS-CONGO"),
|
"kongo-central-2": ("KONGO-CENTRAL", "BAS-CONGO"),
|
||||||
"kongo-central-3": ("KONGO-CENTRAL", "BAS-CONGO"),
|
"kongo-central-3": ("KONGO-CENTRAL", "BAS-CONGO"),
|
||||||
|
|
||||||
# Kwilu, Kwango, Mai-Ndombe → BANDUNDU
|
# Kwilu, Kwango, Mai-Ndombe → BANDUNDU
|
||||||
"bandundu": ("BANDUNDU", "BANDUNDU"),
|
"bandundu": ("BANDUNDU", "BANDUNDU"),
|
||||||
"bandundu-1": ("BANDUNDU", "BANDUNDU"),
|
"bandundu-1": ("BANDUNDU", "BANDUNDU"),
|
||||||
@@ -54,7 +73,6 @@ REGION_MAPPING = {
|
|||||||
"mai-ndombe-1": ("MAI-NDOMBE", "BANDUNDU"),
|
"mai-ndombe-1": ("MAI-NDOMBE", "BANDUNDU"),
|
||||||
"mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
|
"mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
|
||||||
"mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
|
"mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
|
||||||
|
|
||||||
# Katanga → HAUT-KATANGA, HAUT-LOMAMI, LUALABA, TANGANYIKA
|
# Katanga → HAUT-KATANGA, HAUT-LOMAMI, LUALABA, TANGANYIKA
|
||||||
"haut-katanga": ("HAUT-KATANGA", "KATANGA"),
|
"haut-katanga": ("HAUT-KATANGA", "KATANGA"),
|
||||||
"haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
|
"haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
|
||||||
@@ -69,7 +87,6 @@ REGION_MAPPING = {
|
|||||||
"tanganyika": ("TANGANYIKA", "KATANGA"),
|
"tanganyika": ("TANGANYIKA", "KATANGA"),
|
||||||
"tanganyika-1": ("TANGANYIKA", "KATANGA"),
|
"tanganyika-1": ("TANGANYIKA", "KATANGA"),
|
||||||
"tanganyika-2": ("TANGANYIKA", "KATANGA"),
|
"tanganyika-2": ("TANGANYIKA", "KATANGA"),
|
||||||
|
|
||||||
# Equateur → MONGALA, NORD-UBANGI, SUD-UBANGI, TSHUAPA
|
# Equateur → MONGALA, NORD-UBANGI, SUD-UBANGI, TSHUAPA
|
||||||
"equateur": ("EQUATEUR", "EQUATEUR"),
|
"equateur": ("EQUATEUR", "EQUATEUR"),
|
||||||
"equateur-1": ("EQUATEUR", "EQUATEUR"),
|
"equateur-1": ("EQUATEUR", "EQUATEUR"),
|
||||||
@@ -89,7 +106,6 @@ REGION_MAPPING = {
|
|||||||
"tshuapa": ("TSHUAPA", "EQUATEUR"),
|
"tshuapa": ("TSHUAPA", "EQUATEUR"),
|
||||||
"tshuapa-1": ("TSHUAPA", "EQUATEUR"),
|
"tshuapa-1": ("TSHUAPA", "EQUATEUR"),
|
||||||
"tshuapa-2": ("TSHUAPA", "EQUATEUR"),
|
"tshuapa-2": ("TSHUAPA", "EQUATEUR"),
|
||||||
|
|
||||||
# Province-Orientale
|
# Province-Orientale
|
||||||
"province-orientale": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
|
"province-orientale": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
|
||||||
"province-orientale-1": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
|
"province-orientale-1": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
|
||||||
@@ -100,128 +116,47 @@ REGION_MAPPING = {
|
|||||||
"haut-uele-1": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
|
"haut-uele-1": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
|
||||||
"haut-uele-2": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
|
"haut-uele-2": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
|
||||||
"bas-uele": ("BAS-UELE", "PROVINCE-ORIENTALE"),
|
"bas-uele": ("BAS-UELE", "PROVINCE-ORIENTALE"),
|
||||||
|
"bas-uele-1": ("BAS-UELE", "PROVINCE-ORIENTALE"),
|
||||||
|
"bas-uele-2": ("BAS-UELE", "PROVINCE-ORIENTALE"),
|
||||||
"ituri": ("ITURI", "PROVINCE-ORIENTALE"),
|
"ituri": ("ITURI", "PROVINCE-ORIENTALE"),
|
||||||
"ituri-1": ("ITURI", "PROVINCE-ORIENTALE"),
|
"ituri-1": ("ITURI", "PROVINCE-ORIENTALE"),
|
||||||
"ituri-2": ("ITURI", "PROVINCE-ORIENTALE"),
|
"ituri-2": ("ITURI", "PROVINCE-ORIENTALE"),
|
||||||
"ituri-3": ("ITURI", "PROVINCE-ORIENTALE"),
|
|
||||||
"tshopo": ("TSHOPO", "PROVINCE-ORIENTALE"),
|
"tshopo": ("TSHOPO", "PROVINCE-ORIENTALE"),
|
||||||
"tshopo-1": ("TSHOPO", "PROVINCE-ORIENTALE"),
|
"tshopo-1": ("TSHOPO", "PROVINCE-ORIENTALE"),
|
||||||
"tshopo-2": ("TSHOPO", "PROVINCE-ORIENTALE"),
|
"tshopo-2": ("TSHOPO", "PROVINCE-ORIENTALE"),
|
||||||
|
# Maniema
|
||||||
# Kasaï
|
"maniema": ("MANIEMA", "MANIEMA"),
|
||||||
"kasai-1": ("KASAÏ", "KASAÏ-OCCIDENTAL"),
|
"maniema-1": ("MANIEMA", "MANIEMA"),
|
||||||
"kasai-2": ("KASAÏ", "KASAÏ-OCCIDENTAL"),
|
"maniema-2": ("MANIEMA", "MANIEMA"),
|
||||||
"kasai-ce": ("KASAÏ", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-central": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-central-1": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-central-2": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-occidental": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-occidental-1": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-occidental-2": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-oriental": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
|
|
||||||
"kasai-oriental-1": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
|
|
||||||
"kasai-oriental-2": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
|
|
||||||
"kasai-oriental-3": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
|
|
||||||
"kasai-orientale": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
|
|
||||||
"lomami": ("LOMAMI", "KASAÏ-ORIENTAL"),
|
|
||||||
"lomami-1": ("LOMAMI", "KASAÏ-ORIENTAL"),
|
|
||||||
"lomami-2": ("LOMAMI", "KASAÏ-ORIENTAL"),
|
|
||||||
"sankuru": ("SANKURU", "KASAÏ-ORIENTAL"),
|
|
||||||
"sankuru-1": ("SANKURU", "KASAÏ-ORIENTAL"),
|
|
||||||
"sankuru-2": ("SANKURU", "KASAÏ-ORIENTAL"),
|
|
||||||
|
|
||||||
# Nord-Kivu
|
# Nord-Kivu
|
||||||
"nord-kivu": ("NORD-KIVU", "NORD-KIVU"),
|
"nord-kivu": ("NORD-KIVU", "NORD-KIVU"),
|
||||||
"nord-kivu-1": ("NORD-KIVU", "NORD-KIVU"),
|
"nord-kivu-1": ("NORD-KIVU", "NORD-KIVU"),
|
||||||
"nord-kivu-2": ("NORD-KIVU", "NORD-KIVU"),
|
"nord-kivu-2": ("NORD-KIVU", "NORD-KIVU"),
|
||||||
"nord-kivu-3": ("NORD-KIVU", "NORD-KIVU"),
|
"nord-kivu-3": ("NORD-KIVU", "NORD-KIVU"),
|
||||||
|
|
||||||
# Sud-Kivu
|
# Sud-Kivu
|
||||||
"sud-kivu": ("SUD-KIVU", "SUD-KIVU"),
|
"sud-kivu": ("SUD-KIVU", "SUD-KIVU"),
|
||||||
"sud-kivu-1": ("SUD-KIVU", "SUD-KIVU"),
|
"sud-kivu-1": ("SUD-KIVU", "SUD-KIVU"),
|
||||||
"sud-kivu-2": ("SUD-KIVU", "SUD-KIVU"),
|
"sud-kivu-2": ("SUD-KIVU", "SUD-KIVU"),
|
||||||
"sud-kivu-3": ("SUD-KIVU", "SUD-KIVU"),
|
"sud-kivu-3": ("SUD-KIVU", "SUD-KIVU"),
|
||||||
|
# Kasai-Occidental → KASAI, KASAI-CENTRAL
|
||||||
# Maniema
|
"kasai-occidental": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
|
||||||
"maniema": ("MANIEMA", "MANIEMA"),
|
"kasai-occidental-1": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
|
||||||
"maniema-1": ("MANIEMA", "MANIEMA"),
|
"kasai-occidental-2": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
|
||||||
"maniema-2": ("MANIEMA", "MANIEMA"),
|
"kasai": ("KASAI", "KASAI-OCCIDENTAL"),
|
||||||
|
"kasai-1": ("KASAI", "KASAI-OCCIDENTAL"),
|
||||||
# Divers
|
"kasai-2": ("KASAI", "KASAI-OCCIDENTAL"),
|
||||||
"hors-frontieres": ("AUTRES", "AUTRES"),
|
"kasai-central": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
|
||||||
"lukaya": ("AUTRES", "AUTRES"),
|
"kasai-central-1": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
|
||||||
"recours": ("AUTRES", "AUTRES"),
|
"kasai-central-2": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
|
||||||
"junacyc": ("AUTRES", "AUTRES"),
|
# Kasai-Oriental → LOMAMI, SANKURU
|
||||||
"junacyp": ("AUTRES", "AUTRES"),
|
"kasai-oriental": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
|
||||||
"junacyc-lualaba-corrige": ("LUALABA", "KATANGA"),
|
"kasai-oriental-1": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
|
||||||
"options-techniques-toutes-les-provinces-et-hors-frontieres": ("AUTRES", "AUTRES"),
|
"kasai-oriental-2": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
|
||||||
"region": ("AUTRES", "AUTRES"),
|
"kasai-oriental-3": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
|
||||||
|
"lomami": ("LOMAMI", "KASAI-ORIENTAL"),
|
||||||
|
"lomami-1": ("LOMAMI", "KASAI-ORIENTAL"),
|
||||||
|
"lomami-2": ("LOMAMI", "KASAI-ORIENTAL"),
|
||||||
|
"sankuru": ("SANKURU", "KASAI-ORIENTAL"),
|
||||||
|
"sankuru-1": ("SANKURU", "KASAI-ORIENTAL"),
|
||||||
|
"sankuru-2": ("SANKURU", "KASAI-ORIENTAL"),
|
||||||
}
|
}
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO, format=">> %(message)s")
|
|
||||||
|
|
||||||
def load_json_dataset(path: str) -> list:
    """Read and parse a JSON dataset located under DATA_DIR."""
    logging.info(f"Loading JSON dataset from {path}")
    full_path = os.path.join(DATA_DIR, path)
    with open(full_path, "r", encoding="utf-8") as f:
        return json.load(f)
|
|
||||||
|
|
||||||
|
|
||||||
def save_csv_dataset(data: list, path: str) -> None:
    """Write a list of row dicts to a CSV file under DATA_DIR.

    Field names are taken from the first row's keys.

    Raises:
        ValueError: if *data* is empty (the original code raised an opaque
            IndexError on ``data[0]``).
    """
    logging.info(f"Saving CSV dataset to {path}")
    if not data:
        raise ValueError("Cannot save an empty dataset: no rows to derive a header from")
    # newline="" is required by the csv module to avoid extra blank lines
    # on platforms that translate line endings.
    with open(os.path.join(DATA_DIR, path), "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)
|
|
||||||
|
|
||||||
|
|
||||||
def load_csv_dataset(path: str, limit: int = None, balanced: bool = False) -> List[Dict[str, str]]:
    """Load a CSV dataset from DATA_DIR as a list of row dicts.

    Args:
        path: File path relative to DATA_DIR.
        limit: Optional maximum number of rows to return. In balanced mode
            each sex contributes at most ``limit // 2`` rows.
        balanced: When True, return an equal number of rows for
            ``sex == 'm'`` and ``sex == 'f'`` (rows with any other sex
            value are dropped).
    """
    logging.info(f"Loading CSV dataset from {path}")

    file_path = os.path.join(DATA_DIR, path)
    # errors="replace" plus the NUL strip below tolerates partially
    # corrupted exports; bad bytes become replacement characters.
    with open(file_path, "r", encoding="utf-8", errors="replace", newline="") as f:
        raw_text = f.read().replace('\x00', '')

    reader = csv.DictReader(io.StringIO(raw_text))
    logging.info(f"Detected fieldnames: {reader.fieldnames}")

    if balanced:
        # Bucket rows by sex, then truncate both buckets to the same size
        # so the returned sample is 50/50.
        by_sex = {'m': [], 'f': []}
        for row in reader:
            sex = row.get("sex", "").lower()
            if sex in by_sex:
                by_sex[sex].append(row)
        min_len = min(len(by_sex['m']), len(by_sex['f']))
        if limit:
            min_len = min(min_len, limit // 2)
        data = by_sex['m'][:min_len] + by_sex['f'][:min_len]
    else:
        data = []
        for i, row in enumerate(reader):
            data.append(row)
            if limit and i + 1 >= limit:
                break

    logging.info("Successfully loaded with UTF-8 encoding")
    return data
|
|
||||||
|
|
||||||
|
|
||||||
def save_json_dataset(data: list, path: str) -> None:
    """Serialize *data* as compact JSON into a file under DATA_DIR."""
    logging.info(f"Saving JSON dataset to {path}")
    target = os.path.join(DATA_DIR, path)
    with open(target, "w", encoding="utf-8") as f:
        # Compact separators and raw unicode keep the file small.
        json.dump(data, f, ensure_ascii=False, separators=(',', ':'))
|
|
||||||
|
|
||||||
|
|
||||||
def save_pickle(obj, path):
    """Pickle *obj* to *path*, creating parent directories as needed.

    BUGFIX: the original called os.makedirs(os.path.dirname(path)) even for
    bare filenames, where dirname is "" and makedirs raises FileNotFoundError.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)
|
|
||||||
|
|
||||||
|
|
||||||
def load_pickle(path: str):
    """Unpickle and return the object stored at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(handle)
|
|
||||||
|
|
||||||
|
|
||||||
def load_prompt() -> str:
    """Read and return the LLM prompt template from the project root."""
    prompt_path = os.path.join(ROOT_DIR, 'prompt.txt')
    with open(prompt_path, 'r') as f:
        return f.read()
|
|
||||||
@@ -0,0 +1,41 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
|
class StateManager:
    """Manage pipeline state and checkpoints"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.checkpoints_dir = self.config.paths.checkpoints_dir

    def _state_path(self, state_name: str):
        # All states are stored as <checkpoints_dir>/<name>.json.
        return self.checkpoints_dir / f"{state_name}.json"

    def save_state(self, state: Dict[str, Any], state_name: str) -> None:
        """Save pipeline state as pretty-printed JSON."""
        self.checkpoints_dir.mkdir(parents=True, exist_ok=True)
        state_file = self._state_path(state_name)

        with open(state_file, "w") as f:
            # default=str makes non-JSON values (paths, datetimes) storable.
            json.dump(state, f, indent=2, default=str)

        logging.debug(f"Saved state to {state_file}")

    def load_state(self, state_name: str) -> Dict[str, Any]:
        """Load pipeline state; returns {} when no checkpoint exists."""
        state_file = self._state_path(state_name)
        if not state_file.exists():
            return {}
        with open(state_file, "r") as f:
            return json.load(f)

    def clear_state(self, state_name: str) -> None:
        """Clear pipeline state"""
        state_file = self._state_path(state_name)
        if state_file.exists():
            state_file.unlink()
            logging.info(f"Cleared state: {state_name}")
|
||||||
@@ -0,0 +1,38 @@
|
|||||||
|
from typing import Optional, Dict
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
class TextCleaner:
    """Reusable text cleaning utilities"""

    # Pattern names that are regular expressions (the rest are literal strings).
    _REGEX_PATTERNS = frozenset({"multiple_spaces", "extra_whitespace"})

    def __init__(self, patterns: Optional[Dict[str, str]] = None):
        self.patterns = patterns or {
            "null_bytes": "\x00",
            "non_breaking_spaces": "\u00a0",
            "multiple_spaces": r" +",
            "extra_whitespace": r"\s+",
        }

    def clean_text_series(self, series: pd.Series) -> pd.Series:
        """Clean a pandas Series of text data.

        BUGFIX: the original applied only "multiple_spaces" with regex=True;
        "extra_whitespace" (r"\\s+") was passed regex=False, i.e. replaced the
        literal four characters ``\\s+`` and never matched — so tabs/newlines
        were not collapsed. Regex patterns are now applied as regex.
        """
        cleaned = series.astype(str)

        # Apply cleaning patterns
        for pattern_name, pattern in self.patterns.items():
            is_regex = pattern_name in self._REGEX_PATTERNS
            cleaned = cleaned.str.replace(pattern, " ", regex=is_regex)

        return cleaned.str.strip().str.lower()

    def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean all text (object-dtype) columns in a DataFrame; returns a copy."""
        df = df.copy()
        text_columns = df.select_dtypes(include="object").columns

        for col in text_columns:
            df[col] = self.clean_text_series(df[col])

        return df
|
||||||
@@ -0,0 +1,154 @@
|
|||||||
|
#!.venv/bin/python3
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from core.utils.data_loader import DataLoader
|
||||||
|
from core.config import ConfigManager, setup_logging
|
||||||
|
from core.utils import ensure_directories, get_data_file_path
|
||||||
|
|
||||||
|
from processing.pipeline import Pipeline
|
||||||
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
from processing.steps.data_splitting_step import DataSplittingStep
|
||||||
|
from processing.steps.llm_annotation_step import LLMAnnotationStep
|
||||||
|
from processing.steps.feature_extraction_step import FeatureExtractionStep
|
||||||
|
from processing.steps.data_cleaning_step import DataCleaningStep
|
||||||
|
|
||||||
|
|
||||||
|
def create_pipeline_from_config(config_path: Optional[Path] = None) -> Pipeline:
    """Create pipeline from configuration file.

    Args:
        config_path: Optional path to a config file; when None, ConfigManager
            presumably falls back to a default location — confirm in
            ConfigManager.

    Returns:
        A Pipeline containing only the steps whose names appear in
        ``config.stages``, added in the order ``config.stages`` lists them.
    """
    config = ConfigManager(config_path).load_config()

    # Setup logging
    setup_logging(config)
    ensure_directories(config)
    batch_config = BatchConfig(
        batch_size=config.processing.batch_size,
        max_workers=config.processing.max_workers,
        checkpoint_interval=config.processing.checkpoint_interval,
        use_multiprocessing=config.processing.use_multiprocessing,
    )

    # Add steps based on configuration
    pipeline = Pipeline(batch_config)
    steps = [
        DataCleaningStep(config),
        FeatureExtractionStep(config),
        LLMAnnotationStep(config),
        DataSplittingStep(config),
    ]

    # Match each configured stage name against the available steps; a stage
    # name that matches no step is silently skipped.
    for stage in config.stages:
        for step in steps:
            if step.name == stage:
                pipeline.add_step(step)

    return pipeline
|
||||||
|
|
||||||
|
|
||||||
|
def run_pipeline(config_path: Optional[Path] = None, resume: bool = False) -> int:
    """Run the complete pipeline.

    Args:
        config_path: Optional path to a configuration file.
        resume: Accepted for CLI compatibility; not consulted here —
            resumption is presumably handled by the steps' own checkpoints.

    Returns:
        Process exit code: 0 on success, 1 on any failure.
    """
    try:
        config = ConfigManager(config_path).load_config()

        logging.info(f"Starting pipeline: {config.name} v{config.version}")
        logging.info(f"Environment: {config.environment}")

        # Load input data
        input_file_path = get_data_file_path(config.data.input_file, config)

        if not input_file_path.exists():
            logging.error(f"Input file not found: {input_file_path}")
            return 1

        data_loader = DataLoader(config)
        logging.info(f"Loading data from {input_file_path}")
        df = data_loader.load_csv_complete(input_file_path)
        logging.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")

        # Create and run pipeline
        pipeline = create_pipeline_from_config(config_path)

        logging.info("Starting pipeline execution")
        result_df = pipeline.run(df)

        # Save results using the splitting step.
        # BUGFIX: guard against an empty step list (no configured stages)
        # before indexing pipeline.steps[-1].
        if pipeline.steps:
            splitting_step = pipeline.steps[-1]
            if isinstance(splitting_step, DataSplittingStep):
                splitting_step.save_splits(result_df)

        # Show completion statistics
        progress = pipeline.get_progress()
        logging.info("=== Pipeline Completion Summary ===")
        for step_name, stats in progress.items():
            logging.info(
                f"{step_name}: {stats['completion_percentage']:.1f}% "
                f"({stats['processed_batches']}/{stats['total_batches']} batches)"
            )
            if stats["failed_batches"] > 0:
                logging.warning(f"  {stats['failed_batches']} failed batches")

        logging.info("Pipeline completed successfully")
        return 0

    except Exception as e:
        logging.error(f"Pipeline failed: {e}", exc_info=True)
        return 1
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Main entry point with minimal command-line interface.

    Parses CLI arguments, resolves the config path (--config wins over
    --env), optionally validates the config and exits, otherwise runs
    the pipeline and returns its exit code.
    """
    parser = argparse.ArgumentParser(
        description="DRC Names Processing Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Configuration File Examples:
config/pipeline.yaml - Main configuration
config/pipeline.development.yaml - Development environment
config/pipeline.production.yaml - Production environment

Usage Examples:
python processing/main.py # Use default config
python processing/main.py --config config/pipeline.yaml # Use specific config
python processing/main.py --env development # Use environment config
python processing/main.py --resume # Resume from checkpoints
""",
    )

    parser.add_argument("--config", type=Path, help="Path to configuration file")
    parser.add_argument(
        "--env", type=str, help="Environment name (loads config/pipeline.{env}.yaml)"
    )
    parser.add_argument(
        "--resume", action="store_true", help="Resume pipeline from existing checkpoints"
    )
    parser.add_argument(
        "--validate-config", action="store_true", help="Validate configuration file and exit"
    )
    args = parser.parse_args()

    # Determine config path: an explicit --config takes precedence over --env.
    config_path = None
    if args.config:
        config_path = args.config
    elif args.env:
        config_path = Path("config") / f"pipeline.{args.env}.yaml"

    # Validation-only mode: load the config and report, without running.
    if args.validate_config:
        try:
            config = ConfigManager(config_path).load_config()
            print(f"Configuration is valid: {config.name} v{config.version}")
            return 0
        except Exception as e:
            print(f"Configuration validation failed: {e}")
            return 1

    # Run pipeline
    return run_pipeline(config_path, args.resume)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())
|
||||||
Executable
+157
@@ -0,0 +1,157 @@
|
|||||||
|
#!.venv/bin/python3
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from core.config.config_manager import ConfigManager
|
||||||
|
from processing.monitoring.pipeline_monitor import PipelineMonitor
|
||||||
|
from processing.monitoring.data_analyzer import DatasetAnalyzer
|
||||||
|
|
||||||
|
|
||||||
|
def _confirm(prompt: str) -> bool:
    """Ask a yes/no question on stdin; True only on an explicit 'y'."""
    return input(prompt).lower() == "y"


def _cmd_status(monitor, args) -> int:
    """Print the current pipeline status."""
    monitor.print_status(detailed=args.detailed)
    return 0


def _cmd_clean(monitor, args) -> int:
    """Clean checkpoint files for one step, or for every step when --step is omitted."""
    checkpoint_info = monitor.count_checkpoint_files()
    print(f"Current checkpoint storage: {checkpoint_info['total_size_mb']:.1f} MB")

    if not args.force and not _confirm("Are you sure you want to clean checkpoints? (y/N): "):
        print("Cancelled")
        return 0

    if args.step:
        monitor.clean_step_checkpoints(args.step, args.keep_last)
    else:
        for step in monitor.steps:
            monitor.clean_step_checkpoints(step, args.keep_last)

    print("Checkpoint cleaning completed")
    return 0


def _cmd_reset(monitor, args) -> int:
    """Reset a single pipeline step (deletes all of its checkpoints) after confirmation."""
    if not args.force and not _confirm(
        f"Are you sure you want to reset {args.step}? This will delete all checkpoints. (y/N): "
    ):
        print("Cancelled")
        return 0

    monitor.reset_step(args.step)
    print(f"Reset completed for {args.step}")
    return 0


def _cmd_analyze(monitor, args) -> int:
    """Analyze a dataset file and print completion and quality statistics."""
    # Use configured data directory instead of hardcoded DATA_DIR
    data_dir = ConfigManager().default_paths.data_dir
    filepath = data_dir / args.file

    if not filepath.exists():
        print(f"File not found: {filepath}")
        return 1

    analyzer = DatasetAnalyzer(str(filepath))
    if not analyzer.load_data():
        return 1

    completion_stats = analyzer.analyze_completion()
    quality_stats = analyzer.analyze_quality()

    print(f"\n=== Dataset Analysis: {args.file} ===")
    print(f"Total rows: {completion_stats['total_rows']:,}")
    print(
        f"Annotated: {completion_stats['annotated_rows']:,} ({completion_stats['annotation_percentage']:.1f}%)"
    )
    print(f"Unannotated: {completion_stats['unannotated_rows']:,}")
    print(
        f"Complete names: {completion_stats['complete_names']:,} ({completion_stats['completeness_percentage']:.1f}%)"
    )

    if "name_length" in quality_stats:
        length_stats = quality_stats["name_length"]
        print("\nName length statistics:")
        print(f"  Average: {length_stats['mean']:.1f} characters")
        print(f"  Range: {length_stats['min']}-{length_stats['max']} characters")

    if "word_distribution" in quality_stats:
        print("\nWord count distribution:")
        for words, count in quality_stats["word_distribution"].items():
            print(f"  {words} words: {count:,} names")
    return 0


def _cmd_info(monitor, args) -> int:
    """Print per-step checkpoint file counts and sizes plus the total storage used."""
    checkpoint_info = monitor.count_checkpoint_files()

    print("\n=== Checkpoint Information ===")
    print(f"Total storage: {checkpoint_info['total_size_mb']:.1f} MB")
    print()

    for step in monitor.steps:
        step_info = checkpoint_info[step]
        print(f"{step.replace('_', ' ').title()}:")
        print(f"  Files: {step_info['files']}")
        print(f"  Size: {step_info['size_mb']:.1f} MB")
        print()
    return 0


def main():
    """CLI entry point: parse the sub-command and dispatch to its handler.

    Returns a process exit code (0 on success, 1 on error or no command).
    """
    parser = argparse.ArgumentParser(
        description="Monitor and manage the DRC names processing pipeline"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    status_parser = subparsers.add_parser("status", help="Show pipeline status")
    status_parser.add_argument(
        "--detailed",
        action="store_true",
        help="Show detailed information including failed batch IDs",
    )

    clean_parser = subparsers.add_parser("clean", help="Clean checkpoint files")
    clean_parser.add_argument(
        "--step",
        type=str,
        choices=["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"],
        help="Clean specific step (default: all)",
    )
    clean_parser.add_argument(
        "--keep-last", type=int, default=1, help="Number of recent checkpoints to keep (default: 1)"
    )
    clean_parser.add_argument("--force", action="store_true", help="Clean without confirmation")

    reset_parser = subparsers.add_parser("reset", help="Reset pipeline step")
    reset_parser.add_argument(
        "step",
        type=str,
        choices=["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"],
        help="Step to reset",
    )
    reset_parser.add_argument("--force", action="store_true", help="Reset without confirmation")

    analyze_parser = subparsers.add_parser("analyze", help="Analyze dataset")
    analyze_parser.add_argument(
        "--file",
        type=str,
        default="names_featured.csv",
        help="Dataset file to analyze (default: names_featured.csv)",
    )

    # "info" takes no options; the sub-parser object itself is not needed
    # (the original bound it to an unused local).
    subparsers.add_parser("info", help="Show checkpoint information")

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return 1

    monitor = PipelineMonitor()

    handlers = {
        "status": _cmd_status,
        "clean": _cmd_clean,
        "reset": _cmd_reset,
        "analyze": _cmd_analyze,
        "info": _cmd_info,
    }
    return handlers[args.command](monitor, args)


if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
||||||
@@ -1,115 +0,0 @@
|
|||||||
import argparse
|
|
||||||
import os
|
|
||||||
|
|
||||||
import tensorflow as tf
|
|
||||||
from sklearn.metrics import (
|
|
||||||
accuracy_score, precision_recall_fscore_support, confusion_matrix
|
|
||||||
)
|
|
||||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|
||||||
|
|
||||||
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_json_dataset, load_pickle, GENDER_RESULT_DIR
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate_logreg(df, threshold):
    """
    Evaluate the pre-trained logistic-regression gender model on *df*.

    Loads the pickled pipeline and label encoder, scores the "name" column,
    and thresholds the positive-class probability. Returns the true labels,
    predicted labels, positive-class probabilities, and the encoder classes.
    """
    classifier = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))

    names = df["name"].tolist()
    labels = label_encoder.transform(df["sex"])
    positive_proba = classifier.predict_proba(names)[:, 1]
    predictions = (positive_proba >= threshold).astype(int)
    return labels, predictions, positive_proba, label_encoder.classes_
|
||||||
|
|
||||||
|
|
||||||
def evaluate_lstm(df, threshold, max_len=6):
    """
    Evaluate the pre-trained BiLSTM gender model on *df*.

    Tokenizes and pads the "name" column, predicts class probabilities, and
    thresholds the positive class. Returns the true labels, predicted
    labels, positive-class probabilities, and the encoder classes.
    """
    network = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
    name_tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl"))

    padded = pad_sequences(
        name_tokenizer.texts_to_sequences(df["name"]), maxlen=max_len, padding="post"
    )
    labels = label_encoder.transform(df["sex"])
    positive_proba = network.predict(padded)[:, 1]
    predictions = (positive_proba >= threshold).astype(int)
    return labels, predictions, positive_proba, label_encoder.classes_
|
||||||
|
|
||||||
|
|
||||||
def evaluate_transformer(df, threshold, max_len=6):
    """
    Evaluate the pre-trained Transformer gender model on *df*.

    Loads the saved model, tokenizer, and label encoder, converts the "name"
    column to padded sequences, encodes the "sex" column, predicts class
    probabilities, and thresholds the positive class. Returns the true
    labels, predicted labels, positive-class probabilities, and the encoder
    classes.
    """
    network = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
    name_tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl"))

    padded = pad_sequences(
        name_tokenizer.texts_to_sequences(df["name"]), maxlen=max_len, padding="post"
    )
    labels = label_encoder.transform(df["sex"])
    positive_proba = network.predict(padded)[:, 1]
    predictions = (positive_proba >= threshold).astype(int)
    return labels, predictions, positive_proba, label_encoder.classes_
|
||||||
|
|
||||||
|
|
||||||
def compute_metrics(y_true, y_pred, y_proba, class_names):
    """
    Compute accuracy, binary precision/recall/F1, and the confusion matrix
    for the given true/predicted labels, packaged as a JSON-friendly dict.
    """
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "confusion_matrix": {
            # class_names is assumed to be array-like (sklearn encoder.classes_)
            "labels": class_names.tolist(),
            "matrix": confusion_matrix(y_true, y_pred).tolist(),
        },
    }
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Evaluate the selected gender model on a CSV dataset and save the metrics as JSON.

    Command-line driven: --model selects the evaluator, --dataset/--size/--balanced
    control data loading, --threshold sets the classification cut-off.
    """
    parser = argparse.ArgumentParser(description="Evaluate gender prediction model")
    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
    parser.add_argument("--dataset", default="names_evaluation.csv", help="Path to the dataset CSV file")
    parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
    parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
    args = parser.parse_args()

    df = load_csv_dataset(args.dataset, args.size, args.balanced)

    model_funcs = {
        "logreg": evaluate_logreg,
        "lstm": evaluate_lstm,
        "transformer": evaluate_transformer,
    }
    # argparse `choices` guarantees args.model is a valid key. The previous
    # try/except KeyError around the call also caught KeyErrors raised
    # *inside* the evaluator (e.g. a missing dataframe column) and mislabeled
    # them as an unknown model, so the lookup is now done directly.
    evaluate = model_funcs[args.model]
    y_true, y_pred, y_proba, classes = evaluate(df, args.threshold)

    results = compute_metrics(y_true, y_pred, y_proba, classes)
    save_json_dataset(results, os.path.join(GENDER_RESULT_DIR, f'{args.model}_eval'))


if __name__ == "__main__":
    main()
|
||||||
@@ -1,80 +0,0 @@
|
|||||||
import argparse
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from sklearn.metrics import (
|
|
||||||
accuracy_score, precision_recall_fscore_support,
|
|
||||||
classification_report, confusion_matrix
|
|
||||||
)
|
|
||||||
|
|
||||||
from misc import logging
|
|
||||||
|
|
||||||
def evaluate_proba(y_true, y_proba, threshold, class_names):
    """
    Threshold the positive-class probabilities in *y_proba* and report
    accuracy, precision/recall/F1, the confusion matrix, and a per-class
    classification report.
    """
    y_pred = (y_proba[:, 1] >= threshold).astype(int)
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")

    logging.info(f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class BaseConfig:
    """
    Common command-line configuration shared by the gender models.

    Bundles dataset selection (path, optional row limit, class balancing)
    with training options (classification threshold, optional
    cross-validation folds, epochs, test split, random seed) and whether
    trained artifacts should be persisted.
    """
    # Dataset options
    dataset_path: str = "names_featured.csv"  # CSV file to load
    size: Optional[int] = None                # optional cap on rows loaded
    threshold: float = 0.5                    # probability cut-off for the positive class
    cv: Optional[int] = None                  # CV folds; None means a plain train/test split
    save: bool = False                        # persist model artifacts after training
    balanced: bool = False                    # load a class-balanced sample

    # Training options
    epochs: int = 10
    test_size: float = 0.2
    random_state: int = 42
|
||||||
|
|
||||||
|
|
||||||
def load_config(description: str) -> BaseConfig:
    """
    Parse the shared command-line options and return them as a ``BaseConfig``.

    Options cover the dataset path, row limit, class balancing, the
    classification threshold, cross-validation folds, artifact saving, and
    generic training parameters (epochs, test split, random seed).
    """
    # BUG FIX: ArgumentParser's first positional parameter is `prog`, not
    # `description`, so passing the description positionally replaced the
    # program name and left the help description empty. It must be passed
    # by keyword.
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument("--dataset", type=str, default="names_featured.csv", help="Path to the dataset file")
    parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
    parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
    parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
    parser.add_argument("--save", action="store_true", help="Save the model and artifacts after training")

    parser.add_argument("--epochs", type=int, default=10, help="Number of epochs for training")
    parser.add_argument("--test_size", type=float, default=0.2, help="Proportion of the dataset to include in the test split")
    parser.add_argument("--random_state", type=int, default=42, help="Random seed for reproducibility")

    args = parser.parse_args()

    return BaseConfig(
        dataset_path=args.dataset,
        size=args.size,
        threshold=args.threshold,
        cv=args.cv,
        save=args.save,
        balanced=args.balanced,
        epochs=args.epochs,
        test_size=args.test_size,
        random_state=args.random_state,
    )
|
||||||
@@ -1,123 +0,0 @@
|
|||||||
import os
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Tuple
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
|
||||||
from sklearn.linear_model import LogisticRegression
|
|
||||||
from sklearn.metrics import (
|
|
||||||
accuracy_score, classification_report, confusion_matrix,
|
|
||||||
precision_recall_fscore_support
|
|
||||||
)
|
|
||||||
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
|
|
||||||
from sklearn.pipeline import make_pipeline, Pipeline
|
|
||||||
from sklearn.preprocessing import LabelEncoder
|
|
||||||
|
|
||||||
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
|
|
||||||
from pipeline.gender.models import BaseConfig, load_config, logging
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class Config(BaseConfig):
    """Logistic-regression options: character n-gram span and solver iteration cap."""
    ngram_range: Tuple[int, int] = (2, 5)  # char n-gram range for CountVectorizer
    max_iter: int = 1000                   # LogisticRegression max_iter
|
||||||
|
|
||||||
|
|
||||||
def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
    """
    Fit a ``LabelEncoder`` on *y* and return the numerically encoded labels
    together with the fitted encoder (needed later to decode predictions).
    """
    logging.info("Encoding labels")
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(y), label_encoder
|
||||||
|
|
||||||
|
|
||||||
def build_model(cfg: Config) -> Pipeline:
    """
    Build the classification pipeline: a character-level ``CountVectorizer``
    feeding a ``LogisticRegression``, both parameterized from *cfg*
    (``ngram_range`` and ``max_iter``).
    """
    vectorizer = CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range)
    classifier = LogisticRegression(max_iter=cfg.max_iter)
    return make_pipeline(vectorizer, classifier)
|
||||||
|
|
||||||
|
|
||||||
def evaluate_proba(y_true, y_proba, threshold: float, class_names):
    """
    Threshold the positive-class probabilities and report accuracy,
    precision/recall/F1, the confusion matrix, and a per-class
    classification report.

    Logs the scalar metrics and prints the matrix and report to stdout.
    """
    logging.info(f"Evaluating at threshold = {threshold}")
    y_pred = (y_proba[:, 1] >= threshold).astype(int)

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

    logging.info(f"Accuracy: {accuracy:.4f}")
    logging.info(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
|
||||||
|
|
||||||
|
|
||||||
def cross_validate(cfg: Config, X, y) -> None:
    """
    Run stratified k-fold cross-validation (``cfg.cv`` folds) on a freshly
    built pipeline and log the per-fold accuracies plus their mean and
    standard deviation.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    scores = cross_val_score(
        build_model(cfg), X, y, cv=StratifiedKFold(n_splits=cfg.cv), scoring="accuracy"
    )
    logging.info(f"Cross-validation scores: {scores}")
    logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")
|
||||||
|
|
||||||
|
|
||||||
def save_artifacts(model, encoder):
    """
    Persist the trained pipeline and label encoder to ``GENDER_MODELS_DIR``.
    """
    # Ensure the target directory exists before writing — the LSTM and
    # transformer save_artifacts implementations already do this, and
    # without it the first save on a clean checkout fails.
    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    save_pickle(model, os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))

    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Train and evaluate the logistic-regression gender model from CLI options."""
    cfg = Config(**vars(load_config("logistic regression model")))

    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
    names, sexes = df["name"], df["sex"]
    y_encoded, encoder = encode_labels(sexes)

    # Cross-validation mode trains k models and skips the hold-out evaluation.
    if cfg.cv:
        cross_validate(cfg, names, y_encoded)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        names, y_encoded, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y_encoded
    )

    model = build_model(cfg)
    model.fit(X_train, y_train)

    evaluate_proba(y_test, model.predict_proba(X_test), cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, encoder)


if __name__ == "__main__":
    main()
|
||||||
@@ -1,144 +0,0 @@
|
|||||||
import os
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Tuple
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
from sklearn.metrics import (
|
|
||||||
accuracy_score
|
|
||||||
)
|
|
||||||
from sklearn.model_selection import train_test_split, StratifiedKFold
|
|
||||||
from sklearn.preprocessing import LabelEncoder
|
|
||||||
from tensorflow.keras.callbacks import ProgbarLogger
|
|
||||||
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
|
|
||||||
from tensorflow.keras.models import Sequential
|
|
||||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|
||||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
|
||||||
|
|
||||||
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
|
|
||||||
from pipeline.gender.models import load_config, BaseConfig, evaluate_proba, logging
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class Config(BaseConfig):
    """BiLSTM options: sequence length, embedding width, LSTM units, batch size."""
    max_len: int = 6         # padded sequence length
    embedding_dim: int = 64  # embedding vector size
    lstm_units: int = 32     # units per LSTM direction
    batch_size: int = 64
|
||||||
|
|
||||||
|
|
||||||
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Load the dataset and turn it into model-ready arrays.

    Fits a word-level tokenizer on the "name" column, pads the resulting
    sequences to ``cfg.max_len``, and label-encodes the "sex" column.
    Returns the padded sequences, encoded labels, tokenizer, and encoder.
    """
    logging.info("Loading and preprocessing data")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))

    tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])
    padded = pad_sequences(
        tokenizer.texts_to_sequences(df["name"]), maxlen=cfg.max_len, padding="post"
    )

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df["sex"])

    return padded, labels, tokenizer, label_encoder
|
||||||
|
|
||||||
|
|
||||||
def build_model(cfg: Config, vocab_size: int) -> Sequential:
    """
    Build and compile the BiLSTM classifier: an embedding layer, two
    bidirectional LSTM layers, a ReLU hidden layer, and a two-way softmax
    output, compiled with sparse categorical crossentropy and Adam.
    """
    logging.info("Building LSTM model")
    layers = [
        Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim),
        Bidirectional(LSTM(cfg.lstm_units, return_sequences=True)),
        Bidirectional(LSTM(cfg.lstm_units)),
        Dense(64, activation="relu"),
        Dense(2, activation="softmax"),
    ]
    model = Sequential(layers)
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model
|
||||||
|
|
||||||
|
|
||||||
def cross_validate(cfg: Config, X, y, vocab_size: int):
    """
    Stratified k-fold cross-validation: trains a fresh model per fold,
    evaluates it on that fold's validation split, and logs each fold's
    accuracy plus the overall mean and standard deviation.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    splitter = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
    accuracies = []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(X, y)):
        logging.info(f"Fold {fold + 1}")
        model = build_model(cfg, vocab_size)
        model.fit(
            X[train_idx],
            y[train_idx],
            epochs=cfg.epochs,
            batch_size=cfg.batch_size,
            verbose=0,
        )
        fold_acc = accuracy_score(y[val_idx], model.predict(X[val_idx]).argmax(axis=1))
        accuracies.append(fold_acc)
        logging.info(f"Fold {fold + 1} Accuracy: {fold_acc:.4f}")

    logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
|
||||||
|
|
||||||
|
|
||||||
def save_artifacts(model, tokenizer, encoder):
    """
    Persist the trained LSTM model, tokenizer, and label encoder to
    ``GENDER_MODELS_DIR``, creating the directory first if it is missing,
    and log where the artifacts were written.
    """
    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    model.save(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))

    save_pickle(tokenizer, os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
    save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl"))

    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Train and evaluate the BiLSTM gender model from CLI options."""
    cfg = Config(**vars(load_config("Long Short-Term Memory (LSTM) model")))

    X, y, tokenizer, encoder = load_and_prepare(cfg)
    vocab_size = len(tokenizer.word_index) + 1

    # Cross-validation mode trains k models and skips the hold-out evaluation.
    if cfg.cv:
        cross_validate(cfg, X, y, vocab_size)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
    )

    model = build_model(cfg, vocab_size)
    model.summary()

    logging.info("Training model")
    model.fit(
        X_train,
        y_train,
        validation_split=0.1,
        epochs=cfg.epochs,
        batch_size=cfg.batch_size,
        callbacks=[ProgbarLogger()],
    )

    evaluate_proba(y_test, model.predict(X_test), cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, tokenizer, encoder)


if __name__ == "__main__":
    main()
|
||||||
@@ -1,173 +0,0 @@
|
|||||||
import os
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Tuple
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import tensorflow as tf
|
|
||||||
from sklearn.metrics import (
|
|
||||||
accuracy_score
|
|
||||||
)
|
|
||||||
from sklearn.model_selection import train_test_split, StratifiedKFold
|
|
||||||
from sklearn.preprocessing import LabelEncoder
|
|
||||||
from tensorflow.keras.callbacks import ProgbarLogger
|
|
||||||
from tensorflow.keras.layers import (
|
|
||||||
Input, Embedding, Dense, GlobalAveragePooling1D,
|
|
||||||
MultiHeadAttention, Dropout, LayerNormalization
|
|
||||||
)
|
|
||||||
from tensorflow.keras.models import Model
|
|
||||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|
||||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
|
||||||
|
|
||||||
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
|
|
||||||
from pipeline.gender.models import BaseConfig, load_config, evaluate_proba, logging
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class Config(BaseConfig):
    """Transformer options: sequence/embedding sizes and encoder hyper-parameters."""
    max_len: int = 6                 # padded sequence length
    embedding_dim: int = 64          # token/position embedding width
    transformer_head_size: int = 64  # key dimension per attention head
    transformer_num_heads: int = 2
    transformer_ff_dim: int = 128    # feed-forward hidden width
    dropout: float = 0.1
    batch_size: int = 64
|
||||||
|
|
||||||
|
|
||||||
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Load the dataset and produce model-ready arrays: tokenized and padded
    "name" sequences plus label-encoded "sex" values. Returns the padded
    sequences, labels, tokenizer, and label encoder.
    """
    logging.info("Loading and preprocessing data")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))

    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])

    padded = pad_sequences(
        tokenizer.texts_to_sequences(df["name"]), maxlen=cfg.max_len, padding="post"
    )

    encoder = LabelEncoder()
    labels = encoder.fit_transform(df["sex"])
    return padded, labels, tokenizer, encoder
|
||||||
|
|
||||||
|
|
||||||
def transformer_encoder(x, cfg: Config):
    """
    One Transformer encoder block: multi-head self-attention followed by a
    two-layer feed-forward network, each with dropout, a residual
    connection, and layer normalization.
    """
    attention = MultiHeadAttention(
        num_heads=cfg.transformer_num_heads, key_dim=cfg.transformer_head_size
    )(x, x)
    x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attention))

    hidden = Dense(cfg.transformer_ff_dim, activation="relu")(x)
    # Project back to the input width so the residual addition is valid.
    projected = Dense(x.shape[-1])(hidden)
    return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(projected))
|
||||||
|
|
||||||
|
|
||||||
def build_model(cfg: Config, vocab_size: int) -> Model:
    """
    Build and compile the Transformer classifier: token embedding plus a
    learned positional embedding, one encoder block, global average
    pooling, a ReLU hidden layer, and a two-way softmax output.
    """
    logging.info("Building Transformer model")
    inputs = Input(shape=(cfg.max_len,))
    tokens = Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim)(inputs)

    # Add positional encoding
    positions = tf.range(start=0, limit=cfg.max_len, delta=1)
    position_embedding = Embedding(input_dim=cfg.max_len, output_dim=cfg.embedding_dim)(positions)
    x = tokens + position_embedding

    x = transformer_encoder(x, cfg)
    x = GlobalAveragePooling1D()(x)
    x = Dense(32, activation="relu")(x)
    outputs = Dense(2, activation="softmax")(x)

    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model
|
||||||
|
|
||||||
|
|
||||||
def cross_validate(cfg: Config, X, y, vocab_size: int):
    """
    Run stratified k-fold cross-validation on the prepared dataset.

    One fresh model is trained per fold; validation accuracy is logged per
    fold and the mean ± std across folds is logged at the end.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    splitter = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
    fold_scores = []

    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(X, y), start=1):
        logging.info(f"Fold {fold_no}")
        model = build_model(cfg, vocab_size)
        model.fit(
            X[train_idx], y[train_idx],
            epochs=cfg.epochs,
            batch_size=cfg.batch_size,
            verbose=0,
        )
        fold_proba = model.predict(X[val_idx])
        # argmax over the softmax columns gives the hard class prediction.
        score = accuracy_score(y[val_idx], fold_proba.argmax(axis=1))
        fold_scores.append(score)
        logging.info(f"Fold {fold_no} Accuracy: {score:.4f}")

    logging.info(f"Mean accuracy: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
|
|
||||||
|
|
||||||
|
|
||||||
def save_artifacts(model, tokenizer, encoder):
    """
    Persist the trained model plus its tokenizer and label encoder.

    The Keras model is saved as ``transformer.keras``; the tokenizer and
    label encoder are pickled alongside it. The target directory is created
    on demand.
    """
    target_dir = GENDER_MODELS_DIR
    os.makedirs(target_dir, exist_ok=True)

    def artifact_path(filename):
        # All artifacts live side by side in the models directory.
        return os.path.join(target_dir, filename)

    model.save(artifact_path("transformer.keras"))
    save_pickle(tokenizer, artifact_path("transformer_tokenizer.pkl"))
    save_pickle(encoder, artifact_path("transformer_label_encoder.pkl"))

    logging.info(f"Model and artifacts saved to {target_dir}")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Train (or cross-validate) the Transformer classifier end to end."""
    # load_config parses CLI/file options; re-wrap in the local Config
    # dataclass for typed attribute access.
    cfg = Config(**vars(load_config("Transformer model")))

    X, y, tokenizer, encoder = load_and_prepare(cfg)
    # +1 because Keras Tokenizer word indices start at 1; index 0 is padding.
    vocab_size = len(tokenizer.word_index) + 1

    # Cross-validation mode only evaluates; no artifacts are saved.
    if cfg.cv:
        cross_validate(cfg, X, y, vocab_size)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
    )

    model = build_model(cfg, vocab_size)
    model.summary()

    logging.info("Training Transformer model")
    model.fit(
        X_train, y_train,
        validation_split=0.1,
        epochs=cfg.epochs,
        batch_size=cfg.batch_size,
        callbacks=[ProgbarLogger()]
    )

    # Threshold-based evaluation on the held-out split.
    y_proba = model.predict(X_test)
    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, tokenizer, encoder)


if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,107 +0,0 @@
|
|||||||
import argparse
|
|
||||||
import os
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
import tensorflow as tf
|
|
||||||
from sklearn.pipeline import Pipeline
|
|
||||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|
||||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
|
||||||
|
|
||||||
from misc import GENDER_MODELS_DIR, load_pickle
|
|
||||||
|
|
||||||
|
|
||||||
def predict_logreg(names: List[str], threshold: float):
    """
    Classify names with the pickled logistic-regression pipeline.

    Each name is lower-cased and stripped, then scored; the probability in
    column 1 is compared against *threshold* to produce hard labels which
    are decoded via the saved label encoder. Returns ``(labels, proba)``.
    """
    clf: Pipeline = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))

    normalized = [raw.lower().strip() for raw in names]
    proba = clf.predict_proba(normalized)
    # Column 1 holds the probability of the encoder's second class.
    hard_preds = (proba[:, 1] >= threshold).astype(int)
    return label_encoder.inverse_transform(hard_preds), proba
|
|
||||||
|
|
||||||
|
|
||||||
def predict_lstm(names: List[str], threshold: float, max_len=6):
    """
    Classify names with the pre-trained BiLSTM Keras model.

    Names are lower-cased and stripped, converted to sequences by the saved
    tokenizer, post-padded to *max_len*, then scored; the probability in
    column 1 is thresholded into hard labels. Returns ``(labels, proba)``.
    """
    model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
    tokenizer: Tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl"))

    sequences = tokenizer.texts_to_sequences([raw.lower().strip() for raw in names])
    padded = pad_sequences(sequences, maxlen=max_len, padding="post")
    proba = model.predict(padded)
    hard_preds = (proba[:, 1] >= threshold).astype(int)
    return label_encoder.inverse_transform(hard_preds), proba
|
|
||||||
|
|
||||||
|
|
||||||
def predict_transformer(names: List[str], threshold: float, max_len=6):
    """
    Classify names with the pre-trained Transformer Keras model.

    Names are lower-cased and stripped, converted to sequences by the saved
    tokenizer, post-padded to *max_len*, then scored; the probability in
    column 1 is thresholded into hard labels. Returns ``(labels, proba)``.
    """
    model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
    tokenizer: Tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl"))

    sequences = tokenizer.texts_to_sequences([raw.lower().strip() for raw in names])
    padded = pad_sequences(sequences, maxlen=max_len, padding="post")
    proba = model.predict(padded)
    hard_preds = (proba[:, 1] >= threshold).astype(int)
    return label_encoder.inverse_transform(hard_preds), proba
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """CLI entry point: predict gender labels for one or more names.

    Dispatches to the predictor matching ``--model`` and prints one line per
    input name with its label and both class probabilities.
    """
    parser = argparse.ArgumentParser(description="Predict gender from names using trained model")
    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
    parser.add_argument("--names", nargs="+", required=True, help="One or more names")
    parser.add_argument("--threshold", type=float, default=0.5, help="Threshold for classification")
    args = parser.parse_args()

    model_funcs = {
        "logreg": predict_logreg,
        "lstm": predict_lstm,
        "transformer": predict_transformer,
    }
    # argparse's `choices` already rejects anything outside this dict, so a
    # KeyError guard around the lookup would be dead code.
    labels, proba = model_funcs[args.model](args.names, args.threshold)

    for i, name in enumerate(args.names):
        # NOTE(review): assumes proba column 0 is the female class and
        # column 1 the male class — confirm against the label encoder's
        # classes_ ordering.
        p_female = proba[i][0]
        p_male = proba[i][1]
        print(f"{name} → {labels[i]} | P(f): {p_female:.2f} | P(m): {p_male:.2f}")


if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,109 +0,0 @@
|
|||||||
import os
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
import ollama
|
|
||||||
import pandas as pd
|
|
||||||
from pydantic import BaseModel, ValidationError
|
|
||||||
from tqdm import tqdm
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from misc import load_prompt, load_csv_dataset, DATA_DIR, logging
|
|
||||||
|
|
||||||
|
|
||||||
class NameAnalysis(BaseModel):
    """Structured LLM output: the name and surname parts found in a raw name.

    Both fields may be ``None`` when the model cannot identify that part.
    """

    identified_name: Optional[str]
    identified_surname: Optional[str]
|
|
||||||
|
|
||||||
|
|
||||||
def analyze_name(client: ollama.Client, model: str, prompt: str, name: str) -> dict:
    """
    Ask the LLM to split *name* into its identified name/surname parts.

    The chat response is constrained to the NameAnalysis JSON schema and
    validated; on any failure a dict with ``None`` values is returned so the
    caller can keep processing.
    """
    fallback = {"identified_name": None, "identified_surname": None}
    try:
        chat_messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": name},
        ]
        response = client.chat(
            model=model,
            messages=chat_messages,
            format=NameAnalysis.model_json_schema(),
        )
        parsed = NameAnalysis.model_validate_json(response.message.content)
        return parsed.model_dump()
    except ValidationError as ve:
        logging.warning(f"Validation error: {ve}")
    except Exception as e:
        logging.error(f"Unexpected error: {e}")
    return fallback
|
|
||||||
|
|
||||||
|
|
||||||
def save_checkpoint(df: pd.DataFrame):
    """Write the full working DataFrame back to the annotated-names CSV.

    Called after every batch so annotation progress survives interruption.
    """
    df.to_csv(os.path.join(DATA_DIR, "names_featured.csv"), index=False)
    # INFO, not CRITICAL: a routine checkpoint is not an emergency, and the
    # original f-string had no placeholders anyway.
    logging.info("Checkpoint saved")
|
|
||||||
|
|
||||||
|
|
||||||
def build_updates(llm_model: str, df: pd.DataFrame, entries: pd.DataFrame) -> pd.DataFrame:
    """
    Annotate every row of *entries* via the LLM and fold results into *df*.

    Results are merged and checkpointed every BATCH_SIZE rows so a crash
    loses at most one batch of work. Returns the updated *df*.
    """
    BATCH_SIZE = 10

    client = ollama.Client()
    prompt = load_prompt()
    updates = []

    # Set logging level for HTTP client to reduce noise
    # This is useful to avoid excessive logging from the HTTP client used by Ollama
    logging.getLogger("httpx").setLevel(logging.WARNING)

    def flush():
        """Merge accumulated per-row results into df and checkpoint."""
        # BUG FIX: if every row in the batch failed, `updates` is empty and
        # indexing "annotated" on the empty frame would raise KeyError.
        if not updates:
            return
        update_df = pd.DataFrame.from_dict(dict(updates), orient="index")
        update_df["annotated"] = pd.to_numeric(update_df["annotated"], errors="coerce").fillna(0).astype("Int8")
        df.update(update_df)
        save_checkpoint(df)
        updates.clear()  # avoid re-applying same updates

    for idx, (row_idx, row) in enumerate(entries.iterrows(), 1):
        try:
            entry = analyze_name(client, llm_model, prompt, row["name"])
            entry["annotated"] = 1
            updates.append((row_idx, entry))
            logging.info(f"Analyzed: {row['name']} - {entry}")
        except Exception as e:
            logging.warning(f"Failed to analyze '{row['name']}': {e}")
            continue

        if idx % BATCH_SIZE == 0:
            flush()

    # BUG FIX: the original only flushed when `idx == len(entries)` was
    # reached inside the loop; a failure on the final row skipped that check
    # and silently dropped the pending tail of updates.
    flush()

    return df
|
|
||||||
|
|
||||||
|
|
||||||
def main(llm_model: str = "llama3.2:3b"):
    """Annotate all not-yet-annotated names in the featured CSV via the LLM."""
    df = pd.DataFrame(load_csv_dataset(os.path.join(DATA_DIR, "names_featured.csv")))

    # Safely cast 'annotated' column to Int8, handling float-like strings (e.g., '1.0')
    df["annotated"] = pd.to_numeric(df["annotated"], errors="coerce").fillna(0).astype(float).astype("Int8")

    # Only rows the heuristics/LLM have not yet annotated are sent out.
    entries = df[df["annotated"] == 0]
    if entries.empty:
        logging.info("No names to analyze.")
        return

    logging.info(f"Found {len(entries)} names to analyze.")
    df = build_updates(llm_model, df, entries)
    save_checkpoint(df)
    logging.info("Analysis complete.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Analyze names using an LLM model.")
    # NOTE(review): the CLI default ("mistral:7b") differs from the function
    # default ("llama3.2:3b") — confirm which is intended.
    parser.add_argument(
        "--llm_model",
        type=str,
        default="mistral:7b",
        help="Ollama model name to use (default: mistral:7b)",
    )
    args = parser.parse_args()

    try:
        main(llm_model=args.llm_model)
    except Exception as e:
        # Top-level boundary: log with traceback instead of crashing noisily.
        logging.error(f"Fatal error: {e}", exc_info=True)
|
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class BatchConfig:
    """Configuration for batch processing"""

    batch_size: int = 1000  # rows per batch handed to a pipeline step
    max_workers: int = 4  # 1 = sequential processing; >1 enables the concurrent path
    checkpoint_interval: int = 5  # Save checkpoint every N batches
    use_multiprocessing: bool = False  # Use ProcessPoolExecutor instead of ThreadPoolExecutor
|
||||||
@@ -0,0 +1,102 @@
|
|||||||
|
import logging
|
||||||
|
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
|
||||||
|
from typing import Iterator
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
|
class BatchProcessor:
    """Handles batch processing with concurrency and checkpointing"""

    def __init__(self, config: BatchConfig):
        self.config = config

    def create_batches(self, df: pd.DataFrame) -> Iterator[tuple[pd.DataFrame, int]]:
        """Create batches from DataFrame.

        Yields ``(batch, batch_id)`` pairs; batch_id is the 0-based batch
        index. Each batch is a copy so steps may mutate it freely.
        """
        total_rows = len(df)
        batch_size = self.config.batch_size

        for i in range(0, total_rows, batch_size):
            batch = df.iloc[i : i + batch_size].copy()
            batch_id = i // batch_size
            yield batch, batch_id

    def process_sequential(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
        """Process batches sequentially.

        Already-checkpointed batches are loaded instead of recomputed; a
        failed batch is recorded in ``step.state.failed_batches`` and its
        rows are omitted from the result.
        """
        results = []

        for batch, batch_id in self.create_batches(df):
            if step.batch_exists(batch_id):
                logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
                processed_batch = step.load_batch(batch_id)
            else:
                try:
                    processed_batch = step.process_batch(batch, batch_id)
                    step.save_batch(processed_batch, batch_id)
                    step.state.processed_batches += 1
                except Exception as e:
                    logging.error(f"Failed to process batch {batch_id}: {e}")
                    step.state.failed_batches.append(batch_id)
                    continue

            results.append(processed_batch)

            # Save state periodically
            if batch_id % self.config.checkpoint_interval == 0:
                step.save_state()

        return pd.concat(results, ignore_index=True) if results else pd.DataFrame()

    def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
        """Process batches concurrently.

        Uses a process pool when ``use_multiprocessing`` is set, otherwise a
        thread pool. Completed batches land in a dict keyed by batch_id and
        are re-assembled in sorted-id order at the end.
        NOTE(review): the process-pool path requires `step.process_batch`
        and each batch to be picklable — confirm for every step type.
        """
        executor_class = (
            ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
        )
        results = {}

        with executor_class(max_workers=self.config.max_workers) as executor:
            # Submit all batches
            future_to_batch = {}
            for batch, batch_id in self.create_batches(df):
                if step.batch_exists(batch_id):
                    logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
                    results[batch_id] = step.load_batch(batch_id)
                else:
                    future = executor.submit(step.process_batch, batch, batch_id)
                    future_to_batch[future] = (batch_id, batch)

            # Collect results as they complete
            for future in as_completed(future_to_batch):
                batch_id, batch = future_to_batch[future]
                try:
                    processed_batch = future.result()
                    step.save_batch(processed_batch, batch_id)
                    results[batch_id] = processed_batch
                    step.state.processed_batches += 1
                    logging.info(f"Completed batch {batch_id}")
                except Exception as e:
                    logging.error(f"Failed to process batch {batch_id}: {e}")
                    step.state.failed_batches.append(batch_id)

        # Reassemble results in order
        ordered_results = []
        for batch_id in sorted(results.keys()):
            ordered_results.append(results[batch_id])

        step.save_state()
        return pd.concat(ordered_results, ignore_index=True) if ordered_results else pd.DataFrame()

    def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
        """Process data using the configured strategy.

        Dispatches to sequential processing when ``max_workers == 1``,
        otherwise to the concurrent path.
        """
        # Ceiling division: number of batches needed to cover all rows.
        step.state.total_batches = (len(df) + self.config.batch_size - 1) // self.config.batch_size
        # NOTE(review): load_state() runs after total_batches is set — if it
        # restores a saved state wholesale it may overwrite that value;
        # confirm against PipelineStep.load_state.
        step.load_state()

        logging.info(f"Starting {step.name} with {step.state.total_batches} batches")

        if self.config.max_workers == 1:
            return self.process_sequential(step, df)
        else:
            return self.process_concurrent(step, df)
|
||||||
@@ -0,0 +1,80 @@
|
|||||||
|
import logging
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
class DatasetAnalyzer:
    """Analyze dataset statistics and quality"""

    def __init__(self, filepath: str):
        self.filepath = filepath
        self.df = None  # populated by load_data()

    def load_data(self) -> bool:
        """Read the CSV into memory; return True on success, False otherwise."""
        try:
            self.df = pd.read_csv(self.filepath)
        except Exception as e:
            logging.error(f"Failed to load {self.filepath}: {e}")
            return False
        return True

    def analyze_completion(self) -> Dict:
        """Summarize annotation and name-completeness counts/percentages.

        Returns an empty dict when no data has been loaded.
        """
        if self.df is None:
            return {}

        frame = self.df
        total_rows = len(frame)

        # Annotation status: a missing column means nothing is annotated yet.
        if "annotated" in frame.columns:
            annotated_count = (frame["annotated"] == 1).sum()
            unannotated_count = (frame["annotated"] == 0).sum()
        else:
            annotated_count = 0
            unannotated_count = total_rows

        # A name is "complete" when both identified parts are present.
        complete_names = 0
        if "identified_name" in frame.columns and "identified_surname" in frame.columns:
            both_present = frame["identified_name"].notna() & frame["identified_surname"].notna()
            complete_names = both_present.sum()

        def as_pct(count):
            # Guard against empty datasets.
            return (count / total_rows * 100) if total_rows > 0 else 0

        return {
            "total_rows": total_rows,
            "annotated_rows": annotated_count,
            "unannotated_rows": unannotated_count,
            "annotation_percentage": as_pct(annotated_count),
            "complete_names": complete_names,
            "completeness_percentage": as_pct(complete_names),
        }

    def analyze_quality(self) -> Dict:
        """Collect missing-value counts and, when the columns exist,
        name-length statistics and the word-count distribution.
        """
        if self.df is None:
            return {}

        frame = self.df
        metrics = {"missing_values": frame.isnull().sum().to_dict()}

        if "name" in frame.columns:
            lengths = frame["name"].str.len()
            metrics["name_length"] = {
                "mean": lengths.mean(),
                "median": lengths.median(),
                "min": lengths.min(),
                "max": lengths.max(),
            }

        if "words" in frame.columns:
            metrics["word_distribution"] = frame["words"].value_counts().sort_index().to_dict()

        return metrics
|
||||||
@@ -0,0 +1,179 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional, Dict
|
||||||
|
|
||||||
|
from core.config.config_manager import ConfigManager
|
||||||
|
from core.config.project_paths import ProjectPaths
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineMonitor:
    """Monitor and manage pipeline execution.

    Reads per-step ``pipeline_state.json`` files under the checkpoints
    directory to report progress, and offers maintenance helpers to count,
    prune or reset checkpoint files.
    """

    def __init__(self, paths: Optional[ProjectPaths] = None):
        if paths is None:
            # Use default configuration if none provided
            config_manager = ConfigManager()
            paths = config_manager.default_paths

        self.paths = paths
        self.checkpoint_dir = paths.checkpoints_dir
        # Fixed, ordered list of step names the monitor knows about.
        self.steps = ["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"]

    def get_step_status(self, step_name: str) -> Dict:
        """Get status of a specific pipeline step.

        A missing state file means "not_started"; an unreadable one yields
        an "error" status dict containing the exception text.
        """
        step_dir = self.checkpoint_dir / step_name
        state_file = step_dir / "pipeline_state.json"

        if not state_file.exists():
            return {
                "step": step_name,
                "status": "not_started",
                "processed_batches": 0,
                "total_batches": 0,
                "failed_batches": 0,
                "completion_percentage": 0.0,
            }

        try:
            with open(state_file, "r") as f:
                state = json.load(f)

            processed = state.get("processed_batches", 0)
            total = state.get("total_batches", 0)
            failed = len(state.get("failed_batches", []))

            # Derive a status label from the batch counters.
            if total == 0:
                completion = 0.0
                status = "not_started"
            elif processed >= total:
                completion = 100.0
                status = "completed" if failed == 0 else "completed_with_errors"
            else:
                completion = (processed / total) * 100
                status = "in_progress"

            return {
                "step": step_name,
                "status": status,
                "processed_batches": processed,
                "total_batches": total,
                "failed_batches": failed,
                "completion_percentage": completion,
                "last_checkpoint": state.get("last_checkpoint"),
                "failed_batch_ids": state.get("failed_batches", []),
            }

        except Exception as e:
            logging.error(f"Error reading state for {step_name}: {e}")
            # NOTE: this dict lacks the batch-counter keys the other
            # branches provide — see the review note in print_status.
            return {"step": step_name, "status": "error", "error": str(e)}

    def get_pipeline_status(self) -> Dict:
        """Get overall pipeline status.

        Aggregates per-step statuses; the overall label is the last
        non-"completed" condition encountered while iterating the steps,
        and overall completion is the plain average of step percentages.
        """
        step_statuses = {}
        overall_status = "not_started"
        total_completion = 0.0

        for step in self.steps:
            status = self.get_step_status(step)
            step_statuses[step] = status

            # Later steps can overwrite the overall label; "completed" is
            # only assigned after the loop when the average reaches 100%.
            if status["status"] == "error":
                overall_status = "error"
            elif status["status"] in ["in_progress"]:
                overall_status = "in_progress"
            elif status["status"] == "completed_with_errors":
                overall_status = "completed_with_errors"

            total_completion += status.get("completion_percentage", 0)

        avg_completion = total_completion / len(self.steps)

        if avg_completion >= 100 and overall_status not in ["error", "completed_with_errors"]:
            overall_status = "completed"

        return {
            "overall_status": overall_status,
            "overall_completion": avg_completion,
            "steps": step_statuses,
            "timestamp": datetime.now().isoformat(),
        }

    def print_status(self, detailed: bool = False):
        """Print pipeline status in a human-readable format.

        NOTE(review): a step whose status is "error" is returned without
        'completion_percentage'/'processed_batches' keys, so the prints
        below would raise KeyError for it — confirm and harden if needed.
        """
        status = self.get_pipeline_status()

        print("\n=== Pipeline Status ===")
        print(f"Overall Status: {status['overall_status'].upper()}")
        print(f"Overall Completion: {status['overall_completion']:.1f}%")
        print(f"Last Updated: {status['timestamp']}")
        print()

        for step_name, step_status in status["steps"].items():
            print(f"{step_name.replace('_', ' ').title()}:")
            print(f"  Status: {step_status['status']}")
            print(f"  Progress: {step_status['completion_percentage']:.1f}%")
            print(f"  Batches: {step_status['processed_batches']}/{step_status['total_batches']}")

            if step_status["failed_batches"] > 0:
                print(f"  Failed Batches: {step_status['failed_batches']}")

            if detailed and "failed_batch_ids" in step_status:
                print(f"  Failed Batch IDs: {step_status['failed_batch_ids']}")

            print()

    def count_checkpoint_files(self) -> Dict:
        """Count checkpoint files for each step.

        Returns per-step file counts and sizes in MB, plus a
        "total_size_mb" entry covering all steps.
        """
        counts = {}
        total_size = 0

        for step in self.steps:
            step_dir = self.checkpoint_dir / step
            if step_dir.exists():
                csv_files = list(step_dir.glob("*.csv"))
                step_size = sum(f.stat().st_size for f in csv_files)
                counts[step] = {"files": len(csv_files), "size_mb": step_size / (1024 * 1024)}
                total_size += step_size
            else:
                counts[step] = {"files": 0, "size_mb": 0}

        counts["total_size_mb"] = total_size / (1024 * 1024)
        return counts

    def clean_step_checkpoints(self, step_name: str, keep_last: int = 1):
        """Clean checkpoint files for a specific step.

        Deletes all but the lexicographically-last *keep_last* batch CSVs;
        keep_last=0 deletes everything.
        """
        step_dir = self.checkpoint_dir / step_name

        if not step_dir.exists():
            logging.info(f"No checkpoints found for {step_name}")
            return

        csv_files = sorted(step_dir.glob("batch_*.csv"))

        if len(csv_files) <= keep_last:
            logging.info(f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all")
            return

        files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files

        for file_path in files_to_delete:
            try:
                file_path.unlink()
                logging.info(f"Deleted {file_path}")
            except Exception as e:
                logging.error(f"Failed to delete {file_path}: {e}")

    def reset_step(self, step_name: str):
        """Reset a pipeline step by removing its checkpoints and state."""
        step_dir = self.checkpoint_dir / step_name

        if step_dir.exists():
            try:
                # Removes batch CSVs and pipeline_state.json together.
                shutil.rmtree(step_dir)
                logging.info(f"Reset step: {step_name}")
            except Exception as e:
                logging.error(f"Failed to reset {step_name}: {e}")
        else:
            logging.info(f"Step {step_name} has no checkpoints to reset")
||||||
@@ -0,0 +1,57 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from typing import Dict, Any
|
||||||
|
import time
|
||||||
|
|
||||||
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
from processing.batch.batch_processor import BatchProcessor
|
||||||
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
|
class Pipeline:
    """Main pipeline orchestrator.

    Holds an ordered list of PipelineStep objects and runs them through a
    BatchProcessor, feeding each step the previous step's output.
    """

    def __init__(self, config: BatchConfig):
        self.config = config
        self.processor = BatchProcessor(config)
        self.steps = []  # ordered PipelineStep instances, executed front to back

    def add_step(self, step: PipelineStep):
        """Add a processing step to the pipeline"""
        self.steps.append(step)

    def run(self, input_data: pd.DataFrame) -> pd.DataFrame:
        """Run the complete pipeline.

        The input frame is copied first so the caller's data is not
        mutated. Failed batches are logged but do not abort the run.
        """
        current_data = input_data.copy()

        for step in self.steps:
            logging.info(f"Running pipeline step: {step.name}")
            start_time = time.time()

            current_data = self.processor.process(step, current_data)

            elapsed_time = time.time() - start_time
            logging.info(f"Completed {step.name} in {elapsed_time:.2f} seconds")

            if step.state.failed_batches:
                logging.warning(
                    f"Step {step.name} had {len(step.state.failed_batches)} failed batches"
                )

        return current_data

    def get_progress(self) -> Dict[str, Any]:
        """Get progress information for all steps"""
        progress = {}
        for step in self.steps:
            progress[step.name] = {
                "processed_batches": step.state.processed_batches,
                "total_batches": step.state.total_batches,
                "failed_batches": len(step.state.failed_batches),
                # max(1, ...) avoids division by zero before the first run.
                "completion_percentage": (
                    step.state.processed_batches / max(1, step.state.total_batches)
                )
                * 100,
            }
        return progress
|
||||||
@@ -1,119 +0,0 @@
|
|||||||
import os
|
|
||||||
import argparse
|
|
||||||
import pandas as pd
|
|
||||||
from misc import DATA_DIR, REGION_MAPPING, logging
|
|
||||||
|
|
||||||
|
|
||||||
def clean(filepath) -> pd.DataFrame:
    """
    Clean the CSV at *filepath* in place and return the cleaned DataFrame.

    Candidate encodings are tried in order until one parses. The file is
    read in chunks so large files fit in memory; rows missing any essential
    column (name/sex/region) are dropped, and every string column is purged
    of null bytes and non-breaking spaces, space-collapsed, stripped and
    lower-cased. The result is written back to *filepath* as UTF-8.

    Raises:
        UnicodeError: if no candidate encoding can parse the file.
    """
    encodings = ['utf-8', 'utf-16', 'latin1']
    for enc in encodings:
        try:
            logging.info(f"Trying to read {filepath} with encoding: {enc}")
            # Use chunked reading to handle large files
            chunks = pd.read_csv(filepath, encoding=enc, chunksize=100_000, on_bad_lines='skip')
            cleaned_chunks = []

            for chunk in chunks:
                # Drop rows with essential missing values early
                chunk = chunk.dropna(subset=['name', 'sex', 'region'])

                # Clean string columns in-place
                for col in chunk.select_dtypes(include='object').columns:
                    chunk[col] = (
                        chunk[col]
                        .astype(str)
                        .str.replace('\x00', ' ', regex=False)
                        .str.replace('\u00a0', ' ', regex=False)
                        .str.replace(' +', ' ', regex=True)
                        .str.strip()
                        .str.lower()
                    )

                cleaned_chunks.append(chunk)

            df = pd.concat(cleaned_chunks, ignore_index=True)
            df.to_csv(filepath, index=False, encoding='utf-8')
            logging.info(f"Successfully read with encoding: {enc}")
            return df
        except Exception as e:
            # Log why this encoding failed instead of silently discarding
            # the error, then try the next candidate.
            logging.warning(f"Failed to read {filepath} with encoding {enc}: {e}")
            continue
    # BUG FIX: the original raised UnicodeDecodeError(msg), but that builtin
    # requires five positional arguments (encoding, object, start, end,
    # reason), so the raise itself crashed with TypeError. UnicodeError
    # accepts a plain message and stays in the same exception family.
    raise UnicodeError(f"Unable to decode {filepath} with common encodings.")
|
|
||||||
|
|
||||||
|
|
||||||
def process(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
"""
|
|
||||||
Process the DataFrame to extract features and clean data.
|
|
||||||
This includes counting words, calculating name length, and extracting probable native names and surnames.
|
|
||||||
Also maps regions to provinces based on REGION_MAPPING.
|
|
||||||
"""
|
|
||||||
|
|
||||||
logging.info("Preprocessing names")
|
|
||||||
df['words'] = df['name'].str.count(' ') + 1
|
|
||||||
df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()
|
|
||||||
df['year'] = df['year'].astype(int)
|
|
||||||
|
|
||||||
# Calculate probable_native and probable_surname
|
|
||||||
name_split = df['name'].str.split()
|
|
||||||
df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
|
|
||||||
df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')
|
|
||||||
df['identified_category'] = df['words'].apply(lambda x: 'compose' if x > 3 else 'simple')
|
|
||||||
df['identified_name'] = None
|
|
||||||
df['identified_surname'] = None
|
|
||||||
df['annotated'] = 0
|
|
||||||
|
|
||||||
# We can assume that if a name has exactly 3 words, the first two are the native name and the last is the surname
|
|
||||||
# This is a common pattern in Congolese names
|
|
||||||
three_word_mask = df['words'] == 3
|
|
||||||
df.loc[three_word_mask, 'identified_name'] = df.loc[three_word_mask, 'probable_native']
|
|
||||||
df.loc[three_word_mask, 'identified_surname'] = df.loc[three_word_mask, 'probable_surname']
|
|
||||||
df.loc[three_word_mask, 'annotated'] = 1
|
|
||||||
|
|
||||||
logging.info("Mapping regions to provinces")
|
|
||||||
df['province'] = df['region'].map(lambda r: REGION_MAPPING.get(r, ('AUTRES', 'AUTRES'))[1])
|
|
||||||
df['province'] = df['province'].str.lower()
|
|
||||||
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def save_artifacts(df: pd.DataFrame, split_eval: bool = True, split_by_sex: bool = True) -> None:
|
|
||||||
"""
|
|
||||||
Splits the input DataFrame into evaluation and featured datasets, saves them as CSV files,
|
|
||||||
and additionally saves separate CSV files for male and female entries if requested.
|
|
||||||
"""
|
|
||||||
|
|
||||||
if split_eval:
|
|
||||||
logging.info("Saving evaluation and featured datasets")
|
|
||||||
eval_idx = df.sample(frac=0.2, random_state=42).index
|
|
||||||
df_evaluation = df.loc[eval_idx]
|
|
||||||
df_featured = df.drop(index=eval_idx)
|
|
||||||
df_evaluation.to_csv(os.path.join(DATA_DIR, 'names_evaluation.csv'), index=False)
|
|
||||||
df_featured.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
|
|
||||||
else:
|
|
||||||
df.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
|
|
||||||
|
|
||||||
if split_by_sex:
|
|
||||||
logging.info("Saving by sex")
|
|
||||||
df[df['sex'] == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
|
|
||||||
df[df['sex'] == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)
|
|
||||||
|
|
||||||
|
|
||||||
def main(split_eval: bool = True, split_by_sex: bool = True):
|
|
||||||
df = process(clean(os.path.join(DATA_DIR, 'names.csv')))
|
|
||||||
save_artifacts(df, split_eval=split_eval, split_by_sex=split_by_sex)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
parser = argparse.ArgumentParser(description="Prepare name datasets with optional splits.")
|
|
||||||
|
|
||||||
parser.add_argument('--split_eval', action='store_true', default=True, help="Split into evaluation and featured datasets (default: True)")
|
|
||||||
parser.add_argument('--no-split_eval', action='store_false', dest='split_eval', help="Do not split into evaluation and featured datasets")
|
|
||||||
parser.add_argument('--split_by_sex', action='store_true', default=True, help="Split by sex into male/female datasets (default: True)")
|
|
||||||
parser.add_argument('--no-split_by_sex', action='store_false', dest='split_by_sex', help="Do not split by sex into male/female datasets")
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
main(split_eval=args.split_eval, split_by_sex=args.split_by_sex)
|
|
||||||
@@ -0,0 +1,111 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PipelineState:
|
||||||
|
"""Tracks the state of pipeline execution"""
|
||||||
|
|
||||||
|
processed_batches: int = 0
|
||||||
|
total_batches: int = 0
|
||||||
|
failed_batches: List[int] = None
|
||||||
|
last_checkpoint: Optional[str] = None
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
if self.failed_batches is None:
|
||||||
|
self.failed_batches = []
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineStep(ABC):
|
||||||
|
"""Abstract base class for pipeline steps"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
|
||||||
|
):
|
||||||
|
self.name = name
|
||||||
|
self.pipeline_config = pipeline_config
|
||||||
|
|
||||||
|
# Use provided batch_config or create default from pipeline config
|
||||||
|
if batch_config is None:
|
||||||
|
batch_config = BatchConfig(
|
||||||
|
batch_size=pipeline_config.processing.batch_size,
|
||||||
|
max_workers=pipeline_config.processing.max_workers,
|
||||||
|
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
||||||
|
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
|
||||||
|
)
|
||||||
|
self.batch_config = batch_config
|
||||||
|
self.state = PipelineState()
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
|
"""Process a single batch of data"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def get_checkpoint_path(self, batch_id: int) -> str:
|
||||||
|
"""Get the checkpoint file path for a batch"""
|
||||||
|
checkpoint_dir = self.pipeline_config.paths.checkpoints_dir / self.name
|
||||||
|
checkpoint_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
return str(checkpoint_dir / f"batch_{batch_id:06d}.csv")
|
||||||
|
|
||||||
|
def get_state_path(self) -> str:
|
||||||
|
"""Get the state file path"""
|
||||||
|
state_dir = self.pipeline_config.paths.checkpoints_dir / self.name
|
||||||
|
state_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
return str(state_dir / "pipeline_state.json")
|
||||||
|
|
||||||
|
def save_state(self):
|
||||||
|
"""Save pipeline state to disk"""
|
||||||
|
state_file = self.get_state_path()
|
||||||
|
with open(state_file, "w") as f:
|
||||||
|
json.dump(
|
||||||
|
{
|
||||||
|
"processed_batches": self.state.processed_batches,
|
||||||
|
"total_batches": self.state.total_batches,
|
||||||
|
"failed_batches": self.state.failed_batches,
|
||||||
|
"last_checkpoint": self.state.last_checkpoint,
|
||||||
|
},
|
||||||
|
f,
|
||||||
|
)
|
||||||
|
|
||||||
|
def load_state(self) -> bool:
|
||||||
|
"""Load pipeline state from disk. Returns True if state was loaded."""
|
||||||
|
state_file = self.get_state_path()
|
||||||
|
if os.path.exists(state_file):
|
||||||
|
try:
|
||||||
|
with open(state_file, "r") as f:
|
||||||
|
state_data = json.load(f)
|
||||||
|
self.state.processed_batches = state_data.get("processed_batches", 0)
|
||||||
|
self.state.total_batches = state_data.get("total_batches", 0)
|
||||||
|
self.state.failed_batches = state_data.get("failed_batches", [])
|
||||||
|
self.state.last_checkpoint = state_data.get("last_checkpoint")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f"Failed to load state: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def batch_exists(self, batch_id: int) -> bool:
|
||||||
|
"""Check if a batch has already been processed (idempotency)"""
|
||||||
|
checkpoint_path = self.get_checkpoint_path(batch_id)
|
||||||
|
return os.path.exists(checkpoint_path)
|
||||||
|
|
||||||
|
def save_batch(self, batch: pd.DataFrame, batch_id: int):
|
||||||
|
"""Save processed batch to checkpoint"""
|
||||||
|
checkpoint_path = self.get_checkpoint_path(batch_id)
|
||||||
|
batch.to_csv(checkpoint_path, index=False)
|
||||||
|
logging.info(f"Saved batch {batch_id} to {checkpoint_path}")
|
||||||
|
|
||||||
|
def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
|
||||||
|
"""Load processed batch from checkpoint"""
|
||||||
|
checkpoint_path = self.get_checkpoint_path(batch_id)
|
||||||
|
if os.path.exists(checkpoint_path):
|
||||||
|
return pd.read_csv(checkpoint_path)
|
||||||
|
return None
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
from core.utils.text_cleaner import TextCleaner
|
||||||
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
|
class DataCleaningStep(PipelineStep):
|
||||||
|
"""Configuration-driven data cleaning step"""
|
||||||
|
|
||||||
|
def __init__(self, pipeline_config: PipelineConfig):
|
||||||
|
super().__init__("data_cleaning", pipeline_config)
|
||||||
|
self.text_cleaner = TextCleaner()
|
||||||
|
self.required_columns = ["name", "sex", "region"]
|
||||||
|
|
||||||
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
|
"""Process a single batch for data cleaning"""
|
||||||
|
logging.info(f"Cleaning batch {batch_id} with {len(batch)} rows")
|
||||||
|
|
||||||
|
# Drop rows with essential missing values
|
||||||
|
batch = batch.dropna(subset=self.required_columns)
|
||||||
|
|
||||||
|
# Apply text cleaning
|
||||||
|
batch = self.text_cleaner.clean_dataframe_text_columns(batch)
|
||||||
|
|
||||||
|
return batch
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
from processing.steps.feature_extraction_step import Gender
|
||||||
|
from core.utils.data_loader import DataLoader
|
||||||
|
|
||||||
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
|
class DataSplittingStep(PipelineStep):
|
||||||
|
"""Configuration-driven data splitting step"""
|
||||||
|
|
||||||
|
def __init__(self, pipeline_config: PipelineConfig):
|
||||||
|
batch_config = BatchConfig(
|
||||||
|
batch_size=pipeline_config.processing.batch_size,
|
||||||
|
max_workers=1, # No need for parallelism in splitting
|
||||||
|
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
||||||
|
use_multiprocessing=False,
|
||||||
|
)
|
||||||
|
super().__init__("data_splitting", pipeline_config, batch_config)
|
||||||
|
self.data_loader = DataLoader(pipeline_config)
|
||||||
|
self.eval_indices = None
|
||||||
|
|
||||||
|
def determine_eval_indices(self, total_size: int) -> set:
|
||||||
|
"""Determine evaluation indices consistently across batches"""
|
||||||
|
if self.eval_indices is None:
|
||||||
|
np.random.seed(self.pipeline_config.data.random_seed)
|
||||||
|
eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
|
||||||
|
self.eval_indices = set(np.random.choice(total_size, size=eval_size, replace=False))
|
||||||
|
return self.eval_indices
|
||||||
|
|
||||||
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
|
"""Process batch for data splitting - no modification needed"""
|
||||||
|
return batch.copy()
|
||||||
|
|
||||||
|
def save_splits(self, df: pd.DataFrame) -> None:
|
||||||
|
"""Save the split datasets based on configuration"""
|
||||||
|
output_files = self.pipeline_config.data.output_files
|
||||||
|
data_dir = self.pipeline_config.paths.data_dir
|
||||||
|
|
||||||
|
if self.pipeline_config.data.split_evaluation:
|
||||||
|
eval_indices = self.determine_eval_indices(len(df))
|
||||||
|
eval_mask = df.index.isin(eval_indices)
|
||||||
|
|
||||||
|
df_evaluation = df[eval_mask]
|
||||||
|
df_featured = df[~eval_mask]
|
||||||
|
|
||||||
|
self.data_loader.save_csv(df_evaluation, data_dir / output_files["evaluation"])
|
||||||
|
self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
|
||||||
|
else:
|
||||||
|
self.data_loader.save_csv(df, data_dir / output_files["featured"])
|
||||||
|
|
||||||
|
if self.pipeline_config.data.split_by_gender:
|
||||||
|
df_males = df[df["sex"] == Gender.MALE.value]
|
||||||
|
df_females = df[df["sex"] == Gender.FEMALE.value]
|
||||||
|
|
||||||
|
self.data_loader.save_csv(df_males, data_dir / output_files["males"])
|
||||||
|
self.data_loader.save_csv(df_females, data_dir / output_files["females"])
|
||||||
@@ -0,0 +1,99 @@
|
|||||||
|
import logging
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
from core.utils.region_mapper import RegionMapper
|
||||||
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
|
class Gender(Enum):
|
||||||
|
MALE = "m"
|
||||||
|
FEMALE = "f"
|
||||||
|
|
||||||
|
|
||||||
|
class NameCategory(Enum):
|
||||||
|
SIMPLE = "simple"
|
||||||
|
COMPOSE = "compose"
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureExtractionStep(PipelineStep):
|
||||||
|
"""Configuration-driven feature extraction step"""
|
||||||
|
|
||||||
|
def __init__(self, pipeline_config: PipelineConfig):
|
||||||
|
super().__init__("feature_extraction", pipeline_config)
|
||||||
|
self.region_mapper = RegionMapper()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def validate_gender(cls, gender: str) -> Gender:
|
||||||
|
"""Validate and normalize gender value"""
|
||||||
|
gender_lower = gender.lower().strip()
|
||||||
|
if gender_lower in ["m", "male", "homme", "masculin"]:
|
||||||
|
return Gender.MALE
|
||||||
|
elif gender_lower in ["f", "female", "femme", "féminin"]:
|
||||||
|
return Gender.FEMALE
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown gender: {gender}")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_name_category(cls, word_count: int) -> NameCategory:
|
||||||
|
"""Determine name category based on word count"""
|
||||||
|
if word_count <= 3:
|
||||||
|
return NameCategory.SIMPLE
|
||||||
|
else:
|
||||||
|
return NameCategory.COMPOSE
|
||||||
|
|
||||||
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
|
"""Extract features from names in batch"""
|
||||||
|
logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")
|
||||||
|
|
||||||
|
batch = batch.copy()
|
||||||
|
|
||||||
|
# Basic features
|
||||||
|
batch["words"] = batch["name"].str.count(" ") + 1
|
||||||
|
batch["length"] = batch["name"].str.replace(" ", "", regex=False).str.len()
|
||||||
|
|
||||||
|
# Handle year column
|
||||||
|
if "year" in batch.columns:
|
||||||
|
batch["year"] = pd.to_numeric(batch["year"], errors="coerce").astype("Int64")
|
||||||
|
|
||||||
|
# Initialize new columns
|
||||||
|
batch["probable_native"] = None
|
||||||
|
batch["probable_surname"] = None
|
||||||
|
batch["identified_name"] = None
|
||||||
|
batch["identified_surname"] = None
|
||||||
|
batch["annotated"] = 0
|
||||||
|
|
||||||
|
# Vectorized category assignment
|
||||||
|
batch["identified_category"] = batch["words"].apply(
|
||||||
|
lambda x: self.get_name_category(x).value
|
||||||
|
)
|
||||||
|
|
||||||
|
# Assign probable_native and probable_surname for all names
|
||||||
|
name_splits = batch["name"].str.split()
|
||||||
|
batch["probable_native"] = name_splits.apply(
|
||||||
|
lambda x: " ".join(x[:-1]) if isinstance(x, list) and len(x) >= 2 else None
|
||||||
|
)
|
||||||
|
batch["probable_surname"] = name_splits.apply(
|
||||||
|
lambda x: x[-1] if isinstance(x, list) and len(x) >= 2 else None
|
||||||
|
)
|
||||||
|
|
||||||
|
# Auto-assign for 3-word names
|
||||||
|
three_word_mask = batch["words"] == 3
|
||||||
|
batch.loc[three_word_mask, "identified_name"] = batch.loc[
|
||||||
|
three_word_mask, "probable_native"
|
||||||
|
]
|
||||||
|
batch.loc[three_word_mask, "identified_surname"] = batch.loc[
|
||||||
|
three_word_mask, "probable_surname"
|
||||||
|
]
|
||||||
|
batch.loc[three_word_mask, "annotated"] = 1
|
||||||
|
|
||||||
|
# Map regions to provinces
|
||||||
|
batch["province"] = self.region_mapper.map_regions_vectorized(batch["region"])
|
||||||
|
|
||||||
|
# Normalize gender
|
||||||
|
if "sex" in batch.columns:
|
||||||
|
batch["sex"] = batch["sex"].apply(lambda x: self.validate_gender(str(x)).value)
|
||||||
|
|
||||||
|
return batch
|
||||||
@@ -0,0 +1,168 @@
|
|||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
import ollama
|
||||||
|
import pandas as pd
|
||||||
|
from pydantic import ValidationError, BaseModel
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
from core.utils.prompt_manager import PromptManager
|
||||||
|
from core.utils.rate_limiter import RateLimiter
|
||||||
|
from core.utils.rate_limiter import RateLimitConfig
|
||||||
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
|
class NameAnnotation(BaseModel):
|
||||||
|
"""Model for name annotation results"""
|
||||||
|
|
||||||
|
identified_name: Optional[str]
|
||||||
|
identified_surname: Optional[str]
|
||||||
|
|
||||||
|
|
||||||
|
class LLMAnnotationStep(PipelineStep):
|
||||||
|
"""Configuration-driven LLM annotation step"""
|
||||||
|
|
||||||
|
def __init__(self, pipeline_config: PipelineConfig):
|
||||||
|
# Create custom batch config for LLM processing
|
||||||
|
batch_config = BatchConfig(
|
||||||
|
batch_size=pipeline_config.processing.batch_size,
|
||||||
|
max_workers=min(
|
||||||
|
pipeline_config.llm.max_concurrent_requests, pipeline_config.processing.max_workers
|
||||||
|
),
|
||||||
|
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
||||||
|
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
|
||||||
|
)
|
||||||
|
super().__init__("llm_annotation", pipeline_config, batch_config)
|
||||||
|
|
||||||
|
self.prompt = PromptManager(pipeline_config).load_prompt()
|
||||||
|
self.rate_limiter = (
|
||||||
|
self._create_rate_limiter() if pipeline_config.llm.enable_rate_limiting else None
|
||||||
|
)
|
||||||
|
|
||||||
|
# Statistics
|
||||||
|
self.successful_requests = 0
|
||||||
|
self.failed_requests = 0
|
||||||
|
self.total_retry_attempts = 0
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
def _create_rate_limiter(self):
|
||||||
|
"""Create rate limiter based on configuration"""
|
||||||
|
rate_config = RateLimitConfig(
|
||||||
|
requests_per_minute=self.pipeline_config.llm.requests_per_minute,
|
||||||
|
requests_per_second=self.pipeline_config.llm.requests_per_second,
|
||||||
|
)
|
||||||
|
return RateLimiter(rate_config)
|
||||||
|
|
||||||
|
def analyze_name_with_retry(self, client: ollama.Client, name: str, row_id: int) -> Dict:
|
||||||
|
"""Analyze a name with retry logic and rate limiting"""
|
||||||
|
for attempt in range(self.pipeline_config.llm.retry_attempts):
|
||||||
|
try:
|
||||||
|
# Apply rate limiting if enabled
|
||||||
|
if self.rate_limiter:
|
||||||
|
self.rate_limiter.wait_if_needed()
|
||||||
|
|
||||||
|
start_time = time.time()
|
||||||
|
response = client.chat(
|
||||||
|
model=self.pipeline_config.llm.model_name,
|
||||||
|
messages=[
|
||||||
|
{"role": "system", "content": self.prompt},
|
||||||
|
{"role": "user", "content": name},
|
||||||
|
],
|
||||||
|
format=NameAnnotation.model_json_schema(),
|
||||||
|
)
|
||||||
|
elapsed_time = time.time() - start_time
|
||||||
|
|
||||||
|
if elapsed_time > self.pipeline_config.llm.timeout_seconds:
|
||||||
|
raise TimeoutError(
|
||||||
|
f"Request took {elapsed_time:.2f}s, exceeding {self.pipeline_config.llm.timeout_seconds}s timeout"
|
||||||
|
)
|
||||||
|
|
||||||
|
annotation = NameAnnotation.model_validate_json(response.message.content)
|
||||||
|
result = {
|
||||||
|
**annotation.model_dump(),
|
||||||
|
"annotated": 1,
|
||||||
|
"processing_time": elapsed_time,
|
||||||
|
"attempts": attempt + 1,
|
||||||
|
}
|
||||||
|
|
||||||
|
self.successful_requests += 1
|
||||||
|
if attempt > 0:
|
||||||
|
self.total_retry_attempts += attempt
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
except (ValidationError, TimeoutError, Exception) as e:
|
||||||
|
logging.warning(
|
||||||
|
f"Error analyzing '{name}' (attempt {attempt + 1}/{self.pipeline_config.llm.retry_attempts}): {e}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Exponential backoff with jitter
|
||||||
|
if attempt < self.pipeline_config.llm.retry_attempts - 1:
|
||||||
|
wait_time = (2**attempt) + (time.time() % 1)
|
||||||
|
time.sleep(min(wait_time, 10))
|
||||||
|
|
||||||
|
self.failed_requests += 1
|
||||||
|
return {
|
||||||
|
"identified_name": None,
|
||||||
|
"identified_surname": None,
|
||||||
|
"annotated": 0,
|
||||||
|
"processing_time": 0,
|
||||||
|
"attempts": self.pipeline_config.llm.retry_attempts,
|
||||||
|
"failed": True,
|
||||||
|
}
|
||||||
|
|
||||||
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
|
"""Process batch with LLM annotation"""
|
||||||
|
unannotated_mask = batch.get("annotated", 0) == 0
|
||||||
|
unannotated_entries = batch[unannotated_mask]
|
||||||
|
|
||||||
|
if unannotated_entries.empty:
|
||||||
|
logging.info(f"Batch {batch_id}: No entries to annotate")
|
||||||
|
return batch
|
||||||
|
|
||||||
|
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries")
|
||||||
|
|
||||||
|
batch = batch.copy()
|
||||||
|
client = ollama.Client()
|
||||||
|
|
||||||
|
# Process with controlled concurrency
|
||||||
|
max_workers = self.pipeline_config.llm.max_concurrent_requests
|
||||||
|
|
||||||
|
if len(unannotated_entries) == 1 or max_workers == 1:
|
||||||
|
# Sequential processing
|
||||||
|
for idx, row in unannotated_entries.iterrows():
|
||||||
|
result = self.analyze_name_with_retry(client, row["name"], idx)
|
||||||
|
for field, value in result.items():
|
||||||
|
if field not in ["failed"]:
|
||||||
|
batch.loc[idx, field] = value
|
||||||
|
else:
|
||||||
|
# Concurrent processing
|
||||||
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||||
|
future_to_idx = {}
|
||||||
|
|
||||||
|
for idx, row in unannotated_entries.iterrows():
|
||||||
|
future = executor.submit(self.analyze_name_with_retry, client, row["name"], idx)
|
||||||
|
future_to_idx[future] = idx
|
||||||
|
|
||||||
|
for future in as_completed(future_to_idx):
|
||||||
|
idx = future_to_idx[future]
|
||||||
|
try:
|
||||||
|
result = future.result()
|
||||||
|
for field, value in result.items():
|
||||||
|
if field not in ["failed"]:
|
||||||
|
batch.loc[idx, field] = value
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Failed to process row {idx}: {e}")
|
||||||
|
batch.loc[idx, "annotated"] = 0
|
||||||
|
|
||||||
|
# Ensure proper data types
|
||||||
|
batch["annotated"] = (
|
||||||
|
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
|
||||||
|
)
|
||||||
|
|
||||||
|
return batch
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
import ollama
|
|
||||||
from pydantic import BaseModel
|
|
||||||
|
|
||||||
from misc import load_prompt
|
|
||||||
|
|
||||||
|
|
||||||
class NameAnalysis(BaseModel):
|
|
||||||
identified_name: str | None
|
|
||||||
identified_surname: str | None
|
|
||||||
|
|
||||||
|
|
||||||
name = input("Enter name: ")
|
|
||||||
|
|
||||||
client = ollama.Client()
|
|
||||||
response = client.chat(
|
|
||||||
model="mistral:7b",
|
|
||||||
messages=[
|
|
||||||
{"role": "system", "content": load_prompt()},
|
|
||||||
{"role": "user", "content": name}
|
|
||||||
],
|
|
||||||
format=NameAnalysis.model_json_schema()
|
|
||||||
)
|
|
||||||
analysis = NameAnalysis.model_validate_json(response.message.content)
|
|
||||||
result = analysis.model_dump()
|
|
||||||
|
|
||||||
print(result)
|
|
||||||
+126
-1
@@ -1,53 +1,178 @@
|
|||||||
absl-py==2.3.0
|
absl-py==2.3.0
|
||||||
|
altair==5.1.2
|
||||||
|
annotated-types==0.7.0
|
||||||
|
anyio==4.9.0
|
||||||
|
appnope==0.1.4
|
||||||
|
argon2-cffi==25.1.0
|
||||||
|
argon2-cffi-bindings==21.2.0
|
||||||
|
arrow==1.3.0
|
||||||
|
asttokens==3.0.0
|
||||||
astunparse==1.6.3
|
astunparse==1.6.3
|
||||||
|
async-lru==2.0.5
|
||||||
|
attrs==25.3.0
|
||||||
|
babel==2.17.0
|
||||||
|
beautifulsoup4==4.13.4
|
||||||
|
black==25.1.0
|
||||||
|
bleach==6.2.0
|
||||||
|
blinker==1.9.0
|
||||||
|
cachetools==6.1.0
|
||||||
certifi==2025.6.15
|
certifi==2025.6.15
|
||||||
|
cffi==1.17.1
|
||||||
charset-normalizer==3.4.2
|
charset-normalizer==3.4.2
|
||||||
|
click==8.2.1
|
||||||
|
comm==0.2.2
|
||||||
contourpy==1.3.2
|
contourpy==1.3.2
|
||||||
cycler==0.12.1
|
cycler==0.12.1
|
||||||
|
debugpy==1.8.14
|
||||||
|
decorator==5.2.1
|
||||||
|
defusedxml==0.7.1
|
||||||
|
executing==2.2.0
|
||||||
|
fastjsonschema==2.21.1
|
||||||
|
flake8==7.3.0
|
||||||
flatbuffers==25.2.10
|
flatbuffers==25.2.10
|
||||||
fonttools==4.58.4
|
fonttools==4.58.4
|
||||||
|
fqdn==1.5.1
|
||||||
gast==0.6.0
|
gast==0.6.0
|
||||||
|
gitdb==4.0.12
|
||||||
|
GitPython==3.1.45
|
||||||
google-pasta==0.2.0
|
google-pasta==0.2.0
|
||||||
grpcio==1.73.0
|
grpcio==1.73.0
|
||||||
|
h11==0.16.0
|
||||||
h5py==3.14.0
|
h5py==3.14.0
|
||||||
|
httpcore==1.0.9
|
||||||
|
httpx==0.28.1
|
||||||
idna==3.10
|
idna==3.10
|
||||||
|
imbalanced-learn==0.13.0
|
||||||
|
ipykernel==6.29.5
|
||||||
|
ipython==9.4.0
|
||||||
|
ipython_pygments_lexers==1.1.1
|
||||||
|
isoduration==20.11.0
|
||||||
|
jedi==0.19.2
|
||||||
|
Jinja2==3.1.6
|
||||||
joblib==1.5.1
|
joblib==1.5.1
|
||||||
|
json5==0.12.0
|
||||||
|
jsonpointer==3.0.0
|
||||||
|
jsonschema==4.24.0
|
||||||
|
jsonschema-specifications==2025.4.1
|
||||||
|
jupyter-events==0.12.0
|
||||||
|
jupyter-lsp==2.2.5
|
||||||
|
jupyter_client==8.6.3
|
||||||
|
jupyter_core==5.8.1
|
||||||
|
jupyter_server==2.16.0
|
||||||
|
jupyter_server_terminals==0.5.3
|
||||||
|
jupyterlab==4.4.4
|
||||||
|
jupyterlab_pygments==0.3.0
|
||||||
|
jupyterlab_server==2.27.3
|
||||||
keras==3.10.0
|
keras==3.10.0
|
||||||
kiwisolver==1.4.8
|
kiwisolver==1.4.8
|
||||||
libclang==18.1.1
|
libclang==18.1.1
|
||||||
|
lightgbm==4.6.0
|
||||||
Markdown==3.8.2
|
Markdown==3.8.2
|
||||||
markdown-it-py==3.0.0
|
markdown-it-py==3.0.0
|
||||||
MarkupSafe==3.0.2
|
MarkupSafe==3.0.2
|
||||||
matplotlib==3.10.3
|
matplotlib==3.10.3
|
||||||
|
matplotlib-inline==0.1.7
|
||||||
|
mccabe==0.7.0
|
||||||
mdurl==0.1.2
|
mdurl==0.1.2
|
||||||
|
mistune==3.1.3
|
||||||
ml-dtypes==0.3.2
|
ml-dtypes==0.3.2
|
||||||
|
mypy==1.17.0
|
||||||
|
mypy_extensions==1.1.0
|
||||||
namex==0.1.0
|
namex==0.1.0
|
||||||
|
narwhals==2.0.1
|
||||||
|
nbclient==0.10.2
|
||||||
|
nbconvert==7.16.6
|
||||||
|
nbformat==5.10.4
|
||||||
|
nest-asyncio==1.6.0
|
||||||
|
nltk==3.9.1
|
||||||
|
notebook==7.4.4
|
||||||
|
notebook_shim==0.2.4
|
||||||
numpy==1.26.4
|
numpy==1.26.4
|
||||||
|
ollama==0.5.1
|
||||||
opt_einsum==3.4.0
|
opt_einsum==3.4.0
|
||||||
optree==0.16.0
|
optree==0.16.0
|
||||||
|
overrides==7.7.0
|
||||||
packaging==25.0
|
packaging==25.0
|
||||||
pandas==2.3.0
|
pandas==2.3.0
|
||||||
|
pandocfilters==1.5.1
|
||||||
|
parso==0.8.4
|
||||||
|
pathspec==0.12.1
|
||||||
|
pexpect==4.9.0
|
||||||
pillow==11.2.1
|
pillow==11.2.1
|
||||||
|
platformdirs==4.3.8
|
||||||
|
plotly==6.2.0
|
||||||
|
prometheus_client==0.22.1
|
||||||
|
prompt_toolkit==3.0.51
|
||||||
protobuf==4.25.8
|
protobuf==4.25.8
|
||||||
|
psutil==7.0.0
|
||||||
|
ptyprocess==0.7.0
|
||||||
|
pure_eval==0.2.3
|
||||||
|
pyarrow==21.0.0
|
||||||
|
pycodestyle==2.14.0
|
||||||
|
pycparser==2.22
|
||||||
|
pydantic==2.11.7
|
||||||
|
pydantic_core==2.33.2
|
||||||
|
pydeck==0.9.1
|
||||||
|
pyflakes==3.4.0
|
||||||
Pygments==2.19.1
|
Pygments==2.19.1
|
||||||
pyparsing==3.2.3
|
pyparsing==3.2.3
|
||||||
python-dateutil==2.9.0.post0
|
python-dateutil==2.9.0.post0
|
||||||
|
python-json-logger==3.3.0
|
||||||
pytz==2025.2
|
pytz==2025.2
|
||||||
|
PyYAML==6.0.2
|
||||||
|
pyzmq==27.0.0
|
||||||
|
referencing==0.36.2
|
||||||
|
regex==2024.11.6
|
||||||
requests==2.32.4
|
requests==2.32.4
|
||||||
|
rfc3339-validator==0.1.4
|
||||||
|
rfc3986-validator==0.1.1
|
||||||
rich==14.0.0
|
rich==14.0.0
|
||||||
scikit-learn==1.7.0
|
rpds-py==0.26.0
|
||||||
|
scikit-learn==1.6.1
|
||||||
scipy==1.15.3
|
scipy==1.15.3
|
||||||
seaborn==0.13.2
|
seaborn==0.13.2
|
||||||
|
Send2Trash==1.8.3
|
||||||
six==1.17.0
|
six==1.17.0
|
||||||
|
sklearn-compat==0.1.3
|
||||||
|
smmap==5.0.2
|
||||||
|
sniffio==1.3.1
|
||||||
|
soupsieve==2.7
|
||||||
|
stack-data==0.6.3
|
||||||
|
streamlit==1.47.1
|
||||||
|
tenacity==9.1.2
|
||||||
tensorboard==2.16.2
|
tensorboard==2.16.2
|
||||||
tensorboard-data-server==0.7.2
|
tensorboard-data-server==0.7.2
|
||||||
tensorflow==2.16.2
|
tensorflow==2.16.2
|
||||||
tensorflow-io-gcs-filesystem==0.37.1
|
tensorflow-io-gcs-filesystem==0.37.1
|
||||||
termcolor==3.1.0
|
termcolor==3.1.0
|
||||||
|
terminado==0.18.1
|
||||||
threadpoolctl==3.6.0
|
threadpoolctl==3.6.0
|
||||||
|
tinycss2==1.4.0
|
||||||
|
toml==0.10.2
|
||||||
|
toolz==1.0.0
|
||||||
|
tornado==6.5.1
|
||||||
tqdm==4.67.1
|
tqdm==4.67.1
|
||||||
|
traitlets==5.14.3
|
||||||
|
types-python-dateutil==2.9.0.20250516
|
||||||
|
types-PyYAML==6.0.12.20250516
|
||||||
|
typing-inspection==0.4.1
|
||||||
typing_extensions==4.14.0
|
typing_extensions==4.14.0
|
||||||
tzdata==2025.2
|
tzdata==2025.2
|
||||||
|
uri-template==1.3.0
|
||||||
urllib3==2.5.0
|
urllib3==2.5.0
|
||||||
|
wcwidth==0.2.13
|
||||||
|
webcolors==24.11.1
|
||||||
|
webencodings==0.5.1
|
||||||
|
websocket-client==1.8.0
|
||||||
Werkzeug==3.1.3
|
Werkzeug==3.1.3
|
||||||
wrapt==1.17.2
|
wrapt==1.17.2
|
||||||
|
xgboost==3.0.3
|
||||||
|
scikit-learn~=1.6.1
|
||||||
|
ollama~=0.5.1
|
||||||
|
pydantic~=2.11.7
|
||||||
|
streamlit~=1.47.1
|
||||||
|
plotly~=6.2.0
|
||||||
|
altair==5.1.2
|
||||||
|
PyYAML~=6.0.2
|
||||||
|
xgboost~=3.0.3
|
||||||
|
lightgbm~=4.6.0
|
||||||
|
|||||||
@@ -0,0 +1,250 @@
|
|||||||
|
import logging
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Dict, Any, Optional, List
|
||||||
|
|
||||||
|
import joblib
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from research.experiment import ExperimentConfig
|
||||||
|
|
||||||
|
|
||||||
|
class BaseModel(ABC):
    """Abstract base class for all models.

    Subclasses implement the architecture-specific pieces (feature
    preparation, fitting, cross-validation, learning-curve generation)
    while this base class provides shared prediction, persistence and
    plotting helpers.
    """

    def __init__(self, config: ExperimentConfig):
        self.config = config
        self.model = None  # Underlying estimator; set by fit()
        self.feature_extractor = None  # Set by subclasses during fit()
        self.label_encoder = None  # Maps class labels <-> integer indices
        self.tokenizer = None  # For neural models
        self.is_fitted = False
        self.training_history = {}  # Store training history for learning curves
        self.learning_curve_data = {}  # Store learning curve experiment data

    @property
    @abstractmethod
    def architecture(self) -> str:
        """Return the architecture type: 'neural_network', 'traditional', or 'ensemble'"""
        pass

    @abstractmethod
    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Prepare features for training/prediction"""
        pass

    @abstractmethod
    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the model - implemented differently for each architecture"""
        pass

    @abstractmethod
    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> Dict[str, float]:
        """Perform cross-validation and return average scores"""
        pass

    @abstractmethod
    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model.

        Args:
            X: Raw input dataframe.
            y: Target labels.
            train_sizes: Optional fractions of the training set to evaluate;
                subclasses pick a default when None. (Was an implicit
                Optional ``List[float] = None`` before.)
        """
        pass

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Make predictions and return decoded (original) class labels.

        Raises:
            ValueError: If called before the model has been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        predictions = self.model.predict(X_prepared)

        # Neural networks return per-class probabilities (2-D output);
        # collapse to the most likely class index before decoding.
        if hasattr(predictions, "shape") and len(predictions.shape) > 1:
            predictions = predictions.argmax(axis=1)

        return self.label_encoder.inverse_transform(predictions)

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """Get prediction probabilities if supported.

        Raises:
            ValueError: If called before the model has been fitted.
            NotImplementedError: If the underlying model cannot produce
                class probabilities.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        if hasattr(self.model, "predict_proba"):
            return self.model.predict_proba(X_prepared)
        elif hasattr(self.model, "predict"):
            # For neural networks that return probabilities directly
            probabilities = self.model.predict(X_prepared)
            if len(probabilities.shape) == 2 and probabilities.shape[1] > 1:
                return probabilities

        raise NotImplementedError("Model does not support probability predictions")

    def get_feature_importance(self) -> Optional[Dict[str, float]]:
        """Get feature importance if supported by the model.

        Returns a feature-name -> importance mapping, or None when the
        underlying model exposes no importance information.
        """
        if hasattr(self.model, "feature_importances_"):
            # For tree-based models
            importances = self.model.feature_importances_
            feature_names = self._get_feature_names()
            return dict(zip(feature_names, importances))

        elif hasattr(self.model, "coef_"):
            # For linear models; absolute coefficient magnitude is used
            # as a proxy for importance.
            coefficients = np.abs(self.model.coef_[0])
            feature_names = self._get_feature_names()
            return dict(zip(feature_names, coefficients))

        elif hasattr(self.model, "named_steps") and "classifier" in self.model.named_steps:
            # For sklearn pipelines (like LogisticRegression with vectorizer)
            classifier = self.model.named_steps["classifier"]
            if hasattr(classifier, "coef_"):
                coefficients = np.abs(classifier.coef_[0])
                if hasattr(self.model.named_steps["vectorizer"], "get_feature_names_out"):
                    feature_names = self.model.named_steps["vectorizer"].get_feature_names_out()
                    # Take top features to avoid too many n-grams
                    top_indices = np.argsort(coefficients)[-20:]
                    return dict(zip(feature_names[top_indices], coefficients[top_indices]))

        return None

    def _get_feature_names(self) -> List[str]:
        """Get feature names (override in subclasses if needed)"""
        if hasattr(self.model, "feature_names_in_"):
            return list(self.model.feature_names_in_)
        return [f"feature_{i}" for i in range(100)]  # Default fallback

    def save(self, path: str):
        """Save the complete model state (estimator, encoders, config,
        training history) to *path* via joblib."""
        model_data = {
            "model": self.model,
            "feature_extractor": self.feature_extractor,
            "label_encoder": self.label_encoder,
            "tokenizer": self.tokenizer,
            "config": self.config.to_dict(),
            "is_fitted": self.is_fitted,
            "training_history": self.training_history,
            "learning_curve_data": self.learning_curve_data,
        }
        joblib.dump(model_data, path)

    @classmethod
    def load(cls, path: str) -> "BaseModel":
        """Load a saved model with training history from *path*."""
        model_data = joblib.load(path)

        # Local import to avoid a circular dependency at module load time.
        from research.experiment import ExperimentConfig

        config = ExperimentConfig.from_dict(model_data["config"])
        instance = cls(config)

        # Restore state; .get() keeps loading older artifacts that lack
        # the optional keys.
        instance.model = model_data["model"]
        instance.feature_extractor = model_data["feature_extractor"]
        instance.label_encoder = model_data["label_encoder"]
        instance.tokenizer = model_data.get("tokenizer")
        instance.is_fitted = model_data["is_fitted"]
        instance.training_history = model_data.get("training_history", {})
        instance.learning_curve_data = model_data.get("learning_curve_data", {})

        return instance

    def plot_learning_curve(self, save_path: Optional[str] = None) -> str:
        """Plot the learning curve; save to *save_path* when given.

        Returns the save path, or "" when no data is available or the
        plot was only shown interactively.
        """
        if not self.learning_curve_data:
            logging.warning("No learning curve data available")
            return ""

        plt.figure(figsize=(10, 6))

        data = self.learning_curve_data
        train_sizes = data["train_sizes"]
        train_scores = data["train_scores"]
        val_scores = data["val_scores"]
        # Std-dev bands are optional; default to flat zero bands.
        train_std = data.get("train_scores_std", [0] * len(train_sizes))
        val_std = data.get("val_scores_std", [0] * len(train_sizes))

        # Plot learning curves with +/- 1 std shaded bands
        plt.plot(train_sizes, train_scores, "o-", color="blue", label="Training Score")
        plt.fill_between(
            train_sizes,
            np.array(train_scores) - np.array(train_std),
            np.array(train_scores) + np.array(train_std),
            alpha=0.1,
            color="blue",
        )

        plt.plot(train_sizes, val_scores, "o-", color="red", label="Validation Score")
        plt.fill_between(
            train_sizes,
            np.array(val_scores) - np.array(val_std),
            np.array(val_scores) + np.array(val_std),
            alpha=0.1,
            color="red",
        )

        plt.xlabel("Training Set Size")
        plt.ylabel("Accuracy Score")
        plt.title(f"Learning Curve - {self.__class__.__name__}")
        plt.legend(loc="best")
        plt.grid(True, alpha=0.3)
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close()
            return save_path
        else:
            plt.show()
            return ""

    def plot_training_history(self, save_path: Optional[str] = None) -> str:
        """Plot per-epoch accuracy/loss history for neural networks.

        Returns the save path, or "" when no history is available or the
        plot was only shown interactively.
        """
        if not self.training_history:
            logging.warning("No training history available")
            return ""

        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # Plot accuracy
        if "accuracy" in self.training_history:
            axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
            if "val_accuracy" in self.training_history:
                axes[0].plot(self.training_history["val_accuracy"], label="Validation Accuracy")
            axes[0].set_title("Model Accuracy")
            axes[0].set_xlabel("Epoch")
            axes[0].set_ylabel("Accuracy")
            axes[0].legend()
            axes[0].grid(True, alpha=0.3)

        # Plot loss
        if "loss" in self.training_history:
            axes[1].plot(self.training_history["loss"], label="Training Loss")
            if "val_loss" in self.training_history:
                axes[1].plot(self.training_history["val_loss"], label="Validation Loss")
            axes[1].set_title("Model Loss")
            axes[1].set_xlabel("Epoch")
            axes[1].set_ylabel("Loss")
            axes[1].legend()
            axes[1].grid(True, alpha=0.3)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close()
            return save_path
        else:
            plt.show()
            return ""
|
||||||
@@ -0,0 +1,91 @@
|
|||||||
|
from dataclasses import dataclass, field, asdict
|
||||||
|
from enum import Enum
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
|
||||||
|
|
||||||
|
from .feature_extractor import FeatureType
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExperimentConfig:
    """Configuration for a single experiment"""

    # Experiment metadata
    name: str
    description: str = ""
    tags: List[str] = field(default_factory=list)

    # Model configuration
    model_type: str = "logistic_regression"  # logistic_regression, lstm, transformer, etc.
    model_params: Dict[str, Any] = field(default_factory=dict)

    # Feature configuration
    features: List[FeatureType] = field(default_factory=lambda: [FeatureType.FULL_NAME])
    feature_params: Dict[str, Any] = field(default_factory=dict)

    # Data configuration
    train_data_filter: Optional[Dict[str, Any]] = None  # Filter criteria for training data
    test_data_filter: Optional[Dict[str, Any]] = None
    target_column: str = "sex"

    # Training configuration
    test_size: float = 0.2
    random_seed: int = 42
    cross_validation_folds: int = 5

    # Evaluation configuration
    metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"])

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization"""
        result = asdict(self)
        # Convert enums to their string values so the dict is JSON-safe
        result["features"] = [f.value for f in self.features]
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentConfig":
        """Create from dictionary.

        The input mapping is copied before enum conversion so the
        caller's dict is not mutated (previously ``data["features"]``
        was rebound in place).
        """
        data = dict(data)
        if "features" in data:
            data["features"] = [FeatureType(f) for f in data["features"]]
        return cls(**data)
|
||||||
|
|
||||||
|
|
||||||
|
class ExperimentStatus(Enum):
    """Lifecycle states an experiment moves through during execution."""

    PENDING = "pending"  # Created, not yet started
    RUNNING = "running"  # Currently executing
    COMPLETED = "completed"  # Finished successfully
    FAILED = "failed"  # Raised an error during execution
    CANCELLED = "cancelled"  # Aborted before completion
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_metrics(
    y_true: np.ndarray, y_pred: np.ndarray, metrics: Optional[List[str]] = None
) -> Dict[str, float]:
    """Calculate the specified classification metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        metrics: Metric names to compute; defaults to
            ["accuracy", "precision", "recall", "f1"]. (The parameter was
            previously an implicit-Optional ``List[str] = None``.)

    Returns:
        A mapping of metric name to score. Precision/recall/f1 use
        weighted averaging across classes.
    """
    if metrics is None:
        metrics = ["accuracy", "precision", "recall", "f1"]

    results = {}

    if "accuracy" in metrics:
        results["accuracy"] = accuracy_score(y_true, y_pred)

    # precision_recall_fscore_support computes all three at once, so run
    # it a single time when any of them was requested.
    if any(m in metrics for m in ["precision", "recall", "f1"]):
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="weighted"
        )

        if "precision" in metrics:
            results["precision"] = precision
        if "recall" in metrics:
            results["recall"] = recall
        if "f1" in metrics:
            results["f1"] = f1

    return results
|
||||||
@@ -0,0 +1,56 @@
|
|||||||
|
from dataclasses import dataclass, field, asdict
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional, Dict, List, Any
|
||||||
|
|
||||||
|
from research.experiment import ExperimentConfig, ExperimentStatus
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExperimentResult:
    """Results from an experiment execution"""

    experiment_id: str
    config: ExperimentConfig

    # Execution metadata
    start_time: datetime
    end_time: Optional[datetime] = None
    status: ExperimentStatus = ExperimentStatus.PENDING
    error_message: Optional[str] = None

    # Model artifacts
    model_path: Optional[str] = None
    feature_extractor_path: Optional[str] = None

    # Metrics
    train_metrics: Dict[str, float] = field(default_factory=dict)
    test_metrics: Dict[str, float] = field(default_factory=dict)
    cv_metrics: Dict[str, float] = field(default_factory=dict)

    # Additional results
    confusion_matrix: Optional[List[List[int]]] = None
    feature_importance: Optional[Dict[str, float]] = None
    prediction_examples: Optional[List[Dict]] = None

    # Data statistics
    train_size: int = 0
    test_size: int = 0
    class_distribution: Dict[str, int] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization"""
        result = asdict(self)
        # Replace non-JSON-serializable fields with primitive forms
        result["config"] = self.config.to_dict()
        result["start_time"] = self.start_time.isoformat()
        result["end_time"] = self.end_time.isoformat() if self.end_time else None
        result["status"] = self.status.value
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentResult":
        """Create from dictionary.

        The input mapping is copied before conversion so the caller's
        dict is not mutated (previously the config/time/status keys were
        rebound in place).
        """
        data = dict(data)
        data["config"] = ExperimentConfig.from_dict(data["config"])
        data["start_time"] = datetime.fromisoformat(data["start_time"])
        data["end_time"] = datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
        data["status"] = ExperimentStatus(data["status"])
        return cls(**data)
|
||||||
@@ -0,0 +1,123 @@
|
|||||||
|
from typing import List
|
||||||
|
|
||||||
|
from research.experiment import ExperimentConfig
|
||||||
|
from research.experiment.feature_extractor import FeatureType
|
||||||
|
|
||||||
|
|
||||||
|
class ExperimentBuilder:
    """Helper class to build experiment configurations.

    Every method is a factory returning a list of ready-to-run
    ``ExperimentConfig`` objects for a particular study design.
    """

    @staticmethod
    def create_baseline_experiments() -> List[ExperimentConfig]:
        """Create a set of baseline experiments for comparison"""
        return [
            # Full name experiments
            ExperimentConfig(
                name="baseline_logistic_regression_fullname",
                description="Logistic regression with full name",
                model_type="logistic_regression",
                features=[FeatureType.FULL_NAME],
                tags=["baseline", "fullname"],
            ),
            # Native name only
            ExperimentConfig(
                name="baseline_logistic_regression_native",
                description="Logistic regression with native name only",
                model_type="logistic_regression",
                features=[FeatureType.NATIVE_NAME],
                tags=["baseline", "native"],
            ),
            # Surname only
            ExperimentConfig(
                name="baseline_logistic_regression_surname",
                description="Logistic regression with surname only",
                model_type="logistic_regression",
                features=[FeatureType.SURNAME],
                tags=["baseline", "surname"],
            ),
            # Random Forest with engineered features
            ExperimentConfig(
                name="baseline_rf_engineered",
                description="Random Forest with engineered features",
                model_type="random_forest",
                features=[FeatureType.NAME_LENGTH, FeatureType.WORD_COUNT, FeatureType.PROVINCE],
                tags=["baseline", "engineered"],
            ),
        ]

    @staticmethod
    def create_feature_ablation_study() -> List[ExperimentConfig]:
        """Create experiments for feature ablation study.

        For each base feature, one experiment is built with that feature
        removed, to measure its individual contribution.
        """
        base_features = [
            FeatureType.FULL_NAME,
            FeatureType.NAME_LENGTH,
            FeatureType.WORD_COUNT,
            FeatureType.PROVINCE,
        ]

        experiments = []

        # Test removing each feature one by one
        # (the previous enumerate() index was unused)
        for feature_to_remove in base_features:
            remaining_features = [f for f in base_features if f != feature_to_remove]

            experiments.append(
                ExperimentConfig(
                    name=f"ablation_remove_{feature_to_remove.value}",
                    description=f"Ablation study: removed {feature_to_remove.value}",
                    model_type="logistic_regression",
                    features=remaining_features,
                    tags=["ablation", feature_to_remove.value],
                )
            )

        return experiments

    @staticmethod
    def create_name_component_study() -> List[ExperimentConfig]:
        """Create experiments to study different name components"""
        experiments = []

        name_components = [
            (FeatureType.FIRST_WORD, "first_word"),
            (FeatureType.LAST_WORD, "last_word"),
            (FeatureType.NATIVE_NAME, "native_name"),
            (FeatureType.SURNAME, "surname"),
            (FeatureType.NAME_BEGINNINGS, "name_beginnings"),
            (FeatureType.NAME_ENDINGS, "name_endings"),
        ]

        for feature, name in name_components:
            experiments.append(
                ExperimentConfig(
                    name=f"component_study_{name}",
                    description=f"Study of {name} for gender prediction",
                    model_type="logistic_regression",
                    features=[feature],
                    tags=["component_study", name],
                )
            )

        return experiments

    @staticmethod
    def create_province_specific_study() -> List[ExperimentConfig]:
        """Create experiments for province-specific analysis"""
        provinces = ["kinshasa", "bas-congo", "bandundu", "katanga"]  # Add more as needed

        experiments = []

        for province in provinces:
            experiments.append(
                ExperimentConfig(
                    name=f"province_study_{province}",
                    description=f"Gender prediction for {province} province only",
                    model_type="logistic_regression",
                    features=[FeatureType.FULL_NAME],
                    train_data_filter={"province": province},
                    tags=["province_study", province],
                )
            )

        return experiments
|
||||||
@@ -0,0 +1,238 @@
|
|||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.metrics import confusion_matrix
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
from core.config import PipelineConfig
|
||||||
|
from core.utils import get_data_file_path
|
||||||
|
from core.utils.data_loader import DataLoader
|
||||||
|
from research.base_model import BaseModel
|
||||||
|
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
|
||||||
|
from research.experiment.experiment_tracker import ExperimentTracker
|
||||||
|
from research.model_registry import create_model
|
||||||
|
|
||||||
|
|
||||||
|
class ExperimentRunner:
    """Runs and manages experiments.

    Orchestrates the full lifecycle of an experiment: data loading and
    filtering, train/test split, model training, evaluation, artifact
    persistence and result tracking via ``ExperimentTracker``.
    """

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.tracker = ExperimentTracker(self.config)
        self.data_loader = DataLoader(self.config)

    def run_experiment(self, experiment_config: ExperimentConfig) -> str:
        """Run a single experiment and return its experiment ID.

        On failure the experiment is marked FAILED in the tracker and
        the original exception is re-raised.
        """
        # Create experiment (registers it with PENDING status)
        experiment_id = self.tracker.create_experiment(experiment_config)

        try:
            logging.info(f"Starting experiment: {experiment_id}")
            self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)

            # Load data
            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
            df = self.data_loader.load_csv_complete(data_path)

            # Apply data filters if specified
            df = self._apply_data_filters(df, experiment_config)

            # Prepare target variable; the full dataframe is passed on as
            # features and narrowed later by the model's feature extractor.
            y = df[experiment_config.target_column]
            X = df

            # Split data (stratified to keep class balance in both splits)
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=experiment_config.test_size,
                random_state=experiment_config.random_seed,
                stratify=y,
            )

            # Create and train model
            model = create_model(experiment_config)
            model.fit(X_train, y_train)

            # Make predictions
            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)

            # Calculate metrics
            train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics)
            test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics)

            # Cross-validation if requested
            cv_metrics = {}
            if experiment_config.cross_validation_folds > 1:
                cv_metrics = model.cross_validate(
                    X_train, y_train, experiment_config.cross_validation_folds
                )

            # Additional analysis
            conf_matrix = confusion_matrix(y_test, test_pred).tolist()
            feature_importance = model.get_feature_importance()

            # Create prediction examples
            prediction_examples = self._create_prediction_examples(
                X_test, y_test, test_pred, model, n_examples=10
            )

            # Calculate class distribution
            class_distribution = y.value_counts().to_dict()

            # Save model
            model_path = self._save_model(model, experiment_id)

            # Update experiment with results
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.COMPLETED,
                end_time=datetime.now(),
                model_path=str(model_path),
                train_metrics=train_metrics,
                test_metrics=test_metrics,
                cv_metrics=cv_metrics,
                confusion_matrix=conf_matrix,
                feature_importance=feature_importance,
                prediction_examples=prediction_examples,
                train_size=len(X_train),
                test_size=len(X_test),
                class_distribution=class_distribution,
            )

            logging.info(f"Experiment {experiment_id} completed successfully")
            # BUG FIX: the previous f"{...get('accuracy', 'N/A'):.4f}"
            # raised ValueError whenever "accuracy" was not among the
            # requested metrics, because the string fallback 'N/A' cannot
            # take a .4f format spec.
            accuracy = test_metrics.get("accuracy")
            if accuracy is not None:
                logging.info(f"Test accuracy: {accuracy:.4f}")
            else:
                logging.info("Test accuracy: N/A")

            return experiment_id

        except Exception as e:
            logging.error(f"Experiment {experiment_id} failed: {str(e)}")
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.FAILED,
                end_time=datetime.now(),
                error_message=str(e),
            )
            raise

    def run_experiment_batch(self, experiments: List[ExperimentConfig]) -> List[str]:
        """Run multiple experiments, skipping over individual failures.

        Returns the IDs of the experiments that ran successfully.
        """
        experiment_ids = []

        for i, config in enumerate(experiments):
            logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}")
            try:
                exp_id = self.run_experiment(config)
                experiment_ids.append(exp_id)
            except Exception as e:
                # Keep going: one failing config should not abort the batch.
                logging.error(f"Failed to run experiment {config.name}: {e}")
                continue

        return experiment_ids

    @classmethod
    def _apply_data_filters(cls, df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame:
        """Apply data filters specified in experiment config.

        Filter criteria per column may be: a list (membership), a dict
        with "min"/"max" bounds, or a scalar (equality). Unknown columns
        are silently ignored.
        """
        filtered_df = df.copy()

        # Apply training data filters
        if config.train_data_filter:
            for column, criteria in config.train_data_filter.items():
                if column in filtered_df.columns:
                    if isinstance(criteria, list):
                        filtered_df = filtered_df[filtered_df[column].isin(criteria)]
                    elif isinstance(criteria, dict):
                        if "min" in criteria:
                            filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
                        if "max" in criteria:
                            filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
                    else:
                        filtered_df = filtered_df[filtered_df[column] == criteria]

        return filtered_df

    @classmethod
    def _create_prediction_examples(
        cls,
        X_test: pd.DataFrame,
        y_test: pd.Series,
        predictions: np.ndarray,
        model: BaseModel,
        n_examples: int = 10,
    ) -> List[Dict]:
        """Create prediction examples for analysis.

        Samples up to n_examples rows, drawing half from misclassified
        and half from correctly classified predictions.
        """
        examples = []

        # Get both correct and incorrect predictions
        correct_mask = y_test == predictions
        incorrect_indices = X_test[~correct_mask].index[: n_examples // 2]
        correct_indices = X_test[correct_mask].index[: n_examples // 2]

        sample_indices = list(incorrect_indices) + list(correct_indices)

        for idx in sample_indices[:n_examples]:
            # positional offset of this label-index row in the prediction array
            pos = X_test.index.get_loc(idx)
            example = {
                "name": X_test.loc[idx, "name"] if "name" in X_test.columns else "N/A",
                "true_label": y_test.loc[idx],
                "predicted_label": predictions[pos],
                "correct": y_test.loc[idx] == predictions[pos],
            }

            # Add probability if available (traditional models expose
            # predict_proba reliably; neural outputs may not)
            if model.architecture == "traditional":
                proba = model.predict_proba(X_test.loc[[idx]])
                example["prediction_confidence"] = float(proba.max())

            examples.append(example)

        return examples

    def _save_model(self, model: BaseModel, experiment_id: str) -> Path:
        """Save a trained model under the experiment's artifact directory."""
        model_dir = self.config.paths.models_dir / "experiments" / experiment_id
        model_dir.mkdir(parents=True, exist_ok=True)

        model_path = model_dir / "model.joblib"
        model.save(str(model_path))

        return model_path

    def load_experiment_model(self, experiment_id: str) -> Optional[BaseModel]:
        """Load a model from a completed experiment, or None if absent."""
        experiment = self.tracker.get_experiment(experiment_id)

        if experiment and experiment.model_path:
            return BaseModel.load(experiment.model_path)

        return None

    def compare_experiments(
        self, experiment_ids: List[str], metric: str = "accuracy"
    ) -> pd.DataFrame:
        """Compare experiments, sorted descending by the test metric."""
        comparison_df = self.tracker.compare_experiments(experiment_ids)

        if f"test_{metric}" in comparison_df.columns:
            comparison_df = comparison_df.sort_values(f"test_{metric}", ascending=False)

        return comparison_df

    def get_feature_analysis(self, experiment_id: str) -> Optional[pd.DataFrame]:
        """Get feature importance analysis for an experiment.

        Returns a dataframe sorted by descending importance, or None when
        the experiment has no recorded feature importance.
        """
        experiment = self.tracker.get_experiment(experiment_id)

        if experiment and experiment.feature_importance:
            importance_df = pd.DataFrame(
                [
                    {"feature": feature, "importance": importance}
                    for feature, importance in experiment.feature_importance.items()
                ]
            )
            return importance_df.sort_values("importance", ascending=False)

        return None
|
||||||
@@ -0,0 +1,194 @@
|
|||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Dict, List
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from core.config import PipelineConfig, get_config
|
||||||
|
|
||||||
|
from research.experiment import ExperimentConfig, ExperimentStatus
|
||||||
|
from research.experiment.experiement_result import ExperimentResult
|
||||||
|
|
||||||
|
|
||||||
|
class ExperimentTracker:
|
||||||
|
"""Tracks and manages experiments"""
|
||||||
|
|
||||||
|
def __init__(self, config: Optional[PipelineConfig] = None):
|
||||||
|
self.config = config or get_config()
|
||||||
|
self.experiments_dir = self.config.paths.outputs_dir / "experiments"
|
||||||
|
self.experiments_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
self.results_db_path = self.experiments_dir / "experiments.json"
|
||||||
|
self._results: Dict[str, ExperimentResult] = {}
|
||||||
|
self._load_results()
|
||||||
|
|
||||||
|
def _load_results(self):
|
||||||
|
"""Load existing experiment results"""
|
||||||
|
if self.results_db_path.exists():
|
||||||
|
try:
|
||||||
|
with open(self.results_db_path, "r") as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
for exp_id, exp_data in data.items():
|
||||||
|
self._results[exp_id] = ExperimentResult.from_dict(exp_data)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Warning: Failed to load experiment results: {e}")
|
||||||
|
|
||||||
|
def _save_results(self):
|
||||||
|
"""Save experiment results to disk"""
|
||||||
|
data = {exp_id: result.to_dict() for exp_id, result in self._results.items()}
|
||||||
|
|
||||||
|
with open(self.results_db_path, "w") as f:
|
||||||
|
json.dump(data, f, indent=2, default=str)
|
||||||
|
|
||||||
|
def create_experiment(self, config: ExperimentConfig) -> str:
|
||||||
|
"""Create a new experiment and return its ID"""
|
||||||
|
# Generate experiment ID
|
||||||
|
config_hash = hashlib.md5(
|
||||||
|
json.dumps(config.to_dict(), sort_keys=True).encode()
|
||||||
|
).hexdigest()[:8]
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
experiment_id = f"{config.name}_{timestamp}_{config_hash}"
|
||||||
|
|
||||||
|
# Create result object
|
||||||
|
result = ExperimentResult(
|
||||||
|
experiment_id=experiment_id, config=config, start_time=datetime.now()
|
||||||
|
)
|
||||||
|
|
||||||
|
self._results[experiment_id] = result
|
||||||
|
self._save_results()
|
||||||
|
|
||||||
|
return experiment_id
|
||||||
|
|
||||||
|
def update_experiment(self, experiment_id: str, **updates):
    """Apply attribute updates to a tracked experiment and persist them.

    Unknown experiment IDs are ignored silently; keys that the result
    object does not define are skipped.
    """
    result = self._results.get(experiment_id)
    if result is None:
        return
    for key, value in updates.items():
        # Only touch attributes the result object actually defines.
        if hasattr(result, key):
            setattr(result, key, value)
    self._save_results()
def get_experiment(self, experiment_id: str) -> Optional[ExperimentResult]:
    """Look up a tracked experiment; None when the ID is unknown."""
    try:
        return self._results[experiment_id]
    except KeyError:
        return None
def list_experiments(
    self,
    status: Optional[ExperimentStatus] = None,
    tags: Optional[List[str]] = None,
    model_type: Optional[str] = None,
) -> List[ExperimentResult]:
    """Return tracked experiments, newest first, optionally filtered.

    A filter is applied only when its argument is truthy; tag filtering
    keeps experiments that share at least one requested tag.
    """

    def keep(r) -> bool:
        if status and r.status != status:
            return False
        if tags and not any(tag in r.config.tags for tag in tags):
            return False
        if model_type and r.config.model_type != model_type:
            return False
        return True

    matched = [r for r in self._results.values() if keep(r)]
    matched.sort(key=lambda x: x.start_time, reverse=True)
    return matched
def get_best_experiment(
    self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
) -> Optional[ExperimentResult]:
    """Return the completed experiment maximizing *metric*, or None.

    ``dataset`` selects between test metrics ("test") and train metrics
    (anything else). ``filters`` may restrict by "model_type" (exact
    match) and "features" (any overlap). Ties keep the first candidate
    in listing order (newest first).
    """
    candidates = self.list_experiments()

    effective_filters = filters or {}
    if "model_type" in effective_filters:
        wanted_type = effective_filters["model_type"]
        candidates = [e for e in candidates if e.config.model_type == wanted_type]
    if "features" in effective_filters:
        wanted_features = effective_filters["features"]
        candidates = [
            e for e in candidates if any(f in e.config.features for f in wanted_features)
        ]

    # Running-max scan over completed experiments exposing the metric.
    best = None
    best_score = None
    for exp in candidates:
        if exp.status != ExperimentStatus.COMPLETED:
            continue
        metrics_dict = exp.test_metrics if dataset == "test" else exp.train_metrics
        if metric not in metrics_dict:
            continue
        score = metrics_dict[metric]
        if best is None or score > best_score:
            best, best_score = exp, score

    return best
def compare_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
    """Build a side-by-side DataFrame for the given experiments.

    Unknown IDs are skipped. Test and cross-validation metrics are
    flattened into ``test_<metric>`` / ``cv_<metric>`` columns.
    """
    rows = []
    for exp_id in experiment_ids:
        exp = self.get_experiment(exp_id)
        if not exp:
            continue
        row = dict(
            experiment_id=exp_id,
            name=exp.config.name,
            model_type=exp.config.model_type,
            features=",".join([f.value for f in exp.config.features]),
            status=exp.status.value,
            train_size=exp.train_size,
            test_size=exp.test_size,
        )
        row.update({f"test_{metric}": value for metric, value in exp.test_metrics.items()})
        row.update({f"cv_{metric}": value for metric, value in exp.cv_metrics.items()})
        rows.append(row)

    return pd.DataFrame(rows)
def export_results(self, output_path: Optional[Path] = None) -> Path:
    """Dump every tracked experiment (with metrics) to CSV; return the path.

    When no path is given, a timestamped file is created under the
    experiments directory.
    """
    if output_path is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = self.experiments_dir / f"experiments_export_{stamp}.csv"

    rows = []
    for exp in self._results.values():
        row = {
            "experiment_id": exp.experiment_id,
            "name": exp.config.name,
            "description": exp.config.description,
            "model_type": exp.config.model_type,
            "features": ",".join([f.value for f in exp.config.features]),
            "status": exp.status.value,
            "start_time": exp.start_time.isoformat(),
            "end_time": exp.end_time.isoformat() if exp.end_time else None,
            "train_size": exp.train_size,
            "test_size": exp.test_size,
        }
        # Flatten metric dicts into test_<metric> / cv_<metric> columns.
        row.update({f"test_{metric}": value for metric, value in exp.test_metrics.items()})
        row.update({f"cv_{metric}": value for metric, value in exp.cv_metrics.items()})
        rows.append(row)

    pd.DataFrame(rows).to_csv(output_path, index=False)
    return output_path
@@ -0,0 +1,90 @@
|
|||||||
|
from enum import Enum
|
||||||
|
from typing import List, Dict, Any, Union
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureType(Enum):
    """Types of features that can be extracted from names"""

    # Raw text features
    FULL_NAME = "full_name"
    NATIVE_NAME = "native_name"
    SURNAME = "surname"
    FIRST_WORD = "first_word"
    LAST_WORD = "last_word"
    # Numeric features
    NAME_LENGTH = "name_length"
    WORD_COUNT = "word_count"
    # Categorical features
    PROVINCE = "province"
    # Text features vectorized downstream by the model
    CHAR_NGRAMS = "char_ngrams"
    WORD_NGRAMS = "word_ngrams"
    # Prefix/suffix slices of the name
    NAME_ENDINGS = "name_endings"
    NAME_BEGINNINGS = "name_beginnings"


class FeatureExtractor:
    """Extract different types of features from name data"""

    def __init__(self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None):
        # feature_params tunes individual extractors (ending_length, beginning_length).
        self.feature_types = feature_types
        self.feature_params = feature_params or {}

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract all configured features into one frame indexed like *df*.

        Series-valued extractors become a single column named after the
        feature type; frame-valued extractors contribute all their columns.
        """
        features_df = pd.DataFrame(index=df.index)
        for feature_type in self.feature_types:
            extracted = self._extract_single_feature(df, feature_type)
            if isinstance(extracted, pd.DataFrame):
                features_df = pd.concat([features_df, extracted], axis=1)
            else:
                features_df[feature_type.value] = extracted
        return features_df

    def _extract_single_feature(
        self, df: pd.DataFrame, feature_type: FeatureType
    ) -> Union[pd.Series, pd.DataFrame]:
        """Extract a single type of feature via a dispatch table."""
        end_len = self.feature_params.get("ending_length", 3)
        begin_len = self.feature_params.get("beginning_length", 3)

        # Each feature type maps to a pure transformation of *df*.
        # N-gram features are vectorized downstream by the model; pass raw text.
        extractors = {
            FeatureType.FULL_NAME: lambda: df["name"].fillna(""),
            FeatureType.NATIVE_NAME: lambda: df["identified_name"]
            .fillna(df["probable_native"])
            .fillna(""),
            FeatureType.SURNAME: lambda: df["identified_surname"]
            .fillna(df["probable_surname"])
            .fillna(""),
            FeatureType.FIRST_WORD: lambda: df["name"].str.split().str[0].fillna(""),
            FeatureType.LAST_WORD: lambda: df["name"].str.split().str[-1].fillna(""),
            FeatureType.NAME_LENGTH: lambda: df["name"].str.len().fillna(0),
            FeatureType.WORD_COUNT: lambda: df["words"].fillna(1),
            FeatureType.PROVINCE: lambda: df["province"].fillna("unknown"),
            FeatureType.NAME_ENDINGS: lambda: df["name"].str[-end_len:].fillna(""),
            FeatureType.NAME_BEGINNINGS: lambda: df["name"].str[:begin_len].fillna(""),
            FeatureType.CHAR_NGRAMS: lambda: df["name"].fillna(""),
            FeatureType.WORD_NGRAMS: lambda: df["name"].fillna(""),
        }

        extractor = extractors.get(feature_type)
        if extractor is None:
            raise ValueError(f"Unknown feature type: {feature_type}")
        return extractor()
@@ -0,0 +1,44 @@
|
|||||||
|
from typing import List
|
||||||
|
|
||||||
|
from research.base_model import BaseModel
|
||||||
|
from research.experiment import ExperimentConfig
|
||||||
|
from research.models.bigru_model import BiGRUModel
|
||||||
|
from research.models.cnn_model import CNNModel
|
||||||
|
from research.models.ensemble_model import EnsembleModel
|
||||||
|
from research.models.lightgbm_model import LightGBMModel
|
||||||
|
from research.models.logistic_regression_model import LogisticRegressionModel
|
||||||
|
from research.models.lstm_model import LSTMModel
|
||||||
|
from research.models.naive_bayes_model import NaiveBayesModel
|
||||||
|
from research.models.random_forest_model import RandomForestModel
|
||||||
|
from research.models.svm_model import SVMModel
|
||||||
|
from research.models.transformer_model import TransformerModel
|
||||||
|
from research.models.xgboost_model import XGBoostModel
|
||||||
|
|
||||||
|
# Central mapping from model-type identifier (the value used in
# ExperimentConfig.model_type) to its concrete model class.
# create_model() resolves through this table; list_available_models()
# exposes its keys.
MODEL_REGISTRY = {
    "bigru": BiGRUModel,
    "cnn": CNNModel,
    "ensemble": EnsembleModel,
    "lightgbm": LightGBMModel,
    "logistic_regression": LogisticRegressionModel,
    "lstm": LSTMModel,
    "naive_bayes": NaiveBayesModel,
    "random_forest": RandomForestModel,
    "svm": SVMModel,
    "transformer": TransformerModel,
    "xgboost": XGBoostModel,
}
def create_model(config: ExperimentConfig) -> BaseModel:
    """Factory function to create models.

    Looks up ``config.model_type`` in MODEL_REGISTRY and instantiates the
    matching class with the full experiment configuration.

    Raises:
        ValueError: if the model type is not registered. The message lists
            the valid options so typos are easy to spot.
    """
    model_class = MODEL_REGISTRY.get(config.model_type)

    if model_class is None:
        available = ", ".join(sorted(MODEL_REGISTRY))
        raise ValueError(
            f"Unknown model type: {config.model_type}. Available types: {available}"
        )

    return model_class(config)
def list_available_models() -> List[str]:
    """Return the identifiers of every registered model type."""
    # Iterating the registry yields its keys directly.
    return [name for name in MODEL_REGISTRY]
@@ -0,0 +1,281 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from core.config import get_config
|
||||||
|
from core.utils import get_data_file_path
|
||||||
|
from core.utils.data_loader import DataLoader
|
||||||
|
from research.experiment import FeatureType, ExperimentConfig
|
||||||
|
from research.experiment.experiment_runner import ExperimentRunner
|
||||||
|
from research.experiment.experiment_tracker import ExperimentTracker
|
||||||
|
|
||||||
|
|
||||||
|
class ModelTrainer:
    """Comprehensive model training and artifact management.

    Orchestrates the experiment runner/tracker to train one or more models
    and persists every artifact (serialized model, configuration, results,
    learning-curve plots/data, metadata index) under
    ``<models_dir>/<experiment_id>/`` so models can be listed and reloaded.
    """

    def __init__(self, config=None):
        # Fall back to the global pipeline configuration when none is supplied.
        self.config = config or get_config()
        self.data_loader = DataLoader(self.config)
        self.experiment_runner = ExperimentRunner(self.config)
        self.experiment_tracker = ExperimentTracker(self.config)
        self.logger = logging.getLogger(__name__)

        # Root directory where per-experiment artifact folders are created.
        self.models_dir = self.config.paths.models_dir
        self.models_dir.mkdir(parents=True, exist_ok=True)

    def train_single_model(
        self,
        model_name: str,
        model_type: str = "logistic_regression",
        features: List[str] = None,
        model_params: Dict[str, Any] = None,
        save_artifacts: bool = True,
    ) -> str:
        """
        Train a single model and save its artifacts.

        Args:
            model_name: Human-readable name embedded in the experiment ID.
            model_type: Registry key for the model class.
            features: FeatureType value strings; defaults to ["full_name"].
            model_params: Hyper-parameters forwarded to the model.
            save_artifacts: Persist artifacts after training when True.

        Returns the experiment ID.
        """
        self.logger.info(f"Training {model_type} model: {model_name}")

        if features is None:
            features = ["full_name"]
        # Raises ValueError early on an unrecognized feature name.
        feature_types = [FeatureType(f) for f in features]

        # Create experiment configuration
        config = ExperimentConfig(
            name=model_name,
            description=f"Training {model_type} model with features: {', '.join(features)}",
            model_type=model_type,
            features=feature_types,
            model_params=model_params or {},
            tags=["training", model_type],
        )

        # Run experiment
        experiment_id = self.experiment_runner.run_experiment(config)
        experiment = self.experiment_tracker.get_experiment(experiment_id)

        if experiment and experiment.test_metrics:
            self.logger.info("Training completed successfully!")
            self.logger.info(f" Experiment ID: {experiment_id}")
            self.logger.info(f" Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}")
            self.logger.info(f" Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")

        if save_artifacts:
            self.save_model_artifacts(experiment_id)

        return experiment_id

    def train_multiple_models(
        self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
    ) -> List[str]:
        """
        Train multiple models with different configurations.

        Each entry needs a "model_type" and may carry "features" and
        "model_params". Failures are logged and skipped so one bad
        configuration does not abort the batch.

        Returns the experiment IDs of the successfully trained models.
        """
        self.logger.info(f"Training {len(model_configs)} models...")

        experiment_ids = []

        for i, config in enumerate(model_configs):
            model_name = f"{base_name}_{config['model_type']}_{i + 1}"

            try:
                exp_id = self.train_single_model(
                    model_name=model_name,
                    model_type=config["model_type"],
                    features=config.get("features", ["full_name"]),
                    model_params=config.get("model_params", {}),
                    save_artifacts=save_all,
                )
                experiment_ids.append(exp_id)

            except Exception as e:
                self.logger.error(f"Failed to train {model_name}: {e}")
                continue

        self.logger.info(f"Completed training {len(experiment_ids)} models successfully")
        return experiment_ids

    def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]:
        """
        Save model artifacts in a structured way for easy loading.

        Writes the serialized model, its configuration, experiment results,
        best-effort learning-curve/training-history plots and data, and a
        metadata index under ``<models_dir>/<experiment_id>/``.

        Returns paths to saved artifacts.

        Raises:
            ValueError: if the experiment or its trained model is not found.
        """
        experiment = self.experiment_tracker.get_experiment(experiment_id)
        if not experiment:
            raise ValueError(f"Experiment {experiment_id} not found")

        # Create model-specific directory
        model_dir = self.models_dir / experiment_id
        model_dir.mkdir(parents=True, exist_ok=True)

        # Load the trained model
        trained_model = self.experiment_runner.load_experiment_model(experiment_id)
        if not trained_model:
            raise ValueError(f"Could not load model for experiment {experiment_id}")

        # Save complete model with joblib
        model_path = model_dir / "complete_model.joblib"
        trained_model.save(str(model_path))

        # Save model configuration (json is imported at module level; the
        # previous nested "import json" here was redundant and was removed)
        config_path = model_dir / "model_config.json"
        with open(config_path, "w") as f:
            json.dump(experiment.config.to_dict(), f, indent=2)

        # Save experiment results; default=str stringifies datetimes/enums
        results_path = model_dir / "experiment_results.json"
        with open(results_path, "w") as f:
            json.dump(experiment.to_dict(), f, indent=2, default=str)

        # Generate and save learning curves. Best-effort: a plotting failure
        # must not discard the model artifacts already written above.
        learning_curve_path = None
        training_history_path = None

        try:
            # Load data for learning curve generation
            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
            if data_path.exists():
                df = self.data_loader.load_csv_complete(data_path)

                # Generate learning curve
                self.logger.info("Generating learning curve...")
                trained_model.generate_learning_curve(df, df[experiment.config.target_column])

                # Plot and save learning curve
                learning_curve_path = model_dir / "learning_curve.png"
                trained_model.plot_learning_curve(str(learning_curve_path))

                # Plot and save training history (for neural networks)
                if trained_model.training_history:
                    training_history_path = model_dir / "training_history.png"
                    trained_model.plot_training_history(str(training_history_path))

                # Save learning curve data as JSON
                learning_data_path = model_dir / "learning_curve_data.json"
                with open(learning_data_path, "w") as f:
                    json.dump(trained_model.learning_curve_data, f, indent=2)

                # Save training history data as JSON
                if trained_model.training_history:
                    history_data_path = model_dir / "training_history_data.json"
                    with open(history_data_path, "w") as f:
                        json.dump(trained_model.training_history, f, indent=2)

        except Exception as e:
            self.logger.warning(f"Could not generate learning curves: {e}")

        # Save artifacts metadata (the index consumed by list_saved_models)
        metadata = {
            "experiment_id": experiment_id,
            "model_name": experiment.config.name,
            "model_type": experiment.config.model_type,
            "features": [f.value for f in experiment.config.features],
            "training_date": datetime.now().isoformat(),
            "test_accuracy": experiment.test_metrics.get("accuracy", 0),
            "test_f1": experiment.test_metrics.get("f1", 0),
            "model_path": str(model_path),
            "config_path": str(config_path),
            "results_path": str(results_path),
            "learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
            "training_history_plot": str(training_history_path) if training_history_path else None,
            "has_learning_curve": bool(trained_model.learning_curve_data),
            "has_training_history": bool(trained_model.training_history),
        }

        metadata_path = model_dir / "metadata.json"
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=2)

        self.logger.info(f"Model artifacts saved to: {model_dir}")
        self.logger.info(f" - Complete model: {model_path.name}")
        self.logger.info(f" - Configuration: {config_path.name}")
        self.logger.info(f" - Results: {results_path.name}")
        self.logger.info(f" - Metadata: {metadata_path.name}")

        if learning_curve_path and learning_curve_path.exists():
            self.logger.info(f" - Learning curve: {learning_curve_path.name}")

        if training_history_path and training_history_path.exists():
            self.logger.info(f" - Training history: {training_history_path.name}")

        return {
            "model_dir": str(model_dir),
            "model_path": str(model_path),
            "config_path": str(config_path),
            "results_path": str(results_path),
            "metadata_path": str(metadata_path),
            "learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
            "training_history_plot": str(training_history_path) if training_history_path else None,
        }

    def load_trained_model(self, experiment_id: str):
        """
        Load a previously trained model from artifacts.

        Resolves the concrete model class from the saved metadata via the
        model registry, then deserializes the complete model.

        Raises:
            FileNotFoundError: if no artifacts exist for the experiment.
        """
        model_dir = self.models_dir / experiment_id
        model_path = model_dir / "complete_model.joblib"

        if not model_path.exists():
            raise FileNotFoundError(f"Model artifacts not found for experiment {experiment_id}")

        # Load the model class dynamically
        metadata_path = model_dir / "metadata.json"
        with open(metadata_path, "r") as f:
            metadata = json.load(f)

        model_type = metadata["model_type"]
        # NOTE(review): imported lazily, presumably to avoid a circular import
        # between trainer and registry modules — confirm before hoisting.
        from research.model_registry import MODEL_REGISTRY

        model_class = MODEL_REGISTRY[model_type]

        # Load the complete model
        loaded_model = model_class.load(str(model_path))

        self.logger.info(f"Loaded model: {metadata['model_name']}")
        self.logger.info(f" Type: {model_type}")
        self.logger.info(f" Accuracy: {metadata['test_accuracy']:.4f}")

        return loaded_model

    def list_saved_models(self) -> pd.DataFrame:
        """
        List all saved model artifacts.

        Scans every subdirectory of the models directory for a
        metadata.json and returns a summary DataFrame sorted by training
        date, newest first; empty DataFrame when nothing has been saved.
        """
        models_data = []

        for model_dir in self.models_dir.iterdir():
            if model_dir.is_dir():
                metadata_path = model_dir / "metadata.json"
                if metadata_path.exists():
                    try:
                        with open(metadata_path, "r") as f:
                            metadata = json.load(f)
                        models_data.append(metadata)
                    except Exception as e:
                        self.logger.warning(f"Could not read metadata for {model_dir.name}: {e}")

        if not models_data:
            self.logger.info("No saved models found.")
            return pd.DataFrame()

        df = pd.DataFrame(models_data)

        # Only show the summary columns that are actually present.
        display_columns = [
            "model_name",
            "model_type",
            "features",
            "test_accuracy",
            "test_f1",
            "training_date",
        ]
        available_columns = [col for col in display_columns if col in df.columns]

        return df[available_columns].sort_values("training_date", ascending=False)
@@ -0,0 +1,56 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
|
||||||
|
from tensorflow.keras.models import Sequential
|
||||||
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||||
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||||
|
|
||||||
|
from research.neural_network_model import NeuralNetworkModel
|
||||||
|
|
||||||
|
|
||||||
|
class BiGRUModel(NeuralNetworkModel):
    """Bidirectional GRU model for name classification"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        """Assemble and compile the stacked bidirectional-GRU Keras model.

        Hyper-parameters read from **kwargs: embedding_dim (64), gru_units
        (32), dropout (0.2 inside the GRUs, 0.5 before the output head).
        """
        embedding_dim = kwargs.get("embedding_dim", 64)
        gru_units = kwargs.get("gru_units", 32)
        rnn_dropout = kwargs.get("dropout", 0.2)
        head_dropout = kwargs.get("dropout", 0.5)

        layers = [
            Embedding(input_dim=vocab_size, output_dim=embedding_dim),
            # First BiGRU returns sequences so a second BiGRU can stack on top.
            Bidirectional(GRU(gru_units, return_sequences=True, dropout=rnn_dropout)),
            Bidirectional(GRU(gru_units, dropout=rnn_dropout)),
            Dense(64, activation="relu"),
            Dropout(head_dropout),
            # Two-way softmax; labels are integer-encoded.
            Dense(2, activation="softmax"),
        ]
        model = Sequential(layers)

        model.compile(
            loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        return model

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Tokenize the configured text columns into padded word-index sequences."""
        text_data = []
        for feature_type in self.config.features:
            column = feature_type.value
            if column in X.columns:
                text_data.extend(X[column].astype(str).tolist())

        if not text_data:
            raise ValueError("No text data found in the provided DataFrame.")

        # Fit the word-level tokenizer once and reuse it on later calls so the
        # vocabulary stays stable between training and inference.
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(text_data)

        # Only the first len(X) entries correspond row-wise to X when several
        # feature columns were concatenated above.
        sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
        max_len = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=max_len, padding="post")
@@ -0,0 +1,75 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from tensorflow.keras.layers import (
|
||||||
|
Embedding,
|
||||||
|
Conv1D,
|
||||||
|
MaxPooling1D,
|
||||||
|
GlobalMaxPooling1D,
|
||||||
|
Dense,
|
||||||
|
Dropout,
|
||||||
|
)
|
||||||
|
from tensorflow.keras.models import Sequential
|
||||||
|
|
||||||
|
from research.neural_network_model import NeuralNetworkModel
|
||||||
|
|
||||||
|
|
||||||
|
class CNNModel(NeuralNetworkModel):
    """1D Convolutional Neural Network for character patterns"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 20, **kwargs) -> Any:
        """Build CNN model with known vocabulary size.

        Hyper-parameters read from **kwargs: embedding_dim (64), filters
        (64), kernel_size (3), dropout (0.5).
        """
        embedding_dim = kwargs.get("embedding_dim", 64)
        n_filters = kwargs.get("filters", 64)
        kernel = kwargs.get("kernel_size", 3)

        model = Sequential(
            [
                Embedding(input_dim=vocab_size, output_dim=embedding_dim),
                # Two conv stages: local character patterns, then wider motifs.
                Conv1D(filters=n_filters, kernel_size=kernel, activation="relu"),
                MaxPooling1D(pool_size=2),
                Conv1D(filters=n_filters, kernel_size=kernel, activation="relu"),
                GlobalMaxPooling1D(),
                Dense(64, activation="relu"),
                Dropout(kwargs.get("dropout", 0.5)),
                # Two-way softmax; labels are integer-encoded.
                Dense(2, activation="softmax"),
            ]
        )

        model.compile(
            loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        return model

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Prepare padded character-level sequences from the extracted feature columns."""
        from tensorflow.keras.preprocessing.sequence import pad_sequences
        from tensorflow.keras.preprocessing.text import Tokenizer

        # Collect text from every configured feature column present in X.
        text_data = []
        for feature_type in self.config.features:
            column = feature_type.value
            if column in X.columns:
                text_data.extend(X[column].astype(str).tolist())

        if not text_data:
            # Defensive fallback; should not happen when FeatureExtractor ran first.
            text_data = [""] * len(X)

        # Character-level tokenizer is fitted once and reused across calls.
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=True, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(text_data)

        # Only the first len(X) entries correspond row-wise to X.
        sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
        max_len = self.config.model_params.get("max_len", 20)  # longer for character level

        return pad_sequences(sequences, maxlen=max_len, padding="post")
@@ -0,0 +1,97 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
|
||||||
|
from research.experiment import ExperimentConfig
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class EnsembleModel(TraditionalModel):
    """Ensemble model combining multiple base models via scikit-learn voting."""

    @property
    def architecture(self) -> str:
        """Return the architecture type"""
        return "ensemble"

    def __init__(self, config: ExperimentConfig):
        super().__init__(config)
        self.base_models = []      # kept for API compatibility
        self.model_weights = None  # reserved for weighted voting

    def build_model(self) -> BaseEstimator:
        """Build a VotingClassifier over the configured base model types.

        Base types are read from model_params["base_models"]; supported
        values are "logistic_regression", "random_forest" and
        "naive_bayes" — unrecognized entries are silently skipped
        (matching prior behavior). Each base model is a self-contained
        text-vectorizer + classifier pipeline.
        """
        params = self.config.model_params
        base_model_types = params.get(
            "base_models", ["logistic_regression", "random_forest", "naive_bayes"]
        )

        estimators = []
        for model_type in base_model_types:
            if model_type == "logistic_regression":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            CountVectorizer(analyzer="char", ngram_range=(2, 4), max_features=5000),
                        ),
                        (
                            "classifier",
                            LogisticRegression(max_iter=1000, random_state=self.config.random_seed),
                        ),
                    ]
                )
                # Fixed: names were pointlessly written as f-strings with no placeholders.
                estimators.append(("logistic_regression", model))

            elif model_type == "random_forest":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            TfidfVectorizer(analyzer="char", ngram_range=(2, 3), max_features=3000),
                        ),
                        (
                            "classifier",
                            RandomForestClassifier(
                                n_estimators=50, random_state=self.config.random_seed
                            ),
                        ),
                    ]
                )
                estimators.append(("rf", model))

            elif model_type == "naive_bayes":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            CountVectorizer(analyzer="char", ngram_range=(1, 3), max_features=4000),
                        ),
                        ("classifier", MultinomialNB()),
                    ]
                )
                estimators.append(("nb", model))

        # 'soft' averages predicted probabilities; 'hard' majority-votes labels.
        voting_type = params.get("voting", "soft")
        return VotingClassifier(estimators=estimators, voting=voting_type)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Concatenate the configured text feature columns into one string per row.

        Raises:
            ValueError: when none of the configured feature columns exist in
                *X* (previously this crashed with an opaque IndexError;
                message matches the neural models for consistency).
        """
        text_features = [
            X[feature_type.value].astype(str)
            for feature_type in self.config.features
            if feature_type.value in X.columns
        ]

        if not text_features:
            raise ValueError("No text data found in the provided DataFrame.")

        if len(text_features) == 1:
            return text_features[0].values

        combined = text_features[0].astype(str)
        for feature in text_features[1:]:
            combined = combined + " " + feature.astype(str)
        return combined.values
@@ -0,0 +1,51 @@
|
|||||||
|
import lightgbm as lgb
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class LightGBMModel(TraditionalModel):
    """LightGBM with engineered features."""

    def build_model(self) -> BaseEstimator:
        """Instantiate an LGBMClassifier from model_params (with defaults)."""
        params = self.config.model_params

        return lgb.LGBMClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", -1),
            learning_rate=params.get("learning_rate", 0.1),
            num_leaves=params.get("num_leaves", 31),
            subsample=params.get("subsample", 0.8),
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
            verbose=-1,
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Build a dense numeric matrix from the configured feature columns.

        Numeric features pass through; short text features become char
        n-gram counts; everything else is label-encoded.

        Fix: the n-gram vectorizers and label encoders are now fitted on
        the first call and reused afterwards (mirroring the tokenizer
        caching in the neural models), so training and inference share one
        feature space. Previously each call re-fitted on its own data,
        silently producing incompatible columns between fit and predict.
        NOTE(review): with a cached LabelEncoder, categories unseen at fit
        time now raise at transform instead of being re-encoded
        inconsistently — confirm this is the desired failure mode.
        """
        # Created lazily so the base-class __init__ needs no changes.
        if not hasattr(self, "_feature_transformers"):
            self._feature_transformers = {}

        features = []

        for feature_type in self.config.features:
            name = feature_type.value
            if name not in X.columns:
                continue
            column = X[name]

            if name in ["name_length", "word_count"]:
                # Already numeric; just impute missing values.
                features.append(column.fillna(0).values.reshape(-1, 1))

            elif name in ["full_name", "native_name", "surname"]:
                # Character n-grams for text features
                texts = column.fillna("").astype(str)
                vectorizer = self._feature_transformers.get(name)
                if vectorizer is None:
                    vectorizer = CountVectorizer(
                        analyzer="char", ngram_range=(2, 3), max_features=50
                    )
                    vectorizer.fit(texts)
                    self._feature_transformers[name] = vectorizer
                features.append(vectorizer.transform(texts).toarray())

            else:
                # Categorical feature (e.g. province): integer-encode.
                values = column.fillna("unknown").astype(str)
                encoder = self._feature_transformers.get(name)
                if encoder is None:
                    encoder = LabelEncoder()
                    encoder.fit(values)
                    self._feature_transformers[name] = encoder
                features.append(encoder.transform(values).reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
@@ -0,0 +1,44 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class LogisticRegressionModel(TraditionalModel):
    """Logistic Regression with character n-grams"""

    def build_model(self) -> BaseEstimator:
        """Assemble a char n-gram CountVectorizer + LogisticRegression pipeline."""
        opts = self.config.model_params

        steps = [
            (
                "vectorizer",
                CountVectorizer(
                    analyzer="char",
                    ngram_range=opts.get("ngram_range", (2, 5)),
                    max_features=opts.get("max_features", 10000),
                ),
            ),
            (
                "classifier",
                LogisticRegression(
                    max_iter=opts.get("max_iter", 1000),
                    random_state=self.config.random_seed,
                ),
            ),
        ]
        return Pipeline(steps)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Join the configured text columns into one space-separated string per row."""
        selected = [
            X[ft.value].astype(str)
            for ft in self.config.features
            if ft.value in X.columns
        ]

        if len(selected) == 1:
            return selected[0].values

        merged = selected[0].astype(str)
        for series in selected[1:]:
            merged = merged + " " + series.astype(str)
        return merged.values
|
||||||
@@ -0,0 +1,52 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
|
||||||
|
from tensorflow.keras.models import Sequential
|
||||||
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||||
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||||
|
|
||||||
|
from research.neural_network_model import NeuralNetworkModel
|
||||||
|
|
||||||
|
|
||||||
|
class LSTMModel(NeuralNetworkModel):
    """LSTM model for sequence learning"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        """Build a stacked bidirectional-LSTM classifier for a known vocabulary size."""
        opts = kwargs
        embedding_dim = opts.get("embedding_dim", 64)
        lstm_units = opts.get("lstm_units", 32)

        network = Sequential()
        network.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
        # First recurrent layer keeps the full sequence so the second LSTM can consume it
        network.add(Bidirectional(LSTM(lstm_units, return_sequences=True)))
        network.add(Bidirectional(LSTM(lstm_units)))
        network.add(Dense(64, activation="relu"))
        network.add(Dense(2, activation="softmax"))

        network.compile(
            loss="sparse_categorical_crossentropy",
            optimizer="adam",
            metrics=["accuracy"],
        )
        return network

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Tokenize the configured text columns and pad them to fixed-length sequences."""
        corpus = []
        for ft in self.config.features:
            if ft.value in X.columns:
                corpus.extend(X[ft.value].astype(str).tolist())

        if not corpus:
            raise ValueError("No text data found in the provided DataFrame.")

        # Fit the word-level tokenizer once; later calls reuse the same vocabulary
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(corpus)

        # NOTE(review): with several feature columns, corpus[: len(X)] keeps only
        # the first column's rows — presumably intentional, but worth confirming.
        sequences = self.tokenizer.texts_to_sequences(corpus[: len(X)])
        pad_to = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=pad_to, padding="post")
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class NaiveBayesModel(TraditionalModel):
    """Multinomial Naive Bayes with character n-grams"""

    def build_model(self) -> BaseEstimator:
        """Assemble a char n-gram CountVectorizer + MultinomialNB pipeline."""
        opts = self.config.model_params

        return Pipeline(
            [
                (
                    "vectorizer",
                    CountVectorizer(
                        analyzer="char",
                        ngram_range=opts.get("ngram_range", (1, 4)),
                        max_features=opts.get("max_features", 8000),
                    ),
                ),
                ("classifier", MultinomialNB(alpha=opts.get("alpha", 1.0))),
            ]
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Join the configured text columns into one space-separated string per row."""
        selected = [
            X[ft.value].astype(str)
            for ft in self.config.features
            if ft.value in X.columns
        ]

        if len(selected) == 1:
            return selected[0].values

        merged = selected[0].astype(str)
        for series in selected[1:]:
            merged = merged + " " + series.astype(str)
        return merged.values
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class RandomForestModel(TraditionalModel):
    """Random Forest with engineered features"""

    def build_model(self) -> BaseEstimator:
        """Build a RandomForestClassifier configured from ``self.config.model_params``."""
        params = self.config.model_params

        return RandomForestClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", None),
            random_state=self.config.random_seed,
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Build the numeric design matrix from the configured feature columns.

        Numeric columns pass through; categorical columns are mapped to stable
        integer codes. Category maps are fitted on the first call and reused on
        later calls so training and prediction agree (previously a fresh
        LabelEncoder was fitted per call, so the same category could receive a
        different code at predict time and unseen categories raised an error).
        Unseen categories map to -1.
        """
        if not hasattr(self, "_category_maps"):
            self._category_maps = {}  # column name -> category->code dict

        features = []

        for feature_type in self.config.features:
            name = feature_type.value
            if name not in X.columns:
                continue
            column = X[name]

            if name in ["name_length", "word_count"]:
                # Numerical features; missing values become 0
                features.append(column.fillna(0).values.reshape(-1, 1))
            else:
                # Categorical features: stable sorted-unique codes (matches
                # what LabelEncoder produced on its first fit)
                values = column.fillna("unknown").astype(str)
                mapping = self._category_maps.get(name)
                if mapping is None:
                    mapping = {v: i for i, v in enumerate(sorted(set(values)))}
                    self._category_maps[name] = mapping
                encoded = values.map(lambda v: mapping.get(v, -1)).to_numpy()
                features.append(encoded.reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class SVMModel(TraditionalModel):
    """Support Vector Machine with character n-grams and RBF kernel"""

    def build_model(self) -> BaseEstimator:
        """Assemble a char n-gram TF-IDF + SVC pipeline (probabilities enabled)."""
        opts = self.config.model_params

        steps = [
            (
                "vectorizer",
                TfidfVectorizer(
                    analyzer="char",
                    ngram_range=opts.get("ngram_range", (2, 4)),
                    max_features=opts.get("max_features", 5000),
                ),
            ),
            (
                "classifier",
                SVC(
                    kernel=opts.get("kernel", "rbf"),
                    C=opts.get("C", 1.0),
                    gamma=opts.get("gamma", "scale"),
                    probability=True,  # Enable probability prediction
                    random_state=self.config.random_seed,
                ),
            ),
        ]
        return Pipeline(steps)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Join the configured text columns into one space-separated string per row."""
        selected = [
            X[ft.value].astype(str)
            for ft in self.config.features
            if ft.value in X.columns
        ]

        if len(selected) == 1:
            return selected[0].values

        merged = selected[0].astype(str)
        for series in selected[1:]:
            merged = merged + " " + series.astype(str)
        return merged.values
|
||||||
@@ -0,0 +1,82 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import tensorflow as tf
|
||||||
|
from tensorflow.keras.layers import (
|
||||||
|
Input,
|
||||||
|
Embedding,
|
||||||
|
Dense,
|
||||||
|
GlobalAveragePooling1D,
|
||||||
|
MultiHeadAttention,
|
||||||
|
Dropout,
|
||||||
|
LayerNormalization,
|
||||||
|
)
|
||||||
|
from tensorflow.keras.models import Model
|
||||||
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||||
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||||
|
|
||||||
|
from research.neural_network_model import NeuralNetworkModel
|
||||||
|
|
||||||
|
|
||||||
|
class TransformerModel(NeuralNetworkModel):
    """Transformer-based model"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        """Build a small Transformer-encoder classifier for a known vocabulary size."""
        opts = kwargs
        embed_dim = opts.get("embedding_dim", 64)

        inputs = Input(shape=(max_len,))
        token_embed = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)

        # Learned positional embedding, added to the token embedding
        positions = tf.range(start=0, limit=max_len, delta=1)
        position_embed = Embedding(input_dim=max_len, output_dim=embed_dim)(positions)
        hidden = token_embed + position_embed

        hidden = self._transformer_encoder(hidden, opts)
        hidden = GlobalAveragePooling1D()(hidden)
        hidden = Dense(32, activation="relu")(hidden)
        outputs = Dense(2, activation="softmax")(hidden)

        network = Model(inputs, outputs)
        network.compile(
            optimizer="adam",
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"],
        )
        return network

    @classmethod
    def _transformer_encoder(cls, x, cfg_params):
        """Transformer encoder block: self-attention then feed-forward, each with a residual."""
        dropout_rate = cfg_params.get("dropout", 0.1)

        attn = MultiHeadAttention(
            num_heads=cfg_params.get("transformer_num_heads", 2),
            key_dim=cfg_params.get("transformer_head_size", 64),
        )(x, x)
        x = LayerNormalization(epsilon=1e-6)(x + Dropout(dropout_rate)(attn))

        ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
        ff = Dense(x.shape[-1])(ff)
        return LayerNormalization(epsilon=1e-6)(x + Dropout(dropout_rate)(ff))

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Tokenize the configured text columns and pad them to fixed-length sequences."""
        corpus = []
        for ft in self.config.features:
            if ft.value in X.columns:
                corpus.extend(X[ft.value].astype(str).tolist())

        if not corpus:
            raise ValueError("No text data found in the provided DataFrame.")

        # Fit the tokenizer once; subsequent calls reuse the same vocabulary
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(oov_token="<OOV>")
            self.tokenizer.fit_on_texts(corpus)

        # NOTE(review): with several feature columns, corpus[: len(X)] keeps only
        # the first column's rows — presumably intentional, but worth confirming.
        sequences = self.tokenizer.texts_to_sequences(corpus[: len(X)])
        pad_to = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=pad_to, padding="post")
|
||||||
@@ -0,0 +1,52 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import xgboost as xgb
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class XGBoostModel(TraditionalModel):
    """XGBoost with engineered features and character embeddings"""

    def build_model(self) -> BaseEstimator:
        """Build an XGBClassifier configured from ``self.config.model_params``."""
        params = self.config.model_params

        return xgb.XGBClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", 6),
            learning_rate=params.get("learning_rate", 0.1),
            subsample=params.get("subsample", 0.8),
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
            eval_metric="logloss",
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Turn the configured feature columns into a single numeric matrix.

        Numeric columns pass through, name-like columns become character
        n-gram counts, and everything else is integer-encoded. Transformers
        are fitted on the first call and reused afterwards so that training
        and prediction share one feature space (previously every call
        re-fitted a fresh vectorizer/encoder, so the columns produced at
        predict time did not correspond to the ones seen during training).
        Unseen categorical values map to -1.
        """
        if not hasattr(self, "_column_transformers"):
            # column name -> fitted CountVectorizer or category->code dict
            self._column_transformers = {}

        features = []

        for feature_type in self.config.features:
            name = feature_type.value
            if name not in X.columns:
                continue
            column = X[name]

            if name in ["name_length", "word_count"]:
                # Numerical features; missing values become 0
                features.append(column.fillna(0).values.reshape(-1, 1))
            elif name in ["full_name", "native_name", "surname"]:
                # Character-level n-gram features for names
                text = column.fillna("").astype(str)
                vectorizer = self._column_transformers.get(name)
                if vectorizer is None:
                    vectorizer = CountVectorizer(
                        analyzer="char", ngram_range=(2, 3), max_features=100
                    )
                    vectorizer.fit(text)
                    self._column_transformers[name] = vectorizer
                features.append(vectorizer.transform(text).toarray())
            else:
                # Categorical features: stable sorted-unique codes (matches
                # what LabelEncoder produced on its first fit)
                values = column.fillna("unknown").astype(str)
                mapping = self._column_transformers.get(name)
                if mapping is None:
                    mapping = {v: i for i, v in enumerate(sorted(set(values)))}
                    self._column_transformers[name] = mapping
                encoded = values.map(lambda v: mapping.get(v, -1)).to_numpy()
                features.append(encoded.reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
|
||||||
@@ -0,0 +1,201 @@
|
|||||||
|
import logging
|
||||||
|
from abc import abstractmethod
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.metrics import accuracy_score
|
||||||
|
from sklearn.metrics import precision_recall_fscore_support
|
||||||
|
from sklearn.model_selection import StratifiedKFold
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
|
from research.base_model import BaseModel
|
||||||
|
from research.experiment.feature_extractor import FeatureExtractor
|
||||||
|
|
||||||
|
|
||||||
|
class NeuralNetworkModel(BaseModel):
    """Base class for neural network models (TensorFlow/Keras)"""

    @property
    def architecture(self) -> str:
        # Identifies this model family in experiment metadata
        return "neural_network"

    @abstractmethod
    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
        """Build neural network model with known vocabulary size"""
        pass

    def _build_fresh_model(self) -> Any:
        """Build an untrained network sized to the fitted tokenizer's vocabulary.

        Falls back to a vocabulary of 1000 when no tokenizer has been fitted
        yet (should not happen once prepare_features has run).
        """
        vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
        max_len = self.config.model_params.get("max_len", 6)
        return self.build_model_with_vocab(
            vocab_size=vocab_size, max_len=max_len, **self.config.model_params
        )

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the neural network model with deferred building.

        The network is built only after feature preparation because the
        embedding layer needs the tokenizer's vocabulary size.
        """
        logging.info(f"Training {self.__class__.__name__}")

        # Setup feature extraction
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )

        # Extract and prepare features (this also initializes the tokenizer)
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        # Encode labels (fit the encoder on first use, reuse afterwards)
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)

        # Vocabulary size is known now, so the model can finally be built
        self.model = self._build_fresh_model()

        # Train the neural network
        history = self.model.fit(
            X_prepared,
            y_encoded,
            epochs=self.config.model_params.get("epochs", 10),
            batch_size=self.config.model_params.get("batch_size", 64),
            validation_split=0.1,
            verbose=1,
        )

        # Store training history for later reporting
        self.training_history = {
            "accuracy": history.history["accuracy"],
            "loss": history.history["loss"],
            "val_accuracy": history.history.get("val_accuracy", []),
            "val_loss": history.history.get("val_loss", []),
        }

        self.is_fitted = True
        return self

    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> dict[str, np.floating[Any]]:
        """Stratified k-fold CV; a fresh network is trained for every fold."""
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
        y_encoded = self.label_encoder.transform(y)

        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed)

        accuracies = []
        precisions = []
        recalls = []
        f1_scores = []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)):
            # Fresh model for each fold. Neural subclasses implement
            # build_model_with_vocab (not build_model), so build via the helper;
            # the previous self.build_model() call had no neural implementation.
            fold_model = self._build_fresh_model()

            # Train on fold
            if hasattr(fold_model, "fit"):
                fold_model.fit(
                    X_prepared[train_idx],
                    y_encoded[train_idx],
                    epochs=self.config.model_params.get("epochs", 10),
                    batch_size=self.config.model_params.get("batch_size", 32),
                    verbose=0,
                )

            # Predict on validation (argmax over class probabilities)
            y_pred = fold_model.predict(X_prepared[val_idx])
            if len(y_pred.shape) > 1:
                y_pred = y_pred.argmax(axis=1)

            # Calculate metrics
            acc = accuracy_score(y_encoded[val_idx], y_pred)
            prec, rec, f1, _ = precision_recall_fscore_support(
                y_encoded[val_idx], y_pred, average="weighted"
            )

            accuracies.append(acc)
            precisions.append(prec)
            recalls.append(rec)
            f1_scores.append(f1)

        return {
            "accuracy": np.mean(accuracies),
            "accuracy_std": np.std(accuracies),
            "precision": np.mean(precisions),
            "precision_std": np.std(precisions),
            "recall": np.mean(recalls),
            "recall_std": np.std(recalls),
            "f1": np.mean(f1_scores),
            "f1_std": np.std(f1_scores),
        }

    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model.

        Fixes over the previous version: ``train_sizes=None`` now falls back to
        the same default grid TraditionalModel uses (instead of crashing with a
        TypeError when iterating None), and the data is converted to prepared
        feature arrays before splitting (Keras cannot consume the raw DataFrame,
        and positional indexing on a DataFrame/Series was incorrect).
        """
        logging.info(f"Generating learning curve for {self.__class__.__name__}")

        if train_sizes is None:
            train_sizes = [0.1, 0.25, 0.5, 0.75, 1.0]

        # Prepare features and labels once up front
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)

        learning_curve_data = {
            "train_sizes": [],
            "train_scores": [],
            "val_scores": [],
            "train_scores_std": [],
            "val_scores_std": [],
        }

        # Split the prepared arrays once for validation
        X_train_full, X_val, y_train_full, y_val = train_test_split(
            X_prepared,
            y_encoded,
            test_size=0.2,
            random_state=self.config.random_seed,
            stratify=y_encoded,
        )

        for size in train_sizes:
            train_size = int(len(X_train_full) * size)
            if train_size < 10:  # Minimum training size
                continue

            # Sample training data (arrays support positional indexing)
            indices = np.random.choice(len(X_train_full), train_size, replace=False)
            X_train_subset = X_train_full[indices]
            y_train_subset = y_train_full[indices]

            # Train multiple models for variance estimation
            train_scores = []
            val_scores = []

            for seed in range(3):  # 3 runs for variance
                # Build a fresh network for every run
                model = self._build_fresh_model()

                if hasattr(model, "fit"):
                    model.fit(
                        X_train_subset,
                        y_train_subset,
                        epochs=self.config.model_params.get("epochs", 10),
                        batch_size=self.config.model_params.get("batch_size", 32),
                        validation_data=(X_val, y_val),
                        verbose=0,
                    )

                # Evaluate (networks return class probabilities)
                train_pred = model.predict(X_train_subset)
                val_pred = model.predict(X_val)

                train_scores.append(accuracy_score(y_train_subset, train_pred.argmax(axis=1)))
                val_scores.append(accuracy_score(y_val, val_pred.argmax(axis=1)))

            learning_curve_data["train_sizes"].append(train_size)
            learning_curve_data["train_scores"].append(np.mean(train_scores))
            learning_curve_data["val_scores"].append(np.mean(val_scores))
            learning_curve_data["train_scores_std"].append(np.std(train_scores))
            learning_curve_data["val_scores_std"].append(np.std(val_scores))

        self.learning_curve_data = learning_curve_data
        return learning_curve_data
|
||||||
@@ -0,0 +1,134 @@
|
|||||||
|
import logging
|
||||||
|
from abc import abstractmethod
|
||||||
|
from typing import Dict, Any, List
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.model_selection import StratifiedKFold, cross_val_score
|
||||||
|
from sklearn.model_selection import learning_curve
|
||||||
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
|
from research.base_model import BaseModel
|
||||||
|
from research.experiment.feature_extractor import FeatureExtractor
|
||||||
|
|
||||||
|
|
||||||
|
class TraditionalModel(BaseModel):
    """Base class for traditional ML models (scikit-learn compatible)"""

    @property
    def architecture(self) -> str:
        # Identifies this model family in experiment metadata
        return "traditional"

    @abstractmethod
    def build_model(self) -> BaseEstimator:
        """Build and return the sklearn model instance"""
        pass

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the traditional ML model"""
        logging.info(f"Training {self.__class__.__name__}")

        # Lazily create the estimator and the feature extractor
        if self.model is None:
            self.model = self.build_model()
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )

        # Raw data -> engineered features -> model-ready matrix
        prepared = self.prepare_features(self.feature_extractor.extract_features(X))

        # Labels: fit the encoder on first use, reuse it afterwards
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            targets = self.label_encoder.fit_transform(y)
        else:
            targets = self.label_encoder.transform(y)

        self.model.fit(prepared, targets)
        self.is_fitted = True

        return self

    def cross_validate(self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> Dict[str, float]:
        """Run stratified k-fold CV and return mean/std for the configured metrics."""
        prepared = self.prepare_features(self.feature_extractor.extract_features(X))
        targets = self.label_encoder.transform(y)

        splitter = StratifiedKFold(
            n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
        )

        results = {}

        # Accuracy is always reported
        scores = cross_val_score(self.model, prepared, targets, cv=splitter, scoring="accuracy")
        results["accuracy"] = scores.mean()
        results["accuracy_std"] = scores.std()

        # Remaining metrics only when requested by the config
        for metric in ["precision", "recall", "f1"]:
            if metric not in self.config.metrics:
                continue
            scores = cross_val_score(
                self.model, prepared, targets, cv=splitter, scoring=f"{metric}_weighted"
            )
            results[metric] = scores.mean()
            results[f"{metric}_std"] = scores.std()

        return results

    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        logging.info(f"Generating learning curve for {self.__class__.__name__}")

        if train_sizes is None:
            train_sizes = [0.1, 0.25, 0.5, 0.75, 1.0]

        # Make sure the feature extractor exists before preparing data
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )
        prepared = self.prepare_features(self.feature_extractor.extract_features(X))

        # Encode labels (fit on first use)
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            targets = self.label_encoder.fit_transform(y)
        else:
            targets = self.label_encoder.transform(y)

        try:
            sizes_abs, train_matrix, val_matrix = learning_curve(
                self.build_model(),
                prepared,
                targets,
                train_sizes=train_sizes,
                cv=3,  # Use 3-fold CV for speed
                scoring="accuracy",
                random_state=self.config.random_seed,
            )

            learning_curve_data = {
                "train_sizes": sizes_abs.tolist(),
                "train_scores": train_matrix.mean(axis=1).tolist(),
                "val_scores": val_matrix.mean(axis=1).tolist(),
                "train_scores_std": train_matrix.std(axis=1).tolist(),
                "val_scores_std": val_matrix.std(axis=1).tolist(),
            }
        except Exception as e:
            logging.warning(f"Could not generate learning curve: {e}")
            return {}

        self.learning_curve_data = learning_curve_data
        return learning_curve_data
|
||||||
@@ -0,0 +1,152 @@
|
|||||||
|
#!.venv/bin/python3
|
||||||
|
import logging
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from research.model_trainer import ModelTrainer
|
||||||
|
|
||||||
|
|
||||||
|
def train_baseline_models():
    """
    Quick function to train all baseline models and save artifacts.

    Returns the list of experiment ids produced by the trainer.
    """
    logger = logging.getLogger(__name__)
    logger.info("Training Baseline Models with Artifact Saving")

    trainer = ModelTrainer()

    # Define baseline model configurations
    baseline_configs = [
        {
            "model_type": "logistic_regression",
            "features": ["full_name"],
            "model_params": {"ngram_range": [2, 5], "max_features": 10000},
        },
        {
            "model_type": "logistic_regression",
            "features": ["native_name"],
            "model_params": {"ngram_range": [2, 4], "max_features": 5000},
        },
        {
            "model_type": "logistic_regression",
            "features": ["surname"],
            "model_params": {"ngram_range": [2, 4], "max_features": 5000},
        },
        {
            "model_type": "random_forest",
            "features": ["name_length", "word_count", "province"],
            "model_params": {"n_estimators": 100, "max_depth": 10},
        },
        {
            "model_type": "svm",
            "features": ["full_name"],
            "model_params": {"kernel": "rbf", "C": 1.0},
        },
        {"model_type": "naive_bayes", "features": ["full_name"], "model_params": {"alpha": 1.0}},
    ]

    # Train all baseline models
    experiment_ids = trainer.train_multiple_models("baseline", baseline_configs)

    # Show summary. Uses lazy %-formatting (the old calls built f-strings
    # eagerly, and the header f-string had no placeholders at all).
    logger.info("\n Training Summary:")
    for exp_id in experiment_ids:
        experiment = trainer.experiment_tracker.get_experiment(exp_id)
        if experiment:
            acc = experiment.test_metrics.get("accuracy", 0)
            logger.info(" %s: %.4f accuracy", experiment.config.name, acc)

    return experiment_ids
|
||||||
|
|
||||||
|
|
||||||
|
def train_neural_networks():
    """Train neural network models (LSTM, CNN, transformer) with preset parameters.

    Returns:
        list: Experiment ids produced by the trainer, one per configuration.
    """
    # Use a module-level logger for consistency with train_baseline_models
    # (the original logged through the root logger via logging.info).
    logger = logging.getLogger(__name__)
    logger.info("Training Neural Network Models")

    trainer = ModelTrainer()

    neural_configs = [
        {
            "model_type": "lstm",
            "features": ["full_name"],
            "model_params": {
                "embedding_dim": 64,
                "lstm_units": 32,
                "epochs": 10,
                "batch_size": 64,
                "max_len": 6,
            },
        },
        {
            "model_type": "cnn",
            "features": ["full_name"],
            "model_params": {
                "embedding_dim": 64,
                "filters": 64,
                "kernel_size": 3,
                "epochs": 10,
                "batch_size": 64,
                "max_len": 20,  # Character level
            },
        },
        {
            "model_type": "transformer",
            "features": ["full_name"],
            "model_params": {
                "embedding_dim": 64,
                "transformer_num_heads": 2,
                "epochs": 10,
                "batch_size": 64,
                "max_len": 6,
            },
        },
    ]

    experiment_ids = trainer.train_multiple_models("neural_networks", neural_configs)
    return experiment_ids
|
def main():
    """Command-line entry point for training models.

    Modes:
        baseline -- train all baseline (classical) models
        neural   -- train all neural network models
        list     -- show saved models (default)

    Passing --model-type and --name trains a single named model instead.
    """
    parser = argparse.ArgumentParser(description="Train DRC Names Models")
    parser.add_argument(
        "--mode",
        choices=["baseline", "neural", "list"],
        default="list",
        help="Training mode",
    )
    parser.add_argument("--model-type", type=str, help="Specific model type to train")
    parser.add_argument("--name", type=str, help="Model name")

    args = parser.parse_args()

    trainer = ModelTrainer()

    # BUG FIX: the single-model branch was the LAST elif after the mode
    # checks, but --mode defaults to "list", so one of the mode branches
    # always matched first and the --model-type/--name path was unreachable.
    # Check the explicit single-model request first.
    if args.model_type and args.name:
        # Train specific model
        trainer.train_single_model(
            model_name=args.name, model_type=args.model_type, features=["full_name"]
        )

    elif args.mode == "baseline":
        train_baseline_models()

    elif args.mode == "neural":
        train_neural_networks()

    elif args.mode == "list":
        logging.info("📋 Saved Models:")
        saved_models = trainer.list_saved_models()
        if not saved_models.empty:
            logging.info(saved_models.to_string(index=False))
        else:
            logging.info("No saved models found.")
||||||
|
# Script entry point: parse CLI arguments and dispatch to the selected training mode.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,76 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
from core.utils import get_data_file_path
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset(file_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    On any failure, surface the problem in the Streamlit UI and fall back
    to an empty DataFrame so the page can keep rendering.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame()
    return frame
|
class Dashboard:
    """Streamlit landing page: headline dataset metrics plus recent experiments."""

    def __init__(self, config, experiment_tracker, experiment_runner):
        self.config = config
        self.experiment_tracker = experiment_tracker
        self.experiment_runner = experiment_runner

    def index(self):
        """Render the dashboard: four metric columns, then the last five experiments."""
        st.header("Dashboard")
        col1, col2, col3, col4 = st.columns(4)

        # Load basic statistics
        try:
            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
            if data_path.exists():
                df = load_dataset(str(data_path))

                with col1:
                    st.metric("Total Names", f"{len(df):,}")

                with col2:
                    # BUG FIX: df.get("annotated", 0) returns the scalar 0 when the
                    # column is missing, so (0 == 1) is a plain bool and calling
                    # .sum() on it raises AttributeError. Guard on column presence.
                    if "annotated" in df.columns:
                        annotated = int((df["annotated"] == 1).sum())
                    else:
                        annotated = 0
                    st.metric("Annotated Names", f"{annotated:,}")

                with col3:
                    provinces = df["province"].nunique() if "province" in df.columns else 0
                    st.metric("Provinces", provinces)

                with col4:
                    if "sex" in df.columns:
                        gender_dist = df["sex"].value_counts()
                        # max(..., 1) guards against division by zero when no "m" rows exist.
                        ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
                        st.metric("F/M Ratio", f"{ratio:.2f}")
            else:
                st.warning("No processed data found. Please run data processing first.")

        except Exception as e:
            st.error(f"Error loading dashboard data: {e}")

        # Recent experiments
        st.subheader("Recent Experiments")
        experiments = self.experiment_tracker.list_experiments()[:5]

        if experiments:
            exp_data = []
            for exp in experiments:
                exp_data.append(
                    {
                        "Name": exp.config.name,
                        "Model": exp.config.model_type,
                        "Status": exp.status.value,
                        "Accuracy": (
                            f"{exp.test_metrics.get('accuracy', 0):.3f}"
                            if exp.test_metrics
                            else "N/A"
                        ),
                        "Date": exp.start_time.strftime("%Y-%m-%d %H:%M"),
                    }
                )

            st.dataframe(pd.DataFrame(exp_data), use_container_width=True)
        else:
            st.info("No experiments found. Create your first experiment in the Experiments tab!")
@@ -0,0 +1,154 @@
|
|||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import plotly.express as px
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
from core.utils import get_data_file_path
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset(file_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    On any failure, surface the problem in the Streamlit UI and fall back
    to an empty DataFrame so the page can keep rendering.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame()
    return frame
|
class DataOverview:
    """Streamlit page showing dataset statistics, quality charts, samples, and CSV export."""

    def __init__(self, config):
        self.config = config

    def index(self):
        """Render the data-overview page for a user-selected dataset file."""
        st.header("Data Overview")
        data_files = {
            "Names": self.config.data.input_file,
            "Featured Dataset": self.config.data.output_files["featured"],
            "Evaluation Dataset": self.config.data.output_files["evaluation"],
            "Male Names": self.config.data.output_files["males"],
            "Female Names": self.config.data.output_files["females"],
        }

        selected_file = st.selectbox("Select Dataset", list(data_files.keys()))
        file_path = get_data_file_path(data_files[selected_file], self.config)

        if not file_path.exists():
            st.warning(f"Dataset not found: {file_path}")
            st.warning("Please run data processing first to generate datasets.")
            return

        # Load and display data
        df = load_dataset(str(file_path))

        if df.empty:
            st.error("Failed to load dataset")
            return

        self._render_metrics(df)
        self._render_quality(df)
        self._render_structure(df)
        self._render_sample_and_export(df, selected_file)

    def _render_metrics(self, df):
        """Top row of headline metrics (record count, annotation %, word/length averages)."""
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Total Records", f"{len(df):,}")

        with col2:
            if "annotated" in df.columns:
                annotated_pct = (df["annotated"] == 1).mean() * 100
                st.metric("Annotated", f"{annotated_pct:.1f}%")

        with col3:
            if "words" in df.columns:
                st.metric("Avg Words", f"{df['words'].mean():.1f}")

        with col4:
            if "length" in df.columns:
                st.metric("Avg Length", f"{df['length'].mean():.0f}")

    def _render_quality(self, df):
        """Missing-value bar chart and gender-distribution pie chart."""
        st.subheader("Data Quality Analysis")
        col1, col2 = st.columns(2)

        with col1:
            # Missing values
            missing_data = df.isnull().sum()
            if missing_data.sum() > 0:
                fig = px.bar(
                    x=missing_data.index, y=missing_data.values, title="Missing Values by Column"
                )
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.success("No missing values found")

        with col2:
            # Gender distribution
            if "sex" in df.columns:
                gender_counts = df["sex"].value_counts()
                fig = px.pie(
                    values=gender_counts.values,
                    names=gender_counts.index,
                    title="Gender Distribution",
                )
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)

    def _render_structure(self, df):
        """Word-count and province distributions; skipped when 'words' is absent."""
        if "words" not in df.columns:
            return

        st.subheader("Name Structure Analysis")
        col1, col2 = st.columns(2)

        with col1:
            word_dist = df["words"].value_counts().sort_index()
            fig = px.bar(
                x=word_dist.index,
                y=word_dist.values,
                title="Distribution of Word Count in Names",
            )
            fig.update_layout(height=400)
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            # Province distribution
            if "province" in df.columns:
                province_counts = df["province"].value_counts().head(10)
                fig = px.bar(
                    x=province_counts.values,
                    y=province_counts.index,
                    orientation="h",
                    title="Top 10 Provinces by Name Count",
                )
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)

    def _render_sample_and_export(self, df, selected_file):
        """Column picker, sample table, and CSV download."""
        st.subheader("Sample Data")

        if not df.empty:
            preferred = ["name", "sex", "province", "words"]
            columns_to_show = st.multiselect(
                "Select columns to display",
                df.columns.tolist(),
                default=(
                    preferred
                    if all(col in df.columns for col in preferred)
                    else df.columns[:5].tolist()
                ),
            )

            if columns_to_show:
                # BUG FIX: st.slider raised for small datasets because the fixed
                # min (10) and default (50) could exceed max (len(df)). Clamp all
                # three so min <= default <= max always holds.
                max_rows = min(1000, len(df))
                sample_size = st.slider(
                    "Number of rows to display",
                    min(10, max_rows),
                    max_rows,
                    min(50, max_rows),
                )
                st.dataframe(df[columns_to_show].head(sample_size), use_container_width=True)

        # Data export
        st.subheader("Export Data")
        if st.button("Download as CSV"):
            csv = df.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name=f"{selected_file.lower().replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.csv",
                mime="text/csv",
            )
@@ -0,0 +1,127 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import plotly.express as px
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
from web.log_reader import LogReader
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset(file_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    On any failure, surface the problem in the Streamlit UI and fall back
    to an empty DataFrame so the page can keep rendering.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame()
    return frame
|
class DataProcessing:
    """Streamlit page showing pipeline progress, recent log entries, and log statistics."""

    # Maps a log level to the Streamlit widget used to render entries of that level.
    # Unknown levels fall back to plain text (see index()).
    _LEVEL_RENDERERS = {"ERROR": st.error, "WARNING": st.warning, "INFO": st.info}

    def __init__(self, config, pipeline_monitor):
        self.config = config
        self.pipeline_monitor = pipeline_monitor

    def index(self):
        """Render overall/per-step pipeline progress and the recent-log viewer."""
        st.header("Data Processing Pipeline")
        status = self.pipeline_monitor.get_pipeline_status()

        # Overall progress
        st.progress(status["overall_completion"] / 100)
        st.write(f"Overall Progress: {status['overall_completion']:.1f}%")

        # Step details
        for step_name, step_status in status["steps"].items():
            with st.expander(f"{step_name.replace('_', ' ').title()} - {step_status['status']}"):
                col1, col2, col3 = st.columns(3)

                with col1:
                    st.metric("Processed Batches", step_status["processed_batches"])
                with col2:
                    st.metric("Total Batches", step_status["total_batches"])
                with col3:
                    st.metric("Failed Batches", step_status["failed_batches"])

                if step_status["completion_percentage"] > 0:
                    st.progress(step_status["completion_percentage"] / 100)

        # Read actual log entries from the log file
        st.subheader("Recent Processing Logs")
        try:
            # NOTE(review): file name is hard-coded to the development environment;
            # confirm whether it should come from config instead.
            log_file_path = self.config.paths.logs_dir / "pipeline.development.log"
            log_reader = LogReader(log_file_path)

            # Options for filtering logs
            col1, col2 = st.columns(2)
            with col1:
                log_level_filter = st.selectbox(
                    "Filter by Level",
                    ["All", "INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"],
                    key="log_level_filter",
                )

            with col2:
                num_entries = st.number_input(
                    "Number of entries",
                    min_value=5,
                    max_value=50,
                    value=10,
                    key="num_log_entries",
                )

            # Get log entries based on filter
            if log_level_filter == "All":
                log_entries = log_reader.read_last_entries(num_entries)
            else:
                log_entries = log_reader.read_entries_by_level(log_level_filter, num_entries)

            if log_entries:
                for entry in log_entries:
                    # Dispatch on level instead of repeating the same format
                    # string in four if/elif branches.
                    render = self._LEVEL_RENDERERS.get(entry.level, st.text)
                    render(
                        f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] "
                        f"{entry.level}: {entry.message}"
                    )

                self._render_log_stats(log_reader)
            else:
                st.info("No log entries found or log file is empty.")

        except Exception as e:
            st.error(f"Error reading log file: {e}")

    def _render_log_stats(self, log_reader):
        """Show per-level metric counts and a bar chart of the log-level distribution."""
        st.subheader("Log Statistics")
        log_stats = log_reader.get_log_stats()

        if not log_stats:
            return

        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Total Lines", log_stats.get('total_lines', 0))
        with col2:
            st.metric("INFO", log_stats.get('INFO', 0))
        with col3:
            st.metric("WARNING", log_stats.get('WARNING', 0))
        with col4:
            st.metric("ERROR", log_stats.get('ERROR', 0))

        # Log level distribution chart
        levels = ['INFO', 'WARNING', 'ERROR', 'DEBUG', 'CRITICAL']
        counts = [log_stats.get(level, 0) for level in levels]

        if sum(counts) > 0:
            fig = px.bar(
                x=levels,
                y=counts,
                title="Log Entries by Level",
                color=levels,
                color_discrete_map={
                    'INFO': 'blue',
                    'WARNING': 'orange',
                    'ERROR': 'red',
                    'DEBUG': 'gray',
                    'CRITICAL': 'darkred'
                },
            )
            st.plotly_chart(fig, use_container_width=True)
@@ -0,0 +1,185 @@
|
|||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class LogEntry:
    """Represents a single log entry."""

    # Parsed timestamp of the entry (millisecond precision, per the ",%f" format).
    timestamp: datetime
    # Name of the logger that emitted the entry.
    logger: str
    # Log level exactly as written in the file (e.g. "INFO", "ERROR").
    level: str
    # Message text following the level field.
    message: str
    # The original, unparsed log line.
    raw_line: str
|
class LogReader:
    """Utility class for reading and parsing log files."""

    def __init__(self, log_file_path: Path):
        """Initialize the log reader with a log file path."""
        self.log_file_path = Path(log_file_path)
        # Pattern to match Python logging format: timestamp - logger - level - message
        self.log_pattern = re.compile(
            r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (.+?) - (\w+) - (.+)'
        )

    def _read_lines(self) -> Optional[List[str]]:
        """Read all lines of the log file; return None if missing or unreadable.

        Centralizes the open/readlines/except boilerplate that was duplicated
        across every public method.
        """
        if not self.log_file_path.exists():
            return None
        try:
            with open(self.log_file_path, 'r', encoding='utf-8') as file:
                return file.readlines()
        except Exception as e:
            # Best-effort reader feeding a UI page: report and degrade gracefully.
            print(f"Error reading log file: {e}")
            return None

    def _collect_reversed(self, lines: List[str], count: int, predicate) -> List[LogEntry]:
        """Walk lines newest-first, keep parseable entries matching predicate,
        stop after `count`, and return them oldest-first."""
        entries: List[LogEntry] = []
        for line in reversed(lines):
            entry = self._parse_log_line(line.strip())
            if entry and predicate(entry):
                entries.append(entry)
                if len(entries) >= count:
                    break
        return list(reversed(entries))

    def read_last_entries(self, count: int = 10) -> List[LogEntry]:
        """Read the last N parseable entries from the log file, oldest-first.

        BUG FIX: the previous implementation only scanned the last count*2 raw
        lines, so files with many non-matching lines (e.g. multi-line
        tracebacks) could return fewer than N entries even though older
        matches existed. Now the whole file is scanned from the end.
        """
        lines = self._read_lines()
        if lines is None:
            return []
        return self._collect_reversed(lines, count, lambda entry: True)

    def read_entries_by_level(self, level: str, count: int = 50) -> List[LogEntry]:
        """Read up to N entries whose level matches (case-insensitive), oldest-first."""
        lines = self._read_lines()
        if lines is None:
            return []
        wanted = level.upper()
        return self._collect_reversed(lines, count, lambda entry: entry.level.upper() == wanted)

    def read_entries_since(self, since: datetime, count: int = 100) -> List[LogEntry]:
        """Read up to N entries with timestamp >= since, oldest-first.

        Assumes the file is chronologically ordered: scanning stops at the
        first parseable entry older than `since`.
        """
        lines = self._read_lines()
        if lines is None:
            return []

        entries: List[LogEntry] = []
        for line in reversed(lines):
            entry = self._parse_log_line(line.strip())
            if not entry:
                continue
            if entry.timestamp < since:
                # Stop reading once we've gone past the since time.
                break
            entries.append(entry)
            if len(entries) >= count:
                break

        return list(reversed(entries))

    def get_log_stats(self) -> Dict[str, int]:
        """Return total raw line count plus a count of parsed entries per level."""
        lines = self._read_lines()
        if lines is None:
            return {}

        stats = {
            'total_lines': len(lines),
            'INFO': 0,
            'WARNING': 0,
            'ERROR': 0,
            'DEBUG': 0,
            'CRITICAL': 0,
        }

        for line in lines:
            entry = self._parse_log_line(line.strip())
            if entry:
                level = entry.level.upper()
                if level in stats:
                    stats[level] += 1

        return stats

    def _parse_log_line(self, line: str) -> Optional[LogEntry]:
        """Parse a single log line into a LogEntry; None if it doesn't match the format."""
        if not line:
            return None

        match = self.log_pattern.match(line)
        if not match:
            return None

        try:
            timestamp_str, logger, level, message = match.groups()
            timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S,%f')

            return LogEntry(
                timestamp=timestamp,
                logger=logger,
                level=level,
                message=message,
                raw_line=line,
            )
        except ValueError:
            # strptime rejects malformed timestamps; treat the line as non-matching.
            return None
|
||||||
|
class MultiLogReader:
    """Reader that merges entries from every ``*.log`` file in a directory."""

    def __init__(self, log_directory: Path):
        """Initialize with a directory containing log files."""
        self.log_directory = Path(log_directory)

    def get_available_log_files(self) -> List[Path]:
        """Return the ``*.log`` files in the directory (empty if it doesn't exist)."""
        if not self.log_directory.exists():
            return []
        return list(self.log_directory.glob('*.log'))

    def read_from_all_files(self, count: int = 10) -> List[LogEntry]:
        """Collect recent entries across all log files; returns newest-first,
        truncated to the top `count` overall."""
        merged: List[LogEntry] = []
        for log_file in self.get_available_log_files():
            merged.extend(LogReader(log_file).read_last_entries(count))

        # Newest entries first across all files.
        merged.sort(key=lambda entry: entry.timestamp, reverse=True)
        return merged[:count]
||||||
Reference in New Issue
Block a user