refactoring: add initial pipeline configuration and model classes

This commit is contained in:
2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
+3 -2
View File
@@ -4,8 +4,9 @@
__pycache__/
.ipynb_checkpoints/
*.pyc
/models/
.env.local
var/
/dataset/
/data/dataset/
.DS_Store
/data/
/backups
+121 -18
View File
@@ -2,24 +2,127 @@
default: help
.PHONY: help
help:
@echo Tasks:
help: ## Show this help message
@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
.PHONY: download
download:
@if [ ! -f dataset/names.csv ]; then \
set -a; [ -f .env.local ] && . .env.local; set +a; \
[ -z "$$DATASET_URL" ] && . .env; \
mkdir -p dataset; \
curl -L "$${DATASET_URL}" -o dataset/names.csv; \
else \
echo "dataset/names.csv already exists. Skipping download."; \
fi
# =============================================================================
# ENVIRONMENT SETUP
# =============================================================================
.PHONY: clean
clean:
rm -rf ./models
rm -rf ./results
rm -rf ./dataset/spacy/train.spacy
rm -rf ./dataset/spacy/dev.spacy
.PHONY: setup
setup: ## Setup virtual environment and install dependencies
python -m venv .venv
.venv/bin/pip install --upgrade pip
.venv/bin/pip install -r requirements.txt
.PHONY: install
install: ## Install/update dependencies
pip install --upgrade pip
pip install -r requirements.txt
.PHONY: install-dev
install-dev: ## Install development dependencies
pip install -r requirements.txt
pip install jupyter notebook ipykernel pytest black flake8 mypy
.PHONY: activate
activate: ## Show activation command
@echo "Run: source .venv/bin/activate"
# =============================================================================
# MODEL TRAINING & ARTIFACTS
# =============================================================================
.PHONY: train-baseline
train-baseline: ## Train all baseline models and save artifacts
python research/train.py --mode baseline
.PHONY: train-neural
train-neural: ## Train neural network models (LSTM, CNN, Transformer)
python research/train.py --mode neural
.PHONY: train-model
train-model: ## Train specific model (use: make train-model MODEL=logistic_regression NAME=my_model)
python research/train.py --model-type $(MODEL) --name $(NAME)
.PHONY: list-models
list-models: ## List all saved model artifacts
python research/train.py --mode list
# =============================================================================
# RESEARCH & EXPERIMENTS
# =============================================================================
.PHONY: experiment
experiment: ## Create sample experiment configuration
python research/cli.py run --name "sample_experiment" --features full_name --model-type logistic_regression
.PHONY: baseline
baseline: ## Run baseline experiments
python research/cli.py baseline
.PHONY: ablation
ablation: ## Run feature ablation study
python research/cli.py ablation
.PHONY: components
components: ## Run name component analysis
python research/cli.py components
.PHONY: list-experiments
list-experiments: ## List all experiments
python research/cli.py list
.PHONY: list-completed
list-completed: ## List completed experiments only
python research/cli.py list --status completed
.PHONY: export-results
export-results: ## Export all experiment results to CSV
python research/cli.py export --output results_$(shell date +%Y%m%d_%H%M%S).csv
.PHONY: best-model
best-model: ## Show best performing model
python research/cli.py list --status completed | head -5
# =============================================================================
# WEB INTERFACE
# =============================================================================
.PHONY: web
web: ## Launch Streamlit web interface
streamlit run web/app.py --server.runOnSave true --server.port 8501
# =============================================================================
# DEVELOPMENT & CODE QUALITY
# =============================================================================
.PHONY: format
format: ## Format code with black
black . --line-length 100
.PHONY: lint
lint: ## Lint code with flake8
flake8 . --max-line-length=100 --ignore=E203,W503 --exclude=.venv
.PHONY: type-check
type-check: ## Type check with mypy
mypy . --ignore-missing-imports
.PHONY: notebook
notebook: ## Start Jupyter notebook
jupyter notebook notebooks/
.PHONY: lab
lab: ## Start Jupyter lab
jupyter lab notebooks/
# =============================================================================
# DEPLOYMENT & PRODUCTION
# =============================================================================
.PHONY: backup
backup: ## Backup datasets and results
@mkdir -p backups/$(shell date +%Y%m%d_%H%M%S)
@cp -r data/ backups/$(shell date +%Y%m%d_%H%M%S)/data/
@echo "Backup created in backups/$(shell date +%Y%m%d_%H%M%S)/"
+280 -74
View File
@@ -1,110 +1,316 @@
# NERS-NLP: A Culturally-Aware Natural Language Processing System with Named Entity Recognition and Gender Inference Models
# DRC Names Gender Prediction Pipeline: A Culturally-Aware NLP System for Congolese Name Analysis
A comprehensive, research-friendly pipeline for analyzing Congolese names and predicting gender using culturally-aware machine learning models.
This system provides advanced data processing, experiment management, and an intuitive web interface for non-technical users.
## Overview
Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training data.
This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 7 million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
Despite the growing success of Named Entity Recognition (NER) systems and gender inference models in Natural Language Processing (NLP), these tools often underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training data. In this paper, we propose NERS-NLP, a culturally-aware NLP system with Named Entity Recognition and Gender Inference Models. This study introduces a large-scale dataset of over 7 million names of the population of the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata, including geographical distribution. We explore the linguistic and sociocultural features embedded in these names and examine their impact on two key NLP tasks, namely, entity recognition and gender classification.
Our approach involves:
- (1) a statistical and feature analysis of Congolese name structures,
- (2) the development of supervised gender prediction models leveraging name components and demographic patterns,
- (3) the integration of the curated name lexicon into NER pipelines to improve recognition accuracy for Congolese entities.
- **(1) Advanced data processing pipeline** with batching, checkpointing, and parallel processing
- **(2) Modular experiment framework** for systematic model comparison and research iteration
- **(3) Multiple feature extraction strategies** leveraging name components, linguistic patterns, and demographic data
- **(4) Culturally-aware gender prediction models** trained specifically on Congolese naming patterns
- **(5) User-friendly web interface** enabling non-technical users to run experiments and make predictions
- **(6) Comprehensive research tools** for reproducible experimentation and result analysis
## Key Features
Experiments conducted on custom evaluation sets, including multilingual and code-switched Congolese texts, show that our culturally-aware methods significantly outperform state-of-the-art multilingual baselines.
This work demonstrates the importance of culturally grounded resources in reducing bias and improving performance in NLP systems applied to underrepresented regions. Our findings open new directions for inclusive language technologies in African contexts and contribute a valuable resource for future research in regional linguistics, onomastics, and identity-aware artificial intelligence.
### **Advanced Data Processing**
- **Batched processing** with configurable batch sizes and parallel execution
- **Automatic checkpointing** and resume capability for large datasets
- **LLM-powered annotation** with rate limiting and retry logic
- **Memory-efficient** chunked data loading for datasets of any size
### **Research-Friendly Experiment Framework**
- **Modular model architecture** - easily add new models and features
- **Systematic experiment tracking** with automatic result storage
- **Feature ablation studies** and component analysis tools
- **Cross-validation** and statistical significance testing
- **Automated baseline comparisons** and performance analysis
### **Intuitive Web Interface**
- **No-code experiment creation** with visual parameter selection
- **Real-time monitoring** of data processing and training progress
- **Interactive result visualization** with charts and comparisons
- **Batch prediction capabilities** for CSV file upload and processing
- **Model comparison tools** with automatic performance rankings
### **Comprehensive Analytics**
- **Feature importance analysis** showing which name components matter most
- **Province-specific studies** examining regional naming patterns
- **Learning curve analysis** for understanding data requirements
- **Prediction confidence scoring** and error analysis tools
## Quick Start
### Using Make Commands (Recommended)
```bash
# Complete setup and basic processing
make quick-start
# Launch web interface
make web
# Run research workflow
make research-flow
# Show all available commands
make help
```
### Manual Installation
## Installation
```bash
git clone https://github.com/bernard-ng/drc-ners-nlp.git
cd drc-ners-nlp
python3 -m venv .venv
source .venv/bin/activate
# Setup environment
make setup
make process
pip install -r requirements.txt
# Launch web interface
make web
```
## Usage
## Dataset
### Preparation
| Name | Description | Default |
|------------------|--------------------------------------------------------------------|---------|
| --split_eval | Split into evaluation and featured datasets | True |
| --no-split_eval | Do not split into evaluation and featured datasets | |
| --split_by_sex | Split by sex into male/female datasets | True |
| --no-split_by_sex| Do not split by sex into male/female datasets | |
### Web Interface (Recommended for Non-Technical Users)
Launch the Streamlit web application:
```bash
make web
```
The interface provides:
- **Dashboard**: Overview of datasets and recent experiments
- **Data Overview**: Interactive data exploration and statistics
- **Data Processing**: Monitor and control the processing pipeline
- **Experiments**: Create and manage machine learning experiments
- **Results & Analysis**: Compare models and analyze performance
- **Predictions**: Make predictions on new names or upload CSV files
- **Settings**: Configure the system and manage data
### Research & Experiments
#### Quick Research Studies
```bash
# Compare different approaches (full name vs native vs surname)
make baseline
# Analyze which name components are most effective
make components
# Test feature importance through ablation study
make ablation
# View all experiment results
make list-experiments
# Export results for publication
make export-results
```
#### Custom Experiments
```bash
# Run specific experiment via command line
python research/cli.py run \
--name "native_name_study" \
--features native_name \
--model-type logistic_regression \
--description "Test native name effectiveness"
# Compare multiple experiments
python research/cli.py compare <exp_id_1> <exp_id_2>
# View detailed results
python research/cli.py show <experiment_id>
```
### Data Processing Pipeline
#### Basic Processing (No LLM)
```bash
make process-basic # Fast processing without LLM annotation
```
#### Complete Processing (With LLM)
```bash
make process # Full pipeline including LLM annotation
make process-dev # Development mode with smaller batches
```
#### Monitor Progress
```bash
make monitoring # Show current pipeline status
make status # Show overall system status
```
#### Resume Interrupted Processing
```bash
make process-resume # Resume from last checkpoint
```
### Available Models and Features
#### Models
- **Logistic Regression**: Character n-gram based classification
- **Random Forest**: Engineered feature-based classification
- **LSTM**: Sequential neural network (planned)
- **Transformer**: Attention-based model (planned)
#### Features
- **Full Name**: Complete name as given
- **Native Name**: Identified native/given name component
- **Surname**: Family name component
- **Name Length**: Character count features
- **Word Count**: Number of words in name
- **Province**: Geographic/demographic features
- **Name Beginnings/Endings**: Prefix/suffix patterns
- **Character N-grams**: Linguistic pattern features
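The models and features above map onto `ExperimentConfig` objects when driven from Python rather than the CLI. A minimal sketch, assuming `FeatureType` exposes a `PROVINCE` member in addition to the `FULL_NAME` member confirmed by `research/cli.py`:
```python
# Hedged sketch: build and run an experiment programmatically.
# ExperimentConfig, FeatureType and ExperimentRunner are the classes used by research/cli.py;
# FeatureType.PROVINCE is an assumption based on the feature list above.
from research.experiment import ExperimentConfig
from research.experiment.feature_extractor import FeatureType
from research.experiment.experiment_runner import ExperimentRunner

config = ExperimentConfig(
    name="fullname_plus_province",
    description="Full-name n-grams combined with province metadata",
    model_type="logistic_regression",
    features=[FeatureType.FULL_NAME, FeatureType.PROVINCE],  # PROVINCE assumed
    model_params={"ngram_range": [2, 5], "max_features": 10_000},
    test_size=0.2,
    random_seed=42,
)
experiment_id = ExperimentRunner().run_experiment(config)
print(f"Finished experiment {experiment_id}")
```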
## Configuration
### Environment Configurations
```bash
python -m processing.prepare --split_eval --split_by_sex
# Switch to development configuration (smaller batches, more logging)
make config-dev
# Switch to production configuration (optimized for performance)
make config-prod
# View current configuration
make show-config
```
### Annotation
| Name | Description | Default |
|-------------|-----------------------------------------------------|----------------|
| --llm_model | Ollama model name to use | mistral:7b |
### Custom Configuration
Example:
Edit configuration files in `config/`:
- `pipeline.yaml` - Main configuration
- `pipeline.development.yaml` - Development overrides
- `pipeline.production.yaml` - Production settings
Example configuration:
```yaml
processing:
batch_size: 1000
max_workers: 4
llm:
model_name: "mistral:7b"
requests_per_minute: 60
data:
split_evaluation: true
split_by_gender: true
```
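These settings are exposed to Python code through the `core.config` helpers added in this commit; a minimal usage sketch:
```python
# Minimal sketch: load the pipeline configuration and inspect a few values.
# get_config() and setup_logging() are defined in core/config/__init__.py.
from core.config import get_config, setup_logging

config = get_config()      # resolves config/pipeline.yaml, falling back to defaults
setup_logging(config)      # console/file handlers as configured above

print(config.processing.batch_size)   # e.g. 1000
print(config.llm.model_name)          # e.g. "mistral:7b"
print(config.data.split_by_gender)    # e.g. True
```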
## Research Capabilities
### Systematic Experimentation
The framework supports systematic research through:
1. **Baseline Studies**: Compare fundamental approaches
2. **Feature Studies**: Test individual name components
3. **Ablation Studies**: Identify most important features
4. **Cross-Province Analysis**: Test generalization across regions
5. **Hyperparameter Optimization**: Systematic parameter tuning
### Reproducible Research
- **Experiment Tracking**: All experiments automatically logged with full configuration
- **Result Export**: CSV export for publication and further analysis
- **Statistical Testing**: Cross-validation and confidence intervals
- **Version Control**: Configuration-based approach enables easy replication
### Publication-Ready Output
```bash
python -m processing.annotate --llm_model=mistral:7b
# Generate comprehensive results for publication
make research-flow
make export-results
# Get best models for each approach
make list-completed
python research/cli.py list --status completed | head -10
```
## Experiments
### Training
| Name | Description | Default |
|----------------|--------------------------------------------------|--------------------|
| --dataset | Path to the dataset file | names_featured.csv |
| --size | Number of samples to use (None for full dataset) | None |
| --threshold | Probability threshold for gender classification | 0.5 |
| --cv | Number of cross-validation folds | None |
| --save | Whether to save the trained model | False |
| --balanced | Whether to balance the dataset | False |
| --epochs | Number of training epochs | 10 |
| --test_size | Proportion of data to use as test set | 0.2 |
| --random_state | Random seed for reproducibility | 42 |
Examples:
## Development
### Code Quality and Testing
```bash
python -m pipeline.gender.models.lstm --size 1000000 --save
python -m pipeline.gender.models.logreg --size 1000000 --save
python -m pipeline.gender.models.transformer --size 1000000 --save
make format # Format code with black
make lint # Lint with flake8
make check-deps # Verify dependencies
```
### Development Workflow
```bash
python -m pipeline.gender.models.lstm --size 1000000 --balanced --save
python -m pipeline.gender.models.logreg --size 1000000 --balanced --save
python -m pipeline.gender.models.transformer --size 1000000 --balanced --save
make daily-work # Daily development setup
make notebook # Launch Jupyter for analysis
make web-dev # Launch web interface with auto-reload
```
### Evaluation
| Name | Description | Default |
|------------|-----------------------------------------------|----------------------|
| --model | Model type: logreg, lstm, or transformer | (required) |
| --dataset | Path to the dataset CSV file | names_featured.csv |
| --size | Number of rows to load from the dataset | None |
| --balanced | Load balanced dataset | False |
| --threshold| Probability threshold for classification | 0.5 |
Examples:
### Data Management
```bash
python -m pipeline.gender.eval --dataset names_evaluations.csv --model logreg
python -m pipeline.gender.eval --dataset names_evaluations.csv --model lstm
python -m pipeline.gender.eval --dataset names_evaluations.csv --model transformer
make check-data # Verify all data files
make data-stats # Show dataset statistics
make backup-data # Create timestamped backup
make clean-checkpoints # Clean processing checkpoints
```
### Inference
| Name | Description | Default |
|-------------|------------------------------------------|-----------|
| --model | Model type: logreg, lstm, or transformer | (required)|
| --names | One or more names | (required)|
| --threshold | Threshold for classification | 0.5 |
## Project Structure
Examples:
```bash
python -m pipeline.gender.predict --model logreg --names "Tshisekedi"
python -m pipeline.gender.predict --model lstm --names "Ilunga Ngandu"
python -m pipeline.gender.predict --model transformer --names "musenga wa musenga"
```
├── Makefile # All command shortcuts
├── streamlit_app.py # Web interface application
├── config/ # Configuration files
│ ├── pipeline.yaml # Main configuration
│ ├── pipeline.development.yaml # Dev settings
│ └── pipeline.production.yaml # Prod settings
├── core/ # Core framework
│ ├── config.py # Configuration management
│ ├── domain.py # Domain-specific data
│ └── utils.py # Reusable utilities
├── processing/ # Data processing pipeline
│ ├── main.py # Main pipeline script
│ ├── pipeline.py # Pipeline framework
│ ├── steps_config.py # Configurable processing steps
│ └── monitor.py # Monitoring utilities
├── research/ # Research and experiments
│ ├── cli.py # Command-line interface
│ ├── experiment.py # Experiment management
│ ├── models.py # Model implementations
│ └── runner.py # Experiment execution
└── dataset/ # Data files
└── names.csv # Raw dataset
```
## Citation
If you use this pipeline in your research, please cite:
```bibtex
@software{drc_names_pipeline,
title={DRC Names Gender Prediction Pipeline: A Culturally-Aware NLP System},
author={Your Name},
year={2025},
url={https://github.com/bernard-ng/drc-ners-nlp}
}
```
## License
This project is licensed under the MIT License - see the LICENSE file for details.
## Acknowledgments
- Democratic Republic of Congo population data contributors
- Open source NLP and machine learning communities
- Cultural linguistics research communities
+1073
View File
File diff suppressed because it is too large
Executable
+383
View File
@@ -0,0 +1,383 @@
#!.venv/bin/python3
import argparse
import sys
from pathlib import Path
import json
import pandas as pd
import logging
from core.config import get_config, setup_logging
from research.experiment import ExperimentConfig
from research.experiment.experiment_tracker import ExperimentTracker
from research.experiment.feature_extractor import FeatureType
from research.experiment.experiment_builder import ExperimentBuilder
from research.experiment.experiment_runner import ExperimentRunner
from research.model_registry import list_available_models
def create_experiment_from_args(args) -> ExperimentConfig:
"""Create experiment configuration from command line arguments"""
features = []
if args.features:
for feature_name in args.features:
try:
features.append(FeatureType(feature_name))
except ValueError:
logging.warning(f"Unknown feature type '{feature_name}', skipping")
if not features:
features = [FeatureType.FULL_NAME] # Default
# Parse model parameters
model_params = {}
if args.model_params:
try:
model_params = json.loads(args.model_params)
except json.JSONDecodeError:
logging.warning("Invalid JSON for model parameters, using defaults")
# Parse feature parameters
feature_params = {}
if args.feature_params:
try:
feature_params = json.loads(args.feature_params)
except json.JSONDecodeError:
logging.warning("Invalid JSON for feature parameters, using defaults")
# Parse data filters
train_filter = None
if args.train_filter:
try:
train_filter = json.loads(args.train_filter)
except json.JSONDecodeError:
logging.warning("Invalid JSON for train filter, ignoring")
return ExperimentConfig(
name=args.name,
description=args.description or "",
tags=args.tags or [],
model_type=args.model_type,
model_params=model_params,
features=features,
feature_params=feature_params,
train_data_filter=train_filter,
target_column=args.target,
test_size=args.test_size,
random_seed=args.seed,
cross_validation_folds=args.cv_folds,
metrics=args.metrics or ["accuracy", "precision", "recall", "f1"],
)
def run_single_experiment(args):
"""Run a single experiment"""
config = create_experiment_from_args(args)
runner = ExperimentRunner()
experiment_id = runner.run_experiment(config)
logging.info(f"Experiment completed: {experiment_id}")
# Show results
experiment = runner.tracker.get_experiment(experiment_id)
if experiment:
logging.info("Results:")
for metric, value in experiment.test_metrics.items():
logging.info(f" Test {metric}: {value:.4f}")
if experiment.cv_metrics:
logging.info("Cross-validation:")
for metric, value in experiment.cv_metrics.items():
if not metric.endswith("_std"):
std_key = f"{metric}_std"
std_val = experiment.cv_metrics.get(std_key, 0)
logging.info(f" CV {metric}: {value:.4f} ± {std_val:.4f}")
def run_baseline_experiments(args):
"""Run baseline experiments"""
logger = logging.getLogger(__name__)
builder = ExperimentBuilder()
experiments = builder.create_baseline_experiments()
runner = ExperimentRunner()
experiment_ids = runner.run_experiment_batch(experiments)
logging.info(f"Completed {len(experiment_ids)} baseline experiments")
# Show comparison
if experiment_ids:
comparison = runner.compare_experiments(experiment_ids)
logging.info("Baseline Results Comparison:")
logging.info(
comparison[["name", "model_type", "features", "test_accuracy"]].to_string(index=False)
)
def run_ablation_study(args):
"""Run feature ablation study"""
builder = ExperimentBuilder()
experiments = builder.create_feature_ablation_study()
runner = ExperimentRunner()
experiment_ids = runner.run_experiment_batch(experiments)
logging.info(f"Completed {len(experiment_ids)} ablation experiments")
# Show results
if experiment_ids:
comparison = runner.compare_experiments(experiment_ids)
logging.info("Ablation Study Results:")
logging.info(comparison[["name", "test_accuracy", "test_f1"]].to_string(index=False))
def run_component_study(args):
"""Run name component study"""
builder = ExperimentBuilder()
experiments = builder.create_name_component_study()
runner = ExperimentRunner()
experiment_ids = runner.run_experiment_batch(experiments)
logging.info(f"Completed {len(experiment_ids)} component study experiments")
# Show results
if experiment_ids:
comparison = runner.compare_experiments(experiment_ids)
logging.info("Name Component Study Results:")
logging.info(
comparison[["name", "test_accuracy", "test_precision", "test_recall"]].to_string(
index=False
)
)
def list_experiments(args):
"""List experiments with optional filtering"""
tracker = ExperimentTracker()
# Apply filters
filters = {}
if args.status:
from research.experiment import ExperimentStatus
filters["status"] = ExperimentStatus(args.status)
if args.model_type:
filters["model_type"] = args.model_type
if args.tags:
filters["tags"] = args.tags
experiments = tracker.list_experiments(**filters)
if not experiments:
logging.info("No experiments found matching criteria")
return
# Create summary table
rows = []
for exp in experiments:
row = {
"ID": exp.experiment_id[:12] + "...",
"Name": exp.config.name,
"Model": exp.config.model_type,
"Status": exp.status.value,
"Test Acc": f"{exp.test_metrics.get('accuracy', 0):.4f}" if exp.test_metrics else "N/A",
"Start Time": exp.start_time.strftime("%Y-%m-%d %H:%M"),
}
rows.append(row)
df = pd.DataFrame(rows)
logging.info(df.to_string(index=False))
def show_experiment_details(args):
"""Show detailed results for an experiment"""
tracker = ExperimentTracker()
experiment = tracker.get_experiment(args.experiment_id)
if not experiment:
logging.error(f"Experiment not found: {args.experiment_id}")
return
logging.info("=== Experiment Details ===")
logging.info(f"ID: {experiment.experiment_id}")
logging.info(f"Name: {experiment.config.name}")
logging.info(f"Description: {experiment.config.description}")
logging.info(f"Model Type: {experiment.config.model_type}")
logging.info(f"Features: {', '.join([f.value for f in experiment.config.features])}")
logging.info(f"Status: {experiment.status.value}")
logging.info(f"Start Time: {experiment.start_time}")
logging.info(f"End Time: {experiment.end_time}")
if experiment.test_metrics:
logging.info("=== Test Metrics ===")
for metric, value in experiment.test_metrics.items():
logging.info(f"{metric}: {value:.4f}")
if experiment.cv_metrics:
logging.info("=== Cross-Validation Metrics ===")
for metric, value in experiment.cv_metrics.items():
if not metric.endswith("_std"):
std_key = f"{metric}_std"
std_val = experiment.cv_metrics.get(std_key, 0)
logging.info(f"{metric}: {value:.4f} ± {std_val:.4f}")
if experiment.feature_importance:
logging.info("=== Top 10 Feature Importances ===")
sorted_features = sorted(
experiment.feature_importance.items(), key=lambda x: x[1], reverse=True
)
for feature, importance in sorted_features[:10]:
logging.info(f"{feature}: {importance:.4f}")
if experiment.prediction_examples:
logging.info("=== Prediction Examples ===")
for i, example in enumerate(experiment.prediction_examples[:5]):
correct = "" if example["correct"] else ""
logging.info(
f"{i + 1}. {example['name']} -> True: {example['true_label']}, "
f"Pred: {example['predicted_label']} {correct}"
)
def compare_experiments_cmd(args):
"""Compare multiple experiments"""
runner = ExperimentRunner()
comparison = runner.compare_experiments(args.experiment_ids)
if comparison.empty:
logging.info("No experiments found for comparison")
return
logging.info("=== Experiment Comparison ===")
# Show key columns
key_columns = ["name", "model_type", "features", "test_accuracy", "test_f1"]
available_columns = [col for col in key_columns if col in comparison.columns]
logging.info(comparison[available_columns].to_string(index=False))
def export_results(args):
"""Export experiment results"""
tracker = ExperimentTracker()
output_path = tracker.export_results(Path(args.output) if args.output else None)
logging.info(f"Results exported to: {output_path}")
def main():
"""Main CLI entry point"""
parser = argparse.ArgumentParser(
description="DRC Names Research Experiment Manager",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
# Setup logging
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose logging")
subparsers = parser.add_subparsers(dest="command", help="Available commands")
# Single experiment command
exp_parser = subparsers.add_parser("run", help="Run a single experiment")
exp_parser.add_argument("--name", required=True, help="Experiment name")
exp_parser.add_argument("--description", help="Experiment description")
exp_parser.add_argument(
"--model-type",
default="logistic_regression",
choices=list_available_models(),
help="Model type",
)
exp_parser.add_argument(
"--features", nargs="+", choices=[f.value for f in FeatureType], help="Features to use"
)
exp_parser.add_argument("--model-params", help="Model parameters as JSON")
exp_parser.add_argument("--feature-params", help="Feature parameters as JSON")
exp_parser.add_argument("--train-filter", help="Training data filter as JSON")
exp_parser.add_argument("--target", default="sex", help="Target column")
exp_parser.add_argument("--test-size", type=float, default=0.2, help="Test set size")
exp_parser.add_argument("--seed", type=int, default=42, help="Random seed")
exp_parser.add_argument("--cv-folds", type=int, default=5, help="CV folds")
exp_parser.add_argument(
"--metrics",
nargs="+",
choices=["accuracy", "precision", "recall", "f1"],
help="Metrics to calculate",
)
exp_parser.add_argument("--tags", nargs="+", help="Experiment tags")
# Batch experiment commands
subparsers.add_parser("baseline", help="Run baseline experiments")
subparsers.add_parser("ablation", help="Run feature ablation study")
subparsers.add_parser("components", help="Run name component study")
# List experiments
list_parser = subparsers.add_parser("list", help="List experiments")
list_parser.add_argument("--status", choices=["pending", "running", "completed", "failed"])
list_parser.add_argument("--model-type", choices=list_available_models())
list_parser.add_argument("--tags", nargs="+", help="Filter by tags")
# Show experiment details
detail_parser = subparsers.add_parser("show", help="Show experiment details")
detail_parser.add_argument("experiment_id", help="Experiment ID")
# Compare experiments
compare_parser = subparsers.add_parser("compare", help="Compare experiments")
compare_parser.add_argument("experiment_ids", nargs="+", help="Experiment IDs to compare")
# Export results
export_parser = subparsers.add_parser("export", help="Export results to CSV")
export_parser.add_argument("--output", help="Output file path")
args = parser.parse_args()
if not args.command:
parser.print_help()
return 1
# Setup logging
config = get_config()
if args.verbose:
config.logging.level = "DEBUG"
setup_logging(config)
# Execute command
try:
if args.command == "run":
run_single_experiment(args)
elif args.command == "baseline":
run_baseline_experiments(args)
elif args.command == "ablation":
run_ablation_study(args)
elif args.command == "components":
run_component_study(args)
elif args.command == "list":
list_experiments(args)
elif args.command == "show":
show_experiment_details(args)
elif args.command == "compare":
compare_experiments_cmd(args)
elif args.command == "export":
export_results(args)
return 0
except Exception as e:
logging.error(f"Command failed: {e}")
if args.verbose:
import traceback
traceback.print_exc()
return 1
if __name__ == "__main__":
exit_code = main()
sys.exit(exit_code)
+46
View File
@@ -0,0 +1,46 @@
# Development Environment Configuration
# Settings for local development
name: "drc_names_pipeline"
version: "1.0.0"
environment: "development"
debug: true
# Processing settings
processing:
batch_size: 100_000
max_workers: 8
checkpoint_interval: 10
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
# Pipeline stages
stages:
- "data_cleaning"
- "feature_extraction"
#- "llm_annotation"
- "data_splitting"
# Development LLM settings
llm:
model_name: "mistral:7b"
requests_per_minute: 120
requests_per_second: 3
retry_attempts: 3
timeout_seconds: 45
max_concurrent_requests: 4
enable_rate_limiting: true
# Development data settings
data:
split_evaluation: true
split_by_gender: true
evaluation_fraction: 0.2
random_seed: 42
# Enhanced logging for development
logging:
level: "INFO"
console_logging: true
file_logging: true
log_file: "pipeline.development.log"
+48
View File
@@ -0,0 +1,48 @@
# Production Environment Configuration
# Optimized settings for production deployment
name: "drc_names_pipeline"
version: "1.0.0"
environment: "production"
debug: false
# Production processing settings (optimized for performance)
processing:
batch_size: 10_000
max_workers: 8
checkpoint_interval: 10
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
# Pipeline stages
stages:
- "data_cleaning"
- "feature_extraction"
- "llm_annotation"
- "data_splitting"
# Production LLM settings
llm:
model_name: "mistral:7b"
requests_per_minute: 360
requests_per_second: 3
retry_attempts: 3
timeout_seconds: 45
max_concurrent_requests: 4
enable_rate_limiting: true
# Production data settings
data:
split_evaluation: true
split_by_gender: true
evaluation_fraction: 0.2
random_seed: 42
# Production logging (less verbose)
logging:
level: "INFO"
console_logging: false # Disable console in production
file_logging: true
log_file: "pipeline.production.log"
max_log_size: 52428800 # 50MB
backup_count: 10
+70
View File
@@ -0,0 +1,70 @@
# DRC Names Processing Pipeline Configuration
# Main configuration file with default settings
name: "drc_names_pipeline"
version: "1.0.0"
description: "DRC Names NLP Processing Pipeline"
environment: "development"
debug: false
# Project directory structure
paths:
root_dir: "."
configs_dir: "./config"
data_dir: "./data/dataset"
models_dir: "./data/models"
outputs_dir: "./data/outputs"
logs_dir: "./data/logs"
checkpoints_dir: "./data/checkpoints"
# Pipeline stages
stages:
- "data_cleaning"
- "feature_extraction"
- "llm_annotation"
- "data_splitting"
# Data processing configuration
processing:
batch_size: 1_000
max_workers: 4
checkpoint_interval: 5
use_multiprocessing: false
encoding_options:
- "utf-8"
- "utf-16"
- "latin1"
chunk_size: 100_000
# LLM annotation settings
llm:
model_name: "mistral:7b"
requests_per_minute: 60
requests_per_second: 2
retry_attempts: 3
timeout_seconds: 600
max_concurrent_requests: 2
enable_rate_limiting: true
# Data handling configuration
data:
input_file: "names.csv"
output_files:
featured: "names_featured.csv"
evaluation: "names_evaluation.csv"
males: "names_males.csv"
females: "names_females.csv"
split_evaluation: true
split_by_gender: true
evaluation_fraction: 0.2
random_seed: 42
# Logging configuration
logging:
level: "INFO"
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
file_logging: true
console_logging: true
log_file: "pipeline.log"
max_log_size: 10485760 # 10MB
backup_count: 5
+128
View File
@@ -0,0 +1,128 @@
# Research Experiment Configuration Templates
# These configurations can be used as starting points for different types of experiments
# Baseline Experiments Configuration
baseline_experiments:
- name: "baseline_logistic_regression_fullname"
description: "Baseline logistic regression with full name"
model_type: "logistic_regression"
features: ["full_name"]
model_params:
ngram_range: [2, 5]
max_features: 10000
max_iter: 1000
tags: ["baseline", "fullname"]
- name: "baseline_logistic_regression_native"
description: "Logistic regression with native name only"
model_type: "logistic_regression"
features: ["native_name"]
model_params:
ngram_range: [2, 4]
max_features: 5000
tags: ["baseline", "native"]
- name: "baseline_rf_engineered"
description: "Random Forest with engineered features"
model_type: "random_forest"
features: ["name_length", "word_count", "province"]
model_params:
n_estimators: 100
max_depth: 10
tags: ["baseline", "engineered"]
# Feature Study Configurations
feature_studies:
- name: "native_vs_surname"
description: "Compare native name vs surname effectiveness"
experiments:
- model_type: "logistic_regression"
features: ["native_name"]
tags: ["feature_study", "native"]
- model_type: "logistic_regression"
features: ["surname"]
tags: ["feature_study", "surname"]
- name: "name_parts_analysis"
description: "Analyze effectiveness of different name parts"
experiments:
- features: ["first_word"]
tags: ["name_parts", "first"]
- features: ["last_word"]
tags: ["name_parts", "last"]
- features: ["name_beginnings"]
feature_params:
beginning_length: 3
tags: ["name_parts", "beginnings"]
- features: ["name_endings"]
feature_params:
ending_length: 3
tags: ["name_parts", "endings"]
# Province-Specific Studies
province_studies:
- name: "kinshasa_study"
description: "Gender prediction for Kinshasa province"
model_type: "logistic_regression"
features: ["full_name"]
train_data_filter:
province: "kinshasa"
tags: ["province_study", "kinshasa"]
- name: "cross_province_generalization"
description: "Train on one province, test on another"
experiments:
- train_filter: {"province": "kinshasa"}
test_filter: {"province": "bas-congo"}
tags: ["generalization", "kinshasa_to_bas-congo"]
# Model Comparison Studies
model_comparisons:
- name: "model_comparison_fullname"
description: "Compare different models with full name"
base_config:
features: ["full_name"]
tags: ["model_comparison"]
models:
- model_type: "logistic_regression"
model_params:
ngram_range: [2, 5]
- model_type: "random_forest"
# Note: RF will need different feature preparation
features: ["name_length", "word_count", "province"]
# Advanced Feature Combinations
advanced_features:
- name: "multi_feature_combination"
description: "Test various feature combinations"
experiments:
- features: ["full_name", "name_length"]
tags: ["combination", "name_plus_length"]
- features: ["native_name", "surname", "province"]
tags: ["combination", "semantic_features"]
- features: ["name_beginnings", "name_endings", "word_count"]
tags: ["combination", "structural_features"]
# Hyperparameter Studies
hyperparameter_studies:
- name: "ngram_range_study"
description: "Study effect of different n-gram ranges"
base_config:
model_type: "logistic_regression"
features: ["full_name"]
tags: ["hyperparameter", "ngram"]
variants:
- model_params: {"ngram_range": [1, 3]}
- model_params: {"ngram_range": [2, 4]}
- model_params: {"ngram_range": [2, 5]}
- model_params: {"ngram_range": [3, 6]}
# Data Size Studies
data_studies:
- name: "learning_curve_study"
description: "Study performance vs training data size"
base_config:
model_type: "logistic_regression"
features: ["full_name"]
tags: ["learning_curve"]
data_sizes: [0.1, 0.25, 0.5, 0.75, 1.0] # Fractions of training data to use
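A hedged sketch of turning these templates into runnable experiments; the file name `config/experiments.yaml` and the direct field mapping onto `ExperimentConfig` are assumptions:
```python
# Hypothetical loader for the baseline_experiments templates above.
import yaml

from research.experiment import ExperimentConfig
from research.experiment.feature_extractor import FeatureType
from research.experiment.experiment_runner import ExperimentRunner

with open("config/experiments.yaml", "r", encoding="utf-8") as f:  # assumed path
    templates = yaml.safe_load(f)

experiments = [
    ExperimentConfig(
        name=item["name"],
        description=item.get("description", ""),
        model_type=item["model_type"],
        features=[FeatureType(value) for value in item["features"]],
        model_params=item.get("model_params", {}),
        tags=item.get("tags", []),
    )
    for item in templates["baseline_experiments"]
]
ExperimentRunner().run_experiment_batch(experiments)
```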
+61
View File
@@ -0,0 +1,61 @@
import logging
from pathlib import Path
from typing import Optional, Union
from core.config.config_manager import ConfigManager
from core.config.logging_config import LoggingConfig
from core.config.pipeline_config import PipelineConfig
config_manager = ConfigManager()
def get_config() -> PipelineConfig:
"""Get the global configuration instance"""
return config_manager.get_config()
def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig:
"""Load configuration from specified path"""
if config_path:
return config_manager.load_config(Path(config_path))
return config_manager.get_config()
def setup_logging(config: PipelineConfig):
"""Setup logging based on configuration"""
# Create logs directory
log_dir = config.paths.logs_dir
log_dir.mkdir(parents=True, exist_ok=True)
# Setup logging configuration
log_level = getattr(logging, config.logging.level.upper(), logging.INFO)
# Create formatter
formatter = logging.Formatter(config.logging.format)
# Setup root logger
root_logger = logging.getLogger()
root_logger.setLevel(log_level)
# Clear existing handlers
root_logger.handlers.clear()
# Console handler
if config.logging.console_logging:
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
root_logger.addHandler(console_handler)
# File handler
if config.logging.file_logging:
from logging.handlers import RotatingFileHandler
log_file_path = log_dir / config.logging.log_file
file_handler = RotatingFileHandler(
log_file_path,
maxBytes=config.logging.max_log_size,
backupCount=config.logging.backup_count,
)
file_handler.setFormatter(formatter)
root_logger.addHandler(file_handler)
+145
View File
@@ -0,0 +1,145 @@
import json
import logging
from pathlib import Path
from typing import Optional, Union, Dict, Any
import yaml
from core.config.pipeline_config import PipelineConfig
from core.config.project_paths import ProjectPaths
class ConfigManager:
"""Centralized configuration management"""
def __init__(self, config_path: Optional[Union[str, Path]] = None):
self.config_path = config_path or self._find_config_file()
self._config: Optional[PipelineConfig] = None
self._setup_default_paths()
@classmethod
def _find_config_file(cls) -> Path:
"""Find configuration file in standard locations"""
possible_paths = [
Path.cwd() / "config" / "pipeline.yaml",
Path.cwd() / "config" / "pipeline.yml",
Path.cwd() / "pipeline.yaml",
Path(__file__).parent.parent.parent / "config" / "pipeline.yaml",
]
for path in possible_paths:
if path.exists():
return path
# Return default path if none found
return Path.cwd() / "config" / "pipeline.yaml"
def _setup_default_paths(self):
"""Setup default project paths"""
root_dir = Path(__file__).parent.parent.parent
self.default_paths = ProjectPaths(
root_dir=root_dir,
configs_dir=root_dir / "config",
data_dir=root_dir / "data" / "dataset",
models_dir=root_dir / "data" / "models",
outputs_dir=root_dir / "data" / "outputs",
logs_dir=root_dir / "data" / "logs",
checkpoints_dir=root_dir / "data" / "checkpoints",
)
def load_config(self, config_path: Optional[Path] = None) -> PipelineConfig:
"""Load configuration from file"""
if config_path:
self.config_path = config_path
if not self.config_path.exists():
logging.warning(f"Config file not found: {self.config_path}. Using defaults.")
return self._create_default_config()
try:
with open(self.config_path, "r") as f:
if self.config_path.suffix.lower() in [".yaml", ".yml"]:
config_data = yaml.safe_load(f)
else:
config_data = json.load(f)
# Ensure paths are properly set
if "paths" not in config_data:
config_data["paths"] = self.default_paths.dict()
self._config = PipelineConfig(**config_data)
return self._config
except Exception as e:
logging.error(f"Failed to load config from {self.config_path}: {e}")
return self._create_default_config()
def _create_default_config(self) -> PipelineConfig:
"""Create default configuration"""
return PipelineConfig(paths=self.default_paths)
def save_config(self, config: PipelineConfig, path: Optional[Path] = None):
"""Save configuration to file"""
save_path = path or self.config_path
save_path.parent.mkdir(parents=True, exist_ok=True)
config_dict = config.model_dump()
# Convert Path objects to strings for serialization
if "paths" in config_dict:
for key, value in config_dict["paths"].items():
if isinstance(value, Path):
config_dict["paths"][key] = str(value)
try:
with open(save_path, "w") as f:
if save_path.suffix.lower() in [".yaml", ".yml"]:
yaml.dump(config_dict, f, default_flow_style=False, indent=2)
else:
json.dump(config_dict, f, indent=2)
logging.info(f"Configuration saved to {save_path}")
except Exception as e:
logging.error(f"Failed to save config to {save_path}: {e}")
def get_config(self) -> PipelineConfig:
"""Get current configuration, loading if necessary"""
if self._config is None:
self._config = self.load_config()
return self._config
def update_config(self, updates: Dict[str, Any]):
"""Update configuration with new values"""
config = self.get_config()
# Deep update configuration
config_dict = config.model_dump()
self._deep_update(config_dict, updates)
self._config = PipelineConfig(**config_dict)
def _deep_update(self, base_dict: Dict, update_dict: Dict):
"""Recursively update nested dictionaries"""
for key, value in update_dict.items():
if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
self._deep_update(base_dict[key], value)
else:
base_dict[key] = value
def get_environment_config(self, env: str) -> PipelineConfig:
"""Load environment-specific configuration"""
env_config_path = self.config_path.parent / f"pipeline.{env}.yaml"
if env_config_path.exists():
base_config = self.load_config()
env_config = self.load_config(env_config_path)
# Merge configurations
base_dict = base_config.dict()
env_dict = env_config.dict()
self._deep_update(base_dict, env_dict)
return PipelineConfig(**base_dict)
return self.get_config()
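A short usage sketch for `ConfigManager`, using only the methods defined above:
```python
# Sketch: load the base configuration, apply an in-memory override,
# then resolve the production variant from pipeline.production.yaml.
from core.config.config_manager import ConfigManager

manager = ConfigManager()
config = manager.get_config()                       # lazily loads config/pipeline.yaml

manager.update_config({"processing": {"batch_size": 5_000}})
print(manager.get_config().processing.batch_size)   # 5000

prod = manager.get_environment_config("production")
print(prod.logging.log_file)                         # "pipeline.production.log"
```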
+22
View File
@@ -0,0 +1,22 @@
from typing import Dict
from pydantic import BaseModel, Field
class DataConfig(BaseModel):
"""Data handling configuration"""
input_file: str = "names.csv"
output_files: Dict[str, str] = Field(
default_factory=lambda: {
"featured": "names_featured.csv",
"evaluation": "names_evaluation.csv",
"males": "names_males.csv",
"females": "names_females.csv",
}
)
split_evaluation: bool = True
split_by_gender: bool = True
evaluation_fraction: float = 0.2
random_seed: int = 42
+13
View File
@@ -0,0 +1,13 @@
from pydantic import BaseModel
class LLMConfig(BaseModel):
"""LLM annotation configuration"""
model_name: str = "mistral:7b"
requests_per_minute: int = 60
requests_per_second: int = 2
retry_attempts: int = 3
timeout_seconds: int = 30
max_concurrent_requests: int = 2
enable_rate_limiting: bool = False
+13
View File
@@ -0,0 +1,13 @@
from pydantic import BaseModel
class LoggingConfig(BaseModel):
"""Logging configuration"""
level: str = "INFO"
format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
file_logging: bool = True
console_logging: bool = True
log_file: str = "pipeline.log"
max_log_size: int = 10 * 1024 * 1024 # 10MB
backup_count: int = 5
+29
View File
@@ -0,0 +1,29 @@
from pydantic import BaseModel
from core.config.logging_config import LoggingConfig
from core.config.data_config import DataConfig
from core.config.llm_config import LLMConfig
from core.config.processing_config import ProcessingConfig
from core.config.project_paths import ProjectPaths
class PipelineConfig(BaseModel):
"""Main pipeline configuration"""
name: str = "drc_names_pipeline"
version: str = "1.0.0"
description: str = "DRC Names NLP Processing Pipeline"
paths: ProjectPaths
stages: list[str] = []
processing: ProcessingConfig = ProcessingConfig()
llm: LLMConfig = LLMConfig()
data: DataConfig = DataConfig()
logging: LoggingConfig = LoggingConfig()
# Environment-specific settings
environment: str = "development"
debug: bool = True
class Config:
arbitrary_types_allowed = True
+14
View File
@@ -0,0 +1,14 @@
from pydantic import BaseModel, Field
class ProcessingConfig(BaseModel):
"""Data processing pipeline configuration"""
batch_size: int = 1000
max_workers: int = 4
checkpoint_interval: int = 5
use_multiprocessing: bool = False
encoding_options: list = Field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
chunk_size: int = 100_000
+23
View File
@@ -0,0 +1,23 @@
from pathlib import Path
from pydantic import BaseModel, field_validator
class ProjectPaths(BaseModel):
"""Project directory structure configuration"""
root_dir: Path
data_dir: Path
models_dir: Path
outputs_dir: Path
logs_dir: Path
configs_dir: Path
checkpoints_dir: Path
class Config:
arbitrary_types_allowed = True
@field_validator("*", mode="before")
@classmethod
def convert_to_path(cls, v):
return Path(v) if not isinstance(v, Path) else v
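A small sketch of the intended behaviour: plain strings passed to `ProjectPaths` are coerced to `Path` objects (pydantic also coerces `str` to `Path` from the annotations alone):
```python
# Sketch: string inputs become pathlib.Path values on the model.
from core.config.project_paths import ProjectPaths

paths = ProjectPaths(
    root_dir=".",
    configs_dir="./config",
    data_dir="./data/dataset",
    models_dir="./data/models",
    outputs_dir="./data/outputs",
    logs_dir="./data/logs",
    checkpoints_dir="./data/checkpoints",
)
print(type(paths.data_dir))  # <class 'pathlib.PosixPath'> on POSIX systems
```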
+57
View File
@@ -0,0 +1,57 @@
import logging
from contextlib import contextmanager
from pathlib import Path
from core.config import get_config, PipelineConfig
@contextmanager
def temporary_config_override(**overrides):
"""Context manager for temporarily overriding configuration"""
config = get_config()
original_values = {}
# Store original values and apply overrides
for key, value in overrides.items():
if hasattr(config, key):
original_values[key] = getattr(config, key)
setattr(config, key, value)
try:
yield config
finally:
# Restore original values
for key, value in original_values.items():
setattr(config, key, value)
def ensure_directories(config: PipelineConfig) -> None:
"""Ensure all required directories exist"""
directories = [
config.paths.data_dir,
config.paths.models_dir,
config.paths.outputs_dir,
config.paths.logs_dir,
config.paths.configs_dir,
config.paths.checkpoints_dir,
]
for directory in directories:
Path(directory).mkdir(parents=True, exist_ok=True)
logging.info("Ensured all required directories exist")
def get_data_file_path(filename: str, config: PipelineConfig) -> Path:
"""Get full path for a data file"""
return config.paths.data_dir / filename
def get_model_file_path(filename: str, config: PipelineConfig) -> Path:
"""Get full path for a model file"""
return config.paths.models_dir / filename
def get_output_file_path(filename: str, config: PipelineConfig) -> Path:
"""Get full path for an output file"""
return config.paths.outputs_dir / filename
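A usage sketch combining these helpers; the module path in the second import is an assumption, since the file name is not shown in the diff:
```python
# Sketch: prepare directories, resolve a data file, and temporarily flip a setting.
from core.config import get_config, setup_logging
from core.config.utils import (  # assumed module path
    ensure_directories,
    get_data_file_path,
    temporary_config_override,
)

config = get_config()
setup_logging(config)
ensure_directories(config)

print(get_data_file_path("names.csv", config))  # e.g. data/dataset/names.csv

with temporary_config_override(debug=True) as cfg:
    assert cfg.debug is True  # restored to its original value afterwards
```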
+62
View File
@@ -0,0 +1,62 @@
import logging
from pathlib import Path
from typing import Optional, Union, Iterator
import pandas as pd
from core.config.pipeline_config import PipelineConfig
class DataLoader:
"""Reusable data loading utilities"""
def __init__(self, config: PipelineConfig):
self.config = config
def load_csv_chunked(
self, filepath: Union[str, Path], chunk_size: Optional[int] = None
) -> Iterator[pd.DataFrame]:
"""Load CSV file in chunks for memory efficiency"""
chunk_size = chunk_size or self.config.processing.chunk_size
encodings = self.config.processing.encoding_options
filepath = Path(filepath)
for encoding in encodings:
try:
logging.info(f"Attempting to read {filepath} with encoding: {encoding}")
chunk_iter = pd.read_csv(
filepath, encoding=encoding, chunksize=chunk_size, on_bad_lines="skip"
)
for i, chunk in enumerate(chunk_iter):
logging.debug(f"Processing chunk {i+1}")
yield chunk
logging.info(f"Successfully read {filepath} with encoding: {encoding}")
return
except Exception as e:
logging.warning(f"Failed with encoding {encoding}: {e}")
continue
raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")
def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
"""Load complete CSV file into memory"""
chunks = list(self.load_csv_chunked(filepath))
return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
@classmethod
def save_csv(
cls, df: pd.DataFrame, filepath: Union[str, Path], create_dirs: bool = True
) -> None:
"""Save DataFrame to CSV with proper handling"""
filepath = Path(filepath)
if create_dirs:
filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath, index=False, encoding="utf-8")
logging.info(f"Saved {len(df)} rows to {filepath}")
+3
View File
@@ -0,0 +1,3 @@
+24
View File
@@ -0,0 +1,24 @@
from core.config.pipeline_config import PipelineConfig
class PromptManager:
"""Manage prompts for LLM operations"""
def __init__(self, config: PipelineConfig):
self.config = config
self.prompts_dir = self.config.paths.configs_dir / "prompts"
def load_prompt(self, prompt_name: str = "default") -> str:
"""Load a prompt template"""
prompt_file = self.prompts_dir / f"{prompt_name}.txt"
if not prompt_file.exists():
# Fallback to root directory
fallback_file = self.config.paths.root_dir / "prompt.txt"
if fallback_file.exists():
prompt_file = fallback_file
else:
raise FileNotFoundError(f"Prompt file not found: {prompt_file}")
with open(prompt_file, "r", encoding="utf-8") as f:
return f.read().strip()
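Usage sketch for `PromptManager` (import path assumed):
```python
# Sketch: load the default annotation prompt, falling back to ./prompt.txt
# when config/prompts/default.txt is missing, as implemented above.
from core.config import get_config
from core.prompt_manager import PromptManager  # assumed import path

prompt = PromptManager(get_config()).load_prompt("default")
print(prompt[:80])
```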
+56
View File
@@ -0,0 +1,56 @@
import threading
import time
from dataclasses import dataclass
from queue import Queue
@dataclass
class RateLimitConfig:
"""Configuration for rate limiting LLM requests"""
requests_per_minute: int = 60
requests_per_second: int = 2
burst_limit: int = 5
class RateLimiter:
"""Thread-safe rate limiter for LLM requests"""
def __init__(self, config: RateLimitConfig):
self.config = config
self.request_times = Queue()
self.lock = threading.Lock()
self.last_request_time = 0
def wait_if_needed(self):
"""Wait if necessary to respect rate limits"""
with self.lock:
current_time = time.time()
# Check requests per second limit
time_since_last = current_time - self.last_request_time
min_interval = 1.0 / self.config.requests_per_second
if time_since_last < min_interval:
sleep_time = min_interval - time_since_last
time.sleep(sleep_time)
current_time = time.time()
# Clean old request times (older than 1 minute)
while not self.request_times.empty():
if current_time - self.request_times.queue[0] > 60:
self.request_times.get()
else:
break
# Check requests per minute limit
if self.request_times.qsize() >= self.config.requests_per_minute:
oldest_request = self.request_times.queue[0]
wait_time = 60 - (current_time - oldest_request)
if wait_time > 0:
time.sleep(wait_time)
current_time = time.time()
# Record this request
self.request_times.put(current_time)
self.last_request_time = current_time
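Usage sketch for `RateLimiter`; the annotation call is a placeholder, not the real Ollama request:
```python
# Sketch: throttle a batch of LLM annotation calls with the limiter above.
from core.rate_limiter import RateLimiter, RateLimitConfig  # assumed import path

limiter = RateLimiter(RateLimitConfig(requests_per_minute=60, requests_per_second=2))

def annotate_name(name: str) -> str:
    return f"annotated:{name}"  # placeholder for the actual LLM call

for name in ["Tshisekedi", "Ilunga Ngandu", "Musenga wa Musenga"]:
    limiter.wait_if_needed()  # blocks until the next request slot is available
    print(annotate_name(name))
```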
+64 -129
View File
@@ -1,23 +1,44 @@
import csv
import io
import json
import logging
import os
import pickle
from typing import List, Dict
from typing import Optional, Dict, Tuple
# Paths
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_DIR = os.path.join(ROOT_DIR, 'dataset')
import pandas as pd
MODELS_DIR = os.path.join(ROOT_DIR, 'models')
GENDER_MODELS_DIR = os.path.join(MODELS_DIR, 'gender')
GENDER_RESULT_DIR = os.path.join(ROOT_DIR, 'gender', 'results')
NER_MODELS_DIR = os.path.join(MODELS_DIR, 'ner')
NER_RESULT_DIR = os.path.join(ROOT_DIR, 'ner', 'results')
class RegionMapper:
"""Reusable region mapping utilities"""
REGION_MAPPING = {
def __init__(self, mapping: Optional[Dict] = None):
self.mapping = mapping or REGION_MAPPING
def map_region_to_province(self, region: str) -> str:
"""Map a region to its province"""
region_lower = str(region).lower().strip()
return self.mapping.get(region_lower, ("AUTRES", "AUTRES"))[1].lower()
def map_regions_vectorized(self, regions: pd.Series) -> pd.Series:
"""Vectorized region to province mapping"""
return regions.str.lower().map(
lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
)
@staticmethod
def get_provinces():
return [
"kinshasa",
"bas-congo",
"bandundu",
"katanga",
"equateur",
"province-orientale",
"maniema",
"nord-kivu",
"sud-kivu",
"kasai-occidental",
"kasai-oriental",
]
# DRC Region to Province Mapping
REGION_MAPPING: Dict[str, Tuple[str, str]] = {
# Kinshasa
"kinshasa": ("KINSHASA", "KINSHASA"),
"kinshasa-centre": ("KINSHASA", "KINSHASA"),
@@ -28,7 +49,6 @@ REGION_MAPPING = {
"kinshasa-ouest": ("KINSHASA", "KINSHASA"),
"kinshasa-plateau": ("KINSHASA", "KINSHASA"),
"kinshasa-tshangu": ("KINSHASA", "KINSHASA"),
# Bas-Congo → Kongo-Central → BAS-CONGO
"bas-congo": ("KONGO-CENTRAL", "BAS-CONGO"),
"bas-congo-1": ("KONGO-CENTRAL", "BAS-CONGO"),
@@ -37,7 +57,6 @@ REGION_MAPPING = {
"kongo-central-1": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-2": ("KONGO-CENTRAL", "BAS-CONGO"),
"kongo-central-3": ("KONGO-CENTRAL", "BAS-CONGO"),
# Kwilu, Kwango, Mai-Ndombe → BANDUNDU
"bandundu": ("BANDUNDU", "BANDUNDU"),
"bandundu-1": ("BANDUNDU", "BANDUNDU"),
@@ -54,7 +73,6 @@ REGION_MAPPING = {
"mai-ndombe-1": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
"mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
# Katanga → HAUT-KATANGA, HAUT-LOMAMI, LUALABA, TANGANYIKA
"haut-katanga": ("HAUT-KATANGA", "KATANGA"),
"haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
@@ -69,7 +87,6 @@ REGION_MAPPING = {
"tanganyika": ("TANGANYIKA", "KATANGA"),
"tanganyika-1": ("TANGANYIKA", "KATANGA"),
"tanganyika-2": ("TANGANYIKA", "KATANGA"),
# Equateur → MONGALA, NORD-UBANGI, SUD-UBANGI, TSHUAPA
"equateur": ("EQUATEUR", "EQUATEUR"),
"equateur-1": ("EQUATEUR", "EQUATEUR"),
@@ -89,7 +106,6 @@ REGION_MAPPING = {
"tshuapa": ("TSHUAPA", "EQUATEUR"),
"tshuapa-1": ("TSHUAPA", "EQUATEUR"),
"tshuapa-2": ("TSHUAPA", "EQUATEUR"),
# Province-Orientale
"province-orientale": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
"province-orientale-1": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
@@ -100,128 +116,47 @@ REGION_MAPPING = {
"haut-uele-1": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
"haut-uele-2": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
"bas-uele": ("BAS-UELE", "PROVINCE-ORIENTALE"),
"bas-uele-1": ("BAS-UELE", "PROVINCE-ORIENTALE"),
"bas-uele-2": ("BAS-UELE", "PROVINCE-ORIENTALE"),
"ituri": ("ITURI", "PROVINCE-ORIENTALE"),
"ituri-1": ("ITURI", "PROVINCE-ORIENTALE"),
"ituri-2": ("ITURI", "PROVINCE-ORIENTALE"),
"ituri-3": ("ITURI", "PROVINCE-ORIENTALE"),
"tshopo": ("TSHOPO", "PROVINCE-ORIENTALE"),
"tshopo-1": ("TSHOPO", "PROVINCE-ORIENTALE"),
"tshopo-2": ("TSHOPO", "PROVINCE-ORIENTALE"),
# Kasaï
"kasai-1": ("KASAÏ", "KASAÏ-OCCIDENTAL"),
"kasai-2": ("KASAÏ", "KASAÏ-OCCIDENTAL"),
"kasai-ce": ("KASAÏ", "KASAÏ-OCCIDENTAL"),
"kasai-central": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
"kasai-central-1": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
"kasai-central-2": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
"kasai-occidental": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
"kasai-occidental-1": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
"kasai-occidental-2": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
"kasai-oriental": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
"kasai-oriental-1": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
"kasai-oriental-2": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
"kasai-oriental-3": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
"kasai-orientale": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
"lomami": ("LOMAMI", "KASAÏ-ORIENTAL"),
"lomami-1": ("LOMAMI", "KASAÏ-ORIENTAL"),
"lomami-2": ("LOMAMI", "KASAÏ-ORIENTAL"),
"sankuru": ("SANKURU", "KASAÏ-ORIENTAL"),
"sankuru-1": ("SANKURU", "KASAÏ-ORIENTAL"),
"sankuru-2": ("SANKURU", "KASAÏ-ORIENTAL"),
# Maniema
"maniema": ("MANIEMA", "MANIEMA"),
"maniema-1": ("MANIEMA", "MANIEMA"),
"maniema-2": ("MANIEMA", "MANIEMA"),
# Nord-Kivu
"nord-kivu": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-1": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-2": ("NORD-KIVU", "NORD-KIVU"),
"nord-kivu-3": ("NORD-KIVU", "NORD-KIVU"),
# Sud-Kivu
"sud-kivu": ("SUD-KIVU", "SUD-KIVU"),
"sud-kivu-1": ("SUD-KIVU", "SUD-KIVU"),
"sud-kivu-2": ("SUD-KIVU", "SUD-KIVU"),
"sud-kivu-3": ("SUD-KIVU", "SUD-KIVU"),
# Maniema
"maniema": ("MANIEMA", "MANIEMA"),
"maniema-1": ("MANIEMA", "MANIEMA"),
"maniema-2": ("MANIEMA", "MANIEMA"),
# Divers
"hors-frontieres": ("AUTRES", "AUTRES"),
"lukaya": ("AUTRES", "AUTRES"),
"recours": ("AUTRES", "AUTRES"),
"junacyc": ("AUTRES", "AUTRES"),
"junacyp": ("AUTRES", "AUTRES"),
"junacyc-lualaba-corrige": ("LUALABA", "KATANGA"),
"options-techniques-toutes-les-provinces-et-hors-frontieres": ("AUTRES", "AUTRES"),
"region": ("AUTRES", "AUTRES"),
# Kasai-Occidental → KASAI, KASAI-CENTRAL
"kasai-occidental": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai-occidental-1": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai-occidental-2": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
"kasai": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-1": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-2": ("KASAI", "KASAI-OCCIDENTAL"),
"kasai-central": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-central-1": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
"kasai-central-2": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
# Kasai-Oriental → LOMAMI, SANKURU
"kasai-oriental": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-1": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-2": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"kasai-oriental-3": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
"lomami": ("LOMAMI", "KASAI-ORIENTAL"),
"lomami-1": ("LOMAMI", "KASAI-ORIENTAL"),
"lomami-2": ("LOMAMI", "KASAI-ORIENTAL"),
"sankuru": ("SANKURU", "KASAI-ORIENTAL"),
"sankuru-1": ("SANKURU", "KASAI-ORIENTAL"),
"sankuru-2": ("SANKURU", "KASAI-ORIENTAL"),
}
logging.basicConfig(level=logging.INFO, format=">> %(message)s")
def load_json_dataset(path: str) -> list:
logging.info(f"Loading JSON dataset from {path}")
with open(os.path.join(DATA_DIR, path), "r", encoding="utf-8") as f:
return json.load(f)
def save_csv_dataset(data: list, path: str) -> None:
logging.info(f"Saving CSV dataset to {path}")
with open(os.path.join(DATA_DIR, path), "w", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=data[0].keys())
writer.writeheader()
writer.writerows(data)
def load_csv_dataset(path: str, limit: int = None, balanced: bool = False) -> List[Dict[str, str]]:
logging.info(f"Loading CSV dataset from {path}")
file_path = os.path.join(DATA_DIR, path)
with open(file_path, "r", encoding="utf-8", errors="replace", newline="") as f:
raw_text = f.read().replace('\x00', '')
reader = csv.DictReader(io.StringIO(raw_text))
logging.info(f"Detected fieldnames: {reader.fieldnames}")
if balanced:
by_sex = {'m': [], 'f': []}
for row in reader:
sex = row.get("sex", "").lower()
if sex in by_sex:
by_sex[sex].append(row)
min_len = min(len(by_sex['m']), len(by_sex['f']))
if limit:
min_len = min(min_len, limit // 2)
data = by_sex['m'][:min_len] + by_sex['f'][:min_len]
else:
data = []
for i, row in enumerate(reader):
data.append(row)
if limit and i + 1 >= limit:
break
logging.info("Successfully loaded with UTF-8 encoding")
return data
def save_json_dataset(data: list, path: str) -> None:
logging.info(f"Saving JSON dataset to {path}")
with open(os.path.join(DATA_DIR, path), "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, separators=(',', ':'))
def save_pickle(obj, path):
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, "wb") as f:
pickle.dump(obj, f)
def load_pickle(path: str):
with open(path, "rb") as f:
return pickle.load(f)
def load_prompt() -> str:
with open(os.path.join(ROOT_DIR, 'prompt.txt'), 'r') as f:
return f.read()
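For reference, a minimal sketch of how these helpers compose; the file names and the limit are illustrative and not part of the commit:
# Illustrative only: load a sex-balanced sample and persist a JSON copy of it.
rows = load_csv_dataset("names_featured.csv", limit=1000, balanced=True)  # at most 500 'm' + 500 'f'
save_json_dataset(rows, "names_sample.json")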
+41
View File
@@ -0,0 +1,41 @@
import json
import logging
from typing import Dict, Any
from core.config.pipeline_config import PipelineConfig
class StateManager:
"""Manage pipeline state and checkpoints"""
def __init__(self, config: PipelineConfig):
self.config = config
self.checkpoints_dir = self.config.paths.checkpoints_dir
def save_state(self, state: Dict[str, Any], state_name: str) -> None:
"""Save pipeline state"""
self.checkpoints_dir.mkdir(parents=True, exist_ok=True)
state_file = self.checkpoints_dir / f"{state_name}.json"
with open(state_file, "w") as f:
json.dump(state, f, indent=2, default=str)
logging.debug(f"Saved state to {state_file}")
def load_state(self, state_name: str) -> Dict[str, Any]:
"""Load pipeline state"""
state_file = self.checkpoints_dir / f"{state_name}.json"
if not state_file.exists():
return {}
with open(state_file, "r") as f:
return json.load(f)
def clear_state(self, state_name: str) -> None:
"""Clear pipeline state"""
state_file = self.checkpoints_dir / f"{state_name}.json"
if state_file.exists():
state_file.unlink()
logging.info(f"Cleared state: {state_name}")
+38
View File
@@ -0,0 +1,38 @@
from typing import Optional, Dict
import pandas as pd
class TextCleaner:
"""Reusable text cleaning utilities"""
def __init__(self, patterns: Optional[Dict[str, str]] = None):
self.patterns = patterns or {
"null_bytes": "\x00",
"non_breaking_spaces": "\u00a0",
"multiple_spaces": r" +",
"extra_whitespace": r"\s+",
}
def clean_text_series(self, series: pd.Series) -> pd.Series:
"""Clean a pandas Series of text data"""
cleaned = series.astype(str)
# Apply cleaning patterns (the whitespace patterns are regexes, the rest are literal characters)
for pattern_name, pattern in self.patterns.items():
if pattern_name in ("multiple_spaces", "extra_whitespace"):
cleaned = cleaned.str.replace(pattern, " ", regex=True)
else:
cleaned = cleaned.str.replace(pattern, " ", regex=False)
return cleaned.str.strip().str.lower()
def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
"""Clean all text columns in a DataFrame"""
df = df.copy()
text_columns = df.select_dtypes(include="object").columns
for col in text_columns:
df[col] = self.clean_text_series(df[col])
return df
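A small usage sketch; the sample rows are made up to show what the cleaner removes:
import pandas as pd

df = pd.DataFrame({"name": ["MUKENDI\x00 Jean", "Ilunga\u00a0  Marie"], "year": [2001, 1997]})
cleaner = TextCleaner()
df = cleaner.clean_dataframe_text_columns(df)   # only object (text) columns are touched
print(df["name"].tolist())                      # ['mukendi jean', 'ilunga marie']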
Executable
+154
View File
@@ -0,0 +1,154 @@
#!.venv/bin/python3
import sys
import argparse
import logging
from pathlib import Path
from typing import Optional
from core.utils.data_loader import DataLoader
from core.config import ConfigManager, setup_logging
from core.utils import ensure_directories, get_data_file_path
from processing.pipeline import Pipeline
from processing.batch.batch_config import BatchConfig
from processing.steps.data_splitting_step import DataSplittingStep
from processing.steps.llm_annotation_step import LLMAnnotationStep
from processing.steps.feature_extraction_step import FeatureExtractionStep
from processing.steps.data_cleaning_step import DataCleaningStep
def create_pipeline_from_config(config_path: Optional[Path] = None) -> Pipeline:
"""Create pipeline from configuration file"""
config = ConfigManager(config_path).load_config()
# Setup logging
setup_logging(config)
ensure_directories(config)
batch_config = BatchConfig(
batch_size=config.processing.batch_size,
max_workers=config.processing.max_workers,
checkpoint_interval=config.processing.checkpoint_interval,
use_multiprocessing=config.processing.use_multiprocessing,
)
# Add steps based on configuration
pipeline = Pipeline(batch_config)
steps = [
DataCleaningStep(config),
FeatureExtractionStep(config),
LLMAnnotationStep(config),
DataSplittingStep(config),
]
for stage in config.stages:
for step in steps:
if step.name == stage:
pipeline.add_step(step)
return pipeline
def run_pipeline(config_path: Optional[Path] = None, resume: bool = False) -> int:
"""Run the complete pipeline"""
try:
config = ConfigManager(config_path).load_config()
logging.info(f"Starting pipeline: {config.name} v{config.version}")
logging.info(f"Environment: {config.environment}")
# Load input data
input_file_path = get_data_file_path(config.data.input_file, config)
if not input_file_path.exists():
logging.error(f"Input file not found: {input_file_path}")
return 1
data_loader = DataLoader(config)
logging.info(f"Loading data from {input_file_path}")
df = data_loader.load_csv_complete(input_file_path)
logging.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
# Create and run pipeline
pipeline = create_pipeline_from_config(config_path)
logging.info("Starting pipeline execution")
result_df = pipeline.run(df)
# Save results using the splitting step
splitting_step = pipeline.steps[-1]
if isinstance(splitting_step, DataSplittingStep):
splitting_step.save_splits(result_df)
# Show completion statistics
progress = pipeline.get_progress()
logging.info("=== Pipeline Completion Summary ===")
for step_name, stats in progress.items():
logging.info(
f"{step_name}: {stats['completion_percentage']:.1f}% "
f"({stats['processed_batches']}/{stats['total_batches']} batches)"
)
if stats["failed_batches"] > 0:
logging.warning(f" {stats['failed_batches']} failed batches")
logging.info("Pipeline completed successfully")
return 0
except Exception as e:
logging.error(f"Pipeline failed: {e}", exc_info=True)
return 1
def main():
"""Main entry point with minimal command-line interface"""
parser = argparse.ArgumentParser(
description="DRC Names Processing Pipeline",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Configuration File Examples:
config/pipeline.yaml - Main configuration
config/pipeline.development.yaml - Development environment
config/pipeline.production.yaml - Production environment
Usage Examples:
python processing/main.py # Use default config
python processing/main.py --config config/pipeline.yaml # Use specific config
python processing/main.py --env development # Use environment config
python processing/main.py --resume # Resume from checkpoints
""",
)
parser.add_argument("--config", type=Path, help="Path to configuration file")
parser.add_argument(
"--env", type=str, help="Environment name (loads config/pipeline.{env}.yaml)"
)
parser.add_argument(
"--resume", action="store_true", help="Resume pipeline from existing checkpoints"
)
parser.add_argument(
"--validate-config", action="store_true", help="Validate configuration file and exit"
)
args = parser.parse_args()
# Determine config path
config_path = None
if args.config:
config_path = args.config
elif args.env:
config_path = Path("config") / f"pipeline.{args.env}.yaml"
if args.validate_config:
try:
config = ConfigManager(config_path).load_config()
print(f"Configuration is valid: {config.name} v{config.version}")
return 0
except Exception as e:
print(f"Configuration validation failed: {e}")
return 1
# Run pipeline
return run_pipeline(config_path, args.resume)
if __name__ == "__main__":
exit_code = main()
sys.exit(exit_code)
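The same entry point can be driven programmatically; a sketch that assumes config/pipeline.yaml exists and lists the four stages above:
from pathlib import Path

# Hypothetical programmatic run, equivalent to `python processing/main.py --config config/pipeline.yaml`.
exit_code = run_pipeline(Path("config/pipeline.yaml"))
if exit_code != 0:
    raise SystemExit("pipeline failed; see the log for the failing step")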
Executable
+157
View File
@@ -0,0 +1,157 @@
#!.venv/bin/python3
import argparse
import sys
from core.config.config_manager import ConfigManager
from processing.monitoring.pipeline_monitor import PipelineMonitor
from processing.monitoring.data_analyzer import DatasetAnalyzer
def main():
parser = argparse.ArgumentParser(
description="Monitor and manage the DRC names processing pipeline"
)
subparsers = parser.add_subparsers(dest="command", help="Available commands")
# Status command
status_parser = subparsers.add_parser("status", help="Show pipeline status")
status_parser.add_argument(
"--detailed",
action="store_true",
help="Show detailed information including failed batch IDs",
)
# Clean command
clean_parser = subparsers.add_parser("clean", help="Clean checkpoint files")
clean_parser.add_argument(
"--step",
type=str,
choices=["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"],
help="Clean specific step (default: all)",
)
clean_parser.add_argument(
"--keep-last", type=int, default=1, help="Number of recent checkpoints to keep (default: 1)"
)
clean_parser.add_argument("--force", action="store_true", help="Clean without confirmation")
# Reset command
reset_parser = subparsers.add_parser("reset", help="Reset pipeline step")
reset_parser.add_argument(
"step",
type=str,
choices=["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"],
help="Step to reset",
)
reset_parser.add_argument("--force", action="store_true", help="Reset without confirmation")
# Analyze command
analyze_parser = subparsers.add_parser("analyze", help="Analyze dataset")
analyze_parser.add_argument(
"--file",
type=str,
default="names_featured.csv",
help="Dataset file to analyze (default: names_featured.csv)",
)
# Checkpoint info command
subparsers.add_parser("info", help="Show checkpoint information")
args = parser.parse_args()
if not args.command:
parser.print_help()
return 1
monitor = PipelineMonitor()
if args.command == "status":
monitor.print_status(detailed=args.detailed)
elif args.command == "clean":
checkpoint_info = monitor.count_checkpoint_files()
print(f"Current checkpoint storage: {checkpoint_info['total_size_mb']:.1f} MB")
if not args.force:
response = input("Are you sure you want to clean checkpoints? (y/N): ")
if response.lower() != "y":
print("Cancelled")
return 0
if args.step:
monitor.clean_step_checkpoints(args.step, args.keep_last)
else:
for step in monitor.steps:
monitor.clean_step_checkpoints(step, args.keep_last)
print("Checkpoint cleaning completed")
elif args.command == "reset":
if not args.force:
response = input(
f"Are you sure you want to reset {args.step}? This will delete all checkpoints. (y/N): "
)
if response.lower() != "y":
print("Cancelled")
return 0
monitor.reset_step(args.step)
print(f"Reset completed for {args.step}")
elif args.command == "analyze":
# Use configured data directory instead of hardcoded DATA_DIR
data_dir = ConfigManager().default_paths.data_dir
filepath = data_dir / args.file
if not filepath.exists():
print(f"File not found: {filepath}")
return 1
analyzer = DatasetAnalyzer(str(filepath))
if not analyzer.load_data():
return 1
completion_stats = analyzer.analyze_completion()
quality_stats = analyzer.analyze_quality()
print(f"\n=== Dataset Analysis: {args.file} ===")
print(f"Total rows: {completion_stats['total_rows']:,}")
print(
f"Annotated: {completion_stats['annotated_rows']:,} ({completion_stats['annotation_percentage']:.1f}%)"
)
print(f"Unannotated: {completion_stats['unannotated_rows']:,}")
print(
f"Complete names: {completion_stats['complete_names']:,} ({completion_stats['completeness_percentage']:.1f}%)"
)
if "name_length" in quality_stats:
length_stats = quality_stats["name_length"]
print(f"\nName length statistics:")
print(f" Average: {length_stats['mean']:.1f} characters")
print(f" Range: {length_stats['min']}-{length_stats['max']} characters")
if "word_distribution" in quality_stats:
print(f"\nWord count distribution:")
for words, count in quality_stats["word_distribution"].items():
print(f" {words} words: {count:,} names")
elif args.command == "info":
checkpoint_info = monitor.count_checkpoint_files()
print(f"\n=== Checkpoint Information ===")
print(f"Total storage: {checkpoint_info['total_size_mb']:.1f} MB")
print()
for step in monitor.steps:
step_info = checkpoint_info[step]
print(f"{step.replace('_', ' ').title()}:")
print(f" Files: {step_info['files']}")
print(f" Size: {step_info['size_mb']:.1f} MB")
print()
return 0
if __name__ == "__main__":
exit_code = main()
sys.exit(exit_code)
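Typical invocations of this monitor might look like the following; the processing/monitor.py path is an assumption, since the script's filename is not visible in this diff:
python processing/monitor.py status --detailed           # per-step progress plus failed batch IDs
python processing/monitor.py clean --keep-last 2         # prune checkpoints, keep the 2 newest per step
python processing/monitor.py reset llm_annotation --force
python processing/monitor.py analyze --file names_featured.csv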
-115
View File
@@ -1,115 +0,0 @@
import argparse
import os
import tensorflow as tf
from sklearn.metrics import (
accuracy_score, precision_recall_fscore_support, confusion_matrix
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_json_dataset, load_pickle, GENDER_RESULT_DIR
def evaluate_logreg(df, threshold):
"""
Evaluates a logistic regression model with the given DataFrame and threshold. The function loads
a pre-trained model and label encoder, transforms the input data into the required format, and
performs predictions. It returns the true labels, predicted labels, predicted probabilities, and
the encoder class labels.
"""
model = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
X = df["name"].tolist()
y_true = encoder.transform(df["sex"])
proba = model.predict_proba(X)
y_pred = (proba[:, 1] >= threshold).astype(int)
return y_true, y_pred, proba[:, 1], encoder.classes_
def evaluate_lstm(df, threshold, max_len=6):
"""
Evaluates the predictions of a pre-trained BiLSTM model on the given dataset and
returns the true labels, predicted labels, prediction probabilities, and class names.
"""
model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl"))
sequences = tokenizer.texts_to_sequences(df["name"])
X = pad_sequences(sequences, maxlen=max_len, padding="post")
y_true = encoder.transform(df["sex"])
proba = model.predict(X)
y_pred = (proba[:, 1] >= threshold).astype(int)
return y_true, y_pred, proba[:, 1], encoder.classes_
def evaluate_transformer(df, threshold, max_len=6):
"""
Evaluates the transformer model for gender prediction. The function loads a pre-trained
transformer model, tokenizer, and label encoder. It processes the input dataframe by
tokenizing and padding the "name" column and encodes the "sex" column to numerical format.
The function then predicts the probabilities for the given names using the transformer model
and generates predictions based on the specified threshold.
"""
model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl"))
sequences = tokenizer.texts_to_sequences(df["name"])
X = pad_sequences(sequences, maxlen=max_len, padding="post")
y_true = encoder.transform(df["sex"])
proba = model.predict(X)
y_pred = (proba[:, 1] >= threshold).astype(int)
return y_true, y_pred, proba[:, 1], encoder.classes_
def compute_metrics(y_true, y_pred, y_proba, class_names):
"""
Computes classification metrics for given true and predicted labels, along with
class probabilities and class names. The function calculates accuracy, precision,
recall, F1 score, and confusion matrix for evaluating model performance.
"""
acc = accuracy_score(y_true, y_pred)
pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
cm = confusion_matrix(y_true, y_pred).tolist()
return {
"accuracy": acc,
"precision": pr,
"recall": rc,
"f1": f1,
"confusion_matrix": {
"labels": class_names.tolist(),
"matrix": cm
}
}
def main():
parser = argparse.ArgumentParser(description="Evaluate gender prediction model")
parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
parser.add_argument("--dataset", default="names_evaluation.csv", help="Path to the dataset CSV file")
parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
args = parser.parse_args()
df = load_csv_dataset(args.dataset, args.size, args.balanced)
model_funcs = {
"logreg": evaluate_logreg,
"lstm": evaluate_lstm,
"transformer": evaluate_transformer,
}
try:
y_true, y_pred, y_proba, classes = model_funcs[args.model](df, args.threshold)
except KeyError:
raise ValueError(f"Unknown model: {args.model}")
results = compute_metrics(y_true, y_pred, y_proba, classes)
save_json_dataset(results, os.path.join(GENDER_RESULT_DIR, f'{args.model}_eval'))
if __name__ == "__main__":
main()
-80
View File
@@ -1,80 +0,0 @@
import argparse
from dataclasses import dataclass
from typing import Optional
from sklearn.metrics import (
accuracy_score, precision_recall_fscore_support,
classification_report, confusion_matrix
)
from misc import logging
def evaluate_proba(y_true, y_proba, threshold, class_names):
y_pred = (y_proba[:, 1] >= threshold).astype(int)
acc = accuracy_score(y_true, y_pred)
pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
cm = confusion_matrix(y_true, y_pred)
logging.info(f"Accuracy: {acc:.4f} | Precision: {pr:.4f} | Recall: {rc:.4f} | F1: {f1:.4f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
@dataclass
class BaseConfig:
"""
Represents the base configuration for a dataset and its associated parameters.
This class serves as a foundational configuration handler to encapsulate
dataset-related parameters and options. It allows customization of dataset
behavior, including threshold values, size, cross-validation settings, and
whether to save derived configurations. It can also manage configurations
for balanced datasets if necessary.
"""
dataset_path: str = "names_featured.csv"
size: Optional[int] = None
threshold: float = 0.5
cv: Optional[int] = None
save: bool = False
balanced: bool = False
epochs: int = 10
test_size: float = 0.2
random_state: int = 42
def load_config(description: str) -> BaseConfig:
"""
Parses command-line arguments and loads the configuration for the logistic regression model.
This function sets up an argument parser for various command-line options including
the dataset path, dataset size, dataset balancing, classification threshold,
cross-validation folds, and saving the model and its associated artifacts. Once parsed,
it transfers the configurations to a ``BaseConfig`` instance and returns it.
"""
parser = argparse.ArgumentParser(description)
parser.add_argument("--dataset", type=str, default="names_featured.csv", help="Path to the dataset file")
parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
parser.add_argument("--save", action="store_true", help="Save the model and artifacts after training")
parser.add_argument("--epochs", type=int, default=10, help="Number of epochs for training")
parser.add_argument("--test_size", type=float, default=0.2, help="Proportion of the dataset to include in the test split")
parser.add_argument("--random_state", type=int, default=42, help="Random seed for reproducibility")
args = parser.parse_args()
return BaseConfig(
dataset_path=args.dataset,
size=args.size,
threshold=args.threshold,
cv=args.cv,
save=args.save,
balanced=args.balanced,
epochs=args.epochs,
test_size=args.test_size,
random_state=args.random_state
)
-123
View File
@@ -1,123 +0,0 @@
import os
from dataclasses import dataclass
from typing import Tuple
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
accuracy_score, classification_report, confusion_matrix,
precision_recall_fscore_support
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import BaseConfig, load_config, logging
@dataclass
class Config(BaseConfig):
ngram_range: Tuple[int, int] = (2, 5)
max_iter: int = 1000
def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
"""
Encode the labels using a LabelEncoder. This function takes a pandas Series of labels,
fits a LabelEncoder to the labels, and transforms them into a numerical format suitable
for model training. The transformed labels and the fitted encoder are returned.
"""
logging.info("Encoding labels")
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
return y_encoded, encoder
def build_model(cfg: Config) -> Pipeline:
"""
Build a logistic regression model pipeline with a character-level CountVectorizer.
The pipeline consists of a CountVectorizer that transforms the input text into
character n-grams, followed by a Logistic Regression classifier. The n-gram range
and maximum iterations for the logistic regression can be configured through the
provided configuration object.
"""
return make_pipeline(
CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range),
LogisticRegression(max_iter=cfg.max_iter)
)
def evaluate_proba(y_true, y_proba, threshold: float, class_names):
"""
Evaluates the performance of a classification model using a specified threshold
for predicted probabilities. Computes metrics such as accuracy, precision,
recall, F1-score, and the confusion matrix. Also generates a classification
report with detailed metrics for each class.
Logs the evaluation metrics at the specified threshold and prints the confusion
matrix and classification report.
"""
logging.info(f"Evaluating at threshold = {threshold}")
y_pred = (y_proba[:, 1] >= threshold).astype(int)
acc = accuracy_score(y_true, y_pred)
pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
cm = confusion_matrix(y_true, y_pred)
logging.info(f"Accuracy: {acc:.4f}")
logging.info(f"Precision: {pr:.4f}, Recall: {rc:.4f}, F1-score: {f1:.4f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
def cross_validate(cfg: Config, X, y) -> None:
"""
Performs k-fold cross-validation on the provided dataset using the configuration and
logs the results including individual fold scores, mean accuracy, and the standard
deviation of the scores.
"""
logging.info(f"Running {cfg.cv}-fold cross-validation")
pipeline = build_model(cfg)
scores = cross_val_score(pipeline, X, y, cv=StratifiedKFold(n_splits=cfg.cv), scoring="accuracy")
logging.info(f"Cross-validation scores: {scores}")
logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")
def save_artifacts(model, encoder):
"""
Saves the trained model and label encoder artifacts to the specified directory.
"""
save_pickle(model, os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))
logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
def main():
cfg = Config(**vars(load_config("logistic regression model")))
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
X_raw, y_raw = df["name"], df["sex"]
y_encoded, encoder = encode_labels(y_raw)
if cfg.cv:
cross_validate(cfg, X_raw, y_encoded)
return
X_train, X_test, y_train, y_test = train_test_split(
X_raw, y_encoded, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y_encoded
)
model = build_model(cfg)
model.fit(X_train, y_train)
y_proba = model.predict_proba(X_test)
evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
if cfg.save:
save_artifacts(model, encoder)
if __name__ == "__main__":
main()
-144
View File
@@ -1,144 +0,0 @@
import os
from dataclasses import dataclass
from typing import Tuple
import numpy as np
import pandas as pd
from sklearn.metrics import (
accuracy_score
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ProgbarLogger
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import load_config, BaseConfig, evaluate_proba, logging
@dataclass
class Config(BaseConfig):
max_len: int = 6
embedding_dim: int = 64
lstm_units: int = 32
batch_size: int = 64
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
"""
Loads and preprocesses data for text classification by tokenizing text data, encoding labels, and padding sequences.
This function expects a dataset file path, prepares the tokenizer to process text input, and encodes labels for
model training. The resulting outputs are ready for input into a machine learning pipeline.
"""
logging.info("Loading and preprocessing data")
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
tokenizer.fit_on_texts(df["name"])
sequences = tokenizer.texts_to_sequences(df["name"])
padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df["sex"])
return padded, labels, tokenizer, label_encoder
def build_model(cfg: Config, vocab_size: int) -> Sequential:
"""
Builds and compiles a Sequential LSTM-based model. The model consists of an
embedding layer, two bidirectional LSTM layers, a dense hidden layer with ReLU
activation, and an output layer with a softmax activation function. The model
is compiled using sparse categorical crossentropy loss and the Adam optimizer.
"""
logging.info("Building LSTM model")
model = Sequential([
Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim),
Bidirectional(LSTM(cfg.lstm_units, return_sequences=True)),
Bidirectional(LSTM(cfg.lstm_units)),
Dense(64, activation="relu"),
Dense(2, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
return model
def cross_validate(cfg: Config, X, y, vocab_size: int):
"""
Performs cross-validation on the given dataset using the specified model configuration.
The function uses StratifiedKFold cross-validator to split the dataset into training and
validation sets for each fold. For each fold, it trains the model, evaluates its accuracy
on the validation data, and logs the fold-wise and overall results.
"""
logging.info(f"Running {cfg.cv}-fold cross-validation")
skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
accuracies = []
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
logging.info(f"Fold {fold + 1}")
model = build_model(cfg, vocab_size)
model.fit(X[train_idx], y[train_idx],
epochs=cfg.epochs,
batch_size=cfg.batch_size,
verbose=0)
y_pred = model.predict(X[val_idx])
acc = accuracy_score(y[val_idx], y_pred.argmax(axis=1))
accuracies.append(acc)
logging.info(f"Fold {fold + 1} Accuracy: {acc:.4f}")
logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
def save_artifacts(model, tokenizer, encoder):
"""
Saves the given model, tokenizer, and encoder artifacts to a predefined directory.
The function ensures that the specified directory for saving artifacts exists,
then serializes the model, tokenizer, and encoder using appropriate formats. It
also logs the success of the operation to notify the user of the action taken.
"""
os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
model.save(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
save_pickle(tokenizer, os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl"))
logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
def main():
cfg = Config(**vars(load_config("Long Short-Term Memory (LSTM) model")))
X, y, tokenizer, encoder = load_and_prepare(cfg)
vocab_size = len(tokenizer.word_index) + 1
if cfg.cv:
cross_validate(cfg, X, y, vocab_size)
return
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
)
model = build_model(cfg, vocab_size)
model.summary()
logging.info("Training model")
model.fit(X_train, y_train,
validation_split=0.1,
epochs=cfg.epochs,
batch_size=cfg.batch_size,
callbacks=[ProgbarLogger()])
y_proba = model.predict(X_test)
evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
if cfg.save:
save_artifacts(model, tokenizer, encoder)
if __name__ == "__main__":
main()
-173
View File
@@ -1,173 +0,0 @@
import os
from dataclasses import dataclass
from typing import Tuple
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import (
accuracy_score
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ProgbarLogger
from tensorflow.keras.layers import (
Input, Embedding, Dense, GlobalAveragePooling1D,
MultiHeadAttention, Dropout, LayerNormalization
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
from pipeline.gender.models import BaseConfig, load_config, evaluate_proba, logging
@dataclass
class Config(BaseConfig):
max_len: int = 6
embedding_dim: int = 64
transformer_head_size: int = 64
transformer_num_heads: int = 2
transformer_ff_dim: int = 128
dropout: float = 0.1
batch_size: int = 64
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
"""
Load and preprocess the dataset for training a Transformer model.
This function reads a CSV dataset, tokenizes the names, pads the sequences,
and encodes the labels. It returns the padded sequences, encoded labels,
tokenizer, and label encoder.
"""
logging.info("Loading and preprocessing data")
df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df["name"])
sequences = tokenizer.texts_to_sequences(df["name"])
padded = pad_sequences(sequences, maxlen=cfg.max_len, padding="post")
encoder = LabelEncoder()
labels = encoder.fit_transform(df["sex"])
return padded, labels, tokenizer, encoder
def transformer_encoder(x, cfg: Config):
"""
Transformer encoder block that applies multi-head attention and feed-forward
neural network layers with residual connections and layer normalization.
"""
attn = MultiHeadAttention(num_heads=cfg.transformer_num_heads, key_dim=cfg.transformer_head_size)(x, x)
x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attn))
ff = Dense(cfg.transformer_ff_dim, activation="relu")(x)
ff = Dense(x.shape[-1])(ff)
return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(ff))
def build_model(cfg: Config, vocab_size: int) -> Model:
"""
Builds a Transformer-based model aimed at sequence processing tasks.
The model includes an embedding layer integrating positional encodings
and a Transformer encoder, followed by a global pooling layer,
a dense hidden layer, and a softmax output layer.
"""
logging.info("Building Transformer model")
inputs = Input(shape=(cfg.max_len,))
x = Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim)(inputs)
# Add positional encoding
positions = tf.range(start=0, limit=cfg.max_len, delta=1)
pos_embedding = Embedding(input_dim=cfg.max_len, output_dim=cfg.embedding_dim)(positions)
x = x + pos_embedding
x = transformer_encoder(x, cfg)
x = GlobalAveragePooling1D()(x)
x = Dense(32, activation="relu")(x)
outputs = Dense(2, activation="softmax")(x)
model = Model(inputs, outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
return model
def cross_validate(cfg: Config, X, y, vocab_size: int):
"""
Performs cross-validation using the given configuration, dataset, and specified vocabulary size. This function
splits the dataset into stratified folds, trains a model on each fold, and evaluates its performance on validation
data. The overall mean and standard deviation of accuracies across all folds are logged.
"""
logging.info(f"Running {cfg.cv}-fold cross-validation")
skf = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
accuracies = []
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
logging.info(f"Fold {fold + 1}")
model = build_model(cfg, vocab_size)
model.fit(X[train_idx], y[train_idx],
epochs=cfg.epochs,
batch_size=cfg.batch_size,
verbose=0)
y_pred = model.predict(X[val_idx])
acc = accuracy_score(y[val_idx], y_pred.argmax(axis=1))
accuracies.append(acc)
logging.info(f"Fold {fold + 1} Accuracy: {acc:.4f}")
logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
def save_artifacts(model, tokenizer, encoder):
"""
Saves the model and associated artifacts to the designated directory. The model
is serialized and saved in a `.keras` file, while the tokenizer and label
encoder are serialized into `.pkl` files. If the directory does not exist, it
is created automatically. This function also logs the completion of the
operation.
"""
os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
model.save(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
save_pickle(tokenizer, os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl"))
logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
def main():
cfg = Config(**vars(load_config("Transformer model")))
X, y, tokenizer, encoder = load_and_prepare(cfg)
vocab_size = len(tokenizer.word_index) + 1
if cfg.cv:
cross_validate(cfg, X, y, vocab_size)
return
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
)
model = build_model(cfg, vocab_size)
model.summary()
logging.info("Training Transformer model")
model.fit(
X_train, y_train,
validation_split=0.1,
epochs=cfg.epochs,
batch_size=cfg.batch_size,
callbacks=[ProgbarLogger()]
)
y_proba = model.predict(X_test)
evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)
if cfg.save:
save_artifacts(model, tokenizer, encoder)
if __name__ == "__main__":
main()
-107
View File
@@ -1,107 +0,0 @@
import argparse
import os
from typing import List
import tensorflow as tf
from sklearn.pipeline import Pipeline
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from misc import GENDER_MODELS_DIR, load_pickle
def predict_logreg(names: List[str], threshold: float):
"""
Predict gender labels for given names using a logistic regression model.
The function takes in a list of names and predicts the gender labels
based on a logistic regression model. A probabilistic threshold is used
to classify the names into one of the defined labels.
"""
model_path = os.path.join(GENDER_MODELS_DIR, "regression_model.pkl")
encoder_path = os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl")
model: Pipeline = load_pickle(model_path)
label_encoder = load_pickle(encoder_path)
X = [name.lower().strip() for name in names]
proba = model.predict_proba(X)
pred = (proba[:, 1] >= threshold).astype(int)
labels = label_encoder.inverse_transform(pred)
return labels, proba
def predict_lstm(names: List[str], threshold: float, max_len=6):
"""
Predicts gender labels and probabilities for a list of names using a pre-trained BiLSTM model.
The function loads the model, tokenizer, and label encoder, performs preprocessing on the input
names, and then uses the loaded model to predict gender probabilities. Based on the threshold
value, it determines the predicted gender labels.
"""
model_path = os.path.join(GENDER_MODELS_DIR, "lstm_model.keras")
tokenizer_path = os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl")
encoder_path = os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl")
model = tf.keras.models.load_model(model_path)
tokenizer: Tokenizer = load_pickle(tokenizer_path)
label_encoder = load_pickle(encoder_path)
X = tokenizer.texts_to_sequences([n.lower().strip() for n in names])
X = pad_sequences(X, maxlen=max_len, padding="post")
proba = model.predict(X)
pred = (proba[:, 1] >= threshold).astype(int)
labels = label_encoder.inverse_transform(pred)
return labels, proba
def predict_transformer(names: List[str], threshold: float, max_len=6):
"""
Predicts gender labels for the provided names using a pre-trained transformer model.
This function loads a pre-trained transformer model along with its tokenizer and label
encoder, converts input names into tokenized sequences, and processes them to generate
gender predictions. The function returns the predicted labels and the associated
probabilities for each sample.
"""
model_path = os.path.join(GENDER_MODELS_DIR, "transformer.keras")
tokenizer_path = os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl")
encoder_path = os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl")
model = tf.keras.models.load_model(model_path)
tokenizer: Tokenizer = load_pickle(tokenizer_path)
label_encoder = load_pickle(encoder_path)
X = tokenizer.texts_to_sequences([n.lower().strip() for n in names])
X = pad_sequences(X, maxlen=max_len, padding="post")
proba = model.predict(X)
pred = (proba[:, 1] >= threshold).astype(int)
labels = label_encoder.inverse_transform(pred)
return labels, proba
def main():
parser = argparse.ArgumentParser(description="Predict gender from names using trained model")
parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
parser.add_argument("--names", nargs="+", required=True, help="One or more names")
parser.add_argument("--threshold", type=float, default=0.5, help="Threshold for classification")
args = parser.parse_args()
model_funcs = {
"logreg": predict_logreg,
"lstm": predict_lstm,
"transformer": predict_transformer,
}
try:
labels, proba = model_funcs[args.model](args.names, args.threshold)
except KeyError:
raise ValueError(f"Unsupported model type: {args.model}")
for i, name in enumerate(args.names):
p_female = proba[i][0]
p_male = proba[i][1]
print(f"{name}{labels[i]} | P(f): {p_female:.2f} | P(m): {p_male:.2f}")
if __name__ == "__main__":
main()
-109
View File
@@ -1,109 +0,0 @@
import os
import argparse
import ollama
import pandas as pd
from pydantic import BaseModel, ValidationError
from tqdm import tqdm
from typing import Optional
from misc import load_prompt, load_csv_dataset, DATA_DIR, logging
class NameAnalysis(BaseModel):
identified_name: Optional[str]
identified_surname: Optional[str]
def analyze_name(client: ollama.Client, model: str, prompt: str, name: str) -> dict:
"""
Analyze a name using the specified model and prompt.
Returns a dictionary with identified name, surname, and category.
"""
try:
response = client.chat(
model=model,
messages=[
{"role": "system", "content": prompt},
{"role": "user", "content": name},
],
format=NameAnalysis.model_json_schema(),
)
analysis = NameAnalysis.model_validate_json(response.message.content)
return analysis.model_dump()
except ValidationError as ve:
logging.warning(f"Validation error: {ve}")
except Exception as e:
logging.error(f"Unexpected error: {e}")
return {"identified_name": None, "identified_surname": None}
def save_checkpoint(df: pd.DataFrame):
df.to_csv(os.path.join(DATA_DIR, "names_featured.csv"), index=False)
logging.critical(f"Checkpoint saved")
def build_updates(llm_model: str, df: pd.DataFrame, entries: pd.DataFrame) -> pd.DataFrame:
BATCH_SIZE = 10
client = ollama.Client()
prompt = load_prompt()
updates = []
# Set logging level for HTTP client to reduce noise
# This is useful to avoid excessive logging from the HTTP client used by Ollama
logging.getLogger("httpx").setLevel(logging.WARNING)
for idx, (row_idx, row) in enumerate(entries.iterrows(), 1):
try:
entry = analyze_name(client, llm_model, prompt, row["name"])
entry["annotated"] = 1
updates.append((row_idx, entry))
logging.info(f"Analyzed: {row['name']} - {entry}")
except Exception as e:
logging.warning(f"Failed to analyze '{row['name']}': {e}")
continue
if idx % BATCH_SIZE == 0 or idx == len(entries):
update_df = pd.DataFrame.from_dict(dict(updates), orient="index")
update_df["annotated"] = pd.to_numeric(update_df["annotated"], errors="coerce").fillna(0).astype("Int8")
df.update(update_df)
save_checkpoint(df)
updates.clear() # avoid re-applying same updates
return df
def main(llm_model: str = "llama3.2:3b"):
df = pd.DataFrame(load_csv_dataset(os.path.join(DATA_DIR, "names_featured.csv")))
# Safely cast 'annotated' column to Int8, handling float-like strings (e.g., '1.0')
df["annotated"] = pd.to_numeric(df["annotated"], errors="coerce").fillna(0).astype(float).astype("Int8")
entries = df[df["annotated"] == 0]
if entries.empty:
logging.info("No names to analyze.")
return
logging.info(f"Found {len(entries)} names to analyze.")
df = build_updates(llm_model, df, entries)
save_checkpoint(df)
logging.info("Analysis complete.")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Analyze names using an LLM model.")
parser.add_argument(
"--llm_model",
type=str,
default="mistral:7b",
help="Ollama model name to use (default: mistral:7b)",
)
args = parser.parse_args()
try:
main(llm_model=args.llm_model)
except Exception as e:
logging.error(f"Fatal error: {e}", exc_info=True)
View File
+11
View File
@@ -0,0 +1,11 @@
from dataclasses import dataclass
@dataclass
class BatchConfig:
"""Configuration for batch processing"""
batch_size: int = 1000
max_workers: int = 4
checkpoint_interval: int = 5 # Save checkpoint every N batches
use_multiprocessing: bool = False # Use ProcessPoolExecutor instead of ThreadPoolExecutor
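A configuration sketch for an I/O-bound step such as LLM annotation; the values are illustrative:
llm_batches = BatchConfig(
    batch_size=200,
    max_workers=4,              # a few threads are enough for I/O-bound LLM calls
    checkpoint_interval=1,      # checkpoint after every batch
    use_multiprocessing=False,
)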
+102
View File
@@ -0,0 +1,102 @@
import logging
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from typing import Iterator
import pandas as pd
from processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep
class BatchProcessor:
"""Handles batch processing with concurrency and checkpointing"""
def __init__(self, config: BatchConfig):
self.config = config
def create_batches(self, df: pd.DataFrame) -> Iterator[tuple[pd.DataFrame, int]]:
"""Create batches from DataFrame"""
total_rows = len(df)
batch_size = self.config.batch_size
for i in range(0, total_rows, batch_size):
batch = df.iloc[i : i + batch_size].copy()
batch_id = i // batch_size
yield batch, batch_id
def process_sequential(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Process batches sequentially"""
results = []
for batch, batch_id in self.create_batches(df):
if step.batch_exists(batch_id):
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
processed_batch = step.load_batch(batch_id)
else:
try:
processed_batch = step.process_batch(batch, batch_id)
step.save_batch(processed_batch, batch_id)
step.state.processed_batches += 1
except Exception as e:
logging.error(f"Failed to process batch {batch_id}: {e}")
step.state.failed_batches.append(batch_id)
continue
results.append(processed_batch)
# Save state periodically
if batch_id % self.config.checkpoint_interval == 0:
step.save_state()
return pd.concat(results, ignore_index=True) if results else pd.DataFrame()
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Process batches concurrently"""
executor_class = (
ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
)
results = {}
with executor_class(max_workers=self.config.max_workers) as executor:
# Submit all batches
future_to_batch = {}
for batch, batch_id in self.create_batches(df):
if step.batch_exists(batch_id):
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
results[batch_id] = step.load_batch(batch_id)
else:
future = executor.submit(step.process_batch, batch, batch_id)
future_to_batch[future] = (batch_id, batch)
# Collect results as they complete
for future in as_completed(future_to_batch):
batch_id, batch = future_to_batch[future]
try:
processed_batch = future.result()
step.save_batch(processed_batch, batch_id)
results[batch_id] = processed_batch
step.state.processed_batches += 1
logging.info(f"Completed batch {batch_id}")
except Exception as e:
logging.error(f"Failed to process batch {batch_id}: {e}")
step.state.failed_batches.append(batch_id)
# Reassemble results in order
ordered_results = []
for batch_id in sorted(results.keys()):
ordered_results.append(results[batch_id])
step.save_state()
return pd.concat(ordered_results, ignore_index=True) if ordered_results else pd.DataFrame()
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Process data using the configured strategy"""
step.state.total_batches = (len(df) + self.config.batch_size - 1) // self.config.batch_size
step.load_state()
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
if self.config.max_workers == 1:
return self.process_sequential(step, df)
else:
return self.process_concurrent(step, df)
View File
+80
View File
@@ -0,0 +1,80 @@
import logging
from typing import Dict
import pandas as pd
class DatasetAnalyzer:
"""Analyze dataset statistics and quality"""
def __init__(self, filepath: str):
self.filepath = filepath
self.df = None
def load_data(self) -> bool:
"""Load dataset for analysis"""
try:
self.df = pd.read_csv(self.filepath)
return True
except Exception as e:
logging.error(f"Failed to load {self.filepath}: {e}")
return False
def analyze_completion(self) -> Dict:
"""Analyze annotation completion status"""
if self.df is None:
return {}
total_rows = len(self.df)
# Check annotation status
if "annotated" in self.df.columns:
annotated_count = (self.df["annotated"] == 1).sum()
unannotated_count = (self.df["annotated"] == 0).sum()
else:
annotated_count = 0
unannotated_count = total_rows
# Analyze name completeness
complete_names = 0
if "identified_name" in self.df.columns and "identified_surname" in self.df.columns:
complete_names = (
(self.df["identified_name"].notna()) & (self.df["identified_surname"].notna())
).sum()
return {
"total_rows": total_rows,
"annotated_rows": annotated_count,
"unannotated_rows": unannotated_count,
"annotation_percentage": (annotated_count / total_rows * 100) if total_rows > 0 else 0,
"complete_names": complete_names,
"completeness_percentage": (complete_names / total_rows * 100) if total_rows > 0 else 0,
}
def analyze_quality(self) -> Dict:
"""Analyze data quality metrics"""
if self.df is None:
return {}
quality_metrics = {}
# Missing values
missing_data = self.df.isnull().sum()
quality_metrics["missing_values"] = missing_data.to_dict()
# Name length distribution
if "name" in self.df.columns:
name_lengths = self.df["name"].str.len()
quality_metrics["name_length"] = {
"mean": name_lengths.mean(),
"median": name_lengths.median(),
"min": name_lengths.min(),
"max": name_lengths.max(),
}
# Word count distribution
if "words" in self.df.columns:
word_counts = self.df["words"].value_counts().sort_index()
quality_metrics["word_distribution"] = word_counts.to_dict()
return quality_metrics
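A standalone usage sketch, assuming a CSV that has the name, annotated and identified_* columns referenced above (the path is illustrative):
analyzer = DatasetAnalyzer("data/names_featured.csv")
if analyzer.load_data():
    completion = analyzer.analyze_completion()
    quality = analyzer.analyze_quality()
    print(f"{completion['annotation_percentage']:.1f}% annotated, "
          f"mean name length {quality['name_length']['mean']:.1f} characters")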
+179
View File
@@ -0,0 +1,179 @@
import json
import logging
import shutil
from datetime import datetime
from typing import Optional, Dict
from core.config.config_manager import ConfigManager
from core.config.project_paths import ProjectPaths
class PipelineMonitor:
"""Monitor and manage pipeline execution"""
def __init__(self, paths: Optional[ProjectPaths] = None):
if paths is None:
# Use default configuration if none provided
config_manager = ConfigManager()
paths = config_manager.default_paths
self.paths = paths
self.checkpoint_dir = paths.checkpoints_dir
self.steps = ["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"]
def get_step_status(self, step_name: str) -> Dict:
"""Get status of a specific pipeline step"""
step_dir = self.checkpoint_dir / step_name
state_file = step_dir / "pipeline_state.json"
if not state_file.exists():
return {
"step": step_name,
"status": "not_started",
"processed_batches": 0,
"total_batches": 0,
"failed_batches": 0,
"completion_percentage": 0.0,
}
try:
with open(state_file, "r") as f:
state = json.load(f)
processed = state.get("processed_batches", 0)
total = state.get("total_batches", 0)
failed = len(state.get("failed_batches", []))
if total == 0:
completion = 0.0
status = "not_started"
elif processed >= total:
completion = 100.0
status = "completed" if failed == 0 else "completed_with_errors"
else:
completion = (processed / total) * 100
status = "in_progress"
return {
"step": step_name,
"status": status,
"processed_batches": processed,
"total_batches": total,
"failed_batches": failed,
"completion_percentage": completion,
"last_checkpoint": state.get("last_checkpoint"),
"failed_batch_ids": state.get("failed_batches", []),
}
except Exception as e:
logging.error(f"Error reading state for {step_name}: {e}")
return {"step": step_name, "status": "error", "error": str(e)}
def get_pipeline_status(self) -> Dict:
"""Get overall pipeline status"""
step_statuses = {}
overall_status = "not_started"
total_completion = 0.0
for step in self.steps:
status = self.get_step_status(step)
step_statuses[step] = status
if status["status"] == "error":
overall_status = "error"
elif status["status"] in ["in_progress"]:
overall_status = "in_progress"
elif status["status"] == "completed_with_errors":
overall_status = "completed_with_errors"
total_completion += status.get("completion_percentage", 0)
avg_completion = total_completion / len(self.steps)
if avg_completion >= 100 and overall_status not in ["error", "completed_with_errors"]:
overall_status = "completed"
return {
"overall_status": overall_status,
"overall_completion": avg_completion,
"steps": step_statuses,
"timestamp": datetime.now().isoformat(),
}
def print_status(self, detailed: bool = False):
"""Print pipeline status in a human-readable format"""
status = self.get_pipeline_status()
print("\n=== Pipeline Status ===")
print(f"Overall Status: {status['overall_status'].upper()}")
print(f"Overall Completion: {status['overall_completion']:.1f}%")
print(f"Last Updated: {status['timestamp']}")
print()
for step_name, step_status in status["steps"].items():
print(f"{step_name.replace('_', ' ').title()}:")
print(f" Status: {step_status['status']}")
print(f" Progress: {step_status['completion_percentage']:.1f}%")
print(f" Batches: {step_status['processed_batches']}/{step_status['total_batches']}")
if step_status["failed_batches"] > 0:
print(f" Failed Batches: {step_status['failed_batches']}")
if detailed and "failed_batch_ids" in step_status:
print(f" Failed Batch IDs: {step_status['failed_batch_ids']}")
print()
def count_checkpoint_files(self) -> Dict:
"""Count checkpoint files for each step"""
counts = {}
total_size = 0
for step in self.steps:
step_dir = self.checkpoint_dir / step
if step_dir.exists():
csv_files = list(step_dir.glob("*.csv"))
step_size = sum(f.stat().st_size for f in csv_files)
counts[step] = {"files": len(csv_files), "size_mb": step_size / (1024 * 1024)}
total_size += step_size
else:
counts[step] = {"files": 0, "size_mb": 0}
counts["total_size_mb"] = total_size / (1024 * 1024)
return counts
def clean_step_checkpoints(self, step_name: str, keep_last: int = 1):
"""Clean checkpoint files for a specific step"""
step_dir = self.checkpoint_dir / step_name
if not step_dir.exists():
logging.info(f"No checkpoints found for {step_name}")
return
csv_files = sorted(step_dir.glob("batch_*.csv"))
if len(csv_files) <= keep_last:
logging.info(f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all")
return
files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files
for file_path in files_to_delete:
try:
file_path.unlink()
logging.info(f"Deleted {file_path}")
except Exception as e:
logging.error(f"Failed to delete {file_path}: {e}")
def reset_step(self, step_name: str):
"""Reset a pipeline step by removing its checkpoints and state"""
step_dir = self.checkpoint_dir / step_name
if step_dir.exists():
try:
shutil.rmtree(step_dir)
logging.info(f"Reset step: {step_name}")
except Exception as e:
logging.error(f"Failed to reset {step_name}: {e}")
else:
logging.info(f"Step {step_name} has no checkpoints to reset")
+57
View File
@@ -0,0 +1,57 @@
import logging
import pandas as pd
from typing import Dict, Any
import time
from processing.batch.batch_config import BatchConfig
from processing.batch.batch_processor import BatchProcessor
from processing.steps import PipelineStep
class Pipeline:
"""Main pipeline orchestrator"""
def __init__(self, config: BatchConfig):
self.config = config
self.processor = BatchProcessor(config)
self.steps = []
def add_step(self, step: PipelineStep):
"""Add a processing step to the pipeline"""
self.steps.append(step)
def run(self, input_data: pd.DataFrame) -> pd.DataFrame:
"""Run the complete pipeline"""
current_data = input_data.copy()
for step in self.steps:
logging.info(f"Running pipeline step: {step.name}")
start_time = time.time()
current_data = self.processor.process(step, current_data)
elapsed_time = time.time() - start_time
logging.info(f"Completed {step.name} in {elapsed_time:.2f} seconds")
if step.state.failed_batches:
logging.warning(
f"Step {step.name} had {len(step.state.failed_batches)} failed batches"
)
return current_data
def get_progress(self) -> Dict[str, Any]:
"""Get progress information for all steps"""
progress = {}
for step in self.steps:
progress[step.name] = {
"processed_batches": step.state.processed_batches,
"total_batches": step.state.total_batches,
"failed_batches": len(step.state.failed_batches),
"completion_percentage": (
step.state.processed_batches / max(1, step.state.total_batches)
)
* 100,
}
return progress
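A minimal wiring sketch for the orchestrator above; the module paths for Pipeline and the two step classes are assumptions, since the diff view does not show file names:

import pandas as pd
from core.config import get_config
from processing.batch.batch_config import BatchConfig
from processing.pipeline import Pipeline  # assumed module path
from processing.steps.data_cleaning_step import DataCleaningStep  # assumed module path
from processing.steps.feature_extraction_step import FeatureExtractionStep  # assumed module path

pipeline_config = get_config()
batch_config = BatchConfig(
    batch_size=1000, max_workers=4, checkpoint_interval=10, use_multiprocessing=False
)
pipeline = Pipeline(batch_config)
pipeline.add_step(DataCleaningStep(pipeline_config))
pipeline.add_step(FeatureExtractionStep(pipeline_config))
processed = pipeline.run(pd.read_csv("data/names.csv"))  # input path is illustrative
print(pipeline.get_progress())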
-119
View File
@@ -1,119 +0,0 @@
import os
import argparse
import pandas as pd
from misc import DATA_DIR, REGION_MAPPING, logging
def clean(filepath) -> pd.DataFrame:
"""
Clean the CSV file by removing null bytes, non-breaking spaces, and extra spaces.
Also, it attempts to read the file with different encodings to handle potential encoding issues.
"""
encodings = ['utf-8', 'utf-16', 'latin1']
for enc in encodings:
try:
logging.info(f"Trying to read {filepath} with encoding: {enc}")
# Use chunked reading to handle large files
chunks = pd.read_csv(filepath, encoding=enc, chunksize=100_000, on_bad_lines='skip')
cleaned_chunks = []
for chunk in chunks:
# Drop rows with essential missing values early
chunk = chunk.dropna(subset=['name', 'sex', 'region'])
# Clean string columns in-place
for col in chunk.select_dtypes(include='object').columns:
chunk[col] = (
chunk[col]
.astype(str)
.str.replace('\x00', ' ', regex=False)
.str.replace('\u00a0', ' ', regex=False)
.str.replace(' +', ' ', regex=True)
.str.strip()
.str.lower()
)
cleaned_chunks.append(chunk)
df = pd.concat(cleaned_chunks, ignore_index=True)
df.to_csv(filepath, index=False, encoding='utf-8')
logging.info(f"Successfully read with encoding: {enc}")
return df
except Exception:
continue
raise UnicodeDecodeError(f"Unable to decode {filepath} with common encodings.")
def process(df: pd.DataFrame) -> pd.DataFrame:
"""
Process the DataFrame to extract features and clean data.
This includes counting words, calculating name length, and extracting probable native names and surnames.
Also maps regions to provinces based on REGION_MAPPING.
"""
logging.info("Preprocessing names")
df['words'] = df['name'].str.count(' ') + 1
df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()
df['year'] = df['year'].astype(int)
# Calculate probable_native and probable_surname
name_split = df['name'].str.split()
df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')
df['identified_category'] = df['words'].apply(lambda x: 'compose' if x > 3 else 'simple')
df['identified_name'] = None
df['identified_surname'] = None
df['annotated'] = 0
# We can assume that if a name has exactly 3 words, the first two are the native name and the last is the surname
# This is a common pattern in Congolese names
three_word_mask = df['words'] == 3
df.loc[three_word_mask, 'identified_name'] = df.loc[three_word_mask, 'probable_native']
df.loc[three_word_mask, 'identified_surname'] = df.loc[three_word_mask, 'probable_surname']
df.loc[three_word_mask, 'annotated'] = 1
logging.info("Mapping regions to provinces")
df['province'] = df['region'].map(lambda r: REGION_MAPPING.get(r, ('AUTRES', 'AUTRES'))[1])
df['province'] = df['province'].str.lower()
return df
def save_artifacts(df: pd.DataFrame, split_eval: bool = True, split_by_sex: bool = True) -> None:
"""
Splits the input DataFrame into evaluation and featured datasets, saves them as CSV files,
and additionally saves separate CSV files for male and female entries if requested.
"""
if split_eval:
logging.info("Saving evaluation and featured datasets")
eval_idx = df.sample(frac=0.2, random_state=42).index
df_evaluation = df.loc[eval_idx]
df_featured = df.drop(index=eval_idx)
df_evaluation.to_csv(os.path.join(DATA_DIR, 'names_evaluation.csv'), index=False)
df_featured.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
else:
df.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
if split_by_sex:
logging.info("Saving by sex")
df[df['sex'] == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
df[df['sex'] == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)
def main(split_eval: bool = True, split_by_sex: bool = True):
df = process(clean(os.path.join(DATA_DIR, 'names.csv')))
save_artifacts(df, split_eval=split_eval, split_by_sex=split_by_sex)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Prepare name datasets with optional splits.")
parser.add_argument('--split_eval', action='store_true', default=True, help="Split into evaluation and featured datasets (default: True)")
parser.add_argument('--no-split_eval', action='store_false', dest='split_eval', help="Do not split into evaluation and featured datasets")
parser.add_argument('--split_by_sex', action='store_true', default=True, help="Split by sex into male/female datasets (default: True)")
parser.add_argument('--no-split_by_sex', action='store_false', dest='split_by_sex', help="Do not split by sex into male/female datasets")
args = parser.parse_args()
main(split_eval=args.split_eval, split_by_sex=args.split_by_sex)
+111
View File
@@ -0,0 +1,111 @@
import json
import logging
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional
import pandas as pd
from processing.batch.batch_config import BatchConfig
from core.config.pipeline_config import PipelineConfig
@dataclass
class PipelineState:
"""Tracks the state of pipeline execution"""
processed_batches: int = 0
total_batches: int = 0
failed_batches: List[int] = None
last_checkpoint: Optional[str] = None
def __post_init__(self):
if self.failed_batches is None:
self.failed_batches = []
class PipelineStep(ABC):
"""Abstract base class for pipeline steps"""
def __init__(
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
):
self.name = name
self.pipeline_config = pipeline_config
# Use provided batch_config or create default from pipeline config
if batch_config is None:
batch_config = BatchConfig(
batch_size=pipeline_config.processing.batch_size,
max_workers=pipeline_config.processing.max_workers,
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
)
self.batch_config = batch_config
self.state = PipelineState()
@abstractmethod
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process a single batch of data"""
pass
def get_checkpoint_path(self, batch_id: int) -> str:
"""Get the checkpoint file path for a batch"""
checkpoint_dir = self.pipeline_config.paths.checkpoints_dir / self.name
checkpoint_dir.mkdir(parents=True, exist_ok=True)
return str(checkpoint_dir / f"batch_{batch_id:06d}.csv")
def get_state_path(self) -> str:
"""Get the state file path"""
state_dir = self.pipeline_config.paths.checkpoints_dir / self.name
state_dir.mkdir(parents=True, exist_ok=True)
return str(state_dir / "pipeline_state.json")
def save_state(self):
"""Save pipeline state to disk"""
state_file = self.get_state_path()
with open(state_file, "w") as f:
json.dump(
{
"processed_batches": self.state.processed_batches,
"total_batches": self.state.total_batches,
"failed_batches": self.state.failed_batches,
"last_checkpoint": self.state.last_checkpoint,
},
f,
)
def load_state(self) -> bool:
"""Load pipeline state from disk. Returns True if state was loaded."""
state_file = self.get_state_path()
if os.path.exists(state_file):
try:
with open(state_file, "r") as f:
state_data = json.load(f)
self.state.processed_batches = state_data.get("processed_batches", 0)
self.state.total_batches = state_data.get("total_batches", 0)
self.state.failed_batches = state_data.get("failed_batches", [])
self.state.last_checkpoint = state_data.get("last_checkpoint")
return True
except Exception as e:
logging.warning(f"Failed to load state: {e}")
return False
def batch_exists(self, batch_id: int) -> bool:
"""Check if a batch has already been processed (idempotency)"""
checkpoint_path = self.get_checkpoint_path(batch_id)
return os.path.exists(checkpoint_path)
def save_batch(self, batch: pd.DataFrame, batch_id: int):
"""Save processed batch to checkpoint"""
checkpoint_path = self.get_checkpoint_path(batch_id)
batch.to_csv(checkpoint_path, index=False)
logging.info(f"Saved batch {batch_id} to {checkpoint_path}")
def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
"""Load processed batch from checkpoint"""
checkpoint_path = self.get_checkpoint_path(batch_id)
if os.path.exists(checkpoint_path):
return pd.read_csv(checkpoint_path)
return None
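A concrete step only has to implement process_batch; the base class supplies checkpointing and state handling. A minimal illustrative subclass (the step name and the lowercasing are examples, not part of this commit):

class LowercaseNamesStep(PipelineStep):
    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("lowercase_names", pipeline_config)

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        # Reuse an existing checkpoint if this batch was already processed (idempotent resume)
        cached = self.load_batch(batch_id)
        if cached is not None:
            return cached
        batch = batch.copy()
        batch["name"] = batch["name"].str.lower()
        self.save_batch(batch, batch_id)
        return batch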
+28
View File
@@ -0,0 +1,28 @@
import logging
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from core.utils.text_cleaner import TextCleaner
from processing.steps import PipelineStep
class DataCleaningStep(PipelineStep):
"""Configuration-driven data cleaning step"""
def __init__(self, pipeline_config: PipelineConfig):
super().__init__("data_cleaning", pipeline_config)
self.text_cleaner = TextCleaner()
self.required_columns = ["name", "sex", "region"]
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process a single batch for data cleaning"""
logging.info(f"Cleaning batch {batch_id} with {len(batch)} rows")
# Drop rows with essential missing values
batch = batch.dropna(subset=self.required_columns)
# Apply text cleaning
batch = self.text_cleaner.clean_dataframe_text_columns(batch)
return batch
+60
View File
@@ -0,0 +1,60 @@
import numpy as np
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from processing.steps.feature_extraction_step import Gender
from core.utils.data_loader import DataLoader
from processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep
class DataSplittingStep(PipelineStep):
"""Configuration-driven data splitting step"""
def __init__(self, pipeline_config: PipelineConfig):
batch_config = BatchConfig(
batch_size=pipeline_config.processing.batch_size,
max_workers=1, # No need for parallelism in splitting
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
use_multiprocessing=False,
)
super().__init__("data_splitting", pipeline_config, batch_config)
self.data_loader = DataLoader(pipeline_config)
self.eval_indices = None
def determine_eval_indices(self, total_size: int) -> set:
"""Determine evaluation indices consistently across batches"""
if self.eval_indices is None:
np.random.seed(self.pipeline_config.data.random_seed)
eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
self.eval_indices = set(np.random.choice(total_size, size=eval_size, replace=False))
return self.eval_indices
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process batch for data splitting - no modification needed"""
return batch.copy()
def save_splits(self, df: pd.DataFrame) -> None:
"""Save the split datasets based on configuration"""
output_files = self.pipeline_config.data.output_files
data_dir = self.pipeline_config.paths.data_dir
if self.pipeline_config.data.split_evaluation:
eval_indices = self.determine_eval_indices(len(df))
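# Note: determine_eval_indices draws positional indices, so this assumes df keeps its default RangeIndex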
eval_mask = df.index.isin(eval_indices)
df_evaluation = df[eval_mask]
df_featured = df[~eval_mask]
self.data_loader.save_csv(df_evaluation, data_dir / output_files["evaluation"])
self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
else:
self.data_loader.save_csv(df, data_dir / output_files["featured"])
if self.pipeline_config.data.split_by_gender:
df_males = df[df["sex"] == Gender.MALE.value]
df_females = df[df["sex"] == Gender.FEMALE.value]
self.data_loader.save_csv(df_males, data_dir / output_files["males"])
self.data_loader.save_csv(df_females, data_dir / output_files["females"])
@@ -0,0 +1,99 @@
import logging
from enum import Enum
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from core.utils.region_mapper import RegionMapper
from processing.steps import PipelineStep
class Gender(Enum):
MALE = "m"
FEMALE = "f"
class NameCategory(Enum):
SIMPLE = "simple"
COMPOSE = "compose"
class FeatureExtractionStep(PipelineStep):
"""Configuration-driven feature extraction step"""
def __init__(self, pipeline_config: PipelineConfig):
super().__init__("feature_extraction", pipeline_config)
self.region_mapper = RegionMapper()
@classmethod
def validate_gender(cls, gender: str) -> Gender:
"""Validate and normalize gender value"""
gender_lower = gender.lower().strip()
if gender_lower in ["m", "male", "homme", "masculin"]:
return Gender.MALE
elif gender_lower in ["f", "female", "femme", "féminin"]:
return Gender.FEMALE
else:
raise ValueError(f"Unknown gender: {gender}")
@classmethod
def get_name_category(cls, word_count: int) -> NameCategory:
"""Determine name category based on word count"""
if word_count <= 3:
return NameCategory.SIMPLE
else:
return NameCategory.COMPOSE
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Extract features from names in batch"""
logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")
batch = batch.copy()
# Basic features
batch["words"] = batch["name"].str.count(" ") + 1
batch["length"] = batch["name"].str.replace(" ", "", regex=False).str.len()
# Handle year column
if "year" in batch.columns:
batch["year"] = pd.to_numeric(batch["year"], errors="coerce").astype("Int64")
# Initialize new columns
batch["probable_native"] = None
batch["probable_surname"] = None
batch["identified_name"] = None
batch["identified_surname"] = None
batch["annotated"] = 0
# Vectorized category assignment
batch["identified_category"] = batch["words"].apply(
lambda x: self.get_name_category(x).value
)
# Assign probable_native and probable_surname for all names
name_splits = batch["name"].str.split()
batch["probable_native"] = name_splits.apply(
lambda x: " ".join(x[:-1]) if isinstance(x, list) and len(x) >= 2 else None
)
batch["probable_surname"] = name_splits.apply(
lambda x: x[-1] if isinstance(x, list) and len(x) >= 2 else None
)
# Auto-assign for 3-word names
three_word_mask = batch["words"] == 3
batch.loc[three_word_mask, "identified_name"] = batch.loc[
three_word_mask, "probable_native"
]
batch.loc[three_word_mask, "identified_surname"] = batch.loc[
three_word_mask, "probable_surname"
]
batch.loc[three_word_mask, "annotated"] = 1
# Map regions to provinces
batch["province"] = self.region_mapper.map_regions_vectorized(batch["region"])
# Normalize gender
if "sex" in batch.columns:
batch["sex"] = batch["sex"].apply(lambda x: self.validate_gender(str(x)).value)
return batch
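The two classmethod helpers are deterministic; a quick illustration of the mapping they implement:

assert FeatureExtractionStep.validate_gender(" Féminin ") is Gender.FEMALE
assert FeatureExtractionStep.validate_gender("M") is Gender.MALE
assert FeatureExtractionStep.get_name_category(3) is NameCategory.SIMPLE
assert FeatureExtractionStep.get_name_category(4) is NameCategory.COMPOSE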
+168
View File
@@ -0,0 +1,168 @@
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, Optional
import ollama
import pandas as pd
from pydantic import ValidationError, BaseModel
from core.config.pipeline_config import PipelineConfig
from core.utils.prompt_manager import PromptManager
from core.utils.rate_limiter import RateLimiter
from core.utils.rate_limiter import RateLimitConfig
from processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep
class NameAnnotation(BaseModel):
"""Model for name annotation results"""
identified_name: Optional[str]
identified_surname: Optional[str]
class LLMAnnotationStep(PipelineStep):
"""Configuration-driven LLM annotation step"""
def __init__(self, pipeline_config: PipelineConfig):
# Create custom batch config for LLM processing
batch_config = BatchConfig(
batch_size=pipeline_config.processing.batch_size,
max_workers=min(
pipeline_config.llm.max_concurrent_requests, pipeline_config.processing.max_workers
),
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
)
super().__init__("llm_annotation", pipeline_config, batch_config)
self.prompt = PromptManager(pipeline_config).load_prompt()
self.rate_limiter = (
self._create_rate_limiter() if pipeline_config.llm.enable_rate_limiting else None
)
# Statistics
self.successful_requests = 0
self.failed_requests = 0
self.total_retry_attempts = 0
# Setup logging
logging.getLogger("httpx").setLevel(logging.WARNING)
def _create_rate_limiter(self):
"""Create rate limiter based on configuration"""
rate_config = RateLimitConfig(
requests_per_minute=self.pipeline_config.llm.requests_per_minute,
requests_per_second=self.pipeline_config.llm.requests_per_second,
)
return RateLimiter(rate_config)
def analyze_name_with_retry(self, client: ollama.Client, name: str, row_id: int) -> Dict:
"""Analyze a name with retry logic and rate limiting"""
for attempt in range(self.pipeline_config.llm.retry_attempts):
try:
# Apply rate limiting if enabled
if self.rate_limiter:
self.rate_limiter.wait_if_needed()
start_time = time.time()
response = client.chat(
model=self.pipeline_config.llm.model_name,
messages=[
{"role": "system", "content": self.prompt},
{"role": "user", "content": name},
],
format=NameAnnotation.model_json_schema(),
)
elapsed_time = time.time() - start_time
if elapsed_time > self.pipeline_config.llm.timeout_seconds:
raise TimeoutError(
f"Request took {elapsed_time:.2f}s, exceeding {self.pipeline_config.llm.timeout_seconds}s timeout"
)
annotation = NameAnnotation.model_validate_json(response.message.content)
result = {
**annotation.model_dump(),
"annotated": 1,
"processing_time": elapsed_time,
"attempts": attempt + 1,
}
self.successful_requests += 1
if attempt > 0:
self.total_retry_attempts += attempt
return result
except Exception as e:  # covers ValidationError, TimeoutError and any client/transport error
logging.warning(
f"Error analyzing '{name}' (attempt {attempt + 1}/{self.pipeline_config.llm.retry_attempts}): {e}"
)
# Exponential backoff with jitter
if attempt < self.pipeline_config.llm.retry_attempts - 1:
wait_time = (2**attempt) + (time.time() % 1)
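# e.g. ~1 s, ~2 s, ~4 s, ... plus a sub-second jitter; the sleep below caps it at 10 s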
time.sleep(min(wait_time, 10))
self.failed_requests += 1
return {
"identified_name": None,
"identified_surname": None,
"annotated": 0,
"processing_time": 0,
"attempts": self.pipeline_config.llm.retry_attempts,
"failed": True,
}
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process batch with LLM annotation"""
unannotated_mask = batch.get("annotated", 0) == 0
unannotated_entries = batch[unannotated_mask]
if unannotated_entries.empty:
logging.info(f"Batch {batch_id}: No entries to annotate")
return batch
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries")
batch = batch.copy()
client = ollama.Client()
# Process with controlled concurrency
max_workers = self.pipeline_config.llm.max_concurrent_requests
if len(unannotated_entries) == 1 or max_workers == 1:
# Sequential processing
for idx, row in unannotated_entries.iterrows():
result = self.analyze_name_with_retry(client, row["name"], idx)
for field, value in result.items():
if field not in ["failed"]:
batch.loc[idx, field] = value
else:
# Concurrent processing
with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_idx = {}
for idx, row in unannotated_entries.iterrows():
future = executor.submit(self.analyze_name_with_retry, client, row["name"], idx)
future_to_idx[future] = idx
for future in as_completed(future_to_idx):
idx = future_to_idx[future]
try:
result = future.result()
for field, value in result.items():
if field not in ["failed"]:
batch.loc[idx, field] = value
except Exception as e:
logging.error(f"Failed to process row {idx}: {e}")
batch.loc[idx, "annotated"] = 0
# Ensure proper data types
batch["annotated"] = (
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
)
return batch
-26
View File
@@ -1,26 +0,0 @@
import ollama
from pydantic import BaseModel
from misc import load_prompt
class NameAnalysis(BaseModel):
identified_name: str | None
identified_surname: str | None
name = input("Enter name: ")
client = ollama.Client()
response = client.chat(
model="mistral:7b",
messages=[
{"role": "system", "content": load_prompt()},
{"role": "user", "content": name}
],
format=NameAnalysis.model_json_schema()
)
analysis = NameAnalysis.model_validate_json(response.message.content)
result = analysis.model_dump()
print(result)
+126 -1
View File
@@ -1,53 +1,178 @@
absl-py==2.3.0
altair==5.1.2
annotated-types==0.7.0
anyio==4.9.0
appnope==0.1.4
argon2-cffi==25.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==3.0.0
astunparse==1.6.3
async-lru==2.0.5
attrs==25.3.0
babel==2.17.0
beautifulsoup4==4.13.4
black==25.1.0
bleach==6.2.0
blinker==1.9.0
cachetools==6.1.0
certifi==2025.6.15
cffi==1.17.1
charset-normalizer==3.4.2
click==8.2.1
comm==0.2.2
contourpy==1.3.2
cycler==0.12.1
debugpy==1.8.14
decorator==5.2.1
defusedxml==0.7.1
executing==2.2.0
fastjsonschema==2.21.1
flake8==7.3.0
flatbuffers==25.2.10
fonttools==4.58.4
fqdn==1.5.1
gast==0.6.0
gitdb==4.0.12
GitPython==3.1.45
google-pasta==0.2.0
grpcio==1.73.0
h11==0.16.0
h5py==3.14.0
httpcore==1.0.9
httpx==0.28.1
idna==3.10
imbalanced-learn==0.13.0
ipykernel==6.29.5
ipython==9.4.0
ipython_pygments_lexers==1.1.1
isoduration==20.11.0
jedi==0.19.2
Jinja2==3.1.6
joblib==1.5.1
json5==0.12.0
jsonpointer==3.0.0
jsonschema==4.24.0
jsonschema-specifications==2025.4.1
jupyter-events==0.12.0
jupyter-lsp==2.2.5
jupyter_client==8.6.3
jupyter_core==5.8.1
jupyter_server==2.16.0
jupyter_server_terminals==0.5.3
jupyterlab==4.4.4
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
keras==3.10.0
kiwisolver==1.4.8
libclang==18.1.1
lightgbm==4.6.0
Markdown==3.8.2
markdown-it-py==3.0.0
MarkupSafe==3.0.2
matplotlib==3.10.3
matplotlib-inline==0.1.7
mccabe==0.7.0
mdurl==0.1.2
mistune==3.1.3
ml-dtypes==0.3.2
mypy==1.17.0
mypy_extensions==1.1.0
namex==0.1.0
narwhals==2.0.1
nbclient==0.10.2
nbconvert==7.16.6
nbformat==5.10.4
nest-asyncio==1.6.0
nltk==3.9.1
notebook==7.4.4
notebook_shim==0.2.4
numpy==1.26.4
ollama==0.5.1
opt_einsum==3.4.0
optree==0.16.0
overrides==7.7.0
packaging==25.0
pandas==2.3.0
pandocfilters==1.5.1
parso==0.8.4
pathspec==0.12.1
pexpect==4.9.0
pillow==11.2.1
platformdirs==4.3.8
plotly==6.2.0
prometheus_client==0.22.1
prompt_toolkit==3.0.51
protobuf==4.25.8
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pyarrow==21.0.0
pycodestyle==2.14.0
pycparser==2.22
pydantic==2.11.7
pydantic_core==2.33.2
pydeck==0.9.1
pyflakes==3.4.0
Pygments==2.19.1
pyparsing==3.2.3
python-dateutil==2.9.0.post0
python-json-logger==3.3.0
pytz==2025.2
PyYAML==6.0.2
pyzmq==27.0.0
referencing==0.36.2
regex==2024.11.6
requests==2.32.4
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rich==14.0.0
scikit-learn==1.7.0
rpds-py==0.26.0
scikit-learn==1.6.1
scipy==1.15.3
seaborn==0.13.2
Send2Trash==1.8.3
six==1.17.0
sklearn-compat==0.1.3
smmap==5.0.2
sniffio==1.3.1
soupsieve==2.7
stack-data==0.6.3
streamlit==1.47.1
tenacity==9.1.2
tensorboard==2.16.2
tensorboard-data-server==0.7.2
tensorflow==2.16.2
tensorflow-io-gcs-filesystem==0.37.1
termcolor==3.1.0
terminado==0.18.1
threadpoolctl==3.6.0
tinycss2==1.4.0
toml==0.10.2
toolz==1.0.0
tornado==6.5.1
tqdm==4.67.1
traitlets==5.14.3
types-python-dateutil==2.9.0.20250516
types-PyYAML==6.0.12.20250516
typing-inspection==0.4.1
typing_extensions==4.14.0
tzdata==2025.2
uri-template==1.3.0
urllib3==2.5.0
wcwidth==0.2.13
webcolors==24.11.1
webencodings==0.5.1
websocket-client==1.8.0
Werkzeug==3.1.3
wrapt==1.17.2
xgboost==3.0.3
scikit-learn~=1.6.1
ollama~=0.5.1
pydantic~=2.11.7
streamlit~=1.47.1
plotly~=6.2.0
altair==5.1.2
PyYAML~=6.0.2
xgboost~=3.0.3
lightgbm~=4.6.0
View File
+250
View File
@@ -0,0 +1,250 @@
import logging
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, List
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from research.experiment import ExperimentConfig
class BaseModel(ABC):
"""Abstract base class for all models"""
def __init__(self, config: ExperimentConfig):
self.config = config
self.model = None
self.feature_extractor = None
self.label_encoder = None
self.tokenizer = None # For neural models
self.is_fitted = False
self.training_history = {} # Store training history for learning curves
self.learning_curve_data = {} # Store learning curve experiment data
@property
@abstractmethod
def architecture(self) -> str:
"""Return the architecture type: 'neural_network', 'traditional', or 'ensemble'"""
pass
@abstractmethod
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
"""Prepare features for training/prediction"""
pass
@abstractmethod
def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
"""Fit the model - implemented differently for each architecture"""
pass
@abstractmethod
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> Dict[str, float] | dict[str, np.floating[Any]]:
"""Perform cross-validation and return average scores"""
pass
@abstractmethod
def generate_learning_curve(
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
pass
def predict(self, X: pd.DataFrame) -> np.ndarray:
"""Make predictions"""
if not self.is_fitted:
raise ValueError("Model must be fitted before making predictions")
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
predictions = self.model.predict(X_prepared)
# Handle different prediction formats
if hasattr(predictions, "shape") and len(predictions.shape) > 1:
# Neural network outputs (probabilities)
predictions = predictions.argmax(axis=1)
return self.label_encoder.inverse_transform(predictions)
def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
"""Get prediction probabilities if supported"""
if not self.is_fitted:
raise ValueError("Model must be fitted before making predictions")
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
if hasattr(self.model, "predict_proba"):
return self.model.predict_proba(X_prepared)
elif hasattr(self.model, "predict"):
# For neural networks that return probabilities directly
probabilities = self.model.predict(X_prepared)
if len(probabilities.shape) == 2 and probabilities.shape[1] > 1:
return probabilities
raise NotImplementedError("Model does not support probability predictions")
def get_feature_importance(self) -> Optional[Dict[str, float]]:
"""Get feature importance if supported by the model"""
if hasattr(self.model, "feature_importances_"):
# For tree-based models
importances = self.model.feature_importances_
feature_names = self._get_feature_names()
return dict(zip(feature_names, importances))
elif hasattr(self.model, "coef_"):
# For linear models
coefficients = np.abs(self.model.coef_[0])
feature_names = self._get_feature_names()
return dict(zip(feature_names, coefficients))
elif hasattr(self.model, "named_steps") and "classifier" in self.model.named_steps:
# For sklearn pipelines (like LogisticRegression with vectorizer)
classifier = self.model.named_steps["classifier"]
if hasattr(classifier, "coef_"):
coefficients = np.abs(classifier.coef_[0])
if hasattr(self.model.named_steps["vectorizer"], "get_feature_names_out"):
feature_names = self.model.named_steps["vectorizer"].get_feature_names_out()
# Take top features to avoid too many n-grams
top_indices = np.argsort(coefficients)[-20:]
return dict(zip(feature_names[top_indices], coefficients[top_indices]))
return None
def _get_feature_names(self) -> List[str]:
"""Get feature names (override in subclasses if needed)"""
if hasattr(self.model, "feature_names_in_"):
return list(self.model.feature_names_in_)
return [f"feature_{i}" for i in range(100)] # Default fallback
def save(self, path: str):
"""Save the complete model with training history"""
model_data = {
"model": self.model,
"feature_extractor": self.feature_extractor,
"label_encoder": self.label_encoder,
"tokenizer": self.tokenizer,
"config": self.config.to_dict(),
"is_fitted": self.is_fitted,
"training_history": self.training_history,
"learning_curve_data": self.learning_curve_data,
}
joblib.dump(model_data, path)
@classmethod
def load(cls, path: str) -> "BaseModel":
"""Load a saved model with training history"""
model_data = joblib.load(path)
# Recreate the model instance
from research.experiment import ExperimentConfig
config = ExperimentConfig.from_dict(model_data["config"])
instance = cls(config)
# Restore state
instance.model = model_data["model"]
instance.feature_extractor = model_data["feature_extractor"]
instance.label_encoder = model_data["label_encoder"]
instance.tokenizer = model_data.get("tokenizer")
instance.is_fitted = model_data["is_fitted"]
instance.training_history = model_data.get("training_history", {})
instance.learning_curve_data = model_data.get("learning_curve_data", {})
return instance
def plot_learning_curve(self, save_path: Optional[str] = None) -> str:
"""Plot and save learning curve"""
if not self.learning_curve_data:
logging.warning("No learning curve data available")
return ""
plt.figure(figsize=(10, 6))
data = self.learning_curve_data
train_sizes = data["train_sizes"]
train_scores = data["train_scores"]
val_scores = data["val_scores"]
train_std = data.get("train_scores_std", [0] * len(train_sizes))
val_std = data.get("val_scores_std", [0] * len(train_sizes))
# Plot learning curves
plt.plot(train_sizes, train_scores, "o-", color="blue", label="Training Score")
plt.fill_between(
train_sizes,
np.array(train_scores) - np.array(train_std),
np.array(train_scores) + np.array(train_std),
alpha=0.1,
color="blue",
)
plt.plot(train_sizes, val_scores, "o-", color="red", label="Validation Score")
plt.fill_between(
train_sizes,
np.array(val_scores) - np.array(val_std),
np.array(val_scores) + np.array(val_std),
alpha=0.1,
color="red",
)
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.title(f"Learning Curve - {self.__class__.__name__}")
plt.legend(loc="best")
plt.grid(True, alpha=0.3)
plt.tight_layout()
if save_path:
plt.savefig(save_path, dpi=300, bbox_inches="tight")
plt.close()
return save_path
else:
plt.show()
return ""
def plot_training_history(self, save_path: Optional[str] = None) -> str:
"""Plot training history for neural networks"""
if not self.training_history:
logging.warning("No training history available")
return ""
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
# Plot accuracy
if "accuracy" in self.training_history:
axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
if "val_accuracy" in self.training_history:
axes[0].plot(self.training_history["val_accuracy"], label="Validation Accuracy")
axes[0].set_title("Model Accuracy")
axes[0].set_xlabel("Epoch")
axes[0].set_ylabel("Accuracy")
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Plot loss
if "loss" in self.training_history:
axes[1].plot(self.training_history["loss"], label="Training Loss")
if "val_loss" in self.training_history:
axes[1].plot(self.training_history["val_loss"], label="Validation Loss")
axes[1].set_title("Model Loss")
axes[1].set_xlabel("Epoch")
axes[1].set_ylabel("Loss")
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
if save_path:
plt.savefig(save_path, dpi=300, bbox_inches="tight")
plt.close()
return save_path
else:
plt.show()
return ""
+91
View File
@@ -0,0 +1,91 @@
from dataclasses import dataclass, field, asdict
from enum import Enum
from typing import List, Dict, Any, Optional
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from .feature_extractor import FeatureType
@dataclass
class ExperimentConfig:
"""Configuration for a single experiment"""
# Experiment metadata
name: str
description: str = ""
tags: List[str] = field(default_factory=list)
# Model configuration
model_type: str = "logistic_regression" # logistic_regression, lstm, transformer, etc.
model_params: Dict[str, Any] = field(default_factory=dict)
# Feature configuration
features: List[FeatureType] = field(default_factory=lambda: [FeatureType.FULL_NAME])
feature_params: Dict[str, Any] = field(default_factory=dict)
# Data configuration
train_data_filter: Optional[Dict[str, Any]] = None # Filter criteria for training data
test_data_filter: Optional[Dict[str, Any]] = None
target_column: str = "sex"
# Training configuration
test_size: float = 0.2
random_seed: int = 42
cross_validation_folds: int = 5
# Evaluation configuration
metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"])
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization"""
result = asdict(self)
# Convert enums to strings
result["features"] = [f.value for f in self.features]
return result
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ExperimentConfig":
"""Create from dictionary"""
if "features" in data:
data["features"] = [FeatureType(f) for f in data["features"]]
return cls(**data)
class ExperimentStatus(Enum):
"""Experiment execution status"""
PENDING = "pending"
RUNNING = "running"
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
def calculate_metrics(
y_true: np.ndarray, y_pred: np.ndarray, metrics: List[str] = None
) -> Dict[str, float]:
"""Calculate specified metrics"""
if metrics is None:
metrics = ["accuracy", "precision", "recall", "f1"]
results = {}
if "accuracy" in metrics:
results["accuracy"] = accuracy_score(y_true, y_pred)
if any(m in metrics for m in ["precision", "recall", "f1"]):
precision, recall, f1, _ = precision_recall_fscore_support(
y_true, y_pred, average="weighted"
)
if "precision" in metrics:
results["precision"] = precision
if "recall" in metrics:
results["recall"] = recall
if "f1" in metrics:
results["f1"] = f1
return results
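A small illustrative round-trip of the config plus the metric helper (the values are made up):

config = ExperimentConfig(
    name="demo_fullname_logreg",
    model_type="logistic_regression",
    features=[FeatureType.FULL_NAME, FeatureType.PROVINCE],
    tags=["demo"],
)
assert ExperimentConfig.from_dict(config.to_dict()).features == config.features  # enums survive serialization
y_true = np.array(["m", "f", "f", "m"])
y_pred = np.array(["m", "f", "m", "m"])
print(calculate_metrics(y_true, y_pred))  # accuracy 0.75 plus weighted precision/recall/f1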
+56
View File
@@ -0,0 +1,56 @@
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional, Dict, List, Any
from research.experiment import ExperimentConfig, ExperimentStatus
@dataclass
class ExperimentResult:
"""Results from an experiment execution"""
experiment_id: str
config: ExperimentConfig
# Execution metadata
start_time: datetime
end_time: Optional[datetime] = None
status: ExperimentStatus = ExperimentStatus.PENDING
error_message: Optional[str] = None
# Model artifacts
model_path: Optional[str] = None
feature_extractor_path: Optional[str] = None
# Metrics
train_metrics: Dict[str, float] = field(default_factory=dict)
test_metrics: Dict[str, float] = field(default_factory=dict)
cv_metrics: Dict[str, float] = field(default_factory=dict)
# Additional results
confusion_matrix: Optional[List[List[int]]] = None
feature_importance: Optional[Dict[str, float]] = None
prediction_examples: Optional[List[Dict]] = None
# Data statistics
train_size: int = 0
test_size: int = 0
class_distribution: Dict[str, int] = field(default_factory=dict)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization"""
result = asdict(self)
result["config"] = self.config.to_dict()
result["start_time"] = self.start_time.isoformat()
result["end_time"] = self.end_time.isoformat() if self.end_time else None
result["status"] = self.status.value
return result
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ExperimentResult":
"""Create from dictionary"""
data["config"] = ExperimentConfig.from_dict(data["config"])
data["start_time"] = datetime.fromisoformat(data["start_time"])
data["end_time"] = datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
data["status"] = ExperimentStatus(data["status"])
return cls(**data)
+123
View File
@@ -0,0 +1,123 @@
from typing import List
from research.experiment import ExperimentConfig
from research.experiment.feature_extractor import FeatureType
class ExperimentBuilder:
"""Helper class to build experiment configurations"""
@staticmethod
def create_baseline_experiments() -> List[ExperimentConfig]:
"""Create a set of baseline experiments for comparison"""
return [
# Full name experiments
ExperimentConfig(
name="baseline_logistic_regression_fullname",
description="Logistic regression with full name",
model_type="logistic_regression",
features=[FeatureType.FULL_NAME],
tags=["baseline", "fullname"],
),
# Native name only
ExperimentConfig(
name="baseline_logistic_regression_native",
description="Logistic regression with native name only",
model_type="logistic_regression",
features=[FeatureType.NATIVE_NAME],
tags=["baseline", "native"],
),
# Surname only
ExperimentConfig(
name="baseline_logistic_regression_surname",
description="Logistic regression with surname only",
model_type="logistic_regression",
features=[FeatureType.SURNAME],
tags=["baseline", "surname"],
),
# Random Forest with engineered features
ExperimentConfig(
name="baseline_rf_engineered",
description="Random Forest with engineered features",
model_type="random_forest",
features=[FeatureType.NAME_LENGTH, FeatureType.WORD_COUNT, FeatureType.PROVINCE],
tags=["baseline", "engineered"],
),
]
@staticmethod
def create_feature_ablation_study() -> List[ExperimentConfig]:
"""Create experiments for feature ablation study"""
base_features = [
FeatureType.FULL_NAME,
FeatureType.NAME_LENGTH,
FeatureType.WORD_COUNT,
FeatureType.PROVINCE,
]
experiments = []
# Test removing each feature one by one
for i, feature_to_remove in enumerate(base_features):
remaining_features = [f for f in base_features if f != feature_to_remove]
experiments.append(
ExperimentConfig(
name=f"ablation_remove_{feature_to_remove.value}",
description=f"Ablation study: removed {feature_to_remove.value}",
model_type="logistic_regression",
features=remaining_features,
tags=["ablation", feature_to_remove.value],
)
)
return experiments
@staticmethod
def create_name_component_study() -> List[ExperimentConfig]:
"""Create experiments to study different name components"""
experiments = []
name_components = [
(FeatureType.FIRST_WORD, "first_word"),
(FeatureType.LAST_WORD, "last_word"),
(FeatureType.NATIVE_NAME, "native_name"),
(FeatureType.SURNAME, "surname"),
(FeatureType.NAME_BEGINNINGS, "name_beginnings"),
(FeatureType.NAME_ENDINGS, "name_endings"),
]
for feature, name in name_components:
experiments.append(
ExperimentConfig(
name=f"component_study_{name}",
description=f"Study of {name} for gender prediction",
model_type="logistic_regression",
features=[feature],
tags=["component_study", name],
)
)
return experiments
@staticmethod
def create_province_specific_study() -> List[ExperimentConfig]:
"""Create experiments for province-specific analysis"""
provinces = ["kinshasa", "bas-congo", "bandundu", "katanga"] # Add more as needed
experiments = []
for province in provinces:
experiments.append(
ExperimentConfig(
name=f"province_study_{province}",
description=f"Gender prediction for {province} province only",
model_type="logistic_regression",
features=[FeatureType.FULL_NAME],
train_data_filter={"province": province},
tags=["province_study", province],
)
)
return experiments
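The builders are deterministic factories, so their output can be inspected directly (counts follow from the lists above):

print(len(ExperimentBuilder.create_baseline_experiments()))    # 4 baseline configs
print(len(ExperimentBuilder.create_feature_ablation_study()))  # 4, one per removed feature
print([e.name for e in ExperimentBuilder.create_name_component_study()][:2])
# ['component_study_first_word', 'component_study_last_word']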
+238
View File
@@ -0,0 +1,238 @@
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from core.config import PipelineConfig
from core.utils import get_data_file_path
from core.utils.data_loader import DataLoader
from research.base_model import BaseModel
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
from research.experiment.experiment_tracker import ExperimentTracker
from research.model_registry import create_model
class ExperimentRunner:
"""Runs and manages experiments"""
def __init__(self, config: PipelineConfig):
self.config = config
self.tracker = ExperimentTracker(self.config)
self.data_loader = DataLoader(self.config)
def run_experiment(self, experiment_config: ExperimentConfig) -> str:
"""Run a single experiment and return experiment ID"""
# Create experiment
experiment_id = self.tracker.create_experiment(experiment_config)
try:
logging.info(f"Starting experiment: {experiment_id}")
self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
# Load data
data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
df = self.data_loader.load_csv_complete(data_path)
# Apply data filters if specified
df = self._apply_data_filters(df, experiment_config)
# Prepare target variable
y = df[experiment_config.target_column]
X = df
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
test_size=experiment_config.test_size,
random_state=experiment_config.random_seed,
stratify=y,
)
# Create and train model
model = create_model(experiment_config)
model.fit(X_train, y_train)
# Make predictions
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
# Calculate metrics
train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics)
test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics)
# Cross-validation if requested
cv_metrics = {}
if experiment_config.cross_validation_folds > 1:
cv_metrics = model.cross_validate(
X_train, y_train, experiment_config.cross_validation_folds
)
# Additional analysis
conf_matrix = confusion_matrix(y_test, test_pred).tolist()
feature_importance = model.get_feature_importance()
# Create prediction examples
prediction_examples = self._create_prediction_examples(
X_test, y_test, test_pred, model, n_examples=10
)
# Calculate class distribution
class_distribution = y.value_counts().to_dict()
# Save model
model_path = self._save_model(model, experiment_id)
# Update experiment with results
self.tracker.update_experiment(
experiment_id,
status=ExperimentStatus.COMPLETED,
end_time=datetime.now(),
model_path=str(model_path),
train_metrics=train_metrics,
test_metrics=test_metrics,
cv_metrics=cv_metrics,
confusion_matrix=conf_matrix,
feature_importance=feature_importance,
prediction_examples=prediction_examples,
train_size=len(X_train),
test_size=len(X_test),
class_distribution=class_distribution,
)
logging.info(f"Experiment {experiment_id} completed successfully")
logging.info(f"Test accuracy: {test_metrics.get('accuracy', 'N/A'):.4f}")
return experiment_id
except Exception as e:
logging.error(f"Experiment {experiment_id} failed: {str(e)}")
self.tracker.update_experiment(
experiment_id,
status=ExperimentStatus.FAILED,
end_time=datetime.now(),
error_message=str(e),
)
raise
def run_experiment_batch(self, experiments: List[ExperimentConfig]) -> List[str]:
"""Run multiple experiments"""
experiment_ids = []
for i, config in enumerate(experiments):
logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}")
try:
exp_id = self.run_experiment(config)
experiment_ids.append(exp_id)
except Exception as e:
logging.error(f"Failed to run experiment {config.name}: {e}")
continue
return experiment_ids
@classmethod
def _apply_data_filters(cls, df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame:
"""Apply data filters specified in experiment config"""
filtered_df = df.copy()
# Apply training data filters
if config.train_data_filter:
for column, criteria in config.train_data_filter.items():
if column in filtered_df.columns:
if isinstance(criteria, list):
filtered_df = filtered_df[filtered_df[column].isin(criteria)]
elif isinstance(criteria, dict):
if "min" in criteria:
filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
if "max" in criteria:
filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
else:
filtered_df = filtered_df[filtered_df[column] == criteria]
return filtered_df
@classmethod
def _create_prediction_examples(
cls,
X_test: pd.DataFrame,
y_test: pd.Series,
predictions: np.ndarray,
model: BaseModel,
n_examples: int = 10,
) -> List[Dict]:
"""Create prediction examples for analysis"""
examples = []
# Get both correct and incorrect predictions
correct_mask = y_test == predictions
incorrect_indices = X_test[~correct_mask].index[: n_examples // 2]
correct_indices = X_test[correct_mask].index[: n_examples // 2]
sample_indices = list(incorrect_indices) + list(correct_indices)
for idx in sample_indices[:n_examples]:
example = {
"name": X_test.loc[idx, "name"] if "name" in X_test.columns else "N/A",
"true_label": y_test.loc[idx],
"predicted_label": predictions[X_test.index.get_loc(idx)],
"correct": y_test.loc[idx] == predictions[X_test.index.get_loc(idx)],
}
# Add probability if available
if model.architecture == "traditional":
proba = model.predict_proba(X_test.loc[[idx]])
example["prediction_confidence"] = float(proba.max())
examples.append(example)
return examples
def _save_model(self, model: BaseModel, experiment_id: str) -> Path:
"""Save trained model"""
model_dir = self.config.paths.models_dir / "experiments" / experiment_id
model_dir.mkdir(parents=True, exist_ok=True)
model_path = model_dir / "model.joblib"
model.save(str(model_path))
return model_path
def load_experiment_model(self, experiment_id: str) -> Optional[BaseModel]:
"""Load a model from a completed experiment"""
experiment = self.tracker.get_experiment(experiment_id)
if experiment and experiment.model_path:
return BaseModel.load(experiment.model_path)
return None
def compare_experiments(
self, experiment_ids: List[str], metric: str = "accuracy"
) -> pd.DataFrame:
"""Compare experiments and return analysis"""
comparison_df = self.tracker.compare_experiments(experiment_ids)
if f"test_{metric}" in comparison_df.columns:
comparison_df = comparison_df.sort_values(f"test_{metric}", ascending=False)
return comparison_df
def get_feature_analysis(self, experiment_id: str) -> Optional[pd.DataFrame]:
"""Get feature importance analysis for an experiment"""
experiment = self.tracker.get_experiment(experiment_id)
if experiment and experiment.feature_importance:
importance_df = pd.DataFrame(
[
{"feature": feature, "importance": importance}
for feature, importance in experiment.feature_importance.items()
]
)
return importance_df.sort_values("importance", ascending=False)
return None
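End to end, the runner is driven from a PipelineConfig; a sketch of a batch run (the experiment_builder module path is an assumption):

from core.config import get_config
from research.experiment.experiment_builder import ExperimentBuilder  # assumed module path
from research.experiment.experiment_runner import ExperimentRunner

runner = ExperimentRunner(get_config())
experiment_ids = runner.run_experiment_batch(ExperimentBuilder.create_baseline_experiments())
print(runner.compare_experiments(experiment_ids, metric="f1"))
best = runner.tracker.get_best_experiment(metric="f1")
print(best.experiment_id if best else "no completed experiments yet")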
+194
View File
@@ -0,0 +1,194 @@
import hashlib
import json
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, List
import pandas as pd
from core.config import PipelineConfig, get_config
from research.experiment import ExperimentConfig, ExperimentStatus
from research.experiment.experiement_result import ExperimentResult
class ExperimentTracker:
"""Tracks and manages experiments"""
def __init__(self, config: Optional[PipelineConfig] = None):
self.config = config or get_config()
self.experiments_dir = self.config.paths.outputs_dir / "experiments"
self.experiments_dir.mkdir(parents=True, exist_ok=True)
self.results_db_path = self.experiments_dir / "experiments.json"
self._results: Dict[str, ExperimentResult] = {}
self._load_results()
def _load_results(self):
"""Load existing experiment results"""
if self.results_db_path.exists():
try:
with open(self.results_db_path, "r") as f:
data = json.load(f)
for exp_id, exp_data in data.items():
self._results[exp_id] = ExperimentResult.from_dict(exp_data)
except Exception as e:
print(f"Warning: Failed to load experiment results: {e}")
def _save_results(self):
"""Save experiment results to disk"""
data = {exp_id: result.to_dict() for exp_id, result in self._results.items()}
with open(self.results_db_path, "w") as f:
json.dump(data, f, indent=2, default=str)
def create_experiment(self, config: ExperimentConfig) -> str:
"""Create a new experiment and return its ID"""
# Generate experiment ID
config_hash = hashlib.md5(
json.dumps(config.to_dict(), sort_keys=True).encode()
).hexdigest()[:8]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
experiment_id = f"{config.name}_{timestamp}_{config_hash}"
# Create result object
result = ExperimentResult(
experiment_id=experiment_id, config=config, start_time=datetime.now()
)
self._results[experiment_id] = result
self._save_results()
return experiment_id
def update_experiment(self, experiment_id: str, **updates):
"""Update an experiment's results"""
if experiment_id in self._results:
result = self._results[experiment_id]
for key, value in updates.items():
if hasattr(result, key):
setattr(result, key, value)
self._save_results()
def get_experiment(self, experiment_id: str) -> Optional[ExperimentResult]:
"""Get experiment by ID"""
return self._results.get(experiment_id)
def list_experiments(
self,
status: Optional[ExperimentStatus] = None,
tags: Optional[List[str]] = None,
model_type: Optional[str] = None,
) -> List[ExperimentResult]:
"""List experiments with optional filtering"""
results = list(self._results.values())
if status:
results = [r for r in results if r.status == status]
if tags:
results = [r for r in results if any(tag in r.config.tags for tag in tags)]
if model_type:
results = [r for r in results if r.config.model_type == model_type]
return sorted(results, key=lambda x: x.start_time, reverse=True)
def get_best_experiment(
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
) -> Optional[ExperimentResult]:
"""Get the best experiment based on a metric"""
experiments = self.list_experiments()
if filters:
# Apply additional filters
if "model_type" in filters:
experiments = [
e for e in experiments if e.config.model_type == filters["model_type"]
]
if "features" in filters:
experiments = [
e
for e in experiments
if any(f in e.config.features for f in filters["features"])
]
valid_experiments = []
for exp in experiments:
if exp.status == ExperimentStatus.COMPLETED:
metrics_dict = exp.test_metrics if dataset == "test" else exp.train_metrics
if metric in metrics_dict:
valid_experiments.append((exp, metrics_dict[metric]))
if not valid_experiments:
return None
return max(valid_experiments, key=lambda x: x[1])[0]
def compare_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
"""Compare multiple experiments in a DataFrame"""
rows = []
for exp_id in experiment_ids:
exp = self.get_experiment(exp_id)
if exp:
row = {
"experiment_id": exp_id,
"name": exp.config.name,
"model_type": exp.config.model_type,
"features": ",".join([f.value for f in exp.config.features]),
"status": exp.status.value,
"train_size": exp.train_size,
"test_size": exp.test_size,
}
# Add metrics
for metric, value in exp.test_metrics.items():
row[f"test_{metric}"] = value
for metric, value in exp.cv_metrics.items():
row[f"cv_{metric}"] = value
rows.append(row)
return pd.DataFrame(rows)
def export_results(self, output_path: Optional[Path] = None) -> Path:
"""Export all results to CSV"""
if output_path is None:
output_path = (
self.experiments_dir
/ f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
)
rows = []
for exp in self._results.values():
row = {
"experiment_id": exp.experiment_id,
"name": exp.config.name,
"description": exp.config.description,
"model_type": exp.config.model_type,
"features": ",".join([f.value for f in exp.config.features]),
"status": exp.status.value,
"start_time": exp.start_time.isoformat(),
"end_time": exp.end_time.isoformat() if exp.end_time else None,
"train_size": exp.train_size,
"test_size": exp.test_size,
}
# Add all metrics
for metric, value in exp.test_metrics.items():
row[f"test_{metric}"] = value
for metric, value in exp.cv_metrics.items():
row[f"cv_{metric}"] = value
rows.append(row)
df = pd.DataFrame(rows)
df.to_csv(output_path, index=False)
return output_path
+90
View File
@@ -0,0 +1,90 @@
from enum import Enum
from typing import List, Dict, Any, Union
import pandas as pd
class FeatureType(Enum):
"""Types of features that can be extracted from names"""
FULL_NAME = "full_name"
NATIVE_NAME = "native_name"
SURNAME = "surname"
FIRST_WORD = "first_word"
LAST_WORD = "last_word"
NAME_LENGTH = "name_length"
WORD_COUNT = "word_count"
PROVINCE = "province"
CHAR_NGRAMS = "char_ngrams"
WORD_NGRAMS = "word_ngrams"
NAME_ENDINGS = "name_endings"
NAME_BEGINNINGS = "name_beginnings"
class FeatureExtractor:
"""Extract different types of features from name data"""
def __init__(self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None):
self.feature_types = feature_types
self.feature_params = feature_params or {}
def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Extract all configured features"""
features_df = pd.DataFrame(index=df.index)
for feature_type in self.feature_types:
feature_data = self._extract_single_feature(df, feature_type)
if isinstance(feature_data, pd.DataFrame):
features_df = pd.concat([features_df, feature_data], axis=1)
else:
features_df[feature_type.value] = feature_data
return features_df
def _extract_single_feature(
self, df: pd.DataFrame, feature_type: FeatureType
) -> Union[pd.Series, pd.DataFrame]:
"""Extract a single type of feature"""
if feature_type == FeatureType.FULL_NAME:
return df["name"].fillna("")
elif feature_type == FeatureType.NATIVE_NAME:
return df["identified_name"].fillna(df["probable_native"]).fillna("")
elif feature_type == FeatureType.SURNAME:
return df["identified_surname"].fillna(df["probable_surname"]).fillna("")
elif feature_type == FeatureType.FIRST_WORD:
return df["name"].str.split().str[0].fillna("")
elif feature_type == FeatureType.LAST_WORD:
return df["name"].str.split().str[-1].fillna("")
elif feature_type == FeatureType.NAME_LENGTH:
return df["name"].str.len().fillna(0)
elif feature_type == FeatureType.WORD_COUNT:
return df["words"].fillna(1)
elif feature_type == FeatureType.PROVINCE:
return df["province"].fillna("unknown")
elif feature_type == FeatureType.NAME_ENDINGS:
n = self.feature_params.get("ending_length", 3)
return df["name"].str[-n:].fillna("")
elif feature_type == FeatureType.NAME_BEGINNINGS:
n = self.feature_params.get("beginning_length", 3)
return df["name"].str[:n].fillna("")
elif feature_type == FeatureType.CHAR_NGRAMS:
# This will be handled by the model's vectorizer
return df["name"].fillna("")
elif feature_type == FeatureType.WORD_NGRAMS:
# This will be handled by the model's vectorizer
return df["name"].fillna("")
else:
raise ValueError(f"Unknown feature type: {feature_type}")
+44
View File
@@ -0,0 +1,44 @@
from typing import List
from research.base_model import BaseModel
from research.experiment import ExperimentConfig
from research.models.bigru_model import BiGRUModel
from research.models.cnn_model import CNNModel
from research.models.ensemble_model import EnsembleModel
from research.models.lightgbm_model import LightGBMModel
from research.models.logistic_regression_model import LogisticRegressionModel
from research.models.lstm_model import LSTMModel
from research.models.naive_bayes_model import NaiveBayesModel
from research.models.random_forest_model import RandomForestModel
from research.models.svm_model import SVMModel
from research.models.transformer_model import TransformerModel
from research.models.xgboost_model import XGBoostModel
MODEL_REGISTRY = {
"bigru": BiGRUModel,
"cnn": CNNModel,
"ensemble": EnsembleModel,
"lightgbm": LightGBMModel,
"logistic_regression": LogisticRegressionModel,
"lstm": LSTMModel,
"naive_bayes": NaiveBayesModel,
"random_forest": RandomForestModel,
"svm": SVMModel,
"transformer": TransformerModel,
"xgboost": XGBoostModel,
}
def create_model(config: ExperimentConfig) -> BaseModel:
"""Factory function to create models"""
model_class = MODEL_REGISTRY.get(config.model_type)
if model_class is None:
raise ValueError(f"Unknown model type: {config.model_type}")
return model_class(config)
def list_available_models() -> List[str]:
"""List all available model types"""
return list(MODEL_REGISTRY.keys())
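The factory simply keys into the registry; any ExperimentConfig built earlier in this commit can be passed through it:

config = ExperimentConfig(name="registry_demo", model_type="xgboost")
model = create_model(config)
print(type(model).__name__)     # XGBoostModel
print(list_available_models())  # the eleven keys of MODEL_REGISTRY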
+281
View File
@@ -0,0 +1,281 @@
import json
import logging
from datetime import datetime
from typing import List, Dict, Any
import pandas as pd
from core.config import get_config
from core.utils import get_data_file_path
from core.utils.data_loader import DataLoader
from research.experiment import FeatureType, ExperimentConfig
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
class ModelTrainer:
"""Comprehensive model training and artifact management"""
def __init__(self, config=None):
self.config = config or get_config()
self.data_loader = DataLoader(self.config)
self.experiment_runner = ExperimentRunner(self.config)
self.experiment_tracker = ExperimentTracker(self.config)
self.logger = logging.getLogger(__name__)
# Setup model artifacts directory
self.models_dir = self.config.paths.models_dir
self.models_dir.mkdir(parents=True, exist_ok=True)
def train_single_model(
self,
model_name: str,
model_type: str = "logistic_regression",
features: List[str] = None,
model_params: Dict[str, Any] = None,
save_artifacts: bool = True,
) -> str:
"""
Train a single model and save its artifacts.
Returns the experiment ID.
"""
self.logger.info(f"Training {model_type} model: {model_name}")
if features is None:
features = ["full_name"]
feature_types = [FeatureType(f) for f in features]
# Create experiment configuration
config = ExperimentConfig(
name=model_name,
description=f"Training {model_type} model with features: {', '.join(features)}",
model_type=model_type,
features=feature_types,
model_params=model_params or {},
tags=["training", model_type],
)
# Run experiment
experiment_id = self.experiment_runner.run_experiment(config)
experiment = self.experiment_tracker.get_experiment(experiment_id)
if experiment and experiment.test_metrics:
self.logger.info("Training completed successfully!")
self.logger.info(f" Experiment ID: {experiment_id}")
self.logger.info(f" Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}")
self.logger.info(f" Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")
if save_artifacts:
self.save_model_artifacts(experiment_id)
return experiment_id
def train_multiple_models(
self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
) -> List[str]:
"""
Train multiple models with different configurations.
"""
self.logger.info(f"Training {len(model_configs)} models...")
experiment_ids = []
for i, config in enumerate(model_configs):
model_name = f"{base_name}_{config['model_type']}_{i + 1}"
try:
exp_id = self.train_single_model(
model_name=model_name,
model_type=config["model_type"],
features=config.get("features", ["full_name"]),
model_params=config.get("model_params", {}),
save_artifacts=save_all,
)
experiment_ids.append(exp_id)
except Exception as e:
self.logger.error(f"Failed to train {model_name}: {e}")
continue
self.logger.info(f"Completed training {len(experiment_ids)} models successfully")
return experiment_ids
def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]:
"""
Save model artifacts in a structured way for easy loading.
Returns paths to saved artifacts.
"""
experiment = self.experiment_tracker.get_experiment(experiment_id)
if not experiment:
raise ValueError(f"Experiment {experiment_id} not found")
# Create model-specific directory
model_dir = self.models_dir / experiment_id
model_dir.mkdir(parents=True, exist_ok=True)
# Load the trained model
trained_model = self.experiment_runner.load_experiment_model(experiment_id)
if not trained_model:
raise ValueError(f"Could not load model for experiment {experiment_id}")
# Save complete model with joblib
model_path = model_dir / "complete_model.joblib"
trained_model.save(str(model_path))
# Save model configuration
config_path = model_dir / "model_config.json"
with open(config_path, "w") as f:
json.dump(experiment.config.to_dict(), f, indent=2)
# Save experiment results
results_path = model_dir / "experiment_results.json"
with open(results_path, "w") as f:
json.dump(experiment.to_dict(), f, indent=2, default=str)
# Generate and save learning curves
learning_curve_path = None
training_history_path = None
try:
# Load data for learning curve generation
data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
if data_path.exists():
df = self.data_loader.load_csv_complete(data_path)
# Generate learning curve
self.logger.info("Generating learning curve...")
trained_model.generate_learning_curve(df, df[experiment.config.target_column])
# Plot and save learning curve
learning_curve_path = model_dir / "learning_curve.png"
trained_model.plot_learning_curve(str(learning_curve_path))
# Plot and save training history (for neural networks)
if trained_model.training_history:
training_history_path = model_dir / "training_history.png"
trained_model.plot_training_history(str(training_history_path))
# Save learning curve data as JSON
learning_data_path = model_dir / "learning_curve_data.json"
with open(learning_data_path, "w") as f:
json.dump(trained_model.learning_curve_data, f, indent=2)
# Save training history data as JSON
if trained_model.training_history:
history_data_path = model_dir / "training_history_data.json"
with open(history_data_path, "w") as f:
json.dump(trained_model.training_history, f, indent=2)
except Exception as e:
self.logger.warning(f"Could not generate learning curves: {e}")
# Save artifacts metadata
metadata = {
"experiment_id": experiment_id,
"model_name": experiment.config.name,
"model_type": experiment.config.model_type,
"features": [f.value for f in experiment.config.features],
"training_date": datetime.now().isoformat(),
"test_accuracy": experiment.test_metrics.get("accuracy", 0),
"test_f1": experiment.test_metrics.get("f1", 0),
"model_path": str(model_path),
"config_path": str(config_path),
"results_path": str(results_path),
"learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
"training_history_plot": str(training_history_path) if training_history_path else None,
"has_learning_curve": bool(trained_model.learning_curve_data),
"has_training_history": bool(trained_model.training_history),
}
metadata_path = model_dir / "metadata.json"
with open(metadata_path, "w") as f:
json.dump(metadata, f, indent=2)
self.logger.info(f"Model artifacts saved to: {model_dir}")
self.logger.info(f" - Complete model: {model_path.name}")
self.logger.info(f" - Configuration: {config_path.name}")
self.logger.info(f" - Results: {results_path.name}")
self.logger.info(f" - Metadata: {metadata_path.name}")
if learning_curve_path and learning_curve_path.exists():
self.logger.info(f" - Learning curve: {learning_curve_path.name}")
if training_history_path and training_history_path.exists():
self.logger.info(f" - Training history: {training_history_path.name}")
return {
"model_dir": str(model_dir),
"model_path": str(model_path),
"config_path": str(config_path),
"results_path": str(results_path),
"metadata_path": str(metadata_path),
"learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
"training_history_plot": str(training_history_path) if training_history_path else None,
}
def load_trained_model(self, experiment_id: str):
"""
Load a previously trained model from artifacts.
"""
model_dir = self.models_dir / experiment_id
model_path = model_dir / "complete_model.joblib"
if not model_path.exists():
raise FileNotFoundError(f"Model artifacts not found for experiment {experiment_id}")
# Load the model class dynamically
metadata_path = model_dir / "metadata.json"
with open(metadata_path, "r") as f:
metadata = json.load(f)
model_type = metadata["model_type"]
from research.model_registry import MODEL_REGISTRY
model_class = MODEL_REGISTRY[model_type]
# Load the complete model
loaded_model = model_class.load(str(model_path))
self.logger.info(f"Loaded model: {metadata['model_name']}")
self.logger.info(f" Type: {model_type}")
self.logger.info(f" Accuracy: {metadata['test_accuracy']:.4f}")
return loaded_model
def list_saved_models(self) -> pd.DataFrame:
"""
List all saved model artifacts.
"""
models_data = []
for model_dir in self.models_dir.iterdir():
if model_dir.is_dir():
metadata_path = model_dir / "metadata.json"
if metadata_path.exists():
try:
with open(metadata_path, "r") as f:
metadata = json.load(f)
models_data.append(metadata)
except Exception as e:
self.logger.warning(f"Could not read metadata for {model_dir.name}: {e}")
if not models_data:
self.logger.info("No saved models found.")
return pd.DataFrame()
df = pd.DataFrame(models_data)
# Format the display
display_columns = [
"model_name",
"model_type",
"features",
"test_accuracy",
"test_f1",
"training_date",
]
available_columns = [col for col in display_columns if col in df.columns]
return df[available_columns].sort_values("training_date", ascending=False)
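For reference, the intended call sequence for ModelTrainer (a sketch; it assumes the featured dataset referenced by the active config already exists on disk):

from research.model_trainer import ModelTrainer

trainer = ModelTrainer()
exp_id = trainer.train_single_model(
    model_name="lr_full_name",
    model_type="logistic_regression",
    features=["full_name"],
    model_params={"ngram_range": (2, 4), "max_features": 5000},
)  # artifacts are saved automatically because save_artifacts defaults to True
model = trainer.load_trained_model(exp_id)
print(trainer.list_saved_models())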
+56
View File
@@ -0,0 +1,56 @@
from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from research.neural_network_model import NeuralNetworkModel
class BiGRUModel(NeuralNetworkModel):
"""Bidirectional GRU model for name classification"""
def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
params = kwargs
model = Sequential(
[
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
Bidirectional(
GRU(
params.get("gru_units", 32),
return_sequences=True,
dropout=params.get("dropout", 0.2),
)
),
Bidirectional(GRU(params.get("gru_units", 32), dropout=params.get("dropout", 0.2))),
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
Dense(2, activation="softmax"),
]
)
model.compile(
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)
return model
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_data.extend(X[feature_type.value].astype(str).tolist())
if not text_data:
raise ValueError("No text data found in the provided DataFrame.")
if self.tokenizer is None:
self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
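For orientation, this is how the word-level tokenizer above relates to the vocab_size passed into build_model_with_vocab (the derivation itself lives in the shared NeuralNetworkModel.fit later in this commit); the sample names are made up:

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

names = ["Kabila Mwamba", "Tshala Nzeba"]  # hypothetical samples
tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
tokenizer.fit_on_texts(names)
padded = pad_sequences(tokenizer.texts_to_sequences(names), maxlen=6, padding="post")
vocab_size = len(tokenizer.word_index) + 1
print(padded.shape, vocab_size)  # (2, 6) and a small vocabulary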
+75
View File
@@ -0,0 +1,75 @@
from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import (
Embedding,
Conv1D,
MaxPooling1D,
GlobalMaxPooling1D,
Dense,
Dropout,
)
from tensorflow.keras.models import Sequential
from research.neural_network_model import NeuralNetworkModel
class CNNModel(NeuralNetworkModel):
"""1D Convolutional Neural Network for character patterns"""
def build_model_with_vocab(self, vocab_size: int, max_len: int = 20, **kwargs) -> Any:
"""Build CNN model with known vocabulary size"""
params = kwargs
model = Sequential(
[
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
Conv1D(
filters=params.get("filters", 64),
kernel_size=params.get("kernel_size", 3),
activation="relu",
),
MaxPooling1D(pool_size=2),
Conv1D(
filters=params.get("filters", 64),
kernel_size=params.get("kernel_size", 3),
activation="relu",
),
GlobalMaxPooling1D(),
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
Dense(2, activation="softmax"),
]
)
model.compile(
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)
return model
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
"""Prepare sequences for CNN using extracted features"""
# X here contains the features already extracted by FeatureExtractor
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Get text data from extracted features - use character level for CNN
text_data = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_data.extend(X[feature_type.value].astype(str).tolist())
if not text_data:
# Fallback - should not happen if FeatureExtractor is properly configured
text_data = [""] * len(X)
# Initialize character-level tokenizer
if self.tokenizer is None:
self.tokenizer = Tokenizer(char_level=True, lower=True, oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
max_len = self.config.model_params.get("max_len", 20) # Longer for character level
return pad_sequences(sequences, maxlen=max_len, padding="post")
+97
View File
@@ -0,0 +1,97 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from research.experiment import ExperimentConfig
from research.traditional_model import TraditionalModel
class EnsembleModel(TraditionalModel):
"""Ensemble model combining multiple base models"""
@property
def architecture(self) -> str:
"""Return the architecture type"""
return "ensemble"
def __init__(self, config: ExperimentConfig):
super().__init__(config)
self.base_models = []
self.model_weights = None
def build_model(self) -> BaseEstimator:
params = self.config.model_params
base_model_types = params.get(
"base_models", ["logistic_regression", "random_forest", "naive_bayes"]
)
# Create base models with simplified configs
estimators = []
for model_type in base_model_types:
if model_type == "logistic_regression":
model = Pipeline(
[
(
"vectorizer",
CountVectorizer(analyzer="char", ngram_range=(2, 4), max_features=5000),
),
(
"classifier",
LogisticRegression(max_iter=1000, random_state=self.config.random_seed),
),
]
)
estimators.append((f"logistic_regression", model))
elif model_type == "random_forest":
model = Pipeline(
[
(
"vectorizer",
TfidfVectorizer(analyzer="char", ngram_range=(2, 3), max_features=3000),
),
(
"classifier",
RandomForestClassifier(
n_estimators=50, random_state=self.config.random_seed
),
),
]
)
estimators.append((f"rf", model))
elif model_type == "naive_bayes":
model = Pipeline(
[
(
"vectorizer",
CountVectorizer(analyzer="char", ngram_range=(1, 3), max_features=4000),
),
("classifier", MultinomialNB()),
]
)
estimators.append((f"nb", model))
voting_type = params.get("voting", "soft") # 'hard' or 'soft'
return VotingClassifier(estimators=estimators, voting=voting_type)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_features.append(X[feature_type.value].astype(str))
if len(text_features) == 1:
return text_features[0].values
else:
combined = text_features[0].astype(str)
for feature in text_features[1:]:
combined = combined + " " + feature.astype(str)
return combined.values
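For reference, with voting="soft" the VotingClassifier averages the per-class probabilities of the three base pipelines and predicts the class with the highest mean; a sketch with made-up numbers:

import numpy as np

# hypothetical P(class=1) from logistic_regression, random_forest, naive_bayes
probs = np.array([0.80, 0.55, 0.40])
print(probs.mean())  # 0.583... -> class 1 wins under equal weights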
+51
View File
@@ -0,0 +1,51 @@
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from research.traditional_model import TraditionalModel
class LightGBMModel(TraditionalModel):
"""LightGBM with engineered features"""
def build_model(self) -> BaseEstimator:
params = self.config.model_params
return lgb.LGBMClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", -1),
learning_rate=params.get("learning_rate", 0.1),
num_leaves=params.get("num_leaves", 31),
subsample=params.get("subsample", 0.8),
colsample_bytree=params.get("colsample_bytree", 0.8),
random_state=self.config.random_seed,
verbose=-1,
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
column = X[feature_type.value]
if feature_type.value in ["name_length", "word_count"]:
features.append(column.fillna(0).values.reshape(-1, 1))
elif feature_type.value in ["full_name", "native_name", "surname"]:
# Character n-grams for text features
vectorizer = CountVectorizer(
analyzer="char", ngram_range=(2, 3), max_features=50
)
char_features = vectorizer.fit_transform(
column.fillna("").astype(str)
).toarray()
features.append(char_features)
else:
le = LabelEncoder()
encoded = le.fit_transform(column.fillna("unknown").astype(str))
features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
@@ -0,0 +1,44 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from research.traditional_model import TraditionalModel
class LogisticRegressionModel(TraditionalModel):
"""Logistic Regression with character n-grams"""
def build_model(self) -> BaseEstimator:
params = self.config.model_params
vectorizer = CountVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 5)),
max_features=params.get("max_features", 10000),
)
classifier = LogisticRegression(
max_iter=params.get("max_iter", 1000), random_state=self.config.random_seed
)
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
# Collect text-based features from the extracted features DataFrame
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_features.append(X[feature_type.value].astype(str))
# Combine text features
if len(text_features) == 1:
return text_features[0].values
else:
# Concatenate multiple text features with separator
combined = text_features[0].astype(str)
for feature in text_features[1:]:
combined = combined + " " + feature.astype(str)
return combined.values
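As a standalone reference, the pipeline built above is equivalent to the following plain scikit-learn snippet (toy names and labels; parameters copied from build_model's defaults):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("vectorizer", CountVectorizer(analyzer="char", ngram_range=(2, 5), max_features=10000)),
    ("classifier", LogisticRegression(max_iter=1000, random_state=42)),
])
pipe.fit(["kalala tshibangu", "mwamini kahindo"], [0, 1])  # toy data
print(pipe.predict(["kalala"]))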
+52
View File
@@ -0,0 +1,52 @@
from typing import Any
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from research.neural_network_model import NeuralNetworkModel
class LSTMModel(NeuralNetworkModel):
"""LSTM model for sequence learning"""
def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
params = kwargs
model = Sequential(
[
Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64)),
Bidirectional(LSTM(params.get("lstm_units", 32), return_sequences=True)),
Bidirectional(LSTM(params.get("lstm_units", 32))),
Dense(64, activation="relu"),
Dense(2, activation="softmax"),
]
)
model.compile(
loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)
return model
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_data.extend(X[feature_type.value].astype(str).tolist())
if not text_data:
raise ValueError("No text data found in the provided DataFrame.")
# Initialize tokenizer if needed
if self.tokenizer is None:
self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
# Convert to sequences
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
+39
View File
@@ -0,0 +1,39 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from research.traditional_model import TraditionalModel
class NaiveBayesModel(TraditionalModel):
"""Multinomial Naive Bayes with character n-grams"""
def build_model(self) -> BaseEstimator:
params = self.config.model_params
vectorizer = CountVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (1, 4)),
max_features=params.get("max_features", 8000),
)
classifier = MultinomialNB(alpha=params.get("alpha", 1.0))
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_features.append(X[feature_type.value].astype(str))
if len(text_features) == 1:
return text_features[0].values
else:
combined = text_features[0].astype(str)
for feature in text_features[1:]:
combined = combined + " " + feature.astype(str)
return combined.values
+40
View File
@@ -0,0 +1,40 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from research.traditional_model import TraditionalModel
class RandomForestModel(TraditionalModel):
"""Random Forest with engineered features"""
def build_model(self) -> BaseEstimator:
params = self.config.model_params
return RandomForestClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", None),
random_state=self.config.random_seed,
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
column = X[feature_type.value]
# Handle different feature types
if feature_type.value in ["name_length", "word_count"]:
# Numerical features
features.append(column.fillna(0).values.reshape(-1, 1))
else:
# Categorical features (encode them)
le = LabelEncoder()
encoded = le.fit_transform(column.fillna("unknown").astype(str))
features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
+45
View File
@@ -0,0 +1,45 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from research.traditional_model import TraditionalModel
class SVMModel(TraditionalModel):
"""Support Vector Machine with character n-grams and RBF kernel"""
def build_model(self) -> BaseEstimator:
params = self.config.model_params
vectorizer = TfidfVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 4)),
max_features=params.get("max_features", 5000),
)
classifier = SVC(
kernel=params.get("kernel", "rbf"),
C=params.get("C", 1.0),
gamma=params.get("gamma", "scale"),
probability=True, # Enable probability prediction
random_state=self.config.random_seed,
)
return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_features.append(X[feature_type.value].astype(str))
if len(text_features) == 1:
return text_features[0].values
else:
combined = text_features[0].astype(str)
for feature in text_features[1:]:
combined = combined + " " + feature.astype(str)
return combined.values
+82
View File
@@ -0,0 +1,82 @@
from typing import Any
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import (
Input,
Embedding,
Dense,
GlobalAveragePooling1D,
MultiHeadAttention,
Dropout,
LayerNormalization,
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from research.neural_network_model import NeuralNetworkModel
class TransformerModel(NeuralNetworkModel):
"""Transformer-based model"""
def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
params = kwargs
# Build Transformer model
inputs = Input(shape=(max_len,))
x = Embedding(input_dim=vocab_size, output_dim=params.get("embedding_dim", 64))(inputs)
# Add positional encoding
positions = tf.range(start=0, limit=max_len, delta=1)
pos_embedding = Embedding(input_dim=max_len, output_dim=params.get("embedding_dim", 64))(
positions
)
x = x + pos_embedding
x = self._transformer_encoder(x, params)
x = GlobalAveragePooling1D()(x)
x = Dense(32, activation="relu")(x)
outputs = Dense(2, activation="softmax")(x)
model = Model(inputs, outputs)
model.compile(
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
return model
@staticmethod
def _transformer_encoder(x, cfg_params):
"""Transformer encoder block"""
attn = MultiHeadAttention(
num_heads=cfg_params.get("transformer_num_heads", 2),
key_dim=cfg_params.get("transformer_head_size", 64),
)(x, x)
x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(attn))
ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
ff = Dense(x.shape[-1])(ff)
return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg_params.get("dropout", 0.1))(ff))
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_data = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
text_data.extend(X[feature_type.value].astype(str).tolist())
if not text_data:
raise ValueError("No text data found in the provided DataFrame.")
# Initialize tokenizer if needed
if self.tokenizer is None:
self.tokenizer = Tokenizer(oov_token="<OOV>")
self.tokenizer.fit_on_texts(text_data)
# Convert to sequences
sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post")
+52
View File
@@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from research.traditional_model import TraditionalModel
class XGBoostModel(TraditionalModel):
"""XGBoost with engineered features and character embeddings"""
def build_model(self) -> BaseEstimator:
params = self.config.model_params
return xgb.XGBClassifier(
n_estimators=params.get("n_estimators", 100),
max_depth=params.get("max_depth", 6),
learning_rate=params.get("learning_rate", 0.1),
subsample=params.get("subsample", 0.8),
colsample_bytree=params.get("colsample_bytree", 0.8),
random_state=self.config.random_seed,
eval_metric="logloss",
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
features = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
column = X[feature_type.value]
if feature_type.value in ["name_length", "word_count"]:
# Numerical features
features.append(column.fillna(0).values.reshape(-1, 1))
elif feature_type.value in ["full_name", "native_name", "surname"]:
# Character-level features for names
vectorizer = CountVectorizer(
analyzer="char", ngram_range=(2, 3), max_features=100
)
char_features = vectorizer.fit_transform(
column.fillna("").astype(str)
).toarray()
features.append(char_features)
else:
# Categorical features
le = LabelEncoder()
encoded = le.fit_transform(column.fillna("unknown").astype(str))
features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
+201
View File
@@ -0,0 +1,201 @@
import logging
from abc import abstractmethod
from typing import Any, Dict, List
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from research.base_model import BaseModel
from research.experiment.feature_extractor import FeatureExtractor
class NeuralNetworkModel(BaseModel):
"""Base class for neural network models (TensorFlow/Keras)"""
@property
def architecture(self) -> str:
return "neural_network"
@abstractmethod
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
"""Build neural network model with known vocabulary size"""
pass
def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
"""Fit the neural network model with deferred building"""
logging.info(f"Training {self.__class__.__name__}")
# Setup feature extraction
if self.feature_extractor is None:
self.feature_extractor = FeatureExtractor(
self.config.features, self.config.feature_params
)
# Extract and prepare features (this will also initialize tokenizer)
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
# Encode labels
if self.label_encoder is None:
self.label_encoder = LabelEncoder()
y_encoded = self.label_encoder.fit_transform(y)
else:
y_encoded = self.label_encoder.transform(y)
# Now we can build the model with known vocab size
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
# Get additional model parameters; max_len is passed explicitly, so drop it
# from the kwargs to avoid a duplicate keyword argument
max_len = self.config.model_params.get("max_len", 6)
extra_params = {k: v for k, v in self.config.model_params.items() if k != "max_len"}
self.model = self.build_model_with_vocab(
vocab_size=vocab_size, max_len=max_len, **extra_params
)
# Train the neural network
history = self.model.fit(
X_prepared,
y_encoded,
epochs=self.config.model_params.get("epochs", 10),
batch_size=self.config.model_params.get("batch_size", 64),
validation_split=0.1,
verbose=1,
)
# Store training history
self.training_history = {
"accuracy": history.history["accuracy"],
"loss": history.history["loss"],
"val_accuracy": history.history.get("val_accuracy", []),
"val_loss": history.history.get("val_loss", []),
}
self.is_fitted = True
return self
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> dict[str, np.floating[Any]]:
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
y_encoded = self.label_encoder.transform(y)
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed)
accuracies = []
precisions = []
recalls = []
f1_scores = []
for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)):
# Create a fresh model for each fold; neural models are built via build_model_with_vocab
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
fold_model = self.build_model_with_vocab(
vocab_size=vocab_size,
max_len=self.config.model_params.get("max_len", 6),
**{k: v for k, v in self.config.model_params.items() if k != "max_len"},
)
# Train on fold
if hasattr(fold_model, "fit"):
fold_model.fit(
X_prepared[train_idx],
y_encoded[train_idx],
epochs=self.config.model_params.get("epochs", 10),
batch_size=self.config.model_params.get("batch_size", 32),
verbose=0,
)
# Predict on validation
y_pred = fold_model.predict(X_prepared[val_idx])
if len(y_pred.shape) > 1:
y_pred = y_pred.argmax(axis=1)
# Calculate metrics
acc = accuracy_score(y_encoded[val_idx], y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
y_encoded[val_idx], y_pred, average="weighted"
)
accuracies.append(acc)
precisions.append(prec)
recalls.append(rec)
f1_scores.append(f1)
return {
"accuracy": np.mean(accuracies),
"accuracy_std": np.std(accuracies),
"precision": np.mean(precisions),
"precision_std": np.std(precisions),
"recall": np.mean(recalls),
"recall_std": np.std(recalls),
"f1": np.mean(f1_scores),
"f1_std": np.std(f1_scores),
}
def generate_learning_curve(
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}")
learning_curve_data = {
"train_sizes": [],
"train_scores": [],
"val_scores": [],
"train_scores_std": [],
"val_scores_std": [],
}
if train_sizes is None:
train_sizes = [0.1, 0.25, 0.5, 0.75, 1.0]
# Prepare features and encode labels once (fit() has set up the extractor,
# tokenizer and label encoder) so the splits below are plain arrays
features_df = self.feature_extractor.extract_features(X)
X_all = self.prepare_features(features_df)
y_all = self.label_encoder.transform(y)
# Split data once for validation
X_train_full, X_val, y_train_full, y_val = train_test_split(
X_all, y_all, test_size=0.2, random_state=self.config.random_seed, stratify=y_all
)
for size in train_sizes:
train_size = int(len(X_train_full) * size)
if train_size < 10: # Minimum training size
continue
# Sample training data
indices = np.random.choice(len(X_train_full), train_size, replace=False)
X_train_subset = X_train_full[indices]
y_train_subset = y_train_full[indices]
# Train multiple models for variance estimation
train_scores = []
val_scores = []
for seed in range(3): # 3 runs for variance
# Build a fresh model; neural models are built via build_model_with_vocab
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
model = self.build_model_with_vocab(
vocab_size=vocab_size,
max_len=self.config.model_params.get("max_len", 6),
**{k: v for k, v in self.config.model_params.items() if k != "max_len"},
)
# Train model
if hasattr(model, "fit"):
history = model.fit(
X_train_subset,
y_train_subset,
epochs=self.config.model_params.get("epochs", 10),
batch_size=self.config.model_params.get("batch_size", 32),
validation_data=(X_val, y_val),
verbose=0,
)
# Evaluate
train_pred = model.predict(X_train_subset)
val_pred = model.predict(X_val)
train_acc = accuracy_score(y_train_subset, train_pred.argmax(axis=1))
val_acc = accuracy_score(y_val, val_pred.argmax(axis=1))
train_scores.append(train_acc)
val_scores.append(val_acc)
learning_curve_data["train_sizes"].append(train_size)
learning_curve_data["train_scores"].append(np.mean(train_scores))
learning_curve_data["val_scores"].append(np.mean(val_scores))
learning_curve_data["train_scores_std"].append(np.std(train_scores))
learning_curve_data["val_scores_std"].append(np.std(val_scores))
self.learning_curve_data = learning_curve_data
return learning_curve_data
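The dict returned above feeds the plot_learning_curve helper used by the trainer; a hedged sketch of plotting it directly, assuming only the keys produced above:

import matplotlib.pyplot as plt
import numpy as np

def plot_curve(data: dict) -> None:
    sizes = np.asarray(data["train_sizes"])
    train = np.asarray(data["train_scores"])
    val = np.asarray(data["val_scores"])
    val_std = np.asarray(data["val_scores_std"])
    plt.plot(sizes, train, "o-", label="train accuracy")
    plt.plot(sizes, val, "o-", label="validation accuracy")
    plt.fill_between(sizes, val - val_std, val + val_std, alpha=0.2)
    plt.xlabel("training examples")
    plt.ylabel("accuracy")
    plt.legend()
    plt.show()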
+134
View File
@@ -0,0 +1,134 @@
import logging
from abc import abstractmethod
from typing import Dict, Any, List
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import LabelEncoder
from research.base_model import BaseModel
from research.experiment.feature_extractor import FeatureExtractor
class TraditionalModel(BaseModel):
"""Base class for traditional ML models (scikit-learn compatible)"""
@property
def architecture(self) -> str:
return "traditional"
@abstractmethod
def build_model(self) -> BaseEstimator:
"""Build and return the sklearn model instance"""
pass
def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
"""Fit the traditional ML model"""
logging.info(f"Training {self.__class__.__name__}")
# Build model if not already built
if self.model is None:
self.model = self.build_model()
# Setup feature extraction
if self.feature_extractor is None:
self.feature_extractor = FeatureExtractor(
self.config.features, self.config.feature_params
)
# Extract and prepare features
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
# Encode labels
if self.label_encoder is None:
self.label_encoder = LabelEncoder()
y_encoded = self.label_encoder.fit_transform(y)
else:
y_encoded = self.label_encoder.transform(y)
# Train model
self.model.fit(X_prepared, y_encoded)
self.is_fitted = True
return self
def cross_validate(self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> Dict[str, float]:
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
y_encoded = self.label_encoder.transform(y)
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed)
# Calculate different metrics
results = {}
# Accuracy
accuracy_scores = cross_val_score(
self.model, X_prepared, y_encoded, cv=cv, scoring="accuracy"
)
results["accuracy"] = accuracy_scores.mean()
results["accuracy_std"] = accuracy_scores.std()
# Precision, Recall, F1
for metric in ["precision", "recall", "f1"]:
if metric in self.config.metrics:
scores = cross_val_score(
self.model, X_prepared, y_encoded, cv=cv, scoring=f"{metric}_weighted"
)
results[metric] = scores.mean()
results[f"{metric}_std"] = scores.std()
return results
def generate_learning_curve(
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}")
if train_sizes is None:
train_sizes = [0.1, 0.25, 0.5, 0.75, 1.0]
# Prepare features
if self.feature_extractor is None:
self.feature_extractor = FeatureExtractor(
self.config.features, self.config.feature_params
)
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
# Encode labels
if self.label_encoder is None:
self.label_encoder = LabelEncoder()
y_encoded = self.label_encoder.fit_transform(y)
else:
y_encoded = self.label_encoder.transform(y)
try:
train_sizes_abs, train_scores, val_scores = learning_curve(
self.build_model(),
X_prepared,
y_encoded,
train_sizes=train_sizes,
cv=3, # Use 3-fold CV for speed
scoring="accuracy",
random_state=self.config.random_seed,
)
learning_curve_data = {
"train_sizes": train_sizes_abs.tolist(),
"train_scores": train_scores.mean(axis=1).tolist(),
"val_scores": val_scores.mean(axis=1).tolist(),
"train_scores_std": train_scores.std(axis=1).tolist(),
"val_scores_std": val_scores.std(axis=1).tolist(),
}
except Exception as e:
logging.warning(f"Could not generate learning curve: {e}")
return {}
self.learning_curve_data = learning_curve_data
return learning_curve_data
+152
View File
@@ -0,0 +1,152 @@
#!.venv/bin/python3
import logging
import argparse
from research.model_trainer import ModelTrainer
def train_baseline_models():
"""
Quick function to train all baseline models and save artifacts.
"""
logger = logging.getLogger(__name__)
logger.info("Training Baseline Models with Artifact Saving")
trainer = ModelTrainer()
# Define baseline model configurations
baseline_configs = [
{
"model_type": "logistic_regression",
"features": ["full_name"],
"model_params": {"ngram_range": [2, 5], "max_features": 10000},
},
{
"model_type": "logistic_regression",
"features": ["native_name"],
"model_params": {"ngram_range": [2, 4], "max_features": 5000},
},
{
"model_type": "logistic_regression",
"features": ["surname"],
"model_params": {"ngram_range": [2, 4], "max_features": 5000},
},
{
"model_type": "random_forest",
"features": ["name_length", "word_count", "province"],
"model_params": {"n_estimators": 100, "max_depth": 10},
},
{
"model_type": "svm",
"features": ["full_name"],
"model_params": {"kernel": "rbf", "C": 1.0},
},
{"model_type": "naive_bayes", "features": ["full_name"], "model_params": {"alpha": 1.0}},
]
# Train all baseline models
experiment_ids = trainer.train_multiple_models("baseline", baseline_configs)
# Show summary
logger.info(f"\n Training Summary:")
for exp_id in experiment_ids:
experiment = trainer.experiment_tracker.get_experiment(exp_id)
if experiment:
acc = experiment.test_metrics.get("accuracy", 0)
logger.info(f" {experiment.config.name}: {acc:.4f} accuracy")
return experiment_ids
def train_neural_networks():
"""
Train neural network models with proper parameters.
"""
logging.info("Training Neural Network Models")
trainer = ModelTrainer()
neural_configs = [
{
"model_type": "lstm",
"features": ["full_name"],
"model_params": {
"embedding_dim": 64,
"lstm_units": 32,
"epochs": 10,
"batch_size": 64,
"max_len": 6,
},
},
{
"model_type": "cnn",
"features": ["full_name"],
"model_params": {
"embedding_dim": 64,
"filters": 64,
"kernel_size": 3,
"epochs": 10,
"batch_size": 64,
"max_len": 20, # Character level
},
},
{
"model_type": "transformer",
"features": ["full_name"],
"model_params": {
"embedding_dim": 64,
"transformer_num_heads": 2,
"epochs": 10,
"batch_size": 64,
"max_len": 6,
},
},
]
experiment_ids = trainer.train_multiple_models("neural_networks", neural_configs)
return experiment_ids
def main():
"""
Main training script with different options.
"""
parser = argparse.ArgumentParser(description="Train DRC Names Models")
parser.add_argument(
"--mode",
choices=["baseline", "neural", "list"],
default="list",
help="Training mode",
)
parser.add_argument("--model-type", type=str, help="Specific model type to train")
parser.add_argument("--name", type=str, help="Model name")
args = parser.parse_args()
trainer = ModelTrainer()
if args.model_type and args.name:
# Train a specific model; checked first because --mode defaults to "list"
trainer.train_single_model(
model_name=args.name, model_type=args.model_type, features=["full_name"]
)
elif args.mode == "baseline":
train_baseline_models()
elif args.mode == "neural":
train_neural_networks()
elif args.mode == "list":
logging.info("📋 Saved Models:")
saved_models = trainer.list_saved_models()
if not saved_models.empty:
logging.info(saved_models.to_string(index=False))
else:
logging.info("No saved models found.")
if __name__ == "__main__":
main()
+76
View File
@@ -0,0 +1,76 @@
import pandas as pd
import streamlit as st
from core.utils import get_data_file_path
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class Dashboard:
def __init__(self, config, experiment_tracker, experiment_runner):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
def index(self):
st.header("Dashboard")
col1, col2, col3, col4 = st.columns(4)
# Load basic statistics
try:
data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
if data_path.exists():
df = load_dataset(str(data_path))
with col1:
st.metric("Total Names", f"{len(df):,}")
with col2:
annotated = int((df["annotated"] == 1).sum()) if "annotated" in df.columns else 0
st.metric("Annotated Names", f"{annotated:,}")
with col3:
provinces = df["province"].nunique() if "province" in df.columns else 0
st.metric("Provinces", provinces)
with col4:
if "sex" in df.columns:
gender_dist = df["sex"].value_counts()
ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
st.metric("F/M Ratio", f"{ratio:.2f}")
else:
st.warning("No processed data found. Please run data processing first.")
except Exception as e:
st.error(f"Error loading dashboard data: {e}")
# Recent experiments
st.subheader("Recent Experiments")
experiments = self.experiment_tracker.list_experiments()[:5]
if experiments:
exp_data = []
for exp in experiments:
exp_data.append(
{
"Name": exp.config.name,
"Model": exp.config.model_type,
"Status": exp.status.value,
"Accuracy": (
f"{exp.test_metrics.get('accuracy', 0):.3f}"
if exp.test_metrics
else "N/A"
),
"Date": exp.start_time.strftime("%Y-%m-%d %H:%M"),
}
)
st.dataframe(pd.DataFrame(exp_data), use_container_width=True)
else:
st.info("No experiments found. Create your first experiment in the Experiments tab!")
+154
View File
@@ -0,0 +1,154 @@
from datetime import datetime
import pandas as pd
import plotly.express as px
import streamlit as st
from core.utils import get_data_file_path
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class DataOverview:
def __init__(self, config):
self.config = config
def index(self):
st.header("Data Overview")
data_files = {
"Names": self.config.data.input_file,
"Featured Dataset": self.config.data.output_files["featured"],
"Evaluation Dataset": self.config.data.output_files["evaluation"],
"Male Names": self.config.data.output_files["males"],
"Female Names": self.config.data.output_files["females"],
}
selected_file = st.selectbox("Select Dataset", list(data_files.keys()))
file_path = get_data_file_path(data_files[selected_file], self.config)
if not file_path.exists():
st.warning(f"Dataset not found: {file_path}")
st.warning("Please run data processing first to generate datasets.")
return
# Load and display data
df = load_dataset(str(file_path))
if df.empty:
st.error("Failed to load dataset")
return
# Basic statistics
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Records", f"{len(df):,}")
with col2:
if "annotated" in df.columns:
annotated_pct = (df["annotated"] == 1).mean() * 100
st.metric("Annotated", f"{annotated_pct:.1f}%")
with col3:
if "words" in df.columns:
avg_words = df["words"].mean()
st.metric("Avg Words", f"{avg_words:.1f}")
with col4:
if "length" in df.columns:
avg_length = df["length"].mean()
st.metric("Avg Length", f"{avg_length:.0f}")
# Data quality analysis
st.subheader("Data Quality Analysis")
col1, col2 = st.columns(2)
with col1:
# Missing values
missing_data = df.isnull().sum()
if missing_data.sum() > 0:
fig = px.bar(
x=missing_data.index, y=missing_data.values, title="Missing Values by Column"
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
else:
st.success("No missing values found")
with col2:
# Gender distribution
if "sex" in df.columns:
gender_counts = df["sex"].value_counts()
fig = px.pie(
values=gender_counts.values,
names=gender_counts.index,
title="Gender Distribution",
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
# Word count distribution
if "words" in df.columns:
st.subheader("Name Structure Analysis")
col1, col2 = st.columns(2)
with col1:
word_dist = df["words"].value_counts().sort_index()
fig = px.bar(
x=word_dist.index,
y=word_dist.values,
title="Distribution of Word Count in Names",
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
with col2:
# Province distribution
if "province" in df.columns:
province_counts = df["province"].value_counts().head(10)
fig = px.bar(
x=province_counts.values,
y=province_counts.index,
orientation="h",
title="Top 10 Provinces by Name Count",
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
# Sample data
st.subheader("Sample Data")
# Display columns selector
if not df.empty:
columns_to_show = st.multiselect(
"Select columns to display",
df.columns.tolist(),
default=(
["name", "sex", "province", "words"]
if all(col in df.columns for col in ["name", "sex", "province", "words"])
else df.columns[:5].tolist()
),
)
if columns_to_show:
sample_size = st.slider("Number of rows to display", 10, min(1000, len(df)), 50)
st.dataframe(df[columns_to_show].head(sample_size), use_container_width=True)
# Data export
st.subheader("Export Data")
if st.button("Download as CSV"):
csv = df.to_csv(index=False)
st.download_button(
label="Download CSV",
data=csv,
file_name=f"{selected_file.lower().replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.csv",
mime="text/csv",
)
+127
View File
@@ -0,0 +1,127 @@
import pandas as pd
import plotly.express as px
import streamlit as st
from web.log_reader import LogReader
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class DataProcessing:
def __init__(self, config, pipeline_monitor):
self.config = config
self.pipeline_monitor = pipeline_monitor
def index(self):
st.header("Data Processing Pipeline")
status = self.pipeline_monitor.get_pipeline_status()
# Overall progress
overall_progress = status["overall_completion"] / 100
st.progress(overall_progress)
st.write(f"Overall Progress: {status['overall_completion']:.1f}%")
# Step details
for step_name, step_status in status["steps"].items():
with st.expander(f"{step_name.replace('_', ' ').title()} - {step_status['status']}"):
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Processed Batches", step_status["processed_batches"])
with col2:
st.metric("Total Batches", step_status["total_batches"])
with col3:
st.metric("Failed Batches", step_status["failed_batches"])
if step_status["completion_percentage"] > 0:
st.progress(step_status["completion_percentage"] / 100)
# Read actual log entries from the log file
st.subheader("Recent Processing Logs")
try:
log_file_path = self.config.paths.logs_dir / "pipeline.development.log"
log_reader = LogReader(log_file_path)
# Options for filtering logs
col1, col2 = st.columns(2)
with col1:
log_level_filter = st.selectbox(
"Filter by Level",
["All", "INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"],
key="log_level_filter"
)
with col2:
num_entries = st.number_input(
"Number of entries",
min_value=5,
max_value=50,
value=10,
key="num_log_entries"
)
# Get log entries based on filter
if log_level_filter == "All":
log_entries = log_reader.read_last_entries(num_entries)
else:
log_entries = log_reader.read_entries_by_level(log_level_filter, num_entries)
if log_entries:
for entry in log_entries:
if entry.level == "ERROR":
st.error(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
elif entry.level == "WARNING":
st.warning(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
elif entry.level == "INFO":
st.info(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
else:
st.text(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
# Show log statistics
st.subheader("Log Statistics")
log_stats = log_reader.get_log_stats()
if log_stats:
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Lines", log_stats.get('total_lines', 0))
with col2:
st.metric("INFO", log_stats.get('INFO', 0))
with col3:
st.metric("WARNING", log_stats.get('WARNING', 0))
with col4:
st.metric("ERROR", log_stats.get('ERROR', 0))
# Log level distribution chart
levels = ['INFO', 'WARNING', 'ERROR', 'DEBUG', 'CRITICAL']
counts = [log_stats.get(level, 0) for level in levels]
if sum(counts) > 0:
fig = px.bar(
x=levels,
y=counts,
title="Log Entries by Level",
color=levels,
color_discrete_map={
'INFO': 'blue',
'WARNING': 'orange',
'ERROR': 'red',
'DEBUG': 'gray',
'CRITICAL': 'darkred'
}
)
st.plotly_chart(fig, use_container_width=True)
else:
st.info("No log entries found or log file is empty.")
except Exception as e:
st.error(f"Error reading log file: {e}")
+185
View File
@@ -0,0 +1,185 @@
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
@dataclass
class LogEntry:
"""Represents a single log entry."""
timestamp: datetime
logger: str
level: str
message: str
raw_line: str
class LogReader:
"""Utility class for reading and parsing log files."""
def __init__(self, log_file_path: Path):
"""Initialize the log reader with a log file path."""
self.log_file_path = Path(log_file_path)
# Pattern to match Python logging format: timestamp - logger - level - message
self.log_pattern = re.compile(
r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (.+?) - (\w+) - (.+)'
)
def read_last_entries(self, count: int = 10) -> List[LogEntry]:
"""Read the last N entries from the log file."""
if not self.log_file_path.exists():
return []
try:
with open(self.log_file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
# Parse log entries from the end
entries = []
for line in reversed(lines[-count*2:]): # Read more lines in case some don't match
entry = self._parse_log_line(line.strip())
if entry:
entries.append(entry)
if len(entries) >= count:
break
# Return entries in chronological order (oldest first of the last N)
return list(reversed(entries))
except Exception as e:
print(f"Error reading log file: {e}")
return []
def read_entries_by_level(self, level: str, count: int = 50) -> List[LogEntry]:
"""Read entries filtered by log level."""
if not self.log_file_path.exists():
return []
try:
with open(self.log_file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
entries = []
for line in reversed(lines):
entry = self._parse_log_line(line.strip())
if entry and entry.level.upper() == level.upper():
entries.append(entry)
if len(entries) >= count:
break
return list(reversed(entries))
except Exception as e:
print(f"Error reading log file: {e}")
return []
def read_entries_since(self, since: datetime, count: int = 100) -> List[LogEntry]:
"""Read entries since a specific datetime."""
if not self.log_file_path.exists():
return []
try:
with open(self.log_file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
entries = []
for line in reversed(lines):
entry = self._parse_log_line(line.strip())
if entry:
if entry.timestamp >= since:
entries.append(entry)
else:
# Stop reading if we've gone past the since time
break
if len(entries) >= count:
break
return list(reversed(entries))
except Exception as e:
print(f"Error reading log file: {e}")
return []
def get_log_stats(self) -> Dict[str, int]:
"""Get statistics about the log file."""
if not self.log_file_path.exists():
return {}
try:
with open(self.log_file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
stats = {
'total_lines': len(lines),
'INFO': 0,
'WARNING': 0,
'ERROR': 0,
'DEBUG': 0,
'CRITICAL': 0
}
for line in lines:
entry = self._parse_log_line(line.strip())
if entry:
level = entry.level.upper()
if level in stats:
stats[level] += 1
return stats
except Exception as e:
print(f"Error reading log file: {e}")
return {}
def _parse_log_line(self, line: str) -> Optional[LogEntry]:
"""Parse a single log line into a LogEntry object."""
if not line:
return None
match = self.log_pattern.match(line)
if not match:
return None
try:
timestamp_str, logger, level, message = match.groups()
timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S,%f')
return LogEntry(
timestamp=timestamp,
logger=logger,
level=level,
message=message,
raw_line=line
)
except ValueError:
return None
class MultiLogReader:
"""Reader for multiple log files."""
def __init__(self, log_directory: Path):
"""Initialize with a directory containing log files."""
self.log_directory = Path(log_directory)
def get_available_log_files(self) -> List[Path]:
"""Get list of available log files."""
if not self.log_directory.exists():
return []
return list(self.log_directory.glob('*.log'))
def read_from_all_files(self, count: int = 10) -> List[LogEntry]:
"""Read entries from all log files and merge them chronologically."""
all_entries = []
for log_file in self.get_available_log_files():
reader = LogReader(log_file)
entries = reader.read_last_entries(count)
all_entries.extend(entries)
# Sort by timestamp
all_entries.sort(key=lambda x: x.timestamp, reverse=True)
return all_entries[:count]
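Usage sketch for the readers above. In the dashboard the path comes from config.paths.logs_dir; the literal directory here is only an assumption:

from pathlib import Path
from web.log_reader import LogReader, MultiLogReader

reader = LogReader(Path("var/logs/pipeline.development.log"))  # assumed location
for entry in reader.read_last_entries(5):
    print(entry.timestamp, entry.level, entry.message)

errors = reader.read_entries_by_level("ERROR", count=20)
stats = reader.get_log_stats()  # {"total_lines": ..., "INFO": ..., ...}

multi = MultiLogReader(Path("var/logs"))
latest = multi.read_from_all_files(count=10)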