refactoring: add initial pipeline configuration and model classes
This commit is contained in:
+3
-2
@@ -4,8 +4,9 @@
|
|||||||
__pycache__/
|
__pycache__/
|
||||||
.ipynb_checkpoints/
|
.ipynb_checkpoints/
|
||||||
*.pyc
|
*.pyc
|
||||||
/models/
|
|
||||||
.env.local
|
.env.local
|
||||||
var/
|
var/
|
||||||
/dataset/
|
/data/dataset/
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
/data/
|
||||||
|
/backups
|
||||||
|
|||||||
@@ -2,24 +2,127 @@
|
|||||||
default: help
|
default: help
|
||||||
|
|
||||||
.PHONY: help
|
.PHONY: help
|
||||||
help:
|
help: ## Show this help message
|
||||||
@echo Tasks:
|
|
||||||
@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
|
@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
|
||||||
|
|
||||||
.PHONY: download
|
# =============================================================================
|
||||||
download:
|
# ENVIRONMENT SETUP
|
||||||
@if [ ! -f dataset/names.csv ]; then \
|
# =============================================================================
|
||||||
set -a; [ -f .env.local ] && . .env.local; set +a; \
|
|
||||||
[ -z "$$DATASET_URL" ] && . .env; \
|
|
||||||
mkdir -p dataset; \
|
|
||||||
curl -L "$${DATASET_URL}" -o dataset/names.csv; \
|
|
||||||
else \
|
|
||||||
echo "dataset/names.csv already exists. Skipping download."; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
.PHONY: clean
|
.PHONY: setup
|
||||||
clean:
|
setup: ## Setup virtual environment and install dependencies
|
||||||
rm -rf ./models
|
python -m venv .venv
|
||||||
rm -rf ./results
|
.venv/bin/pip install --upgrade pip
|
||||||
rm -rf ./dataset/spacy/train.spacy
|
.venv/bin/pip install -r requirements.txt
|
||||||
rm -rf ./dataset/spacy/dev.spacy
|
|
||||||
|
.PHONY: install
|
||||||
|
install: ## Install/update dependencies
|
||||||
|
pip install --upgrade pip
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
.PHONY: install-dev
|
||||||
|
install-dev: ## Install development dependencies
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install jupyter notebook ipykernel pytest black flake8 mypy
|
||||||
|
|
||||||
|
.PHONY: activate
|
||||||
|
activate: ## Show activation command
|
||||||
|
@echo "Run: source .venv/bin/activate"
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# MODEL TRAINING & ARTIFACTS
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
.PHONY: train-baseline
|
||||||
|
train-baseline: ## Train all baseline models and save artifacts
|
||||||
|
python research/train.py --mode baseline
|
||||||
|
|
||||||
|
.PHONY: train-neural
|
||||||
|
train-neural: ## Train neural network models (LSTM, CNN, Transformer)
|
||||||
|
python research/train.py --mode neural
|
||||||
|
|
||||||
|
.PHONY: train-model
|
||||||
|
train-model: ## Train specific model (use: make train-model MODEL=logistic_regression NAME=my_model)
|
||||||
|
python research/train.py --model-type $(MODEL) --name $(NAME)
|
||||||
|
|
||||||
|
.PHONY: list-models
|
||||||
|
list-models: ## List all saved model artifacts
|
||||||
|
python research/train.py --mode list
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# RESEARCH & EXPERIMENTS
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
.PHONY: experiment
|
||||||
|
experiment: ## Create sample experiment configuration
|
||||||
|
python research/cli.py run --name "sample_experiment" --features full_name --model-type logistic_regression
|
||||||
|
|
||||||
|
.PHONY: baseline
|
||||||
|
baseline: ## Run baseline experiments
|
||||||
|
python research/cli.py baseline
|
||||||
|
|
||||||
|
.PHONY: ablation
|
||||||
|
ablation: ## Run feature ablation study
|
||||||
|
python research/cli.py ablation
|
||||||
|
|
||||||
|
.PHONY: components
|
||||||
|
components: ## Run name component analysis
|
||||||
|
python research/cli.py components
|
||||||
|
|
||||||
|
.PHONY: list-experiments
|
||||||
|
list-experiments: ## List all experiments
|
||||||
|
python research/cli.py list
|
||||||
|
|
||||||
|
.PHONY: list-completed
|
||||||
|
list-completed: ## List completed experiments only
|
||||||
|
python research/cli.py list --status completed
|
||||||
|
|
||||||
|
.PHONY: export-results
|
||||||
|
export-results: ## Export all experiment results to CSV
|
||||||
|
python research/cli.py export --output results_$(shell date +%Y%m%d_%H%M%S).csv
|
||||||
|
|
||||||
|
.PHONY: best-model
|
||||||
|
best-model: ## Show best performing model
|
||||||
|
python research/cli.py list --status completed | head -5
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# WEB INTERFACE
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
.PHONY: web
|
||||||
|
web: ## Launch Streamlit web interface
|
||||||
|
streamlit run web/app.py --server.runOnSave true --server.port 8501
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# DEVELOPMENT & CODE QUALITY
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
.PHONY: format
|
||||||
|
format: ## Format code with black
|
||||||
|
black . --line-length 100
|
||||||
|
|
||||||
|
.PHONY: lint
|
||||||
|
lint: ## Lint code with flake8
|
||||||
|
flake8 . --max-line-length=100 --ignore=E203,W503 --exclude=.venv
|
||||||
|
|
||||||
|
.PHONY: type-check
|
||||||
|
type-check: ## Type check with mypy
|
||||||
|
mypy . --ignore-missing-imports
|
||||||
|
|
||||||
|
.PHONY: notebook
|
||||||
|
notebook: ## Start Jupyter notebook
|
||||||
|
jupyter notebook notebooks/
|
||||||
|
|
||||||
|
.PHONY: lab
|
||||||
|
lab: ## Start Jupyter lab
|
||||||
|
jupyter lab notebooks/
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# DEPLOYMENT & PRODUCTION
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
.PHONY: backup
|
||||||
|
backup: ## Backup datasets and results
|
||||||
|
@mkdir -p backups/$(shell date +%Y%m%d_%H%M%S)
|
||||||
|
@cp -r data/ backups/$(shell date +%Y%m%d_%H%M%S)/data/
|
||||||
|
@echo "Backup created in backups/$(shell date +%Y%m%d_%H%M%S)/"
|
||||||
|
|||||||
@@ -1,110 +1,316 @@
|
|||||||
# NERS-NLP: A Culturally-Aware Natural Language Processing System with Named Entity Recognition and Gender Inference Models
|
# DRC Names Gender Prediction Pipeline: A Culturally-Aware NLP System for Congolese Name Analysis
|
||||||
|
|
||||||
Despite the growing success of Named Entity Recognition (NER) systems and gender inference models in Natural Language Processing (NLP), these tools often underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training data. In this paper, we propose NERS-NLP, a culturally-aware NLP system with Named Entity Recognition and Gender Inference Models. This study introduces a large-scale dataset of over 7 million names of the population of the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata, including geographical distribution. We explore the linguistic and sociocultural features embedded in these names and examine their impact on two key NLP tasks, namely, entity recognition and gender classification.
|
A comprehensive, research-friendly pipeline for analyzing Congolese names and predicting gender using culturally-aware machine learning models.
|
||||||
Our approach involves:
|
This system provides advanced data processing, experiment management, and an intuitive web interface for non-technical users.
|
||||||
|
|
||||||
- (1) a statistical and feature analysis of Congolese name structures,
|
## Overview
|
||||||
- (2) the development of supervised gender prediction models leveraging name components and demographic patterns,
|
|
||||||
- (3) the integration of the curated name lexicon into NER pipelines to improve recognition accuracy for Congolese entities.
|
|
||||||
|
|
||||||
|
Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training data.
|
||||||
|
This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 7 million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
|
||||||
|
|
||||||
Experiments conducted on custom evaluation sets, including multilingual and code-switched Congolese texts, show that our culturally-aware methods significantly outperform state-of-the-art multilingual baselines.
|
Our approach involves:
|
||||||
This work demonstrates the importance of culturally grounded resources in reducing bias and improving performance in NLP systems applied to underrepresented regions. Our findings open new directions for inclusive language technologies in African contexts and contribute a valuable resource for future research in regional linguistics, onomastics, and identity-aware artificial intelligence.
|
|
||||||
|
|
||||||
|
- **(1) Advanced data processing pipeline** with batching, checkpointing, and parallel processing
|
||||||
|
- **(2) Modular experiment framework** for systematic model comparison and research iteration
|
||||||
|
- **(3) Multiple feature extraction strategies** leveraging name components, linguistic patterns, and demographic data
|
||||||
|
- **(4) Culturally-aware gender prediction models** trained specifically on Congolese naming patterns
|
||||||
|
- **(5) User-friendly web interface** enabling non-technical users to run experiments and make predictions
|
||||||
|
- **(6) Comprehensive research tools** for reproducible experimentation and result analysis
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
### **Advanced Data Processing**
|
||||||
|
- **Batched processing** with configurable batch sizes and parallel execution
|
||||||
|
- **Automatic checkpointing** and resume capability for large datasets
|
||||||
|
- **LLM-powered annotation** with rate limiting and retry logic
|
||||||
|
- **Memory-efficient** chunked data loading for datasets of any size
|
||||||
|
|
||||||
|
### **Research-Friendly Experiment Framework**
|
||||||
|
- **Modular model architecture** - easily add new models and features
|
||||||
|
- **Systematic experiment tracking** with automatic result storage
|
||||||
|
- **Feature ablation studies** and component analysis tools
|
||||||
|
- **Cross-validation** and statistical significance testing
|
||||||
|
- **Automated baseline comparisons** and performance analysis
|
||||||
|
|
||||||
|
### **Intuitive Web Interface**
|
||||||
|
- **No-code experiment creation** with visual parameter selection
|
||||||
|
- **Real-time monitoring** of data processing and training progress
|
||||||
|
- **Interactive result visualization** with charts and comparisons
|
||||||
|
- **Batch prediction capabilities** for CSV file upload and processing
|
||||||
|
- **Model comparison tools** with automatic performance rankings
|
||||||
|
|
||||||
|
### **Comprehensive Analytics**
|
||||||
|
- **Feature importance analysis** showing which name components matter most
|
||||||
|
- **Province-specific studies** examining regional naming patterns
|
||||||
|
- **Learning curve analysis** for understanding data requirements
|
||||||
|
- **Prediction confidence scoring** and error analysis tools
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### Using Make Commands (Recommended)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Complete setup and basic processing
|
||||||
|
make quick-start
|
||||||
|
|
||||||
|
# Launch web interface
|
||||||
|
make web
|
||||||
|
|
||||||
|
# Run research workflow
|
||||||
|
make research-flow
|
||||||
|
|
||||||
|
# Show all available commands
|
||||||
|
make help
|
||||||
|
```
|
||||||
|
|
||||||
|
### Manual Installation
|
||||||
|
|
||||||
## Installation
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
git clone https://github.com/bernard-ng/drc-ners-nlp.git
|
||||||
cd drc-ners-nlp
|
cd drc-ners-nlp
|
||||||
|
|
||||||
python3 -m venv .venv
|
# Setup environment
|
||||||
source .venv/bin/activate
|
make setup
|
||||||
|
make process
|
||||||
|
|
||||||
pip install -r requirements.txt
|
# Launch web interface
|
||||||
|
make web
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
## Dataset
|
### Web Interface (Recommended for Non-Technical Users)
|
||||||
### Preparation
|
|
||||||
| Name | Description | Default |
|
Launch the Streamlit web application:
|
||||||
|------------------|--------------------------------------------------------------------|---------|
|
```bash
|
||||||
| --split_eval | Split into evaluation and featured datasets | True |
|
make web
|
||||||
| --no-split_eval | Do not split into evaluation and featured datasets | |
|
```
|
||||||
| --split_by_sex | Split by sex into male/female datasets | True |
|
|
||||||
| --no-split_by_sex| Do not split by sex into male/female datasets | |
|
The interface provides:
|
||||||
|
- **Dashboard**: Overview of datasets and recent experiments
|
||||||
|
- **Data Overview**: Interactive data exploration and statistics
|
||||||
|
- **Data Processing**: Monitor and control the processing pipeline
|
||||||
|
- **Experiments**: Create and manage machine learning experiments
|
||||||
|
- **Results & Analysis**: Compare models and analyze performance
|
||||||
|
- **Predictions**: Make predictions on new names or upload CSV files
|
||||||
|
- **Settings**: Configure the system and manage data
|
||||||
|
|
||||||
|
### Research & Experiments
|
||||||
|
|
||||||
|
#### Quick Research Studies
|
||||||
|
```bash
|
||||||
|
# Compare different approaches (full name vs native vs surname)
|
||||||
|
make baseline
|
||||||
|
|
||||||
|
# Analyze which name components are most effective
|
||||||
|
make components
|
||||||
|
|
||||||
|
# Test feature importance through ablation study
|
||||||
|
make ablation
|
||||||
|
|
||||||
|
# View all experiment results
|
||||||
|
make list-experiments
|
||||||
|
|
||||||
|
# Export results for publication
|
||||||
|
make export-results
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Custom Experiments
|
||||||
|
```bash
|
||||||
|
# Run specific experiment via command line
|
||||||
|
python research/cli.py run \
|
||||||
|
--name "native_name_study" \
|
||||||
|
--features native_name \
|
||||||
|
--model-type logistic_regression \
|
||||||
|
--description "Test native name effectiveness"
|
||||||
|
|
||||||
|
# Compare multiple experiments
|
||||||
|
python research/cli.py compare <exp_id_1> <exp_id_2>
|
||||||
|
|
||||||
|
# View detailed results
|
||||||
|
python research/cli.py show <experiment_id>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Data Processing Pipeline
|
||||||
|
|
||||||
|
#### Basic Processing (No LLM)
|
||||||
|
```bash
|
||||||
|
make process-basic # Fast processing without LLM annotation
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Complete Processing (With LLM)
|
||||||
|
```bash
|
||||||
|
make process # Full pipeline including LLM annotation
|
||||||
|
make process-dev # Development mode with smaller batches
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Monitor Progress
|
||||||
|
```bash
|
||||||
|
make monitoring # Show current pipeline status
|
||||||
|
make status # Show overall system status
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Resume Interrupted Processing
|
||||||
|
```bash
|
||||||
|
make process-resume # Resume from last checkpoint
|
||||||
|
```
|
||||||
|
|
||||||
|
### Available Models and Features
|
||||||
|
|
||||||
|
#### Models
|
||||||
|
- **Logistic Regression**: Character n-gram based classification
|
||||||
|
- **Random Forest**: Engineered feature-based classification
|
||||||
|
- **LSTM**: Sequential neural network (planned)
|
||||||
|
- **Transformer**: Attention-based model (planned)
|
||||||
|
|
||||||
|
#### Features
|
||||||
|
- **Full Name**: Complete name as given
|
||||||
|
- **Native Name**: Identified native/given name component
|
||||||
|
- **Surname**: Family name component
|
||||||
|
- **Name Length**: Character count features
|
||||||
|
- **Word Count**: Number of words in name
|
||||||
|
- **Province**: Geographic/demographic features
|
||||||
|
- **Name Beginnings/Endings**: Prefix/suffix patterns
|
||||||
|
- **Character N-grams**: Linguistic pattern features
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Environment Configurations
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m processing.prepare --split_eval --split_by_sex
|
# Switch to development configuration (smaller batches, more logging)
|
||||||
|
make config-dev
|
||||||
|
|
||||||
|
# Switch to production configuration (optimized for performance)
|
||||||
|
make config-prod
|
||||||
|
|
||||||
|
# View current configuration
|
||||||
|
make show-config
|
||||||
```
|
```
|
||||||
|
|
||||||
### Annotation
|
### Custom Configuration
|
||||||
| Name | Description | Default |
|
|
||||||
|-------------|-----------------------------------------------------|----------------|
|
|
||||||
| --llm_model | Ollama model name to use | mistral:7b |
|
|
||||||
|
|
||||||
Example:
|
Edit configuration files in `config/`:
|
||||||
|
- `pipeline.yaml` - Main configuration
|
||||||
|
- `pipeline.development.yaml` - Development overrides
|
||||||
|
- `pipeline.production.yaml` - Production settings
|
||||||
|
|
||||||
|
Example configuration:
|
||||||
|
```yaml
|
||||||
|
processing:
|
||||||
|
batch_size: 1000
|
||||||
|
max_workers: 4
|
||||||
|
|
||||||
|
llm:
|
||||||
|
model_name: "mistral:7b"
|
||||||
|
requests_per_minute: 60
|
||||||
|
|
||||||
|
data:
|
||||||
|
split_evaluation: true
|
||||||
|
split_by_gender: true
|
||||||
|
```
|
||||||
|
|
||||||
|
## Research Capabilities
|
||||||
|
|
||||||
|
### Systematic Experimentation
|
||||||
|
|
||||||
|
The framework supports systematic research through:
|
||||||
|
|
||||||
|
1. **Baseline Studies**: Compare fundamental approaches
|
||||||
|
2. **Feature Studies**: Test individual name components
|
||||||
|
3. **Ablation Studies**: Identify most important features
|
||||||
|
4. **Cross-Province Analysis**: Test generalization across regions
|
||||||
|
5. **Hyperparameter Optimization**: Systematic parameter tuning
|
||||||
|
|
||||||
|
### Reproducible Research
|
||||||
|
|
||||||
|
- **Experiment Tracking**: All experiments automatically logged with full configuration
|
||||||
|
- **Result Export**: CSV export for publication and further analysis
|
||||||
|
- **Statistical Testing**: Cross-validation and confidence intervals
|
||||||
|
- **Version Control**: Configuration-based approach enables easy replication
|
||||||
|
|
||||||
|
### Publication-Ready Output
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m processing.annotate --llm_model=mistral:7b
|
# Generate comprehensive results for publication
|
||||||
|
make research-flow
|
||||||
|
make export-results
|
||||||
|
|
||||||
|
# Get best models for each approach
|
||||||
|
make list-completed
|
||||||
|
python research/cli.py list --status completed | head -10
|
||||||
```
|
```
|
||||||
|
|
||||||
## Experiments
|
## Development
|
||||||
### Training
|
|
||||||
| Name | Description | Default |
|
|
||||||
|----------------|--------------------------------------------------|--------------------|
|
|
||||||
| --dataset | Path to the dataset file | names_featured.csv |
|
|
||||||
| --size | Number of samples to use (None for full dataset) | None |
|
|
||||||
| --threshold | Probability threshold for gender classification | 0.5 |
|
|
||||||
| --cv | Number of cross-validation folds | None |
|
|
||||||
| --save | Whether to save the trained model | False |
|
|
||||||
| --balanced | Whether to balance the dataset | False |
|
|
||||||
| --epochs | Number of training epochs | 10 |
|
|
||||||
| --test_size | Proportion of data to use as test set | 0.2 |
|
|
||||||
| --random_state | Random seed for reproducibility | 42 |
|
|
||||||
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
|
|
||||||
|
### Code Quality and Testing
|
||||||
```bash
|
```bash
|
||||||
python -m pipeline.gender.models.lstm --size 1000000 --save
|
make format # Format code with black
|
||||||
python -m pipeline.gender.models.logreg --size 1000000 --save
|
make lint # Lint with flake8
|
||||||
python -m pipeline.gender.models.transformer --size 1000000 --save
|
make check-deps # Verify dependencies
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Development Workflow
|
||||||
```bash
|
```bash
|
||||||
python -m pipeline.gender.models.lstm --size 1000000 --balanced --save
|
make daily-work # Daily development setup
|
||||||
python -m pipeline.gender.models.logreg --size 1000000 --balanced --save
|
make notebook # Launch Jupyter for analysis
|
||||||
python -m pipeline.gender.models.transformer --size 1000000 --balanced --save
|
make web-dev # Launch web interface with auto-reload
|
||||||
```
|
```
|
||||||
|
|
||||||
### Evaluation
|
### Data Management
|
||||||
| Name | Description | Default |
|
|
||||||
|------------|-----------------------------------------------|----------------------|
|
|
||||||
| --model | Model type: logreg, lstm, or transformer | (required) |
|
|
||||||
| --dataset | Path to the dataset CSV file | names_featured.csv |
|
|
||||||
| --size | Number of rows to load from the dataset | None |
|
|
||||||
| --balanced | Load balanced dataset | False |
|
|
||||||
| --threshold| Probability threshold for classification | 0.5 |
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m pipeline.gender.eval --dataset names_evaluations.csv --model logreg
|
make check-data # Verify all data files
|
||||||
python -m pipeline.gender.eval --dataset names_evaluations.csv --model lstm
|
make data-stats # Show dataset statistics
|
||||||
python -m pipeline.gender.eval --dataset names_evaluations.csv --model transformer
|
make backup-data # Create timestamped backup
|
||||||
|
make clean-checkpoints # Clean processing checkpoints
|
||||||
```
|
```
|
||||||
|
|
||||||
### Inference
|
## Project Structure
|
||||||
| Name | Description | Default |
|
|
||||||
|-------------|------------------------------------------|-----------|
|
|
||||||
| --model | Model type: logreg, lstm, or transformer | (required)|
|
|
||||||
| --names | One or more names | (required)|
|
|
||||||
| --threshold | Threshold for classification | 0.5 |
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python -m pipeline.gender.predict --model logreg --names "Tshisekedi"
|
|
||||||
python -m pipeline.gender.predict --model lstm --names "Ilunga Ngandu"
|
|
||||||
python -m pipeline.gender.predict --model transformer --names "musenga wa musenga"
|
|
||||||
```
|
```
|
||||||
|
├── Makefile # All command shortcuts
|
||||||
|
├── streamlit_app.py # Web interface application
|
||||||
|
├── config/ # Configuration files
|
||||||
|
│ ├── pipeline.yaml # Main configuration
|
||||||
|
│ ├── pipeline.development.yaml # Dev settings
|
||||||
|
│ └── pipeline.production.yaml # Prod settings
|
||||||
|
├── core/ # Core framework
|
||||||
|
│ ├── config.py # Configuration management
|
||||||
|
│ ├── domain.py # Domain-specific data
|
||||||
|
│ └── utils.py # Reusable utilities
|
||||||
|
├── processing/ # Data processing pipeline
|
||||||
|
│ ├── main.py # Main pipeline script
|
||||||
|
│ ├── pipeline.py # Pipeline framework
|
||||||
|
│ ├── steps_config.py # Configurable processing steps
|
||||||
|
│ └── monitor.py # Monitoring utilities
|
||||||
|
├── research/ # Research and experiments
|
||||||
|
│ ├── cli.py # Command-line interface
|
||||||
|
│ ├── experiment.py # Experiment management
|
||||||
|
│ ├── models.py # Model implementations
|
||||||
|
│ └── runner.py # Experiment execution
|
||||||
|
└── dataset/ # Data files
|
||||||
|
└── names.csv # Raw dataset
|
||||||
|
```
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
|
||||||
|
If you use this pipeline in your research, please cite:
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
@software{drc_names_pipeline,
|
||||||
|
title={DRC Names Gender Prediction Pipeline: A Culturally-Aware NLP System},
|
||||||
|
author={Your Name},
|
||||||
|
year={2025},
|
||||||
|
url={https://github.com/bernard-ng/drc-ners-nlp}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
This project is licensed under the MIT License - see the LICENSE file for details.
|
||||||
|
|
||||||
|
## Acknowledgments
|
||||||
|
|
||||||
|
- Democratic Republic of Congo population data contributors
|
||||||
|
- Open source NLP and machine learning communities
|
||||||
|
- Cultural linguistics research communities
|
||||||
|
|||||||
@@ -0,0 +1,383 @@
|
|||||||
|
#!.venv/bin/python3
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
import pandas as pd
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from core.config import get_config, setup_logging
|
||||||
|
from research.experiment import ExperimentConfig
|
||||||
|
from research.experiment.experiment_tracker import ExperimentTracker
|
||||||
|
from research.experiment.feature_extractor import FeatureType
|
||||||
|
from research.experiment.experiment_builder import ExperimentBuilder
|
||||||
|
from research.experiment.experiment_runner import ExperimentRunner
|
||||||
|
from research.model_registry import list_available_models
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_json_arg(raw, warn_msg):
    """Parse a JSON string taken from a CLI argument.

    Returns the decoded value, or None when *raw* is empty/None or is not
    valid JSON. On a decode error the caller-supplied *warn_msg* is logged
    so the run continues with defaults instead of aborting.
    """
    if not raw:
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        logging.warning(warn_msg)
        return None


def create_experiment_from_args(args) -> ExperimentConfig:
    """Create experiment configuration from command line arguments.

    Unknown feature names are skipped with a warning; if none remain,
    FULL_NAME is used as the default feature. Malformed JSON in
    model/feature parameters or the train filter is logged and ignored
    rather than raising.
    """
    # Collect recognized feature types, skipping anything FeatureType rejects.
    features = []
    for feature_name in args.features or []:
        try:
            features.append(FeatureType(feature_name))
        except ValueError:
            logging.warning(f"Unknown feature type '{feature_name}', skipping")

    if not features:
        features = [FeatureType.FULL_NAME]  # Default

    # Optional JSON-valued arguments share one parse-or-warn path.
    parsed_model = _parse_json_arg(
        args.model_params, "Invalid JSON for model parameters, using defaults"
    )
    model_params = parsed_model if parsed_model is not None else {}

    parsed_features = _parse_json_arg(
        args.feature_params, "Invalid JSON for feature parameters, using defaults"
    )
    feature_params = parsed_features if parsed_features is not None else {}

    # Train filter stays None when absent or unparseable.
    train_filter = _parse_json_arg(args.train_filter, "Invalid JSON for train filter, ignoring")

    return ExperimentConfig(
        name=args.name,
        description=args.description or "",
        tags=args.tags or [],
        model_type=args.model_type,
        model_params=model_params,
        features=features,
        feature_params=feature_params,
        train_data_filter=train_filter,
        target_column=args.target,
        test_size=args.test_size,
        random_seed=args.seed,
        cross_validation_folds=args.cv_folds,
        metrics=args.metrics or ["accuracy", "precision", "recall", "f1"],
    )
|
||||||
|
|
||||||
|
|
||||||
|
def run_single_experiment(args):
    """Build one experiment from CLI arguments, run it, and log its metrics."""
    experiment_config = create_experiment_from_args(args)
    experiment_runner = ExperimentRunner()
    exp_id = experiment_runner.run_experiment(experiment_config)

    logging.info(f"Experiment completed: {exp_id}")

    record = experiment_runner.tracker.get_experiment(exp_id)
    if not record:
        return

    # Held-out test-set metrics.
    logging.info("Results:")
    for name, score in record.test_metrics.items():
        logging.info(f" Test {name}: {score:.4f}")

    # Cross-validation metrics, each reported alongside its "_std" companion.
    if record.cv_metrics:
        logging.info("Cross-validation:")
        for name, score in record.cv_metrics.items():
            if name.endswith("_std"):
                continue
            spread = record.cv_metrics.get(f"{name}_std", 0)
            logging.info(f" CV {name}: {score:.4f} ± {spread:.4f}")
|
||||||
|
|
||||||
|
|
||||||
|
def run_baseline_experiments(args):
    """Run the suite of baseline experiments and log a comparison table.

    *args* is accepted for CLI-dispatch uniformity but is not read here.
    """
    # NOTE: the original assigned `logger = logging.getLogger(__name__)` and
    # never used it (all calls go through the logging module directly) —
    # dead local removed.
    builder = ExperimentBuilder()
    experiments = builder.create_baseline_experiments()

    runner = ExperimentRunner()
    experiment_ids = runner.run_experiment_batch(experiments)

    logging.info(f"Completed {len(experiment_ids)} baseline experiments")

    # Show comparison
    if experiment_ids:
        comparison = runner.compare_experiments(experiment_ids)
        logging.info("Baseline Results Comparison:")
        logging.info(
            comparison[["name", "model_type", "features", "test_accuracy"]].to_string(index=False)
        )
|
||||||
|
|
||||||
|
|
||||||
|
def run_ablation_study(args):
    """Run the feature ablation study and log accuracy/F1 per experiment."""
    ablation_experiments = ExperimentBuilder().create_feature_ablation_study()

    runner = ExperimentRunner()
    completed_ids = runner.run_experiment_batch(ablation_experiments)

    logging.info(f"Completed {len(completed_ids)} ablation experiments")

    if not completed_ids:
        return

    # Summarize accuracy and F1 across all ablation runs.
    summary = runner.compare_experiments(completed_ids)
    logging.info("Ablation Study Results:")
    logging.info(summary[["name", "test_accuracy", "test_f1"]].to_string(index=False))
|
||||||
|
|
||||||
|
|
||||||
|
def run_component_study(args):
    """Run the name-component study and log accuracy/precision/recall."""
    component_experiments = ExperimentBuilder().create_name_component_study()

    runner = ExperimentRunner()
    completed_ids = runner.run_experiment_batch(component_experiments)

    logging.info(f"Completed {len(completed_ids)} component study experiments")

    if not completed_ids:
        return

    # Compare the component experiments on the headline test metrics.
    summary = runner.compare_experiments(completed_ids)
    logging.info("Name Component Study Results:")
    logging.info(
        summary[["name", "test_accuracy", "test_precision", "test_recall"]].to_string(
            index=False
        )
    )
|
||||||
|
|
||||||
|
|
||||||
|
def list_experiments(args):
    """List experiments, optionally filtered by status, model type, or tags."""
    tracker = ExperimentTracker()

    # Build keyword filters only from the CLI options actually provided.
    query = {}
    if args.status:
        from research.experiment import ExperimentStatus

        query["status"] = ExperimentStatus(args.status)
    if args.model_type:
        query["model_type"] = args.model_type
    if args.tags:
        query["tags"] = args.tags

    matches = tracker.list_experiments(**query)

    if not matches:
        logging.info("No experiments found matching criteria")
        return

    # Render a compact per-experiment summary table via pandas.
    summary_rows = [
        {
            "ID": exp.experiment_id[:12] + "...",
            "Name": exp.config.name,
            "Model": exp.config.model_type,
            "Status": exp.status.value,
            "Test Acc": f"{exp.test_metrics.get('accuracy', 0):.4f}" if exp.test_metrics else "N/A",
            "Start Time": exp.start_time.strftime("%Y-%m-%d %H:%M"),
        }
        for exp in matches
    ]
    logging.info(pd.DataFrame(summary_rows).to_string(index=False))
|
||||||
|
|
||||||
|
|
||||||
|
def show_experiment_details(args):
    """Show detailed results for an experiment"""

    tracker = ExperimentTracker()
    experiment = tracker.get_experiment(args.experiment_id)

    if not experiment:
        logging.error(f"Experiment not found: {args.experiment_id}")
        return

    # Core metadata common to every experiment
    logging.info("=== Experiment Details ===")
    logging.info(f"ID: {experiment.experiment_id}")
    logging.info(f"Name: {experiment.config.name}")
    logging.info(f"Description: {experiment.config.description}")
    logging.info(f"Model Type: {experiment.config.model_type}")
    logging.info(f"Features: {', '.join([f.value for f in experiment.config.features])}")
    logging.info(f"Status: {experiment.status.value}")
    logging.info(f"Start Time: {experiment.start_time}")
    logging.info(f"End Time: {experiment.end_time}")

    # Held-out test metrics, when the experiment recorded them
    if experiment.test_metrics:
        logging.info("=== Test Metrics ===")
        for metric, value in experiment.test_metrics.items():
            logging.info(f"{metric}: {value:.4f}")

    # cv_metrics stores mean values plus companion "<metric>_std" keys;
    # report each mean together with its standard deviation
    if experiment.cv_metrics:
        logging.info("=== Cross-Validation Metrics ===")
        for metric, value in experiment.cv_metrics.items():
            if not metric.endswith("_std"):
                std_key = f"{metric}_std"
                std_val = experiment.cv_metrics.get(std_key, 0)
                logging.info(f"{metric}: {value:.4f} ± {std_val:.4f}")

    # Top feature importances, highest first
    if experiment.feature_importance:
        logging.info("=== Top 10 Feature Importances ===")
        sorted_features = sorted(
            experiment.feature_importance.items(), key=lambda x: x[1], reverse=True
        )
        for feature, importance in sorted_features[:10]:
            logging.info(f"{feature}: {importance:.4f}")

    # A handful of sample predictions with correctness markers
    if experiment.prediction_examples:
        logging.info("=== Prediction Examples ===")
        for i, example in enumerate(experiment.prediction_examples[:5]):
            correct = "✓" if example["correct"] else "✗"
            logging.info(
                f"{i + 1}. {example['name']} -> True: {example['true_label']}, "
                f"Pred: {example['predicted_label']} {correct}"
            )
def compare_experiments_cmd(args):
    """Compare multiple experiments"""
    comparison = ExperimentRunner().compare_experiments(args.experiment_ids)

    if comparison.empty:
        logging.info("No experiments found for comparison")
        return

    logging.info("=== Experiment Comparison ===")

    # Restrict the table to whichever key columns are actually present
    key_columns = ["name", "model_type", "features", "test_accuracy", "test_f1"]
    present_columns = [column for column in key_columns if column in comparison.columns]

    logging.info(comparison[present_columns].to_string(index=False))
def export_results(args):
    """Export experiment results"""
    # No --output means the tracker picks its own default destination
    destination = Path(args.output) if args.output else None
    output_path = ExperimentTracker().export_results(destination)
    logging.info(f"Results exported to: {output_path}")
def main():
    """Main CLI entry point.

    Builds the argparse CLI, configures logging, dispatches to the selected
    subcommand handler, and returns a process exit code (0 success, 1 failure).
    """
    parser = argparse.ArgumentParser(
        description="DRC Names Research Experiment Manager",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Global options
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose logging")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Single experiment command
    exp_parser = subparsers.add_parser("run", help="Run a single experiment")
    exp_parser.add_argument("--name", required=True, help="Experiment name")
    exp_parser.add_argument("--description", help="Experiment description")
    exp_parser.add_argument(
        "--model-type",
        default="logistic_regression",
        choices=list_available_models(),
        help="Model type",
    )
    exp_parser.add_argument(
        "--features", nargs="+", choices=[f.value for f in FeatureType], help="Features to use"
    )
    # JSON-valued options are parsed downstream by the run handler
    exp_parser.add_argument("--model-params", help="Model parameters as JSON")
    exp_parser.add_argument("--feature-params", help="Feature parameters as JSON")
    exp_parser.add_argument("--train-filter", help="Training data filter as JSON")
    exp_parser.add_argument("--target", default="sex", help="Target column")
    exp_parser.add_argument("--test-size", type=float, default=0.2, help="Test set size")
    exp_parser.add_argument("--seed", type=int, default=42, help="Random seed")
    exp_parser.add_argument("--cv-folds", type=int, default=5, help="CV folds")
    exp_parser.add_argument(
        "--metrics",
        nargs="+",
        choices=["accuracy", "precision", "recall", "f1"],
        help="Metrics to calculate",
    )
    exp_parser.add_argument("--tags", nargs="+", help="Experiment tags")

    # Batch experiment commands (no extra options)
    subparsers.add_parser("baseline", help="Run baseline experiments")
    subparsers.add_parser("ablation", help="Run feature ablation study")
    subparsers.add_parser("components", help="Run name component study")

    # List experiments
    list_parser = subparsers.add_parser("list", help="List experiments")
    list_parser.add_argument("--status", choices=["pending", "running", "completed", "failed"])
    list_parser.add_argument("--model-type", choices=list_available_models())
    list_parser.add_argument("--tags", nargs="+", help="Filter by tags")

    # Show experiment details
    detail_parser = subparsers.add_parser("show", help="Show experiment details")
    detail_parser.add_argument("experiment_id", help="Experiment ID")

    # Compare experiments
    compare_parser = subparsers.add_parser("compare", help="Compare experiments")
    compare_parser.add_argument("experiment_ids", nargs="+", help="Experiment IDs to compare")

    # Export results
    export_parser = subparsers.add_parser("export", help="Export results to CSV")
    export_parser.add_argument("--output", help="Output file path")

    args = parser.parse_args()

    # No subcommand given: show usage and signal failure
    if not args.command:
        parser.print_help()
        return 1

    # Setup logging; --verbose overrides the configured level
    config = get_config()
    if args.verbose:
        config.logging.level = "DEBUG"
    setup_logging(config)

    # Execute command
    try:
        if args.command == "run":
            run_single_experiment(args)
        elif args.command == "baseline":
            run_baseline_experiments(args)
        elif args.command == "ablation":
            run_ablation_study(args)
        elif args.command == "components":
            run_component_study(args)
        elif args.command == "list":
            list_experiments(args)
        elif args.command == "show":
            show_experiment_details(args)
        elif args.command == "compare":
            compare_experiments_cmd(args)
        elif args.command == "export":
            export_results(args)

        return 0

    except Exception as e:
        # Top-level boundary: log the failure and report a nonzero exit code
        logging.error(f"Command failed: {e}")
        if args.verbose:
            import traceback

            traceback.print_exc()
        return 1
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status
    sys.exit(main())
|
||||||
@@ -0,0 +1,46 @@
|
|||||||
|
# Development Environment Configuration
|
||||||
|
# Settings tuned for local development
|
||||||
|
|
||||||
|
name: "drc_names_pipeline"
|
||||||
|
version: "1.0.0"
|
||||||
|
environment: "development"
|
||||||
|
debug: true
|
||||||
|
|
||||||
|
# Processing settings
|
||||||
|
processing:
|
||||||
|
batch_size: 100_000
|
||||||
|
max_workers: 8
|
||||||
|
checkpoint_interval: 10
|
||||||
|
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
|
||||||
|
|
||||||
|
# Pipeline stages
|
||||||
|
stages:
|
||||||
|
- "data_cleaning"
|
||||||
|
- "feature_extraction"
|
||||||
|
#- "llm_annotation"
|
||||||
|
- "data_splitting"
|
||||||
|
|
||||||
|
|
||||||
|
# Development LLM settings
|
||||||
|
llm:
|
||||||
|
model_name: "mistral:7b"
|
||||||
|
requests_per_minute: 120
|
||||||
|
requests_per_second: 3
|
||||||
|
retry_attempts: 3
|
||||||
|
timeout_seconds: 45
|
||||||
|
max_concurrent_requests: 4
|
||||||
|
enable_rate_limiting: true
|
||||||
|
|
||||||
|
# Development data settings
|
||||||
|
data:
|
||||||
|
split_evaluation: true
|
||||||
|
split_by_gender: true
|
||||||
|
evaluation_fraction: 0.2
|
||||||
|
random_seed: 42
|
||||||
|
|
||||||
|
# Enhanced logging for development
|
||||||
|
logging:
|
||||||
|
level: "INFO"
|
||||||
|
console_logging: true
|
||||||
|
file_logging: true
|
||||||
|
log_file: "pipeline.development.log"
|
||||||
@@ -0,0 +1,48 @@
|
|||||||
|
# Production Environment Configuration
|
||||||
|
# Optimized settings for production deployment
|
||||||
|
|
||||||
|
name: "drc_names_pipeline"
|
||||||
|
version: "1.0.0"
|
||||||
|
environment: "production"
|
||||||
|
debug: false
|
||||||
|
|
||||||
|
# Production processing settings (optimized for performance)
|
||||||
|
processing:
|
||||||
|
batch_size: 10_000
|
||||||
|
max_workers: 8
|
||||||
|
checkpoint_interval: 10
|
||||||
|
use_multiprocessing: true # Enable multiprocessing for CPU-bound tasks
|
||||||
|
|
||||||
|
# Pipeline stages
|
||||||
|
stages:
|
||||||
|
- "data_cleaning"
|
||||||
|
- "feature_extraction"
|
||||||
|
- "llm_annotation"
|
||||||
|
- "data_splitting"
|
||||||
|
|
||||||
|
|
||||||
|
# Production LLM settings
|
||||||
|
llm:
|
||||||
|
model_name: "mistral:7b"
|
||||||
|
requests_per_minute: 360
|
||||||
|
requests_per_second: 3
|
||||||
|
retry_attempts: 3
|
||||||
|
timeout_seconds: 45
|
||||||
|
max_concurrent_requests: 4
|
||||||
|
enable_rate_limiting: true
|
||||||
|
|
||||||
|
# Production data settings
|
||||||
|
data:
|
||||||
|
split_evaluation: true
|
||||||
|
split_by_gender: true
|
||||||
|
evaluation_fraction: 0.2
|
||||||
|
random_seed: 42
|
||||||
|
|
||||||
|
# Production logging (less verbose)
|
||||||
|
logging:
|
||||||
|
level: "INFO"
|
||||||
|
console_logging: false # Disable console in production
|
||||||
|
file_logging: true
|
||||||
|
log_file: "pipeline.production.log"
|
||||||
|
max_log_size: 52428800 # 50MB
|
||||||
|
backup_count: 10
|
||||||
@@ -0,0 +1,70 @@
|
|||||||
|
# DRC Names Processing Pipeline Configuration
|
||||||
|
# Main configuration file with default settings
|
||||||
|
|
||||||
|
name: "drc_names_pipeline"
|
||||||
|
version: "1.0.0"
|
||||||
|
description: "DRC Names NLP Processing Pipeline"
|
||||||
|
environment: "development"
|
||||||
|
debug: false
|
||||||
|
|
||||||
|
# Project directory structure
|
||||||
|
paths:
|
||||||
|
root_dir: "."
|
||||||
|
configs_dir: "./config"
|
||||||
|
data_dir: "./data/dataset"
|
||||||
|
models_dir: "./data/models"
|
||||||
|
outputs_dir: "./data/outputs"
|
||||||
|
logs_dir: "./data/logs"
|
||||||
|
checkpoints_dir: "./data/checkpoints"
|
||||||
|
|
||||||
|
# Pipeline stages
|
||||||
|
stages:
|
||||||
|
- "data_cleaning"
|
||||||
|
- "feature_extraction"
|
||||||
|
- "llm_annotation"
|
||||||
|
- "data_splitting"
|
||||||
|
|
||||||
|
# Data processing configuration
|
||||||
|
processing:
|
||||||
|
batch_size: 1_000
|
||||||
|
max_workers: 4
|
||||||
|
checkpoint_interval: 5
|
||||||
|
use_multiprocessing: false
|
||||||
|
encoding_options:
|
||||||
|
- "utf-8"
|
||||||
|
- "utf-16"
|
||||||
|
- "latin1"
|
||||||
|
chunk_size: 100_000
|
||||||
|
|
||||||
|
# LLM annotation settings
|
||||||
|
llm:
|
||||||
|
model_name: "mistral:7b"
|
||||||
|
requests_per_minute: 60
|
||||||
|
requests_per_second: 2
|
||||||
|
retry_attempts: 3
|
||||||
|
timeout_seconds: 600
|
||||||
|
max_concurrent_requests: 2
|
||||||
|
enable_rate_limiting: true
|
||||||
|
|
||||||
|
# Data handling configuration
|
||||||
|
data:
|
||||||
|
input_file: "names.csv"
|
||||||
|
output_files:
|
||||||
|
featured: "names_featured.csv"
|
||||||
|
evaluation: "names_evaluation.csv"
|
||||||
|
males: "names_males.csv"
|
||||||
|
females: "names_females.csv"
|
||||||
|
split_evaluation: true
|
||||||
|
split_by_gender: true
|
||||||
|
evaluation_fraction: 0.2
|
||||||
|
random_seed: 42
|
||||||
|
|
||||||
|
# Logging configuration
|
||||||
|
logging:
|
||||||
|
level: "INFO"
|
||||||
|
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||||
|
file_logging: true
|
||||||
|
console_logging: true
|
||||||
|
log_file: "pipeline.log"
|
||||||
|
max_log_size: 10485760 # 10MB
|
||||||
|
backup_count: 5
|
||||||
@@ -0,0 +1,128 @@
|
|||||||
|
# Research Experiment Configuration Templates
|
||||||
|
# These configurations can be used as starting points for different types of experiments
|
||||||
|
|
||||||
|
# Baseline Experiments Configuration
|
||||||
|
baseline_experiments:
|
||||||
|
- name: "baseline_logistic_regression_fullname"
|
||||||
|
description: "Baseline logistic regression with full name"
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: ["full_name"]
|
||||||
|
model_params:
|
||||||
|
ngram_range: [2, 5]
|
||||||
|
max_features: 10000
|
||||||
|
max_iter: 1000
|
||||||
|
tags: ["baseline", "fullname"]
|
||||||
|
|
||||||
|
- name: "baseline_logistic_regression_native"
|
||||||
|
description: "Logistic regression with native name only"
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: ["native_name"]
|
||||||
|
model_params:
|
||||||
|
ngram_range: [2, 4]
|
||||||
|
max_features: 5000
|
||||||
|
tags: ["baseline", "native"]
|
||||||
|
|
||||||
|
- name: "baseline_rf_engineered"
|
||||||
|
description: "Random Forest with engineered features"
|
||||||
|
model_type: "random_forest"
|
||||||
|
features: ["name_length", "word_count", "province"]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 100
|
||||||
|
max_depth: 10
|
||||||
|
tags: ["baseline", "engineered"]
|
||||||
|
|
||||||
|
# Feature Study Configurations
|
||||||
|
feature_studies:
|
||||||
|
- name: "native_vs_surname"
|
||||||
|
description: "Compare native name vs surname effectiveness"
|
||||||
|
experiments:
|
||||||
|
- model_type: "logistic_regression"
|
||||||
|
features: ["native_name"]
|
||||||
|
tags: ["feature_study", "native"]
|
||||||
|
- model_type: "logistic_regression"
|
||||||
|
features: ["surname"]
|
||||||
|
tags: ["feature_study", "surname"]
|
||||||
|
|
||||||
|
- name: "name_parts_analysis"
|
||||||
|
description: "Analyze effectiveness of different name parts"
|
||||||
|
experiments:
|
||||||
|
- features: ["first_word"]
|
||||||
|
tags: ["name_parts", "first"]
|
||||||
|
- features: ["last_word"]
|
||||||
|
tags: ["name_parts", "last"]
|
||||||
|
- features: ["name_beginnings"]
|
||||||
|
feature_params:
|
||||||
|
beginning_length: 3
|
||||||
|
tags: ["name_parts", "beginnings"]
|
||||||
|
- features: ["name_endings"]
|
||||||
|
feature_params:
|
||||||
|
ending_length: 3
|
||||||
|
tags: ["name_parts", "endings"]
|
||||||
|
|
||||||
|
# Province-Specific Studies
|
||||||
|
province_studies:
|
||||||
|
- name: "kinshasa_study"
|
||||||
|
description: "Gender prediction for Kinshasa province"
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: ["full_name"]
|
||||||
|
train_data_filter:
|
||||||
|
province: "kinshasa"
|
||||||
|
tags: ["province_study", "kinshasa"]
|
||||||
|
|
||||||
|
- name: "cross_province_generalization"
|
||||||
|
description: "Train on one province, test on another"
|
||||||
|
experiments:
|
||||||
|
- train_filter: {"province": "kinshasa"}
|
||||||
|
test_filter: {"province": "bas-congo"}
|
||||||
|
tags: ["generalization", "kinshasa_to_bas-congo"]
|
||||||
|
|
||||||
|
# Model Comparison Studies
|
||||||
|
model_comparisons:
|
||||||
|
- name: "model_comparison_fullname"
|
||||||
|
description: "Compare different models with full name"
|
||||||
|
base_config:
|
||||||
|
features: ["full_name"]
|
||||||
|
tags: ["model_comparison"]
|
||||||
|
models:
|
||||||
|
- model_type: "logistic_regression"
|
||||||
|
model_params:
|
||||||
|
ngram_range: [2, 5]
|
||||||
|
- model_type: "random_forest"
|
||||||
|
# Note: RF will need different feature preparation
|
||||||
|
features: ["name_length", "word_count", "province"]
|
||||||
|
|
||||||
|
# Advanced Feature Combinations
|
||||||
|
advanced_features:
|
||||||
|
- name: "multi_feature_combination"
|
||||||
|
description: "Test various feature combinations"
|
||||||
|
experiments:
|
||||||
|
- features: ["full_name", "name_length"]
|
||||||
|
tags: ["combination", "name_plus_length"]
|
||||||
|
- features: ["native_name", "surname", "province"]
|
||||||
|
tags: ["combination", "semantic_features"]
|
||||||
|
- features: ["name_beginnings", "name_endings", "word_count"]
|
||||||
|
tags: ["combination", "structural_features"]
|
||||||
|
|
||||||
|
# Hyperparameter Studies
|
||||||
|
hyperparameter_studies:
|
||||||
|
- name: "ngram_range_study"
|
||||||
|
description: "Study effect of different n-gram ranges"
|
||||||
|
base_config:
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: ["full_name"]
|
||||||
|
tags: ["hyperparameter", "ngram"]
|
||||||
|
variants:
|
||||||
|
- model_params: {"ngram_range": [1, 3]}
|
||||||
|
- model_params: {"ngram_range": [2, 4]}
|
||||||
|
- model_params: {"ngram_range": [2, 5]}
|
||||||
|
- model_params: {"ngram_range": [3, 6]}
|
||||||
|
|
||||||
|
# Data Size Studies
|
||||||
|
data_studies:
|
||||||
|
- name: "learning_curve_study"
|
||||||
|
description: "Study performance vs training data size"
|
||||||
|
base_config:
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: ["full_name"]
|
||||||
|
tags: ["learning_curve"]
|
||||||
|
data_sizes: [0.1, 0.25, 0.5, 0.75, 1.0] # Fractions of training data to use
|
||||||
@@ -0,0 +1,61 @@
|
|||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from core.config.config_manager import ConfigManager
|
||||||
|
from core.config.logging_config import LoggingConfig
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
config_manager = ConfigManager()
|
||||||
|
|
||||||
|
|
||||||
|
def get_config() -> PipelineConfig:
    """Get the global configuration instance.

    Delegates to the module-level ConfigManager, which lazily loads the
    configuration file on first access and caches it afterwards.
    """
    return config_manager.get_config()
||||||
|
|
||||||
|
|
||||||
|
def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig:
    """Load configuration from specified path"""
    # Without an explicit path, fall back to the cached/global configuration
    if not config_path:
        return config_manager.get_config()
    return config_manager.load_config(Path(config_path))
||||||
|
|
||||||
|
|
||||||
|
def setup_logging(config: PipelineConfig):
    """Setup logging based on configuration"""
    log_settings = config.logging

    # Make sure the log directory exists before any file handler opens it
    log_dir = config.paths.logs_dir
    log_dir.mkdir(parents=True, exist_ok=True)

    # Resolve the configured level name; unknown names fall back to INFO
    level = getattr(logging, log_settings.level.upper(), logging.INFO)
    formatter = logging.Formatter(log_settings.format)

    # Reconfigure the root logger from scratch so repeated calls don't
    # accumulate duplicate handlers
    root = logging.getLogger()
    root.setLevel(level)
    root.handlers.clear()

    if log_settings.console_logging:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        root.addHandler(stream_handler)

    if log_settings.file_logging:
        from logging.handlers import RotatingFileHandler

        rotating_handler = RotatingFileHandler(
            log_dir / log_settings.log_file,
            maxBytes=log_settings.max_log_size,
            backupCount=log_settings.backup_count,
        )
        rotating_handler.setFormatter(formatter)
        root.addHandler(rotating_handler)
||||||
@@ -0,0 +1,145 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Union, Dict, Any
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
from core.config.project_paths import ProjectPaths
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigManager:
    """Centralized configuration management.

    Loads a ``PipelineConfig`` from a YAML or JSON file, caches it, and
    provides helpers to save it, apply in-memory updates, and layer
    environment-specific overrides on top of the base configuration.
    """

    def __init__(self, config_path: Optional[Union[str, Path]] = None):
        # Fall back to searching the standard locations when no explicit
        # path is provided.
        self.config_path = config_path or self._find_config_file()
        self._config: Optional[PipelineConfig] = None
        self._setup_default_paths()

    @classmethod
    def _find_config_file(cls) -> Path:
        """Find configuration file in standard locations."""
        possible_paths = [
            Path.cwd() / "config" / "pipeline.yaml",
            Path.cwd() / "config" / "pipeline.yml",
            Path.cwd() / "pipeline.yaml",
            Path(__file__).parent.parent.parent / "config" / "pipeline.yaml",
        ]

        for path in possible_paths:
            if path.exists():
                return path

        # Return the conventional default path even if it does not exist;
        # load_config() handles the missing-file case with defaults.
        return Path.cwd() / "config" / "pipeline.yaml"

    def _setup_default_paths(self):
        """Setup default project paths relative to the package root."""
        root_dir = Path(__file__).parent.parent.parent
        self.default_paths = ProjectPaths(
            root_dir=root_dir,
            configs_dir=root_dir / "config",
            data_dir=root_dir / "data" / "dataset",
            models_dir=root_dir / "data" / "models",
            outputs_dir=root_dir / "data" / "outputs",
            logs_dir=root_dir / "data" / "logs",
            checkpoints_dir=root_dir / "data" / "checkpoints",
        )

    def load_config(self, config_path: Optional[Path] = None) -> PipelineConfig:
        """Load configuration from file.

        Missing or unparseable files degrade to the default configuration
        rather than aborting the pipeline.
        """
        if config_path:
            self.config_path = config_path

        if not self.config_path.exists():
            logging.warning(f"Config file not found: {self.config_path}. Using defaults.")
            return self._create_default_config()

        try:
            with open(self.config_path, "r") as f:
                # File extension decides the parser
                if self.config_path.suffix.lower() in [".yaml", ".yml"]:
                    config_data = yaml.safe_load(f)
                else:
                    config_data = json.load(f)

            # Ensure paths are properly set when the file omits them.
            # model_dump() is the pydantic v2 API (dict() is deprecated),
            # matching the usage elsewhere in this class.
            if "paths" not in config_data:
                config_data["paths"] = self.default_paths.model_dump()

            self._config = PipelineConfig(**config_data)
            return self._config

        except Exception as e:
            # Deliberate best-effort: log and fall back to defaults
            logging.error(f"Failed to load config from {self.config_path}: {e}")
            return self._create_default_config()

    def _create_default_config(self) -> PipelineConfig:
        """Create default configuration"""
        return PipelineConfig(paths=self.default_paths)

    def save_config(self, config: PipelineConfig, path: Optional[Path] = None):
        """Save configuration to file (YAML or JSON by extension)."""
        save_path = path or self.config_path
        save_path.parent.mkdir(parents=True, exist_ok=True)

        config_dict = config.model_dump()

        # Convert Path objects to strings for serialization
        if "paths" in config_dict:
            for key, value in config_dict["paths"].items():
                if isinstance(value, Path):
                    config_dict["paths"][key] = str(value)

        try:
            with open(save_path, "w") as f:
                if save_path.suffix.lower() in [".yaml", ".yml"]:
                    yaml.dump(config_dict, f, default_flow_style=False, indent=2)
                else:
                    json.dump(config_dict, f, indent=2)

            logging.info(f"Configuration saved to {save_path}")

        except Exception as e:
            logging.error(f"Failed to save config to {save_path}: {e}")

    def get_config(self) -> PipelineConfig:
        """Get current configuration, loading if necessary"""
        if self._config is None:
            self._config = self.load_config()
        return self._config

    def update_config(self, updates: Dict[str, Any]):
        """Update configuration with new values (deep merge)."""
        config = self.get_config()

        # Deep update a plain-dict copy, then re-validate through pydantic
        config_dict = config.model_dump()
        self._deep_update(config_dict, updates)

        self._config = PipelineConfig(**config_dict)

    def _deep_update(self, base_dict: Dict, update_dict: Dict):
        """Recursively update nested dictionaries in place."""
        for key, value in update_dict.items():
            if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
                self._deep_update(base_dict[key], value)
            else:
                base_dict[key] = value

    def get_environment_config(self, env: str) -> PipelineConfig:
        """Load environment-specific configuration.

        Looks for ``pipeline.<env>.yaml`` next to the main config file and
        deep-merges it over the base configuration.

        NOTE(review): loading the env file goes through load_config(), which
        repoints self.config_path/_config at the env file as a side effect —
        confirm callers do not rely on the base config staying cached.
        """
        env_config_path = self.config_path.parent / f"pipeline.{env}.yaml"

        if env_config_path.exists():
            base_config = self.load_config()
            env_config = self.load_config(env_config_path)

            # Merge configurations; model_dump() replaces the deprecated
            # pydantic v1 .dict() used previously, for consistency with the
            # rest of this class.
            base_dict = base_config.model_dump()
            env_dict = env_config.model_dump()
            self._deep_update(base_dict, env_dict)

            return PipelineConfig(**base_dict)

        return self.get_config()
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
from dataclasses import field
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class DataConfig(BaseModel):
    """Data handling configuration"""

    # Input CSV filename (resolved against the configured data directory)
    input_file: str = "names.csv"
    # Stage output filenames keyed by role. A plain dict default is the
    # idiomatic pydantic form: pydantic deep-copies mutable defaults per
    # instance, so dataclasses.field(default_factory=...) — which pydantic
    # BaseModel does not interpret — is unnecessary here.
    output_files: Dict[str, str] = {
        "featured": "names_featured.csv",
        "evaluation": "names_evaluation.csv",
        "males": "names_males.csv",
        "females": "names_females.csv",
    }
    # Whether to hold out an evaluation split
    split_evaluation: bool = True
    # Whether to additionally split outputs by gender
    split_by_gender: bool = True
    # Fraction of rows reserved for evaluation
    evaluation_fraction: float = 0.2
    # Seed for reproducible splits
    random_seed: int = 42
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class LLMConfig(BaseModel):
    """LLM annotation configuration"""

    # Model identifier used for annotation requests (e.g. "mistral:7b")
    model_name: str = "mistral:7b"
    # Rate limits applied when enable_rate_limiting is True
    requests_per_minute: int = 60
    requests_per_second: int = 2
    retry_attempts: int = 3  # retries per failed request
    timeout_seconds: int = 30  # per-request timeout
    max_concurrent_requests: int = 2  # concurrency cap for in-flight requests
    # Off by default; config files (e.g. pipeline.yaml) turn this on
    enable_rate_limiting: bool = False
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class LoggingConfig(BaseModel):
    """Logging configuration"""

    # Level name resolved via the logging module (e.g. "INFO", "DEBUG")
    level: str = "INFO"
    # Format string passed to logging.Formatter
    format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    file_logging: bool = True  # attach a rotating file handler
    console_logging: bool = True  # attach a console stream handler
    log_file: str = "pipeline.log"  # filename inside the configured logs dir
    max_log_size: int = 10 * 1024 * 1024  # 10MB rotation threshold
    backup_count: int = 5  # rotated log files to keep
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from core.config.logging_config import LoggingConfig
|
||||||
|
from core.config.data_config import DataConfig
|
||||||
|
from core.config.llm_config import LLMConfig
|
||||||
|
from core.config.processing_config import ProcessingConfig
|
||||||
|
from core.config.project_paths import ProjectPaths
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineConfig(BaseModel):
    """Main pipeline configuration"""

    name: str = "drc_names_pipeline"
    version: str = "1.0.0"
    description: str = "DRC Names NLP Processing Pipeline"

    # Required: the directory layout has no safe universal default
    paths: ProjectPaths
    # Ordered names of pipeline stages to execute
    stages: list[str] = []
    # Sub-configurations; pydantic copies these defaults per instance
    processing: ProcessingConfig = ProcessingConfig()
    llm: LLMConfig = LLMConfig()
    data: DataConfig = DataConfig()
    logging: LoggingConfig = LoggingConfig()

    # Environment-specific settings
    environment: str = "development"
    debug: bool = True

    # NOTE(review): pydantic v1-style inner Config; v2 prefers
    # model_config = ConfigDict(arbitrary_types_allowed=True)
    class Config:
        arbitrary_types_allowed = True
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
from dataclasses import field
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class ProcessingConfig(BaseModel):
    """Data processing pipeline configuration"""

    batch_size: int = 1000  # rows processed per batch
    max_workers: int = 4  # worker count for parallel stages
    checkpoint_interval: int = 5  # batches between checkpoints
    use_multiprocessing: bool = False  # process pool instead of threads
    # Encodings tried in order when reading input files. A plain list
    # default is the idiomatic pydantic form: pydantic deep-copies mutable
    # defaults per instance, so dataclasses.field(default_factory=...) —
    # which pydantic BaseModel does not interpret — is unnecessary here.
    encoding_options: list = ["utf-8", "utf-16", "latin1"]
    chunk_size: int = 100_000  # rows per streamed read chunk
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pydantic import BaseModel, field_validator
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectPaths(BaseModel):
    """Project directory structure configuration"""

    root_dir: Path
    data_dir: Path
    models_dir: Path
    outputs_dir: Path
    logs_dir: Path
    configs_dir: Path
    checkpoints_dir: Path

    class Config:
        arbitrary_types_allowed = True

    # Decorator order fixed: pydantic v2 requires @field_validator to be the
    # outermost decorator (with @classmethod beneath it). The previous order
    # (@classmethod on top) wraps the validator proxy and prevents pydantic
    # from registering it.
    @field_validator("*", mode="before")
    @classmethod
    def convert_to_path(cls, v):
        """Coerce incoming values (e.g. strings from YAML) to Path objects."""
        return Path(v) if not isinstance(v, Path) else v
||||||
@@ -0,0 +1,57 @@
|
|||||||
|
import logging
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from core.config import get_config, PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def temporary_config_override(**overrides):
    """Temporarily patch attributes on the global config object.

    Only keys that already exist as attributes on the config are applied;
    unknown keys are silently ignored. Original values are restored when
    the context exits, even if the body raises.
    """
    config = get_config()
    # Keep only overrides that correspond to real config attributes.
    applicable = {k: v for k, v in overrides.items() if hasattr(config, k)}
    saved = {k: getattr(config, k) for k in applicable}

    for key, value in applicable.items():
        setattr(config, key, value)

    try:
        yield config
    finally:
        # Restore the pre-override values unconditionally.
        for key, value in saved.items():
            setattr(config, key, value)
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_directories(config: PipelineConfig) -> None:
    """Ensure all required directories exist"""
    paths = config.paths
    for directory in (
        paths.data_dir,
        paths.models_dir,
        paths.outputs_dir,
        paths.logs_dir,
        paths.configs_dir,
        paths.checkpoints_dir,
    ):
        # mkdir is idempotent with exist_ok; parents are created as needed.
        Path(directory).mkdir(parents=True, exist_ok=True)

    logging.info("Ensured all required directories exist")
|
||||||
|
|
||||||
|
|
||||||
|
def get_data_file_path(filename: str, config: PipelineConfig) -> Path:
    """Get full path for a data file"""
    data_dir = config.paths.data_dir
    return data_dir / filename
|
||||||
|
|
||||||
|
|
||||||
|
def get_model_file_path(filename: str, config: PipelineConfig) -> Path:
    """Get full path for a model file"""
    models_dir = config.paths.models_dir
    return models_dir / filename
|
||||||
|
|
||||||
|
|
||||||
|
def get_output_file_path(filename: str, config: PipelineConfig) -> Path:
    """Get full path for an output file"""
    outputs_dir = config.paths.outputs_dir
    return outputs_dir / filename
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Union, Iterator
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
|
class DataLoader:
    """Reusable data loading utilities"""

    def __init__(self, config: "PipelineConfig"):
        # Annotation is a string to avoid a hard import-time dependency on
        # the config module.
        self.config = config

    def load_csv_chunked(
        self, filepath: Union[str, Path], chunk_size: Optional[int] = None
    ) -> Iterator[pd.DataFrame]:
        """Load CSV file in chunks for memory efficiency.

        Tries each configured encoding in order. BUGFIX: the original
        implementation fell back to the next encoding even after chunks
        had already been yielded, which re-yielded the same rows from the
        start (duplicates downstream). Once any chunk has reached the
        caller, a mid-stream failure is now re-raised instead.

        Args:
            filepath: CSV file to read.
            chunk_size: Rows per chunk; defaults to config.processing.chunk_size.

        Yields:
            pd.DataFrame chunks of at most ``chunk_size`` rows.

        Raises:
            ValueError: if no configured encoding can decode the file.
        """
        chunk_size = chunk_size or self.config.processing.chunk_size
        encodings = self.config.processing.encoding_options

        filepath = Path(filepath)

        for encoding in encodings:
            yielded_any = False
            try:
                logging.info(f"Attempting to read {filepath} with encoding: {encoding}")

                chunk_iter = pd.read_csv(
                    filepath, encoding=encoding, chunksize=chunk_size, on_bad_lines="skip"
                )

                for i, chunk in enumerate(chunk_iter):
                    logging.debug(f"Processing chunk {i+1}")
                    yielded_any = True
                    yield chunk

                logging.info(f"Successfully read {filepath} with encoding: {encoding}")
                return

            except Exception as e:
                if yielded_any:
                    # Data already reached the caller; retrying another
                    # encoding would duplicate it. Surface the error.
                    raise
                logging.warning(f"Failed with encoding {encoding}: {e}")
                continue

        raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")

    def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
        """Load complete CSV file into memory.

        Returns an empty DataFrame if the file yields no chunks.
        """
        chunks = list(self.load_csv_chunked(filepath))
        return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

    @classmethod
    def save_csv(
        cls, df: pd.DataFrame, filepath: Union[str, Path], create_dirs: bool = True
    ) -> None:
        """Save DataFrame to CSV with proper handling.

        Args:
            df: Frame to write (index is not written).
            filepath: Destination path.
            create_dirs: Create missing parent directories when True.
        """
        filepath = Path(filepath)

        if create_dirs:
            filepath.parent.mkdir(parents=True, exist_ok=True)

        df.to_csv(filepath, index=False, encoding="utf-8")
        logging.info(f"Saved {len(df)} rows to {filepath}")
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
|
class PromptManager:
    """Manage prompts for LLM operations"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.prompts_dir = self.config.paths.configs_dir / "prompts"

    def load_prompt(self, prompt_name: str = "default") -> str:
        """Load a prompt template.

        Looks for <configs_dir>/prompts/<name>.txt first, then falls back
        to a legacy prompt.txt at the project root.

        Raises:
            FileNotFoundError: if neither location has the prompt.
        """
        candidate = self.prompts_dir / f"{prompt_name}.txt"

        if not candidate.exists():
            # Fallback to root directory
            legacy = self.config.paths.root_dir / "prompt.txt"
            if not legacy.exists():
                raise FileNotFoundError(f"Prompt file not found: {candidate}")
            candidate = legacy

        return candidate.read_text(encoding="utf-8").strip()
|
||||||
@@ -0,0 +1,56 @@
|
|||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from queue import Queue
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RateLimitConfig:
    """Configuration for rate limiting LLM requests"""

    # Maximum requests allowed within any rolling 60-second window.
    requests_per_minute: int = 60
    # Maximum sustained request rate; enforced as a minimum interval
    # between consecutive requests.
    requests_per_second: int = 2
    # NOTE(review): not consulted by RateLimiter.wait_if_needed in this
    # file — presumably reserved for future burst handling; confirm.
    burst_limit: int = 5
|
||||||
|
|
||||||
|
|
||||||
|
class RateLimiter:
    """Thread-safe rate limiter for LLM requests"""

    def __init__(self, config: RateLimitConfig):
        self.config = config
        # Timestamps of requests made in the last minute, oldest first.
        self.request_times = Queue()
        # Serializes all accounting (and the sleeps) across threads.
        self.lock = threading.Lock()
        # time.time() of the most recent request; 0 means "no request yet".
        self.last_request_time = 0

    def wait_if_needed(self):
        """Wait if necessary to respect rate limits"""
        # NOTE: sleeping while holding the lock means concurrent callers are
        # queued behind the sleeper — this serializes request pacing globally.
        with self.lock:
            current_time = time.time()

            # Check requests per second limit
            time_since_last = current_time - self.last_request_time
            min_interval = 1.0 / self.config.requests_per_second

            if time_since_last < min_interval:
                sleep_time = min_interval - time_since_last
                time.sleep(sleep_time)
                current_time = time.time()

            # Clean old request times (older than 1 minute)
            # Peeks at the Queue's backing deque (.queue[0]) to see the
            # oldest timestamp without removing it.
            while not self.request_times.empty():
                if current_time - self.request_times.queue[0] > 60:
                    self.request_times.get()
                else:
                    break

            # Check requests per minute limit
            if self.request_times.qsize() >= self.config.requests_per_minute:
                oldest_request = self.request_times.queue[0]
                # Sleep until the oldest tracked request ages out of the
                # 60-second window.
                wait_time = 60 - (current_time - oldest_request)
                if wait_time > 0:
                    time.sleep(wait_time)
                    current_time = time.time()

            # Record this request
            self.request_times.put(current_time)
            self.last_request_time = current_time
|
||||||
@@ -1,23 +1,44 @@
|
|||||||
import csv
|
from typing import Optional, Dict, Tuple
|
||||||
import io
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import pickle
|
|
||||||
from typing import List, Dict
|
|
||||||
|
|
||||||
# Paths
|
import pandas as pd
|
||||||
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
||||||
DATA_DIR = os.path.join(ROOT_DIR, 'dataset')
|
|
||||||
|
|
||||||
MODELS_DIR = os.path.join(ROOT_DIR, 'models')
|
|
||||||
GENDER_MODELS_DIR = os.path.join(MODELS_DIR, 'gender')
|
|
||||||
GENDER_RESULT_DIR = os.path.join(ROOT_DIR, 'gender', 'results')
|
|
||||||
|
|
||||||
NER_MODELS_DIR = os.path.join(MODELS_DIR, 'ner')
|
class RegionMapper:
    """Reusable region mapping utilities"""

    def __init__(self, mapping: Optional[Dict] = None):
        # Fall back to the module-level REGION_MAPPING when no explicit
        # mapping is supplied.
        self.mapping = mapping or REGION_MAPPING

    def map_region_to_province(self, region: str) -> str:
        """Map a region to its province"""
        key = str(region).lower().strip()
        _, province = self.mapping.get(key, ("AUTRES", "AUTRES"))
        return province.lower()

    def map_regions_vectorized(self, regions: pd.Series) -> pd.Series:
        """Vectorized region to province mapping"""
        def _to_province(key):
            return self.mapping.get(key, ("AUTRES", "AUTRES"))[1].lower()

        return regions.str.lower().map(_to_province)

    @staticmethod
    def get_provinces():
        return [
            "kinshasa",
            "bas-congo",
            "bandundu",
            "katanga",
            "equateur",
            "province-orientale",
            "maniema",
            "nord-kivu",
            "sud-kivu",
            "kasai-occidental",
            "kasai-oriental",
        ]
|
||||||
|
|
||||||
|
|
||||||
|
# DRC Region to Province Mapping
|
||||||
|
REGION_MAPPING: Dict[str, Tuple[str, str]] = {
|
||||||
# Kinshasa
|
# Kinshasa
|
||||||
"kinshasa": ("KINSHASA", "KINSHASA"),
|
"kinshasa": ("KINSHASA", "KINSHASA"),
|
||||||
"kinshasa-centre": ("KINSHASA", "KINSHASA"),
|
"kinshasa-centre": ("KINSHASA", "KINSHASA"),
|
||||||
@@ -28,7 +49,6 @@ REGION_MAPPING = {
|
|||||||
"kinshasa-ouest": ("KINSHASA", "KINSHASA"),
|
"kinshasa-ouest": ("KINSHASA", "KINSHASA"),
|
||||||
"kinshasa-plateau": ("KINSHASA", "KINSHASA"),
|
"kinshasa-plateau": ("KINSHASA", "KINSHASA"),
|
||||||
"kinshasa-tshangu": ("KINSHASA", "KINSHASA"),
|
"kinshasa-tshangu": ("KINSHASA", "KINSHASA"),
|
||||||
|
|
||||||
# Bas-Congo → Kongo-Central → BAS-CONGO
|
# Bas-Congo → Kongo-Central → BAS-CONGO
|
||||||
"bas-congo": ("KONGO-CENTRAL", "BAS-CONGO"),
|
"bas-congo": ("KONGO-CENTRAL", "BAS-CONGO"),
|
||||||
"bas-congo-1": ("KONGO-CENTRAL", "BAS-CONGO"),
|
"bas-congo-1": ("KONGO-CENTRAL", "BAS-CONGO"),
|
||||||
@@ -37,7 +57,6 @@ REGION_MAPPING = {
|
|||||||
"kongo-central-1": ("KONGO-CENTRAL", "BAS-CONGO"),
|
"kongo-central-1": ("KONGO-CENTRAL", "BAS-CONGO"),
|
||||||
"kongo-central-2": ("KONGO-CENTRAL", "BAS-CONGO"),
|
"kongo-central-2": ("KONGO-CENTRAL", "BAS-CONGO"),
|
||||||
"kongo-central-3": ("KONGO-CENTRAL", "BAS-CONGO"),
|
"kongo-central-3": ("KONGO-CENTRAL", "BAS-CONGO"),
|
||||||
|
|
||||||
# Kwilu, Kwango, Mai-Ndombe → BANDUNDU
|
# Kwilu, Kwango, Mai-Ndombe → BANDUNDU
|
||||||
"bandundu": ("BANDUNDU", "BANDUNDU"),
|
"bandundu": ("BANDUNDU", "BANDUNDU"),
|
||||||
"bandundu-1": ("BANDUNDU", "BANDUNDU"),
|
"bandundu-1": ("BANDUNDU", "BANDUNDU"),
|
||||||
@@ -54,7 +73,6 @@ REGION_MAPPING = {
|
|||||||
"mai-ndombe-1": ("MAI-NDOMBE", "BANDUNDU"),
|
"mai-ndombe-1": ("MAI-NDOMBE", "BANDUNDU"),
|
||||||
"mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
|
"mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
|
||||||
"mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
|
"mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
|
||||||
|
|
||||||
# Katanga → HAUT-KATANGA, HAUT-LOMAMI, LUALABA, TANGANYIKA
|
# Katanga → HAUT-KATANGA, HAUT-LOMAMI, LUALABA, TANGANYIKA
|
||||||
"haut-katanga": ("HAUT-KATANGA", "KATANGA"),
|
"haut-katanga": ("HAUT-KATANGA", "KATANGA"),
|
||||||
"haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
|
"haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
|
||||||
@@ -69,7 +87,6 @@ REGION_MAPPING = {
|
|||||||
"tanganyika": ("TANGANYIKA", "KATANGA"),
|
"tanganyika": ("TANGANYIKA", "KATANGA"),
|
||||||
"tanganyika-1": ("TANGANYIKA", "KATANGA"),
|
"tanganyika-1": ("TANGANYIKA", "KATANGA"),
|
||||||
"tanganyika-2": ("TANGANYIKA", "KATANGA"),
|
"tanganyika-2": ("TANGANYIKA", "KATANGA"),
|
||||||
|
|
||||||
# Equateur → MONGALA, NORD-UBANGI, SUD-UBANGI, TSHUAPA
|
# Equateur → MONGALA, NORD-UBANGI, SUD-UBANGI, TSHUAPA
|
||||||
"equateur": ("EQUATEUR", "EQUATEUR"),
|
"equateur": ("EQUATEUR", "EQUATEUR"),
|
||||||
"equateur-1": ("EQUATEUR", "EQUATEUR"),
|
"equateur-1": ("EQUATEUR", "EQUATEUR"),
|
||||||
@@ -89,7 +106,6 @@ REGION_MAPPING = {
|
|||||||
"tshuapa": ("TSHUAPA", "EQUATEUR"),
|
"tshuapa": ("TSHUAPA", "EQUATEUR"),
|
||||||
"tshuapa-1": ("TSHUAPA", "EQUATEUR"),
|
"tshuapa-1": ("TSHUAPA", "EQUATEUR"),
|
||||||
"tshuapa-2": ("TSHUAPA", "EQUATEUR"),
|
"tshuapa-2": ("TSHUAPA", "EQUATEUR"),
|
||||||
|
|
||||||
# Province-Orientale
|
# Province-Orientale
|
||||||
"province-orientale": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
|
"province-orientale": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
|
||||||
"province-orientale-1": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
|
"province-orientale-1": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
|
||||||
@@ -100,128 +116,47 @@ REGION_MAPPING = {
|
|||||||
"haut-uele-1": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
|
"haut-uele-1": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
|
||||||
"haut-uele-2": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
|
"haut-uele-2": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
|
||||||
"bas-uele": ("BAS-UELE", "PROVINCE-ORIENTALE"),
|
"bas-uele": ("BAS-UELE", "PROVINCE-ORIENTALE"),
|
||||||
|
"bas-uele-1": ("BAS-UELE", "PROVINCE-ORIENTALE"),
|
||||||
|
"bas-uele-2": ("BAS-UELE", "PROVINCE-ORIENTALE"),
|
||||||
"ituri": ("ITURI", "PROVINCE-ORIENTALE"),
|
"ituri": ("ITURI", "PROVINCE-ORIENTALE"),
|
||||||
"ituri-1": ("ITURI", "PROVINCE-ORIENTALE"),
|
"ituri-1": ("ITURI", "PROVINCE-ORIENTALE"),
|
||||||
"ituri-2": ("ITURI", "PROVINCE-ORIENTALE"),
|
"ituri-2": ("ITURI", "PROVINCE-ORIENTALE"),
|
||||||
"ituri-3": ("ITURI", "PROVINCE-ORIENTALE"),
|
|
||||||
"tshopo": ("TSHOPO", "PROVINCE-ORIENTALE"),
|
"tshopo": ("TSHOPO", "PROVINCE-ORIENTALE"),
|
||||||
"tshopo-1": ("TSHOPO", "PROVINCE-ORIENTALE"),
|
"tshopo-1": ("TSHOPO", "PROVINCE-ORIENTALE"),
|
||||||
"tshopo-2": ("TSHOPO", "PROVINCE-ORIENTALE"),
|
"tshopo-2": ("TSHOPO", "PROVINCE-ORIENTALE"),
|
||||||
|
# Maniema
|
||||||
# Kasaï
|
"maniema": ("MANIEMA", "MANIEMA"),
|
||||||
"kasai-1": ("KASAÏ", "KASAÏ-OCCIDENTAL"),
|
"maniema-1": ("MANIEMA", "MANIEMA"),
|
||||||
"kasai-2": ("KASAÏ", "KASAÏ-OCCIDENTAL"),
|
"maniema-2": ("MANIEMA", "MANIEMA"),
|
||||||
"kasai-ce": ("KASAÏ", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-central": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-central-1": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-central-2": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-occidental": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-occidental-1": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-occidental-2": ("KASAÏ-CENTRAL", "KASAÏ-OCCIDENTAL"),
|
|
||||||
"kasai-oriental": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
|
|
||||||
"kasai-oriental-1": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
|
|
||||||
"kasai-oriental-2": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
|
|
||||||
"kasai-oriental-3": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
|
|
||||||
"kasai-orientale": ("KASAÏ-ORIENTAL", "KASAÏ-ORIENTAL"),
|
|
||||||
"lomami": ("LOMAMI", "KASAÏ-ORIENTAL"),
|
|
||||||
"lomami-1": ("LOMAMI", "KASAÏ-ORIENTAL"),
|
|
||||||
"lomami-2": ("LOMAMI", "KASAÏ-ORIENTAL"),
|
|
||||||
"sankuru": ("SANKURU", "KASAÏ-ORIENTAL"),
|
|
||||||
"sankuru-1": ("SANKURU", "KASAÏ-ORIENTAL"),
|
|
||||||
"sankuru-2": ("SANKURU", "KASAÏ-ORIENTAL"),
|
|
||||||
|
|
||||||
# Nord-Kivu
|
# Nord-Kivu
|
||||||
"nord-kivu": ("NORD-KIVU", "NORD-KIVU"),
|
"nord-kivu": ("NORD-KIVU", "NORD-KIVU"),
|
||||||
"nord-kivu-1": ("NORD-KIVU", "NORD-KIVU"),
|
"nord-kivu-1": ("NORD-KIVU", "NORD-KIVU"),
|
||||||
"nord-kivu-2": ("NORD-KIVU", "NORD-KIVU"),
|
"nord-kivu-2": ("NORD-KIVU", "NORD-KIVU"),
|
||||||
"nord-kivu-3": ("NORD-KIVU", "NORD-KIVU"),
|
"nord-kivu-3": ("NORD-KIVU", "NORD-KIVU"),
|
||||||
|
|
||||||
# Sud-Kivu
|
# Sud-Kivu
|
||||||
"sud-kivu": ("SUD-KIVU", "SUD-KIVU"),
|
"sud-kivu": ("SUD-KIVU", "SUD-KIVU"),
|
||||||
"sud-kivu-1": ("SUD-KIVU", "SUD-KIVU"),
|
"sud-kivu-1": ("SUD-KIVU", "SUD-KIVU"),
|
||||||
"sud-kivu-2": ("SUD-KIVU", "SUD-KIVU"),
|
"sud-kivu-2": ("SUD-KIVU", "SUD-KIVU"),
|
||||||
"sud-kivu-3": ("SUD-KIVU", "SUD-KIVU"),
|
"sud-kivu-3": ("SUD-KIVU", "SUD-KIVU"),
|
||||||
|
# Kasai-Occidental → KASAI, KASAI-CENTRAL
|
||||||
# Maniema
|
"kasai-occidental": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
|
||||||
"maniema": ("MANIEMA", "MANIEMA"),
|
"kasai-occidental-1": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
|
||||||
"maniema-1": ("MANIEMA", "MANIEMA"),
|
"kasai-occidental-2": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
|
||||||
"maniema-2": ("MANIEMA", "MANIEMA"),
|
"kasai": ("KASAI", "KASAI-OCCIDENTAL"),
|
||||||
|
"kasai-1": ("KASAI", "KASAI-OCCIDENTAL"),
|
||||||
# Divers
|
"kasai-2": ("KASAI", "KASAI-OCCIDENTAL"),
|
||||||
"hors-frontieres": ("AUTRES", "AUTRES"),
|
"kasai-central": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
|
||||||
"lukaya": ("AUTRES", "AUTRES"),
|
"kasai-central-1": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
|
||||||
"recours": ("AUTRES", "AUTRES"),
|
"kasai-central-2": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
|
||||||
"junacyc": ("AUTRES", "AUTRES"),
|
# Kasai-Oriental → LOMAMI, SANKURU
|
||||||
"junacyp": ("AUTRES", "AUTRES"),
|
"kasai-oriental": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
|
||||||
"junacyc-lualaba-corrige": ("LUALABA", "KATANGA"),
|
"kasai-oriental-1": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
|
||||||
"options-techniques-toutes-les-provinces-et-hors-frontieres": ("AUTRES", "AUTRES"),
|
"kasai-oriental-2": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
|
||||||
"region": ("AUTRES", "AUTRES"),
|
"kasai-oriental-3": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
|
||||||
|
"lomami": ("LOMAMI", "KASAI-ORIENTAL"),
|
||||||
|
"lomami-1": ("LOMAMI", "KASAI-ORIENTAL"),
|
||||||
|
"lomami-2": ("LOMAMI", "KASAI-ORIENTAL"),
|
||||||
|
"sankuru": ("SANKURU", "KASAI-ORIENTAL"),
|
||||||
|
"sankuru-1": ("SANKURU", "KASAI-ORIENTAL"),
|
||||||
|
"sankuru-2": ("SANKURU", "KASAI-ORIENTAL"),
|
||||||
}
|
}
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO, format=">> %(message)s")
|
|
||||||
|
|
||||||
def load_json_dataset(path: str) -> list:
    """Read and parse a JSON dataset located under DATA_DIR."""
    logging.info(f"Loading JSON dataset from {path}")
    full_path = os.path.join(DATA_DIR, path)
    with open(full_path, "r", encoding="utf-8") as f:
        return json.load(f)
|
|
||||||
|
|
||||||
|
|
||||||
def save_csv_dataset(data: list, path: str) -> None:
    """Write a list of row dicts to a CSV file under DATA_DIR.

    Field names are taken from the first row's keys.

    Raises:
        ValueError: if *data* is empty (the original code raised an opaque
            IndexError on ``data[0]``).
    """
    logging.info(f"Saving CSV dataset to {path}")
    if not data:
        raise ValueError("Cannot save an empty dataset: no rows to derive a header from")
    # newline="" is required by the csv module to avoid extra blank lines
    # on platforms that translate line endings.
    with open(os.path.join(DATA_DIR, path), "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)
|
|
||||||
|
|
||||||
|
|
||||||
def load_csv_dataset(path: str, limit: int = None, balanced: bool = False) -> List[Dict[str, str]]:
    """Load a CSV dataset from DATA_DIR as a list of row dicts.

    Args:
        path: File path relative to DATA_DIR.
        limit: Optional maximum number of rows to return. In balanced mode
            each sex contributes at most ``limit // 2`` rows.
        balanced: When True, return an equal number of rows for
            ``sex == 'm'`` and ``sex == 'f'`` (rows with any other sex
            value are dropped).
    """
    logging.info(f"Loading CSV dataset from {path}")

    file_path = os.path.join(DATA_DIR, path)
    # errors="replace" plus the NUL strip below tolerates partially
    # corrupted exports; bad bytes become replacement characters.
    with open(file_path, "r", encoding="utf-8", errors="replace", newline="") as f:
        raw_text = f.read().replace('\x00', '')

    reader = csv.DictReader(io.StringIO(raw_text))
    logging.info(f"Detected fieldnames: {reader.fieldnames}")

    if balanced:
        # Bucket rows by sex, then truncate both buckets to the same size
        # so the returned sample is 50/50.
        by_sex = {'m': [], 'f': []}
        for row in reader:
            sex = row.get("sex", "").lower()
            if sex in by_sex:
                by_sex[sex].append(row)
        min_len = min(len(by_sex['m']), len(by_sex['f']))
        if limit:
            min_len = min(min_len, limit // 2)
        data = by_sex['m'][:min_len] + by_sex['f'][:min_len]
    else:
        data = []
        for i, row in enumerate(reader):
            data.append(row)
            if limit and i + 1 >= limit:
                break

    logging.info("Successfully loaded with UTF-8 encoding")
    return data
|
|
||||||
|
|
||||||
|
|
||||||
def save_json_dataset(data: list, path: str) -> None:
    """Serialize *data* as compact JSON into a file under DATA_DIR."""
    logging.info(f"Saving JSON dataset to {path}")
    target = os.path.join(DATA_DIR, path)
    with open(target, "w", encoding="utf-8") as f:
        # Compact separators and raw unicode keep the file small.
        json.dump(data, f, ensure_ascii=False, separators=(',', ':'))
|
|
||||||
|
|
||||||
|
|
||||||
def save_pickle(obj, path):
    """Pickle *obj* to *path*, creating parent directories as needed.

    BUGFIX: the original called os.makedirs(os.path.dirname(path)) even for
    bare filenames, where dirname is "" and makedirs raises FileNotFoundError.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)
|
|
||||||
|
|
||||||
|
|
||||||
def load_pickle(path: str):
    """Unpickle and return the object stored at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(handle)
|
|
||||||
|
|
||||||
|
|
||||||
def load_prompt() -> str:
    """Read and return the LLM prompt template from the project root."""
    prompt_path = os.path.join(ROOT_DIR, 'prompt.txt')
    with open(prompt_path, 'r') as f:
        return f.read()
|
|
||||||
@@ -0,0 +1,41 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
|
class StateManager:
    """Manage pipeline state and checkpoints"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.checkpoints_dir = self.config.paths.checkpoints_dir

    def _state_path(self, state_name: str):
        # All states are stored as <checkpoints_dir>/<name>.json.
        return self.checkpoints_dir / f"{state_name}.json"

    def save_state(self, state: Dict[str, Any], state_name: str) -> None:
        """Save pipeline state as pretty-printed JSON."""
        self.checkpoints_dir.mkdir(parents=True, exist_ok=True)
        state_file = self._state_path(state_name)

        with open(state_file, "w") as f:
            # default=str makes non-JSON values (paths, datetimes) storable.
            json.dump(state, f, indent=2, default=str)

        logging.debug(f"Saved state to {state_file}")

    def load_state(self, state_name: str) -> Dict[str, Any]:
        """Load pipeline state; returns {} when no checkpoint exists."""
        state_file = self._state_path(state_name)
        if not state_file.exists():
            return {}
        with open(state_file, "r") as f:
            return json.load(f)

    def clear_state(self, state_name: str) -> None:
        """Clear pipeline state"""
        state_file = self._state_path(state_name)
        if state_file.exists():
            state_file.unlink()
            logging.info(f"Cleared state: {state_name}")
|
||||||
@@ -0,0 +1,38 @@
|
|||||||
|
from typing import Optional, Dict
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
class TextCleaner:
    """Reusable text cleaning utilities"""

    # Pattern names that are regular expressions (the rest are literal strings).
    _REGEX_PATTERNS = frozenset({"multiple_spaces", "extra_whitespace"})

    def __init__(self, patterns: Optional[Dict[str, str]] = None):
        self.patterns = patterns or {
            "null_bytes": "\x00",
            "non_breaking_spaces": "\u00a0",
            "multiple_spaces": r" +",
            "extra_whitespace": r"\s+",
        }

    def clean_text_series(self, series: pd.Series) -> pd.Series:
        """Clean a pandas Series of text data.

        BUGFIX: the original applied only "multiple_spaces" with regex=True;
        "extra_whitespace" (r"\\s+") was passed regex=False, i.e. replaced the
        literal four characters ``\\s+`` and never matched — so tabs/newlines
        were not collapsed. Regex patterns are now applied as regex.
        """
        cleaned = series.astype(str)

        # Apply cleaning patterns
        for pattern_name, pattern in self.patterns.items():
            is_regex = pattern_name in self._REGEX_PATTERNS
            cleaned = cleaned.str.replace(pattern, " ", regex=is_regex)

        return cleaned.str.strip().str.lower()

    def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean all text (object-dtype) columns in a DataFrame; returns a copy."""
        df = df.copy()
        text_columns = df.select_dtypes(include="object").columns

        for col in text_columns:
            df[col] = self.clean_text_series(df[col])

        return df
|
||||||
@@ -0,0 +1,154 @@
|
|||||||
|
#!.venv/bin/python3
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from core.utils.data_loader import DataLoader
|
||||||
|
from core.config import ConfigManager, setup_logging
|
||||||
|
from core.utils import ensure_directories, get_data_file_path
|
||||||
|
|
||||||
|
from processing.pipeline import Pipeline
|
||||||
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
from processing.steps.data_splitting_step import DataSplittingStep
|
||||||
|
from processing.steps.llm_annotation_step import LLMAnnotationStep
|
||||||
|
from processing.steps.feature_extraction_step import FeatureExtractionStep
|
||||||
|
from processing.steps.data_cleaning_step import DataCleaningStep
|
||||||
|
|
||||||
|
|
||||||
|
def create_pipeline_from_config(config_path: Optional[Path] = None) -> Pipeline:
    """Create pipeline from configuration file.

    Args:
        config_path: Optional path to a config file; when None, ConfigManager
            presumably falls back to a default location — confirm in
            ConfigManager.

    Returns:
        A Pipeline containing only the steps whose names appear in
        ``config.stages``, added in the order ``config.stages`` lists them.
    """
    config = ConfigManager(config_path).load_config()

    # Setup logging
    setup_logging(config)
    ensure_directories(config)
    batch_config = BatchConfig(
        batch_size=config.processing.batch_size,
        max_workers=config.processing.max_workers,
        checkpoint_interval=config.processing.checkpoint_interval,
        use_multiprocessing=config.processing.use_multiprocessing,
    )

    # Add steps based on configuration
    pipeline = Pipeline(batch_config)
    steps = [
        DataCleaningStep(config),
        FeatureExtractionStep(config),
        LLMAnnotationStep(config),
        DataSplittingStep(config),
    ]

    # Match each configured stage name against the available steps; a stage
    # name that matches no step is silently skipped.
    for stage in config.stages:
        for step in steps:
            if step.name == stage:
                pipeline.add_step(step)

    return pipeline
|
||||||
|
|
||||||
|
|
||||||
|
def run_pipeline(config_path: Optional[Path] = None, resume: bool = False) -> int:
    """Run the complete pipeline.

    Args:
        config_path: Optional path to a configuration file.
        resume: Accepted for CLI compatibility; not consulted here —
            resumption is presumably handled by the steps' own checkpoints.

    Returns:
        Process exit code: 0 on success, 1 on any failure.
    """
    try:
        config = ConfigManager(config_path).load_config()

        logging.info(f"Starting pipeline: {config.name} v{config.version}")
        logging.info(f"Environment: {config.environment}")

        # Load input data
        input_file_path = get_data_file_path(config.data.input_file, config)

        if not input_file_path.exists():
            logging.error(f"Input file not found: {input_file_path}")
            return 1

        data_loader = DataLoader(config)
        logging.info(f"Loading data from {input_file_path}")
        df = data_loader.load_csv_complete(input_file_path)
        logging.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")

        # Create and run pipeline
        pipeline = create_pipeline_from_config(config_path)

        logging.info("Starting pipeline execution")
        result_df = pipeline.run(df)

        # Save results using the splitting step.
        # BUGFIX: guard against an empty step list (no configured stages)
        # before indexing pipeline.steps[-1].
        if pipeline.steps:
            splitting_step = pipeline.steps[-1]
            if isinstance(splitting_step, DataSplittingStep):
                splitting_step.save_splits(result_df)

        # Show completion statistics
        progress = pipeline.get_progress()
        logging.info("=== Pipeline Completion Summary ===")
        for step_name, stats in progress.items():
            logging.info(
                f"{step_name}: {stats['completion_percentage']:.1f}% "
                f"({stats['processed_batches']}/{stats['total_batches']} batches)"
            )
            if stats["failed_batches"] > 0:
                logging.warning(f"  {stats['failed_batches']} failed batches")

        logging.info("Pipeline completed successfully")
        return 0

    except Exception as e:
        logging.error(f"Pipeline failed: {e}", exc_info=True)
        return 1
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Main entry point with minimal command-line interface.

    Parses CLI arguments, resolves the config path (--config wins over
    --env), optionally validates the config and exits, otherwise runs
    the pipeline and returns its exit code.
    """
    parser = argparse.ArgumentParser(
        description="DRC Names Processing Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Configuration File Examples:
config/pipeline.yaml - Main configuration
config/pipeline.development.yaml - Development environment
config/pipeline.production.yaml - Production environment

Usage Examples:
python processing/main.py # Use default config
python processing/main.py --config config/pipeline.yaml # Use specific config
python processing/main.py --env development # Use environment config
python processing/main.py --resume # Resume from checkpoints
""",
    )

    parser.add_argument("--config", type=Path, help="Path to configuration file")
    parser.add_argument(
        "--env", type=str, help="Environment name (loads config/pipeline.{env}.yaml)"
    )
    parser.add_argument(
        "--resume", action="store_true", help="Resume pipeline from existing checkpoints"
    )
    parser.add_argument(
        "--validate-config", action="store_true", help="Validate configuration file and exit"
    )
    args = parser.parse_args()

    # Determine config path: an explicit --config takes precedence over --env.
    config_path = None
    if args.config:
        config_path = args.config
    elif args.env:
        config_path = Path("config") / f"pipeline.{args.env}.yaml"

    # Validation-only mode: load the config and report, without running.
    if args.validate_config:
        try:
            config = ConfigManager(config_path).load_config()
            print(f"Configuration is valid: {config.name} v{config.version}")
            return 0
        except Exception as e:
            print(f"Configuration validation failed: {e}")
            return 1

    # Run pipeline
    return run_pipeline(config_path, args.resume)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())
|
||||||
Executable
+157
@@ -0,0 +1,157 @@
|
|||||||
|
#!.venv/bin/python3
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from core.config.config_manager import ConfigManager
|
||||||
|
from processing.monitoring.pipeline_monitor import PipelineMonitor
|
||||||
|
from processing.monitoring.data_analyzer import DatasetAnalyzer
|
||||||
|
|
||||||
|
|
||||||
|
def _confirm(prompt: str) -> bool:
    """Ask a yes/no question on stdin; True only on an explicit 'y'."""
    return input(prompt).lower() == "y"


def _cmd_status(monitor, args) -> int:
    """Print the current pipeline status."""
    monitor.print_status(detailed=args.detailed)
    return 0


def _cmd_clean(monitor, args) -> int:
    """Clean checkpoint files for one step, or for every step when --step is omitted."""
    checkpoint_info = monitor.count_checkpoint_files()
    print(f"Current checkpoint storage: {checkpoint_info['total_size_mb']:.1f} MB")

    if not args.force and not _confirm("Are you sure you want to clean checkpoints? (y/N): "):
        print("Cancelled")
        return 0

    if args.step:
        monitor.clean_step_checkpoints(args.step, args.keep_last)
    else:
        for step in monitor.steps:
            monitor.clean_step_checkpoints(step, args.keep_last)

    print("Checkpoint cleaning completed")
    return 0


def _cmd_reset(monitor, args) -> int:
    """Reset a single pipeline step (deletes all of its checkpoints) after confirmation."""
    if not args.force and not _confirm(
        f"Are you sure you want to reset {args.step}? This will delete all checkpoints. (y/N): "
    ):
        print("Cancelled")
        return 0

    monitor.reset_step(args.step)
    print(f"Reset completed for {args.step}")
    return 0


def _cmd_analyze(monitor, args) -> int:
    """Analyze a dataset file and print completion and quality statistics."""
    # Use configured data directory instead of hardcoded DATA_DIR
    data_dir = ConfigManager().default_paths.data_dir
    filepath = data_dir / args.file

    if not filepath.exists():
        print(f"File not found: {filepath}")
        return 1

    analyzer = DatasetAnalyzer(str(filepath))
    if not analyzer.load_data():
        return 1

    completion_stats = analyzer.analyze_completion()
    quality_stats = analyzer.analyze_quality()

    print(f"\n=== Dataset Analysis: {args.file} ===")
    print(f"Total rows: {completion_stats['total_rows']:,}")
    print(
        f"Annotated: {completion_stats['annotated_rows']:,} ({completion_stats['annotation_percentage']:.1f}%)"
    )
    print(f"Unannotated: {completion_stats['unannotated_rows']:,}")
    print(
        f"Complete names: {completion_stats['complete_names']:,} ({completion_stats['completeness_percentage']:.1f}%)"
    )

    if "name_length" in quality_stats:
        length_stats = quality_stats["name_length"]
        print("\nName length statistics:")
        print(f"  Average: {length_stats['mean']:.1f} characters")
        print(f"  Range: {length_stats['min']}-{length_stats['max']} characters")

    if "word_distribution" in quality_stats:
        print("\nWord count distribution:")
        for words, count in quality_stats["word_distribution"].items():
            print(f"  {words} words: {count:,} names")
    return 0


def _cmd_info(monitor, args) -> int:
    """Print per-step checkpoint file counts and sizes plus the total storage used."""
    checkpoint_info = monitor.count_checkpoint_files()

    print("\n=== Checkpoint Information ===")
    print(f"Total storage: {checkpoint_info['total_size_mb']:.1f} MB")
    print()

    for step in monitor.steps:
        step_info = checkpoint_info[step]
        print(f"{step.replace('_', ' ').title()}:")
        print(f"  Files: {step_info['files']}")
        print(f"  Size: {step_info['size_mb']:.1f} MB")
        print()
    return 0


def main():
    """CLI entry point: parse the sub-command and dispatch to its handler.

    Returns a process exit code (0 on success, 1 on error or no command).
    """
    parser = argparse.ArgumentParser(
        description="Monitor and manage the DRC names processing pipeline"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    status_parser = subparsers.add_parser("status", help="Show pipeline status")
    status_parser.add_argument(
        "--detailed",
        action="store_true",
        help="Show detailed information including failed batch IDs",
    )

    clean_parser = subparsers.add_parser("clean", help="Clean checkpoint files")
    clean_parser.add_argument(
        "--step",
        type=str,
        choices=["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"],
        help="Clean specific step (default: all)",
    )
    clean_parser.add_argument(
        "--keep-last", type=int, default=1, help="Number of recent checkpoints to keep (default: 1)"
    )
    clean_parser.add_argument("--force", action="store_true", help="Clean without confirmation")

    reset_parser = subparsers.add_parser("reset", help="Reset pipeline step")
    reset_parser.add_argument(
        "step",
        type=str,
        choices=["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"],
        help="Step to reset",
    )
    reset_parser.add_argument("--force", action="store_true", help="Reset without confirmation")

    analyze_parser = subparsers.add_parser("analyze", help="Analyze dataset")
    analyze_parser.add_argument(
        "--file",
        type=str,
        default="names_featured.csv",
        help="Dataset file to analyze (default: names_featured.csv)",
    )

    # "info" takes no options; the sub-parser object itself is not needed
    # (the original bound it to an unused local).
    subparsers.add_parser("info", help="Show checkpoint information")

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return 1

    monitor = PipelineMonitor()

    handlers = {
        "status": _cmd_status,
        "clean": _cmd_clean,
        "reset": _cmd_reset,
        "analyze": _cmd_analyze,
        "info": _cmd_info,
    }
    return handlers[args.command](monitor, args)


if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
||||||
@@ -1,115 +0,0 @@
|
|||||||
import argparse
|
|
||||||
import os
|
|
||||||
|
|
||||||
import tensorflow as tf
|
|
||||||
from sklearn.metrics import (
|
|
||||||
accuracy_score, precision_recall_fscore_support, confusion_matrix
|
|
||||||
)
|
|
||||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|
||||||
|
|
||||||
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_json_dataset, load_pickle, GENDER_RESULT_DIR
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate_logreg(df, threshold):
    """
    Evaluate the pre-trained logistic-regression gender model on *df*.

    Loads the pickled pipeline and label encoder, scores the "name" column,
    and thresholds the positive-class probability. Returns the true labels,
    predicted labels, positive-class probabilities, and the encoder classes.
    """
    classifier = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))

    names = df["name"].tolist()
    labels = label_encoder.transform(df["sex"])
    positive_proba = classifier.predict_proba(names)[:, 1]
    predictions = (positive_proba >= threshold).astype(int)
    return labels, predictions, positive_proba, label_encoder.classes_
|
||||||
|
|
||||||
|
|
||||||
def evaluate_lstm(df, threshold, max_len=6):
    """
    Evaluate the pre-trained BiLSTM gender model on *df*.

    Tokenizes and pads the "name" column, predicts class probabilities, and
    thresholds the positive class. Returns the true labels, predicted
    labels, positive-class probabilities, and the encoder classes.
    """
    network = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
    name_tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl"))

    padded = pad_sequences(
        name_tokenizer.texts_to_sequences(df["name"]), maxlen=max_len, padding="post"
    )
    labels = label_encoder.transform(df["sex"])
    positive_proba = network.predict(padded)[:, 1]
    predictions = (positive_proba >= threshold).astype(int)
    return labels, predictions, positive_proba, label_encoder.classes_
|
||||||
|
|
||||||
|
|
||||||
def evaluate_transformer(df, threshold, max_len=6):
    """
    Evaluate the pre-trained Transformer gender model on *df*.

    Loads the saved model, tokenizer, and label encoder, converts the "name"
    column to padded sequences, encodes the "sex" column, predicts class
    probabilities, and thresholds the positive class. Returns the true
    labels, predicted labels, positive-class probabilities, and the encoder
    classes.
    """
    network = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
    name_tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl"))

    padded = pad_sequences(
        name_tokenizer.texts_to_sequences(df["name"]), maxlen=max_len, padding="post"
    )
    labels = label_encoder.transform(df["sex"])
    positive_proba = network.predict(padded)[:, 1]
    predictions = (positive_proba >= threshold).astype(int)
    return labels, predictions, positive_proba, label_encoder.classes_
|
||||||
|
|
||||||
|
|
||||||
def compute_metrics(y_true, y_pred, y_proba, class_names):
    """
    Compute accuracy, binary precision/recall/F1, and the confusion matrix
    for the given true/predicted labels, packaged as a JSON-friendly dict.
    """
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "confusion_matrix": {
            # class_names is assumed to be array-like (sklearn encoder.classes_)
            "labels": class_names.tolist(),
            "matrix": confusion_matrix(y_true, y_pred).tolist(),
        },
    }
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Evaluate the selected gender model on a CSV dataset and save the metrics as JSON.

    Command-line driven: --model selects the evaluator, --dataset/--size/--balanced
    control data loading, --threshold sets the classification cut-off.
    """
    parser = argparse.ArgumentParser(description="Evaluate gender prediction model")
    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
    parser.add_argument("--dataset", default="names_evaluation.csv", help="Path to the dataset CSV file")
    parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
    parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
    args = parser.parse_args()

    df = load_csv_dataset(args.dataset, args.size, args.balanced)

    model_funcs = {
        "logreg": evaluate_logreg,
        "lstm": evaluate_lstm,
        "transformer": evaluate_transformer,
    }
    # argparse `choices` guarantees args.model is a valid key. The previous
    # try/except KeyError around the call also caught KeyErrors raised
    # *inside* the evaluator (e.g. a missing dataframe column) and mislabeled
    # them as an unknown model, so the lookup is now done directly.
    evaluate = model_funcs[args.model]
    y_true, y_pred, y_proba, classes = evaluate(df, args.threshold)

    results = compute_metrics(y_true, y_pred, y_proba, classes)
    save_json_dataset(results, os.path.join(GENDER_RESULT_DIR, f'{args.model}_eval'))


if __name__ == "__main__":
    main()
|
||||||
@@ -1,80 +0,0 @@
|
|||||||
import argparse
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from sklearn.metrics import (
|
|
||||||
accuracy_score, precision_recall_fscore_support,
|
|
||||||
classification_report, confusion_matrix
|
|
||||||
)
|
|
||||||
|
|
||||||
from misc import logging
|
|
||||||
|
|
||||||
def evaluate_proba(y_true, y_proba, threshold, class_names):
    """
    Threshold the positive-class probabilities in *y_proba* and report
    accuracy, precision/recall/F1, the confusion matrix, and a per-class
    classification report.
    """
    y_pred = (y_proba[:, 1] >= threshold).astype(int)
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")

    logging.info(f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class BaseConfig:
    """
    Common command-line configuration shared by the gender models.

    Bundles dataset selection (path, optional row limit, class balancing)
    with training options (classification threshold, optional
    cross-validation folds, epochs, test split, random seed) and whether
    trained artifacts should be persisted.
    """
    # Dataset options
    dataset_path: str = "names_featured.csv"  # CSV file to load
    size: Optional[int] = None                # optional cap on rows loaded
    threshold: float = 0.5                    # probability cut-off for the positive class
    cv: Optional[int] = None                  # CV folds; None means a plain train/test split
    save: bool = False                        # persist model artifacts after training
    balanced: bool = False                    # load a class-balanced sample

    # Training options
    epochs: int = 10
    test_size: float = 0.2
    random_state: int = 42
|
||||||
|
|
||||||
|
|
||||||
def load_config(description: str) -> BaseConfig:
    """
    Parse the shared command-line options and return them as a ``BaseConfig``.

    Options cover the dataset path, row limit, class balancing, the
    classification threshold, cross-validation folds, artifact saving, and
    generic training parameters (epochs, test split, random seed).
    """
    # BUG FIX: ArgumentParser's first positional parameter is `prog`, not
    # `description`, so passing the description positionally replaced the
    # program name and left the help description empty. It must be passed
    # by keyword.
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument("--dataset", type=str, default="names_featured.csv", help="Path to the dataset file")
    parser.add_argument("--size", type=int, help="Number of rows to load from the dataset")
    parser.add_argument("--balanced", action="store_true", help="Load balanced dataset")
    parser.add_argument("--threshold", type=float, default=0.5, help="Probability threshold for classification")
    parser.add_argument("--cv", type=int, help="Number of folds for cross-validation")
    parser.add_argument("--save", action="store_true", help="Save the model and artifacts after training")

    parser.add_argument("--epochs", type=int, default=10, help="Number of epochs for training")
    parser.add_argument("--test_size", type=float, default=0.2, help="Proportion of the dataset to include in the test split")
    parser.add_argument("--random_state", type=int, default=42, help="Random seed for reproducibility")

    args = parser.parse_args()

    return BaseConfig(
        dataset_path=args.dataset,
        size=args.size,
        threshold=args.threshold,
        cv=args.cv,
        save=args.save,
        balanced=args.balanced,
        epochs=args.epochs,
        test_size=args.test_size,
        random_state=args.random_state,
    )
|
||||||
@@ -1,123 +0,0 @@
|
|||||||
import os
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Tuple
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
|
||||||
from sklearn.linear_model import LogisticRegression
|
|
||||||
from sklearn.metrics import (
|
|
||||||
accuracy_score, classification_report, confusion_matrix,
|
|
||||||
precision_recall_fscore_support
|
|
||||||
)
|
|
||||||
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
|
|
||||||
from sklearn.pipeline import make_pipeline, Pipeline
|
|
||||||
from sklearn.preprocessing import LabelEncoder
|
|
||||||
|
|
||||||
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
|
|
||||||
from pipeline.gender.models import BaseConfig, load_config, logging
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class Config(BaseConfig):
    """Logistic-regression options: character n-gram span and solver iteration cap."""
    ngram_range: Tuple[int, int] = (2, 5)  # char n-gram range for CountVectorizer
    max_iter: int = 1000                   # LogisticRegression max_iter
|
||||||
|
|
||||||
|
|
||||||
def encode_labels(y: pd.Series) -> Tuple[pd.Series, LabelEncoder]:
    """
    Fit a ``LabelEncoder`` on *y* and return the numerically encoded labels
    together with the fitted encoder (needed later to decode predictions).
    """
    logging.info("Encoding labels")
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(y), label_encoder
|
||||||
|
|
||||||
|
|
||||||
def build_model(cfg: Config) -> Pipeline:
    """
    Build the classification pipeline: a character-level ``CountVectorizer``
    feeding a ``LogisticRegression``, both parameterized from *cfg*
    (``ngram_range`` and ``max_iter``).
    """
    vectorizer = CountVectorizer(analyzer="char", ngram_range=cfg.ngram_range)
    classifier = LogisticRegression(max_iter=cfg.max_iter)
    return make_pipeline(vectorizer, classifier)
|
||||||
|
|
||||||
|
|
||||||
def evaluate_proba(y_true, y_proba, threshold: float, class_names):
    """
    Threshold the positive-class probabilities and report accuracy,
    precision/recall/F1, the confusion matrix, and a per-class
    classification report.

    Logs the scalar metrics and prints the matrix and report to stdout.
    """
    logging.info(f"Evaluating at threshold = {threshold}")
    y_pred = (y_proba[:, 1] >= threshold).astype(int)

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

    logging.info(f"Accuracy: {accuracy:.4f}")
    logging.info(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=class_names))
|
||||||
|
|
||||||
|
|
||||||
def cross_validate(cfg: Config, X, y) -> None:
    """
    Run stratified k-fold cross-validation (``cfg.cv`` folds) on a freshly
    built pipeline and log the per-fold accuracies plus their mean and
    standard deviation.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    scores = cross_val_score(
        build_model(cfg), X, y, cv=StratifiedKFold(n_splits=cfg.cv), scoring="accuracy"
    )
    logging.info(f"Cross-validation scores: {scores}")
    logging.info(f"Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")
|
||||||
|
|
||||||
|
|
||||||
def save_artifacts(model, encoder):
    """
    Persist the trained pipeline and label encoder to ``GENDER_MODELS_DIR``.
    """
    # Ensure the target directory exists before writing — the LSTM and
    # transformer save_artifacts implementations already do this, and
    # without it the first save on a clean checkout fails.
    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    save_pickle(model, os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))

    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Train and evaluate the logistic-regression gender model from CLI options."""
    cfg = Config(**vars(load_config("logistic regression model")))

    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))
    names, sexes = df["name"], df["sex"]
    y_encoded, encoder = encode_labels(sexes)

    # Cross-validation mode trains k models and skips the hold-out evaluation.
    if cfg.cv:
        cross_validate(cfg, names, y_encoded)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        names, y_encoded, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y_encoded
    )

    model = build_model(cfg)
    model.fit(X_train, y_train)

    evaluate_proba(y_test, model.predict_proba(X_test), cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, encoder)


if __name__ == "__main__":
    main()
|
||||||
@@ -1,144 +0,0 @@
|
|||||||
import os
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Tuple
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
from sklearn.metrics import (
|
|
||||||
accuracy_score
|
|
||||||
)
|
|
||||||
from sklearn.model_selection import train_test_split, StratifiedKFold
|
|
||||||
from sklearn.preprocessing import LabelEncoder
|
|
||||||
from tensorflow.keras.callbacks import ProgbarLogger
|
|
||||||
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
|
|
||||||
from tensorflow.keras.models import Sequential
|
|
||||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|
||||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
|
||||||
|
|
||||||
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
|
|
||||||
from pipeline.gender.models import load_config, BaseConfig, evaluate_proba, logging
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class Config(BaseConfig):
    """BiLSTM options: sequence length, embedding width, LSTM units, batch size."""
    max_len: int = 6         # padded sequence length
    embedding_dim: int = 64  # embedding vector size
    lstm_units: int = 32     # units per LSTM direction
    batch_size: int = 64
|
||||||
|
|
||||||
|
|
||||||
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Load the dataset and turn it into model-ready arrays.

    Fits a word-level tokenizer on the "name" column, pads the resulting
    sequences to ``cfg.max_len``, and label-encodes the "sex" column.
    Returns the padded sequences, encoded labels, tokenizer, and encoder.
    """
    logging.info("Loading and preprocessing data")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))

    tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])
    padded = pad_sequences(
        tokenizer.texts_to_sequences(df["name"]), maxlen=cfg.max_len, padding="post"
    )

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df["sex"])

    return padded, labels, tokenizer, label_encoder
|
||||||
|
|
||||||
|
|
||||||
def build_model(cfg: Config, vocab_size: int) -> Sequential:
    """
    Build and compile the BiLSTM classifier: an embedding layer, two
    bidirectional LSTM layers, a ReLU hidden layer, and a two-way softmax
    output, compiled with sparse categorical crossentropy and Adam.
    """
    logging.info("Building LSTM model")
    layers = [
        Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim),
        Bidirectional(LSTM(cfg.lstm_units, return_sequences=True)),
        Bidirectional(LSTM(cfg.lstm_units)),
        Dense(64, activation="relu"),
        Dense(2, activation="softmax"),
    ]
    model = Sequential(layers)
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model
|
||||||
|
|
||||||
|
|
||||||
def cross_validate(cfg: Config, X, y, vocab_size: int):
    """
    Stratified k-fold cross-validation: trains a fresh model per fold,
    evaluates it on that fold's validation split, and logs each fold's
    accuracy plus the overall mean and standard deviation.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    splitter = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
    accuracies = []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(X, y)):
        logging.info(f"Fold {fold + 1}")
        model = build_model(cfg, vocab_size)
        model.fit(
            X[train_idx],
            y[train_idx],
            epochs=cfg.epochs,
            batch_size=cfg.batch_size,
            verbose=0,
        )
        fold_acc = accuracy_score(y[val_idx], model.predict(X[val_idx]).argmax(axis=1))
        accuracies.append(fold_acc)
        logging.info(f"Fold {fold + 1} Accuracy: {fold_acc:.4f}")

    logging.info(f"Mean accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
|
||||||
|
|
||||||
|
|
||||||
def save_artifacts(model, tokenizer, encoder):
    """
    Persist the trained LSTM model, tokenizer, and label encoder to
    ``GENDER_MODELS_DIR``, creating the directory first if it is missing,
    and log where the artifacts were written.
    """
    os.makedirs(GENDER_MODELS_DIR, exist_ok=True)
    model.save(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))

    save_pickle(tokenizer, os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
    save_pickle(encoder, os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl"))

    logging.info(f"Model and artifacts saved to {GENDER_MODELS_DIR}")
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Train and evaluate the BiLSTM gender model from CLI options."""
    cfg = Config(**vars(load_config("Long Short-Term Memory (LSTM) model")))

    X, y, tokenizer, encoder = load_and_prepare(cfg)
    vocab_size = len(tokenizer.word_index) + 1

    # Cross-validation mode trains k models and skips the hold-out evaluation.
    if cfg.cv:
        cross_validate(cfg, X, y, vocab_size)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
    )

    model = build_model(cfg, vocab_size)
    model.summary()

    logging.info("Training model")
    model.fit(
        X_train,
        y_train,
        validation_split=0.1,
        epochs=cfg.epochs,
        batch_size=cfg.batch_size,
        callbacks=[ProgbarLogger()],
    )

    evaluate_proba(y_test, model.predict(X_test), cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, tokenizer, encoder)


if __name__ == "__main__":
    main()
|
||||||
@@ -1,173 +0,0 @@
|
|||||||
import os
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Tuple
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import tensorflow as tf
|
|
||||||
from sklearn.metrics import (
|
|
||||||
accuracy_score
|
|
||||||
)
|
|
||||||
from sklearn.model_selection import train_test_split, StratifiedKFold
|
|
||||||
from sklearn.preprocessing import LabelEncoder
|
|
||||||
from tensorflow.keras.callbacks import ProgbarLogger
|
|
||||||
from tensorflow.keras.layers import (
|
|
||||||
Input, Embedding, Dense, GlobalAveragePooling1D,
|
|
||||||
MultiHeadAttention, Dropout, LayerNormalization
|
|
||||||
)
|
|
||||||
from tensorflow.keras.models import Model
|
|
||||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|
||||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
|
||||||
|
|
||||||
from misc import GENDER_MODELS_DIR, load_csv_dataset, save_pickle
|
|
||||||
from pipeline.gender.models import BaseConfig, load_config, evaluate_proba, logging
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class Config(BaseConfig):
    """Transformer options: sequence/embedding sizes and encoder hyper-parameters."""
    max_len: int = 6                 # padded sequence length
    embedding_dim: int = 64          # token/position embedding width
    transformer_head_size: int = 64  # key dimension per attention head
    transformer_num_heads: int = 2
    transformer_ff_dim: int = 128    # feed-forward hidden width
    dropout: float = 0.1
    batch_size: int = 64
|
||||||
|
|
||||||
|
|
||||||
def load_and_prepare(cfg: Config) -> Tuple[np.ndarray, np.ndarray, Tokenizer, LabelEncoder]:
    """
    Load the dataset and produce model-ready arrays: tokenized and padded
    "name" sequences plus label-encoded "sex" values. Returns the padded
    sequences, labels, tokenizer, and label encoder.
    """
    logging.info("Loading and preprocessing data")
    df = pd.DataFrame(load_csv_dataset(cfg.dataset_path, cfg.size, cfg.balanced))

    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.fit_on_texts(df["name"])

    padded = pad_sequences(
        tokenizer.texts_to_sequences(df["name"]), maxlen=cfg.max_len, padding="post"
    )

    encoder = LabelEncoder()
    labels = encoder.fit_transform(df["sex"])
    return padded, labels, tokenizer, encoder
|
||||||
|
|
||||||
|
|
||||||
def transformer_encoder(x, cfg: Config):
    """
    One Transformer encoder block: multi-head self-attention followed by a
    two-layer feed-forward network, each with dropout, a residual
    connection, and layer normalization.
    """
    attention = MultiHeadAttention(
        num_heads=cfg.transformer_num_heads, key_dim=cfg.transformer_head_size
    )(x, x)
    x = LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(attention))

    hidden = Dense(cfg.transformer_ff_dim, activation="relu")(x)
    # Project back to the input width so the residual addition is valid.
    projected = Dense(x.shape[-1])(hidden)
    return LayerNormalization(epsilon=1e-6)(x + Dropout(cfg.dropout)(projected))
|
||||||
|
|
||||||
|
|
||||||
def build_model(cfg: Config, vocab_size: int) -> Model:
    """
    Build and compile the Transformer classifier: token embedding plus a
    learned positional embedding, one encoder block, global average
    pooling, a ReLU hidden layer, and a two-way softmax output.
    """
    logging.info("Building Transformer model")
    inputs = Input(shape=(cfg.max_len,))
    tokens = Embedding(input_dim=vocab_size, output_dim=cfg.embedding_dim)(inputs)

    # Add positional encoding
    positions = tf.range(start=0, limit=cfg.max_len, delta=1)
    position_embedding = Embedding(input_dim=cfg.max_len, output_dim=cfg.embedding_dim)(positions)
    x = tokens + position_embedding

    x = transformer_encoder(x, cfg)
    x = GlobalAveragePooling1D()(x)
    x = Dense(32, activation="relu")(x)
    outputs = Dense(2, activation="softmax")(x)

    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model
|
||||||
|
|
||||||
|
|
||||||
def cross_validate(cfg: Config, X, y, vocab_size: int):
    """
    Run stratified k-fold cross-validation on the prepared dataset.

    One fresh model is trained per fold; validation accuracy is logged per
    fold and the mean ± std across folds is logged at the end.
    """
    logging.info(f"Running {cfg.cv}-fold cross-validation")
    splitter = StratifiedKFold(n_splits=cfg.cv, shuffle=True, random_state=cfg.random_state)
    fold_scores = []

    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(X, y), start=1):
        logging.info(f"Fold {fold_no}")
        model = build_model(cfg, vocab_size)
        model.fit(
            X[train_idx], y[train_idx],
            epochs=cfg.epochs,
            batch_size=cfg.batch_size,
            verbose=0,
        )
        fold_proba = model.predict(X[val_idx])
        # argmax over the softmax columns gives the hard class prediction.
        score = accuracy_score(y[val_idx], fold_proba.argmax(axis=1))
        fold_scores.append(score)
        logging.info(f"Fold {fold_no} Accuracy: {score:.4f}")

    logging.info(f"Mean accuracy: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
|
|
||||||
|
|
||||||
|
|
||||||
def save_artifacts(model, tokenizer, encoder):
    """
    Persist the trained model plus its tokenizer and label encoder.

    The Keras model is saved as ``transformer.keras``; the tokenizer and
    label encoder are pickled alongside it. The target directory is created
    on demand.
    """
    target_dir = GENDER_MODELS_DIR
    os.makedirs(target_dir, exist_ok=True)

    def artifact_path(filename):
        # All artifacts live side by side in the models directory.
        return os.path.join(target_dir, filename)

    model.save(artifact_path("transformer.keras"))
    save_pickle(tokenizer, artifact_path("transformer_tokenizer.pkl"))
    save_pickle(encoder, artifact_path("transformer_label_encoder.pkl"))

    logging.info(f"Model and artifacts saved to {target_dir}")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Train (or cross-validate) the Transformer classifier end to end."""
    # load_config parses CLI/file options; re-wrap in the local Config
    # dataclass for typed attribute access.
    cfg = Config(**vars(load_config("Transformer model")))

    X, y, tokenizer, encoder = load_and_prepare(cfg)
    # +1 because Keras Tokenizer word indices start at 1; index 0 is padding.
    vocab_size = len(tokenizer.word_index) + 1

    # Cross-validation mode only evaluates; no artifacts are saved.
    if cfg.cv:
        cross_validate(cfg, X, y, vocab_size)
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
    )

    model = build_model(cfg, vocab_size)
    model.summary()

    logging.info("Training Transformer model")
    model.fit(
        X_train, y_train,
        validation_split=0.1,
        epochs=cfg.epochs,
        batch_size=cfg.batch_size,
        callbacks=[ProgbarLogger()]
    )

    # Threshold-based evaluation on the held-out split.
    y_proba = model.predict(X_test)
    evaluate_proba(y_test, y_proba, cfg.threshold, class_names=encoder.classes_)

    if cfg.save:
        save_artifacts(model, tokenizer, encoder)


if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,107 +0,0 @@
|
|||||||
import argparse
|
|
||||||
import os
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
import tensorflow as tf
|
|
||||||
from sklearn.pipeline import Pipeline
|
|
||||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|
||||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
|
||||||
|
|
||||||
from misc import GENDER_MODELS_DIR, load_pickle
|
|
||||||
|
|
||||||
|
|
||||||
def predict_logreg(names: List[str], threshold: float):
    """
    Classify names with the pickled logistic-regression pipeline.

    Each name is lower-cased and stripped, then scored; the probability in
    column 1 is compared against *threshold* to produce hard labels which
    are decoded via the saved label encoder. Returns ``(labels, proba)``.
    """
    clf: Pipeline = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_model.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "regression_label_encoder.pkl"))

    normalized = [raw.lower().strip() for raw in names]
    proba = clf.predict_proba(normalized)
    # Column 1 holds the probability of the encoder's second class.
    hard_preds = (proba[:, 1] >= threshold).astype(int)
    return label_encoder.inverse_transform(hard_preds), proba
|
|
||||||
|
|
||||||
|
|
||||||
def predict_lstm(names: List[str], threshold: float, max_len=6):
    """
    Classify names with the pre-trained BiLSTM Keras model.

    Names are lower-cased and stripped, converted to sequences by the saved
    tokenizer, post-padded to *max_len*, then scored; the probability in
    column 1 is thresholded into hard labels. Returns ``(labels, proba)``.
    """
    model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "lstm_model.keras"))
    tokenizer: Tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_tokenizer.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "lstm_label_encoder.pkl"))

    sequences = tokenizer.texts_to_sequences([raw.lower().strip() for raw in names])
    padded = pad_sequences(sequences, maxlen=max_len, padding="post")
    proba = model.predict(padded)
    hard_preds = (proba[:, 1] >= threshold).astype(int)
    return label_encoder.inverse_transform(hard_preds), proba
|
|
||||||
|
|
||||||
|
|
||||||
def predict_transformer(names: List[str], threshold: float, max_len=6):
    """
    Classify names with the pre-trained Transformer Keras model.

    Names are lower-cased and stripped, converted to sequences by the saved
    tokenizer, post-padded to *max_len*, then scored; the probability in
    column 1 is thresholded into hard labels. Returns ``(labels, proba)``.
    """
    model = tf.keras.models.load_model(os.path.join(GENDER_MODELS_DIR, "transformer.keras"))
    tokenizer: Tokenizer = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_tokenizer.pkl"))
    label_encoder = load_pickle(os.path.join(GENDER_MODELS_DIR, "transformer_label_encoder.pkl"))

    sequences = tokenizer.texts_to_sequences([raw.lower().strip() for raw in names])
    padded = pad_sequences(sequences, maxlen=max_len, padding="post")
    proba = model.predict(padded)
    hard_preds = (proba[:, 1] >= threshold).astype(int)
    return label_encoder.inverse_transform(hard_preds), proba
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """CLI entry point: predict gender labels for one or more names.

    Dispatches to the predictor matching ``--model`` and prints one line per
    input name with its label and both class probabilities.
    """
    parser = argparse.ArgumentParser(description="Predict gender from names using trained model")
    parser.add_argument("--model", choices=["logreg", "lstm", "transformer"], required=True)
    parser.add_argument("--names", nargs="+", required=True, help="One or more names")
    parser.add_argument("--threshold", type=float, default=0.5, help="Threshold for classification")
    args = parser.parse_args()

    model_funcs = {
        "logreg": predict_logreg,
        "lstm": predict_lstm,
        "transformer": predict_transformer,
    }
    # argparse's `choices` already rejects anything outside this dict, so a
    # KeyError guard around the lookup would be dead code.
    labels, proba = model_funcs[args.model](args.names, args.threshold)

    for i, name in enumerate(args.names):
        # NOTE(review): assumes proba column 0 is the female class and
        # column 1 the male class — confirm against the label encoder's
        # classes_ ordering.
        p_female = proba[i][0]
        p_male = proba[i][1]
        print(f"{name} → {labels[i]} | P(f): {p_female:.2f} | P(m): {p_male:.2f}")


if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,109 +0,0 @@
|
|||||||
import os
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
import ollama
|
|
||||||
import pandas as pd
|
|
||||||
from pydantic import BaseModel, ValidationError
|
|
||||||
from tqdm import tqdm
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from misc import load_prompt, load_csv_dataset, DATA_DIR, logging
|
|
||||||
|
|
||||||
|
|
||||||
class NameAnalysis(BaseModel):
    """Structured LLM output: the name and surname parts found in a raw name.

    Both fields may be ``None`` when the model cannot identify that part.
    """

    identified_name: Optional[str]
    identified_surname: Optional[str]
|
|
||||||
|
|
||||||
|
|
||||||
def analyze_name(client: ollama.Client, model: str, prompt: str, name: str) -> dict:
    """
    Ask the LLM to split *name* into its identified name/surname parts.

    The chat response is constrained to the NameAnalysis JSON schema and
    validated; on any failure a dict with ``None`` values is returned so the
    caller can keep processing.
    """
    fallback = {"identified_name": None, "identified_surname": None}
    try:
        chat_messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": name},
        ]
        response = client.chat(
            model=model,
            messages=chat_messages,
            format=NameAnalysis.model_json_schema(),
        )
        parsed = NameAnalysis.model_validate_json(response.message.content)
        return parsed.model_dump()
    except ValidationError as ve:
        logging.warning(f"Validation error: {ve}")
    except Exception as e:
        logging.error(f"Unexpected error: {e}")
    return fallback
|
|
||||||
|
|
||||||
|
|
||||||
def save_checkpoint(df: pd.DataFrame):
    """Write the full working DataFrame back to the annotated-names CSV.

    Called after every batch so annotation progress survives interruption.
    """
    df.to_csv(os.path.join(DATA_DIR, "names_featured.csv"), index=False)
    # INFO, not CRITICAL: a routine checkpoint is not an emergency, and the
    # original f-string had no placeholders anyway.
    logging.info("Checkpoint saved")
|
|
||||||
|
|
||||||
|
|
||||||
def build_updates(llm_model: str, df: pd.DataFrame, entries: pd.DataFrame) -> pd.DataFrame:
    """
    Annotate every row of *entries* via the LLM and fold results into *df*.

    Results are merged and checkpointed every BATCH_SIZE rows so a crash
    loses at most one batch of work. Returns the updated *df*.
    """
    BATCH_SIZE = 10

    client = ollama.Client()
    prompt = load_prompt()
    updates = []

    # Set logging level for HTTP client to reduce noise
    # This is useful to avoid excessive logging from the HTTP client used by Ollama
    logging.getLogger("httpx").setLevel(logging.WARNING)

    def flush():
        """Merge accumulated per-row results into df and checkpoint."""
        # BUG FIX: if every row in the batch failed, `updates` is empty and
        # indexing "annotated" on the empty frame would raise KeyError.
        if not updates:
            return
        update_df = pd.DataFrame.from_dict(dict(updates), orient="index")
        update_df["annotated"] = pd.to_numeric(update_df["annotated"], errors="coerce").fillna(0).astype("Int8")
        df.update(update_df)
        save_checkpoint(df)
        updates.clear()  # avoid re-applying same updates

    for idx, (row_idx, row) in enumerate(entries.iterrows(), 1):
        try:
            entry = analyze_name(client, llm_model, prompt, row["name"])
            entry["annotated"] = 1
            updates.append((row_idx, entry))
            logging.info(f"Analyzed: {row['name']} - {entry}")
        except Exception as e:
            logging.warning(f"Failed to analyze '{row['name']}': {e}")
            continue

        if idx % BATCH_SIZE == 0:
            flush()

    # BUG FIX: the original only flushed when `idx == len(entries)` was
    # reached inside the loop; a failure on the final row skipped that check
    # and silently dropped the pending tail of updates.
    flush()

    return df
|
|
||||||
|
|
||||||
|
|
||||||
def main(llm_model: str = "llama3.2:3b"):
    """Annotate all not-yet-annotated names in the featured CSV via the LLM."""
    df = pd.DataFrame(load_csv_dataset(os.path.join(DATA_DIR, "names_featured.csv")))

    # Safely cast 'annotated' column to Int8, handling float-like strings (e.g., '1.0')
    df["annotated"] = pd.to_numeric(df["annotated"], errors="coerce").fillna(0).astype(float).astype("Int8")

    # Only rows the heuristics/LLM have not yet annotated are sent out.
    entries = df[df["annotated"] == 0]
    if entries.empty:
        logging.info("No names to analyze.")
        return

    logging.info(f"Found {len(entries)} names to analyze.")
    df = build_updates(llm_model, df, entries)
    save_checkpoint(df)
    logging.info("Analysis complete.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Analyze names using an LLM model.")
    # NOTE(review): the CLI default ("mistral:7b") differs from the function
    # default ("llama3.2:3b") — confirm which is intended.
    parser.add_argument(
        "--llm_model",
        type=str,
        default="mistral:7b",
        help="Ollama model name to use (default: mistral:7b)",
    )
    args = parser.parse_args()

    try:
        main(llm_model=args.llm_model)
    except Exception as e:
        # Top-level boundary: log with traceback instead of crashing noisily.
        logging.error(f"Fatal error: {e}", exc_info=True)
|
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class BatchConfig:
    """Configuration for batch processing"""

    batch_size: int = 1000  # rows per batch handed to a pipeline step
    max_workers: int = 4  # 1 = sequential processing; >1 enables the concurrent path
    checkpoint_interval: int = 5  # Save checkpoint every N batches
    use_multiprocessing: bool = False  # Use ProcessPoolExecutor instead of ThreadPoolExecutor
|
||||||
@@ -0,0 +1,102 @@
|
|||||||
|
import logging
|
||||||
|
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
|
||||||
|
from typing import Iterator
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
|
class BatchProcessor:
    """Handles batch processing with concurrency and checkpointing"""

    def __init__(self, config: BatchConfig):
        self.config = config

    def create_batches(self, df: pd.DataFrame) -> Iterator[tuple[pd.DataFrame, int]]:
        """Create batches from DataFrame.

        Yields ``(batch, batch_id)`` pairs; batch_id is the 0-based batch
        index. Each batch is a copy so steps may mutate it freely.
        """
        total_rows = len(df)
        batch_size = self.config.batch_size

        for i in range(0, total_rows, batch_size):
            batch = df.iloc[i : i + batch_size].copy()
            batch_id = i // batch_size
            yield batch, batch_id

    def process_sequential(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
        """Process batches sequentially.

        Already-checkpointed batches are loaded instead of recomputed; a
        failed batch is recorded in ``step.state.failed_batches`` and its
        rows are omitted from the result.
        """
        results = []

        for batch, batch_id in self.create_batches(df):
            if step.batch_exists(batch_id):
                logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
                processed_batch = step.load_batch(batch_id)
            else:
                try:
                    processed_batch = step.process_batch(batch, batch_id)
                    step.save_batch(processed_batch, batch_id)
                    step.state.processed_batches += 1
                except Exception as e:
                    logging.error(f"Failed to process batch {batch_id}: {e}")
                    step.state.failed_batches.append(batch_id)
                    continue

            results.append(processed_batch)

            # Save state periodically
            if batch_id % self.config.checkpoint_interval == 0:
                step.save_state()

        return pd.concat(results, ignore_index=True) if results else pd.DataFrame()

    def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
        """Process batches concurrently.

        Uses a process pool when ``use_multiprocessing`` is set, otherwise a
        thread pool. Completed batches land in a dict keyed by batch_id and
        are re-assembled in sorted-id order at the end.
        NOTE(review): the process-pool path requires `step.process_batch`
        and each batch to be picklable — confirm for every step type.
        """
        executor_class = (
            ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
        )
        results = {}

        with executor_class(max_workers=self.config.max_workers) as executor:
            # Submit all batches
            future_to_batch = {}
            for batch, batch_id in self.create_batches(df):
                if step.batch_exists(batch_id):
                    logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
                    results[batch_id] = step.load_batch(batch_id)
                else:
                    future = executor.submit(step.process_batch, batch, batch_id)
                    future_to_batch[future] = (batch_id, batch)

            # Collect results as they complete
            for future in as_completed(future_to_batch):
                batch_id, batch = future_to_batch[future]
                try:
                    processed_batch = future.result()
                    step.save_batch(processed_batch, batch_id)
                    results[batch_id] = processed_batch
                    step.state.processed_batches += 1
                    logging.info(f"Completed batch {batch_id}")
                except Exception as e:
                    logging.error(f"Failed to process batch {batch_id}: {e}")
                    step.state.failed_batches.append(batch_id)

        # Reassemble results in order
        ordered_results = []
        for batch_id in sorted(results.keys()):
            ordered_results.append(results[batch_id])

        step.save_state()
        return pd.concat(ordered_results, ignore_index=True) if ordered_results else pd.DataFrame()

    def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
        """Process data using the configured strategy.

        Dispatches to sequential processing when ``max_workers == 1``,
        otherwise to the concurrent path.
        """
        # Ceiling division: number of batches needed to cover all rows.
        step.state.total_batches = (len(df) + self.config.batch_size - 1) // self.config.batch_size
        # NOTE(review): load_state() runs after total_batches is set — if it
        # restores a saved state wholesale it may overwrite that value;
        # confirm against PipelineStep.load_state.
        step.load_state()

        logging.info(f"Starting {step.name} with {step.state.total_batches} batches")

        if self.config.max_workers == 1:
            return self.process_sequential(step, df)
        else:
            return self.process_concurrent(step, df)
|
||||||
@@ -0,0 +1,80 @@
|
|||||||
|
import logging
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
class DatasetAnalyzer:
    """Analyze dataset statistics and quality"""

    def __init__(self, filepath: str):
        self.filepath = filepath
        self.df = None  # populated by load_data()

    def load_data(self) -> bool:
        """Read the CSV into memory; return True on success, False otherwise."""
        try:
            self.df = pd.read_csv(self.filepath)
        except Exception as e:
            logging.error(f"Failed to load {self.filepath}: {e}")
            return False
        return True

    def analyze_completion(self) -> Dict:
        """Summarize annotation and name-completeness counts/percentages.

        Returns an empty dict when no data has been loaded.
        """
        if self.df is None:
            return {}

        frame = self.df
        total_rows = len(frame)

        # Annotation status: a missing column means nothing is annotated yet.
        if "annotated" in frame.columns:
            annotated_count = (frame["annotated"] == 1).sum()
            unannotated_count = (frame["annotated"] == 0).sum()
        else:
            annotated_count = 0
            unannotated_count = total_rows

        # A name is "complete" when both identified parts are present.
        complete_names = 0
        if "identified_name" in frame.columns and "identified_surname" in frame.columns:
            both_present = frame["identified_name"].notna() & frame["identified_surname"].notna()
            complete_names = both_present.sum()

        def as_pct(count):
            # Guard against empty datasets.
            return (count / total_rows * 100) if total_rows > 0 else 0

        return {
            "total_rows": total_rows,
            "annotated_rows": annotated_count,
            "unannotated_rows": unannotated_count,
            "annotation_percentage": as_pct(annotated_count),
            "complete_names": complete_names,
            "completeness_percentage": as_pct(complete_names),
        }

    def analyze_quality(self) -> Dict:
        """Collect missing-value counts and, when the columns exist,
        name-length statistics and the word-count distribution.
        """
        if self.df is None:
            return {}

        frame = self.df
        metrics = {"missing_values": frame.isnull().sum().to_dict()}

        if "name" in frame.columns:
            lengths = frame["name"].str.len()
            metrics["name_length"] = {
                "mean": lengths.mean(),
                "median": lengths.median(),
                "min": lengths.min(),
                "max": lengths.max(),
            }

        if "words" in frame.columns:
            metrics["word_distribution"] = frame["words"].value_counts().sort_index().to_dict()

        return metrics
|
||||||
@@ -0,0 +1,179 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional, Dict
|
||||||
|
|
||||||
|
from core.config.config_manager import ConfigManager
|
||||||
|
from core.config.project_paths import ProjectPaths
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineMonitor:
    """Monitor and manage pipeline execution.

    Reads per-step ``pipeline_state.json`` files under the checkpoints
    directory to report progress, and offers maintenance helpers to count,
    prune or reset checkpoint files.
    """

    def __init__(self, paths: Optional[ProjectPaths] = None):
        if paths is None:
            # Use default configuration if none provided
            config_manager = ConfigManager()
            paths = config_manager.default_paths

        self.paths = paths
        self.checkpoint_dir = paths.checkpoints_dir
        # Fixed, ordered list of step names the monitor knows about.
        self.steps = ["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"]

    def get_step_status(self, step_name: str) -> Dict:
        """Get status of a specific pipeline step.

        A missing state file means "not_started"; an unreadable one yields
        an "error" status dict containing the exception text.
        """
        step_dir = self.checkpoint_dir / step_name
        state_file = step_dir / "pipeline_state.json"

        if not state_file.exists():
            return {
                "step": step_name,
                "status": "not_started",
                "processed_batches": 0,
                "total_batches": 0,
                "failed_batches": 0,
                "completion_percentage": 0.0,
            }

        try:
            with open(state_file, "r") as f:
                state = json.load(f)

            processed = state.get("processed_batches", 0)
            total = state.get("total_batches", 0)
            failed = len(state.get("failed_batches", []))

            # Derive a status label from the batch counters.
            if total == 0:
                completion = 0.0
                status = "not_started"
            elif processed >= total:
                completion = 100.0
                status = "completed" if failed == 0 else "completed_with_errors"
            else:
                completion = (processed / total) * 100
                status = "in_progress"

            return {
                "step": step_name,
                "status": status,
                "processed_batches": processed,
                "total_batches": total,
                "failed_batches": failed,
                "completion_percentage": completion,
                "last_checkpoint": state.get("last_checkpoint"),
                "failed_batch_ids": state.get("failed_batches", []),
            }

        except Exception as e:
            logging.error(f"Error reading state for {step_name}: {e}")
            # NOTE: this dict lacks the batch-counter keys the other
            # branches provide — see the review note in print_status.
            return {"step": step_name, "status": "error", "error": str(e)}

    def get_pipeline_status(self) -> Dict:
        """Get overall pipeline status.

        Aggregates per-step statuses; the overall label is the last
        non-"completed" condition encountered while iterating the steps,
        and overall completion is the plain average of step percentages.
        """
        step_statuses = {}
        overall_status = "not_started"
        total_completion = 0.0

        for step in self.steps:
            status = self.get_step_status(step)
            step_statuses[step] = status

            # Later steps can overwrite the overall label; "completed" is
            # only assigned after the loop when the average reaches 100%.
            if status["status"] == "error":
                overall_status = "error"
            elif status["status"] in ["in_progress"]:
                overall_status = "in_progress"
            elif status["status"] == "completed_with_errors":
                overall_status = "completed_with_errors"

            total_completion += status.get("completion_percentage", 0)

        avg_completion = total_completion / len(self.steps)

        if avg_completion >= 100 and overall_status not in ["error", "completed_with_errors"]:
            overall_status = "completed"

        return {
            "overall_status": overall_status,
            "overall_completion": avg_completion,
            "steps": step_statuses,
            "timestamp": datetime.now().isoformat(),
        }

    def print_status(self, detailed: bool = False):
        """Print pipeline status in a human-readable format.

        NOTE(review): a step whose status is "error" is returned without
        'completion_percentage'/'processed_batches' keys, so the prints
        below would raise KeyError for it — confirm and harden if needed.
        """
        status = self.get_pipeline_status()

        print("\n=== Pipeline Status ===")
        print(f"Overall Status: {status['overall_status'].upper()}")
        print(f"Overall Completion: {status['overall_completion']:.1f}%")
        print(f"Last Updated: {status['timestamp']}")
        print()

        for step_name, step_status in status["steps"].items():
            print(f"{step_name.replace('_', ' ').title()}:")
            print(f"  Status: {step_status['status']}")
            print(f"  Progress: {step_status['completion_percentage']:.1f}%")
            print(f"  Batches: {step_status['processed_batches']}/{step_status['total_batches']}")

            if step_status["failed_batches"] > 0:
                print(f"  Failed Batches: {step_status['failed_batches']}")

            if detailed and "failed_batch_ids" in step_status:
                print(f"  Failed Batch IDs: {step_status['failed_batch_ids']}")

            print()

    def count_checkpoint_files(self) -> Dict:
        """Count checkpoint files for each step.

        Returns per-step file counts and sizes in MB, plus a
        "total_size_mb" entry covering all steps.
        """
        counts = {}
        total_size = 0

        for step in self.steps:
            step_dir = self.checkpoint_dir / step
            if step_dir.exists():
                csv_files = list(step_dir.glob("*.csv"))
                step_size = sum(f.stat().st_size for f in csv_files)
                counts[step] = {"files": len(csv_files), "size_mb": step_size / (1024 * 1024)}
                total_size += step_size
            else:
                counts[step] = {"files": 0, "size_mb": 0}

        counts["total_size_mb"] = total_size / (1024 * 1024)
        return counts

    def clean_step_checkpoints(self, step_name: str, keep_last: int = 1):
        """Clean checkpoint files for a specific step.

        Deletes all but the lexicographically-last *keep_last* batch CSVs;
        keep_last=0 deletes everything.
        """
        step_dir = self.checkpoint_dir / step_name

        if not step_dir.exists():
            logging.info(f"No checkpoints found for {step_name}")
            return

        csv_files = sorted(step_dir.glob("batch_*.csv"))

        if len(csv_files) <= keep_last:
            logging.info(f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all")
            return

        files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files

        for file_path in files_to_delete:
            try:
                file_path.unlink()
                logging.info(f"Deleted {file_path}")
            except Exception as e:
                logging.error(f"Failed to delete {file_path}: {e}")

    def reset_step(self, step_name: str):
        """Reset a pipeline step by removing its checkpoints and state."""
        step_dir = self.checkpoint_dir / step_name

        if step_dir.exists():
            try:
                # Removes batch CSVs and pipeline_state.json together.
                shutil.rmtree(step_dir)
                logging.info(f"Reset step: {step_name}")
            except Exception as e:
                logging.error(f"Failed to reset {step_name}: {e}")
        else:
            logging.info(f"Step {step_name} has no checkpoints to reset")
||||||
@@ -0,0 +1,57 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from typing import Dict, Any
|
||||||
|
import time
|
||||||
|
|
||||||
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
from processing.batch.batch_processor import BatchProcessor
|
||||||
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
|
class Pipeline:
    """Main pipeline orchestrator.

    Holds an ordered list of PipelineStep objects and runs them through a
    BatchProcessor, feeding each step the previous step's output.
    """

    def __init__(self, config: BatchConfig):
        self.config = config
        self.processor = BatchProcessor(config)
        self.steps = []  # ordered PipelineStep instances, executed front to back

    def add_step(self, step: PipelineStep):
        """Add a processing step to the pipeline"""
        self.steps.append(step)

    def run(self, input_data: pd.DataFrame) -> pd.DataFrame:
        """Run the complete pipeline.

        The input frame is copied first so the caller's data is not
        mutated. Failed batches are logged but do not abort the run.
        """
        current_data = input_data.copy()

        for step in self.steps:
            logging.info(f"Running pipeline step: {step.name}")
            start_time = time.time()

            current_data = self.processor.process(step, current_data)

            elapsed_time = time.time() - start_time
            logging.info(f"Completed {step.name} in {elapsed_time:.2f} seconds")

            if step.state.failed_batches:
                logging.warning(
                    f"Step {step.name} had {len(step.state.failed_batches)} failed batches"
                )

        return current_data

    def get_progress(self) -> Dict[str, Any]:
        """Get progress information for all steps"""
        progress = {}
        for step in self.steps:
            progress[step.name] = {
                "processed_batches": step.state.processed_batches,
                "total_batches": step.state.total_batches,
                "failed_batches": len(step.state.failed_batches),
                # max(1, ...) avoids division by zero before the first run.
                "completion_percentage": (
                    step.state.processed_batches / max(1, step.state.total_batches)
                )
                * 100,
            }
        return progress
|
||||||
@@ -1,119 +0,0 @@
|
|||||||
import os
|
|
||||||
import argparse
|
|
||||||
import pandas as pd
|
|
||||||
from misc import DATA_DIR, REGION_MAPPING, logging
|
|
||||||
|
|
||||||
|
|
||||||
def clean(filepath) -> pd.DataFrame:
    """
    Clean the CSV at *filepath* in place and return the cleaned DataFrame.

    Candidate encodings are tried in order until one parses. The file is
    read in chunks so large files fit in memory; rows missing any essential
    column (name/sex/region) are dropped, and every string column is purged
    of null bytes and non-breaking spaces, space-collapsed, stripped and
    lower-cased. The result is written back to *filepath* as UTF-8.

    Raises:
        UnicodeError: if no candidate encoding can parse the file.
    """
    encodings = ['utf-8', 'utf-16', 'latin1']
    for enc in encodings:
        try:
            logging.info(f"Trying to read {filepath} with encoding: {enc}")
            # Use chunked reading to handle large files
            chunks = pd.read_csv(filepath, encoding=enc, chunksize=100_000, on_bad_lines='skip')
            cleaned_chunks = []

            for chunk in chunks:
                # Drop rows with essential missing values early
                chunk = chunk.dropna(subset=['name', 'sex', 'region'])

                # Clean string columns in-place
                for col in chunk.select_dtypes(include='object').columns:
                    chunk[col] = (
                        chunk[col]
                        .astype(str)
                        .str.replace('\x00', ' ', regex=False)
                        .str.replace('\u00a0', ' ', regex=False)
                        .str.replace(' +', ' ', regex=True)
                        .str.strip()
                        .str.lower()
                    )

                cleaned_chunks.append(chunk)

            df = pd.concat(cleaned_chunks, ignore_index=True)
            df.to_csv(filepath, index=False, encoding='utf-8')
            logging.info(f"Successfully read with encoding: {enc}")
            return df
        except Exception as e:
            # Log why this encoding failed instead of silently discarding
            # the error, then try the next candidate.
            logging.warning(f"Failed to read {filepath} with encoding {enc}: {e}")
            continue
    # BUG FIX: the original raised UnicodeDecodeError(msg), but that builtin
    # requires five positional arguments (encoding, object, start, end,
    # reason), so the raise itself crashed with TypeError. UnicodeError
    # accepts a plain message and stays in the same exception family.
    raise UnicodeError(f"Unable to decode {filepath} with common encodings.")
|
|
||||||
|
|
||||||
|
|
||||||
def process(df: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
"""
|
|
||||||
Process the DataFrame to extract features and clean data.
|
|
||||||
This includes counting words, calculating name length, and extracting probable native names and surnames.
|
|
||||||
Also maps regions to provinces based on REGION_MAPPING.
|
|
||||||
"""
|
|
||||||
|
|
||||||
logging.info("Preprocessing names")
|
|
||||||
df['words'] = df['name'].str.count(' ') + 1
|
|
||||||
df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()
|
|
||||||
df['year'] = df['year'].astype(int)
|
|
||||||
|
|
||||||
# Calculate probable_native and probable_surname
|
|
||||||
name_split = df['name'].str.split()
|
|
||||||
df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
|
|
||||||
df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')
|
|
||||||
df['identified_category'] = df['words'].apply(lambda x: 'compose' if x > 3 else 'simple')
|
|
||||||
df['identified_name'] = None
|
|
||||||
df['identified_surname'] = None
|
|
||||||
df['annotated'] = 0
|
|
||||||
|
|
||||||
# We can assume that if a name has exactly 3 words, the first two are the native name and the last is the surname
|
|
||||||
# This is a common pattern in Congolese names
|
|
||||||
three_word_mask = df['words'] == 3
|
|
||||||
df.loc[three_word_mask, 'identified_name'] = df.loc[three_word_mask, 'probable_native']
|
|
||||||
df.loc[three_word_mask, 'identified_surname'] = df.loc[three_word_mask, 'probable_surname']
|
|
||||||
df.loc[three_word_mask, 'annotated'] = 1
|
|
||||||
|
|
||||||
logging.info("Mapping regions to provinces")
|
|
||||||
df['province'] = df['region'].map(lambda r: REGION_MAPPING.get(r, ('AUTRES', 'AUTRES'))[1])
|
|
||||||
df['province'] = df['province'].str.lower()
|
|
||||||
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def save_artifacts(df: pd.DataFrame, split_eval: bool = True, split_by_sex: bool = True) -> None:
|
|
||||||
"""
|
|
||||||
Splits the input DataFrame into evaluation and featured datasets, saves them as CSV files,
|
|
||||||
and additionally saves separate CSV files for male and female entries if requested.
|
|
||||||
"""
|
|
||||||
|
|
||||||
if split_eval:
|
|
||||||
logging.info("Saving evaluation and featured datasets")
|
|
||||||
eval_idx = df.sample(frac=0.2, random_state=42).index
|
|
||||||
df_evaluation = df.loc[eval_idx]
|
|
||||||
df_featured = df.drop(index=eval_idx)
|
|
||||||
df_evaluation.to_csv(os.path.join(DATA_DIR, 'names_evaluation.csv'), index=False)
|
|
||||||
df_featured.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
|
|
||||||
else:
|
|
||||||
df.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
|
|
||||||
|
|
||||||
if split_by_sex:
|
|
||||||
logging.info("Saving by sex")
|
|
||||||
df[df['sex'] == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
|
|
||||||
df[df['sex'] == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)
|
|
||||||
|
|
||||||
|
|
||||||
def main(split_eval: bool = True, split_by_sex: bool = True):
|
|
||||||
df = process(clean(os.path.join(DATA_DIR, 'names.csv')))
|
|
||||||
save_artifacts(df, split_eval=split_eval, split_by_sex=split_by_sex)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
parser = argparse.ArgumentParser(description="Prepare name datasets with optional splits.")
|
|
||||||
|
|
||||||
parser.add_argument('--split_eval', action='store_true', default=True, help="Split into evaluation and featured datasets (default: True)")
|
|
||||||
parser.add_argument('--no-split_eval', action='store_false', dest='split_eval', help="Do not split into evaluation and featured datasets")
|
|
||||||
parser.add_argument('--split_by_sex', action='store_true', default=True, help="Split by sex into male/female datasets (default: True)")
|
|
||||||
parser.add_argument('--no-split_by_sex', action='store_false', dest='split_by_sex', help="Do not split by sex into male/female datasets")
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
main(split_eval=args.split_eval, split_by_sex=args.split_by_sex)
|
|
||||||
@@ -0,0 +1,111 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PipelineState:
|
||||||
|
"""Tracks the state of pipeline execution"""
|
||||||
|
|
||||||
|
processed_batches: int = 0
|
||||||
|
total_batches: int = 0
|
||||||
|
failed_batches: List[int] = None
|
||||||
|
last_checkpoint: Optional[str] = None
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
if self.failed_batches is None:
|
||||||
|
self.failed_batches = []
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineStep(ABC):
|
||||||
|
"""Abstract base class for pipeline steps"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
|
||||||
|
):
|
||||||
|
self.name = name
|
||||||
|
self.pipeline_config = pipeline_config
|
||||||
|
|
||||||
|
# Use provided batch_config or create default from pipeline config
|
||||||
|
if batch_config is None:
|
||||||
|
batch_config = BatchConfig(
|
||||||
|
batch_size=pipeline_config.processing.batch_size,
|
||||||
|
max_workers=pipeline_config.processing.max_workers,
|
||||||
|
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
||||||
|
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
|
||||||
|
)
|
||||||
|
self.batch_config = batch_config
|
||||||
|
self.state = PipelineState()
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
|
"""Process a single batch of data"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def get_checkpoint_path(self, batch_id: int) -> str:
|
||||||
|
"""Get the checkpoint file path for a batch"""
|
||||||
|
checkpoint_dir = self.pipeline_config.paths.checkpoints_dir / self.name
|
||||||
|
checkpoint_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
return str(checkpoint_dir / f"batch_{batch_id:06d}.csv")
|
||||||
|
|
||||||
|
def get_state_path(self) -> str:
|
||||||
|
"""Get the state file path"""
|
||||||
|
state_dir = self.pipeline_config.paths.checkpoints_dir / self.name
|
||||||
|
state_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
return str(state_dir / "pipeline_state.json")
|
||||||
|
|
||||||
|
def save_state(self):
|
||||||
|
"""Save pipeline state to disk"""
|
||||||
|
state_file = self.get_state_path()
|
||||||
|
with open(state_file, "w") as f:
|
||||||
|
json.dump(
|
||||||
|
{
|
||||||
|
"processed_batches": self.state.processed_batches,
|
||||||
|
"total_batches": self.state.total_batches,
|
||||||
|
"failed_batches": self.state.failed_batches,
|
||||||
|
"last_checkpoint": self.state.last_checkpoint,
|
||||||
|
},
|
||||||
|
f,
|
||||||
|
)
|
||||||
|
|
||||||
|
def load_state(self) -> bool:
|
||||||
|
"""Load pipeline state from disk. Returns True if state was loaded."""
|
||||||
|
state_file = self.get_state_path()
|
||||||
|
if os.path.exists(state_file):
|
||||||
|
try:
|
||||||
|
with open(state_file, "r") as f:
|
||||||
|
state_data = json.load(f)
|
||||||
|
self.state.processed_batches = state_data.get("processed_batches", 0)
|
||||||
|
self.state.total_batches = state_data.get("total_batches", 0)
|
||||||
|
self.state.failed_batches = state_data.get("failed_batches", [])
|
||||||
|
self.state.last_checkpoint = state_data.get("last_checkpoint")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f"Failed to load state: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def batch_exists(self, batch_id: int) -> bool:
|
||||||
|
"""Check if a batch has already been processed (idempotency)"""
|
||||||
|
checkpoint_path = self.get_checkpoint_path(batch_id)
|
||||||
|
return os.path.exists(checkpoint_path)
|
||||||
|
|
||||||
|
def save_batch(self, batch: pd.DataFrame, batch_id: int):
|
||||||
|
"""Save processed batch to checkpoint"""
|
||||||
|
checkpoint_path = self.get_checkpoint_path(batch_id)
|
||||||
|
batch.to_csv(checkpoint_path, index=False)
|
||||||
|
logging.info(f"Saved batch {batch_id} to {checkpoint_path}")
|
||||||
|
|
||||||
|
def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
|
||||||
|
"""Load processed batch from checkpoint"""
|
||||||
|
checkpoint_path = self.get_checkpoint_path(batch_id)
|
||||||
|
if os.path.exists(checkpoint_path):
|
||||||
|
return pd.read_csv(checkpoint_path)
|
||||||
|
return None
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
from core.utils.text_cleaner import TextCleaner
|
||||||
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
|
class DataCleaningStep(PipelineStep):
|
||||||
|
"""Configuration-driven data cleaning step"""
|
||||||
|
|
||||||
|
def __init__(self, pipeline_config: PipelineConfig):
|
||||||
|
super().__init__("data_cleaning", pipeline_config)
|
||||||
|
self.text_cleaner = TextCleaner()
|
||||||
|
self.required_columns = ["name", "sex", "region"]
|
||||||
|
|
||||||
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
|
"""Process a single batch for data cleaning"""
|
||||||
|
logging.info(f"Cleaning batch {batch_id} with {len(batch)} rows")
|
||||||
|
|
||||||
|
# Drop rows with essential missing values
|
||||||
|
batch = batch.dropna(subset=self.required_columns)
|
||||||
|
|
||||||
|
# Apply text cleaning
|
||||||
|
batch = self.text_cleaner.clean_dataframe_text_columns(batch)
|
||||||
|
|
||||||
|
return batch
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
from processing.steps.feature_extraction_step import Gender
|
||||||
|
from core.utils.data_loader import DataLoader
|
||||||
|
|
||||||
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
|
class DataSplittingStep(PipelineStep):
|
||||||
|
"""Configuration-driven data splitting step"""
|
||||||
|
|
||||||
|
def __init__(self, pipeline_config: PipelineConfig):
|
||||||
|
batch_config = BatchConfig(
|
||||||
|
batch_size=pipeline_config.processing.batch_size,
|
||||||
|
max_workers=1, # No need for parallelism in splitting
|
||||||
|
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
||||||
|
use_multiprocessing=False,
|
||||||
|
)
|
||||||
|
super().__init__("data_splitting", pipeline_config, batch_config)
|
||||||
|
self.data_loader = DataLoader(pipeline_config)
|
||||||
|
self.eval_indices = None
|
||||||
|
|
||||||
|
def determine_eval_indices(self, total_size: int) -> set:
|
||||||
|
"""Determine evaluation indices consistently across batches"""
|
||||||
|
if self.eval_indices is None:
|
||||||
|
np.random.seed(self.pipeline_config.data.random_seed)
|
||||||
|
eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
|
||||||
|
self.eval_indices = set(np.random.choice(total_size, size=eval_size, replace=False))
|
||||||
|
return self.eval_indices
|
||||||
|
|
||||||
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
|
"""Process batch for data splitting - no modification needed"""
|
||||||
|
return batch.copy()
|
||||||
|
|
||||||
|
def save_splits(self, df: pd.DataFrame) -> None:
|
||||||
|
"""Save the split datasets based on configuration"""
|
||||||
|
output_files = self.pipeline_config.data.output_files
|
||||||
|
data_dir = self.pipeline_config.paths.data_dir
|
||||||
|
|
||||||
|
if self.pipeline_config.data.split_evaluation:
|
||||||
|
eval_indices = self.determine_eval_indices(len(df))
|
||||||
|
eval_mask = df.index.isin(eval_indices)
|
||||||
|
|
||||||
|
df_evaluation = df[eval_mask]
|
||||||
|
df_featured = df[~eval_mask]
|
||||||
|
|
||||||
|
self.data_loader.save_csv(df_evaluation, data_dir / output_files["evaluation"])
|
||||||
|
self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
|
||||||
|
else:
|
||||||
|
self.data_loader.save_csv(df, data_dir / output_files["featured"])
|
||||||
|
|
||||||
|
if self.pipeline_config.data.split_by_gender:
|
||||||
|
df_males = df[df["sex"] == Gender.MALE.value]
|
||||||
|
df_females = df[df["sex"] == Gender.FEMALE.value]
|
||||||
|
|
||||||
|
self.data_loader.save_csv(df_males, data_dir / output_files["males"])
|
||||||
|
self.data_loader.save_csv(df_females, data_dir / output_files["females"])
|
||||||
@@ -0,0 +1,99 @@
|
|||||||
|
import logging
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
from core.utils.region_mapper import RegionMapper
|
||||||
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
|
class Gender(Enum):
|
||||||
|
MALE = "m"
|
||||||
|
FEMALE = "f"
|
||||||
|
|
||||||
|
|
||||||
|
class NameCategory(Enum):
|
||||||
|
SIMPLE = "simple"
|
||||||
|
COMPOSE = "compose"
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureExtractionStep(PipelineStep):
|
||||||
|
"""Configuration-driven feature extraction step"""
|
||||||
|
|
||||||
|
def __init__(self, pipeline_config: PipelineConfig):
|
||||||
|
super().__init__("feature_extraction", pipeline_config)
|
||||||
|
self.region_mapper = RegionMapper()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def validate_gender(cls, gender: str) -> Gender:
|
||||||
|
"""Validate and normalize gender value"""
|
||||||
|
gender_lower = gender.lower().strip()
|
||||||
|
if gender_lower in ["m", "male", "homme", "masculin"]:
|
||||||
|
return Gender.MALE
|
||||||
|
elif gender_lower in ["f", "female", "femme", "féminin"]:
|
||||||
|
return Gender.FEMALE
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown gender: {gender}")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_name_category(cls, word_count: int) -> NameCategory:
|
||||||
|
"""Determine name category based on word count"""
|
||||||
|
if word_count <= 3:
|
||||||
|
return NameCategory.SIMPLE
|
||||||
|
else:
|
||||||
|
return NameCategory.COMPOSE
|
||||||
|
|
||||||
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
|
"""Extract features from names in batch"""
|
||||||
|
logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")
|
||||||
|
|
||||||
|
batch = batch.copy()
|
||||||
|
|
||||||
|
# Basic features
|
||||||
|
batch["words"] = batch["name"].str.count(" ") + 1
|
||||||
|
batch["length"] = batch["name"].str.replace(" ", "", regex=False).str.len()
|
||||||
|
|
||||||
|
# Handle year column
|
||||||
|
if "year" in batch.columns:
|
||||||
|
batch["year"] = pd.to_numeric(batch["year"], errors="coerce").astype("Int64")
|
||||||
|
|
||||||
|
# Initialize new columns
|
||||||
|
batch["probable_native"] = None
|
||||||
|
batch["probable_surname"] = None
|
||||||
|
batch["identified_name"] = None
|
||||||
|
batch["identified_surname"] = None
|
||||||
|
batch["annotated"] = 0
|
||||||
|
|
||||||
|
# Vectorized category assignment
|
||||||
|
batch["identified_category"] = batch["words"].apply(
|
||||||
|
lambda x: self.get_name_category(x).value
|
||||||
|
)
|
||||||
|
|
||||||
|
# Assign probable_native and probable_surname for all names
|
||||||
|
name_splits = batch["name"].str.split()
|
||||||
|
batch["probable_native"] = name_splits.apply(
|
||||||
|
lambda x: " ".join(x[:-1]) if isinstance(x, list) and len(x) >= 2 else None
|
||||||
|
)
|
||||||
|
batch["probable_surname"] = name_splits.apply(
|
||||||
|
lambda x: x[-1] if isinstance(x, list) and len(x) >= 2 else None
|
||||||
|
)
|
||||||
|
|
||||||
|
# Auto-assign for 3-word names
|
||||||
|
three_word_mask = batch["words"] == 3
|
||||||
|
batch.loc[three_word_mask, "identified_name"] = batch.loc[
|
||||||
|
three_word_mask, "probable_native"
|
||||||
|
]
|
||||||
|
batch.loc[three_word_mask, "identified_surname"] = batch.loc[
|
||||||
|
three_word_mask, "probable_surname"
|
||||||
|
]
|
||||||
|
batch.loc[three_word_mask, "annotated"] = 1
|
||||||
|
|
||||||
|
# Map regions to provinces
|
||||||
|
batch["province"] = self.region_mapper.map_regions_vectorized(batch["region"])
|
||||||
|
|
||||||
|
# Normalize gender
|
||||||
|
if "sex" in batch.columns:
|
||||||
|
batch["sex"] = batch["sex"].apply(lambda x: self.validate_gender(str(x)).value)
|
||||||
|
|
||||||
|
return batch
|
||||||
@@ -0,0 +1,168 @@
|
|||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
import ollama
|
||||||
|
import pandas as pd
|
||||||
|
from pydantic import ValidationError, BaseModel
|
||||||
|
|
||||||
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
from core.utils.prompt_manager import PromptManager
|
||||||
|
from core.utils.rate_limiter import RateLimiter
|
||||||
|
from core.utils.rate_limiter import RateLimitConfig
|
||||||
|
from processing.batch.batch_config import BatchConfig
|
||||||
|
from processing.steps import PipelineStep
|
||||||
|
|
||||||
|
|
||||||
|
class NameAnnotation(BaseModel):
|
||||||
|
"""Model for name annotation results"""
|
||||||
|
|
||||||
|
identified_name: Optional[str]
|
||||||
|
identified_surname: Optional[str]
|
||||||
|
|
||||||
|
|
||||||
|
class LLMAnnotationStep(PipelineStep):
|
||||||
|
"""Configuration-driven LLM annotation step"""
|
||||||
|
|
||||||
|
def __init__(self, pipeline_config: PipelineConfig):
|
||||||
|
# Create custom batch config for LLM processing
|
||||||
|
batch_config = BatchConfig(
|
||||||
|
batch_size=pipeline_config.processing.batch_size,
|
||||||
|
max_workers=min(
|
||||||
|
pipeline_config.llm.max_concurrent_requests, pipeline_config.processing.max_workers
|
||||||
|
),
|
||||||
|
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
||||||
|
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
|
||||||
|
)
|
||||||
|
super().__init__("llm_annotation", pipeline_config, batch_config)
|
||||||
|
|
||||||
|
self.prompt = PromptManager(pipeline_config).load_prompt()
|
||||||
|
self.rate_limiter = (
|
||||||
|
self._create_rate_limiter() if pipeline_config.llm.enable_rate_limiting else None
|
||||||
|
)
|
||||||
|
|
||||||
|
# Statistics
|
||||||
|
self.successful_requests = 0
|
||||||
|
self.failed_requests = 0
|
||||||
|
self.total_retry_attempts = 0
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
def _create_rate_limiter(self):
|
||||||
|
"""Create rate limiter based on configuration"""
|
||||||
|
rate_config = RateLimitConfig(
|
||||||
|
requests_per_minute=self.pipeline_config.llm.requests_per_minute,
|
||||||
|
requests_per_second=self.pipeline_config.llm.requests_per_second,
|
||||||
|
)
|
||||||
|
return RateLimiter(rate_config)
|
||||||
|
|
||||||
|
def analyze_name_with_retry(self, client: ollama.Client, name: str, row_id: int) -> Dict:
|
||||||
|
"""Analyze a name with retry logic and rate limiting"""
|
||||||
|
for attempt in range(self.pipeline_config.llm.retry_attempts):
|
||||||
|
try:
|
||||||
|
# Apply rate limiting if enabled
|
||||||
|
if self.rate_limiter:
|
||||||
|
self.rate_limiter.wait_if_needed()
|
||||||
|
|
||||||
|
start_time = time.time()
|
||||||
|
response = client.chat(
|
||||||
|
model=self.pipeline_config.llm.model_name,
|
||||||
|
messages=[
|
||||||
|
{"role": "system", "content": self.prompt},
|
||||||
|
{"role": "user", "content": name},
|
||||||
|
],
|
||||||
|
format=NameAnnotation.model_json_schema(),
|
||||||
|
)
|
||||||
|
elapsed_time = time.time() - start_time
|
||||||
|
|
||||||
|
if elapsed_time > self.pipeline_config.llm.timeout_seconds:
|
||||||
|
raise TimeoutError(
|
||||||
|
f"Request took {elapsed_time:.2f}s, exceeding {self.pipeline_config.llm.timeout_seconds}s timeout"
|
||||||
|
)
|
||||||
|
|
||||||
|
annotation = NameAnnotation.model_validate_json(response.message.content)
|
||||||
|
result = {
|
||||||
|
**annotation.model_dump(),
|
||||||
|
"annotated": 1,
|
||||||
|
"processing_time": elapsed_time,
|
||||||
|
"attempts": attempt + 1,
|
||||||
|
}
|
||||||
|
|
||||||
|
self.successful_requests += 1
|
||||||
|
if attempt > 0:
|
||||||
|
self.total_retry_attempts += attempt
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
except (ValidationError, TimeoutError, Exception) as e:
|
||||||
|
logging.warning(
|
||||||
|
f"Error analyzing '{name}' (attempt {attempt + 1}/{self.pipeline_config.llm.retry_attempts}): {e}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Exponential backoff with jitter
|
||||||
|
if attempt < self.pipeline_config.llm.retry_attempts - 1:
|
||||||
|
wait_time = (2**attempt) + (time.time() % 1)
|
||||||
|
time.sleep(min(wait_time, 10))
|
||||||
|
|
||||||
|
self.failed_requests += 1
|
||||||
|
return {
|
||||||
|
"identified_name": None,
|
||||||
|
"identified_surname": None,
|
||||||
|
"annotated": 0,
|
||||||
|
"processing_time": 0,
|
||||||
|
"attempts": self.pipeline_config.llm.retry_attempts,
|
||||||
|
"failed": True,
|
||||||
|
}
|
||||||
|
|
||||||
|
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||||
|
"""Process batch with LLM annotation"""
|
||||||
|
unannotated_mask = batch.get("annotated", 0) == 0
|
||||||
|
unannotated_entries = batch[unannotated_mask]
|
||||||
|
|
||||||
|
if unannotated_entries.empty:
|
||||||
|
logging.info(f"Batch {batch_id}: No entries to annotate")
|
||||||
|
return batch
|
||||||
|
|
||||||
|
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries")
|
||||||
|
|
||||||
|
batch = batch.copy()
|
||||||
|
client = ollama.Client()
|
||||||
|
|
||||||
|
# Process with controlled concurrency
|
||||||
|
max_workers = self.pipeline_config.llm.max_concurrent_requests
|
||||||
|
|
||||||
|
if len(unannotated_entries) == 1 or max_workers == 1:
|
||||||
|
# Sequential processing
|
||||||
|
for idx, row in unannotated_entries.iterrows():
|
||||||
|
result = self.analyze_name_with_retry(client, row["name"], idx)
|
||||||
|
for field, value in result.items():
|
||||||
|
if field not in ["failed"]:
|
||||||
|
batch.loc[idx, field] = value
|
||||||
|
else:
|
||||||
|
# Concurrent processing
|
||||||
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||||
|
future_to_idx = {}
|
||||||
|
|
||||||
|
for idx, row in unannotated_entries.iterrows():
|
||||||
|
future = executor.submit(self.analyze_name_with_retry, client, row["name"], idx)
|
||||||
|
future_to_idx[future] = idx
|
||||||
|
|
||||||
|
for future in as_completed(future_to_idx):
|
||||||
|
idx = future_to_idx[future]
|
||||||
|
try:
|
||||||
|
result = future.result()
|
||||||
|
for field, value in result.items():
|
||||||
|
if field not in ["failed"]:
|
||||||
|
batch.loc[idx, field] = value
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Failed to process row {idx}: {e}")
|
||||||
|
batch.loc[idx, "annotated"] = 0
|
||||||
|
|
||||||
|
# Ensure proper data types
|
||||||
|
batch["annotated"] = (
|
||||||
|
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
|
||||||
|
)
|
||||||
|
|
||||||
|
return batch
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
import ollama
|
|
||||||
from pydantic import BaseModel
|
|
||||||
|
|
||||||
from misc import load_prompt
|
|
||||||
|
|
||||||
|
|
||||||
class NameAnalysis(BaseModel):
|
|
||||||
identified_name: str | None
|
|
||||||
identified_surname: str | None
|
|
||||||
|
|
||||||
|
|
||||||
name = input("Enter name: ")
|
|
||||||
|
|
||||||
client = ollama.Client()
|
|
||||||
response = client.chat(
|
|
||||||
model="mistral:7b",
|
|
||||||
messages=[
|
|
||||||
{"role": "system", "content": load_prompt()},
|
|
||||||
{"role": "user", "content": name}
|
|
||||||
],
|
|
||||||
format=NameAnalysis.model_json_schema()
|
|
||||||
)
|
|
||||||
analysis = NameAnalysis.model_validate_json(response.message.content)
|
|
||||||
result = analysis.model_dump()
|
|
||||||
|
|
||||||
print(result)
|
|
||||||
+126
-1
@@ -1,53 +1,178 @@
|
|||||||
absl-py==2.3.0
|
absl-py==2.3.0
|
||||||
|
altair==5.1.2
|
||||||
|
annotated-types==0.7.0
|
||||||
|
anyio==4.9.0
|
||||||
|
appnope==0.1.4
|
||||||
|
argon2-cffi==25.1.0
|
||||||
|
argon2-cffi-bindings==21.2.0
|
||||||
|
arrow==1.3.0
|
||||||
|
asttokens==3.0.0
|
||||||
astunparse==1.6.3
|
astunparse==1.6.3
|
||||||
|
async-lru==2.0.5
|
||||||
|
attrs==25.3.0
|
||||||
|
babel==2.17.0
|
||||||
|
beautifulsoup4==4.13.4
|
||||||
|
black==25.1.0
|
||||||
|
bleach==6.2.0
|
||||||
|
blinker==1.9.0
|
||||||
|
cachetools==6.1.0
|
||||||
certifi==2025.6.15
|
certifi==2025.6.15
|
||||||
|
cffi==1.17.1
|
||||||
charset-normalizer==3.4.2
|
charset-normalizer==3.4.2
|
||||||
|
click==8.2.1
|
||||||
|
comm==0.2.2
|
||||||
contourpy==1.3.2
|
contourpy==1.3.2
|
||||||
cycler==0.12.1
|
cycler==0.12.1
|
||||||
|
debugpy==1.8.14
|
||||||
|
decorator==5.2.1
|
||||||
|
defusedxml==0.7.1
|
||||||
|
executing==2.2.0
|
||||||
|
fastjsonschema==2.21.1
|
||||||
|
flake8==7.3.0
|
||||||
flatbuffers==25.2.10
|
flatbuffers==25.2.10
|
||||||
fonttools==4.58.4
|
fonttools==4.58.4
|
||||||
|
fqdn==1.5.1
|
||||||
gast==0.6.0
|
gast==0.6.0
|
||||||
|
gitdb==4.0.12
|
||||||
|
GitPython==3.1.45
|
||||||
google-pasta==0.2.0
|
google-pasta==0.2.0
|
||||||
grpcio==1.73.0
|
grpcio==1.73.0
|
||||||
|
h11==0.16.0
|
||||||
h5py==3.14.0
|
h5py==3.14.0
|
||||||
|
httpcore==1.0.9
|
||||||
|
httpx==0.28.1
|
||||||
idna==3.10
|
idna==3.10
|
||||||
|
imbalanced-learn==0.13.0
|
||||||
|
ipykernel==6.29.5
|
||||||
|
ipython==9.4.0
|
||||||
|
ipython_pygments_lexers==1.1.1
|
||||||
|
isoduration==20.11.0
|
||||||
|
jedi==0.19.2
|
||||||
|
Jinja2==3.1.6
|
||||||
joblib==1.5.1
|
joblib==1.5.1
|
||||||
|
json5==0.12.0
|
||||||
|
jsonpointer==3.0.0
|
||||||
|
jsonschema==4.24.0
|
||||||
|
jsonschema-specifications==2025.4.1
|
||||||
|
jupyter-events==0.12.0
|
||||||
|
jupyter-lsp==2.2.5
|
||||||
|
jupyter_client==8.6.3
|
||||||
|
jupyter_core==5.8.1
|
||||||
|
jupyter_server==2.16.0
|
||||||
|
jupyter_server_terminals==0.5.3
|
||||||
|
jupyterlab==4.4.4
|
||||||
|
jupyterlab_pygments==0.3.0
|
||||||
|
jupyterlab_server==2.27.3
|
||||||
keras==3.10.0
|
keras==3.10.0
|
||||||
kiwisolver==1.4.8
|
kiwisolver==1.4.8
|
||||||
libclang==18.1.1
|
libclang==18.1.1
|
||||||
|
lightgbm==4.6.0
|
||||||
Markdown==3.8.2
|
Markdown==3.8.2
|
||||||
markdown-it-py==3.0.0
|
markdown-it-py==3.0.0
|
||||||
MarkupSafe==3.0.2
|
MarkupSafe==3.0.2
|
||||||
matplotlib==3.10.3
|
matplotlib==3.10.3
|
||||||
|
matplotlib-inline==0.1.7
|
||||||
|
mccabe==0.7.0
|
||||||
mdurl==0.1.2
|
mdurl==0.1.2
|
||||||
|
mistune==3.1.3
|
||||||
ml-dtypes==0.3.2
|
ml-dtypes==0.3.2
|
||||||
|
mypy==1.17.0
|
||||||
|
mypy_extensions==1.1.0
|
||||||
namex==0.1.0
|
namex==0.1.0
|
||||||
|
narwhals==2.0.1
|
||||||
|
nbclient==0.10.2
|
||||||
|
nbconvert==7.16.6
|
||||||
|
nbformat==5.10.4
|
||||||
|
nest-asyncio==1.6.0
|
||||||
|
nltk==3.9.1
|
||||||
|
notebook==7.4.4
|
||||||
|
notebook_shim==0.2.4
|
||||||
numpy==1.26.4
|
numpy==1.26.4
|
||||||
|
ollama==0.5.1
|
||||||
opt_einsum==3.4.0
|
opt_einsum==3.4.0
|
||||||
optree==0.16.0
|
optree==0.16.0
|
||||||
|
overrides==7.7.0
|
||||||
packaging==25.0
|
packaging==25.0
|
||||||
pandas==2.3.0
|
pandas==2.3.0
|
||||||
|
pandocfilters==1.5.1
|
||||||
|
parso==0.8.4
|
||||||
|
pathspec==0.12.1
|
||||||
|
pexpect==4.9.0
|
||||||
pillow==11.2.1
|
pillow==11.2.1
|
||||||
|
platformdirs==4.3.8
|
||||||
|
plotly==6.2.0
|
||||||
|
prometheus_client==0.22.1
|
||||||
|
prompt_toolkit==3.0.51
|
||||||
protobuf==4.25.8
|
protobuf==4.25.8
|
||||||
|
psutil==7.0.0
|
||||||
|
ptyprocess==0.7.0
|
||||||
|
pure_eval==0.2.3
|
||||||
|
pyarrow==21.0.0
|
||||||
|
pycodestyle==2.14.0
|
||||||
|
pycparser==2.22
|
||||||
|
pydantic==2.11.7
|
||||||
|
pydantic_core==2.33.2
|
||||||
|
pydeck==0.9.1
|
||||||
|
pyflakes==3.4.0
|
||||||
Pygments==2.19.1
|
Pygments==2.19.1
|
||||||
pyparsing==3.2.3
|
pyparsing==3.2.3
|
||||||
python-dateutil==2.9.0.post0
|
python-dateutil==2.9.0.post0
|
||||||
|
python-json-logger==3.3.0
|
||||||
pytz==2025.2
|
pytz==2025.2
|
||||||
|
PyYAML==6.0.2
|
||||||
|
pyzmq==27.0.0
|
||||||
|
referencing==0.36.2
|
||||||
|
regex==2024.11.6
|
||||||
requests==2.32.4
|
requests==2.32.4
|
||||||
|
rfc3339-validator==0.1.4
|
||||||
|
rfc3986-validator==0.1.1
|
||||||
rich==14.0.0
|
rich==14.0.0
|
||||||
scikit-learn==1.7.0
|
rpds-py==0.26.0
|
||||||
|
scikit-learn==1.6.1
|
||||||
scipy==1.15.3
|
scipy==1.15.3
|
||||||
seaborn==0.13.2
|
seaborn==0.13.2
|
||||||
|
Send2Trash==1.8.3
|
||||||
six==1.17.0
|
six==1.17.0
|
||||||
|
sklearn-compat==0.1.3
|
||||||
|
smmap==5.0.2
|
||||||
|
sniffio==1.3.1
|
||||||
|
soupsieve==2.7
|
||||||
|
stack-data==0.6.3
|
||||||
|
streamlit==1.47.1
|
||||||
|
tenacity==9.1.2
|
||||||
tensorboard==2.16.2
|
tensorboard==2.16.2
|
||||||
tensorboard-data-server==0.7.2
|
tensorboard-data-server==0.7.2
|
||||||
tensorflow==2.16.2
|
tensorflow==2.16.2
|
||||||
tensorflow-io-gcs-filesystem==0.37.1
|
tensorflow-io-gcs-filesystem==0.37.1
|
||||||
termcolor==3.1.0
|
termcolor==3.1.0
|
||||||
|
terminado==0.18.1
|
||||||
threadpoolctl==3.6.0
|
threadpoolctl==3.6.0
|
||||||
|
tinycss2==1.4.0
|
||||||
|
toml==0.10.2
|
||||||
|
toolz==1.0.0
|
||||||
|
tornado==6.5.1
|
||||||
tqdm==4.67.1
|
tqdm==4.67.1
|
||||||
|
traitlets==5.14.3
|
||||||
|
types-python-dateutil==2.9.0.20250516
|
||||||
|
types-PyYAML==6.0.12.20250516
|
||||||
|
typing-inspection==0.4.1
|
||||||
typing_extensions==4.14.0
|
typing_extensions==4.14.0
|
||||||
tzdata==2025.2
|
tzdata==2025.2
|
||||||
|
uri-template==1.3.0
|
||||||
urllib3==2.5.0
|
urllib3==2.5.0
|
||||||
|
wcwidth==0.2.13
|
||||||
|
webcolors==24.11.1
|
||||||
|
webencodings==0.5.1
|
||||||
|
websocket-client==1.8.0
|
||||||
Werkzeug==3.1.3
|
Werkzeug==3.1.3
|
||||||
wrapt==1.17.2
|
wrapt==1.17.2
|
||||||
|
xgboost==3.0.3
|
||||||
|
scikit-learn~=1.6.1
|
||||||
|
ollama~=0.5.1
|
||||||
|
pydantic~=2.11.7
|
||||||
|
streamlit~=1.47.1
|
||||||
|
plotly~=6.2.0
|
||||||
|
altair==5.1.2
|
||||||
|
PyYAML~=6.0.2
|
||||||
|
xgboost~=3.0.3
|
||||||
|
lightgbm~=4.6.0
|
||||||
|
|||||||
@@ -0,0 +1,250 @@
|
|||||||
|
import logging
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Dict, Any, Optional, List
|
||||||
|
|
||||||
|
import joblib
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from research.experiment import ExperimentConfig
|
||||||
|
|
||||||
|
|
||||||
|
class BaseModel(ABC):
    """Abstract base class for all models.

    Subclasses implement the architecture-specific pieces (feature
    preparation, fitting, cross-validation, learning-curve generation)
    while this base class provides shared prediction, persistence and
    plotting helpers.
    """

    def __init__(self, config: ExperimentConfig):
        self.config = config
        self.model = None  # Underlying estimator; set by fit()
        self.feature_extractor = None  # Set by subclasses during fit()
        self.label_encoder = None  # Maps class labels <-> integer indices
        self.tokenizer = None  # For neural models
        self.is_fitted = False
        self.training_history = {}  # Store training history for learning curves
        self.learning_curve_data = {}  # Store learning curve experiment data

    @property
    @abstractmethod
    def architecture(self) -> str:
        """Return the architecture type: 'neural_network', 'traditional', or 'ensemble'"""
        pass

    @abstractmethod
    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Prepare features for training/prediction"""
        pass

    @abstractmethod
    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the model - implemented differently for each architecture"""
        pass

    @abstractmethod
    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> Dict[str, float]:
        """Perform cross-validation and return average scores"""
        pass

    @abstractmethod
    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model.

        Args:
            X: Raw input dataframe.
            y: Target labels.
            train_sizes: Optional fractions of the training set to evaluate;
                subclasses pick a default when None. (Was an implicit
                Optional ``List[float] = None`` before.)
        """
        pass

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Make predictions and return decoded (original) class labels.

        Raises:
            ValueError: If called before the model has been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        predictions = self.model.predict(X_prepared)

        # Neural networks return per-class probabilities (2-D output);
        # collapse to the most likely class index before decoding.
        if hasattr(predictions, "shape") and len(predictions.shape) > 1:
            predictions = predictions.argmax(axis=1)

        return self.label_encoder.inverse_transform(predictions)

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """Get prediction probabilities if supported.

        Raises:
            ValueError: If called before the model has been fitted.
            NotImplementedError: If the underlying model cannot produce
                class probabilities.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        if hasattr(self.model, "predict_proba"):
            return self.model.predict_proba(X_prepared)
        elif hasattr(self.model, "predict"):
            # For neural networks that return probabilities directly
            probabilities = self.model.predict(X_prepared)
            if len(probabilities.shape) == 2 and probabilities.shape[1] > 1:
                return probabilities

        raise NotImplementedError("Model does not support probability predictions")

    def get_feature_importance(self) -> Optional[Dict[str, float]]:
        """Get feature importance if supported by the model.

        Returns a feature-name -> importance mapping, or None when the
        underlying model exposes no importance information.
        """
        if hasattr(self.model, "feature_importances_"):
            # For tree-based models
            importances = self.model.feature_importances_
            feature_names = self._get_feature_names()
            return dict(zip(feature_names, importances))

        elif hasattr(self.model, "coef_"):
            # For linear models; absolute coefficient magnitude is used
            # as a proxy for importance.
            coefficients = np.abs(self.model.coef_[0])
            feature_names = self._get_feature_names()
            return dict(zip(feature_names, coefficients))

        elif hasattr(self.model, "named_steps") and "classifier" in self.model.named_steps:
            # For sklearn pipelines (like LogisticRegression with vectorizer)
            classifier = self.model.named_steps["classifier"]
            if hasattr(classifier, "coef_"):
                coefficients = np.abs(classifier.coef_[0])
                if hasattr(self.model.named_steps["vectorizer"], "get_feature_names_out"):
                    feature_names = self.model.named_steps["vectorizer"].get_feature_names_out()
                    # Take top features to avoid too many n-grams
                    top_indices = np.argsort(coefficients)[-20:]
                    return dict(zip(feature_names[top_indices], coefficients[top_indices]))

        return None

    def _get_feature_names(self) -> List[str]:
        """Get feature names (override in subclasses if needed)"""
        if hasattr(self.model, "feature_names_in_"):
            return list(self.model.feature_names_in_)
        return [f"feature_{i}" for i in range(100)]  # Default fallback

    def save(self, path: str):
        """Save the complete model state (estimator, encoders, config,
        training history) to *path* via joblib."""
        model_data = {
            "model": self.model,
            "feature_extractor": self.feature_extractor,
            "label_encoder": self.label_encoder,
            "tokenizer": self.tokenizer,
            "config": self.config.to_dict(),
            "is_fitted": self.is_fitted,
            "training_history": self.training_history,
            "learning_curve_data": self.learning_curve_data,
        }
        joblib.dump(model_data, path)

    @classmethod
    def load(cls, path: str) -> "BaseModel":
        """Load a saved model with training history from *path*."""
        model_data = joblib.load(path)

        # Local import to avoid a circular dependency at module load time.
        from research.experiment import ExperimentConfig

        config = ExperimentConfig.from_dict(model_data["config"])
        instance = cls(config)

        # Restore state; .get() keeps loading older artifacts that lack
        # the optional keys.
        instance.model = model_data["model"]
        instance.feature_extractor = model_data["feature_extractor"]
        instance.label_encoder = model_data["label_encoder"]
        instance.tokenizer = model_data.get("tokenizer")
        instance.is_fitted = model_data["is_fitted"]
        instance.training_history = model_data.get("training_history", {})
        instance.learning_curve_data = model_data.get("learning_curve_data", {})

        return instance

    def plot_learning_curve(self, save_path: Optional[str] = None) -> str:
        """Plot the learning curve; save to *save_path* when given.

        Returns the save path, or "" when no data is available or the
        plot was only shown interactively.
        """
        if not self.learning_curve_data:
            logging.warning("No learning curve data available")
            return ""

        plt.figure(figsize=(10, 6))

        data = self.learning_curve_data
        train_sizes = data["train_sizes"]
        train_scores = data["train_scores"]
        val_scores = data["val_scores"]
        # Std-dev bands are optional; default to flat zero bands.
        train_std = data.get("train_scores_std", [0] * len(train_sizes))
        val_std = data.get("val_scores_std", [0] * len(train_sizes))

        # Plot learning curves with +/- 1 std shaded bands
        plt.plot(train_sizes, train_scores, "o-", color="blue", label="Training Score")
        plt.fill_between(
            train_sizes,
            np.array(train_scores) - np.array(train_std),
            np.array(train_scores) + np.array(train_std),
            alpha=0.1,
            color="blue",
        )

        plt.plot(train_sizes, val_scores, "o-", color="red", label="Validation Score")
        plt.fill_between(
            train_sizes,
            np.array(val_scores) - np.array(val_std),
            np.array(val_scores) + np.array(val_std),
            alpha=0.1,
            color="red",
        )

        plt.xlabel("Training Set Size")
        plt.ylabel("Accuracy Score")
        plt.title(f"Learning Curve - {self.__class__.__name__}")
        plt.legend(loc="best")
        plt.grid(True, alpha=0.3)
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close()
            return save_path
        else:
            plt.show()
            return ""

    def plot_training_history(self, save_path: Optional[str] = None) -> str:
        """Plot per-epoch accuracy/loss history for neural networks.

        Returns the save path, or "" when no history is available or the
        plot was only shown interactively.
        """
        if not self.training_history:
            logging.warning("No training history available")
            return ""

        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # Plot accuracy
        if "accuracy" in self.training_history:
            axes[0].plot(self.training_history["accuracy"], label="Training Accuracy")
            if "val_accuracy" in self.training_history:
                axes[0].plot(self.training_history["val_accuracy"], label="Validation Accuracy")
            axes[0].set_title("Model Accuracy")
            axes[0].set_xlabel("Epoch")
            axes[0].set_ylabel("Accuracy")
            axes[0].legend()
            axes[0].grid(True, alpha=0.3)

        # Plot loss
        if "loss" in self.training_history:
            axes[1].plot(self.training_history["loss"], label="Training Loss")
            if "val_loss" in self.training_history:
                axes[1].plot(self.training_history["val_loss"], label="Validation Loss")
            axes[1].set_title("Model Loss")
            axes[1].set_xlabel("Epoch")
            axes[1].set_ylabel("Loss")
            axes[1].legend()
            axes[1].grid(True, alpha=0.3)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            plt.close()
            return save_path
        else:
            plt.show()
            return ""
|
||||||
@@ -0,0 +1,91 @@
|
|||||||
|
from dataclasses import dataclass, field, asdict
|
||||||
|
from enum import Enum
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
|
||||||
|
|
||||||
|
from .feature_extractor import FeatureType
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExperimentConfig:
    """Configuration for a single experiment"""

    # Experiment metadata
    name: str
    description: str = ""
    tags: List[str] = field(default_factory=list)

    # Model configuration
    model_type: str = "logistic_regression"  # logistic_regression, lstm, transformer, etc.
    model_params: Dict[str, Any] = field(default_factory=dict)

    # Feature configuration
    features: List[FeatureType] = field(default_factory=lambda: [FeatureType.FULL_NAME])
    feature_params: Dict[str, Any] = field(default_factory=dict)

    # Data configuration
    train_data_filter: Optional[Dict[str, Any]] = None  # Filter criteria for training data
    test_data_filter: Optional[Dict[str, Any]] = None
    target_column: str = "sex"

    # Training configuration
    test_size: float = 0.2
    random_seed: int = 42
    cross_validation_folds: int = 5

    # Evaluation configuration
    metrics: List[str] = field(default_factory=lambda: ["accuracy", "precision", "recall", "f1"])

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization"""
        result = asdict(self)
        # Convert enums to their string values so the dict is JSON-safe
        result["features"] = [f.value for f in self.features]
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentConfig":
        """Create from dictionary.

        The input mapping is copied before enum conversion so the
        caller's dict is not mutated (previously ``data["features"]``
        was rebound in place).
        """
        data = dict(data)
        if "features" in data:
            data["features"] = [FeatureType(f) for f in data["features"]]
        return cls(**data)
|
||||||
|
|
||||||
|
|
||||||
|
class ExperimentStatus(Enum):
    """Lifecycle states an experiment moves through during execution."""

    PENDING = "pending"  # Created, not yet started
    RUNNING = "running"  # Currently executing
    COMPLETED = "completed"  # Finished successfully
    FAILED = "failed"  # Raised an error during execution
    CANCELLED = "cancelled"  # Aborted before completion
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_metrics(
    y_true: np.ndarray, y_pred: np.ndarray, metrics: Optional[List[str]] = None
) -> Dict[str, float]:
    """Calculate the specified classification metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        metrics: Metric names to compute; defaults to
            ["accuracy", "precision", "recall", "f1"]. (The parameter was
            previously an implicit-Optional ``List[str] = None``.)

    Returns:
        A mapping of metric name to score. Precision/recall/f1 use
        weighted averaging across classes.
    """
    if metrics is None:
        metrics = ["accuracy", "precision", "recall", "f1"]

    results = {}

    if "accuracy" in metrics:
        results["accuracy"] = accuracy_score(y_true, y_pred)

    # precision_recall_fscore_support computes all three at once, so run
    # it a single time when any of them was requested.
    if any(m in metrics for m in ["precision", "recall", "f1"]):
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="weighted"
        )

        if "precision" in metrics:
            results["precision"] = precision
        if "recall" in metrics:
            results["recall"] = recall
        if "f1" in metrics:
            results["f1"] = f1

    return results
|
||||||
@@ -0,0 +1,56 @@
|
|||||||
|
from dataclasses import dataclass, field, asdict
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional, Dict, List, Any
|
||||||
|
|
||||||
|
from research.experiment import ExperimentConfig, ExperimentStatus
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExperimentResult:
    """Results from an experiment execution"""

    experiment_id: str
    config: ExperimentConfig

    # Execution metadata
    start_time: datetime
    end_time: Optional[datetime] = None
    status: ExperimentStatus = ExperimentStatus.PENDING
    error_message: Optional[str] = None

    # Model artifacts
    model_path: Optional[str] = None
    feature_extractor_path: Optional[str] = None

    # Metrics
    train_metrics: Dict[str, float] = field(default_factory=dict)
    test_metrics: Dict[str, float] = field(default_factory=dict)
    cv_metrics: Dict[str, float] = field(default_factory=dict)

    # Additional results
    confusion_matrix: Optional[List[List[int]]] = None
    feature_importance: Optional[Dict[str, float]] = None
    prediction_examples: Optional[List[Dict]] = None

    # Data statistics
    train_size: int = 0
    test_size: int = 0
    class_distribution: Dict[str, int] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization"""
        result = asdict(self)
        # Replace non-JSON-serializable fields with primitive forms
        result["config"] = self.config.to_dict()
        result["start_time"] = self.start_time.isoformat()
        result["end_time"] = self.end_time.isoformat() if self.end_time else None
        result["status"] = self.status.value
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentResult":
        """Create from dictionary.

        The input mapping is copied before conversion so the caller's
        dict is not mutated (previously the config/time/status keys were
        rebound in place).
        """
        data = dict(data)
        data["config"] = ExperimentConfig.from_dict(data["config"])
        data["start_time"] = datetime.fromisoformat(data["start_time"])
        data["end_time"] = datetime.fromisoformat(data["end_time"]) if data["end_time"] else None
        data["status"] = ExperimentStatus(data["status"])
        return cls(**data)
|
||||||
@@ -0,0 +1,123 @@
|
|||||||
|
from typing import List
|
||||||
|
|
||||||
|
from research.experiment import ExperimentConfig
|
||||||
|
from research.experiment.feature_extractor import FeatureType
|
||||||
|
|
||||||
|
|
||||||
|
class ExperimentBuilder:
    """Helper class to build experiment configurations.

    Every method is a factory returning a list of ready-to-run
    ``ExperimentConfig`` objects for a particular study design.
    """

    @staticmethod
    def create_baseline_experiments() -> List[ExperimentConfig]:
        """Create a set of baseline experiments for comparison"""
        return [
            # Full name experiments
            ExperimentConfig(
                name="baseline_logistic_regression_fullname",
                description="Logistic regression with full name",
                model_type="logistic_regression",
                features=[FeatureType.FULL_NAME],
                tags=["baseline", "fullname"],
            ),
            # Native name only
            ExperimentConfig(
                name="baseline_logistic_regression_native",
                description="Logistic regression with native name only",
                model_type="logistic_regression",
                features=[FeatureType.NATIVE_NAME],
                tags=["baseline", "native"],
            ),
            # Surname only
            ExperimentConfig(
                name="baseline_logistic_regression_surname",
                description="Logistic regression with surname only",
                model_type="logistic_regression",
                features=[FeatureType.SURNAME],
                tags=["baseline", "surname"],
            ),
            # Random Forest with engineered features
            ExperimentConfig(
                name="baseline_rf_engineered",
                description="Random Forest with engineered features",
                model_type="random_forest",
                features=[FeatureType.NAME_LENGTH, FeatureType.WORD_COUNT, FeatureType.PROVINCE],
                tags=["baseline", "engineered"],
            ),
        ]

    @staticmethod
    def create_feature_ablation_study() -> List[ExperimentConfig]:
        """Create experiments for feature ablation study.

        For each base feature, one experiment is built with that feature
        removed, to measure its individual contribution.
        """
        base_features = [
            FeatureType.FULL_NAME,
            FeatureType.NAME_LENGTH,
            FeatureType.WORD_COUNT,
            FeatureType.PROVINCE,
        ]

        experiments = []

        # Test removing each feature one by one
        # (the previous enumerate() index was unused)
        for feature_to_remove in base_features:
            remaining_features = [f for f in base_features if f != feature_to_remove]

            experiments.append(
                ExperimentConfig(
                    name=f"ablation_remove_{feature_to_remove.value}",
                    description=f"Ablation study: removed {feature_to_remove.value}",
                    model_type="logistic_regression",
                    features=remaining_features,
                    tags=["ablation", feature_to_remove.value],
                )
            )

        return experiments

    @staticmethod
    def create_name_component_study() -> List[ExperimentConfig]:
        """Create experiments to study different name components"""
        experiments = []

        name_components = [
            (FeatureType.FIRST_WORD, "first_word"),
            (FeatureType.LAST_WORD, "last_word"),
            (FeatureType.NATIVE_NAME, "native_name"),
            (FeatureType.SURNAME, "surname"),
            (FeatureType.NAME_BEGINNINGS, "name_beginnings"),
            (FeatureType.NAME_ENDINGS, "name_endings"),
        ]

        for feature, name in name_components:
            experiments.append(
                ExperimentConfig(
                    name=f"component_study_{name}",
                    description=f"Study of {name} for gender prediction",
                    model_type="logistic_regression",
                    features=[feature],
                    tags=["component_study", name],
                )
            )

        return experiments

    @staticmethod
    def create_province_specific_study() -> List[ExperimentConfig]:
        """Create experiments for province-specific analysis"""
        provinces = ["kinshasa", "bas-congo", "bandundu", "katanga"]  # Add more as needed

        experiments = []

        for province in provinces:
            experiments.append(
                ExperimentConfig(
                    name=f"province_study_{province}",
                    description=f"Gender prediction for {province} province only",
                    model_type="logistic_regression",
                    features=[FeatureType.FULL_NAME],
                    train_data_filter={"province": province},
                    tags=["province_study", province],
                )
            )

        return experiments
|
||||||
@@ -0,0 +1,238 @@
|
|||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.metrics import confusion_matrix
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
from core.config import PipelineConfig
|
||||||
|
from core.utils import get_data_file_path
|
||||||
|
from core.utils.data_loader import DataLoader
|
||||||
|
from research.base_model import BaseModel
|
||||||
|
from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
|
||||||
|
from research.experiment.experiment_tracker import ExperimentTracker
|
||||||
|
from research.model_registry import create_model
|
||||||
|
|
||||||
|
|
||||||
|
class ExperimentRunner:
    """Runs and manages experiments.

    Orchestrates the full lifecycle of an experiment: data loading and
    filtering, train/test split, model training, evaluation, artifact
    persistence and result tracking via ``ExperimentTracker``.
    """

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.tracker = ExperimentTracker(self.config)
        self.data_loader = DataLoader(self.config)

    def run_experiment(self, experiment_config: ExperimentConfig) -> str:
        """Run a single experiment and return its experiment ID.

        On failure the experiment is marked FAILED in the tracker and
        the original exception is re-raised.
        """
        # Create experiment (registers it with PENDING status)
        experiment_id = self.tracker.create_experiment(experiment_config)

        try:
            logging.info(f"Starting experiment: {experiment_id}")
            self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)

            # Load data
            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
            df = self.data_loader.load_csv_complete(data_path)

            # Apply data filters if specified
            df = self._apply_data_filters(df, experiment_config)

            # Prepare target variable; the full dataframe is passed on as
            # features and narrowed later by the model's feature extractor.
            y = df[experiment_config.target_column]
            X = df

            # Split data (stratified to keep class balance in both splits)
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=experiment_config.test_size,
                random_state=experiment_config.random_seed,
                stratify=y,
            )

            # Create and train model
            model = create_model(experiment_config)
            model.fit(X_train, y_train)

            # Make predictions
            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)

            # Calculate metrics
            train_metrics = calculate_metrics(y_train, train_pred, experiment_config.metrics)
            test_metrics = calculate_metrics(y_test, test_pred, experiment_config.metrics)

            # Cross-validation if requested
            cv_metrics = {}
            if experiment_config.cross_validation_folds > 1:
                cv_metrics = model.cross_validate(
                    X_train, y_train, experiment_config.cross_validation_folds
                )

            # Additional analysis
            conf_matrix = confusion_matrix(y_test, test_pred).tolist()
            feature_importance = model.get_feature_importance()

            # Create prediction examples
            prediction_examples = self._create_prediction_examples(
                X_test, y_test, test_pred, model, n_examples=10
            )

            # Calculate class distribution
            class_distribution = y.value_counts().to_dict()

            # Save model
            model_path = self._save_model(model, experiment_id)

            # Update experiment with results
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.COMPLETED,
                end_time=datetime.now(),
                model_path=str(model_path),
                train_metrics=train_metrics,
                test_metrics=test_metrics,
                cv_metrics=cv_metrics,
                confusion_matrix=conf_matrix,
                feature_importance=feature_importance,
                prediction_examples=prediction_examples,
                train_size=len(X_train),
                test_size=len(X_test),
                class_distribution=class_distribution,
            )

            logging.info(f"Experiment {experiment_id} completed successfully")
            # BUG FIX: the previous f"{...get('accuracy', 'N/A'):.4f}"
            # raised ValueError whenever "accuracy" was not among the
            # requested metrics, because the string fallback 'N/A' cannot
            # take a .4f format spec.
            accuracy = test_metrics.get("accuracy")
            if accuracy is not None:
                logging.info(f"Test accuracy: {accuracy:.4f}")
            else:
                logging.info("Test accuracy: N/A")

            return experiment_id

        except Exception as e:
            logging.error(f"Experiment {experiment_id} failed: {str(e)}")
            self.tracker.update_experiment(
                experiment_id,
                status=ExperimentStatus.FAILED,
                end_time=datetime.now(),
                error_message=str(e),
            )
            raise

    def run_experiment_batch(self, experiments: List[ExperimentConfig]) -> List[str]:
        """Run multiple experiments, skipping over individual failures.

        Returns the IDs of the experiments that ran successfully.
        """
        experiment_ids = []

        for i, config in enumerate(experiments):
            logging.info(f"Running experiment {i + 1}/{len(experiments)}: {config.name}")
            try:
                exp_id = self.run_experiment(config)
                experiment_ids.append(exp_id)
            except Exception as e:
                # Keep going: one failing config should not abort the batch.
                logging.error(f"Failed to run experiment {config.name}: {e}")
                continue

        return experiment_ids

    @classmethod
    def _apply_data_filters(cls, df: pd.DataFrame, config: ExperimentConfig) -> pd.DataFrame:
        """Apply data filters specified in experiment config.

        Filter criteria per column may be: a list (membership), a dict
        with "min"/"max" bounds, or a scalar (equality). Unknown columns
        are silently ignored.
        """
        filtered_df = df.copy()

        # Apply training data filters
        if config.train_data_filter:
            for column, criteria in config.train_data_filter.items():
                if column in filtered_df.columns:
                    if isinstance(criteria, list):
                        filtered_df = filtered_df[filtered_df[column].isin(criteria)]
                    elif isinstance(criteria, dict):
                        if "min" in criteria:
                            filtered_df = filtered_df[filtered_df[column] >= criteria["min"]]
                        if "max" in criteria:
                            filtered_df = filtered_df[filtered_df[column] <= criteria["max"]]
                    else:
                        filtered_df = filtered_df[filtered_df[column] == criteria]

        return filtered_df

    @classmethod
    def _create_prediction_examples(
        cls,
        X_test: pd.DataFrame,
        y_test: pd.Series,
        predictions: np.ndarray,
        model: BaseModel,
        n_examples: int = 10,
    ) -> List[Dict]:
        """Create prediction examples for analysis.

        Samples up to n_examples rows, drawing half from misclassified
        and half from correctly classified predictions.
        """
        examples = []

        # Get both correct and incorrect predictions
        correct_mask = y_test == predictions
        incorrect_indices = X_test[~correct_mask].index[: n_examples // 2]
        correct_indices = X_test[correct_mask].index[: n_examples // 2]

        sample_indices = list(incorrect_indices) + list(correct_indices)

        for idx in sample_indices[:n_examples]:
            # positional offset of this label-index row in the prediction array
            pos = X_test.index.get_loc(idx)
            example = {
                "name": X_test.loc[idx, "name"] if "name" in X_test.columns else "N/A",
                "true_label": y_test.loc[idx],
                "predicted_label": predictions[pos],
                "correct": y_test.loc[idx] == predictions[pos],
            }

            # Add probability if available (traditional models expose
            # predict_proba reliably; neural outputs may not)
            if model.architecture == "traditional":
                proba = model.predict_proba(X_test.loc[[idx]])
                example["prediction_confidence"] = float(proba.max())

            examples.append(example)

        return examples

    def _save_model(self, model: BaseModel, experiment_id: str) -> Path:
        """Save a trained model under the experiment's artifact directory."""
        model_dir = self.config.paths.models_dir / "experiments" / experiment_id
        model_dir.mkdir(parents=True, exist_ok=True)

        model_path = model_dir / "model.joblib"
        model.save(str(model_path))

        return model_path

    def load_experiment_model(self, experiment_id: str) -> Optional[BaseModel]:
        """Load a model from a completed experiment, or None if absent."""
        experiment = self.tracker.get_experiment(experiment_id)

        if experiment and experiment.model_path:
            return BaseModel.load(experiment.model_path)

        return None

    def compare_experiments(
        self, experiment_ids: List[str], metric: str = "accuracy"
    ) -> pd.DataFrame:
        """Compare experiments, sorted descending by the test metric."""
        comparison_df = self.tracker.compare_experiments(experiment_ids)

        if f"test_{metric}" in comparison_df.columns:
            comparison_df = comparison_df.sort_values(f"test_{metric}", ascending=False)

        return comparison_df

    def get_feature_analysis(self, experiment_id: str) -> Optional[pd.DataFrame]:
        """Get feature importance analysis for an experiment.

        Returns a dataframe sorted by descending importance, or None when
        the experiment has no recorded feature importance.
        """
        experiment = self.tracker.get_experiment(experiment_id)

        if experiment and experiment.feature_importance:
            importance_df = pd.DataFrame(
                [
                    {"feature": feature, "importance": importance}
                    for feature, importance in experiment.feature_importance.items()
                ]
            )
            return importance_df.sort_values("importance", ascending=False)

        return None
|
||||||
@@ -0,0 +1,194 @@
|
|||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Dict, List
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from core.config import PipelineConfig, get_config
|
||||||
|
|
||||||
|
from research.experiment import ExperimentConfig, ExperimentStatus
|
||||||
|
from research.experiment.experiement_result import ExperimentResult
|
||||||
|
|
||||||
|
|
||||||
|
class ExperimentTracker:
|
||||||
|
"""Tracks and manages experiments"""
|
||||||
|
|
||||||
|
def __init__(self, config: Optional[PipelineConfig] = None):
|
||||||
|
self.config = config or get_config()
|
||||||
|
self.experiments_dir = self.config.paths.outputs_dir / "experiments"
|
||||||
|
self.experiments_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
self.results_db_path = self.experiments_dir / "experiments.json"
|
||||||
|
self._results: Dict[str, ExperimentResult] = {}
|
||||||
|
self._load_results()
|
||||||
|
|
||||||
|
def _load_results(self):
|
||||||
|
"""Load existing experiment results"""
|
||||||
|
if self.results_db_path.exists():
|
||||||
|
try:
|
||||||
|
with open(self.results_db_path, "r") as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
for exp_id, exp_data in data.items():
|
||||||
|
self._results[exp_id] = ExperimentResult.from_dict(exp_data)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Warning: Failed to load experiment results: {e}")
|
||||||
|
|
||||||
|
def _save_results(self):
|
||||||
|
"""Save experiment results to disk"""
|
||||||
|
data = {exp_id: result.to_dict() for exp_id, result in self._results.items()}
|
||||||
|
|
||||||
|
with open(self.results_db_path, "w") as f:
|
||||||
|
json.dump(data, f, indent=2, default=str)
|
||||||
|
|
||||||
|
def create_experiment(self, config: ExperimentConfig) -> str:
|
||||||
|
"""Create a new experiment and return its ID"""
|
||||||
|
# Generate experiment ID
|
||||||
|
config_hash = hashlib.md5(
|
||||||
|
json.dumps(config.to_dict(), sort_keys=True).encode()
|
||||||
|
).hexdigest()[:8]
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
experiment_id = f"{config.name}_{timestamp}_{config_hash}"
|
||||||
|
|
||||||
|
# Create result object
|
||||||
|
result = ExperimentResult(
|
||||||
|
experiment_id=experiment_id, config=config, start_time=datetime.now()
|
||||||
|
)
|
||||||
|
|
||||||
|
self._results[experiment_id] = result
|
||||||
|
self._save_results()
|
||||||
|
|
||||||
|
return experiment_id
|
||||||
|
|
||||||
|
def update_experiment(self, experiment_id: str, **updates):
    """Apply attribute updates to a tracked experiment and persist them.

    Unknown experiment IDs are ignored silently; keys that the result
    object does not define are skipped.
    """
    result = self._results.get(experiment_id)
    if result is None:
        return
    for key, value in updates.items():
        # Only touch attributes the result object actually defines.
        if hasattr(result, key):
            setattr(result, key, value)
    self._save_results()
def get_experiment(self, experiment_id: str) -> Optional[ExperimentResult]:
    """Look up a tracked experiment; None when the ID is unknown."""
    try:
        return self._results[experiment_id]
    except KeyError:
        return None
def list_experiments(
    self,
    status: Optional[ExperimentStatus] = None,
    tags: Optional[List[str]] = None,
    model_type: Optional[str] = None,
) -> List[ExperimentResult]:
    """Return tracked experiments, newest first, optionally filtered.

    A filter is applied only when its argument is truthy; tag filtering
    keeps experiments that share at least one requested tag.
    """

    def keep(r) -> bool:
        if status and r.status != status:
            return False
        if tags and not any(tag in r.config.tags for tag in tags):
            return False
        if model_type and r.config.model_type != model_type:
            return False
        return True

    matched = [r for r in self._results.values() if keep(r)]
    matched.sort(key=lambda x: x.start_time, reverse=True)
    return matched
def get_best_experiment(
    self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
) -> Optional[ExperimentResult]:
    """Return the completed experiment maximizing *metric*, or None.

    ``dataset`` selects between test metrics ("test") and train metrics
    (anything else). ``filters`` may restrict by "model_type" (exact
    match) and "features" (any overlap). Ties keep the first candidate
    in listing order (newest first).
    """
    candidates = self.list_experiments()

    effective_filters = filters or {}
    if "model_type" in effective_filters:
        wanted_type = effective_filters["model_type"]
        candidates = [e for e in candidates if e.config.model_type == wanted_type]
    if "features" in effective_filters:
        wanted_features = effective_filters["features"]
        candidates = [
            e for e in candidates if any(f in e.config.features for f in wanted_features)
        ]

    # Running-max scan over completed experiments exposing the metric.
    best = None
    best_score = None
    for exp in candidates:
        if exp.status != ExperimentStatus.COMPLETED:
            continue
        metrics_dict = exp.test_metrics if dataset == "test" else exp.train_metrics
        if metric not in metrics_dict:
            continue
        score = metrics_dict[metric]
        if best is None or score > best_score:
            best, best_score = exp, score

    return best
def compare_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
    """Build a side-by-side DataFrame for the given experiments.

    Unknown IDs are skipped. Test and cross-validation metrics are
    flattened into ``test_<metric>`` / ``cv_<metric>`` columns.
    """
    rows = []
    for exp_id in experiment_ids:
        exp = self.get_experiment(exp_id)
        if not exp:
            continue
        row = dict(
            experiment_id=exp_id,
            name=exp.config.name,
            model_type=exp.config.model_type,
            features=",".join([f.value for f in exp.config.features]),
            status=exp.status.value,
            train_size=exp.train_size,
            test_size=exp.test_size,
        )
        row.update({f"test_{metric}": value for metric, value in exp.test_metrics.items()})
        row.update({f"cv_{metric}": value for metric, value in exp.cv_metrics.items()})
        rows.append(row)

    return pd.DataFrame(rows)
def export_results(self, output_path: Optional[Path] = None) -> Path:
    """Dump every tracked experiment (with metrics) to CSV; return the path.

    When no path is given, a timestamped file is created under the
    experiments directory.
    """
    if output_path is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = self.experiments_dir / f"experiments_export_{stamp}.csv"

    rows = []
    for exp in self._results.values():
        row = {
            "experiment_id": exp.experiment_id,
            "name": exp.config.name,
            "description": exp.config.description,
            "model_type": exp.config.model_type,
            "features": ",".join([f.value for f in exp.config.features]),
            "status": exp.status.value,
            "start_time": exp.start_time.isoformat(),
            "end_time": exp.end_time.isoformat() if exp.end_time else None,
            "train_size": exp.train_size,
            "test_size": exp.test_size,
        }
        # Flatten metric dicts into test_<metric> / cv_<metric> columns.
        row.update({f"test_{metric}": value for metric, value in exp.test_metrics.items()})
        row.update({f"cv_{metric}": value for metric, value in exp.cv_metrics.items()})
        rows.append(row)

    pd.DataFrame(rows).to_csv(output_path, index=False)
    return output_path
@@ -0,0 +1,90 @@
|
|||||||
|
from enum import Enum
|
||||||
|
from typing import List, Dict, Any, Union
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureType(Enum):
    """Types of features that can be extracted from names"""

    # Raw text features
    FULL_NAME = "full_name"
    NATIVE_NAME = "native_name"
    SURNAME = "surname"
    FIRST_WORD = "first_word"
    LAST_WORD = "last_word"
    # Numeric features
    NAME_LENGTH = "name_length"
    WORD_COUNT = "word_count"
    # Categorical features
    PROVINCE = "province"
    # Text features vectorized downstream by the model
    CHAR_NGRAMS = "char_ngrams"
    WORD_NGRAMS = "word_ngrams"
    # Prefix/suffix slices of the name
    NAME_ENDINGS = "name_endings"
    NAME_BEGINNINGS = "name_beginnings"


class FeatureExtractor:
    """Extract different types of features from name data"""

    def __init__(self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None):
        # feature_params tunes individual extractors (ending_length, beginning_length).
        self.feature_types = feature_types
        self.feature_params = feature_params or {}

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract all configured features into one frame indexed like *df*.

        Series-valued extractors become a single column named after the
        feature type; frame-valued extractors contribute all their columns.
        """
        features_df = pd.DataFrame(index=df.index)
        for feature_type in self.feature_types:
            extracted = self._extract_single_feature(df, feature_type)
            if isinstance(extracted, pd.DataFrame):
                features_df = pd.concat([features_df, extracted], axis=1)
            else:
                features_df[feature_type.value] = extracted
        return features_df

    def _extract_single_feature(
        self, df: pd.DataFrame, feature_type: FeatureType
    ) -> Union[pd.Series, pd.DataFrame]:
        """Extract a single type of feature via a dispatch table."""
        end_len = self.feature_params.get("ending_length", 3)
        begin_len = self.feature_params.get("beginning_length", 3)

        # Each feature type maps to a pure transformation of *df*.
        # N-gram features are vectorized downstream by the model; pass raw text.
        extractors = {
            FeatureType.FULL_NAME: lambda: df["name"].fillna(""),
            FeatureType.NATIVE_NAME: lambda: df["identified_name"]
            .fillna(df["probable_native"])
            .fillna(""),
            FeatureType.SURNAME: lambda: df["identified_surname"]
            .fillna(df["probable_surname"])
            .fillna(""),
            FeatureType.FIRST_WORD: lambda: df["name"].str.split().str[0].fillna(""),
            FeatureType.LAST_WORD: lambda: df["name"].str.split().str[-1].fillna(""),
            FeatureType.NAME_LENGTH: lambda: df["name"].str.len().fillna(0),
            FeatureType.WORD_COUNT: lambda: df["words"].fillna(1),
            FeatureType.PROVINCE: lambda: df["province"].fillna("unknown"),
            FeatureType.NAME_ENDINGS: lambda: df["name"].str[-end_len:].fillna(""),
            FeatureType.NAME_BEGINNINGS: lambda: df["name"].str[:begin_len].fillna(""),
            FeatureType.CHAR_NGRAMS: lambda: df["name"].fillna(""),
            FeatureType.WORD_NGRAMS: lambda: df["name"].fillna(""),
        }

        extractor = extractors.get(feature_type)
        if extractor is None:
            raise ValueError(f"Unknown feature type: {feature_type}")
        return extractor()
@@ -0,0 +1,44 @@
|
|||||||
|
from typing import List
|
||||||
|
|
||||||
|
from research.base_model import BaseModel
|
||||||
|
from research.experiment import ExperimentConfig
|
||||||
|
from research.models.bigru_model import BiGRUModel
|
||||||
|
from research.models.cnn_model import CNNModel
|
||||||
|
from research.models.ensemble_model import EnsembleModel
|
||||||
|
from research.models.lightgbm_model import LightGBMModel
|
||||||
|
from research.models.logistic_regression_model import LogisticRegressionModel
|
||||||
|
from research.models.lstm_model import LSTMModel
|
||||||
|
from research.models.naive_bayes_model import NaiveBayesModel
|
||||||
|
from research.models.random_forest_model import RandomForestModel
|
||||||
|
from research.models.svm_model import SVMModel
|
||||||
|
from research.models.transformer_model import TransformerModel
|
||||||
|
from research.models.xgboost_model import XGBoostModel
|
||||||
|
|
||||||
|
# Central mapping from model-type identifier (the value used in
# ExperimentConfig.model_type) to its concrete model class.
# create_model() resolves through this table; list_available_models()
# exposes its keys.
MODEL_REGISTRY = {
    "bigru": BiGRUModel,
    "cnn": CNNModel,
    "ensemble": EnsembleModel,
    "lightgbm": LightGBMModel,
    "logistic_regression": LogisticRegressionModel,
    "lstm": LSTMModel,
    "naive_bayes": NaiveBayesModel,
    "random_forest": RandomForestModel,
    "svm": SVMModel,
    "transformer": TransformerModel,
    "xgboost": XGBoostModel,
}
def create_model(config: ExperimentConfig) -> BaseModel:
    """Factory function to create models.

    Looks up ``config.model_type`` in MODEL_REGISTRY and instantiates the
    matching class with the full experiment configuration.

    Raises:
        ValueError: if the model type is not registered. The message lists
            the valid options so typos are easy to spot.
    """
    model_class = MODEL_REGISTRY.get(config.model_type)

    if model_class is None:
        available = ", ".join(sorted(MODEL_REGISTRY))
        raise ValueError(
            f"Unknown model type: {config.model_type}. Available types: {available}"
        )

    return model_class(config)
def list_available_models() -> List[str]:
    """Return the identifiers of every registered model type."""
    # Iterating the registry yields its keys directly.
    return [name for name in MODEL_REGISTRY]
@@ -0,0 +1,281 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from core.config import get_config
|
||||||
|
from core.utils import get_data_file_path
|
||||||
|
from core.utils.data_loader import DataLoader
|
||||||
|
from research.experiment import FeatureType, ExperimentConfig
|
||||||
|
from research.experiment.experiment_runner import ExperimentRunner
|
||||||
|
from research.experiment.experiment_tracker import ExperimentTracker
|
||||||
|
|
||||||
|
|
||||||
|
class ModelTrainer:
    """Comprehensive model training and artifact management.

    Orchestrates the experiment runner/tracker to train one or more models
    and persists every artifact (serialized model, configuration, results,
    learning-curve plots/data, metadata index) under
    ``<models_dir>/<experiment_id>/`` so models can be listed and reloaded.
    """

    def __init__(self, config=None):
        # Fall back to the global pipeline configuration when none is supplied.
        self.config = config or get_config()
        self.data_loader = DataLoader(self.config)
        self.experiment_runner = ExperimentRunner(self.config)
        self.experiment_tracker = ExperimentTracker(self.config)
        self.logger = logging.getLogger(__name__)

        # Root directory where per-experiment artifact folders are created.
        self.models_dir = self.config.paths.models_dir
        self.models_dir.mkdir(parents=True, exist_ok=True)

    def train_single_model(
        self,
        model_name: str,
        model_type: str = "logistic_regression",
        features: List[str] = None,
        model_params: Dict[str, Any] = None,
        save_artifacts: bool = True,
    ) -> str:
        """
        Train a single model and save its artifacts.

        Args:
            model_name: Human-readable name embedded in the experiment ID.
            model_type: Registry key for the model class.
            features: FeatureType value strings; defaults to ["full_name"].
            model_params: Hyper-parameters forwarded to the model.
            save_artifacts: Persist artifacts after training when True.

        Returns the experiment ID.
        """
        self.logger.info(f"Training {model_type} model: {model_name}")

        if features is None:
            features = ["full_name"]
        # Raises ValueError early on an unrecognized feature name.
        feature_types = [FeatureType(f) for f in features]

        # Create experiment configuration
        config = ExperimentConfig(
            name=model_name,
            description=f"Training {model_type} model with features: {', '.join(features)}",
            model_type=model_type,
            features=feature_types,
            model_params=model_params or {},
            tags=["training", model_type],
        )

        # Run experiment
        experiment_id = self.experiment_runner.run_experiment(config)
        experiment = self.experiment_tracker.get_experiment(experiment_id)

        if experiment and experiment.test_metrics:
            self.logger.info("Training completed successfully!")
            self.logger.info(f" Experiment ID: {experiment_id}")
            self.logger.info(f" Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}")
            self.logger.info(f" Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")

        if save_artifacts:
            self.save_model_artifacts(experiment_id)

        return experiment_id

    def train_multiple_models(
        self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
    ) -> List[str]:
        """
        Train multiple models with different configurations.

        Each entry needs a "model_type" and may carry "features" and
        "model_params". Failures are logged and skipped so one bad
        configuration does not abort the batch.

        Returns the experiment IDs of the successfully trained models.
        """
        self.logger.info(f"Training {len(model_configs)} models...")

        experiment_ids = []

        for i, config in enumerate(model_configs):
            model_name = f"{base_name}_{config['model_type']}_{i + 1}"

            try:
                exp_id = self.train_single_model(
                    model_name=model_name,
                    model_type=config["model_type"],
                    features=config.get("features", ["full_name"]),
                    model_params=config.get("model_params", {}),
                    save_artifacts=save_all,
                )
                experiment_ids.append(exp_id)

            except Exception as e:
                self.logger.error(f"Failed to train {model_name}: {e}")
                continue

        self.logger.info(f"Completed training {len(experiment_ids)} models successfully")
        return experiment_ids

    def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]:
        """
        Save model artifacts in a structured way for easy loading.

        Writes the serialized model, its configuration, experiment results,
        best-effort learning-curve/training-history plots and data, and a
        metadata index under ``<models_dir>/<experiment_id>/``.

        Returns paths to saved artifacts.

        Raises:
            ValueError: if the experiment or its trained model is not found.
        """
        experiment = self.experiment_tracker.get_experiment(experiment_id)
        if not experiment:
            raise ValueError(f"Experiment {experiment_id} not found")

        # Create model-specific directory
        model_dir = self.models_dir / experiment_id
        model_dir.mkdir(parents=True, exist_ok=True)

        # Load the trained model
        trained_model = self.experiment_runner.load_experiment_model(experiment_id)
        if not trained_model:
            raise ValueError(f"Could not load model for experiment {experiment_id}")

        # Save complete model with joblib
        model_path = model_dir / "complete_model.joblib"
        trained_model.save(str(model_path))

        # Save model configuration (json is imported at module level; the
        # previous nested "import json" here was redundant and was removed)
        config_path = model_dir / "model_config.json"
        with open(config_path, "w") as f:
            json.dump(experiment.config.to_dict(), f, indent=2)

        # Save experiment results; default=str stringifies datetimes/enums
        results_path = model_dir / "experiment_results.json"
        with open(results_path, "w") as f:
            json.dump(experiment.to_dict(), f, indent=2, default=str)

        # Generate and save learning curves. Best-effort: a plotting failure
        # must not discard the model artifacts already written above.
        learning_curve_path = None
        training_history_path = None

        try:
            # Load data for learning curve generation
            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
            if data_path.exists():
                df = self.data_loader.load_csv_complete(data_path)

                # Generate learning curve
                self.logger.info("Generating learning curve...")
                trained_model.generate_learning_curve(df, df[experiment.config.target_column])

                # Plot and save learning curve
                learning_curve_path = model_dir / "learning_curve.png"
                trained_model.plot_learning_curve(str(learning_curve_path))

                # Plot and save training history (for neural networks)
                if trained_model.training_history:
                    training_history_path = model_dir / "training_history.png"
                    trained_model.plot_training_history(str(training_history_path))

                # Save learning curve data as JSON
                learning_data_path = model_dir / "learning_curve_data.json"
                with open(learning_data_path, "w") as f:
                    json.dump(trained_model.learning_curve_data, f, indent=2)

                # Save training history data as JSON
                if trained_model.training_history:
                    history_data_path = model_dir / "training_history_data.json"
                    with open(history_data_path, "w") as f:
                        json.dump(trained_model.training_history, f, indent=2)

        except Exception as e:
            self.logger.warning(f"Could not generate learning curves: {e}")

        # Save artifacts metadata (the index consumed by list_saved_models)
        metadata = {
            "experiment_id": experiment_id,
            "model_name": experiment.config.name,
            "model_type": experiment.config.model_type,
            "features": [f.value for f in experiment.config.features],
            "training_date": datetime.now().isoformat(),
            "test_accuracy": experiment.test_metrics.get("accuracy", 0),
            "test_f1": experiment.test_metrics.get("f1", 0),
            "model_path": str(model_path),
            "config_path": str(config_path),
            "results_path": str(results_path),
            "learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
            "training_history_plot": str(training_history_path) if training_history_path else None,
            "has_learning_curve": bool(trained_model.learning_curve_data),
            "has_training_history": bool(trained_model.training_history),
        }

        metadata_path = model_dir / "metadata.json"
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=2)

        self.logger.info(f"Model artifacts saved to: {model_dir}")
        self.logger.info(f" - Complete model: {model_path.name}")
        self.logger.info(f" - Configuration: {config_path.name}")
        self.logger.info(f" - Results: {results_path.name}")
        self.logger.info(f" - Metadata: {metadata_path.name}")

        if learning_curve_path and learning_curve_path.exists():
            self.logger.info(f" - Learning curve: {learning_curve_path.name}")

        if training_history_path and training_history_path.exists():
            self.logger.info(f" - Training history: {training_history_path.name}")

        return {
            "model_dir": str(model_dir),
            "model_path": str(model_path),
            "config_path": str(config_path),
            "results_path": str(results_path),
            "metadata_path": str(metadata_path),
            "learning_curve_plot": str(learning_curve_path) if learning_curve_path else None,
            "training_history_plot": str(training_history_path) if training_history_path else None,
        }

    def load_trained_model(self, experiment_id: str):
        """
        Load a previously trained model from artifacts.

        Resolves the concrete model class from the saved metadata via the
        model registry, then deserializes the complete model.

        Raises:
            FileNotFoundError: if no artifacts exist for the experiment.
        """
        model_dir = self.models_dir / experiment_id
        model_path = model_dir / "complete_model.joblib"

        if not model_path.exists():
            raise FileNotFoundError(f"Model artifacts not found for experiment {experiment_id}")

        # Load the model class dynamically
        metadata_path = model_dir / "metadata.json"
        with open(metadata_path, "r") as f:
            metadata = json.load(f)

        model_type = metadata["model_type"]
        # NOTE(review): imported lazily, presumably to avoid a circular import
        # between trainer and registry modules — confirm before hoisting.
        from research.model_registry import MODEL_REGISTRY

        model_class = MODEL_REGISTRY[model_type]

        # Load the complete model
        loaded_model = model_class.load(str(model_path))

        self.logger.info(f"Loaded model: {metadata['model_name']}")
        self.logger.info(f" Type: {model_type}")
        self.logger.info(f" Accuracy: {metadata['test_accuracy']:.4f}")

        return loaded_model

    def list_saved_models(self) -> pd.DataFrame:
        """
        List all saved model artifacts.

        Scans every subdirectory of the models directory for a
        metadata.json and returns a summary DataFrame sorted by training
        date, newest first; empty DataFrame when nothing has been saved.
        """
        models_data = []

        for model_dir in self.models_dir.iterdir():
            if model_dir.is_dir():
                metadata_path = model_dir / "metadata.json"
                if metadata_path.exists():
                    try:
                        with open(metadata_path, "r") as f:
                            metadata = json.load(f)
                        models_data.append(metadata)
                    except Exception as e:
                        self.logger.warning(f"Could not read metadata for {model_dir.name}: {e}")

        if not models_data:
            self.logger.info("No saved models found.")
            return pd.DataFrame()

        df = pd.DataFrame(models_data)

        # Only show the summary columns that are actually present.
        display_columns = [
            "model_name",
            "model_type",
            "features",
            "test_accuracy",
            "test_f1",
            "training_date",
        ]
        available_columns = [col for col in display_columns if col in df.columns]

        return df[available_columns].sort_values("training_date", ascending=False)
@@ -0,0 +1,56 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
|
||||||
|
from tensorflow.keras.models import Sequential
|
||||||
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||||
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||||
|
|
||||||
|
from research.neural_network_model import NeuralNetworkModel
|
||||||
|
|
||||||
|
|
||||||
|
class BiGRUModel(NeuralNetworkModel):
    """Bidirectional GRU model for name classification"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        """Assemble and compile the stacked bidirectional-GRU Keras model.

        Hyper-parameters read from **kwargs: embedding_dim (64), gru_units
        (32), dropout (0.2 inside the GRUs, 0.5 before the output head).
        """
        embedding_dim = kwargs.get("embedding_dim", 64)
        gru_units = kwargs.get("gru_units", 32)
        rnn_dropout = kwargs.get("dropout", 0.2)
        head_dropout = kwargs.get("dropout", 0.5)

        layers = [
            Embedding(input_dim=vocab_size, output_dim=embedding_dim),
            # First BiGRU returns sequences so a second BiGRU can stack on top.
            Bidirectional(GRU(gru_units, return_sequences=True, dropout=rnn_dropout)),
            Bidirectional(GRU(gru_units, dropout=rnn_dropout)),
            Dense(64, activation="relu"),
            Dropout(head_dropout),
            # Two-way softmax; labels are integer-encoded.
            Dense(2, activation="softmax"),
        ]
        model = Sequential(layers)

        model.compile(
            loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        return model

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Tokenize the configured text columns into padded word-index sequences."""
        text_data = []
        for feature_type in self.config.features:
            column = feature_type.value
            if column in X.columns:
                text_data.extend(X[column].astype(str).tolist())

        if not text_data:
            raise ValueError("No text data found in the provided DataFrame.")

        # Fit the word-level tokenizer once and reuse it on later calls so the
        # vocabulary stays stable between training and inference.
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(text_data)

        # Only the first len(X) entries correspond row-wise to X when several
        # feature columns were concatenated above.
        sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
        max_len = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=max_len, padding="post")
@@ -0,0 +1,75 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from tensorflow.keras.layers import (
|
||||||
|
Embedding,
|
||||||
|
Conv1D,
|
||||||
|
MaxPooling1D,
|
||||||
|
GlobalMaxPooling1D,
|
||||||
|
Dense,
|
||||||
|
Dropout,
|
||||||
|
)
|
||||||
|
from tensorflow.keras.models import Sequential
|
||||||
|
|
||||||
|
from research.neural_network_model import NeuralNetworkModel
|
||||||
|
|
||||||
|
|
||||||
|
class CNNModel(NeuralNetworkModel):
    """1D Convolutional Neural Network for character patterns"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 20, **kwargs) -> Any:
        """Build CNN model with known vocabulary size.

        Hyper-parameters read from **kwargs: embedding_dim (64), filters
        (64), kernel_size (3), dropout (0.5).
        """
        embedding_dim = kwargs.get("embedding_dim", 64)
        n_filters = kwargs.get("filters", 64)
        kernel = kwargs.get("kernel_size", 3)

        model = Sequential(
            [
                Embedding(input_dim=vocab_size, output_dim=embedding_dim),
                # Two conv stages: local character patterns, then wider motifs.
                Conv1D(filters=n_filters, kernel_size=kernel, activation="relu"),
                MaxPooling1D(pool_size=2),
                Conv1D(filters=n_filters, kernel_size=kernel, activation="relu"),
                GlobalMaxPooling1D(),
                Dense(64, activation="relu"),
                Dropout(kwargs.get("dropout", 0.5)),
                # Two-way softmax; labels are integer-encoded.
                Dense(2, activation="softmax"),
            ]
        )

        model.compile(
            loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        )
        return model

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Prepare padded character-level sequences from the extracted feature columns."""
        from tensorflow.keras.preprocessing.sequence import pad_sequences
        from tensorflow.keras.preprocessing.text import Tokenizer

        # Collect text from every configured feature column present in X.
        text_data = []
        for feature_type in self.config.features:
            column = feature_type.value
            if column in X.columns:
                text_data.extend(X[column].astype(str).tolist())

        if not text_data:
            # Defensive fallback; should not happen when FeatureExtractor ran first.
            text_data = [""] * len(X)

        # Character-level tokenizer is fitted once and reused across calls.
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=True, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(text_data)

        # Only the first len(X) entries correspond row-wise to X.
        sequences = self.tokenizer.texts_to_sequences(text_data[: len(X)])
        max_len = self.config.model_params.get("max_len", 20)  # longer for character level

        return pad_sequences(sequences, maxlen=max_len, padding="post")
@@ -0,0 +1,97 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
|
||||||
|
from research.experiment import ExperimentConfig
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class EnsembleModel(TraditionalModel):
    """Ensemble model combining multiple base models via scikit-learn voting."""

    @property
    def architecture(self) -> str:
        """Return the architecture type"""
        return "ensemble"

    def __init__(self, config: ExperimentConfig):
        super().__init__(config)
        self.base_models = []      # kept for API compatibility
        self.model_weights = None  # reserved for weighted voting

    def build_model(self) -> BaseEstimator:
        """Build a VotingClassifier over the configured base model types.

        Base types are read from model_params["base_models"]; supported
        values are "logistic_regression", "random_forest" and
        "naive_bayes" — unrecognized entries are silently skipped
        (matching prior behavior). Each base model is a self-contained
        text-vectorizer + classifier pipeline.
        """
        params = self.config.model_params
        base_model_types = params.get(
            "base_models", ["logistic_regression", "random_forest", "naive_bayes"]
        )

        estimators = []
        for model_type in base_model_types:
            if model_type == "logistic_regression":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            CountVectorizer(analyzer="char", ngram_range=(2, 4), max_features=5000),
                        ),
                        (
                            "classifier",
                            LogisticRegression(max_iter=1000, random_state=self.config.random_seed),
                        ),
                    ]
                )
                # Fixed: names were pointlessly written as f-strings with no placeholders.
                estimators.append(("logistic_regression", model))

            elif model_type == "random_forest":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            TfidfVectorizer(analyzer="char", ngram_range=(2, 3), max_features=3000),
                        ),
                        (
                            "classifier",
                            RandomForestClassifier(
                                n_estimators=50, random_state=self.config.random_seed
                            ),
                        ),
                    ]
                )
                estimators.append(("rf", model))

            elif model_type == "naive_bayes":
                model = Pipeline(
                    [
                        (
                            "vectorizer",
                            CountVectorizer(analyzer="char", ngram_range=(1, 3), max_features=4000),
                        ),
                        ("classifier", MultinomialNB()),
                    ]
                )
                estimators.append(("nb", model))

        # 'soft' averages predicted probabilities; 'hard' majority-votes labels.
        voting_type = params.get("voting", "soft")
        return VotingClassifier(estimators=estimators, voting=voting_type)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Concatenate the configured text feature columns into one string per row.

        Raises:
            ValueError: when none of the configured feature columns exist in
                *X* (previously this crashed with an opaque IndexError;
                message matches the neural models for consistency).
        """
        text_features = [
            X[feature_type.value].astype(str)
            for feature_type in self.config.features
            if feature_type.value in X.columns
        ]

        if not text_features:
            raise ValueError("No text data found in the provided DataFrame.")

        if len(text_features) == 1:
            return text_features[0].values

        combined = text_features[0].astype(str)
        for feature in text_features[1:]:
            combined = combined + " " + feature.astype(str)
        return combined.values
@@ -0,0 +1,51 @@
|
|||||||
|
import lightgbm as lgb
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class LightGBMModel(TraditionalModel):
    """LightGBM with engineered features."""

    def build_model(self) -> BaseEstimator:
        """Instantiate an LGBMClassifier from model_params (with defaults)."""
        params = self.config.model_params

        return lgb.LGBMClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", -1),
            learning_rate=params.get("learning_rate", 0.1),
            num_leaves=params.get("num_leaves", 31),
            subsample=params.get("subsample", 0.8),
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
            verbose=-1,
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Build a dense numeric matrix from the configured feature columns.

        Numeric features pass through; short text features become char
        n-gram counts; everything else is label-encoded.

        Fix: the n-gram vectorizers and label encoders are now fitted on
        the first call and reused afterwards (mirroring the tokenizer
        caching in the neural models), so training and inference share one
        feature space. Previously each call re-fitted on its own data,
        silently producing incompatible columns between fit and predict.
        NOTE(review): with a cached LabelEncoder, categories unseen at fit
        time now raise at transform instead of being re-encoded
        inconsistently — confirm this is the desired failure mode.
        """
        # Created lazily so the base-class __init__ needs no changes.
        if not hasattr(self, "_feature_transformers"):
            self._feature_transformers = {}

        features = []

        for feature_type in self.config.features:
            name = feature_type.value
            if name not in X.columns:
                continue
            column = X[name]

            if name in ["name_length", "word_count"]:
                # Already numeric; just impute missing values.
                features.append(column.fillna(0).values.reshape(-1, 1))

            elif name in ["full_name", "native_name", "surname"]:
                # Character n-grams for text features
                texts = column.fillna("").astype(str)
                vectorizer = self._feature_transformers.get(name)
                if vectorizer is None:
                    vectorizer = CountVectorizer(
                        analyzer="char", ngram_range=(2, 3), max_features=50
                    )
                    vectorizer.fit(texts)
                    self._feature_transformers[name] = vectorizer
                features.append(vectorizer.transform(texts).toarray())

            else:
                # Categorical feature (e.g. province): integer-encode.
                values = column.fillna("unknown").astype(str)
                encoder = self._feature_transformers.get(name)
                if encoder is None:
                    encoder = LabelEncoder()
                    encoder.fit(values)
                    self._feature_transformers[name] = encoder
                features.append(encoder.transform(values).reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
@@ -0,0 +1,44 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class LogisticRegressionModel(TraditionalModel):
    """Logistic Regression with character n-grams"""

    def build_model(self) -> BaseEstimator:
        """Assemble a char n-gram CountVectorizer + LogisticRegression pipeline."""
        opts = self.config.model_params

        steps = [
            (
                "vectorizer",
                CountVectorizer(
                    analyzer="char",
                    ngram_range=opts.get("ngram_range", (2, 5)),
                    max_features=opts.get("max_features", 10000),
                ),
            ),
            (
                "classifier",
                LogisticRegression(
                    max_iter=opts.get("max_iter", 1000),
                    random_state=self.config.random_seed,
                ),
            ),
        ]
        return Pipeline(steps)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Join the configured text columns into one space-separated string per row."""
        selected = [
            X[ft.value].astype(str)
            for ft in self.config.features
            if ft.value in X.columns
        ]

        if len(selected) == 1:
            return selected[0].values

        merged = selected[0].astype(str)
        for series in selected[1:]:
            merged = merged + " " + series.astype(str)
        return merged.values
|
||||||
@@ -0,0 +1,52 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
|
||||||
|
from tensorflow.keras.models import Sequential
|
||||||
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||||
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||||
|
|
||||||
|
from research.neural_network_model import NeuralNetworkModel
|
||||||
|
|
||||||
|
|
||||||
|
class LSTMModel(NeuralNetworkModel):
    """LSTM model for sequence learning"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        """Build a stacked bidirectional-LSTM classifier for a known vocabulary size."""
        opts = kwargs
        embedding_dim = opts.get("embedding_dim", 64)
        lstm_units = opts.get("lstm_units", 32)

        network = Sequential()
        network.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
        # First recurrent layer keeps the full sequence so the second LSTM can consume it
        network.add(Bidirectional(LSTM(lstm_units, return_sequences=True)))
        network.add(Bidirectional(LSTM(lstm_units)))
        network.add(Dense(64, activation="relu"))
        network.add(Dense(2, activation="softmax"))

        network.compile(
            loss="sparse_categorical_crossentropy",
            optimizer="adam",
            metrics=["accuracy"],
        )
        return network

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Tokenize the configured text columns and pad them to fixed-length sequences."""
        corpus = []
        for ft in self.config.features:
            if ft.value in X.columns:
                corpus.extend(X[ft.value].astype(str).tolist())

        if not corpus:
            raise ValueError("No text data found in the provided DataFrame.")

        # Fit the word-level tokenizer once; later calls reuse the same vocabulary
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(char_level=False, lower=True, oov_token="<OOV>")
            self.tokenizer.fit_on_texts(corpus)

        # NOTE(review): with several feature columns, corpus[: len(X)] keeps only
        # the first column's rows — presumably intentional, but worth confirming.
        sequences = self.tokenizer.texts_to_sequences(corpus[: len(X)])
        pad_to = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=pad_to, padding="post")
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class NaiveBayesModel(TraditionalModel):
    """Multinomial Naive Bayes with character n-grams"""

    def build_model(self) -> BaseEstimator:
        """Assemble a char n-gram CountVectorizer + MultinomialNB pipeline."""
        opts = self.config.model_params

        return Pipeline(
            [
                (
                    "vectorizer",
                    CountVectorizer(
                        analyzer="char",
                        ngram_range=opts.get("ngram_range", (1, 4)),
                        max_features=opts.get("max_features", 8000),
                    ),
                ),
                ("classifier", MultinomialNB(alpha=opts.get("alpha", 1.0))),
            ]
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Join the configured text columns into one space-separated string per row."""
        selected = [
            X[ft.value].astype(str)
            for ft in self.config.features
            if ft.value in X.columns
        ]

        if len(selected) == 1:
            return selected[0].values

        merged = selected[0].astype(str)
        for series in selected[1:]:
            merged = merged + " " + series.astype(str)
        return merged.values
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class RandomForestModel(TraditionalModel):
    """Random Forest with engineered features"""

    def build_model(self) -> BaseEstimator:
        """Build a RandomForestClassifier configured from ``self.config.model_params``."""
        params = self.config.model_params

        return RandomForestClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", None),
            random_state=self.config.random_seed,
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Build the numeric design matrix from the configured feature columns.

        Numeric columns pass through; categorical columns are mapped to stable
        integer codes. Category maps are fitted on the first call and reused on
        later calls so training and prediction agree (previously a fresh
        LabelEncoder was fitted per call, so the same category could receive a
        different code at predict time and unseen categories raised an error).
        Unseen categories map to -1.
        """
        if not hasattr(self, "_category_maps"):
            self._category_maps = {}  # column name -> category->code dict

        features = []

        for feature_type in self.config.features:
            name = feature_type.value
            if name not in X.columns:
                continue
            column = X[name]

            if name in ["name_length", "word_count"]:
                # Numerical features; missing values become 0
                features.append(column.fillna(0).values.reshape(-1, 1))
            else:
                # Categorical features: stable sorted-unique codes (matches
                # what LabelEncoder produced on its first fit)
                values = column.fillna("unknown").astype(str)
                mapping = self._category_maps.get(name)
                if mapping is None:
                    mapping = {v: i for i, v in enumerate(sorted(set(values)))}
                    self._category_maps[name] = mapping
                encoded = values.map(lambda v: mapping.get(v, -1)).to_numpy()
                features.append(encoded.reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class SVMModel(TraditionalModel):
    """Support Vector Machine with character n-grams and RBF kernel"""

    def build_model(self) -> BaseEstimator:
        """Assemble a char n-gram TF-IDF + SVC pipeline (probabilities enabled)."""
        opts = self.config.model_params

        steps = [
            (
                "vectorizer",
                TfidfVectorizer(
                    analyzer="char",
                    ngram_range=opts.get("ngram_range", (2, 4)),
                    max_features=opts.get("max_features", 5000),
                ),
            ),
            (
                "classifier",
                SVC(
                    kernel=opts.get("kernel", "rbf"),
                    C=opts.get("C", 1.0),
                    gamma=opts.get("gamma", "scale"),
                    probability=True,  # Enable probability prediction
                    random_state=self.config.random_seed,
                ),
            ),
        ]
        return Pipeline(steps)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Join the configured text columns into one space-separated string per row."""
        selected = [
            X[ft.value].astype(str)
            for ft in self.config.features
            if ft.value in X.columns
        ]

        if len(selected) == 1:
            return selected[0].values

        merged = selected[0].astype(str)
        for series in selected[1:]:
            merged = merged + " " + series.astype(str)
        return merged.values
|
||||||
@@ -0,0 +1,82 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import tensorflow as tf
|
||||||
|
from tensorflow.keras.layers import (
|
||||||
|
Input,
|
||||||
|
Embedding,
|
||||||
|
Dense,
|
||||||
|
GlobalAveragePooling1D,
|
||||||
|
MultiHeadAttention,
|
||||||
|
Dropout,
|
||||||
|
LayerNormalization,
|
||||||
|
)
|
||||||
|
from tensorflow.keras.models import Model
|
||||||
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||||
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||||
|
|
||||||
|
from research.neural_network_model import NeuralNetworkModel
|
||||||
|
|
||||||
|
|
||||||
|
class TransformerModel(NeuralNetworkModel):
    """Transformer-based model"""

    def build_model_with_vocab(self, vocab_size: int, max_len: int = 6, **kwargs) -> Any:
        """Build a small Transformer-encoder classifier for a known vocabulary size."""
        opts = kwargs
        embed_dim = opts.get("embedding_dim", 64)

        inputs = Input(shape=(max_len,))
        token_embed = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)

        # Learned positional embedding, added to the token embedding
        positions = tf.range(start=0, limit=max_len, delta=1)
        position_embed = Embedding(input_dim=max_len, output_dim=embed_dim)(positions)
        hidden = token_embed + position_embed

        hidden = self._transformer_encoder(hidden, opts)
        hidden = GlobalAveragePooling1D()(hidden)
        hidden = Dense(32, activation="relu")(hidden)
        outputs = Dense(2, activation="softmax")(hidden)

        network = Model(inputs, outputs)
        network.compile(
            optimizer="adam",
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"],
        )
        return network

    @classmethod
    def _transformer_encoder(cls, x, cfg_params):
        """Transformer encoder block: self-attention then feed-forward, each with a residual."""
        dropout_rate = cfg_params.get("dropout", 0.1)

        attn = MultiHeadAttention(
            num_heads=cfg_params.get("transformer_num_heads", 2),
            key_dim=cfg_params.get("transformer_head_size", 64),
        )(x, x)
        x = LayerNormalization(epsilon=1e-6)(x + Dropout(dropout_rate)(attn))

        ff = Dense(cfg_params.get("transformer_ff_dim", 128), activation="relu")(x)
        ff = Dense(x.shape[-1])(ff)
        return LayerNormalization(epsilon=1e-6)(x + Dropout(dropout_rate)(ff))

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Tokenize the configured text columns and pad them to fixed-length sequences."""
        corpus = []
        for ft in self.config.features:
            if ft.value in X.columns:
                corpus.extend(X[ft.value].astype(str).tolist())

        if not corpus:
            raise ValueError("No text data found in the provided DataFrame.")

        # Fit the tokenizer once; subsequent calls reuse the same vocabulary
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(oov_token="<OOV>")
            self.tokenizer.fit_on_texts(corpus)

        # NOTE(review): with several feature columns, corpus[: len(X)] keeps only
        # the first column's rows — presumably intentional, but worth confirming.
        sequences = self.tokenizer.texts_to_sequences(corpus[: len(X)])
        pad_to = self.config.model_params.get("max_len", 6)

        return pad_sequences(sequences, maxlen=pad_to, padding="post")
|
||||||
@@ -0,0 +1,52 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import xgboost as xgb
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
|
from research.traditional_model import TraditionalModel
|
||||||
|
|
||||||
|
|
||||||
|
class XGBoostModel(TraditionalModel):
    """XGBoost with engineered features and character embeddings"""

    def build_model(self) -> BaseEstimator:
        """Build an XGBClassifier configured from ``self.config.model_params``."""
        params = self.config.model_params

        return xgb.XGBClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", 6),
            learning_rate=params.get("learning_rate", 0.1),
            subsample=params.get("subsample", 0.8),
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
            eval_metric="logloss",
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Turn the configured feature columns into a single numeric matrix.

        Numeric columns pass through, name-like columns become character
        n-gram counts, and everything else is integer-encoded. Transformers
        are fitted on the first call and reused afterwards so that training
        and prediction share one feature space (previously every call
        re-fitted a fresh vectorizer/encoder, so the columns produced at
        predict time did not correspond to the ones seen during training).
        Unseen categorical values map to -1.
        """
        if not hasattr(self, "_column_transformers"):
            # column name -> fitted CountVectorizer or category->code dict
            self._column_transformers = {}

        features = []

        for feature_type in self.config.features:
            name = feature_type.value
            if name not in X.columns:
                continue
            column = X[name]

            if name in ["name_length", "word_count"]:
                # Numerical features; missing values become 0
                features.append(column.fillna(0).values.reshape(-1, 1))
            elif name in ["full_name", "native_name", "surname"]:
                # Character-level n-gram features for names
                text = column.fillna("").astype(str)
                vectorizer = self._column_transformers.get(name)
                if vectorizer is None:
                    vectorizer = CountVectorizer(
                        analyzer="char", ngram_range=(2, 3), max_features=100
                    )
                    vectorizer.fit(text)
                    self._column_transformers[name] = vectorizer
                features.append(vectorizer.transform(text).toarray())
            else:
                # Categorical features: stable sorted-unique codes (matches
                # what LabelEncoder produced on its first fit)
                values = column.fillna("unknown").astype(str)
                mapping = self._column_transformers.get(name)
                if mapping is None:
                    mapping = {v: i for i, v in enumerate(sorted(set(values)))}
                    self._column_transformers[name] = mapping
                encoded = values.map(lambda v: mapping.get(v, -1)).to_numpy()
                features.append(encoded.reshape(-1, 1))

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
|
||||||
@@ -0,0 +1,201 @@
|
|||||||
|
import logging
|
||||||
|
from abc import abstractmethod
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.metrics import accuracy_score
|
||||||
|
from sklearn.metrics import precision_recall_fscore_support
|
||||||
|
from sklearn.model_selection import StratifiedKFold
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
|
from research.base_model import BaseModel
|
||||||
|
from research.experiment.feature_extractor import FeatureExtractor
|
||||||
|
|
||||||
|
|
||||||
|
class NeuralNetworkModel(BaseModel):
    """Base class for neural network models (TensorFlow/Keras)"""

    @property
    def architecture(self) -> str:
        # Identifies this model family in experiment metadata
        return "neural_network"

    @abstractmethod
    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
        """Build neural network model with known vocabulary size"""
        pass

    def _build_fresh_model(self) -> Any:
        """Build an untrained network sized to the fitted tokenizer's vocabulary.

        Falls back to a vocabulary of 1000 when no tokenizer has been fitted
        yet (should not happen once prepare_features has run).
        """
        vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
        max_len = self.config.model_params.get("max_len", 6)
        return self.build_model_with_vocab(
            vocab_size=vocab_size, max_len=max_len, **self.config.model_params
        )

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the neural network model with deferred building.

        The network is built only after feature preparation because the
        embedding layer needs the tokenizer's vocabulary size.
        """
        logging.info(f"Training {self.__class__.__name__}")

        # Setup feature extraction
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )

        # Extract and prepare features (this also initializes the tokenizer)
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        # Encode labels (fit the encoder on first use, reuse afterwards)
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)

        # Vocabulary size is known now, so the model can finally be built
        self.model = self._build_fresh_model()

        # Train the neural network
        history = self.model.fit(
            X_prepared,
            y_encoded,
            epochs=self.config.model_params.get("epochs", 10),
            batch_size=self.config.model_params.get("batch_size", 64),
            validation_split=0.1,
            verbose=1,
        )

        # Store training history for later reporting
        self.training_history = {
            "accuracy": history.history["accuracy"],
            "loss": history.history["loss"],
            "val_accuracy": history.history.get("val_accuracy", []),
            "val_loss": history.history.get("val_loss", []),
        }

        self.is_fitted = True
        return self

    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> dict[str, np.floating[Any]]:
        """Stratified k-fold CV; a fresh network is trained for every fold."""
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
        y_encoded = self.label_encoder.transform(y)

        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed)

        accuracies = []
        precisions = []
        recalls = []
        f1_scores = []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)):
            # Fresh model for each fold. Neural subclasses implement
            # build_model_with_vocab (not build_model), so build via the helper;
            # the previous self.build_model() call had no neural implementation.
            fold_model = self._build_fresh_model()

            # Train on fold
            if hasattr(fold_model, "fit"):
                fold_model.fit(
                    X_prepared[train_idx],
                    y_encoded[train_idx],
                    epochs=self.config.model_params.get("epochs", 10),
                    batch_size=self.config.model_params.get("batch_size", 32),
                    verbose=0,
                )

            # Predict on validation (argmax over class probabilities)
            y_pred = fold_model.predict(X_prepared[val_idx])
            if len(y_pred.shape) > 1:
                y_pred = y_pred.argmax(axis=1)

            # Calculate metrics
            acc = accuracy_score(y_encoded[val_idx], y_pred)
            prec, rec, f1, _ = precision_recall_fscore_support(
                y_encoded[val_idx], y_pred, average="weighted"
            )

            accuracies.append(acc)
            precisions.append(prec)
            recalls.append(rec)
            f1_scores.append(f1)

        return {
            "accuracy": np.mean(accuracies),
            "accuracy_std": np.std(accuracies),
            "precision": np.mean(precisions),
            "precision_std": np.std(precisions),
            "recall": np.mean(recalls),
            "recall_std": np.std(recalls),
            "f1": np.mean(f1_scores),
            "f1_std": np.std(f1_scores),
        }

    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model.

        Fixes over the previous version: ``train_sizes=None`` now falls back to
        the same default grid TraditionalModel uses (instead of crashing with a
        TypeError when iterating None), and the data is converted to prepared
        feature arrays before splitting (Keras cannot consume the raw DataFrame,
        and positional indexing on a DataFrame/Series was incorrect).
        """
        logging.info(f"Generating learning curve for {self.__class__.__name__}")

        if train_sizes is None:
            train_sizes = [0.1, 0.25, 0.5, 0.75, 1.0]

        # Prepare features and labels once up front
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)

        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)

        learning_curve_data = {
            "train_sizes": [],
            "train_scores": [],
            "val_scores": [],
            "train_scores_std": [],
            "val_scores_std": [],
        }

        # Split the prepared arrays once for validation
        X_train_full, X_val, y_train_full, y_val = train_test_split(
            X_prepared,
            y_encoded,
            test_size=0.2,
            random_state=self.config.random_seed,
            stratify=y_encoded,
        )

        for size in train_sizes:
            train_size = int(len(X_train_full) * size)
            if train_size < 10:  # Minimum training size
                continue

            # Sample training data (arrays support positional indexing)
            indices = np.random.choice(len(X_train_full), train_size, replace=False)
            X_train_subset = X_train_full[indices]
            y_train_subset = y_train_full[indices]

            # Train multiple models for variance estimation
            train_scores = []
            val_scores = []

            for seed in range(3):  # 3 runs for variance
                # Build a fresh network for every run
                model = self._build_fresh_model()

                if hasattr(model, "fit"):
                    model.fit(
                        X_train_subset,
                        y_train_subset,
                        epochs=self.config.model_params.get("epochs", 10),
                        batch_size=self.config.model_params.get("batch_size", 32),
                        validation_data=(X_val, y_val),
                        verbose=0,
                    )

                # Evaluate (networks return class probabilities)
                train_pred = model.predict(X_train_subset)
                val_pred = model.predict(X_val)

                train_scores.append(accuracy_score(y_train_subset, train_pred.argmax(axis=1)))
                val_scores.append(accuracy_score(y_val, val_pred.argmax(axis=1)))

            learning_curve_data["train_sizes"].append(train_size)
            learning_curve_data["train_scores"].append(np.mean(train_scores))
            learning_curve_data["val_scores"].append(np.mean(val_scores))
            learning_curve_data["train_scores_std"].append(np.std(train_scores))
            learning_curve_data["val_scores_std"].append(np.std(val_scores))

        self.learning_curve_data = learning_curve_data
        return learning_curve_data
|
||||||
@@ -0,0 +1,134 @@
|
|||||||
|
import logging
|
||||||
|
from abc import abstractmethod
|
||||||
|
from typing import Dict, Any, List
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.model_selection import StratifiedKFold, cross_val_score
|
||||||
|
from sklearn.model_selection import learning_curve
|
||||||
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
|
from research.base_model import BaseModel
|
||||||
|
from research.experiment.feature_extractor import FeatureExtractor
|
||||||
|
|
||||||
|
|
||||||
|
class TraditionalModel(BaseModel):
    """Base class for traditional ML models (scikit-learn compatible)"""

    @property
    def architecture(self) -> str:
        # Identifies this model family in experiment metadata
        return "traditional"

    @abstractmethod
    def build_model(self) -> BaseEstimator:
        """Build and return the sklearn model instance"""
        pass

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the traditional ML model"""
        logging.info(f"Training {self.__class__.__name__}")

        # Lazily create the estimator and the feature extractor
        if self.model is None:
            self.model = self.build_model()
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )

        # Raw data -> engineered features -> model-ready matrix
        prepared = self.prepare_features(self.feature_extractor.extract_features(X))

        # Labels: fit the encoder on first use, reuse it afterwards
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            targets = self.label_encoder.fit_transform(y)
        else:
            targets = self.label_encoder.transform(y)

        self.model.fit(prepared, targets)
        self.is_fitted = True

        return self

    def cross_validate(self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> Dict[str, float]:
        """Run stratified k-fold CV and return mean/std for the configured metrics."""
        prepared = self.prepare_features(self.feature_extractor.extract_features(X))
        targets = self.label_encoder.transform(y)

        splitter = StratifiedKFold(
            n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
        )

        results = {}

        # Accuracy is always reported
        scores = cross_val_score(self.model, prepared, targets, cv=splitter, scoring="accuracy")
        results["accuracy"] = scores.mean()
        results["accuracy_std"] = scores.std()

        # Remaining metrics only when requested by the config
        for metric in ["precision", "recall", "f1"]:
            if metric not in self.config.metrics:
                continue
            scores = cross_val_score(
                self.model, prepared, targets, cv=splitter, scoring=f"{metric}_weighted"
            )
            results[metric] = scores.mean()
            results[f"{metric}_std"] = scores.std()

        return results

    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        logging.info(f"Generating learning curve for {self.__class__.__name__}")

        if train_sizes is None:
            train_sizes = [0.1, 0.25, 0.5, 0.75, 1.0]

        # Make sure the feature extractor exists before preparing data
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )
        prepared = self.prepare_features(self.feature_extractor.extract_features(X))

        # Encode labels (fit on first use)
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            targets = self.label_encoder.fit_transform(y)
        else:
            targets = self.label_encoder.transform(y)

        try:
            sizes_abs, train_matrix, val_matrix = learning_curve(
                self.build_model(),
                prepared,
                targets,
                train_sizes=train_sizes,
                cv=3,  # Use 3-fold CV for speed
                scoring="accuracy",
                random_state=self.config.random_seed,
            )

            learning_curve_data = {
                "train_sizes": sizes_abs.tolist(),
                "train_scores": train_matrix.mean(axis=1).tolist(),
                "val_scores": val_matrix.mean(axis=1).tolist(),
                "train_scores_std": train_matrix.std(axis=1).tolist(),
                "val_scores_std": val_matrix.std(axis=1).tolist(),
            }
        except Exception as e:
            logging.warning(f"Could not generate learning curve: {e}")
            return {}

        self.learning_curve_data = learning_curve_data
        return learning_curve_data
|
||||||
@@ -0,0 +1,152 @@
|
|||||||
|
#!.venv/bin/python3
|
||||||
|
import logging
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from research.model_trainer import ModelTrainer
|
||||||
|
|
||||||
|
|
||||||
|
def train_baseline_models():
    """
    Quick function to train all baseline models and save artifacts.

    Returns the list of experiment ids produced by the trainer.
    """
    logger = logging.getLogger(__name__)
    logger.info("Training Baseline Models with Artifact Saving")

    trainer = ModelTrainer()

    # Define baseline model configurations
    baseline_configs = [
        {
            "model_type": "logistic_regression",
            "features": ["full_name"],
            "model_params": {"ngram_range": [2, 5], "max_features": 10000},
        },
        {
            "model_type": "logistic_regression",
            "features": ["native_name"],
            "model_params": {"ngram_range": [2, 4], "max_features": 5000},
        },
        {
            "model_type": "logistic_regression",
            "features": ["surname"],
            "model_params": {"ngram_range": [2, 4], "max_features": 5000},
        },
        {
            "model_type": "random_forest",
            "features": ["name_length", "word_count", "province"],
            "model_params": {"n_estimators": 100, "max_depth": 10},
        },
        {
            "model_type": "svm",
            "features": ["full_name"],
            "model_params": {"kernel": "rbf", "C": 1.0},
        },
        {"model_type": "naive_bayes", "features": ["full_name"], "model_params": {"alpha": 1.0}},
    ]

    # Train all baseline models
    experiment_ids = trainer.train_multiple_models("baseline", baseline_configs)

    # Show summary. Uses lazy %-formatting (the old calls built f-strings
    # eagerly, and the header f-string had no placeholders at all).
    logger.info("\n Training Summary:")
    for exp_id in experiment_ids:
        experiment = trainer.experiment_tracker.get_experiment(exp_id)
        if experiment:
            acc = experiment.test_metrics.get("accuracy", 0)
            logger.info(" %s: %.4f accuracy", experiment.config.name, acc)

    return experiment_ids
|
||||||
|
|
||||||
|
|
||||||
|
def train_neural_networks():
    """Train neural network models (LSTM, CNN, transformer) with preset parameters.

    Returns:
        list: Experiment ids produced by the trainer, one per configuration.
    """
    # Use a module-level logger for consistency with train_baseline_models
    # (the original logged through the root logger via logging.info).
    logger = logging.getLogger(__name__)
    logger.info("Training Neural Network Models")

    trainer = ModelTrainer()

    neural_configs = [
        {
            "model_type": "lstm",
            "features": ["full_name"],
            "model_params": {
                "embedding_dim": 64,
                "lstm_units": 32,
                "epochs": 10,
                "batch_size": 64,
                "max_len": 6,
            },
        },
        {
            "model_type": "cnn",
            "features": ["full_name"],
            "model_params": {
                "embedding_dim": 64,
                "filters": 64,
                "kernel_size": 3,
                "epochs": 10,
                "batch_size": 64,
                "max_len": 20,  # Character level
            },
        },
        {
            "model_type": "transformer",
            "features": ["full_name"],
            "model_params": {
                "embedding_dim": 64,
                "transformer_num_heads": 2,
                "epochs": 10,
                "batch_size": 64,
                "max_len": 6,
            },
        },
    ]

    experiment_ids = trainer.train_multiple_models("neural_networks", neural_configs)
    return experiment_ids
|
def main():
    """Command-line entry point for training models.

    Modes:
        baseline -- train all baseline (classical) models
        neural   -- train all neural network models
        list     -- show saved models (default)

    Passing --model-type and --name trains a single named model instead.
    """
    parser = argparse.ArgumentParser(description="Train DRC Names Models")
    parser.add_argument(
        "--mode",
        choices=["baseline", "neural", "list"],
        default="list",
        help="Training mode",
    )
    parser.add_argument("--model-type", type=str, help="Specific model type to train")
    parser.add_argument("--name", type=str, help="Model name")

    args = parser.parse_args()

    trainer = ModelTrainer()

    # BUG FIX: the single-model branch was the LAST elif after the mode
    # checks, but --mode defaults to "list", so one of the mode branches
    # always matched first and the --model-type/--name path was unreachable.
    # Check the explicit single-model request first.
    if args.model_type and args.name:
        # Train specific model
        trainer.train_single_model(
            model_name=args.name, model_type=args.model_type, features=["full_name"]
        )

    elif args.mode == "baseline":
        train_baseline_models()

    elif args.mode == "neural":
        train_neural_networks()

    elif args.mode == "list":
        logging.info("📋 Saved Models:")
        saved_models = trainer.list_saved_models()
        if not saved_models.empty:
            logging.info(saved_models.to_string(index=False))
        else:
            logging.info("No saved models found.")
||||||
|
# Script entry point: parse CLI arguments and dispatch to the selected training mode.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,76 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
from core.utils import get_data_file_path
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset(file_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    On any failure, surface the problem in the Streamlit UI and fall back
    to an empty DataFrame so the page can keep rendering.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame()
    return frame
|
class Dashboard:
    """Streamlit landing page: headline dataset metrics plus recent experiments."""

    def __init__(self, config, experiment_tracker, experiment_runner):
        self.config = config
        self.experiment_tracker = experiment_tracker
        self.experiment_runner = experiment_runner

    def index(self):
        """Render the dashboard: four metric columns, then the last five experiments."""
        st.header("Dashboard")
        col1, col2, col3, col4 = st.columns(4)

        # Load basic statistics
        try:
            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
            if data_path.exists():
                df = load_dataset(str(data_path))

                with col1:
                    st.metric("Total Names", f"{len(df):,}")

                with col2:
                    # BUG FIX: df.get("annotated", 0) returns the scalar 0 when the
                    # column is missing, so (0 == 1) is a plain bool and calling
                    # .sum() on it raises AttributeError. Guard on column presence.
                    if "annotated" in df.columns:
                        annotated = int((df["annotated"] == 1).sum())
                    else:
                        annotated = 0
                    st.metric("Annotated Names", f"{annotated:,}")

                with col3:
                    provinces = df["province"].nunique() if "province" in df.columns else 0
                    st.metric("Provinces", provinces)

                with col4:
                    if "sex" in df.columns:
                        gender_dist = df["sex"].value_counts()
                        # max(..., 1) guards against division by zero when no "m" rows exist.
                        ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
                        st.metric("F/M Ratio", f"{ratio:.2f}")
            else:
                st.warning("No processed data found. Please run data processing first.")

        except Exception as e:
            st.error(f"Error loading dashboard data: {e}")

        # Recent experiments
        st.subheader("Recent Experiments")
        experiments = self.experiment_tracker.list_experiments()[:5]

        if experiments:
            exp_data = []
            for exp in experiments:
                exp_data.append(
                    {
                        "Name": exp.config.name,
                        "Model": exp.config.model_type,
                        "Status": exp.status.value,
                        "Accuracy": (
                            f"{exp.test_metrics.get('accuracy', 0):.3f}"
                            if exp.test_metrics
                            else "N/A"
                        ),
                        "Date": exp.start_time.strftime("%Y-%m-%d %H:%M"),
                    }
                )

            st.dataframe(pd.DataFrame(exp_data), use_container_width=True)
        else:
            st.info("No experiments found. Create your first experiment in the Experiments tab!")
@@ -0,0 +1,154 @@
|
|||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import plotly.express as px
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
from core.utils import get_data_file_path
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset(file_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    On any failure, surface the problem in the Streamlit UI and fall back
    to an empty DataFrame so the page can keep rendering.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame()
    return frame
|
class DataOverview:
    """Streamlit page showing dataset statistics, quality charts, samples, and CSV export."""

    def __init__(self, config):
        self.config = config

    def index(self):
        """Render the data-overview page for a user-selected dataset file."""
        st.header("Data Overview")
        data_files = {
            "Names": self.config.data.input_file,
            "Featured Dataset": self.config.data.output_files["featured"],
            "Evaluation Dataset": self.config.data.output_files["evaluation"],
            "Male Names": self.config.data.output_files["males"],
            "Female Names": self.config.data.output_files["females"],
        }

        selected_file = st.selectbox("Select Dataset", list(data_files.keys()))
        file_path = get_data_file_path(data_files[selected_file], self.config)

        if not file_path.exists():
            st.warning(f"Dataset not found: {file_path}")
            st.warning("Please run data processing first to generate datasets.")
            return

        # Load and display data
        df = load_dataset(str(file_path))

        if df.empty:
            st.error("Failed to load dataset")
            return

        self._render_metrics(df)
        self._render_quality(df)
        self._render_structure(df)
        self._render_sample_and_export(df, selected_file)

    def _render_metrics(self, df):
        """Top row of headline metrics (record count, annotation %, word/length averages)."""
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Total Records", f"{len(df):,}")

        with col2:
            if "annotated" in df.columns:
                annotated_pct = (df["annotated"] == 1).mean() * 100
                st.metric("Annotated", f"{annotated_pct:.1f}%")

        with col3:
            if "words" in df.columns:
                st.metric("Avg Words", f"{df['words'].mean():.1f}")

        with col4:
            if "length" in df.columns:
                st.metric("Avg Length", f"{df['length'].mean():.0f}")

    def _render_quality(self, df):
        """Missing-value bar chart and gender-distribution pie chart."""
        st.subheader("Data Quality Analysis")
        col1, col2 = st.columns(2)

        with col1:
            # Missing values
            missing_data = df.isnull().sum()
            if missing_data.sum() > 0:
                fig = px.bar(
                    x=missing_data.index, y=missing_data.values, title="Missing Values by Column"
                )
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.success("No missing values found")

        with col2:
            # Gender distribution
            if "sex" in df.columns:
                gender_counts = df["sex"].value_counts()
                fig = px.pie(
                    values=gender_counts.values,
                    names=gender_counts.index,
                    title="Gender Distribution",
                )
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)

    def _render_structure(self, df):
        """Word-count and province distributions; skipped when 'words' is absent."""
        if "words" not in df.columns:
            return

        st.subheader("Name Structure Analysis")
        col1, col2 = st.columns(2)

        with col1:
            word_dist = df["words"].value_counts().sort_index()
            fig = px.bar(
                x=word_dist.index,
                y=word_dist.values,
                title="Distribution of Word Count in Names",
            )
            fig.update_layout(height=400)
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            # Province distribution
            if "province" in df.columns:
                province_counts = df["province"].value_counts().head(10)
                fig = px.bar(
                    x=province_counts.values,
                    y=province_counts.index,
                    orientation="h",
                    title="Top 10 Provinces by Name Count",
                )
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)

    def _render_sample_and_export(self, df, selected_file):
        """Column picker, sample table, and CSV download."""
        st.subheader("Sample Data")

        if not df.empty:
            preferred = ["name", "sex", "province", "words"]
            columns_to_show = st.multiselect(
                "Select columns to display",
                df.columns.tolist(),
                default=(
                    preferred
                    if all(col in df.columns for col in preferred)
                    else df.columns[:5].tolist()
                ),
            )

            if columns_to_show:
                # BUG FIX: st.slider raised for small datasets because the fixed
                # min (10) and default (50) could exceed max (len(df)). Clamp all
                # three so min <= default <= max always holds.
                max_rows = min(1000, len(df))
                sample_size = st.slider(
                    "Number of rows to display",
                    min(10, max_rows),
                    max_rows,
                    min(50, max_rows),
                )
                st.dataframe(df[columns_to_show].head(sample_size), use_container_width=True)

        # Data export
        st.subheader("Export Data")
        if st.button("Download as CSV"):
            csv = df.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name=f"{selected_file.lower().replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.csv",
                mime="text/csv",
            )
@@ -0,0 +1,127 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import plotly.express as px
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
from web.log_reader import LogReader
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset(file_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    On any failure, surface the problem in the Streamlit UI and fall back
    to an empty DataFrame so the page can keep rendering.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame()
    return frame
|
class DataProcessing:
    """Streamlit page showing pipeline progress, recent log entries, and log statistics."""

    # Maps a log level to the Streamlit widget used to render entries of that level.
    # Unknown levels fall back to plain text (see index()).
    _LEVEL_RENDERERS = {"ERROR": st.error, "WARNING": st.warning, "INFO": st.info}

    def __init__(self, config, pipeline_monitor):
        self.config = config
        self.pipeline_monitor = pipeline_monitor

    def index(self):
        """Render overall/per-step pipeline progress and the recent-log viewer."""
        st.header("Data Processing Pipeline")
        status = self.pipeline_monitor.get_pipeline_status()

        # Overall progress
        st.progress(status["overall_completion"] / 100)
        st.write(f"Overall Progress: {status['overall_completion']:.1f}%")

        # Step details
        for step_name, step_status in status["steps"].items():
            with st.expander(f"{step_name.replace('_', ' ').title()} - {step_status['status']}"):
                col1, col2, col3 = st.columns(3)

                with col1:
                    st.metric("Processed Batches", step_status["processed_batches"])
                with col2:
                    st.metric("Total Batches", step_status["total_batches"])
                with col3:
                    st.metric("Failed Batches", step_status["failed_batches"])

                if step_status["completion_percentage"] > 0:
                    st.progress(step_status["completion_percentage"] / 100)

        # Read actual log entries from the log file
        st.subheader("Recent Processing Logs")
        try:
            # NOTE(review): file name is hard-coded to the development environment;
            # confirm whether it should come from config instead.
            log_file_path = self.config.paths.logs_dir / "pipeline.development.log"
            log_reader = LogReader(log_file_path)

            # Options for filtering logs
            col1, col2 = st.columns(2)
            with col1:
                log_level_filter = st.selectbox(
                    "Filter by Level",
                    ["All", "INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"],
                    key="log_level_filter",
                )

            with col2:
                num_entries = st.number_input(
                    "Number of entries",
                    min_value=5,
                    max_value=50,
                    value=10,
                    key="num_log_entries",
                )

            # Get log entries based on filter
            if log_level_filter == "All":
                log_entries = log_reader.read_last_entries(num_entries)
            else:
                log_entries = log_reader.read_entries_by_level(log_level_filter, num_entries)

            if log_entries:
                for entry in log_entries:
                    # Dispatch on level instead of repeating the same format
                    # string in four if/elif branches.
                    render = self._LEVEL_RENDERERS.get(entry.level, st.text)
                    render(
                        f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] "
                        f"{entry.level}: {entry.message}"
                    )

                self._render_log_stats(log_reader)
            else:
                st.info("No log entries found or log file is empty.")

        except Exception as e:
            st.error(f"Error reading log file: {e}")

    def _render_log_stats(self, log_reader):
        """Show per-level metric counts and a bar chart of the log-level distribution."""
        st.subheader("Log Statistics")
        log_stats = log_reader.get_log_stats()

        if not log_stats:
            return

        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Total Lines", log_stats.get('total_lines', 0))
        with col2:
            st.metric("INFO", log_stats.get('INFO', 0))
        with col3:
            st.metric("WARNING", log_stats.get('WARNING', 0))
        with col4:
            st.metric("ERROR", log_stats.get('ERROR', 0))

        # Log level distribution chart
        levels = ['INFO', 'WARNING', 'ERROR', 'DEBUG', 'CRITICAL']
        counts = [log_stats.get(level, 0) for level in levels]

        if sum(counts) > 0:
            fig = px.bar(
                x=levels,
                y=counts,
                title="Log Entries by Level",
                color=levels,
                color_discrete_map={
                    'INFO': 'blue',
                    'WARNING': 'orange',
                    'ERROR': 'red',
                    'DEBUG': 'gray',
                    'CRITICAL': 'darkred'
                },
            )
            st.plotly_chart(fig, use_container_width=True)
@@ -0,0 +1,185 @@
|
|||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class LogEntry:
    """Represents a single log entry."""

    # Parsed timestamp of the entry (millisecond precision, per the ",%f" format).
    timestamp: datetime
    # Name of the logger that emitted the entry.
    logger: str
    # Log level exactly as written in the file (e.g. "INFO", "ERROR").
    level: str
    # Message text following the level field.
    message: str
    # The original, unparsed log line.
    raw_line: str
|
class LogReader:
    """Utility class for reading and parsing log files."""

    def __init__(self, log_file_path: Path):
        """Initialize the log reader with a log file path."""
        self.log_file_path = Path(log_file_path)
        # Pattern to match Python logging format: timestamp - logger - level - message
        self.log_pattern = re.compile(
            r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (.+?) - (\w+) - (.+)'
        )

    def _read_lines(self) -> Optional[List[str]]:
        """Read all lines of the log file; return None if missing or unreadable.

        Centralizes the open/readlines/except boilerplate that was duplicated
        across every public method.
        """
        if not self.log_file_path.exists():
            return None
        try:
            with open(self.log_file_path, 'r', encoding='utf-8') as file:
                return file.readlines()
        except Exception as e:
            # Best-effort reader feeding a UI page: report and degrade gracefully.
            print(f"Error reading log file: {e}")
            return None

    def _collect_reversed(self, lines: List[str], count: int, predicate) -> List[LogEntry]:
        """Walk lines newest-first, keep parseable entries matching predicate,
        stop after `count`, and return them oldest-first."""
        entries: List[LogEntry] = []
        for line in reversed(lines):
            entry = self._parse_log_line(line.strip())
            if entry and predicate(entry):
                entries.append(entry)
                if len(entries) >= count:
                    break
        return list(reversed(entries))

    def read_last_entries(self, count: int = 10) -> List[LogEntry]:
        """Read the last N parseable entries from the log file, oldest-first.

        BUG FIX: the previous implementation only scanned the last count*2 raw
        lines, so files with many non-matching lines (e.g. multi-line
        tracebacks) could return fewer than N entries even though older
        matches existed. Now the whole file is scanned from the end.
        """
        lines = self._read_lines()
        if lines is None:
            return []
        return self._collect_reversed(lines, count, lambda entry: True)

    def read_entries_by_level(self, level: str, count: int = 50) -> List[LogEntry]:
        """Read up to N entries whose level matches (case-insensitive), oldest-first."""
        lines = self._read_lines()
        if lines is None:
            return []
        wanted = level.upper()
        return self._collect_reversed(lines, count, lambda entry: entry.level.upper() == wanted)

    def read_entries_since(self, since: datetime, count: int = 100) -> List[LogEntry]:
        """Read up to N entries with timestamp >= since, oldest-first.

        Assumes the file is chronologically ordered: scanning stops at the
        first parseable entry older than `since`.
        """
        lines = self._read_lines()
        if lines is None:
            return []

        entries: List[LogEntry] = []
        for line in reversed(lines):
            entry = self._parse_log_line(line.strip())
            if not entry:
                continue
            if entry.timestamp < since:
                # Stop reading once we've gone past the since time.
                break
            entries.append(entry)
            if len(entries) >= count:
                break

        return list(reversed(entries))

    def get_log_stats(self) -> Dict[str, int]:
        """Return total raw line count plus a count of parsed entries per level."""
        lines = self._read_lines()
        if lines is None:
            return {}

        stats = {
            'total_lines': len(lines),
            'INFO': 0,
            'WARNING': 0,
            'ERROR': 0,
            'DEBUG': 0,
            'CRITICAL': 0,
        }

        for line in lines:
            entry = self._parse_log_line(line.strip())
            if entry:
                level = entry.level.upper()
                if level in stats:
                    stats[level] += 1

        return stats

    def _parse_log_line(self, line: str) -> Optional[LogEntry]:
        """Parse a single log line into a LogEntry; None if it doesn't match the format."""
        if not line:
            return None

        match = self.log_pattern.match(line)
        if not match:
            return None

        try:
            timestamp_str, logger, level, message = match.groups()
            timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S,%f')

            return LogEntry(
                timestamp=timestamp,
                logger=logger,
                level=level,
                message=message,
                raw_line=line,
            )
        except ValueError:
            # strptime rejects malformed timestamps; treat the line as non-matching.
            return None
|
||||||
|
class MultiLogReader:
    """Reader that merges entries from every ``*.log`` file in a directory."""

    def __init__(self, log_directory: Path):
        """Initialize with a directory containing log files."""
        self.log_directory = Path(log_directory)

    def get_available_log_files(self) -> List[Path]:
        """Return the ``*.log`` files in the directory (empty if it doesn't exist)."""
        if not self.log_directory.exists():
            return []
        return list(self.log_directory.glob('*.log'))

    def read_from_all_files(self, count: int = 10) -> List[LogEntry]:
        """Collect recent entries across all log files; returns newest-first,
        truncated to the top `count` overall."""
        merged: List[LogEntry] = []
        for log_file in self.get_available_log_files():
            merged.extend(LogReader(log_file).read_last_entries(count))

        # Newest entries first across all files.
        merged.sort(key=lambda entry: entry.timestamp, reverse=True)
        return merged[:count]
||||||
Reference in New Issue
Block a user