diff --git a/app.py b/app.py
index 50f27a4..67ed4af 100644
--- a/app.py
+++ b/app.py
@@ -1,7 +1,7 @@
 #!.venv/bin/python3
 import streamlit as st
 
-from core.config import setup_config
+from core.config import get_config
 from core.utils.data_loader import DataLoader
 from interface.configuration import Configuration
 from interface.dashboard import Dashboard
@@ -26,7 +26,7 @@ st.set_page_config(
 @st.cache_data
 def load_config():
     """Load application configuration with unified setup"""
-    return setup_config(env="development")
+    return get_config()
 
 
 class StreamlitApp:
diff --git a/config/pipeline.development.yaml b/config/pipeline.development.yaml
index 110ca54..e2d1437 100644
--- a/config/pipeline.development.yaml
+++ b/config/pipeline.development.yaml
@@ -37,8 +37,8 @@ data:
   split_by_gender: true
   evaluation_fraction: 0.2
   random_seed: 42
-  max_dataset_size: 10_000 # Limit to 10k records for development/testing
-  balance_by_sex: true # Balance male/female samples when limiting
+  max_dataset_size: ~ # No limit; set e.g. 10_000 to cap records for development/testing
+  balance_by_sex: false # Balance male/female samples when limiting the dataset size
 
 # Enhanced logging for development
 logging:
diff --git a/core/config/__init__.py b/core/config/__init__.py
index 12cbab4..322aded 100644
--- a/core/config/__init__.py
+++ b/core/config/__init__.py
@@ -2,9 +2,10 @@
 import logging
 from pathlib import Path
 from typing import Optional, Union
 
-from core.config.config_manager import ConfigManager
-from core.config.logging_config import LoggingConfig
-from core.config.pipeline_config import PipelineConfig
+from core.utils import ensure_directories
+from .config_manager import ConfigManager
+from .logging_config import LoggingConfig
+from .pipeline_config import PipelineConfig
 
 config_manager = ConfigManager()
@@ -43,7 +44,6 @@ def setup_config(config_path: Optional[Path] = None, env: str = "development") -> PipelineConfig:
     setup_logging(config)
 
     # Ensure required directories exist
-    from core.utils import ensure_directories
     ensure_directories(config)
 
     logging.info(f"Loaded configuration: {config.name} v{config.version}")
diff --git a/core/utils/__init__.py b/core/utils/__init__.py
index 14efd3d..845a116 100644
--- a/core/utils/__init__.py
+++ b/core/utils/__init__.py
@@ -1,13 +1,17 @@
 import logging
 from contextlib import contextmanager
 from pathlib import Path
+from typing import TYPE_CHECKING
 
-from core.config import get_config, PipelineConfig
+if TYPE_CHECKING:
+    from core.config import PipelineConfig
 
 
 @contextmanager
 def temporary_config_override(**overrides):
     """Context manager for temporarily overriding configuration"""
+    from core.config import get_config
+
     config = get_config()
     original_values = {}
 
@@ -25,7 +29,7 @@
             setattr(config, key, value)
 
 
-def ensure_directories(config: PipelineConfig) -> None:
+def ensure_directories(config: "PipelineConfig") -> None:
     """Ensure all required directories exist"""
     directories = [
         config.paths.data_dir,
@@ -42,16 +46,16 @@ def ensure_directories(config: PipelineConfig) -> None:
     logging.info("Ensured all required directories exist")
 
 
-def get_data_file_path(filename: str, config: PipelineConfig) -> Path:
+def get_data_file_path(filename: str, config: "PipelineConfig") -> Path:
     """Get full path for a data file"""
     return config.paths.data_dir / filename
 
 
-def get_model_file_path(filename: str, config: PipelineConfig) -> Path:
+def get_model_file_path(filename: str, config: "PipelineConfig") -> Path:
     """Get full path for a model file"""
     return config.paths.models_dir / filename
 
 
-def get_output_file_path(filename: str, config: PipelineConfig) -> Path:
+def get_output_file_path(filename: str, config: "PipelineConfig") -> Path:
     """Get full path for an output file"""
     return config.paths.outputs_dir / filename
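Note on the import changes above: `core/config/__init__.py` now imports `ensure_directories` at module load, so `core/utils/__init__.py` can no longer import `core.config` at its own top level without creating a cycle. The diff breaks the cycle with the standard two-part pattern: a `TYPE_CHECKING`-guarded import plus string annotations for type hints, and a deferred in-function import for the runtime dependency. A minimal standalone sketch of the pattern (module names here are illustrative, not the repo's):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated by type checkers only, never at runtime,
    # so it cannot re-trigger the import cycle.
    from package_a import Config


def describe(config: "Config") -> str:
    # String annotation: no runtime import of package_a needed.
    return f"config: {config!r}"


def load():
    # Deferred import: resolved on first call, after both
    # modules have finished initializing.
    from package_a import get_config
    return get_config()
```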
diff --git a/research/experiment/experiment_runner.py b/research/experiment/experiment_runner.py
index 74d17e3..d4592e0 100644
--- a/research/experiment/experiment_runner.py
+++ b/research/experiment/experiment_runner.py
@@ -3,6 +3,7 @@ from datetime import datetime
 from pathlib import Path
 from typing import List, Dict, Optional
 
+import joblib
 import numpy as np
 import pandas as pd
 from sklearn.metrics import confusion_matrix
@@ -207,13 +208,36 @@ class ExperimentRunner:
         experiment = self.tracker.get_experiment(experiment_id)
 
         if experiment and experiment.model_path:
-            return BaseModel.load(experiment.model_path)
+            try:
+                # Load the saved model data and recreate the model instance from the saved config
+                model_data = joblib.load(experiment.model_path)
+                config = ExperimentConfig.from_dict(model_data["config"])
+                model = create_model(config)
+
+                # Restore the saved state
+                model.model = model_data["model"]
+                model.feature_extractor = model_data["feature_extractor"]
+                model.label_encoder = model_data["label_encoder"]
+                model.tokenizer = model_data.get("tokenizer")
+                model.is_fitted = model_data["is_fitted"]
+                model.training_history = model_data.get("training_history", {})
+                model.learning_curve_data = model_data.get("learning_curve_data", {})
+
+                # Restore vectorizers and encoders for models that use them (like XGBoost)
+                if "vectorizers" in model_data and hasattr(model, 'vectorizers'):
+                    model.vectorizers = model_data["vectorizers"]
+                if "label_encoders" in model_data and hasattr(model, 'label_encoders'):
+                    model.label_encoders = model_data["label_encoders"]
+
+                return model
+
+            except Exception as e:
+                logging.error(f"Failed to load model for experiment {experiment_id}: {e}")
+                return None
 
         return None
 
-    def compare_experiments(
-        self, experiment_ids: List[str], metric: str = "accuracy"
-    ) -> pd.DataFrame:
+    def compare_experiments(self, experiment_ids: List[str], metric: str = "accuracy") -> pd.DataFrame:
         """Compare experiments and return analysis"""
         comparison_df = self.tracker.compare_experiments(experiment_ids)
 
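The loader above replaces `BaseModel.load` with a manual `joblib` round-trip, which only works if the save path wrote a dict with matching keys. For reference, a hypothetical save-side counterpart is sketched below; the key set is inferred from the loader, and `config.to_dict()` is an assumed serializer, not confirmed by this diff:

```python
import joblib


def save_model(model, path):
    # Mirror of the loader's expectations: every key read there
    # must be written here (keys inferred, not confirmed).
    model_data = {
        "config": model.config.to_dict(),  # assumed serializer
        "model": model.model,
        "feature_extractor": model.feature_extractor,
        "label_encoder": model.label_encoder,
        "tokenizer": getattr(model, "tokenizer", None),
        "is_fitted": model.is_fitted,
        "training_history": getattr(model, "training_history", {}),
        "learning_curve_data": getattr(model, "learning_curve_data", {}),
    }
    # Persist the per-column vectorizers/encoders kept by the
    # tree-based models so prediction reuses the same feature space.
    if hasattr(model, "vectorizers"):
        model_data["vectorizers"] = model.vectorizers
    if hasattr(model, "label_encoders"):
        model_data["label_encoders"] = model.label_encoders
    joblib.dump(model_data, path)
```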
diff --git a/research/models/lightgbm_model.py b/research/models/lightgbm_model.py
index 6efc4c6..2a4256a 100644
--- a/research/models/lightgbm_model.py
+++ b/research/models/lightgbm_model.py
@@ -11,6 +11,12 @@ from research.traditional_model import TraditionalModel
 class LightGBMModel(TraditionalModel):
     """LightGBM with engineered features"""
 
+    def __init__(self, config):
+        super().__init__(config)
+        # Store vectorizers and encoders to ensure consistent feature space
+        self.vectorizers = {}
+        self.label_encoders = {}
+
     def build_model(self) -> BaseEstimator:
         params = self.config.model_params
 
@@ -33,19 +39,58 @@
         column = X[feature_type.value]
 
         if feature_type.value in ["name_length", "word_count"]:
+            # Numerical features
             features.append(column.fillna(0).values.reshape(-1, 1))
         elif feature_type.value in ["full_name", "native_name", "surname"]:
-            # Character n-grams for text features
-            vectorizer = CountVectorizer(
-                analyzer="char", ngram_range=(2, 3), max_features=50
-            )
-            char_features = vectorizer.fit_transform(
-                column.fillna("").astype(str)
-            ).toarray()
+            # Character-level features for names
+            feature_key = f"vectorizer_{feature_type.value}"
+
+            if feature_key not in self.vectorizers:
+                # First time - create and fit vectorizer
+                self.vectorizers[feature_key] = CountVectorizer(
+                    analyzer="char", ngram_range=(2, 3), max_features=50
+                )
+                char_features = self.vectorizers[feature_key].fit_transform(
+                    column.fillna("").astype(str)
+                ).toarray()
+            else:
+                # Subsequent times - use existing vectorizer
+                char_features = self.vectorizers[feature_key].transform(
+                    column.fillna("").astype(str)
+                ).toarray()
+
             features.append(char_features)
         else:
-            le = LabelEncoder()
-            encoded = le.fit_transform(column.fillna("unknown").astype(str))
+            # Categorical features
+            feature_key = f"encoder_{feature_type.value}"
+
+            if feature_key not in self.label_encoders:
+                # First time - create and fit encoder
+                self.label_encoders[feature_key] = LabelEncoder()
+                encoded = self.label_encoders[feature_key].fit_transform(
+                    column.fillna("unknown").astype(str)
+                )
+            else:
+                # Subsequent times - use existing encoder
+                # Handle unseen labels by mapping them to a default value
+                column_clean = column.fillna("unknown").astype(str)
+
+                # Get the classes the encoder was trained on
+                known_classes = set(self.label_encoders[feature_key].classes_)
+
+                # Map unseen values to "unknown" if it exists, otherwise to the first class
+                if "unknown" in known_classes:
+                    default_class = "unknown"
+                else:
+                    default_class = self.label_encoders[feature_key].classes_[0]
+
+                # Replace unseen values with default
+                column_mapped = column_clean.apply(
+                    lambda x: x if x in known_classes else default_class
+                )
+
+                encoded = self.label_encoders[feature_key].transform(column_mapped)
+
             features.append(encoded.reshape(-1, 1))
 
     return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
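The motivation for caching the fitted `CountVectorizer` is that calling `fit_transform` on every batch rebuilds the n-gram vocabulary from whatever data it sees, so training and prediction would each get their own, incompatible feature columns. A standalone illustration with toy data (not repo code):

```python
from sklearn.feature_extraction.text import CountVectorizer

train_names = ["anna", "boris", "carla"]
test_names = ["zlata", "quinn"]

vec = CountVectorizer(analyzer="char", ngram_range=(2, 3), max_features=50)
X_train = vec.fit_transform(train_names)  # fixes the n-gram vocabulary
X_test = vec.transform(test_names)        # same columns; unseen n-grams are dropped
assert X_train.shape[1] == X_test.shape[1]

refit = CountVectorizer(analyzer="char", ngram_range=(2, 3), max_features=50)
X_leaky = refit.fit_transform(test_names)  # new vocabulary: columns no longer align
print(X_test.shape[1], X_leaky.shape[1])   # widths typically differ
```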
diff --git a/research/models/xgboost_model.py b/research/models/xgboost_model.py
index cefc703..454d807 100644
--- a/research/models/xgboost_model.py
+++ b/research/models/xgboost_model.py
@@ -11,6 +11,12 @@
 class XGBoostModel(TraditionalModel):
     """XGBoost with engineered features and character embeddings"""
 
+    def __init__(self, config):
+        super().__init__(config)
+        # Store vectorizers and encoders to ensure consistent feature space
+        self.vectorizers = {}
+        self.label_encoders = {}
+
     def build_model(self) -> BaseEstimator:
         params = self.config.model_params
 
@@ -37,17 +43,54 @@
             features.append(column.fillna(0).values.reshape(-1, 1))
         elif feature_type.value in ["full_name", "native_name", "surname"]:
             # Character-level features for names
-            vectorizer = CountVectorizer(
-                analyzer="char", ngram_range=(2, 3), max_features=100
-            )
-            char_features = vectorizer.fit_transform(
-                column.fillna("").astype(str)
-            ).toarray()
+            feature_key = f"vectorizer_{feature_type.value}"
+
+            if feature_key not in self.vectorizers:
+                # First time - create and fit vectorizer
+                self.vectorizers[feature_key] = CountVectorizer(
+                    analyzer="char", ngram_range=(2, 3), max_features=100
+                )
+                char_features = self.vectorizers[feature_key].fit_transform(
+                    column.fillna("").astype(str)
+                ).toarray()
+            else:
+                # Subsequent times - use existing vectorizer
+                char_features = self.vectorizers[feature_key].transform(
+                    column.fillna("").astype(str)
+                ).toarray()
+
             features.append(char_features)
         else:
             # Categorical features
-            le = LabelEncoder()
-            encoded = le.fit_transform(column.fillna("unknown").astype(str))
+            feature_key = f"encoder_{feature_type.value}"
+
+            if feature_key not in self.label_encoders:
+                # First time - create and fit encoder
+                self.label_encoders[feature_key] = LabelEncoder()
+                encoded = self.label_encoders[feature_key].fit_transform(
+                    column.fillna("unknown").astype(str)
+                )
+            else:
+                # Subsequent times - use existing encoder
+                # Handle unseen labels by mapping them to a default value
+                column_clean = column.fillna("unknown").astype(str)
+
+                # Get the classes the encoder was trained on
+                known_classes = set(self.label_encoders[feature_key].classes_)
+
+                # Map unseen values to "unknown" if it exists, otherwise to the first class
+                if "unknown" in known_classes:
+                    default_class = "unknown"
+                else:
+                    default_class = self.label_encoders[feature_key].classes_[0]
+
+                # Replace unseen values with default
+                column_mapped = column_clean.apply(
+                    lambda x: x if x in known_classes else default_class
+                )
+
+                encoded = self.label_encoders[feature_key].transform(column_mapped)
+
             features.append(encoded.reshape(-1, 1))
 
     return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
diff --git a/research/neural_network_model.py b/research/neural_network_model.py
index 624648d..f5372cc 100644
--- a/research/neural_network_model.py
+++ b/research/neural_network_model.py
@@ -95,9 +95,15 @@ class NeuralNetworkModel(BaseModel):
         recalls = []
         f1_scores = []
 
+        # Get vocabulary size and model parameters
+        vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
+        max_len = self.config.model_params.get("max_len", 6)
+
         for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)):
-            # Create fresh model for each fold
-            fold_model = self.build_model()
+            # Create fresh model for each fold using build_model_with_vocab
+            fold_model = self.build_model_with_vocab(
+                vocab_size=vocab_size, max_len=max_len, **self.config.model_params
+            )
 
             # Train on fold
             if hasattr(fold_model, "fit"):
@@ -127,13 +133,9 @@
 
         return {
             "accuracy": np.mean(accuracies),
-            "accuracy_std": np.std(accuracies),
             "precision": np.mean(precisions),
-            "precision_std": np.std(precisions),
             "recall": np.mean(recalls),
-            "recall_std": np.std(recalls),
             "f1": np.mean(f1_scores),
-            "f1_std": np.std(f1_scores),
         }
 
     def generate_learning_curve(
@@ -150,9 +152,17 @@
             "val_scores_std": [],
         }
 
+        # Prepare features and get vocabulary size
+        features_df = self.feature_extractor.extract_features(X)
+        X_prepared = self.prepare_features(features_df)
+        y_encoded = self.label_encoder.transform(y)
+
+        vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
+        max_len = self.config.model_params.get("max_len", 6)
+
         # Split data once for validation
         X_train_full, X_val, y_train_full, y_val = train_test_split(
-            X, y, test_size=0.2, random_state=self.config.random_seed, stratify=y
+            X_prepared, y_encoded, test_size=0.2, random_state=self.config.random_seed, stratify=y_encoded
         )
 
         for size in train_sizes:
@@ -170,8 +180,10 @@
             val_scores = []
 
             for seed in range(3):  # 3 runs for variance
-                # Build fresh model
-                model = self.build_model()
+                # Build fresh model using build_model_with_vocab
+                model = self.build_model_with_vocab(
+                    vocab_size=vocab_size, max_len=max_len, **self.config.model_params
+                )
 
                 # Train model
                 if hasattr(model, "fit"):
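Both tree models now share the same fallback for categorical columns, because a fitted `LabelEncoder.transform` raises `ValueError` on categories it has never seen. The diff maps unseen values to `"unknown"` when that class exists, else to the first fitted class. A standalone sketch of the behavior with toy categories:

```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(["ru", "en", "unknown"])  # categories seen at training time

incoming = pd.Series(["en", "de", None]).fillna("unknown").astype(str)
known = set(le.classes_)
default = "unknown" if "unknown" in known else le.classes_[0]

# Collapse unseen categories onto the default before transforming,
# instead of letting transform() raise on "de".
mapped = incoming.apply(lambda x: x if x in known else default)
print(le.transform(mapped))
```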
diff --git a/research/traditional_model.py b/research/traditional_model.py
index 89dc56d..46511a2 100644
--- a/research/traditional_model.py
+++ b/research/traditional_model.py
@@ -50,8 +50,14 @@
         y_encoded = self.label_encoder.transform(y)
 
         # Train model
-        logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
-        self.model.fit(X_prepared, y_encoded, verbose=2)
+        if len(X_prepared.shape) == 1:
+            # For text-based features (like LogisticRegression with vectorization)
+            logging.info(f"Fitting model with {X_prepared.shape[0]} samples (text features)")
+        else:
+            # For numerical features
+            logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
+
+        self.model.fit(X_prepared, y_encoded)
 
         self.is_fitted = True
         return self
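Two things change in `fit` above: the log line is guarded by dimensionality, since a 1-D array of raw strings (as produced for models whose pipeline vectorizes internally) has no `shape[1]` and the old f-string raised `IndexError`; and `verbose=2` is dropped from `self.model.fit`, presumably because `verbose` is not accepted by every wrapped estimator's `fit` signature. A standalone illustration of the guard (`X.ndim` is equivalent to `len(X.shape)`):

```python
import numpy as np

text_features = np.array(["anna ivanova", "boris petrov"])  # shape (2,), dtype=str
numeric_features = np.zeros((2, 5))                         # shape (2, 5)

for X in (text_features, numeric_features):
    if X.ndim == 1:
        print(f"Fitting model with {X.shape[0]} samples (text features)")
    else:
        print(f"Fitting model with {X.shape[0]} samples and {X.shape[1]} features")
```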