refactor: update configuration loading and ensure directory existence across modules
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
#!.venv/bin/python3
|
||||
import streamlit as st
|
||||
|
||||
from core.config import setup_config
|
||||
from core.config import get_config
|
||||
from core.utils.data_loader import DataLoader
|
||||
from interface.configuration import Configuration
|
||||
from interface.dashboard import Dashboard
|
||||
@@ -26,7 +26,7 @@ st.set_page_config(
|
||||
@st.cache_data
|
||||
def load_config():
|
||||
"""Load application configuration with unified setup"""
|
||||
return setup_config(env="development")
|
||||
return get_config()
|
||||
|
||||
|
||||
class StreamlitApp:
|
||||
|
||||
@@ -37,8 +37,8 @@ data:
|
||||
split_by_gender: true
|
||||
evaluation_fraction: 0.2
|
||||
random_seed: 42
|
||||
max_dataset_size: 10_000 # Limit to 10k records for development/testing
|
||||
balance_by_sex: true # Balance male/female samples when limiting
|
||||
max_dataset_size: ~ # null (~): the former 10k-record development/testing cap is no longer set
|
||||
balance_by_sex: false # Balance male/female samples when limiting
|
||||
|
||||
# Enhanced logging for development
|
||||
logging:
|
||||
|
||||
@@ -2,9 +2,10 @@ import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
from core.config.config_manager import ConfigManager
|
||||
from core.config.logging_config import LoggingConfig
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from core.utils import ensure_directories
|
||||
from .config_manager import ConfigManager
|
||||
from .logging_config import LoggingConfig
|
||||
from .pipeline_config import PipelineConfig
|
||||
|
||||
config_manager = ConfigManager()
|
||||
|
||||
@@ -43,7 +44,6 @@ def setup_config(config_path: Optional[Path] = None, env: str = "development") -
|
||||
setup_logging(config)
|
||||
|
||||
# Ensure required directories exist
|
||||
from core.utils import ensure_directories
|
||||
ensure_directories(config)
|
||||
|
||||
logging.info(f"Loaded configuration: {config.name} v{config.version}")
|
||||
|
||||
@@ -1,13 +1,17 @@
|
||||
import logging
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from core.config import get_config, PipelineConfig
|
||||
if TYPE_CHECKING:
|
||||
from core.config import PipelineConfig
|
||||
|
||||
|
||||
@contextmanager
|
||||
def temporary_config_override(**overrides):
|
||||
"""Context manager for temporarily overriding configuration"""
|
||||
from core.config import get_config
|
||||
|
||||
config = get_config()
|
||||
original_values = {}
|
||||
|
||||
@@ -25,7 +29,7 @@ def temporary_config_override(**overrides):
|
||||
setattr(config, key, value)
|
||||
|
||||
|
||||
def ensure_directories(config: PipelineConfig) -> None:
|
||||
def ensure_directories(config: "PipelineConfig") -> None:
|
||||
"""Ensure all required directories exist"""
|
||||
directories = [
|
||||
config.paths.data_dir,
|
||||
@@ -42,16 +46,16 @@ def ensure_directories(config: PipelineConfig) -> None:
|
||||
logging.info("Ensured all required directories exist")
|
||||
|
||||
|
||||
def get_data_file_path(filename: str, config: PipelineConfig) -> Path:
|
||||
def get_data_file_path(filename: str, config: "PipelineConfig") -> Path:
|
||||
"""Get full path for a data file"""
|
||||
return config.paths.data_dir / filename
|
||||
|
||||
|
||||
def get_model_file_path(filename: str, config: PipelineConfig) -> Path:
|
||||
def get_model_file_path(filename: str, config: "PipelineConfig") -> Path:
|
||||
"""Get full path for a model file"""
|
||||
return config.paths.models_dir / filename
|
||||
|
||||
|
||||
def get_output_file_path(filename: str, config: PipelineConfig) -> Path:
|
||||
def get_output_file_path(filename: str, config: "PipelineConfig") -> Path:
|
||||
"""Get full path for an output file"""
|
||||
return config.paths.outputs_dir / filename
|
||||
|
||||
@@ -3,6 +3,7 @@ from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
import joblib
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.metrics import confusion_matrix
|
||||
@@ -207,13 +208,36 @@ class ExperimentRunner:
|
||||
experiment = self.tracker.get_experiment(experiment_id)
|
||||
|
||||
if experiment and experiment.model_path:
|
||||
return BaseModel.load(experiment.model_path)
|
||||
try:
|
||||
# Load the saved model data, then recreate the model instance using the saved config
|
||||
model_data = joblib.load(experiment.model_path)
|
||||
config = ExperimentConfig.from_dict(model_data["config"])
|
||||
model = create_model(config)
|
||||
|
||||
# Restore the saved state
|
||||
model.model = model_data["model"]
|
||||
model.feature_extractor = model_data["feature_extractor"]
|
||||
model.label_encoder = model_data["label_encoder"]
|
||||
model.tokenizer = model_data.get("tokenizer")
|
||||
model.is_fitted = model_data["is_fitted"]
|
||||
model.training_history = model_data.get("training_history", {})
|
||||
model.learning_curve_data = model_data.get("learning_curve_data", {})
|
||||
|
||||
# Restore vectorizers and encoders for models that use them (like XGBoost)
|
||||
if "vectorizers" in model_data and hasattr(model, 'vectorizers'):
|
||||
model.vectorizers = model_data["vectorizers"]
|
||||
if "label_encoders" in model_data and hasattr(model, 'label_encoders'):
|
||||
model.label_encoders = model_data["label_encoders"]
|
||||
|
||||
return model
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to load model for experiment {experiment_id}: {e}")
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
def compare_experiments(
|
||||
self, experiment_ids: List[str], metric: str = "accuracy"
|
||||
) -> pd.DataFrame:
|
||||
def compare_experiments(self, experiment_ids: List[str], metric: str = "accuracy") -> pd.DataFrame:
|
||||
"""Compare experiments and return analysis"""
|
||||
comparison_df = self.tracker.compare_experiments(experiment_ids)
|
||||
|
||||
|
||||
@@ -11,6 +11,12 @@ from research.traditional_model import TraditionalModel
|
||||
class LightGBMModel(TraditionalModel):
|
||||
"""LightGBM with engineered features"""
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
# Store vectorizers and encoders to ensure consistent feature space
|
||||
self.vectorizers = {}
|
||||
self.label_encoders = {}
|
||||
|
||||
def build_model(self) -> BaseEstimator:
|
||||
params = self.config.model_params
|
||||
|
||||
@@ -33,19 +39,58 @@ class LightGBMModel(TraditionalModel):
|
||||
column = X[feature_type.value]
|
||||
|
||||
if feature_type.value in ["name_length", "word_count"]:
|
||||
# Numerical features
|
||||
features.append(column.fillna(0).values.reshape(-1, 1))
|
||||
elif feature_type.value in ["full_name", "native_name", "surname"]:
|
||||
# Character n-grams for text features
|
||||
vectorizer = CountVectorizer(
|
||||
analyzer="char", ngram_range=(2, 3), max_features=50
|
||||
)
|
||||
char_features = vectorizer.fit_transform(
|
||||
column.fillna("").astype(str)
|
||||
).toarray()
|
||||
# Character-level features for names
|
||||
feature_key = f"vectorizer_{feature_type.value}"
|
||||
|
||||
if feature_key not in self.vectorizers:
|
||||
# First time - create and fit vectorizer
|
||||
self.vectorizers[feature_key] = CountVectorizer(
|
||||
analyzer="char", ngram_range=(2, 3), max_features=50
|
||||
)
|
||||
char_features = self.vectorizers[feature_key].fit_transform(
|
||||
column.fillna("").astype(str)
|
||||
).toarray()
|
||||
else:
|
||||
# Subsequent times - use existing vectorizer
|
||||
char_features = self.vectorizers[feature_key].transform(
|
||||
column.fillna("").astype(str)
|
||||
).toarray()
|
||||
|
||||
features.append(char_features)
|
||||
else:
|
||||
le = LabelEncoder()
|
||||
encoded = le.fit_transform(column.fillna("unknown").astype(str))
|
||||
# Categorical features
|
||||
feature_key = f"encoder_{feature_type.value}"
|
||||
|
||||
if feature_key not in self.label_encoders:
|
||||
# First time - create and fit encoder
|
||||
self.label_encoders[feature_key] = LabelEncoder()
|
||||
encoded = self.label_encoders[feature_key].fit_transform(
|
||||
column.fillna("unknown").astype(str)
|
||||
)
|
||||
else:
|
||||
# Subsequent times - use existing encoder
|
||||
# Handle unseen labels by mapping them to a default value
|
||||
column_clean = column.fillna("unknown").astype(str)
|
||||
|
||||
# Get the classes the encoder was trained on
|
||||
known_classes = set(self.label_encoders[feature_key].classes_)
|
||||
|
||||
# Map unseen values to "unknown" if it exists, otherwise to the first class
|
||||
if "unknown" in known_classes:
|
||||
default_class = "unknown"
|
||||
else:
|
||||
default_class = self.label_encoders[feature_key].classes_[0]
|
||||
|
||||
# Replace unseen values with default
|
||||
column_mapped = column_clean.apply(
|
||||
lambda x: x if x in known_classes else default_class
|
||||
)
|
||||
|
||||
encoded = self.label_encoders[feature_key].transform(column_mapped)
|
||||
|
||||
features.append(encoded.reshape(-1, 1))
|
||||
|
||||
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
|
||||
|
||||
@@ -11,6 +11,12 @@ from research.traditional_model import TraditionalModel
|
||||
class XGBoostModel(TraditionalModel):
|
||||
"""XGBoost with engineered features and character embeddings"""
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
# Store vectorizers and encoders to ensure consistent feature space
|
||||
self.vectorizers = {}
|
||||
self.label_encoders = {}
|
||||
|
||||
def build_model(self) -> BaseEstimator:
|
||||
params = self.config.model_params
|
||||
|
||||
@@ -37,17 +43,54 @@ class XGBoostModel(TraditionalModel):
|
||||
features.append(column.fillna(0).values.reshape(-1, 1))
|
||||
elif feature_type.value in ["full_name", "native_name", "surname"]:
|
||||
# Character-level features for names
|
||||
vectorizer = CountVectorizer(
|
||||
analyzer="char", ngram_range=(2, 3), max_features=100
|
||||
)
|
||||
char_features = vectorizer.fit_transform(
|
||||
column.fillna("").astype(str)
|
||||
).toarray()
|
||||
feature_key = f"vectorizer_{feature_type.value}"
|
||||
|
||||
if feature_key not in self.vectorizers:
|
||||
# First time - create and fit vectorizer
|
||||
self.vectorizers[feature_key] = CountVectorizer(
|
||||
analyzer="char", ngram_range=(2, 3), max_features=100
|
||||
)
|
||||
char_features = self.vectorizers[feature_key].fit_transform(
|
||||
column.fillna("").astype(str)
|
||||
).toarray()
|
||||
else:
|
||||
# Subsequent times - use existing vectorizer
|
||||
char_features = self.vectorizers[feature_key].transform(
|
||||
column.fillna("").astype(str)
|
||||
).toarray()
|
||||
|
||||
features.append(char_features)
|
||||
else:
|
||||
# Categorical features
|
||||
le = LabelEncoder()
|
||||
encoded = le.fit_transform(column.fillna("unknown").astype(str))
|
||||
feature_key = f"encoder_{feature_type.value}"
|
||||
|
||||
if feature_key not in self.label_encoders:
|
||||
# First time - create and fit encoder
|
||||
self.label_encoders[feature_key] = LabelEncoder()
|
||||
encoded = self.label_encoders[feature_key].fit_transform(
|
||||
column.fillna("unknown").astype(str)
|
||||
)
|
||||
else:
|
||||
# Subsequent times - use existing encoder
|
||||
# Handle unseen labels by mapping them to a default value
|
||||
column_clean = column.fillna("unknown").astype(str)
|
||||
|
||||
# Get the classes the encoder was trained on
|
||||
known_classes = set(self.label_encoders[feature_key].classes_)
|
||||
|
||||
# Map unseen values to "unknown" if it exists, otherwise to the first class
|
||||
if "unknown" in known_classes:
|
||||
default_class = "unknown"
|
||||
else:
|
||||
default_class = self.label_encoders[feature_key].classes_[0]
|
||||
|
||||
# Replace unseen values with default
|
||||
column_mapped = column_clean.apply(
|
||||
lambda x: x if x in known_classes else default_class
|
||||
)
|
||||
|
||||
encoded = self.label_encoders[feature_key].transform(column_mapped)
|
||||
|
||||
features.append(encoded.reshape(-1, 1))
|
||||
|
||||
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
|
||||
|
||||
@@ -95,9 +95,15 @@ class NeuralNetworkModel(BaseModel):
|
||||
recalls = []
|
||||
f1_scores = []
|
||||
|
||||
# Get vocabulary size and model parameters
|
||||
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
|
||||
max_len = self.config.model_params.get("max_len", 6)
|
||||
|
||||
for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)):
|
||||
# Create fresh model for each fold
|
||||
fold_model = self.build_model()
|
||||
# Create fresh model for each fold using build_model_with_vocab
|
||||
fold_model = self.build_model_with_vocab(
|
||||
vocab_size=vocab_size, max_len=max_len, **self.config.model_params
|
||||
)
|
||||
|
||||
# Train on fold
|
||||
if hasattr(fold_model, "fit"):
|
||||
@@ -127,13 +133,9 @@ class NeuralNetworkModel(BaseModel):
|
||||
|
||||
return {
|
||||
"accuracy": np.mean(accuracies),
|
||||
"accuracy_std": np.std(accuracies),
|
||||
"precision": np.mean(precisions),
|
||||
"precision_std": np.std(precisions),
|
||||
"recall": np.mean(recalls),
|
||||
"recall_std": np.std(recalls),
|
||||
"f1": np.mean(f1_scores),
|
||||
"f1_std": np.std(f1_scores),
|
||||
}
|
||||
|
||||
def generate_learning_curve(
|
||||
@@ -150,9 +152,17 @@ class NeuralNetworkModel(BaseModel):
|
||||
"val_scores_std": [],
|
||||
}
|
||||
|
||||
# Prepare features and get vocabulary size
|
||||
features_df = self.feature_extractor.extract_features(X)
|
||||
X_prepared = self.prepare_features(features_df)
|
||||
y_encoded = self.label_encoder.transform(y)
|
||||
|
||||
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
|
||||
max_len = self.config.model_params.get("max_len", 6)
|
||||
|
||||
# Split data once for validation
|
||||
X_train_full, X_val, y_train_full, y_val = train_test_split(
|
||||
X, y, test_size=0.2, random_state=self.config.random_seed, stratify=y
|
||||
X_prepared, y_encoded, test_size=0.2, random_state=self.config.random_seed, stratify=y_encoded
|
||||
)
|
||||
|
||||
for size in train_sizes:
|
||||
@@ -170,8 +180,10 @@ class NeuralNetworkModel(BaseModel):
|
||||
val_scores = []
|
||||
|
||||
for seed in range(3): # 3 runs for variance
|
||||
# Build fresh model
|
||||
model = self.build_model()
|
||||
# Build fresh model using build_model_with_vocab
|
||||
model = self.build_model_with_vocab(
|
||||
vocab_size=vocab_size, max_len=max_len, **self.config.model_params
|
||||
)
|
||||
|
||||
# Train model
|
||||
if hasattr(model, "fit"):
|
||||
|
||||
@@ -50,8 +50,14 @@ class TraditionalModel(BaseModel):
|
||||
y_encoded = self.label_encoder.transform(y)
|
||||
|
||||
# Train model
|
||||
logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
|
||||
self.model.fit(X_prepared, y_encoded, verbose=2)
|
||||
if len(X_prepared.shape) == 1:
|
||||
# For text-based features (like LogisticRegression with vectorization)
|
||||
logging.info(f"Fitting model with {X_prepared.shape[0]} samples (text features)")
|
||||
else:
|
||||
# For numerical features
|
||||
logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
|
||||
|
||||
self.model.fit(X_prepared, y_encoded)
|
||||
self.is_fitted = True
|
||||
|
||||
return self
|
||||
|
||||
Reference in New Issue
Block a user