refactor: update configuration loading and ensure directory existence across modules

commit 96291b4ad0
parent 104d7e1146
date   2025-08-07 00:36:32 +02:00
9 changed files with 179 additions and 45 deletions
+2 -2
@@ -1,7 +1,7 @@
#!.venv/bin/python3
import streamlit as st
-from core.config import setup_config
+from core.config import get_config
from core.utils.data_loader import DataLoader
from interface.configuration import Configuration
from interface.dashboard import Dashboard
@@ -26,7 +26,7 @@ st.set_page_config(
@st.cache_data
def load_config():
"""Load application configuration with unified setup"""
-    return setup_config(env="development")
+    return get_config()
class StreamlitApp:
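
Note: the app now reads its settings through get_config() instead of re-running setup_config(env="development") at call time; together with @st.cache_data, every Streamlit rerun sees the one configuration installed at startup. A minimal sketch of the accessor pattern this implies, assuming a module-level singleton inside core.config (only the names setup_config/get_config come from the diff):

    # Sketch: process-wide config singleton (assumed, not the project's actual code)
    _config = None

    def setup_config(config_path=None, env="development"):
        global _config
        _config = {"name": "demo", "version": "0.1.0", "env": env}  # stand-in for YAML loading
        return _config

    def get_config():
        # Callers no longer choose an environment; they read whatever
        # setup_config() installed, falling back to defaults if needed.
        global _config
        if _config is None:
            _config = setup_config()
        return _config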
+2 -2
@@ -37,8 +37,8 @@ data:
split_by_gender: true
evaluation_fraction: 0.2
random_seed: 42
-  max_dataset_size: 10_000 # Limit to 10k records for development/testing
-  balance_by_sex: true # Balance male/female samples when limiting
+  max_dataset_size: ~ # null = no record limit for development/testing
+  balance_by_sex: false # Balance male/female samples when limiting
# Enhanced logging for development
logging:
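
Note: in YAML, ~ parses to null, so max_dataset_size now disables the row cap entirely rather than limiting to 10k records. A sketch of how a loader might honor that null, assuming the DataLoader samples and balances only when a cap is set and that the frame has a "sex" column (the helper is illustrative, not the project's DataLoader):

    import pandas as pd

    def limit_dataset(df: pd.DataFrame, max_size, balance_by_sex: bool, seed: int = 42):
        if max_size is None:  # YAML "~" arrives as Python None: no limit
            return df
        if balance_by_sex:
            # Take an equal share of each sex, up to the cap
            per_group = max_size // df["sex"].nunique()
            return df.groupby("sex", group_keys=False).apply(
                lambda g: g.sample(min(len(g), per_group), random_state=seed)
            )
        return df.sample(min(len(df), max_size), random_state=seed)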
+4 -4
@@ -2,9 +2,10 @@ import logging
from pathlib import Path
from typing import Optional, Union
-from core.config.config_manager import ConfigManager
-from core.config.logging_config import LoggingConfig
-from core.config.pipeline_config import PipelineConfig
+from core.utils import ensure_directories
+from .config_manager import ConfigManager
+from .logging_config import LoggingConfig
+from .pipeline_config import PipelineConfig
config_manager = ConfigManager()
@@ -43,7 +44,6 @@ def setup_config(config_path: Optional[Path] = None, env: str = "development") -
setup_logging(config)
# Ensure required directories exist
-    from core.utils import ensure_directories
ensure_directories(config)
logging.info(f"Loaded configuration: {config.name} v{config.version}")
+9 -5
@@ -1,13 +1,17 @@
import logging
from contextlib import contextmanager
from pathlib import Path
+from typing import TYPE_CHECKING
-from core.config import get_config, PipelineConfig
+if TYPE_CHECKING:
+    from core.config import PipelineConfig
@contextmanager
def temporary_config_override(**overrides):
"""Context manager for temporarily overriding configuration"""
+    from core.config import get_config
config = get_config()
original_values = {}
@@ -25,7 +29,7 @@ def temporary_config_override(**overrides):
setattr(config, key, value)
-def ensure_directories(config: PipelineConfig) -> None:
+def ensure_directories(config: "PipelineConfig") -> None:
"""Ensure all required directories exist"""
directories = [
config.paths.data_dir,
@@ -42,16 +46,16 @@ def ensure_directories(config: PipelineConfig) -> None:
logging.info("Ensured all required directories exist")
-def get_data_file_path(filename: str, config: PipelineConfig) -> Path:
+def get_data_file_path(filename: str, config: "PipelineConfig") -> Path:
"""Get full path for a data file"""
return config.paths.data_dir / filename
-def get_model_file_path(filename: str, config: PipelineConfig) -> Path:
+def get_model_file_path(filename: str, config: "PipelineConfig") -> Path:
"""Get full path for a model file"""
return config.paths.models_dir / filename
-def get_output_file_path(filename: str, config: PipelineConfig) -> Path:
+def get_output_file_path(filename: str, config: "PipelineConfig") -> Path:
"""Get full path for an output file"""
return config.paths.outputs_dir / filename
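
Note: this file and the previous one change in tandem to break an import cycle. core/config can import ensure_directories at module top only because core/utils no longer imports core.config at import time: the TYPE_CHECKING guard keeps PipelineConfig visible to static type checkers, the string annotations defer resolution, and runtime access to get_config moves into function bodies. A condensed sketch of the pattern:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated only by type checkers, never at runtime, so importing
        # this module no longer pulls in core.config
        from core.config import PipelineConfig

    def ensure_directories(config: "PipelineConfig") -> None:
        # String annotation is resolved lazily: no import cycle
        for d in (config.paths.data_dir, config.paths.models_dir):
            d.mkdir(parents=True, exist_ok=True)

    def temporary_config_override(**overrides):
        from core.config import get_config  # deferred runtime import
        config = get_config()
        ...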
+28 -4
@@ -3,6 +3,7 @@ from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
+import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
@@ -207,13 +208,36 @@ class ExperimentRunner:
experiment = self.tracker.get_experiment(experiment_id)
if experiment and experiment.model_path:
-            return BaseModel.load(experiment.model_path)
+            try:
+                # Load the saved model data and recreate the model
+                # instance using the saved config
+                model_data = joblib.load(experiment.model_path)
+                config = ExperimentConfig.from_dict(model_data["config"])
+                model = create_model(config)
+                # Restore the saved state
+                model.model = model_data["model"]
+                model.feature_extractor = model_data["feature_extractor"]
+                model.label_encoder = model_data["label_encoder"]
+                model.tokenizer = model_data.get("tokenizer")
+                model.is_fitted = model_data["is_fitted"]
+                model.training_history = model_data.get("training_history", {})
+                model.learning_curve_data = model_data.get("learning_curve_data", {})
+                # Restore vectorizers and encoders for models that use them (like XGBoost)
+                if "vectorizers" in model_data and hasattr(model, 'vectorizers'):
+                    model.vectorizers = model_data["vectorizers"]
+                if "label_encoders" in model_data and hasattr(model, 'label_encoders'):
+                    model.label_encoders = model_data["label_encoders"]
+                return model
+            except Exception as e:
+                logging.error(f"Failed to load model for experiment {experiment_id}: {e}")
+                return None
return None
-    def compare_experiments(
-        self, experiment_ids: List[str], metric: str = "accuracy"
-    ) -> pd.DataFrame:
+    def compare_experiments(self, experiment_ids: List[str], metric: str = "accuracy") -> pd.DataFrame:
"""Compare experiments and return analysis"""
comparison_df = self.tracker.compare_experiments(experiment_ids)
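
Note: the loader now rebuilds a model from a plain joblib payload instead of delegating to BaseModel.load. For reference, a save-side sketch that would produce the payload this load path expects; the key set is inferred from the reads above, and config.to_dict() is assumed as the counterpart of ExperimentConfig.from_dict:

    import joblib

    def save_model_payload(model, path):
        payload = {
            "config": model.config.to_dict(),  # assumed serializer
            "model": model.model,
            "feature_extractor": model.feature_extractor,
            "label_encoder": model.label_encoder,
            "tokenizer": getattr(model, "tokenizer", None),
            "is_fitted": model.is_fitted,
            "training_history": getattr(model, "training_history", {}),
            "learning_curve_data": getattr(model, "learning_curve_data", {}),
        }
        # Only models with per-column vectorization (XGBoost, LightGBM) carry these
        if hasattr(model, "vectorizers"):
            payload["vectorizers"] = model.vectorizers
        if hasattr(model, "label_encoders"):
            payload["label_encoders"] = model.label_encoders
        joblib.dump(payload, path)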
+54 -9
@@ -11,6 +11,12 @@ from research.traditional_model import TraditionalModel
class LightGBMModel(TraditionalModel):
"""LightGBM with engineered features"""
+    def __init__(self, config):
+        super().__init__(config)
+        # Store vectorizers and encoders to ensure consistent feature space
+        self.vectorizers = {}
+        self.label_encoders = {}
def build_model(self) -> BaseEstimator:
params = self.config.model_params
@@ -33,19 +39,58 @@ class LightGBMModel(TraditionalModel):
column = X[feature_type.value]
if feature_type.value in ["name_length", "word_count"]:
# Numerical features
features.append(column.fillna(0).values.reshape(-1, 1))
elif feature_type.value in ["full_name", "native_name", "surname"]:
-                # Character n-grams for text features
-                vectorizer = CountVectorizer(
-                    analyzer="char", ngram_range=(2, 3), max_features=50
-                )
-                char_features = vectorizer.fit_transform(
-                    column.fillna("").astype(str)
-                ).toarray()
+                # Character-level features for names
+                feature_key = f"vectorizer_{feature_type.value}"
+                if feature_key not in self.vectorizers:
+                    # First time - create and fit vectorizer
+                    self.vectorizers[feature_key] = CountVectorizer(
+                        analyzer="char", ngram_range=(2, 3), max_features=50
+                    )
+                    char_features = self.vectorizers[feature_key].fit_transform(
+                        column.fillna("").astype(str)
+                    ).toarray()
+                else:
+                    # Subsequent times - use existing vectorizer
+                    char_features = self.vectorizers[feature_key].transform(
+                        column.fillna("").astype(str)
+                    ).toarray()
features.append(char_features)
else:
-                le = LabelEncoder()
-                encoded = le.fit_transform(column.fillna("unknown").astype(str))
+                # Categorical features
+                feature_key = f"encoder_{feature_type.value}"
+                if feature_key not in self.label_encoders:
+                    # First time - create and fit encoder
+                    self.label_encoders[feature_key] = LabelEncoder()
+                    encoded = self.label_encoders[feature_key].fit_transform(
+                        column.fillna("unknown").astype(str)
+                    )
+                else:
+                    # Subsequent times - use existing encoder
+                    # Handle unseen labels by mapping them to a default value
+                    column_clean = column.fillna("unknown").astype(str)
+                    # Get the classes the encoder was trained on
+                    known_classes = set(self.label_encoders[feature_key].classes_)
+                    # Map unseen values to "unknown" if it exists, otherwise to the first class
+                    if "unknown" in known_classes:
+                        default_class = "unknown"
+                    else:
+                        default_class = self.label_encoders[feature_key].classes_[0]
+                    # Replace unseen values with default
+                    column_mapped = column_clean.apply(
+                        lambda x: x if x in known_classes else default_class
+                    )
+                    encoded = self.label_encoders[feature_key].transform(column_mapped)
features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
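
Note: caching the fitted CountVectorizer is what keeps the training and inference feature spaces aligned; refitting on new data would build a different n-gram vocabulary, and the booster would see columns in a different layout than it was trained on. A standalone demonstration of the failure mode:

    from sklearn.feature_extraction.text import CountVectorizer

    train, test = ["anna", "boris"], ["carla"]

    # Wrong: refitting on test data yields a different vocabulary (feature space)
    v_train = CountVectorizer(analyzer="char", ngram_range=(2, 3)).fit(train)
    v_test = CountVectorizer(analyzer="char", ngram_range=(2, 3)).fit(test)
    assert list(v_train.get_feature_names_out()) != list(v_test.get_feature_names_out())

    # Right: fit once on training data, then only transform; unseen n-grams drop out
    X_test = v_train.transform(test)
    assert X_test.shape[1] == len(v_train.get_feature_names_out())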
+51 -8
@@ -11,6 +11,12 @@ from research.traditional_model import TraditionalModel
class XGBoostModel(TraditionalModel):
"""XGBoost with engineered features and character embeddings"""
+    def __init__(self, config):
+        super().__init__(config)
+        # Store vectorizers and encoders to ensure consistent feature space
+        self.vectorizers = {}
+        self.label_encoders = {}
def build_model(self) -> BaseEstimator:
params = self.config.model_params
@@ -37,17 +43,54 @@ class XGBoostModel(TraditionalModel):
features.append(column.fillna(0).values.reshape(-1, 1))
elif feature_type.value in ["full_name", "native_name", "surname"]:
# Character-level features for names
-                vectorizer = CountVectorizer(
-                    analyzer="char", ngram_range=(2, 3), max_features=100
-                )
-                char_features = vectorizer.fit_transform(
-                    column.fillna("").astype(str)
-                ).toarray()
+                feature_key = f"vectorizer_{feature_type.value}"
+                if feature_key not in self.vectorizers:
+                    # First time - create and fit vectorizer
+                    self.vectorizers[feature_key] = CountVectorizer(
+                        analyzer="char", ngram_range=(2, 3), max_features=100
+                    )
+                    char_features = self.vectorizers[feature_key].fit_transform(
+                        column.fillna("").astype(str)
+                    ).toarray()
+                else:
+                    # Subsequent times - use existing vectorizer
+                    char_features = self.vectorizers[feature_key].transform(
+                        column.fillna("").astype(str)
+                    ).toarray()
features.append(char_features)
else:
# Categorical features
-                le = LabelEncoder()
-                encoded = le.fit_transform(column.fillna("unknown").astype(str))
+                feature_key = f"encoder_{feature_type.value}"
+                if feature_key not in self.label_encoders:
+                    # First time - create and fit encoder
+                    self.label_encoders[feature_key] = LabelEncoder()
+                    encoded = self.label_encoders[feature_key].fit_transform(
+                        column.fillna("unknown").astype(str)
+                    )
+                else:
+                    # Subsequent times - use existing encoder
+                    # Handle unseen labels by mapping them to a default value
+                    column_clean = column.fillna("unknown").astype(str)
+                    # Get the classes the encoder was trained on
+                    known_classes = set(self.label_encoders[feature_key].classes_)
+                    # Map unseen values to "unknown" if it exists, otherwise to the first class
+                    if "unknown" in known_classes:
+                        default_class = "unknown"
+                    else:
+                        default_class = self.label_encoders[feature_key].classes_[0]
+                    # Replace unseen values with default
+                    column_mapped = column_clean.apply(
+                        lambda x: x if x in known_classes else default_class
+                    )
+                    encoded = self.label_encoders[feature_key].transform(column_mapped)
features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
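
Note: the encoder branch needs the extra mapping because sklearn's LabelEncoder.transform raises ValueError on labels it never saw during fit. A standalone sketch of the fallback used here; since the logic is duplicated verbatim in LightGBMModel and XGBoostModel, it could arguably be hoisted into TraditionalModel:

    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder().fit(["korean", "unknown", "vietnamese"])
    column = pd.Series(["korean", "martian"])  # "martian" was never seen

    known = set(le.classes_)
    default = "unknown" if "unknown" in known else le.classes_[0]
    mapped = column.apply(lambda x: x if x in known else default)
    print(le.transform(mapped))  # [0 1], no ValueError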
+21 -9
@@ -95,9 +95,15 @@ class NeuralNetworkModel(BaseModel):
recalls = []
f1_scores = []
+        # Get vocabulary size and model parameters
+        vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
+        max_len = self.config.model_params.get("max_len", 6)
for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)):
-            # Create fresh model for each fold
-            fold_model = self.build_model()
+            # Create fresh model for each fold using build_model_with_vocab
+            fold_model = self.build_model_with_vocab(
+                vocab_size=vocab_size, max_len=max_len, **self.config.model_params
+            )
# Train on fold
if hasattr(fold_model, "fit"):
@@ -127,13 +133,9 @@ class NeuralNetworkModel(BaseModel):
return {
"accuracy": np.mean(accuracies),
"accuracy_std": np.std(accuracies),
"precision": np.mean(precisions),
"precision_std": np.std(precisions),
"recall": np.mean(recalls),
"recall_std": np.std(recalls),
"f1": np.mean(f1_scores),
"f1_std": np.std(f1_scores),
}
def generate_learning_curve(
@@ -150,9 +152,17 @@ class NeuralNetworkModel(BaseModel):
"val_scores_std": [],
}
+        # Prepare features and get vocabulary size
+        features_df = self.feature_extractor.extract_features(X)
+        X_prepared = self.prepare_features(features_df)
+        y_encoded = self.label_encoder.transform(y)
+        vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
+        max_len = self.config.model_params.get("max_len", 6)
# Split data once for validation
X_train_full, X_val, y_train_full, y_val = train_test_split(
-            X, y, test_size=0.2, random_state=self.config.random_seed, stratify=y
+            X_prepared, y_encoded, test_size=0.2, random_state=self.config.random_seed, stratify=y_encoded
)
for size in train_sizes:
@@ -170,8 +180,10 @@ class NeuralNetworkModel(BaseModel):
val_scores = []
for seed in range(3): # 3 runs for variance
-                # Build fresh model
-                model = self.build_model()
+                # Build fresh model using build_model_with_vocab
+                model = self.build_model_with_vocab(
+                    vocab_size=vocab_size, max_len=max_len, **self.config.model_params
+                )
# Train model
if hasattr(model, "fit"):
+8 -2
@@ -50,8 +50,14 @@ class TraditionalModel(BaseModel):
y_encoded = self.label_encoder.transform(y)
# Train model
logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
self.model.fit(X_prepared, y_encoded, verbose=2)
if len(X_prepared.shape) == 1:
# For text-based features (like LogisticRegression with vectorization)
logging.info(f"Fitting model with {X_prepared.shape[0]} samples (text features)")
else:
# For numerical features
logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
self.model.fit(X_prepared, y_encoded)
self.is_fitted = True
return self
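
Note: dropping verbose=2 is the substantive fix here; fit() on plain sklearn estimators such as LogisticRegression accepts no verbose argument, so the shared base class must pass only universally supported parameters. If per-estimator verbosity is still wanted, one option is to forward it only when the signature allows it (a sketch, not the project's code):

    import inspect

    def fit_estimator(model, X, y, verbose=2):
        # Forward verbose only to estimators whose fit() declares it;
        # a plain sklearn fit(X, y) does not
        if "verbose" in inspect.signature(model.fit).parameters:
            return model.fit(X, y, verbose=verbose)
        return model.fit(X, y)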