refactor: update configuration loading and ensure directory existence across modules

This commit is contained in:
2025-08-07 00:36:32 +02:00
parent 104d7e1146
commit 96291b4ad0
9 changed files with 179 additions and 45 deletions
+2 -2
View File
@@ -1,7 +1,7 @@
#!.venv/bin/python3 #!.venv/bin/python3
import streamlit as st import streamlit as st
from core.config import setup_config from core.config import get_config
from core.utils.data_loader import DataLoader from core.utils.data_loader import DataLoader
from interface.configuration import Configuration from interface.configuration import Configuration
from interface.dashboard import Dashboard from interface.dashboard import Dashboard
@@ -26,7 +26,7 @@ st.set_page_config(
@st.cache_data @st.cache_data
def load_config(): def load_config():
"""Load application configuration with unified setup""" """Load application configuration with unified setup"""
return setup_config(env="development") return get_config()
class StreamlitApp: class StreamlitApp:
+2 -2
View File
@@ -37,8 +37,8 @@ data:
split_by_gender: true split_by_gender: true
evaluation_fraction: 0.2 evaluation_fraction: 0.2
random_seed: 42 random_seed: 42
max_dataset_size: 10_000 # Limit to 10k records for development/testing max_dataset_size: ~ # No limit (null); set a number such as 10_000 to cap records for development/testing
balance_by_sex: true # Balance male/female samples when limiting balance_by_sex: false # Balance male/female samples when limiting
# Enhanced logging for development # Enhanced logging for development
logging: logging:
+4 -4
View File
@@ -2,9 +2,10 @@ import logging
from pathlib import Path from pathlib import Path
from typing import Optional, Union from typing import Optional, Union
from core.config.config_manager import ConfigManager from core.utils import ensure_directories
from core.config.logging_config import LoggingConfig from .config_manager import ConfigManager
from core.config.pipeline_config import PipelineConfig from .logging_config import LoggingConfig
from .pipeline_config import PipelineConfig
config_manager = ConfigManager() config_manager = ConfigManager()
@@ -43,7 +44,6 @@ def setup_config(config_path: Optional[Path] = None, env: str = "development") -
setup_logging(config) setup_logging(config)
# Ensure required directories exist # Ensure required directories exist
from core.utils import ensure_directories
ensure_directories(config) ensure_directories(config)
logging.info(f"Loaded configuration: {config.name} v{config.version}") logging.info(f"Loaded configuration: {config.name} v{config.version}")
+9 -5
View File
@@ -1,13 +1,17 @@
import logging import logging
from contextlib import contextmanager from contextlib import contextmanager
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING
from core.config import get_config, PipelineConfig if TYPE_CHECKING:
from core.config import PipelineConfig
@contextmanager @contextmanager
def temporary_config_override(**overrides): def temporary_config_override(**overrides):
"""Context manager for temporarily overriding configuration""" """Context manager for temporarily overriding configuration"""
from core.config import get_config
config = get_config() config = get_config()
original_values = {} original_values = {}
@@ -25,7 +29,7 @@ def temporary_config_override(**overrides):
setattr(config, key, value) setattr(config, key, value)
def ensure_directories(config: PipelineConfig) -> None: def ensure_directories(config: "PipelineConfig") -> None:
"""Ensure all required directories exist""" """Ensure all required directories exist"""
directories = [ directories = [
config.paths.data_dir, config.paths.data_dir,
@@ -42,16 +46,16 @@ def ensure_directories(config: PipelineConfig) -> None:
logging.info("Ensured all required directories exist") logging.info("Ensured all required directories exist")
def get_data_file_path(filename: str, config: PipelineConfig) -> Path: def get_data_file_path(filename: str, config: "PipelineConfig") -> Path:
"""Get full path for a data file""" """Get full path for a data file"""
return config.paths.data_dir / filename return config.paths.data_dir / filename
def get_model_file_path(filename: str, config: PipelineConfig) -> Path: def get_model_file_path(filename: str, config: "PipelineConfig") -> Path:
"""Get full path for a model file""" """Get full path for a model file"""
return config.paths.models_dir / filename return config.paths.models_dir / filename
def get_output_file_path(filename: str, config: PipelineConfig) -> Path: def get_output_file_path(filename: str, config: "PipelineConfig") -> Path:
"""Get full path for an output file""" """Get full path for an output file"""
return config.paths.outputs_dir / filename return config.paths.outputs_dir / filename
+28 -4
View File
@@ -3,6 +3,7 @@ from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import List, Dict, Optional from typing import List, Dict, Optional
import joblib
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from sklearn.metrics import confusion_matrix from sklearn.metrics import confusion_matrix
@@ -207,13 +208,36 @@ class ExperimentRunner:
experiment = self.tracker.get_experiment(experiment_id) experiment = self.tracker.get_experiment(experiment_id)
if experiment and experiment.model_path: if experiment and experiment.model_path:
return BaseModel.load(experiment.model_path) try:
# Load the saved model data, then recreate the model instance using the saved config
model_data = joblib.load(experiment.model_path)
config = ExperimentConfig.from_dict(model_data["config"])
model = create_model(config)
# Restore the saved state
model.model = model_data["model"]
model.feature_extractor = model_data["feature_extractor"]
model.label_encoder = model_data["label_encoder"]
model.tokenizer = model_data.get("tokenizer")
model.is_fitted = model_data["is_fitted"]
model.training_history = model_data.get("training_history", {})
model.learning_curve_data = model_data.get("learning_curve_data", {})
# Restore vectorizers and encoders for models that use them (like XGBoost and LightGBM)
if "vectorizers" in model_data and hasattr(model, 'vectorizers'):
model.vectorizers = model_data["vectorizers"]
if "label_encoders" in model_data and hasattr(model, 'label_encoders'):
model.label_encoders = model_data["label_encoders"]
return model
except Exception as e:
logging.error(f"Failed to load model for experiment {experiment_id}: {e}")
return None
return None return None
def compare_experiments( def compare_experiments(self, experiment_ids: List[str], metric: str = "accuracy") -> pd.DataFrame:
self, experiment_ids: List[str], metric: str = "accuracy"
) -> pd.DataFrame:
"""Compare experiments and return analysis""" """Compare experiments and return analysis"""
comparison_df = self.tracker.compare_experiments(experiment_ids) comparison_df = self.tracker.compare_experiments(experiment_ids)
+54 -9
View File
@@ -11,6 +11,12 @@ from research.traditional_model import TraditionalModel
class LightGBMModel(TraditionalModel): class LightGBMModel(TraditionalModel):
"""LightGBM with engineered features""" """LightGBM with engineered features"""
def __init__(self, config):
super().__init__(config)
# Store vectorizers and encoders to ensure consistent feature space
self.vectorizers = {}
self.label_encoders = {}
def build_model(self) -> BaseEstimator: def build_model(self) -> BaseEstimator:
params = self.config.model_params params = self.config.model_params
@@ -33,19 +39,58 @@ class LightGBMModel(TraditionalModel):
column = X[feature_type.value] column = X[feature_type.value]
if feature_type.value in ["name_length", "word_count"]: if feature_type.value in ["name_length", "word_count"]:
# Numerical features
features.append(column.fillna(0).values.reshape(-1, 1)) features.append(column.fillna(0).values.reshape(-1, 1))
elif feature_type.value in ["full_name", "native_name", "surname"]: elif feature_type.value in ["full_name", "native_name", "surname"]:
# Character n-grams for text features # Character-level features for names
vectorizer = CountVectorizer( feature_key = f"vectorizer_{feature_type.value}"
analyzer="char", ngram_range=(2, 3), max_features=50
) if feature_key not in self.vectorizers:
char_features = vectorizer.fit_transform( # First time - create and fit vectorizer
column.fillna("").astype(str) self.vectorizers[feature_key] = CountVectorizer(
).toarray() analyzer="char", ngram_range=(2, 3), max_features=50
)
char_features = self.vectorizers[feature_key].fit_transform(
column.fillna("").astype(str)
).toarray()
else:
# Subsequent times - use existing vectorizer
char_features = self.vectorizers[feature_key].transform(
column.fillna("").astype(str)
).toarray()
features.append(char_features) features.append(char_features)
else: else:
le = LabelEncoder() # Categorical features
encoded = le.fit_transform(column.fillna("unknown").astype(str)) feature_key = f"encoder_{feature_type.value}"
if feature_key not in self.label_encoders:
# First time - create and fit encoder
self.label_encoders[feature_key] = LabelEncoder()
encoded = self.label_encoders[feature_key].fit_transform(
column.fillna("unknown").astype(str)
)
else:
# Subsequent times - use existing encoder
# Handle unseen labels by mapping them to a default value
column_clean = column.fillna("unknown").astype(str)
# Get the classes the encoder was trained on
known_classes = set(self.label_encoders[feature_key].classes_)
# Map unseen values to "unknown" if it exists, otherwise to the first class
if "unknown" in known_classes:
default_class = "unknown"
else:
default_class = self.label_encoders[feature_key].classes_[0]
# Replace unseen values with default
column_mapped = column_clean.apply(
lambda x: x if x in known_classes else default_class
)
encoded = self.label_encoders[feature_key].transform(column_mapped)
features.append(encoded.reshape(-1, 1)) features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0) return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
+51 -8
View File
@@ -11,6 +11,12 @@ from research.traditional_model import TraditionalModel
class XGBoostModel(TraditionalModel): class XGBoostModel(TraditionalModel):
"""XGBoost with engineered features and character embeddings""" """XGBoost with engineered features and character embeddings"""
def __init__(self, config):
super().__init__(config)
# Store vectorizers and encoders to ensure consistent feature space
self.vectorizers = {}
self.label_encoders = {}
def build_model(self) -> BaseEstimator: def build_model(self) -> BaseEstimator:
params = self.config.model_params params = self.config.model_params
@@ -37,17 +43,54 @@ class XGBoostModel(TraditionalModel):
features.append(column.fillna(0).values.reshape(-1, 1)) features.append(column.fillna(0).values.reshape(-1, 1))
elif feature_type.value in ["full_name", "native_name", "surname"]: elif feature_type.value in ["full_name", "native_name", "surname"]:
# Character-level features for names # Character-level features for names
vectorizer = CountVectorizer( feature_key = f"vectorizer_{feature_type.value}"
analyzer="char", ngram_range=(2, 3), max_features=100
) if feature_key not in self.vectorizers:
char_features = vectorizer.fit_transform( # First time - create and fit vectorizer
column.fillna("").astype(str) self.vectorizers[feature_key] = CountVectorizer(
).toarray() analyzer="char", ngram_range=(2, 3), max_features=100
)
char_features = self.vectorizers[feature_key].fit_transform(
column.fillna("").astype(str)
).toarray()
else:
# Subsequent times - use existing vectorizer
char_features = self.vectorizers[feature_key].transform(
column.fillna("").astype(str)
).toarray()
features.append(char_features) features.append(char_features)
else: else:
# Categorical features # Categorical features
le = LabelEncoder() feature_key = f"encoder_{feature_type.value}"
encoded = le.fit_transform(column.fillna("unknown").astype(str))
if feature_key not in self.label_encoders:
# First time - create and fit encoder
self.label_encoders[feature_key] = LabelEncoder()
encoded = self.label_encoders[feature_key].fit_transform(
column.fillna("unknown").astype(str)
)
else:
# Subsequent times - use existing encoder
# Handle unseen labels by mapping them to a default value
column_clean = column.fillna("unknown").astype(str)
# Get the classes the encoder was trained on
known_classes = set(self.label_encoders[feature_key].classes_)
# Map unseen values to "unknown" if it exists, otherwise to the first class
if "unknown" in known_classes:
default_class = "unknown"
else:
default_class = self.label_encoders[feature_key].classes_[0]
# Replace unseen values with default
column_mapped = column_clean.apply(
lambda x: x if x in known_classes else default_class
)
encoded = self.label_encoders[feature_key].transform(column_mapped)
features.append(encoded.reshape(-1, 1)) features.append(encoded.reshape(-1, 1))
return np.hstack(features) if features else np.array([]).reshape(len(X), 0) return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
+21 -9
View File
@@ -95,9 +95,15 @@ class NeuralNetworkModel(BaseModel):
recalls = [] recalls = []
f1_scores = [] f1_scores = []
# Get vocabulary size and model parameters
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
max_len = self.config.model_params.get("max_len", 6)
for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)): for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)):
# Create fresh model for each fold # Create fresh model for each fold using build_model_with_vocab
fold_model = self.build_model() fold_model = self.build_model_with_vocab(
vocab_size=vocab_size, max_len=max_len, **self.config.model_params
)
# Train on fold # Train on fold
if hasattr(fold_model, "fit"): if hasattr(fold_model, "fit"):
@@ -127,13 +133,9 @@ class NeuralNetworkModel(BaseModel):
return { return {
"accuracy": np.mean(accuracies), "accuracy": np.mean(accuracies),
"accuracy_std": np.std(accuracies),
"precision": np.mean(precisions), "precision": np.mean(precisions),
"precision_std": np.std(precisions),
"recall": np.mean(recalls), "recall": np.mean(recalls),
"recall_std": np.std(recalls),
"f1": np.mean(f1_scores), "f1": np.mean(f1_scores),
"f1_std": np.std(f1_scores),
} }
def generate_learning_curve( def generate_learning_curve(
@@ -150,9 +152,17 @@ class NeuralNetworkModel(BaseModel):
"val_scores_std": [], "val_scores_std": [],
} }
# Prepare features and get vocabulary size
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
y_encoded = self.label_encoder.transform(y)
vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
max_len = self.config.model_params.get("max_len", 6)
# Split data once for validation # Split data once for validation
X_train_full, X_val, y_train_full, y_val = train_test_split( X_train_full, X_val, y_train_full, y_val = train_test_split(
X, y, test_size=0.2, random_state=self.config.random_seed, stratify=y X_prepared, y_encoded, test_size=0.2, random_state=self.config.random_seed, stratify=y_encoded
) )
for size in train_sizes: for size in train_sizes:
@@ -170,8 +180,10 @@ class NeuralNetworkModel(BaseModel):
val_scores = [] val_scores = []
for seed in range(3): # 3 runs for variance for seed in range(3): # 3 runs for variance
# Build fresh model # Build fresh model using build_model_with_vocab
model = self.build_model() model = self.build_model_with_vocab(
vocab_size=vocab_size, max_len=max_len, **self.config.model_params
)
# Train model # Train model
if hasattr(model, "fit"): if hasattr(model, "fit"):
+8 -2
View File
@@ -50,8 +50,14 @@ class TraditionalModel(BaseModel):
y_encoded = self.label_encoder.transform(y) y_encoded = self.label_encoder.transform(y)
# Train model # Train model
logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features") if len(X_prepared.shape) == 1:
self.model.fit(X_prepared, y_encoded, verbose=2) # For text-based features (like LogisticRegression with vectorization)
logging.info(f"Fitting model with {X_prepared.shape[0]} samples (text features)")
else:
# For numerical features
logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
self.model.fit(X_prepared, y_encoded)
self.is_fitted = True self.is_fitted = True
return self return self