refactor: update configuration loading and ensure directory existence across modules
@@ -1,7 +1,7 @@
 #!.venv/bin/python3
 import streamlit as st
 
-from core.config import setup_config
+from core.config import get_config
 from core.utils.data_loader import DataLoader
 from interface.configuration import Configuration
 from interface.dashboard import Dashboard
@@ -26,7 +26,7 @@ st.set_page_config(
 @st.cache_data
 def load_config():
     """Load application configuration with unified setup"""
-    return setup_config(env="development")
+    return get_config()
 
 
 class StreamlitApp:
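Note: `load_config` now delegates to a shared accessor instead of re-running setup inside the UI layer. A minimal sketch of the accessor pattern this assumes (names and bodies are illustrative, not the project's actual `core.config` implementation):

```python
# Hypothetical sketch: setup_config() builds and caches one config at startup;
# get_config() hands back the cached instance instead of re-parsing anything.
_config = None

def setup_config(env: str = "development"):
    global _config
    _config = {"env": env, "name": "demo", "version": "0.1"}  # stand-in for PipelineConfig
    return _config

def get_config():
    if _config is None:
        raise RuntimeError("call setup_config() before get_config()")
    return _config

setup_config()
print(get_config()["env"])  # -> development
```

With `@st.cache_data` on top, Streamlit memoizes the call anyway; the change also removes the hard-coded `env="development"` from the UI layer.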
@@ -37,8 +37,8 @@ data:
   split_by_gender: true
   evaluation_fraction: 0.2
   random_seed: 42
-  max_dataset_size: 10_000 # Limit to 10k records for development/testing
-  balance_by_sex: true # Balance male/female samples when limiting
+  max_dataset_size: ~ # No cap; set e.g. 10_000 to limit records for development/testing
+  balance_by_sex: false # Balance male/female samples when limiting
 
 # Enhanced logging for development
 logging:
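Note: `~` is YAML's null literal, so `max_dataset_size` is now genuinely unset rather than capped. A quick check of how both values parse (assuming a YAML 1.1 loader such as PyYAML):

```python
import yaml  # PyYAML resolves ~ to None and 10_000 to the integer 10000 (YAML 1.1)

print(yaml.safe_load("max_dataset_size: ~"))       # {'max_dataset_size': None}
print(yaml.safe_load("max_dataset_size: 10_000"))  # {'max_dataset_size': 10000}
```

Consumers of the config should therefore treat `None` as "no limit" when slicing the dataset.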
@@ -2,9 +2,10 @@ import logging
 from pathlib import Path
 from typing import Optional, Union
 
-from core.config.config_manager import ConfigManager
-from core.config.logging_config import LoggingConfig
-from core.config.pipeline_config import PipelineConfig
+from core.utils import ensure_directories
+from .config_manager import ConfigManager
+from .logging_config import LoggingConfig
+from .pipeline_config import PipelineConfig
 
 config_manager = ConfigManager()
@@ -43,7 +44,6 @@ def setup_config(config_path: Optional[Path] = None, env: str = "development") -
     setup_logging(config)
 
     # Ensure required directories exist
-    from core.utils import ensure_directories
     ensure_directories(config)
 
     logging.info(f"Loaded configuration: {config.name} v{config.version}")
@@ -1,13 +1,17 @@
 import logging
 from contextlib import contextmanager
 from pathlib import Path
+from typing import TYPE_CHECKING
 
-from core.config import get_config, PipelineConfig
+if TYPE_CHECKING:
+    from core.config import PipelineConfig
 
 
 @contextmanager
 def temporary_config_override(**overrides):
     """Context manager for temporarily overriding configuration"""
+    from core.config import get_config
+
     config = get_config()
     original_values = {}
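Note: together with the previous hunk this breaks the `core.config` / `core.utils` import cycle: the class is imported only for type checkers, and `get_config` is resolved lazily at call time. A self-contained illustration of the pattern (the `demo` stand-in object is hypothetical):

```python
from pathlib import Path
from types import SimpleNamespace
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers (mypy, pyright), never at runtime,
    # so it cannot participate in a circular import.
    from core.config import PipelineConfig

def ensure_directories(config: "PipelineConfig") -> None:
    # The quoted annotation means the class object is never needed at runtime.
    config.paths.data_dir.mkdir(parents=True, exist_ok=True)

# Stand-in with the same shape as PipelineConfig.paths, just for the demo
demo = SimpleNamespace(paths=SimpleNamespace(data_dir=Path("/tmp/demo_data")))
ensure_directories(demo)  # type: ignore[arg-type]
```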
@@ -25,7 +29,7 @@ def temporary_config_override(**overrides):
             setattr(config, key, value)
 
 
-def ensure_directories(config: PipelineConfig) -> None:
+def ensure_directories(config: "PipelineConfig") -> None:
     """Ensure all required directories exist"""
     directories = [
         config.paths.data_dir,
@@ -42,16 +46,16 @@ def ensure_directories(config: PipelineConfig) -> None:
     logging.info("Ensured all required directories exist")
 
 
-def get_data_file_path(filename: str, config: PipelineConfig) -> Path:
+def get_data_file_path(filename: str, config: "PipelineConfig") -> Path:
     """Get full path for a data file"""
     return config.paths.data_dir / filename
 
 
-def get_model_file_path(filename: str, config: PipelineConfig) -> Path:
+def get_model_file_path(filename: str, config: "PipelineConfig") -> Path:
     """Get full path for a model file"""
     return config.paths.models_dir / filename
 
 
-def get_output_file_path(filename: str, config: PipelineConfig) -> Path:
+def get_output_file_path(filename: str, config: "PipelineConfig") -> Path:
     """Get full path for an output file"""
     return config.paths.outputs_dir / filename
@@ -3,6 +3,7 @@ from datetime import datetime
 from pathlib import Path
 from typing import List, Dict, Optional
 
+import joblib
 import numpy as np
 import pandas as pd
 from sklearn.metrics import confusion_matrix
@@ -207,13 +208,36 @@ class ExperimentRunner:
         experiment = self.tracker.get_experiment(experiment_id)
 
         if experiment and experiment.model_path:
-            return BaseModel.load(experiment.model_path)
+            try:
+                # Load the saved model data and recreate the model instance using the saved config
+                model_data = joblib.load(experiment.model_path)
+                config = ExperimentConfig.from_dict(model_data["config"])
+                model = create_model(config)
+
+                # Restore the saved state
+                model.model = model_data["model"]
+                model.feature_extractor = model_data["feature_extractor"]
+                model.label_encoder = model_data["label_encoder"]
+                model.tokenizer = model_data.get("tokenizer")
+                model.is_fitted = model_data["is_fitted"]
+                model.training_history = model_data.get("training_history", {})
+                model.learning_curve_data = model_data.get("learning_curve_data", {})
+
+                # Restore vectorizers and encoders for models that use them (like XGBoost)
+                if "vectorizers" in model_data and hasattr(model, 'vectorizers'):
+                    model.vectorizers = model_data["vectorizers"]
+                if "label_encoders" in model_data and hasattr(model, 'label_encoders'):
+                    model.label_encoders = model_data["label_encoders"]
+
+                return model
+
+            except Exception as e:
+                logging.error(f"Failed to load model for experiment {experiment_id}: {e}")
+                return None
 
         return None
 
-    def compare_experiments(
-        self, experiment_ids: List[str], metric: str = "accuracy"
-    ) -> pd.DataFrame:
+    def compare_experiments(self, experiment_ids: List[str], metric: str = "accuracy") -> pd.DataFrame:
         """Compare experiments and return analysis"""
         comparison_df = self.tracker.compare_experiments(experiment_ids)
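Note: the loader above implies a save-side contract: a dict dumped with joblib whose keys mirror what is read back. A hedged sketch of that counterpart (the project's actual save method may differ; `to_dict` mirroring `ExperimentConfig.from_dict` is an assumption):

```python
import joblib

def save_model_bundle(model, path):
    """Sketch of the save-side contract implied by the loader above."""
    bundle = {
        "config": model.config.to_dict(),  # assumed inverse of ExperimentConfig.from_dict
        "model": model.model,
        "feature_extractor": model.feature_extractor,
        "label_encoder": model.label_encoder,
        "tokenizer": getattr(model, "tokenizer", None),
        "is_fitted": model.is_fitted,
        "training_history": getattr(model, "training_history", {}),
        "learning_curve_data": getattr(model, "learning_curve_data", {}),
    }
    # Models with per-feature vectorizers/encoders (LightGBM, XGBoost) persist them too
    if hasattr(model, "vectorizers"):
        bundle["vectorizers"] = model.vectorizers
    if hasattr(model, "label_encoders"):
        bundle["label_encoders"] = model.label_encoders
    joblib.dump(bundle, path)
```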
@@ -11,6 +11,12 @@ from research.traditional_model import TraditionalModel
 class LightGBMModel(TraditionalModel):
     """LightGBM with engineered features"""
 
+    def __init__(self, config):
+        super().__init__(config)
+        # Store vectorizers and encoders to ensure consistent feature space
+        self.vectorizers = {}
+        self.label_encoders = {}
+
     def build_model(self) -> BaseEstimator:
         params = self.config.model_params
@@ -33,19 +39,58 @@ class LightGBMModel(TraditionalModel):
             column = X[feature_type.value]
 
             if feature_type.value in ["name_length", "word_count"]:
+                # Numerical features
                 features.append(column.fillna(0).values.reshape(-1, 1))
             elif feature_type.value in ["full_name", "native_name", "surname"]:
-                # Character n-grams for text features
-                vectorizer = CountVectorizer(
-                    analyzer="char", ngram_range=(2, 3), max_features=50
-                )
-                char_features = vectorizer.fit_transform(
-                    column.fillna("").astype(str)
-                ).toarray()
+                # Character-level features for names
+                feature_key = f"vectorizer_{feature_type.value}"
+
+                if feature_key not in self.vectorizers:
+                    # First time - create and fit vectorizer
+                    self.vectorizers[feature_key] = CountVectorizer(
+                        analyzer="char", ngram_range=(2, 3), max_features=50
+                    )
+                    char_features = self.vectorizers[feature_key].fit_transform(
+                        column.fillna("").astype(str)
+                    ).toarray()
+                else:
+                    # Subsequent times - use existing vectorizer
+                    char_features = self.vectorizers[feature_key].transform(
+                        column.fillna("").astype(str)
+                    ).toarray()
+
                 features.append(char_features)
             else:
-                le = LabelEncoder()
-                encoded = le.fit_transform(column.fillna("unknown").astype(str))
+                # Categorical features
+                feature_key = f"encoder_{feature_type.value}"
+
+                if feature_key not in self.label_encoders:
+                    # First time - create and fit encoder
+                    self.label_encoders[feature_key] = LabelEncoder()
+                    encoded = self.label_encoders[feature_key].fit_transform(
+                        column.fillna("unknown").astype(str)
+                    )
+                else:
+                    # Subsequent times - use existing encoder
+                    # Handle unseen labels by mapping them to a default value
+                    column_clean = column.fillna("unknown").astype(str)
+
+                    # Get the classes the encoder was trained on
+                    known_classes = set(self.label_encoders[feature_key].classes_)
+
+                    # Map unseen values to "unknown" if it exists, otherwise to the first class
+                    if "unknown" in known_classes:
+                        default_class = "unknown"
+                    else:
+                        default_class = self.label_encoders[feature_key].classes_[0]
+
+                    # Replace unseen values with default
+                    column_mapped = column_clean.apply(
+                        lambda x: x if x in known_classes else default_class
+                    )
+
+                    encoded = self.label_encoders[feature_key].transform(column_mapped)
+
                 features.append(encoded.reshape(-1, 1))
 
         return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
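Note: the substance of this hunk is the fit-once, transform-later contract: a `CountVectorizer` fitted during training must be reused at prediction time so the n-gram columns keep the same count and order. A standalone demonstration:

```python
from sklearn.feature_extraction.text import CountVectorizer

train = ["anna", "boris", "carol"]
incoming = ["dmitri"]

vec = CountVectorizer(analyzer="char", ngram_range=(2, 3), max_features=50)
train_features = vec.fit_transform(train).toarray()  # fit defines the feature space
new_features = vec.transform(incoming).toarray()     # reuse: same columns, same order

assert train_features.shape[1] == new_features.shape[1]
# Refitting on the incoming batch would instead produce a different,
# incompatible set of n-gram columns, breaking the trained model.
```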
@@ -11,6 +11,12 @@ from research.traditional_model import TraditionalModel
 class XGBoostModel(TraditionalModel):
     """XGBoost with engineered features and character embeddings"""
 
+    def __init__(self, config):
+        super().__init__(config)
+        # Store vectorizers and encoders to ensure consistent feature space
+        self.vectorizers = {}
+        self.label_encoders = {}
+
     def build_model(self) -> BaseEstimator:
         params = self.config.model_params
@@ -37,17 +43,54 @@ class XGBoostModel(TraditionalModel):
                 features.append(column.fillna(0).values.reshape(-1, 1))
             elif feature_type.value in ["full_name", "native_name", "surname"]:
                 # Character-level features for names
-                vectorizer = CountVectorizer(
-                    analyzer="char", ngram_range=(2, 3), max_features=100
-                )
-                char_features = vectorizer.fit_transform(
-                    column.fillna("").astype(str)
-                ).toarray()
+                feature_key = f"vectorizer_{feature_type.value}"
+
+                if feature_key not in self.vectorizers:
+                    # First time - create and fit vectorizer
+                    self.vectorizers[feature_key] = CountVectorizer(
+                        analyzer="char", ngram_range=(2, 3), max_features=100
+                    )
+                    char_features = self.vectorizers[feature_key].fit_transform(
+                        column.fillna("").astype(str)
+                    ).toarray()
+                else:
+                    # Subsequent times - use existing vectorizer
+                    char_features = self.vectorizers[feature_key].transform(
+                        column.fillna("").astype(str)
+                    ).toarray()
+
                 features.append(char_features)
             else:
                 # Categorical features
-                le = LabelEncoder()
-                encoded = le.fit_transform(column.fillna("unknown").astype(str))
+                feature_key = f"encoder_{feature_type.value}"
+
+                if feature_key not in self.label_encoders:
+                    # First time - create and fit encoder
+                    self.label_encoders[feature_key] = LabelEncoder()
+                    encoded = self.label_encoders[feature_key].fit_transform(
+                        column.fillna("unknown").astype(str)
+                    )
+                else:
+                    # Subsequent times - use existing encoder
+                    # Handle unseen labels by mapping them to a default value
+                    column_clean = column.fillna("unknown").astype(str)
+
+                    # Get the classes the encoder was trained on
+                    known_classes = set(self.label_encoders[feature_key].classes_)
+
+                    # Map unseen values to "unknown" if it exists, otherwise to the first class
+                    if "unknown" in known_classes:
+                        default_class = "unknown"
+                    else:
+                        default_class = self.label_encoders[feature_key].classes_[0]
+
+                    # Replace unseen values with default
+                    column_mapped = column_clean.apply(
+                        lambda x: x if x in known_classes else default_class
+                    )
+
+                    encoded = self.label_encoders[feature_key].transform(column_mapped)
+
                 features.append(encoded.reshape(-1, 1))
 
         return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
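Note: the unseen-label fallback duplicated in both models works around sklearn's `LabelEncoder.transform`, which raises `ValueError` on values missing from `classes_`. A compact standalone check of the mapping strategy (since the fit path fills NaNs with "unknown", that class is normally present):

```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(["ru", "en", "unknown"])  # classes_ -> ['en', 'ru', 'unknown']

incoming = pd.Series(["en", "zh", None]).fillna("unknown").astype(str)

known = set(le.classes_)
default = "unknown" if "unknown" in known else le.classes_[0]
mapped = incoming.apply(lambda x: x if x in known else default)

print(le.transform(mapped))  # [0 2 2]: "zh" falls back to "unknown" instead of raising
```

Since the block is identical in `LightGBMModel` and `XGBoostModel`, hoisting it into `TraditionalModel` would remove the duplication.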
@@ -95,9 +95,15 @@ class NeuralNetworkModel(BaseModel):
         recalls = []
         f1_scores = []
 
+        # Get vocabulary size and model parameters
+        vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
+        max_len = self.config.model_params.get("max_len", 6)
+
         for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)):
-            # Create fresh model for each fold
-            fold_model = self.build_model()
+            # Create fresh model for each fold using build_model_with_vocab
+            fold_model = self.build_model_with_vocab(
+                vocab_size=vocab_size, max_len=max_len, **self.config.model_params
+            )
 
             # Train on fold
             if hasattr(fold_model, "fit"):
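Note: `vocab_size` follows the Keras `Tokenizer` convention that `word_index` is 1-based, with index 0 reserved for padding, hence the `+ 1` (the `1000` fallback and the `max_len` default of 6 come straight from the diff). A quick illustration, assuming a `tensorflow.keras` tokenizer like the model presumably uses:

```python
from tensorflow.keras.preprocessing.text import Tokenizer

# Character-level tokenizer, as a name model would use
tok = Tokenizer(char_level=True)
tok.fit_on_texts(["anna", "boris"])

vocab_size = len(tok.word_index) + 1  # +1: index 0 is the padding token
print(tok.word_index)  # e.g. {'a': 1, 'n': 2, 'b': 3, ...}
print(vocab_size)      # 7 distinct characters -> 8
```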
@@ -127,13 +133,9 @@ class NeuralNetworkModel(BaseModel):
 
         return {
             "accuracy": np.mean(accuracies),
-            "accuracy_std": np.std(accuracies),
             "precision": np.mean(precisions),
-            "precision_std": np.std(precisions),
             "recall": np.mean(recalls),
-            "recall_std": np.std(recalls),
             "f1": np.mean(f1_scores),
-            "f1_std": np.std(f1_scores),
         }
 
     def generate_learning_curve(
@@ -150,9 +152,17 @@ class NeuralNetworkModel(BaseModel):
             "val_scores_std": [],
         }
 
+        # Prepare features and get vocabulary size
+        features_df = self.feature_extractor.extract_features(X)
+        X_prepared = self.prepare_features(features_df)
+        y_encoded = self.label_encoder.transform(y)
+
+        vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
+        max_len = self.config.model_params.get("max_len", 6)
+
         # Split data once for validation
         X_train_full, X_val, y_train_full, y_val = train_test_split(
-            X, y, test_size=0.2, random_state=self.config.random_seed, stratify=y
+            X_prepared, y_encoded, test_size=0.2, random_state=self.config.random_seed, stratify=y_encoded
         )
 
         for size in train_sizes:
@@ -170,8 +180,10 @@ class NeuralNetworkModel(BaseModel):
             val_scores = []
 
             for seed in range(3):  # 3 runs for variance
-                # Build fresh model
-                model = self.build_model()
+                # Build fresh model using build_model_with_vocab
+                model = self.build_model_with_vocab(
+                    vocab_size=vocab_size, max_len=max_len, **self.config.model_params
+                )
 
                 # Train model
                 if hasattr(model, "fit"):
@@ -50,8 +50,14 @@ class TraditionalModel(BaseModel):
         y_encoded = self.label_encoder.transform(y)
 
         # Train model
-        logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
-        self.model.fit(X_prepared, y_encoded, verbose=2)
+        if len(X_prepared.shape) == 1:
+            # For text-based features (like LogisticRegression with vectorization)
+            logging.info(f"Fitting model with {X_prepared.shape[0]} samples (text features)")
+        else:
+            # For numerical features
+            logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
+
+        self.model.fit(X_prepared, y_encoded)
         self.is_fitted = True
 
         return self
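Note: the shape check distinguishes pipelines that consume raw strings (1-D input, vectorization inside the estimator) from those fed a numeric matrix (2-D); dropping `verbose=2` also matters, since not every sklearn-compatible `fit()` accepts that argument. A standalone illustration of the dispatch:

```python
import numpy as np

text_input = np.array(["anna ivanova", "boris petrov"])  # 1-D: raw strings for a
                                                         # vectorizing Pipeline
numeric_input = np.array([[12.0, 2.0], [11.0, 2.0]])     # 2-D: engineered features

for X in (text_input, numeric_input):
    if len(X.shape) == 1:
        print(f"Fitting model with {X.shape[0]} samples (text features)")
    else:
        print(f"Fitting model with {X.shape[0]} samples and {X.shape[1]} features")
```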