feat: enhance logging and memory management across modules
@@ -224,9 +224,9 @@ class ExperimentRunner:
         model.learning_curve_data = model_data.get("learning_curve_data", {})
 
         # Restore vectorizers and encoders for models that use them (like XGBoost)
-        if "vectorizers" in model_data and hasattr(model, 'vectorizers'):
+        if "vectorizers" in model_data and hasattr(model, "vectorizers"):
             model.vectorizers = model_data["vectorizers"]
-        if "label_encoders" in model_data and hasattr(model, 'label_encoders'):
+        if "label_encoders" in model_data and hasattr(model, "label_encoders"):
             model.label_encoders = model_data["label_encoders"]
 
         return model
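The guarded restore above rehydrates fitted preprocessing objects that were serialized with the model, so prediction-time feature extraction matches training. A minimal sketch of the round trip, using an illustrative `SimpleModel` and pickle (not necessarily this repo's serialization format):

```python
import pickle


class SimpleModel:
    """Hypothetical stand-in for a model that owns vectorizers/encoders."""

    def __init__(self):
        self.vectorizers = {}
        self.label_encoders = {}


def save_model_data(model, path):
    # Persist the fitted preprocessing objects alongside model state.
    model_data = {
        "vectorizers": model.vectorizers,
        "label_encoders": model.label_encoders,
    }
    with open(path, "wb") as f:
        pickle.dump(model_data, f)


def load_model_data(path):
    with open(path, "rb") as f:
        model_data = pickle.load(f)
    model = SimpleModel()
    # Same guarded restore as the diff: only set attributes the class declares.
    if "vectorizers" in model_data and hasattr(model, "vectorizers"):
        model.vectorizers = model_data["vectorizers"]
    if "label_encoders" in model_data and hasattr(model, "label_encoders"):
        model.label_encoders = model_data["label_encoders"]
    return model
```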
@@ -237,7 +237,9 @@ class ExperimentRunner:
 
         return None
 
-    def compare_experiments(self, experiment_ids: List[str], metric: str = "accuracy") -> pd.DataFrame:
+    def compare_experiments(
+        self, experiment_ids: List[str], metric: str = "accuracy"
+    ) -> pd.DataFrame:
         """Compare experiments and return analysis"""
         comparison_df = self.tracker.compare_experiments(experiment_ids)
 
@@ -28,13 +28,13 @@ class ModelTrainer:
         self.models_dir.mkdir(parents=True, exist_ok=True)
 
     def train_single_model(
-        self,
-        model_name: str,
-        model_type: str = "logistic_regression",
-        features: List[str] = None,
-        model_params: Dict[str, Any] = None,
-        tags: List[str] = None,
-        save_artifacts: bool = True,
+        self,
+        model_name: str,
+        model_type: str = "logistic_regression",
+        features: List[str] = None,
+        model_params: Dict[str, Any] = None,
+        tags: List[str] = None,
+        save_artifacts: bool = True,
     ) -> str:
         """
         Train a single model and save its artifacts.
@@ -76,10 +76,7 @@ class ModelTrainer:
         return experiment_id
 
     def train_multiple_models(
-        self,
-        base_name: str,
-        model_configs: List[Dict[str, Any]],
-        save_all: bool = True
+        self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
     ) -> List[str]:
         """
         Train multiple models with different configurations.
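These two signature reformats are consistent with Black's "magic trailing comma": `train_single_model` ends its parameter list with a trailing comma, so the one-parameter-per-line layout is preserved, while `train_multiple_models` has no trailing comma and its parameters fit on one line, so they are collapsed. A hypothetical illustration, assuming Black's default behavior:

```python
# Magic trailing comma: Black keeps this exploded, one parameter per line.
def exploded(
    first: int,
    second: int,
) -> int:
    return first + second


# No trailing comma and the signature fits the line length: Black collapses it.
def collapsed(first: int, second: int) -> int:
    return first + second
```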
@@ -50,14 +50,18 @@ class LightGBMModel(TraditionalModel):
                    self.vectorizers[feature_key] = CountVectorizer(
                        analyzer="char", ngram_range=(2, 3), max_features=50
                    )
-                    char_features = self.vectorizers[feature_key].fit_transform(
-                        column.fillna("").astype(str)
-                    ).toarray()
+                    char_features = (
+                        self.vectorizers[feature_key]
+                        .fit_transform(column.fillna("").astype(str))
+                        .toarray()
+                    )
                 else:
                     # Subsequent times - use existing vectorizer
-                    char_features = self.vectorizers[feature_key].transform(
-                        column.fillna("").astype(str)
-                    ).toarray()
+                    char_features = (
+                        self.vectorizers[feature_key]
+                        .transform(column.fillna("").astype(str))
+                        .toarray()
+                    )
 
                 features.append(char_features)
             else:
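The branch above fits the char n-gram vectorizer the first time a feature is seen and reuses it afterwards, so training and inference share the same learned vocabulary. A minimal standalone sketch of that pattern (`feature_key`, `texts_train`, and `texts_new` are illustrative):

```python
from sklearn.feature_extraction.text import CountVectorizer

vectorizers = {}
feature_key = "domain"  # illustrative feature name

texts_train = ["example.com", "test.org"]
texts_new = ["sample.net"]

if feature_key not in vectorizers:
    # First time: fit on training data and remember the vectorizer.
    vectorizers[feature_key] = CountVectorizer(
        analyzer="char", ngram_range=(2, 3), max_features=50
    )
    char_features = vectorizers[feature_key].fit_transform(texts_train).toarray()
else:
    # Subsequent times: transform only, using the already-fitted vocabulary.
    char_features = vectorizers[feature_key].transform(texts_new).toarray()

print(char_features.shape)  # (n_samples, n_learned_ngrams)
```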
@@ -20,9 +20,7 @@ class LogisticRegressionModel(TraditionalModel):
         )
 
         classifier = LogisticRegression(
-            max_iter=params.get("max_iter", 1000),
-            random_state=self.config.random_seed,
-            verbose=2
+            max_iter=params.get("max_iter", 1000), random_state=self.config.random_seed, verbose=2
         )
 
         return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
@@ -18,7 +18,7 @@ class RandomForestModel(TraditionalModel):
             n_estimators=params.get("n_estimators", 100),
             max_depth=params.get("max_depth", None),
             random_state=self.config.random_seed,
-            verbose=2
+            verbose=2,
         )
 
     def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
@@ -25,7 +25,7 @@ class SVMModel(TraditionalModel):
             gamma=params.get("gamma", "scale"),
             probability=True,  # Enable probability prediction
             random_state=self.config.random_seed,
-            verbose=2
+            verbose=2,
         )
 
         return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
@@ -28,7 +28,7 @@ class XGBoostModel(TraditionalModel):
             colsample_bytree=params.get("colsample_bytree", 0.8),
             random_state=self.config.random_seed,
             eval_metric="logloss",
-            verbosity=2
+            verbosity=2,
         )
 
     def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
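One naming detail worth noting across these estimator changes: scikit-learn estimators (LogisticRegression, RandomForestClassifier, SVC) take `verbose`, while XGBoost's sklearn wrapper takes `verbosity` (0-3). A hedged sketch, assuming scikit-learn and xgboost are installed and using an assumed seed of 42:

```python
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# scikit-learn estimators log via `verbose` (int; higher = chattier).
rf = RandomForestClassifier(n_estimators=100, random_state=42, verbose=2)

# XGBoost uses `verbosity` instead: 0=silent, 1=warning, 2=info, 3=debug.
xgb = XGBClassifier(random_state=42, eval_metric="logloss", verbosity=2)
```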
@@ -50,14 +50,18 @@ class XGBoostModel(TraditionalModel):
                    self.vectorizers[feature_key] = CountVectorizer(
                        analyzer="char", ngram_range=(2, 3), max_features=100
                    )
-                    char_features = self.vectorizers[feature_key].fit_transform(
-                        column.fillna("").astype(str)
-                    ).toarray()
+                    char_features = (
+                        self.vectorizers[feature_key]
+                        .fit_transform(column.fillna("").astype(str))
+                        .toarray()
+                    )
                 else:
                     # Subsequent times - use existing vectorizer
-                    char_features = self.vectorizers[feature_key].transform(
-                        column.fillna("").astype(str)
-                    ).toarray()
+                    char_features = (
+                        self.vectorizers[feature_key]
+                        .transform(column.fillna("").astype(str))
+                        .toarray()
+                    )
 
                 features.append(char_features)
             else:
@@ -59,7 +59,9 @@ class NeuralNetworkModel(BaseModel):
         )
 
         # Train the neural network
-        logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
+        logging.info(
+            f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
+        )
         history = self.model.fit(
             X_prepared,
             y_encoded,
@@ -162,7 +164,11 @@ class NeuralNetworkModel(BaseModel):
 
         # Split data once for validation
         X_train_full, X_val, y_train_full, y_val = train_test_split(
-            X_prepared, y_encoded, test_size=0.2, random_state=self.config.random_seed, stratify=y_encoded
+            X_prepared,
+            y_encoded,
+            test_size=0.2,
+            random_state=self.config.random_seed,
+            stratify=y_encoded,
         )
 
         for size in train_sizes:
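The split above passes `stratify=y_encoded` so class proportions are preserved in both partitions, which matters when sweeping training sizes for a learning curve. A small self-contained illustration with synthetic imbalanced labels and an assumed seed of 42:

```python
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(100).reshape(-1, 1)
y = np.array([0] * 80 + [1] * 20)  # imbalanced 80/20 labels

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Both partitions keep the original 80/20 class ratio.
print(np.bincount(y_train))  # [64 16]
print(np.bincount(y_val))    # [16  4]
```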
@@ -55,7 +55,9 @@ class TraditionalModel(BaseModel):
             logging.info(f"Fitting model with {X_prepared.shape[0]} samples (text features)")
         else:
             # For numerical features
-            logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
+            logging.info(
+                f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
+            )
 
         self.model.fit(X_prepared, y_encoded)
         self.is_fitted = True
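For completeness, the wrapped calls above keep f-strings, which are formatted eagerly; the stdlib logging module also supports lazy %-style formatting that defers string building until the record is actually emitted. A minimal sketch (the feature matrix is illustrative):

```python
import logging

import numpy as np

logging.basicConfig(level=logging.INFO)

X_prepared = np.zeros((150, 20))  # illustrative feature matrix

# Eager: the f-string is built even if INFO records are filtered out.
logging.info(
    f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
)

# Lazy: arguments are interpolated only if the record is emitted.
logging.info(
    "Fitting model with %d samples and %d features",
    X_prepared.shape[0],
    X_prepared.shape[1],
)
```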