refactor: reorganize project structure and enhance model verbosity

2025-08-06 21:57:10 +02:00
parent ad8db43748
commit d7aa24a935
23 changed files with 1209 additions and 1416 deletions
@@ -11,6 +11,7 @@ from core.utils.data_loader import DataLoader
 from research.experiment import FeatureType, ExperimentConfig
 from research.experiment.experiment_runner import ExperimentRunner
 from research.experiment.experiment_tracker import ExperimentTracker
+from research.model_registry import MODEL_REGISTRY


 class ModelTrainer:
@@ -21,25 +22,24 @@ class ModelTrainer:
        self.data_loader = DataLoader(self.config)
        self.experiment_runner = ExperimentRunner(self.config)
        self.experiment_tracker = ExperimentTracker(self.config)
-        self.logger = logging.getLogger(__name__)

        # Setup model artifacts directory
        self.models_dir = self.config.paths.models_dir
        self.models_dir.mkdir(parents=True, exist_ok=True)

    def train_single_model(
-        self,
-        model_name: str,
-        model_type: str = "logistic_regression",
-        features: List[str] = None,
-        model_params: Dict[str, Any] = None,
-        save_artifacts: bool = True,
+            self,
+            model_name: str,
+            model_type: str = "logistic_regression",
+            features: List[str] = None,
+            model_params: Dict[str, Any] = None,
+            save_artifacts: bool = True,
    ) -> str:
        """
        Train a single model and save its artifacts.
        Returns the experiment ID.
        """
-        self.logger.info(f"Training {model_type} model: {model_name}")
+        logging.info(f"Training {model_type} model: {model_name}")

        if features is None:
            features = ["full_name"]
@@ -60,10 +60,10 @@ class ModelTrainer:
        experiment = self.experiment_tracker.get_experiment(experiment_id)

        if experiment and experiment.test_metrics:
-            self.logger.info("Training completed successfully!")
-            self.logger.info(f"   Experiment ID: {experiment_id}")
-            self.logger.info(f"   Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}")
-            self.logger.info(f"   Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")
+            logging.info("Training completed successfully!")
+            logging.info(f"Experiment ID: {experiment_id}")
+            logging.info(f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}")
+            logging.info(f"Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}")

            if save_artifacts:
                self.save_model_artifacts(experiment_id)
@@ -71,12 +71,15 @@ class ModelTrainer:
        return experiment_id

    def train_multiple_models(
-        self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
+            self,
+            base_name: str,
+            model_configs: List[Dict[str, Any]],
+            save_all: bool = True
    ) -> List[str]:
        """
        Train multiple models with different configurations.
        """
-        self.logger.info(f"Training {len(model_configs)} models...")
+        logging.info(f"Training {len(model_configs)} models...")

        experiment_ids = []

@@ -94,10 +97,10 @@ class ModelTrainer:
                experiment_ids.append(exp_id)

            except Exception as e:
-                self.logger.error(f"Failed to train {model_name}: {e}")
+                logging.error(f"Failed to train {model_name}: {e}")
                continue

-        self.logger.info(f"Completed training {len(experiment_ids)} models successfully")
+        logging.info(f"Completed training {len(experiment_ids)} models successfully")
        return experiment_ids

    def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]:
@@ -145,7 +148,7 @@ class ModelTrainer:
                df = self.data_loader.load_csv_complete(data_path)

                # Generate learning curve
-                self.logger.info("Generating learning curve...")
+                logging.info("Generating learning curve...")
                trained_model.generate_learning_curve(df, df[experiment.config.target_column])

                # Plot and save learning curve
@@ -169,7 +172,7 @@ class ModelTrainer:
                        json.dump(trained_model.training_history, f, indent=2)

        except Exception as e:
-            self.logger.warning(f"Could not generate learning curves: {e}")
+            logging.warning(f"Could not generate learning curves: {e}")

        # Save artifacts metadata
        metadata = {
@@ -193,17 +196,17 @@ class ModelTrainer:
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=2)

-        self.logger.info(f"Model artifacts saved to: {model_dir}")
-        self.logger.info(f"   - Complete model: {model_path.name}")
-        self.logger.info(f"   - Configuration: {config_path.name}")
-        self.logger.info(f"   - Results: {results_path.name}")
-        self.logger.info(f"   - Metadata: {metadata_path.name}")
+        logging.info(f"Model artifacts saved to: {model_dir}")
+        logging.info(f"   - Complete model: {model_path.name}")
+        logging.info(f"   - Configuration: {config_path.name}")
+        logging.info(f"   - Results: {results_path.name}")
+        logging.info(f"   - Metadata: {metadata_path.name}")

        if learning_curve_path and learning_curve_path.exists():
-            self.logger.info(f"   - Learning curve: {learning_curve_path.name}")
+            logging.info(f"   - Learning curve: {learning_curve_path.name}")

        if training_history_path and training_history_path.exists():
-            self.logger.info(f"   - Training history: {training_history_path.name}")
+            logging.info(f"   - Training history: {training_history_path.name}")

        return {
            "model_dir": str(model_dir),
@@ -231,16 +234,14 @@ class ModelTrainer:
            metadata = json.load(f)

        model_type = metadata["model_type"]
-        from research.model_registry import MODEL_REGISTRY
-
        model_class = MODEL_REGISTRY[model_type]

        # Load the complete model
        loaded_model = model_class.load(str(model_path))

-        self.logger.info(f"Loaded model: {metadata['model_name']}")
-        self.logger.info(f"   Type: {model_type}")
-        self.logger.info(f"   Accuracy: {metadata['test_accuracy']:.4f}")
+        logging.info(f"Loaded model: {metadata['model_name']}")
+        logging.info(f"   Type: {model_type}")
+        logging.info(f"   Accuracy: {metadata['test_accuracy']:.4f}")

        return loaded_model

@@ -259,10 +260,10 @@ class ModelTrainer:
                            metadata = json.load(f)
                        models_data.append(metadata)
                    except Exception as e:
-                        self.logger.warning(f"Could not read metadata for {model_dir.name}: {e}")
+                        logging.warning(f"Could not read metadata for {model_dir.name}: {e}")

        if not models_data:
-            self.logger.info("No saved models found.")
+            logging.info("No saved models found.")
            return pd.DataFrame()

        df = pd.DataFrame(models_data)
@@ -22,7 +22,7 @@ class LightGBMModel(TraditionalModel):
            subsample=params.get("subsample", 0.8),
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
-            verbose=-1,
+            verbose=2,
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
@@ -20,7 +20,9 @@ class LogisticRegressionModel(TraditionalModel):
        )

        classifier = LogisticRegression(
-            max_iter=params.get("max_iter", 1000), random_state=self.config.random_seed
+            max_iter=params.get("max_iter", 1000),
+            random_state=self.config.random_seed,
+            verbose=2
        )

        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
@@ -18,6 +18,7 @@ class RandomForestModel(TraditionalModel):
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth", None),
            random_state=self.config.random_seed,
+            verbose=2
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
@@ -25,6 +25,7 @@ class SVMModel(TraditionalModel):
            gamma=params.get("gamma", "scale"),
            probability=True,  # Enable probability prediction
            random_state=self.config.random_seed,
+            verbose=2
        )

        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
@@ -22,6 +22,7 @@ class XGBoostModel(TraditionalModel):
            colsample_bytree=params.get("colsample_bytree", 0.8),
            random_state=self.config.random_seed,
            eval_metric="logloss",
+            verbosity=2
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
@@ -49,6 +49,7 @@ class NeuralNetworkModel(BaseModel):

        # Now we can build the model with known vocab size
        vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
+        logging.info(f"Vocabulary size: {vocab_size}")

        # Get additional model parameters
        max_len = self.config.model_params.get("max_len", 6)
@@ -58,16 +59,18 @@ class NeuralNetworkModel(BaseModel):
        )

        # Train the neural network
+        logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
        history = self.model.fit(
            X_prepared,
            y_encoded,
            epochs=self.config.model_params.get("epochs", 10),
            batch_size=self.config.model_params.get("batch_size", 64),
            validation_split=0.1,
-            verbose=1,
+            verbose=2,
        )

        # Store training history
+
        self.training_history = {
            "accuracy": history.history["accuracy"],
            "loss": history.history["loss"],
@@ -50,7 +50,8 @@ class TraditionalModel(BaseModel):
            y_encoded = self.label_encoder.transform(y)

        # Train model
-        self.model.fit(X_prepared, y_encoded)
+        logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
+        self.model.fit(X_prepared, y_encoded, verbose=2)
        self.is_fitted = True

        return self