From 7b652d6999d743c35761231e704a4b70854b5d3b Mon Sep 17 00:00:00 2001
From: bernard-ng <ngandubernard@gmail.com>
Date: Fri, 15 Aug 2025 08:08:11 +0200
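Subject: [PATCH] hotfixes: move data path helper onto ProjectPaths

Add ProjectPaths.get_data_path() and use it in place of
core.utils.get_data_file_path() and the hand-written
`config.paths.data_dir / ...` joins in the files touched here. Remove
get_data_file_path(), get_model_file_path() and get_output_file_path()
from core/utils/__init__.py.

Also decorate load_dataset() with @st.cache_data in the dashboard,
data_overview and data_processing interface modules, and collapse the
__main__ guards of main.py, monitor.py and train.py to
sys.exit(main()).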

---
 app.py                                      |  2 +-
 core/config/project_paths.py                |  3 +++
 core/utils/__init__.py                      | 15 ---------------
 interface/dashboard.py                      |  4 ++--
 interface/data_overview.py                  |  4 ++--
 interface/data_processing.py                |  1 +
 interface/predictions.py                    |  5 +----
 main.py                                     |  8 ++------
 monitor.py                                  |  3 +--
 ner.py                                      |  9 +++------
 processing/ner/ner_data_builder.py          | 11 ++++-------
 processing/ner/ner_engineering.py           |  9 +++------
 processing/steps/feature_extraction_step.py |  2 +-
 research/experiment/experiment_runner.py    |  5 ++---
 research/model_trainer.py                   |  3 +--
 research/neural_network_model.py            |  1 -
 train.py                                    |  3 +--
 17 files changed, 28 insertions(+), 60 deletions(-)

diff --git a/app.py b/app.py
index 8dc8a87..81a5b08 100644
--- a/app.py
+++ b/app.py
@@ -3,7 +3,7 @@ import argparse
 
 import streamlit as st
 
-from core.config import get_config, setup_config, PipelineConfig
+from core.config import setup_config, PipelineConfig
 from core.utils.data_loader import DataLoader
 from interface.configuration import Configuration
 from interface.dashboard import Dashboard
diff --git a/core/config/project_paths.py b/core/config/project_paths.py
index d2eba08..31972b6 100644
--- a/core/config/project_paths.py
+++ b/core/config/project_paths.py
@@ -21,3 +21,6 @@ class ProjectPaths(BaseModel):
     @field_validator("*", mode="before")
     def convert_to_path(cls, v):
         return Path(v) if not isinstance(v, Path) else v
+
+    def get_data_path(self, filename: str) -> Path:  # Full path of `filename` under data_dir
+        return self.data_dir / filename
diff --git a/core/utils/__init__.py b/core/utils/__init__.py
index 845a116..b452b86 100644
--- a/core/utils/__init__.py
+++ b/core/utils/__init__.py
@@ -44,18 +44,3 @@ def ensure_directories(config: "PipelineConfig") -> None:
         Path(directory).mkdir(parents=True, exist_ok=True)
 
     logging.info("Ensured all required directories exist")
-
-
-def get_data_file_path(filename: str, config: "PipelineConfig") -> Path:
-    """Get full path for a data file"""
-    return config.paths.data_dir / filename
-
-
-def get_model_file_path(filename: str, config: "PipelineConfig") -> Path:
-    """Get full path for a model file"""
-    return config.paths.models_dir / filename
-
-
-def get_output_file_path(filename: str, config: "PipelineConfig") -> Path:
-    """Get full path for an output file"""
-    return config.paths.outputs_dir / filename
diff --git a/interface/dashboard.py b/interface/dashboard.py
index aa02fa9..5287322 100644
--- a/interface/dashboard.py
+++ b/interface/dashboard.py
@@ -1,10 +1,10 @@
 import pandas as pd
 import streamlit as st
 
-from core.utils import get_data_file_path
 from core.utils.data_loader import OPTIMIZED_DTYPES
 
 
+@st.cache_data
 def load_dataset(file_path: str) -> pd.DataFrame:
     try:
         return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
@@ -25,7 +25,7 @@ class Dashboard:
 
         # Load basic statistics
         try:
-            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
+            data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
             if data_path.exists():
                 df = load_dataset(str(data_path))
 
diff --git a/interface/data_overview.py b/interface/data_overview.py
index 912afc3..fc34190 100644
--- a/interface/data_overview.py
+++ b/interface/data_overview.py
@@ -4,10 +4,10 @@ import pandas as pd
 import plotly.express as px
 import streamlit as st
 
-from core.utils import get_data_file_path
 from core.utils.data_loader import OPTIMIZED_DTYPES
 
 
+@st.cache_data
 def load_dataset(file_path: str) -> pd.DataFrame:
     try:
         return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
@@ -31,7 +31,7 @@ class DataOverview:
         }
 
         selected_file = st.selectbox("Select Dataset", list(data_files.keys()))
-        file_path = get_data_file_path(data_files[selected_file], self.config)
+        file_path = self.config.paths.get_data_path(data_files[selected_file])
 
         if not file_path.exists():
             st.warning(f"Dataset not found: {file_path}")
diff --git a/interface/data_processing.py b/interface/data_processing.py
index 2ff0482..4f6093e 100644
--- a/interface/data_processing.py
+++ b/interface/data_processing.py
@@ -6,6 +6,7 @@ from core.utils.data_loader import OPTIMIZED_DTYPES
 from interface.log_reader import LogReader
 
 
+@st.cache_data
 def load_dataset(file_path: str) -> pd.DataFrame:
     try:
         return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
diff --git a/interface/predictions.py b/interface/predictions.py
index c8154cb..56bbf99 100644
--- a/interface/predictions.py
+++ b/interface/predictions.py
@@ -1,5 +1,3 @@
-"""Predictions interface for the Streamlit app"""
-
 from datetime import datetime
 from typing import Optional
 
@@ -8,7 +6,6 @@ import pandas as pd
 import plotly.express as px
 import streamlit as st
 
-from core.utils import get_data_file_path
 from core.utils.data_loader import OPTIMIZED_DTYPES
 from research.experiment.experiment_runner import ExperimentRunner
 from research.experiment.experiment_tracker import ExperimentTracker
@@ -268,7 +265,7 @@ class Predictions:
         }
 
         selected_dataset = st.selectbox("Select Dataset", list(dataset_options.keys()))
-        file_path = get_data_file_path(dataset_options[selected_dataset], self.config)
+        file_path = self.config.paths.get_data_path(dataset_options[selected_dataset])
 
         if not file_path.exists():
             st.warning(f"Dataset not found: {file_path}")
diff --git a/main.py b/main.py
index bbc3a18..227bf51 100755
--- a/main.py
+++ b/main.py
@@ -5,7 +5,6 @@ import sys
 import traceback
 
 from core.config import setup_config
-from core.utils import get_data_file_path
 from core.utils.data_loader import DataLoader
 from processing.batch.batch_config import BatchConfig
 from processing.pipeline import Pipeline
@@ -47,7 +46,7 @@ def run_pipeline(config) -> int:
         logging.info(f"Starting pipeline: {config.name} v{config.version}")
 
         # Load input data
-        input_file_path = get_data_file_path(config.data.input_file, config)
+        input_file_path = config.paths.get_data_path(config.data.input_file)
         if not input_file_path.exists():
             logging.error(f"Input file not found: {input_file_path}")
             return 1
@@ -60,8 +59,6 @@ def run_pipeline(config) -> int:
 
         # Create and run pipeline
         pipeline = create_pipeline(config)
-
-        logging.info("Starting pipeline execution")
         data_splitter.split(pipeline.run(df))
 
         # Show completion statistics
@@ -104,5 +101,4 @@ def main():
 
 
 if __name__ == "__main__":
-    exit_code = main()
-    sys.exit(exit_code)
+    sys.exit(main())
diff --git a/monitor.py b/monitor.py
index 1d435b5..58af1e2 100755
--- a/monitor.py
+++ b/monitor.py
@@ -86,5 +86,4 @@ def main():
 
 
 if __name__ == "__main__":
-    exit_code = main()
-    sys.exit(exit_code)
+    sys.exit(main())
diff --git a/ner.py b/ner.py
index 09dfcdd..1e4ed8f 100755
--- a/ner.py
+++ b/ner.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python3
 import argparse
 import logging
-import sys
 import os
+import sys
 import traceback
 from pathlib import Path
 
@@ -48,21 +48,18 @@ def train(config: PipelineConfig):
 
 
 def run_pipeline(config: PipelineConfig, reset: bool = False):
-    # Step 1: Feature engineering
-    if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["engineered"]):
+    if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["engineered"])):
         logging.info("Step 1: Feature engineering already done.")
     else:
         logging.info("Step 1: Running feature engineering")
         feature(config)
 
-    # Step 2: Build dataset
-    if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["ner_data"]):
+    if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["ner_data"])):
         logging.info("Step 2: NER dataset already built.")
     else:
         logging.info("Step 2: Building NER dataset")
         build(config)
 
-    # Step 3: Train model
     logging.info("Step 3: Training NER Model")
     train(config)
 
diff --git a/processing/ner/ner_data_builder.py b/processing/ner/ner_data_builder.py
index 86022bb..4d1d5cc 100644
--- a/processing/ner/ner_data_builder.py
+++ b/processing/ner/ner_data_builder.py
@@ -9,7 +9,6 @@ from spacy.tokens import DocBin
 from spacy.util import filter_spans
 
 from core.config import PipelineConfig
-from core.utils import get_data_file_path
 from core.utils.data_loader import DataLoader
 
 
@@ -98,10 +97,8 @@ class NERDataBuilder:
         return docs
 
     def build(self) -> int:
-        input_filepath = get_data_file_path(
-            self.config.data.output_files["engineered"], self.config
-        )
-        df = self.data_loader.load_csv_complete(input_filepath)
+        filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
+        df = self.data_loader.load_csv_complete(filepath)
         df = df[["name", "ner_tagged", "ner_entities"]]
 
         # Filter early
@@ -139,8 +136,8 @@ class NERDataBuilder:
         doc_bin = DocBin(docs=docs)
 
         # Save
-        json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
-        spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]
+        json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
+        spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
 
         with open(json_path, "w", encoding="utf-8") as f:
             json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
diff --git a/processing/ner/ner_engineering.py b/processing/ner/ner_engineering.py
index 398ab23..713654b 100644
--- a/processing/ner/ner_engineering.py
+++ b/processing/ner/ner_engineering.py
@@ -7,7 +7,6 @@ import pandas as pd
 from tqdm import tqdm
 
 from core.config import PipelineConfig
-from core.utils import get_data_file_path
 from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
 from processing.ner.formats.connectors_format import ConnectorFormatter
 from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
@@ -55,7 +54,7 @@ class NEREngineering:
     def load_data(self) -> pd.DataFrame:
         """Load and filter NER-tagged data from CSV file"""
 
-        filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
+        filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
         df = self.data_loader.load_csv_complete(filepath)
 
         # Filter only NER-tagged rows
@@ -66,10 +65,8 @@ class NEREngineering:
 
     def compute(self) -> None:
         logging.info("Applying feature engineering transformations...")
-        input_filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
-        output_filepath = get_data_file_path(
-            self.config.data.output_files["engineered"], self.config
-        )
+        input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
+        output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
 
         df = self.data_loader.load_csv_complete(input_filepath)
         ner_df = df[df["ner_tagged"] == 1].copy()
diff --git a/processing/steps/feature_extraction_step.py b/processing/steps/feature_extraction_step.py
index dc2dfff..b64ede6 100644
--- a/processing/steps/feature_extraction_step.py
+++ b/processing/steps/feature_extraction_step.py
@@ -127,7 +127,7 @@ class FeatureExtractionStep(PipelineStep):
 
     def _process_simple_names(self, df: pd.DataFrame) -> None:
         """Process 3-word names efficiently with vectorized operations"""
-        mask = df["words"] == 3
+        mask = pd.Series(df["words"] == 3)
 
         if not mask.any():
             return
diff --git a/research/experiment/experiment_runner.py b/research/experiment/experiment_runner.py
index d997443..7cc3201 100644
--- a/research/experiment/experiment_runner.py
+++ b/research/experiment/experiment_runner.py
@@ -10,7 +10,6 @@ from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import train_test_split
 
 from core.config import PipelineConfig
-from core.utils import get_data_file_path
 from core.utils.data_loader import DataLoader
 from research.base_model import BaseModel
 from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
@@ -36,8 +35,8 @@ class ExperimentRunner:
             self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
 
             # Load data
-            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
-            df = self.data_loader.load_csv_complete(data_path)
+            filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
+            df = self.data_loader.load_csv_complete(filepath)
 
             # Apply data filters if specified
             df = self._apply_data_filters(df, experiment_config)
diff --git a/research/model_trainer.py b/research/model_trainer.py
index b27f702..5d908ee 100644
--- a/research/model_trainer.py
+++ b/research/model_trainer.py
@@ -6,7 +6,6 @@ from typing import List, Dict, Any
 import pandas as pd
 
 from core.config import get_config
-from core.utils import get_data_file_path
 from core.utils.data_loader import DataLoader
 from research.experiment import FeatureType, ExperimentConfig
 from research.experiment.experiment_runner import ExperimentRunner
@@ -145,7 +144,7 @@ class ModelTrainer:
 
         try:
             # Load data for learning curve generation
-            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
+            data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
             if data_path.exists():
                 df = self.data_loader.load_csv_complete(data_path)
 
diff --git a/research/neural_network_model.py b/research/neural_network_model.py
index 6cbe13b..927995f 100644
--- a/research/neural_network_model.py
+++ b/research/neural_network_model.py
@@ -72,7 +72,6 @@ class NeuralNetworkModel(BaseModel):
         )
 
         # Store training history
-
         self.training_history = {
             "accuracy": history.history["accuracy"],
             "loss": history.history["loss"],
diff --git a/train.py b/train.py
index a7a9184..290e87a 100755
--- a/train.py
+++ b/train.py
@@ -112,5 +112,4 @@ def main():
 
 
 if __name__ == "__main__":
-    exit_code = main()
-    sys.exit(exit_code)
+    sys.exit(main())