From 7b652d6999d743c35761231e704a4b70854b5d3b Mon Sep 17 00:00:00 2001 From: bernard-ng Date: Fri, 15 Aug 2025 08:08:11 +0200 Subject: [PATCH] hotfixes --- app.py | 2 +- core/config/project_paths.py | 3 +++ core/utils/__init__.py | 15 --------------- interface/dashboard.py | 4 ++-- interface/data_overview.py | 4 ++-- interface/data_processing.py | 1 + interface/predictions.py | 5 +---- main.py | 8 ++------ monitor.py | 3 +-- ner.py | 9 +++------ processing/ner/ner_data_builder.py | 11 ++++------- processing/ner/ner_engineering.py | 9 +++------ processing/steps/feature_extraction_step.py | 2 +- research/experiment/experiment_runner.py | 5 ++--- research/model_trainer.py | 3 +-- research/neural_network_model.py | 1 - train.py | 3 +-- 17 files changed, 28 insertions(+), 60 deletions(-) diff --git a/app.py b/app.py index 8dc8a87..81a5b08 100644 --- a/app.py +++ b/app.py @@ -3,7 +3,7 @@ import argparse import streamlit as st -from core.config import get_config, setup_config, PipelineConfig +from core.config import setup_config, PipelineConfig from core.utils.data_loader import DataLoader from interface.configuration import Configuration from interface.dashboard import Dashboard diff --git a/core/config/project_paths.py b/core/config/project_paths.py index d2eba08..31972b6 100644 --- a/core/config/project_paths.py +++ b/core/config/project_paths.py @@ -21,3 +21,6 @@ class ProjectPaths(BaseModel): @field_validator("*", mode="before") def convert_to_path(cls, v): return Path(v) if not isinstance(v, Path) else v + + def get_data_path(self, filename: str) -> Path: + return self.data_dir / filename diff --git a/core/utils/__init__.py b/core/utils/__init__.py index 845a116..b452b86 100644 --- a/core/utils/__init__.py +++ b/core/utils/__init__.py @@ -44,18 +44,3 @@ def ensure_directories(config: "PipelineConfig") -> None: Path(directory).mkdir(parents=True, exist_ok=True) logging.info("Ensured all required directories exist") - - -def get_data_file_path(filename: str, config: "PipelineConfig") -> Path: - """Get full path for a data file""" - return config.paths.data_dir / filename - - -def get_model_file_path(filename: str, config: "PipelineConfig") -> Path: - """Get full path for a model file""" - return config.paths.models_dir / filename - - -def get_output_file_path(filename: str, config: "PipelineConfig") -> Path: - """Get full path for an output file""" - return config.paths.outputs_dir / filename diff --git a/interface/dashboard.py b/interface/dashboard.py index aa02fa9..5287322 100644 --- a/interface/dashboard.py +++ b/interface/dashboard.py @@ -1,10 +1,10 @@ import pandas as pd import streamlit as st -from core.utils import get_data_file_path from core.utils.data_loader import OPTIMIZED_DTYPES +@st.cache_data def load_dataset(file_path: str) -> pd.DataFrame: try: return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES) @@ -25,7 +25,7 @@ class Dashboard: # Load basic statistics try: - data_path = get_data_file_path(self.config.data.output_files["featured"], self.config) + data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"]) if data_path.exists(): df = load_dataset(str(data_path)) diff --git a/interface/data_overview.py b/interface/data_overview.py index 912afc3..fc34190 100644 --- a/interface/data_overview.py +++ b/interface/data_overview.py @@ -4,10 +4,10 @@ import pandas as pd import plotly.express as px import streamlit as st -from core.utils import get_data_file_path from core.utils.data_loader import OPTIMIZED_DTYPES +@st.cache_data def load_dataset(file_path: str) -> pd.DataFrame: try: return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES) @@ -31,7 +31,7 @@ class DataOverview: } selected_file = st.selectbox("Select Dataset", list(data_files.keys())) - file_path = get_data_file_path(data_files[selected_file], self.config) + file_path = self.config.paths.get_data_path(data_files[selected_file]) if not file_path.exists(): st.warning(f"Dataset not found: {file_path}") diff --git a/interface/data_processing.py b/interface/data_processing.py index 2ff0482..4f6093e 100644 --- a/interface/data_processing.py +++ b/interface/data_processing.py @@ -6,6 +6,7 @@ from core.utils.data_loader import OPTIMIZED_DTYPES from interface.log_reader import LogReader +@st.cache_data def load_dataset(file_path: str) -> pd.DataFrame: try: return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES) diff --git a/interface/predictions.py b/interface/predictions.py index c8154cb..56bbf99 100644 --- a/interface/predictions.py +++ b/interface/predictions.py @@ -1,5 +1,3 @@ -"""Predictions interface for the Streamlit app""" - from datetime import datetime from typing import Optional @@ -8,7 +6,6 @@ import pandas as pd import plotly.express as px import streamlit as st -from core.utils import get_data_file_path from core.utils.data_loader import OPTIMIZED_DTYPES from research.experiment.experiment_runner import ExperimentRunner from research.experiment.experiment_tracker import ExperimentTracker @@ -268,7 +265,7 @@ class Predictions: } selected_dataset = st.selectbox("Select Dataset", list(dataset_options.keys())) - file_path = get_data_file_path(dataset_options[selected_dataset], self.config) + file_path = self.config.paths.get_data_path(dataset_options[selected_dataset]) if not file_path.exists(): st.warning(f"Dataset not found: {file_path}") diff --git a/main.py b/main.py index bbc3a18..227bf51 100755 --- a/main.py +++ b/main.py @@ -5,7 +5,6 @@ import sys import traceback from core.config import setup_config -from core.utils import get_data_file_path from core.utils.data_loader import DataLoader from processing.batch.batch_config import BatchConfig from processing.pipeline import Pipeline @@ -47,7 +46,7 @@ def run_pipeline(config) -> int: logging.info(f"Starting pipeline: {config.name} v{config.version}") # Load input data - input_file_path = get_data_file_path(config.data.input_file, config) + input_file_path = config.paths.get_data_path(config.data.input_file) if not input_file_path.exists(): logging.error(f"Input file not found: {input_file_path}") return 1 @@ -60,8 +59,6 @@ def run_pipeline(config) -> int: # Create and run pipeline pipeline = create_pipeline(config) - - logging.info("Starting pipeline execution") data_splitter.split(pipeline.run(df)) # Show completion statistics @@ -104,5 +101,4 @@ def main(): if __name__ == "__main__": - exit_code = main() - sys.exit(exit_code) + sys.exit(main()) diff --git a/monitor.py b/monitor.py index 1d435b5..58af1e2 100755 --- a/monitor.py +++ b/monitor.py @@ -86,5 +86,4 @@ def main(): if __name__ == "__main__": - exit_code = main() - sys.exit(exit_code) + sys.exit(main()) diff --git a/ner.py b/ner.py index 09dfcdd..1e4ed8f 100755 --- a/ner.py +++ b/ner.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 import argparse import logging -import sys import os +import sys import traceback from pathlib import Path @@ -48,21 +48,18 @@ def train(config: PipelineConfig): def run_pipeline(config: PipelineConfig, reset: bool = False): - # Step 1: Feature engineering - if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["engineered"]): + if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["engineered"])): logging.info("Step 1: Feature engineering already done.") else: logging.info("Step 1: Running feature engineering") feature(config) - # Step 2: Build dataset - if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["ner_data"]): + if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["ner_data"])): logging.info("Step 2: NER dataset already built.") else: logging.info("Step 2: Building NER dataset") build(config) - # Step 3: Train model logging.info("Step 3: Training NER Model") train(config) diff --git a/processing/ner/ner_data_builder.py b/processing/ner/ner_data_builder.py index 86022bb..4d1d5cc 100644 --- a/processing/ner/ner_data_builder.py +++ b/processing/ner/ner_data_builder.py @@ -9,7 +9,6 @@ from spacy.tokens import DocBin from spacy.util import filter_spans from core.config import PipelineConfig -from core.utils import get_data_file_path from core.utils.data_loader import DataLoader @@ -98,10 +97,8 @@ class NERDataBuilder: return docs def build(self) -> int: - input_filepath = get_data_file_path( - self.config.data.output_files["engineered"], self.config - ) - df = self.data_loader.load_csv_complete(input_filepath) + filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"]) + df = self.data_loader.load_csv_complete(filepath) df = df[["name", "ner_tagged", "ner_entities"]] # Filter early @@ -139,8 +136,8 @@ class NERDataBuilder: doc_bin = DocBin(docs=docs) # Save - json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"] - spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"] + json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"]) + spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"]) with open(json_path, "w", encoding="utf-8") as f: json.dump(training_data, f, ensure_ascii=False, separators=(",", ":")) diff --git a/processing/ner/ner_engineering.py b/processing/ner/ner_engineering.py index 398ab23..713654b 100644 --- a/processing/ner/ner_engineering.py +++ b/processing/ner/ner_engineering.py @@ -7,7 +7,6 @@ import pandas as pd from tqdm import tqdm from core.config import PipelineConfig -from core.utils import get_data_file_path from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader from processing.ner.formats.connectors_format import ConnectorFormatter from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter @@ -55,7 +54,7 @@ class NEREngineering: def load_data(self) -> pd.DataFrame: """Load and filter NER-tagged data from CSV file""" - filepath = get_data_file_path(self.config.data.output_files["featured"], self.config) + filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"]) df = self.data_loader.load_csv_complete(filepath) # Filter only NER-tagged rows @@ -66,10 +65,8 @@ class NEREngineering: def compute(self) -> None: logging.info("Applying feature engineering transformations...") - input_filepath = get_data_file_path(self.config.data.output_files["featured"], self.config) - output_filepath = get_data_file_path( - self.config.data.output_files["engineered"], self.config - ) + input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"]) + output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"]) df = self.data_loader.load_csv_complete(input_filepath) ner_df = df[df["ner_tagged"] == 1].copy() diff --git a/processing/steps/feature_extraction_step.py b/processing/steps/feature_extraction_step.py index dc2dfff..b64ede6 100644 --- a/processing/steps/feature_extraction_step.py +++ b/processing/steps/feature_extraction_step.py @@ -127,7 +127,7 @@ class FeatureExtractionStep(PipelineStep): def _process_simple_names(self, df: pd.DataFrame) -> None: """Process 3-word names efficiently with vectorized operations""" - mask = df["words"] == 3 + mask = pd.Series(df["words"] == 3) if not mask.any(): return diff --git a/research/experiment/experiment_runner.py b/research/experiment/experiment_runner.py index d997443..7cc3201 100644 --- a/research/experiment/experiment_runner.py +++ b/research/experiment/experiment_runner.py @@ -10,7 +10,6 @@ from sklearn.metrics import confusion_matrix from sklearn.model_selection import train_test_split from core.config import PipelineConfig -from core.utils import get_data_file_path from core.utils.data_loader import DataLoader from research.base_model import BaseModel from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics @@ -36,8 +35,8 @@ class ExperimentRunner: self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING) # Load data - data_path = get_data_file_path(self.config.data.output_files["featured"], self.config) - df = self.data_loader.load_csv_complete(data_path) + filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"]) + df = self.data_loader.load_csv_complete(filepath) # Apply data filters if specified df = self._apply_data_filters(df, experiment_config) diff --git a/research/model_trainer.py b/research/model_trainer.py index b27f702..5d908ee 100644 --- a/research/model_trainer.py +++ b/research/model_trainer.py @@ -6,7 +6,6 @@ from typing import List, Dict, Any import pandas as pd from core.config import get_config -from core.utils import get_data_file_path from core.utils.data_loader import DataLoader from research.experiment import FeatureType, ExperimentConfig from research.experiment.experiment_runner import ExperimentRunner @@ -145,7 +144,7 @@ class ModelTrainer: try: # Load data for learning curve generation - data_path = get_data_file_path(self.config.data.output_files["featured"], self.config) + data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"]) if data_path.exists(): df = self.data_loader.load_csv_complete(data_path) diff --git a/research/neural_network_model.py b/research/neural_network_model.py index 6cbe13b..927995f 100644 --- a/research/neural_network_model.py +++ b/research/neural_network_model.py @@ -72,7 +72,6 @@ class NeuralNetworkModel(BaseModel): ) # Store training history - self.training_history = { "accuracy": history.history["accuracy"], "loss": history.history["loss"], diff --git a/train.py b/train.py index a7a9184..290e87a 100755 --- a/train.py +++ b/train.py @@ -112,5 +112,4 @@ def main(): if __name__ == "__main__": - exit_code = main() - sys.exit(exit_code) + sys.exit(main())