hotfixes

2025-08-15 08:08:11 +02:00
parent 9601c5e44d
commit 7b652d6999
17 changed files with 28 additions and 60 deletions
@@ -3,7 +3,7 @@ import argparse
 import streamlit as st
-from core.config import get_config, setup_config, PipelineConfig
+from core.config import setup_config, PipelineConfig
 from core.utils.data_loader import DataLoader
 from interface.configuration import Configuration
 from interface.dashboard import Dashboard
@@ -21,3 +21,6 @@ class ProjectPaths(BaseModel):
    @field_validator("*", mode="before")
    def convert_to_path(cls, v):
        """Coerce a raw field value to ``Path``.

        Registered for every field (``"*"``) in ``"before"`` mode. A value
        that is already a ``Path`` is handed back untouched; anything else
        is passed to the ``Path`` constructor.
        """
        if isinstance(v, Path):
            return v
        return Path(v)
    def get_data_path(self, filename: str) -> Path:
        """Return the path of ``filename`` joined onto ``self.data_dir``."""
        data_root = self.data_dir
        return data_root / filename
@@ -44,18 +44,3 @@ def ensure_directories(config: "PipelineConfig") -> None:
        Path(directory).mkdir(parents=True, exist_ok=True)
    logging.info("Ensured all required directories exist")
-def get_data_file_path(filename: str, config: "PipelineConfig") -> Path:
-    """Get full path for a data file"""
-    return config.paths.data_dir / filename
-def get_model_file_path(filename: str, config: "PipelineConfig") -> Path:
-    """Get full path for a model file"""
-    return config.paths.models_dir / filename
-def get_output_file_path(filename: str, config: "PipelineConfig") -> Path:
-    """Get full path for an output file"""
-    return config.paths.outputs_dir / filename
@@ -1,10 +1,10 @@
 import pandas as pd
 import streamlit as st
 from core.utils import get_data_file_path
 from core.utils.data_loader import OPTIMIZED_DTYPES
@st.cache_data
 def load_dataset(file_path: str) -> pd.DataFrame:
    try:
        return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
@@ -25,7 +25,7 @@ class Dashboard:
        # Load basic statistics
        try:
-            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
+            data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
            if data_path.exists():
                df = load_dataset(str(data_path))
@@ -4,10 +4,10 @@ import pandas as pd
 import plotly.express as px
 import streamlit as st
 from core.utils import get_data_file_path
 from core.utils.data_loader import OPTIMIZED_DTYPES
@st.cache_data
 def load_dataset(file_path: str) -> pd.DataFrame:
    try:
        return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
@@ -31,7 +31,7 @@ class DataOverview:
        }
        selected_file = st.selectbox("Select Dataset", list(data_files.keys()))
-        file_path = get_data_file_path(data_files[selected_file], self.config)
+        file_path = self.config.paths.get_data_path(data_files[selected_file])
        if not file_path.exists():
            st.warning(f"Dataset not found: {file_path}")
@@ -6,6 +6,7 @@ from core.utils.data_loader import OPTIMIZED_DTYPES
 from interface.log_reader import LogReader
@st.cache_data
 def load_dataset(file_path: str) -> pd.DataFrame:
    try:
        return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
@@ -1,5 +1,3 @@
 """Predictions interface for the Streamlit app"""
 from datetime import datetime
 from typing import Optional
@@ -8,7 +6,6 @@ import pandas as pd
 import plotly.express as px
 import streamlit as st
-from core.utils import get_data_file_path
 from core.utils.data_loader import OPTIMIZED_DTYPES
 from research.experiment.experiment_runner import ExperimentRunner
 from research.experiment.experiment_tracker import ExperimentTracker
@@ -268,7 +265,7 @@ class Predictions:
        }
        selected_dataset = st.selectbox("Select Dataset", list(dataset_options.keys()))
-        file_path = get_data_file_path(dataset_options[selected_dataset], self.config)
+        file_path = self.config.paths.get_data_path(dataset_options[selected_dataset])
        if not file_path.exists():
            st.warning(f"Dataset not found: {file_path}")
@@ -5,7 +5,6 @@ import sys
 import traceback
 from core.config import setup_config
-from core.utils import get_data_file_path
 from core.utils.data_loader import DataLoader
 from processing.batch.batch_config import BatchConfig
 from processing.pipeline import Pipeline
@@ -47,7 +46,7 @@ def run_pipeline(config) -> int:
        logging.info(f"Starting pipeline: {config.name} v{config.version}")
        # Load input data
-        input_file_path = get_data_file_path(config.data.input_file, config)
+        input_file_path = config.paths.get_data_path(config.data.input_file)
        if not input_file_path.exists():
            logging.error(f"Input file not found: {input_file_path}")
            return 1
@@ -60,8 +59,6 @@ def run_pipeline(config) -> int:
        # Create and run pipeline
        pipeline = create_pipeline(config)
        logging.info("Starting pipeline execution")
        data_splitter.split(pipeline.run(df))
        # Show completion statistics
@@ -104,5 +101,4 @@ def main():
 if __name__ == "__main__":
-    exit_code = main()
+    sys.exit(main())
-    sys.exit(exit_code)
@@ -86,5 +86,4 @@ def main():
 if __name__ == "__main__":
-    exit_code = main()
+    sys.exit(main())
-    sys.exit(exit_code)
@@ -1,8 +1,8 @@
 #!/usr/bin/env python3
 import argparse
 import logging
 import sys
 import os
 import sys
 import traceback
 from pathlib import Path
@@ -48,21 +48,18 @@ def train(config: PipelineConfig):
 def run_pipeline(config: PipelineConfig, reset: bool = False):
-    # Step 1: Feature engineering
+    if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["engineered"])):
-    if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["engineered"]):
        logging.info("Step 1: Feature engineering already done.")
    else:
        logging.info("Step 1: Running feature engineering")
        feature(config)
-    # Step 2: Build dataset
+    if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["ner_data"])):
-    if not reset and os.path.exists(config.paths.data_dir / config.data.output_files["ner_data"]):
        logging.info("Step 2: NER dataset already built.")
    else:
        logging.info("Step 2: Building NER dataset")
        build(config)
    # Step 3: Train model
    logging.info("Step 3: Training NER Model")
    train(config)
@@ -9,7 +9,6 @@ from spacy.tokens import DocBin
 from spacy.util import filter_spans
 from core.config import PipelineConfig
-from core.utils import get_data_file_path
 from core.utils.data_loader import DataLoader
@@ -98,10 +97,8 @@ class NERDataBuilder:
        return docs
    def build(self) -> int:
-        input_filepath = get_data_file_path(
+        filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
-            self.config.data.output_files["engineered"], self.config
+        df = self.data_loader.load_csv_complete(filepath)
-        )
-        df = self.data_loader.load_csv_complete(input_filepath)
        df = df[["name", "ner_tagged", "ner_entities"]]
        # Filter early
@@ -139,8 +136,8 @@ class NERDataBuilder:
        doc_bin = DocBin(docs=docs)
        # Save
-        json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
+        json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
-        spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]
+        spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
@@ -7,7 +7,6 @@ import pandas as pd
 from tqdm import tqdm
 from core.config import PipelineConfig
-from core.utils import get_data_file_path
 from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
 from processing.ner.formats.connectors_format import ConnectorFormatter
 from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
@@ -55,7 +54,7 @@ class NEREngineering:
    def load_data(self) -> pd.DataFrame:
        """Load and filter NER-tagged data from CSV file"""
-        filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
+        filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
        df = self.data_loader.load_csv_complete(filepath)
        # Filter only NER-tagged rows
@@ -66,10 +65,8 @@ class NEREngineering:
    def compute(self) -> None:
        logging.info("Applying feature engineering transformations...")
-        input_filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
+        input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
-        output_filepath = get_data_file_path(
+        output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
-            self.config.data.output_files["engineered"], self.config
-        )
        df = self.data_loader.load_csv_complete(input_filepath)
        ner_df = df[df["ner_tagged"] == 1].copy()
@@ -127,7 +127,7 @@ class FeatureExtractionStep(PipelineStep):
    def _process_simple_names(self, df: pd.DataFrame) -> None:
        """Process 3-word names efficiently with vectorized operations"""
-        mask = df["words"] == 3
+        mask = pd.Series(df["words"] == 3)
        if not mask.any():
            return
@@ -10,7 +10,6 @@ from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import train_test_split
 from core.config import PipelineConfig
-from core.utils import get_data_file_path
 from core.utils.data_loader import DataLoader
 from research.base_model import BaseModel
 from research.experiment import ExperimentConfig, ExperimentStatus, calculate_metrics
@@ -36,8 +35,8 @@ class ExperimentRunner:
            self.tracker.update_experiment(experiment_id, status=ExperimentStatus.RUNNING)
            # Load data
-            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
+            filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
-            df = self.data_loader.load_csv_complete(data_path)
+            df = self.data_loader.load_csv_complete(filepath)
            # Apply data filters if specified
            df = self._apply_data_filters(df, experiment_config)
@@ -6,7 +6,6 @@ from typing import List, Dict, Any
 import pandas as pd
 from core.config import get_config
-from core.utils import get_data_file_path
 from core.utils.data_loader import DataLoader
 from research.experiment import FeatureType, ExperimentConfig
 from research.experiment.experiment_runner import ExperimentRunner
@@ -145,7 +144,7 @@ class ModelTrainer:
        try:
            # Load data for learning curve generation
-            data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
+            data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"])
            if data_path.exists():
                df = self.data_loader.load_csv_complete(data_path)
@@ -72,7 +72,6 @@ class NeuralNetworkModel(BaseModel):
        )
        # Store training history
        self.training_history = {
            "accuracy": history.history["accuracy"],
            "loss": history.history["loss"],
@@ -112,5 +112,4 @@ def main():
 if __name__ == "__main__":
-    exit_code = main()
+    sys.exit(main())
-    sys.exit(exit_code)