hotfixes

2025-08-15 08:08:11 +02:00
parent 9601c5e44d
commit 7b652d6999
17 changed files with 28 additions and 60 deletions
@@ -9,7 +9,6 @@ from spacy.tokens import DocBin
 from spacy.util import filter_spans

 from core.config import PipelineConfig
-from core.utils import get_data_file_path
 from core.utils.data_loader import DataLoader


@@ -98,10 +97,8 @@ class NERDataBuilder:
        return docs

    def build(self) -> int:
-        input_filepath = get_data_file_path(
-            self.config.data.output_files["engineered"], self.config
-        )
-        df = self.data_loader.load_csv_complete(input_filepath)
+        filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
+        df = self.data_loader.load_csv_complete(filepath)
        df = df[["name", "ner_tagged", "ner_entities"]]

        # Filter early
@@ -139,8 +136,8 @@ class NERDataBuilder:
        doc_bin = DocBin(docs=docs)

        # Save
-        json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
-        spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]
+        json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
+        spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
@@ -7,7 +7,6 @@ import pandas as pd
 from tqdm import tqdm

 from core.config import PipelineConfig
-from core.utils import get_data_file_path
 from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
 from processing.ner.formats.connectors_format import ConnectorFormatter
 from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
@@ -55,7 +54,7 @@ class NEREngineering:
    def load_data(self) -> pd.DataFrame:
        """Load and filter NER-tagged data from CSV file"""

-        filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
+        filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
        df = self.data_loader.load_csv_complete(filepath)

        # Filter only NER-tagged rows
@@ -66,10 +65,8 @@ class NEREngineering:

    def compute(self) -> None:
        logging.info("Applying feature engineering transformations...")
-        input_filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
-        output_filepath = get_data_file_path(
-            self.config.data.output_files["engineered"], self.config
-        )
+        input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
+        output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])

        df = self.data_loader.load_csv_complete(input_filepath)
        ner_df = df[df["ner_tagged"] == 1].copy()
@@ -127,7 +127,7 @@ class FeatureExtractionStep(PipelineStep):

    def _process_simple_names(self, df: pd.DataFrame) -> None:
        """Process 3-word names efficiently with vectorized operations"""
-        mask = df["words"] == 3
+        mask = pd.Series(df["words"] == 3)

        if not mask.any():
            return