This commit is contained in:
2025-08-15 08:08:11 +02:00
parent 9601c5e44d
commit 7b652d6999
17 changed files with 28 additions and 60 deletions
+4 -7
View File
@@ -9,7 +9,6 @@ from spacy.tokens import DocBin
from spacy.util import filter_spans
from core.config import PipelineConfig
from core.utils import get_data_file_path
from core.utils.data_loader import DataLoader
@@ -98,10 +97,8 @@ class NERDataBuilder:
return docs
def build(self) -> int:
input_filepath = get_data_file_path(
self.config.data.output_files["engineered"], self.config
)
df = self.data_loader.load_csv_complete(input_filepath)
filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
df = self.data_loader.load_csv_complete(filepath)
df = df[["name", "ner_tagged", "ner_entities"]]
# Filter early
@@ -139,8 +136,8 @@ class NERDataBuilder:
doc_bin = DocBin(docs=docs)
# Save
json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]
json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
with open(json_path, "w", encoding="utf-8") as f:
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
+3 -6
View File
@@ -7,7 +7,6 @@ import pandas as pd
from tqdm import tqdm
from core.config import PipelineConfig
from core.utils import get_data_file_path
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
from processing.ner.formats.connectors_format import ConnectorFormatter
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
@@ -55,7 +54,7 @@ class NEREngineering:
def load_data(self) -> pd.DataFrame:
"""Load and filter NER-tagged data from CSV file"""
filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
df = self.data_loader.load_csv_complete(filepath)
# Filter only NER-tagged rows
@@ -66,10 +65,8 @@ class NEREngineering:
def compute(self) -> None:
logging.info("Applying feature engineering transformations...")
input_filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
output_filepath = get_data_file_path(
self.config.data.output_files["engineered"], self.config
)
input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
df = self.data_loader.load_csv_complete(input_filepath)
ner_df = df[df["ner_tagged"] == 1].copy()
+1 -1
View File
@@ -127,7 +127,7 @@ class FeatureExtractionStep(PipelineStep):
def _process_simple_names(self, df: pd.DataFrame) -> None:
"""Process 3-word names efficiently with vectorized operations"""
mask = df["words"] == 3
mask = pd.Series(df["words"] == 3)
if not mask.any():
return