hotfixes
This commit is contained in:
@@ -9,7 +9,6 @@ from spacy.tokens import DocBin
|
||||
from spacy.util import filter_spans
|
||||
|
||||
from core.config import PipelineConfig
|
||||
from core.utils import get_data_file_path
|
||||
from core.utils.data_loader import DataLoader
|
||||
|
||||
|
||||
@@ -98,10 +97,8 @@ class NERDataBuilder:
|
||||
return docs
|
||||
|
||||
def build(self) -> int:
|
||||
input_filepath = get_data_file_path(
|
||||
self.config.data.output_files["engineered"], self.config
|
||||
)
|
||||
df = self.data_loader.load_csv_complete(input_filepath)
|
||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
|
||||
df = self.data_loader.load_csv_complete(filepath)
|
||||
df = df[["name", "ner_tagged", "ner_entities"]]
|
||||
|
||||
# Filter early
|
||||
@@ -139,8 +136,8 @@ class NERDataBuilder:
|
||||
doc_bin = DocBin(docs=docs)
|
||||
|
||||
# Save
|
||||
json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
|
||||
spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]
|
||||
json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
|
||||
spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
|
||||
|
||||
with open(json_path, "w", encoding="utf-8") as f:
|
||||
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
|
||||
|
||||
@@ -7,7 +7,6 @@ import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
from core.config import PipelineConfig
|
||||
from core.utils import get_data_file_path
|
||||
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
|
||||
from processing.ner.formats.connectors_format import ConnectorFormatter
|
||||
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
|
||||
@@ -55,7 +54,7 @@ class NEREngineering:
|
||||
def load_data(self) -> pd.DataFrame:
|
||||
"""Load and filter NER-tagged data from CSV file"""
|
||||
|
||||
filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
|
||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
||||
df = self.data_loader.load_csv_complete(filepath)
|
||||
|
||||
# Filter only NER-tagged rows
|
||||
@@ -66,10 +65,8 @@ class NEREngineering:
|
||||
|
||||
def compute(self) -> None:
|
||||
logging.info("Applying feature engineering transformations...")
|
||||
input_filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
|
||||
output_filepath = get_data_file_path(
|
||||
self.config.data.output_files["engineered"], self.config
|
||||
)
|
||||
input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
|
||||
output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
|
||||
|
||||
df = self.data_loader.load_csv_complete(input_filepath)
|
||||
ner_df = df[df["ner_tagged"] == 1].copy()
|
||||
|
||||
Reference in New Issue
Block a user