From 47e52d130cd88abc33dff61b21c34569f72d4dae Mon Sep 17 00:00:00 2001 From: bernard-ng Date: Tue, 12 Aug 2025 23:17:18 +0200 Subject: [PATCH] hotfixes --- config/pipeline.development.yaml | 6 +++--- processing/annotate.py | 0 processing/ner/formats/connectors_format.py | 1 + processing/prepare.py | 0 4 files changed, 4 insertions(+), 3 deletions(-) create mode 100644 processing/annotate.py create mode 100644 processing/prepare.py diff --git a/config/pipeline.development.yaml b/config/pipeline.development.yaml index d437458..40c43af 100644 --- a/config/pipeline.development.yaml +++ b/config/pipeline.development.yaml @@ -3,10 +3,10 @@ debug: true # Processing settings processing: - batch_size: 10_000 - max_workers: 8 + batch_size: 100_000 + max_workers: 1 checkpoint_interval: 10 - use_multiprocessing: true + use_multiprocessing: false # Pipeline stages stages: diff --git a/processing/annotate.py b/processing/annotate.py new file mode 100644 index 0000000..e69de29 diff --git a/processing/ner/formats/connectors_format.py b/processing/ner/formats/connectors_format.py index b52cc49..9996378 100644 --- a/processing/ner/formats/connectors_format.py +++ b/processing/ner/formats/connectors_format.py @@ -12,6 +12,7 @@ class ConnectorFormatter(BaseNameFormatter): surname = row['probable_surname'] if pd.notna(row['probable_surname']) else '' connector = random.choice(self.connectors) + # Connect native parts with a random connector if len(native_parts) > 1: connected_native = f" {connector} ".join(native_parts) full_name = f"{connected_native} {surname}".strip() diff --git a/processing/prepare.py b/processing/prepare.py new file mode 100644 index 0000000..e69de29