diff --git a/config/pipeline.development.yaml b/config/pipeline.development.yaml index d437458..40c43af 100644 --- a/config/pipeline.development.yaml +++ b/config/pipeline.development.yaml @@ -3,10 +3,10 @@ debug: true # Processing settings processing: - batch_size: 10_000 - max_workers: 8 + batch_size: 100_000 + max_workers: 1 checkpoint_interval: 10 - use_multiprocessing: true + use_multiprocessing: false # Pipeline stages stages: diff --git a/processing/annotate.py b/processing/annotate.py new file mode 100644 index 0000000..e69de29 diff --git a/processing/ner/formats/connectors_format.py b/processing/ner/formats/connectors_format.py index b52cc49..9996378 100644 --- a/processing/ner/formats/connectors_format.py +++ b/processing/ner/formats/connectors_format.py @@ -12,6 +12,7 @@ class ConnectorFormatter(BaseNameFormatter): surname = row['probable_surname'] if pd.notna(row['probable_surname']) else '' connector = random.choice(self.connectors) + # Connect native parts with a random connector if len(native_parts) > 1: connected_native = f" {connector} ".join(native_parts) full_name = f"{connected_native} {surname}".strip() diff --git a/processing/prepare.py b/processing/prepare.py new file mode 100644 index 0000000..e69de29