feat: enhance logging and memory management across modules

2025-08-13 23:09:05 +02:00
parent 47e52d130c
commit 9601c5e44d
48 changed files with 1004 additions and 773 deletions
@@ -6,8 +6,8 @@ from typing import Dict
 import pandas as pd

 from core.config.pipeline_config import PipelineConfig
-from processing.steps import PipelineStep, NameAnnotation
 from processing.ner.ner_name_model import NERNameModel
+from processing.steps import PipelineStep, NameAnnotation


 class NERAnnotationStep(PipelineStep):
@@ -63,7 +63,7 @@ class NERAnnotationStep(PipelineStep):

                # Get NER predictions
                prediction = self.ner_trainer.predict(name.lower())
-                entities = prediction.get('entities', [])
+                entities = prediction.get("entities", [])

                elapsed_time = time.time() - start_time

@@ -72,15 +72,15 @@ class NERAnnotationStep(PipelineStep):
                surname_parts = []

                for entity in entities:
-                    if entity['label'] == 'NATIVE':
-                        native_parts.append(entity['text'])
-                    elif entity['label'] == 'SURNAME':
-                        surname_parts.append(entity['text'])
+                    if entity["label"] == "NATIVE":
+                        native_parts.append(entity["text"])
+                    elif entity["label"] == "SURNAME":
+                        surname_parts.append(entity["text"])

                # Create annotation result in same format as LLM step
                annotation = NameAnnotation(
                    identified_name=" ".join(native_parts) if native_parts else None,
-                    identified_surname=" ".join(surname_parts) if surname_parts else None
+                    identified_surname=" ".join(surname_parts) if surname_parts else None,
                )

                result = {
@@ -159,6 +159,8 @@ class NERAnnotationStep(PipelineStep):
                        batch.loc[idx, "annotated"] = 0

        # Ensure proper data types
-        batch["annotated"] = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
+        batch["annotated"] = (
+            pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
+        )

        return batch