feat: enhance logging and memory management across modules

2025-08-13 23:09:05 +02:00
parent 47e52d130c
commit 9601c5e44d
48 changed files with 1004 additions and 773 deletions
@@ -13,10 +13,17 @@ class BaseNameFormatter(ABC):
    """

    def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None):
-        self.connectors = connectors or ['wa', 'ya', 'ka', 'ba']
+        self.connectors = connectors or ["wa", "ya", "ka", "ba"]
        self.additional_surnames = additional_surnames or [
-            'jean', 'paul', 'marie', 'joseph', 'pierre', 'claude',
-            'andre', 'michel', 'robert'
+            "jean",
+            "paul",
+            "marie",
+            "joseph",
+            "pierre",
+            "claude",
+            "andre",
+            "michel",
+            "robert",
        ]

    @classmethod
@@ -26,7 +33,9 @@ class BaseNameFormatter(ABC):
            return []
        return native_str.strip().split()

-    def create_ner_tags(self, text: str, native_parts: List[str], surname: str) -> List[Tuple[int, int, str]]:
+    def create_ner_tags(
+        self, text: str, native_parts: List[str], surname: str
+    ) -> List[Tuple[int, int, str]]:
        """Create NER entity tags for transformed text"""
        entities = []
        current_pos = 0
@@ -38,15 +47,15 @@ class BaseNameFormatter(ABC):

            # Determine tag based on word content
            if word in native_parts or any(connector in word for connector in self.connectors):
-                tag = 'NATIVE'
+                tag = "NATIVE"
            elif word == surname or word in self.additional_surnames:
-                tag = 'SURNAME'
+                tag = "SURNAME"
            else:
                # Check if it's a compound native word or new surname
                if any(part in word for part in native_parts):
-                    tag = 'NATIVE'
+                    tag = "NATIVE"
                else:
-                    tag = 'SURNAME'
+                    tag = "SURNAME"

            entities.append((start_pos, end_pos, tag))
            current_pos = end_pos + 1  # +1 for space
@@ -54,15 +63,17 @@ class BaseNameFormatter(ABC):
        return entities

    @classmethod
-    def compute_derived_attributes(cls, name: str) -> Dict:
+    def compute_numeric_features(cls, name: str) -> Dict:
        """Compute all derived attributes for the transformed name"""
        words_count = len(name.split()) if name else 0
        length = len(name) if name else 0

        return {
-            'words': words_count,
-            'length': length,
-            'identified_category': NameCategory.SIMPLE if words_count == 3 else NameCategory.COMPOSE,
+            "words": words_count,
+            "length": length,
+            "identified_category": (
+                NameCategory.SIMPLE.value if words_count == 3 else NameCategory.COMPOSE.value
+            ),
        }

    @abstractmethod
@@ -8,8 +8,8 @@ from processing.ner.formats import BaseNameFormatter

 class ConnectorFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
-        native_parts = self.parse_native_components(row['probable_native'])
-        surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
+        native_parts = self.parse_native_components(row["probable_native"])
+        surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
        connector = random.choice(self.connectors)

        # Connect native parts with a random connector
@@ -17,20 +17,22 @@ class ConnectorFormatter(BaseNameFormatter):
            connected_native = f" {connector} ".join(native_parts)
            full_name = f"{connected_native} {surname}".strip()
        else:
-            connected_native = f"{row['probable_native']} {connector} {row['probable_native']}".strip()
+            connected_native = (
+                f"{row['probable_native']} {connector} {row['probable_native']}".strip()
+            )
            full_name = f"{connected_native} {surname}".strip()

        return {
-            'name': full_name,
-            'probable_native': connected_native,
-            'identify_name': connected_native,
-            'probable_surname': surname,
-            'identify_surname': surname,
-            'ner_entities': str(self.create_ner_tags(full_name, native_parts, surname)),
-            'transformation_type': self.transformation_type,
-            **self.compute_derived_attributes(full_name)
+            "name": full_name,
+            "probable_native": connected_native,
+            "identified_name": connected_native,
+            "probable_surname": surname,
+            "identified_surname": surname,
+            "ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
+            "transformation_type": self.transformation_type,
+            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
-        return 'connector_added'
+        return "connector_added"
@@ -8,8 +8,8 @@ from processing.ner.formats import BaseNameFormatter

 class ExtendedSurnameFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
-        native_parts = self.parse_native_components(row['probable_native'])
-        original_surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
+        native_parts = self.parse_native_components(row["probable_native"])
+        original_surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""

        # Add random additional surname
        additional_surname = random.choice(self.additional_surnames)
@@ -17,16 +17,16 @@ class ExtendedSurnameFormatter(BaseNameFormatter):
        full_name = f"{row['probable_native']} {combined_surname}".strip()

        return {
-            'name': full_name,
-            'probable_native': row['probable_native'],
-            'identify_name': row['probable_native'],
-            'probable_surname': combined_surname,
-            'identity_surname': combined_surname,
-            'ner_entities': str(self.create_ner_tags(full_name, native_parts, combined_surname)),
-            'transformation_type': self.transformation_type,
-            **self.compute_derived_attributes(full_name)
+            "name": full_name,
+            "probable_native": row["probable_native"],
+            "identified_name": row["probable_native"],
+            "probable_surname": combined_surname,
+            "identified_surname": combined_surname,
+            "ner_entities": str(self.create_ner_tags(full_name, native_parts, combined_surname)),
+            "transformation_type": self.transformation_type,
+            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
-        return 'extended_surname'
+        return "extended_surname"
@@ -7,22 +7,22 @@ from processing.ner.formats import BaseNameFormatter

 class NativeOnlyFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
-        native_parts = self.parse_native_components(row['probable_native'])
+        native_parts = self.parse_native_components(row["probable_native"])

        # Only native components
-        full_name = row['probable_native']
+        full_name = row["probable_native"]

        return {
-            'name': full_name,
-            'probable_native': row['probable_native'],
-            'identify_name': row['probable_native'],
-            'probable_surname': '',
-            'identify_surname': '',
-            'ner_entities': str(self.create_ner_tags(full_name, native_parts, '')),
-            'transformation_type': self.transformation_type,
-            **self.compute_derived_attributes(full_name)
+            "name": full_name,
+            "probable_native": row["probable_native"],
+            "identified_name": row["probable_native"],
+            "probable_surname": "",
+            "identified_surname": "",
+            "ner_entities": str(self.create_ner_tags(full_name, native_parts, "")),
+            "transformation_type": self.transformation_type,
+            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
-        return 'native_only'
+        return "native_only"
@@ -7,23 +7,23 @@ from processing.ner.formats import BaseNameFormatter

 class OriginalFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
-        native_parts = self.parse_native_components(row['probable_native'])
-        surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
+        native_parts = self.parse_native_components(row["probable_native"])
+        surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""

        # Keep original order: native components + surname
        full_name = f"{row['probable_native']} {surname}".strip()

        return {
-            'name': full_name,
-            'probable_native': row['probable_native'],
-            'identify_name': row['probable_native'],
-            'probable_surname': surname,
-            'identify_surname': surname,
-            'ner_entities': str(self.create_ner_tags(full_name, native_parts, surname)),
-            'transformation_type': self.transformation_type,
-            **self.compute_derived_attributes(full_name)
+            "name": full_name,
+            "probable_native": row["probable_native"],
+            "identified_name": row["probable_native"],
+            "probable_surname": surname,
+            "identified_surname": surname,
+            "ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
+            "transformation_type": self.transformation_type,
+            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
-        return 'original'
+        return "original"
@@ -7,23 +7,23 @@ from processing.ner.formats import BaseNameFormatter

 class PositionFlippedFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
-        native_parts = self.parse_native_components(row['probable_native'])
-        surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
+        native_parts = self.parse_native_components(row["probable_native"])
+        surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""

        # Flip order: surname + native components
        full_name = f"{surname} {row['probable_native']}".strip()

        return {
-            'name': full_name,
-            'probable_native': row['probable_native'],
-            'identify_name': row['probable_native'],
-            'probable_surname': surname,
-            'identify_surname': surname,
-            'ner_entities': str(self.create_ner_tags(full_name, native_parts, surname)),
-            'transformation_type': self.transformation_type,
-            **self.compute_derived_attributes(full_name)
+            "name": full_name,
+            "probable_native": row["probable_native"],
+            "identified_name": row["probable_native"],
+            "probable_surname": surname,
+            "identified_surname": surname,
+            "ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
+            "transformation_type": self.transformation_type,
+            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
-        return 'position_flipped'
+        return "position_flipped"
@@ -7,24 +7,24 @@ from processing.ner.formats import BaseNameFormatter

 class ReducedNativeFormatter(BaseNameFormatter):
    def transform(self, row: pd.Series) -> Dict:
-        native_parts = self.parse_native_components(row['probable_native'])
-        surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
+        native_parts = self.parse_native_components(row["probable_native"])
+        surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""

        # Keep only first native component + surname
-        reduced_native = native_parts[0] if len(native_parts) > 1 else row['probable_native']
+        reduced_native = native_parts[0] if len(native_parts) > 1 else row["probable_native"]
        full_name = f"{reduced_native} {surname}".strip()

        return {
-            'name': full_name,
-            'probable_native': reduced_native,
-            'identify_name': reduced_native,
-            'probable_surname': surname,
-            'identify_surname': surname,
-            'ner_entities': str(self.create_ner_tags(full_name, [reduced_native], surname)),
-            'transformation_type': self.transformation_type,
-            **self.compute_derived_attributes(full_name)
+            "name": full_name,
+            "probable_native": reduced_native,
+            "identified_name": reduced_native,
+            "probable_surname": surname,
+            "identified_surname": surname,
+            "ner_entities": str(self.create_ner_tags(full_name, [reduced_native], surname)),
+            "transformation_type": self.transformation_type,
+            **self.compute_numeric_features(full_name),
        }

    @property
    def transformation_type(self) -> str:
-        return 'reduced_native'
+        return "reduced_native"