refactoring: uv

2025-10-05 18:14:15 +02:00
parent f3b06fbd07
commit 9dd4f759b3
120 changed files with 5525 additions and 3366 deletions
@@ -0,0 +1,94 @@
+from abc import ABC, abstractmethod
+from typing import List, Tuple, Dict
+
+import pandas as pd
+
+from ners.processing.steps.feature_extraction_step import NameCategory
+
+
+class BaseNameFormatter(ABC):
+    """
+    Base class for name formatting transformations.
+    Contains common logic for NER tagging and attribute computation.
+    """
+
+    def __init__(
+        self, connectors: List[str] = None, additional_surnames: List[str] = None
+    ):
+        self.connectors = connectors or ["wa", "ya", "ka", "ba"]
+        self.additional_surnames = additional_surnames or [
+            "jean",
+            "paul",
+            "marie",
+            "joseph",
+            "pierre",
+            "claude",
+            "andre",
+            "michel",
+            "robert",
+        ]
+
+    @classmethod
+    def parse_native_components(cls, native_str: str) -> List[str]:
+        """Parse native name string into individual components"""
+        if pd.isna(native_str) or not native_str:
+            return []
+        return native_str.strip().split()
+
+    def create_ner_tags(
+        self, text: str, native_parts: List[str], surname: str
+    ) -> List[Tuple[int, int, str]]:
+        """Create NER entity tags for transformed text"""
+        entities = []
+        current_pos = 0
+        words = text.split()
+
+        for word in words:
+            start_pos = current_pos
+            end_pos = current_pos + len(word)
+
+            # Determine tag based on word content
+            if word in native_parts or any(
+                connector in word for connector in self.connectors
+            ):
+                tag = "NATIVE"
+            elif word == surname or word in self.additional_surnames:
+                tag = "SURNAME"
+            else:
+                # Check if it's a compound native word or new surname
+                if any(part in word for part in native_parts):
+                    tag = "NATIVE"
+                else:
+                    tag = "SURNAME"
+
+            entities.append((start_pos, end_pos, tag))
+            current_pos = end_pos + 1  # +1 for space
+
+        return entities
+
+    @classmethod
+    def compute_numeric_features(cls, name: str) -> Dict:
+        """Compute all derived attributes for the transformed name"""
+        words_count = len(name.split()) if name else 0
+        length = len(name) if name else 0
+
+        return {
+            "words": words_count,
+            "length": length,
+            "identified_category": (
+                NameCategory.SIMPLE.value
+                if words_count == 3
+                else NameCategory.COMPOSE.value
+            ),
+        }
+
+    @abstractmethod
+    def transform(self, row: pd.Series) -> Dict:
+        """Transform a row according to the specific format rules"""
+        pass
+
+    @property
+    @abstractmethod
+    def transformation_type(self) -> str:
+        """Return the transformation type identifier"""
+        pass