# drc-ners-nlp/src/ners/processing/ner/formats/__init__.py

from abc import ABC, abstractmethod
from typing import List, Tuple, Dict

import pandas as pd

from ners.processing.steps.feature_extraction_step import NameCategory


class BaseNameFormatter(ABC):
    """
    Base class for name formatting transformations.

    Holds the NER tagging and attribute computation logic shared by concrete
    formatters. Subclasses must implement ``transform`` and
    ``transformation_type``.
    """

    def __init__(
        self, connectors: List[str] = None, additional_surnames: List[str] = None
    ):
        """
        Configure the word lists used when tagging.

        Args:
            connectors: Substrings that mark a word as NATIVE when they occur
                anywhere inside it. The built-in list is used when this is
                ``None`` or empty.
            additional_surnames: Words tagged SURNAME on exact match. The
                built-in list is used when this is ``None`` or empty.
        """
        self.connectors = connectors or ["wa", "ya", "ka", "ba"]
        self.additional_surnames = additional_surnames or [
            "jean",
            "paul",
            "marie",
            "joseph",
            "pierre",
            "claude",
            "andre",
            "michel",
            "robert",
        ]

    @classmethod
    def parse_native_components(cls, native_str: str) -> List[str]:
        """
        Split a native name string into its whitespace-separated components.

        Returns an empty list for missing values (``None``/NaN/``pd.NA``) and
        for empty strings.
        """
        if pd.isna(native_str) or not native_str:
            return []
        # str.split() with no separator already ignores leading/trailing
        # whitespace, so no explicit strip() is needed.
        return native_str.split()

    def create_ner_tags(
        self, text: str, native_parts: List[str], surname: str
    ) -> List[Tuple[int, int, str]]:
        """
        Create NER entity tags for the transformed text.

        Each whitespace-separated word of ``text`` yields one
        ``(start, end, tag)`` tuple, where ``text[start:end]`` is the word and
        ``tag`` is ``"NATIVE"`` or ``"SURNAME"``.

        Tagging rules, applied in order:
            1. NATIVE if the word is one of ``native_parts`` or contains any
               configured connector as a substring.
            2. SURNAME if the word equals ``surname`` or is listed in
               ``additional_surnames``.
            3. NATIVE if any native part occurs inside the word.
            4. SURNAME otherwise.

        Args:
            text: The transformed name.
            native_parts: Components of the native name.
            surname: The surname associated with the name.

        Returns:
            One entity tuple per word, in order of appearance.
        """
        entities = []
        cursor = 0

        for word in text.split():
            # Look the word up in the original text rather than assuming the
            # text starts at column 0 with single-space separators: split()
            # collapses leading/repeated whitespace, which would otherwise
            # shift every following span away from its word.
            start_pos = text.index(word, cursor)
            end_pos = start_pos + len(word)
            cursor = end_pos

            # NOTE(review): connector matching is substring-based, so any word
            # that merely contains a connector (e.g. "ka") is tagged NATIVE.
            # Confirm this is intended.
            if word in native_parts or any(
                connector in word for connector in self.connectors
            ):
                tag = "NATIVE"
            elif word == surname or word in self.additional_surnames:
                tag = "SURNAME"
            elif any(part in word for part in native_parts):
                # Compound word that embeds a native component.
                tag = "NATIVE"
            else:
                tag = "SURNAME"

            entities.append((start_pos, end_pos, tag))

        return entities

    @classmethod
    def compute_numeric_features(cls, name: str) -> Dict:
        """
        Compute the derived attributes for the transformed name.

        Returns:
            A dict with:
                ``words``: number of whitespace-separated words (0 if empty),
                ``length``: number of characters (0 if empty),
                ``identified_category``: ``NameCategory.SIMPLE`` value when the
                name has exactly three words, ``NameCategory.COMPOSE`` value
                otherwise.
        """
        words_count = len(name.split()) if name else 0
        length = len(name) if name else 0

        return {
            "words": words_count,
            "length": length,
            "identified_category": (
                NameCategory.SIMPLE.value
                if words_count == 3
                else NameCategory.COMPOSE.value
            ),
        }

    @abstractmethod
    def transform(self, row: pd.Series) -> Dict:
        """Transform a row according to the specific format rules"""
        pass

    @property
    @abstractmethod
    def transformation_type(self) -> str:
        """Return the transformation type identifier"""
        pass