feat: add NER annotation step and integrate into pipeline

2025-08-11 07:13:09 +02:00
parent 6d39c3afc1
commit d5a4aaaf4a
23 changed files with 1108 additions and 160 deletions
@@ -6,9 +6,10 @@ from dataclasses import dataclass
 from typing import List, Optional

 import pandas as pd
+from pydantic import BaseModel

-from processing.batch.batch_config import BatchConfig
 from core.config.pipeline_config import PipelineConfig
+from processing.batch.batch_config import BatchConfig


@dataclass
@@ -25,11 +26,18 @@ class PipelineState:
            self.failed_batches = []


+class NameAnnotation(BaseModel):
+    """Model for name annotation results"""
+
+    identified_name: Optional[str]
+    identified_surname: Optional[str]
+
+
 class PipelineStep(ABC):
    """Abstract base class for pipeline steps"""

    def __init__(
-        self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
+            self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
    ):
        self.name = name
        self.pipeline_config = pipeline_config
@@ -25,4 +25,7 @@ class DataCleaningStep(PipelineStep):
        # Apply text cleaning
        batch = self.text_cleaner.clean_dataframe_text_columns(batch)

+        # Remove duplicates
+        batch = batch.drop_duplicates(subset=self.required_columns)
+
        return batch
@@ -5,6 +5,7 @@ import pandas as pd

 from core.config.pipeline_config import PipelineConfig
 from core.utils.region_mapper import RegionMapper
+from processing.ner.ner_name_tagger import NERNameTagger
 from processing.steps import PipelineStep


@@ -24,6 +25,7 @@ class FeatureExtractionStep(PipelineStep):
    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("feature_extraction", pipeline_config)
        self.region_mapper = RegionMapper()
+        self.name_tagger = NERNameTagger()

    @classmethod
    def validate_gender(cls, gender: str) -> Gender:
@@ -52,7 +54,7 @@ class FeatureExtractionStep(PipelineStep):

        # Basic features
        batch["words"] = batch["name"].str.count(" ") + 1
-        batch["length"] = batch["name"].str.replace(" ", "", regex=False).str.len()
+        batch["length"] = batch["name"].str.len()

        # Handle year column
        if "year" in batch.columns:
@@ -63,6 +65,8 @@ class FeatureExtractionStep(PipelineStep):
        batch["probable_surname"] = None
        batch["identified_name"] = None
        batch["identified_surname"] = None
+        batch["ner_entities"] = None
+        batch["ner_tagged"] = 0
        batch["annotated"] = 0

        # Vectorized category assignment
@@ -81,14 +85,19 @@ class FeatureExtractionStep(PipelineStep):

        # Auto-assign for 3-word names
        three_word_mask = batch["words"] == 3
-        batch.loc[three_word_mask, "identified_name"] = batch.loc[
-            three_word_mask, "probable_native"
-        ]
-        batch.loc[three_word_mask, "identified_surname"] = batch.loc[
-            three_word_mask, "probable_surname"
-        ]
+        batch.loc[three_word_mask, "identified_name"] = batch.loc[three_word_mask, "probable_native"]
+        batch.loc[three_word_mask, "identified_surname"] = batch.loc[three_word_mask, "probable_surname"]
        batch.loc[three_word_mask, "annotated"] = 1

+        # Tag names with NER entities
+        three_word_rows = batch[three_word_mask]
+        for idx, row in three_word_rows.iterrows():
+            entity = self.name_tagger.tag_name(row['name'], row['identified_name'], row['identified_surname'])
+
+            if entity:
+                batch.at[idx, "ner_entities"] = entity["entities"]
+                batch.at[idx, "ner_tagged"] = 1
+
        # Map regions to provinces
        batch["province"] = self.region_mapper.map_regions_vectorized(batch["region"])

@@ -1,25 +1,18 @@
 import logging
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, Optional
+from typing import Dict

 import ollama
 import pandas as pd
-from pydantic import ValidationError, BaseModel
+from pydantic import ValidationError

 from core.config.pipeline_config import PipelineConfig
 from core.utils.prompt_manager import PromptManager
-from core.utils.rate_limiter import RateLimiter
 from core.utils.rate_limiter import RateLimitConfig
+from core.utils.rate_limiter import RateLimiter
 from processing.batch.batch_config import BatchConfig
-from processing.steps import PipelineStep
-
-
-class NameAnnotation(BaseModel):
-    """Model for name annotation results"""
-
-    identified_name: Optional[str]
-    identified_surname: Optional[str]
+from processing.steps import PipelineStep, NameAnnotation


 class LLMAnnotationStep(PipelineStep):
@@ -27,10 +20,12 @@ class LLMAnnotationStep(PipelineStep):

    def __init__(self, pipeline_config: PipelineConfig):
        # Create custom batch config for LLM processing
+        self.llm_config = pipeline_config.annotation.llm
        batch_config = BatchConfig(
            batch_size=pipeline_config.processing.batch_size,
            max_workers=min(
-                pipeline_config.llm.max_concurrent_requests, pipeline_config.processing.max_workers
+                self.llm_config.max_concurrent_requests,
+                pipeline_config.processing.max_workers
            ),
            checkpoint_interval=pipeline_config.processing.checkpoint_interval,
            use_multiprocessing=pipeline_config.processing.use_multiprocessing,
@@ -39,7 +34,7 @@ class LLMAnnotationStep(PipelineStep):

        self.prompt = PromptManager(pipeline_config).load_prompt()
        self.rate_limiter = (
-            self._create_rate_limiter() if pipeline_config.llm.enable_rate_limiting else None
+            self._create_rate_limiter() if self.llm_config.enable_rate_limiting else None
        )

        # Statistics
@@ -53,14 +48,14 @@ class LLMAnnotationStep(PipelineStep):
    def _create_rate_limiter(self):
        """Create rate limiter based on configuration"""
        rate_config = RateLimitConfig(
-            requests_per_minute=self.pipeline_config.llm.requests_per_minute,
-            requests_per_second=self.pipeline_config.llm.requests_per_second,
+            requests_per_minute=self.llm_config.requests_per_minute,
+            requests_per_second=self.llm_config.requests_per_second,
        )
        return RateLimiter(rate_config)

-    def analyze_name_with_retry(self, client: ollama.Client, name: str, row_id: int) -> Dict:
+    def analyze_name(self, client: ollama.Client, name: str) -> Dict:
        """Analyze a name with retry logic and rate limiting"""
-        for attempt in range(self.pipeline_config.llm.retry_attempts):
+        for attempt in range(self.llm_config.retry_attempts):
            try:
                # Apply rate limiting if enabled
                if self.rate_limiter:
@@ -68,7 +63,7 @@ class LLMAnnotationStep(PipelineStep):

                start_time = time.time()
                response = client.chat(
-                    model=self.pipeline_config.llm.model_name,
+                    model=self.llm_config.model_name,
                    messages=[
                        {"role": "system", "content": self.prompt},
                        {"role": "user", "content": name},
@@ -77,9 +72,9 @@ class LLMAnnotationStep(PipelineStep):
                )
                elapsed_time = time.time() - start_time

-                if elapsed_time > self.pipeline_config.llm.timeout_seconds:
+                if elapsed_time > self.llm_config.timeout_seconds:
                    raise TimeoutError(
-                        f"Request took {elapsed_time:.2f}s, exceeding {self.pipeline_config.llm.timeout_seconds}s timeout"
+                        f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout"
                    )

                annotation = NameAnnotation.model_validate_json(response.message.content)
@@ -98,12 +93,12 @@ class LLMAnnotationStep(PipelineStep):

            except (ValidationError, TimeoutError, Exception) as e:
                logging.warning(
-                    f"Error analyzing '{name}' (attempt {attempt + 1}/{self.pipeline_config.llm.retry_attempts}): {e}"
+                    f"Error analyzing '{name}' (attempt {attempt + 1}/{self.llm_config.retry_attempts}): {e}"
                )

                # Exponential backoff with jitter
-                if attempt < self.pipeline_config.llm.retry_attempts - 1:
-                    wait_time = (2**attempt) + (time.time() % 1)
+                if attempt < self.llm_config.retry_attempts - 1:
+                    wait_time = (2 ** attempt) + (time.time() % 1)
                    time.sleep(min(wait_time, 10))

        self.failed_requests += 1
@@ -112,7 +107,7 @@ class LLMAnnotationStep(PipelineStep):
            "identified_surname": None,
            "annotated": 0,
            "processing_time": 0,
-            "attempts": self.pipeline_config.llm.retry_attempts,
+            "attempts": self.llm_config.retry_attempts,
            "failed": True,
        }

@@ -125,18 +120,18 @@ class LLMAnnotationStep(PipelineStep):
            logging.info(f"Batch {batch_id}: No entries to annotate")
            return batch

-        logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries")
+        logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM")

        batch = batch.copy()
        client = ollama.Client()

        # Process with controlled concurrency
-        max_workers = self.pipeline_config.llm.max_concurrent_requests
+        max_workers = self.llm_config.max_concurrent_requests

        if len(unannotated_entries) == 1 or max_workers == 1:
            # Sequential processing
            for idx, row in unannotated_entries.iterrows():
-                result = self.analyze_name_with_retry(client, row["name"], idx)
+                result = self.analyze_name(client, row["name"])
                for field, value in result.items():
                    if field not in ["failed"]:
                        batch.loc[idx, field] = value
@@ -146,7 +141,7 @@ class LLMAnnotationStep(PipelineStep):
                future_to_idx = {}

                for idx, row in unannotated_entries.iterrows():
-                    future = executor.submit(self.analyze_name_with_retry, client, row["name"], idx)
+                    future = executor.submit(self.analyze_name, client, row["name"])
                    future_to_idx[future] = idx

                for future in as_completed(future_to_idx):
@@ -161,8 +156,6 @@ class LLMAnnotationStep(PipelineStep):
                        batch.loc[idx, "annotated"] = 0

        # Ensure proper data types
-        batch["annotated"] = (
-            pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
-        )
+        batch["annotated"] = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")

        return batch
@@ -0,0 +1,164 @@
+import logging
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict
+
+import pandas as pd
+
+from core.config.pipeline_config import PipelineConfig
+from processing.steps import PipelineStep, NameAnnotation
+from processing.ner.ner_name_model import NERNameModel
+
+
+class NERAnnotationStep(PipelineStep):
+    """NER annotation step using trained spaCy model for entity recognition"""
+
+    def __init__(self, pipeline_config: PipelineConfig):
+        # Create custom batch config for NER processing
+        super().__init__("ner_annotation", pipeline_config)
+
+        self.model_name = "drc_ner_model"
+        self.model_path = pipeline_config.paths.models_dir / "drc_ner_model"
+        self.ner_trainer = NERNameModel(pipeline_config)
+        self.ner_config = pipeline_config.annotation.ner
+
+        # Statistics
+        self.successful_requests = 0
+        self.failed_requests = 0
+        self.total_retry_attempts = 0
+
+        # Load the model
+        self._load_ner_model()
+
+    def _load_ner_model(self) -> None:
+        """Load the trained NER model"""
+        try:
+            if self.model_path.exists():
+                logging.info(f"Loading NER model from {self.model_path}")
+                self.ner_trainer.load(str(self.model_path))
+                logging.info("NER model loaded successfully")
+            else:
+                logging.warning(f"NER model not found at {self.model_path}")
+                logging.warning("NER annotation will be skipped. Train the model first.")
+                self.ner_trainer.nlp = None
+        except Exception as e:
+            logging.error(f"Failed to load NER model: {e}")
+            self.ner_trainer.nlp = None
+
+    def analyze_name(self, name: str) -> Dict:
+        """Analyze a name with retry logic"""
+        if self.ner_trainer.nlp is None:
+            return {
+                "identified_name": None,
+                "identified_surname": None,
+                "annotated": 0,
+                "processing_time": 0,
+                "attempts": 0,
+                "failed": True,
+            }
+
+        for attempt in range(self.ner_config.retry_attempts):
+            try:
+                start_time = time.time()
+
+                # Get NER predictions
+                prediction = self.ner_trainer.predict(name.lower())
+                entities = prediction.get('entities', [])
+
+                elapsed_time = time.time() - start_time
+
+                # Extract native names and surnames from entities
+                native_parts = []
+                surname_parts = []
+
+                for entity in entities:
+                    if entity['label'] == 'NATIVE':
+                        native_parts.append(entity['text'])
+                    elif entity['label'] == 'SURNAME':
+                        surname_parts.append(entity['text'])
+
+                # Create annotation result in same format as LLM step
+                annotation = NameAnnotation(
+                    identified_name=" ".join(native_parts) if native_parts else None,
+                    identified_surname=" ".join(surname_parts) if surname_parts else None
+                )
+
+                result = {
+                    **annotation.model_dump(),
+                    "annotated": 1,
+                    "processing_time": elapsed_time,
+                    "attempts": attempt + 1,
+                }
+
+                self.successful_requests += 1
+                if attempt > 0:
+                    self.total_retry_attempts += attempt
+
+                return result
+
+            except Exception as e:
+                logging.warning(
+                    f"Error analyzing '{name}' with NER (attempt {attempt + 1}/{self.ner_config.retry_attempts}): {e}"
+                )
+
+                # Small delay between retries
+                if attempt < self.ner_config.retry_attempts - 1:
+                    time.sleep(0.1)
+
+        self.failed_requests += 1
+        return {
+            "identified_name": None,
+            "identified_surname": None,
+            "annotated": 0,
+            "processing_time": 0,
+            "attempts": self.ner_config.retry_attempts,
+            "failed": True,
+        }
+
+    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
+        """Process batch with NER annotation using same logic as LLM step"""
+        unannotated_mask = batch.get("annotated", 0) == 0
+        unannotated_entries = batch[unannotated_mask]
+
+        if unannotated_entries.empty:
+            logging.info(f"Batch {batch_id}: No entries to annotate")
+            return batch
+
+        logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER")
+
+        batch = batch.copy()
+
+        # Process with controlled concurrency
+        max_workers = self.batch_config.max_workers
+
+        if len(unannotated_entries) == 1 or max_workers == 1:
+            # Sequential processing
+            for idx, row in unannotated_entries.iterrows():
+                result = self.analyze_name(row["name"])
+                for field, value in result.items():
+                    if field not in ["failed"]:
+                        batch.loc[idx, field] = value
+        else:
+            # Concurrent processing
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                future_to_idx = {}
+
+                for idx, row in unannotated_entries.iterrows():
+                    future = executor.submit(self.analyze_name, row["name"])
+                    future_to_idx[future] = idx
+
+                for future in as_completed(future_to_idx):
+                    idx = future_to_idx[future]
+                    try:
+                        result = future.result()
+                        for field, value in result.items():
+                            if field not in ["failed"]:
+                                batch.loc[idx, field] = value
+                    except Exception as e:
+                        logging.error(f"Failed to process row {idx}: {e}")
+                        batch.loc[idx, "annotated"] = 0
+
+        # Ensure proper data types
+        batch["annotated"] = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
+
+        return batch