feat: add NER annotation step and integrate into pipeline
This commit is contained in:
@@ -6,9 +6,10 @@ from dataclasses import dataclass
|
||||
from typing import List, Optional
|
||||
|
||||
import pandas as pd
|
||||
from pydantic import BaseModel
|
||||
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -25,11 +26,18 @@ class PipelineState:
|
||||
self.failed_batches = []
|
||||
|
||||
|
||||
class NameAnnotation(BaseModel):
|
||||
"""Model for name annotation results"""
|
||||
|
||||
identified_name: Optional[str]
|
||||
identified_surname: Optional[str]
|
||||
|
||||
|
||||
class PipelineStep(ABC):
|
||||
"""Abstract base class for pipeline steps"""
|
||||
|
||||
def __init__(
|
||||
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
|
||||
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
|
||||
):
|
||||
self.name = name
|
||||
self.pipeline_config = pipeline_config
|
||||
|
||||
@@ -25,4 +25,7 @@ class DataCleaningStep(PipelineStep):
|
||||
# Apply text cleaning
|
||||
batch = self.text_cleaner.clean_dataframe_text_columns(batch)
|
||||
|
||||
# Remove duplicates
|
||||
batch = batch.drop_duplicates(subset=self.required_columns)
|
||||
|
||||
return batch
|
||||
|
||||
@@ -5,6 +5,7 @@ import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from core.utils.region_mapper import RegionMapper
|
||||
from processing.ner.ner_name_tagger import NERNameTagger
|
||||
from processing.steps import PipelineStep
|
||||
|
||||
|
||||
@@ -24,6 +25,7 @@ class FeatureExtractionStep(PipelineStep):
|
||||
def __init__(self, pipeline_config: PipelineConfig):
|
||||
super().__init__("feature_extraction", pipeline_config)
|
||||
self.region_mapper = RegionMapper()
|
||||
self.name_tagger = NERNameTagger()
|
||||
|
||||
@classmethod
|
||||
def validate_gender(cls, gender: str) -> Gender:
|
||||
@@ -52,7 +54,7 @@ class FeatureExtractionStep(PipelineStep):
|
||||
|
||||
# Basic features
|
||||
batch["words"] = batch["name"].str.count(" ") + 1
|
||||
batch["length"] = batch["name"].str.replace(" ", "", regex=False).str.len()
|
||||
batch["length"] = batch["name"].str.len()
|
||||
|
||||
# Handle year column
|
||||
if "year" in batch.columns:
|
||||
@@ -63,6 +65,8 @@ class FeatureExtractionStep(PipelineStep):
|
||||
batch["probable_surname"] = None
|
||||
batch["identified_name"] = None
|
||||
batch["identified_surname"] = None
|
||||
batch["ner_entities"] = None
|
||||
batch["ner_tagged"] = 0
|
||||
batch["annotated"] = 0
|
||||
|
||||
# Vectorized category assignment
|
||||
@@ -81,14 +85,19 @@ class FeatureExtractionStep(PipelineStep):
|
||||
|
||||
# Auto-assign for 3-word names
|
||||
three_word_mask = batch["words"] == 3
|
||||
batch.loc[three_word_mask, "identified_name"] = batch.loc[
|
||||
three_word_mask, "probable_native"
|
||||
]
|
||||
batch.loc[three_word_mask, "identified_surname"] = batch.loc[
|
||||
three_word_mask, "probable_surname"
|
||||
]
|
||||
batch.loc[three_word_mask, "identified_name"] = batch.loc[three_word_mask, "probable_native"]
|
||||
batch.loc[three_word_mask, "identified_surname"] = batch.loc[three_word_mask, "probable_surname"]
|
||||
batch.loc[three_word_mask, "annotated"] = 1
|
||||
|
||||
# Tag names with NER entities
|
||||
three_word_rows = batch[three_word_mask]
|
||||
for idx, row in three_word_rows.iterrows():
|
||||
entity = self.name_tagger.tag_name(row['name'], row['identified_name'], row['identified_surname'])
|
||||
|
||||
if entity:
|
||||
batch.at[idx, "ner_entities"] = entity["entities"]
|
||||
batch.at[idx, "ner_tagged"] = 1
|
||||
|
||||
# Map regions to provinces
|
||||
batch["province"] = self.region_mapper.map_regions_vectorized(batch["region"])
|
||||
|
||||
|
||||
@@ -1,25 +1,18 @@
|
||||
import logging
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import Dict, Optional
|
||||
from typing import Dict
|
||||
|
||||
import ollama
|
||||
import pandas as pd
|
||||
from pydantic import ValidationError, BaseModel
|
||||
from pydantic import ValidationError
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from core.utils.prompt_manager import PromptManager
|
||||
from core.utils.rate_limiter import RateLimiter
|
||||
from core.utils.rate_limiter import RateLimitConfig
|
||||
from core.utils.rate_limiter import RateLimiter
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
from processing.steps import PipelineStep
|
||||
|
||||
|
||||
class NameAnnotation(BaseModel):
|
||||
"""Model for name annotation results"""
|
||||
|
||||
identified_name: Optional[str]
|
||||
identified_surname: Optional[str]
|
||||
from processing.steps import PipelineStep, NameAnnotation
|
||||
|
||||
|
||||
class LLMAnnotationStep(PipelineStep):
|
||||
@@ -27,10 +20,12 @@ class LLMAnnotationStep(PipelineStep):
|
||||
|
||||
def __init__(self, pipeline_config: PipelineConfig):
|
||||
# Create custom batch config for LLM processing
|
||||
self.llm_config = pipeline_config.annotation.llm
|
||||
batch_config = BatchConfig(
|
||||
batch_size=pipeline_config.processing.batch_size,
|
||||
max_workers=min(
|
||||
pipeline_config.llm.max_concurrent_requests, pipeline_config.processing.max_workers
|
||||
self.llm_config.max_concurrent_requests,
|
||||
pipeline_config.processing.max_workers
|
||||
),
|
||||
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
||||
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
|
||||
@@ -39,7 +34,7 @@ class LLMAnnotationStep(PipelineStep):
|
||||
|
||||
self.prompt = PromptManager(pipeline_config).load_prompt()
|
||||
self.rate_limiter = (
|
||||
self._create_rate_limiter() if pipeline_config.llm.enable_rate_limiting else None
|
||||
self._create_rate_limiter() if self.llm_config.enable_rate_limiting else None
|
||||
)
|
||||
|
||||
# Statistics
|
||||
@@ -53,14 +48,14 @@ class LLMAnnotationStep(PipelineStep):
|
||||
def _create_rate_limiter(self):
|
||||
"""Create rate limiter based on configuration"""
|
||||
rate_config = RateLimitConfig(
|
||||
requests_per_minute=self.pipeline_config.llm.requests_per_minute,
|
||||
requests_per_second=self.pipeline_config.llm.requests_per_second,
|
||||
requests_per_minute=self.llm_config.requests_per_minute,
|
||||
requests_per_second=self.llm_config.requests_per_second,
|
||||
)
|
||||
return RateLimiter(rate_config)
|
||||
|
||||
def analyze_name_with_retry(self, client: ollama.Client, name: str, row_id: int) -> Dict:
|
||||
def analyze_name(self, client: ollama.Client, name: str) -> Dict:
|
||||
"""Analyze a name with retry logic and rate limiting"""
|
||||
for attempt in range(self.pipeline_config.llm.retry_attempts):
|
||||
for attempt in range(self.llm_config.retry_attempts):
|
||||
try:
|
||||
# Apply rate limiting if enabled
|
||||
if self.rate_limiter:
|
||||
@@ -68,7 +63,7 @@ class LLMAnnotationStep(PipelineStep):
|
||||
|
||||
start_time = time.time()
|
||||
response = client.chat(
|
||||
model=self.pipeline_config.llm.model_name,
|
||||
model=self.llm_config.model_name,
|
||||
messages=[
|
||||
{"role": "system", "content": self.prompt},
|
||||
{"role": "user", "content": name},
|
||||
@@ -77,9 +72,9 @@ class LLMAnnotationStep(PipelineStep):
|
||||
)
|
||||
elapsed_time = time.time() - start_time
|
||||
|
||||
if elapsed_time > self.pipeline_config.llm.timeout_seconds:
|
||||
if elapsed_time > self.llm_config.timeout_seconds:
|
||||
raise TimeoutError(
|
||||
f"Request took {elapsed_time:.2f}s, exceeding {self.pipeline_config.llm.timeout_seconds}s timeout"
|
||||
f"Request took {elapsed_time:.2f}s, exceeding {self.llm_config.timeout_seconds}s timeout"
|
||||
)
|
||||
|
||||
annotation = NameAnnotation.model_validate_json(response.message.content)
|
||||
@@ -98,12 +93,12 @@ class LLMAnnotationStep(PipelineStep):
|
||||
|
||||
except (ValidationError, TimeoutError, Exception) as e:
|
||||
logging.warning(
|
||||
f"Error analyzing '{name}' (attempt {attempt + 1}/{self.pipeline_config.llm.retry_attempts}): {e}"
|
||||
f"Error analyzing '{name}' (attempt {attempt + 1}/{self.llm_config.retry_attempts}): {e}"
|
||||
)
|
||||
|
||||
# Exponential backoff with jitter
|
||||
if attempt < self.pipeline_config.llm.retry_attempts - 1:
|
||||
wait_time = (2**attempt) + (time.time() % 1)
|
||||
if attempt < self.llm_config.retry_attempts - 1:
|
||||
wait_time = (2 ** attempt) + (time.time() % 1)
|
||||
time.sleep(min(wait_time, 10))
|
||||
|
||||
self.failed_requests += 1
|
||||
@@ -112,7 +107,7 @@ class LLMAnnotationStep(PipelineStep):
|
||||
"identified_surname": None,
|
||||
"annotated": 0,
|
||||
"processing_time": 0,
|
||||
"attempts": self.pipeline_config.llm.retry_attempts,
|
||||
"attempts": self.llm_config.retry_attempts,
|
||||
"failed": True,
|
||||
}
|
||||
|
||||
@@ -125,18 +120,18 @@ class LLMAnnotationStep(PipelineStep):
|
||||
logging.info(f"Batch {batch_id}: No entries to annotate")
|
||||
return batch
|
||||
|
||||
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries")
|
||||
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with LLM")
|
||||
|
||||
batch = batch.copy()
|
||||
client = ollama.Client()
|
||||
|
||||
# Process with controlled concurrency
|
||||
max_workers = self.pipeline_config.llm.max_concurrent_requests
|
||||
max_workers = self.llm_config.max_concurrent_requests
|
||||
|
||||
if len(unannotated_entries) == 1 or max_workers == 1:
|
||||
# Sequential processing
|
||||
for idx, row in unannotated_entries.iterrows():
|
||||
result = self.analyze_name_with_retry(client, row["name"], idx)
|
||||
result = self.analyze_name(client, row["name"])
|
||||
for field, value in result.items():
|
||||
if field not in ["failed"]:
|
||||
batch.loc[idx, field] = value
|
||||
@@ -146,7 +141,7 @@ class LLMAnnotationStep(PipelineStep):
|
||||
future_to_idx = {}
|
||||
|
||||
for idx, row in unannotated_entries.iterrows():
|
||||
future = executor.submit(self.analyze_name_with_retry, client, row["name"], idx)
|
||||
future = executor.submit(self.analyze_name, client, row["name"])
|
||||
future_to_idx[future] = idx
|
||||
|
||||
for future in as_completed(future_to_idx):
|
||||
@@ -161,8 +156,6 @@ class LLMAnnotationStep(PipelineStep):
|
||||
batch.loc[idx, "annotated"] = 0
|
||||
|
||||
# Ensure proper data types
|
||||
batch["annotated"] = (
|
||||
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
|
||||
)
|
||||
batch["annotated"] = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
|
||||
|
||||
return batch
|
||||
|
||||
@@ -0,0 +1,164 @@
|
||||
import logging
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from processing.steps import PipelineStep, NameAnnotation
|
||||
from processing.ner.ner_name_model import NERNameModel
|
||||
|
||||
|
||||
class NERAnnotationStep(PipelineStep):
|
||||
"""NER annotation step using trained spaCy model for entity recognition"""
|
||||
|
||||
def __init__(self, pipeline_config: PipelineConfig):
|
||||
# Create custom batch config for NER processing
|
||||
super().__init__("ner_annotation", pipeline_config)
|
||||
|
||||
self.model_name = "drc_ner_model"
|
||||
self.model_path = pipeline_config.paths.models_dir / "drc_ner_model"
|
||||
self.ner_trainer = NERNameModel(pipeline_config)
|
||||
self.ner_config = pipeline_config.annotation.ner
|
||||
|
||||
# Statistics
|
||||
self.successful_requests = 0
|
||||
self.failed_requests = 0
|
||||
self.total_retry_attempts = 0
|
||||
|
||||
# Load the model
|
||||
self._load_ner_model()
|
||||
|
||||
def _load_ner_model(self) -> None:
|
||||
"""Load the trained NER model"""
|
||||
try:
|
||||
if self.model_path.exists():
|
||||
logging.info(f"Loading NER model from {self.model_path}")
|
||||
self.ner_trainer.load(str(self.model_path))
|
||||
logging.info("NER model loaded successfully")
|
||||
else:
|
||||
logging.warning(f"NER model not found at {self.model_path}")
|
||||
logging.warning("NER annotation will be skipped. Train the model first.")
|
||||
self.ner_trainer.nlp = None
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to load NER model: {e}")
|
||||
self.ner_trainer.nlp = None
|
||||
|
||||
def analyze_name(self, name: str) -> Dict:
|
||||
"""Analyze a name with retry logic"""
|
||||
if self.ner_trainer.nlp is None:
|
||||
return {
|
||||
"identified_name": None,
|
||||
"identified_surname": None,
|
||||
"annotated": 0,
|
||||
"processing_time": 0,
|
||||
"attempts": 0,
|
||||
"failed": True,
|
||||
}
|
||||
|
||||
for attempt in range(self.ner_config.retry_attempts):
|
||||
try:
|
||||
start_time = time.time()
|
||||
|
||||
# Get NER predictions
|
||||
prediction = self.ner_trainer.predict(name.lower())
|
||||
entities = prediction.get('entities', [])
|
||||
|
||||
elapsed_time = time.time() - start_time
|
||||
|
||||
# Extract native names and surnames from entities
|
||||
native_parts = []
|
||||
surname_parts = []
|
||||
|
||||
for entity in entities:
|
||||
if entity['label'] == 'NATIVE':
|
||||
native_parts.append(entity['text'])
|
||||
elif entity['label'] == 'SURNAME':
|
||||
surname_parts.append(entity['text'])
|
||||
|
||||
# Create annotation result in same format as LLM step
|
||||
annotation = NameAnnotation(
|
||||
identified_name=" ".join(native_parts) if native_parts else None,
|
||||
identified_surname=" ".join(surname_parts) if surname_parts else None
|
||||
)
|
||||
|
||||
result = {
|
||||
**annotation.model_dump(),
|
||||
"annotated": 1,
|
||||
"processing_time": elapsed_time,
|
||||
"attempts": attempt + 1,
|
||||
}
|
||||
|
||||
self.successful_requests += 1
|
||||
if attempt > 0:
|
||||
self.total_retry_attempts += attempt
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logging.warning(
|
||||
f"Error analyzing '{name}' with NER (attempt {attempt + 1}/{self.ner_config.retry_attempts}): {e}"
|
||||
)
|
||||
|
||||
# Small delay between retries
|
||||
if attempt < self.ner_config.retry_attempts - 1:
|
||||
time.sleep(0.1)
|
||||
|
||||
self.failed_requests += 1
|
||||
return {
|
||||
"identified_name": None,
|
||||
"identified_surname": None,
|
||||
"annotated": 0,
|
||||
"processing_time": 0,
|
||||
"attempts": self.ner_config.retry_attempts,
|
||||
"failed": True,
|
||||
}
|
||||
|
||||
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||
"""Process batch with NER annotation using same logic as LLM step"""
|
||||
unannotated_mask = batch.get("annotated", 0) == 0
|
||||
unannotated_entries = batch[unannotated_mask]
|
||||
|
||||
if unannotated_entries.empty:
|
||||
logging.info(f"Batch {batch_id}: No entries to annotate")
|
||||
return batch
|
||||
|
||||
logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries with NER")
|
||||
|
||||
batch = batch.copy()
|
||||
|
||||
# Process with controlled concurrency
|
||||
max_workers = self.batch_config.max_workers
|
||||
|
||||
if len(unannotated_entries) == 1 or max_workers == 1:
|
||||
# Sequential processing
|
||||
for idx, row in unannotated_entries.iterrows():
|
||||
result = self.analyze_name(row["name"])
|
||||
for field, value in result.items():
|
||||
if field not in ["failed"]:
|
||||
batch.loc[idx, field] = value
|
||||
else:
|
||||
# Concurrent processing
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
future_to_idx = {}
|
||||
|
||||
for idx, row in unannotated_entries.iterrows():
|
||||
future = executor.submit(self.analyze_name, row["name"])
|
||||
future_to_idx[future] = idx
|
||||
|
||||
for future in as_completed(future_to_idx):
|
||||
idx = future_to_idx[future]
|
||||
try:
|
||||
result = future.result()
|
||||
for field, value in result.items():
|
||||
if field not in ["failed"]:
|
||||
batch.loc[idx, field] = value
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to process row {idx}: {e}")
|
||||
batch.loc[idx, "annotated"] = 0
|
||||
|
||||
# Ensure proper data types
|
||||
batch["annotated"] = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
|
||||
|
||||
return batch
|
||||
Reference in New Issue
Block a user