feat: add NER annotation step and integrate into pipeline

This commit is contained in:
2025-08-11 07:13:09 +02:00
parent 6d39c3afc1
commit d5a4aaaf4a
23 changed files with 1108 additions and 160 deletions
+29
View File
@@ -0,0 +1,29 @@
from pydantic import BaseModel
class NERConfig(BaseModel):
    """Settings for the NER annotation step."""

    # NOTE(review): on some Pydantic v2 releases a field named ``model_*``
    # triggers a protected-namespace warning — confirm against the pinned
    # version before relying on a clean import.

    # Identifier of the NER model to use.
    model_name: str = "drc_names_ner"

    # Retry budget; presumably applied per annotation call — confirm in the
    # step that consumes this config.
    retry_attempts: int = 3
class LLMConfig(BaseModel):
    """Settings for the LLM annotation step."""

    # Identifier of the LLM to query (the default looks like a
    # ``name:tag`` model reference — confirm against the client in use).
    model_name: str = "mistral:7b"

    # --- Throughput limits -------------------------------------------
    # Both a per-minute and a per-second ceiling are configurable; how they
    # combine is decided by the consumer and is not visible here.
    requests_per_minute: int = 60
    requests_per_second: int = 2

    # --- Request handling --------------------------------------------
    retry_attempts: int = 3
    timeout_seconds: int = 30
    max_concurrent_requests: int = 2

    # Rate limiting is off by default.
    enable_rate_limiting: bool = False
class AnnotationConfig(BaseModel):
    """Aggregate configuration for the annotation steps.

    Groups the LLM and NER annotation settings under a single model. Each
    section falls back to its own defaults when it is omitted.
    """

    # Pydantic v2 configuration. The nested ``class Config`` form is
    # deprecated in v2 and emits a deprecation warning; a plain dict is
    # accepted for ``model_config`` and avoids a new import.
    model_config = {"arbitrary_types_allowed": True}

    # Settings for LLM-based annotation.
    llm: LLMConfig = LLMConfig()

    # Settings for NER-based annotation.
    ner: NERConfig = NERConfig()
+1 -1
View File
@@ -65,7 +65,7 @@ class ConfigManager:
# Ensure paths are properly set
if "paths" not in config_data:
config_data["paths"] = self.default_paths.dict()
config_data["paths"] = self.default_paths.model_dump()
self._config = PipelineConfig(**config_data)
return self._config
+4 -1
View File
@@ -14,10 +14,13 @@ class DataConfig(BaseModel):
"evaluation": "names_evaluation.csv",
"males": "names_males.csv",
"females": "names_females.csv",
"ner_data": "names_ner.json",
"ner_spacy": "names_ner.spacy"
}
)
split_evaluation: bool = True
split_evaluation: bool = False
split_by_gender: bool = True
split_ner_data: bool = True
evaluation_fraction: float = 0.2
random_seed: int = 42
-13
View File
@@ -1,13 +0,0 @@
from pydantic import BaseModel
class LLMConfig(BaseModel):
    """Configuration values for LLM-driven annotation."""

    # Which model to call.
    model_name: str = "mistral:7b"

    # Request-rate ceilings (per minute and per second).
    requests_per_minute: int = 60
    requests_per_second: int = 2

    # Number of retries configured for a request.
    retry_attempts: int = 3

    # Timeout for a request, in seconds.
    timeout_seconds: int = 30

    # Upper bound on requests in flight at once.
    max_concurrent_requests: int = 2

    # Whether rate limiting is applied; disabled unless set.
    enable_rate_limiting: bool = False
+3 -3
View File
@@ -1,8 +1,8 @@
from pydantic import BaseModel
from core.config.logging_config import LoggingConfig
from core.config.annotation_config import AnnotationConfig
from core.config.data_config import DataConfig
from core.config.llm_config import LLMConfig
from core.config.logging_config import LoggingConfig
from core.config.processing_config import ProcessingConfig
from core.config.project_paths import ProjectPaths
@@ -17,7 +17,7 @@ class PipelineConfig(BaseModel):
paths: ProjectPaths
stages: list[str] = []
processing: ProcessingConfig = ProcessingConfig()
llm: LLMConfig = LLMConfig()
annotation: AnnotationConfig = AnnotationConfig()
data: DataConfig = DataConfig()
logging: LoggingConfig = LoggingConfig()