feat: add NER annotation step and integrate into pipeline
This commit is contained in:
@@ -0,0 +1,29 @@
|
||||
from pydantic import BaseModel
|
||||
|
||||
class NERConfig(BaseModel):
    """Settings for the NER annotation step.

    Both fields carry defaults, so ``NERConfig()`` is valid without arguments.
    """

    # NOTE(review): some pydantic v2 releases warn about field names that
    # start with ``model_`` (protected namespace) -- confirm against the
    # installed version before renaming or silencing.
    model_name: str = "drc_names_ner"

    # Presumably the number of retries for a failed annotation call; the
    # consumer is not visible here -- verify against the caller.
    retry_attempts: int = 3
|
||||
|
||||
|
||||
class LLMConfig(BaseModel):
    """Settings for the LLM annotation step.

    Every field has a default, so ``LLMConfig()`` is valid without arguments.
    """

    # --- model selection ---------------------------------------------------
    model_name: str = "mistral:7b"

    # --- request rate ------------------------------------------------------
    # NOTE(review): 2 requests/second is 120 requests/minute, which exceeds
    # the per-minute default of 60 -- confirm which limit is meant to apply.
    requests_per_minute: int = 60
    requests_per_second: int = 2

    # --- failure handling --------------------------------------------------
    retry_attempts: int = 3
    timeout_seconds: int = 30

    # --- concurrency -------------------------------------------------------
    max_concurrent_requests: int = 2

    # Rate limiting is switched off by default.
    enable_rate_limiting: bool = False
|
||||
|
||||
|
||||
class AnnotationConfig(BaseModel):
    """Annotation settings, grouping one LLM section and one NER section.

    Both sections default to their own default-constructed configuration,
    so ``AnnotationConfig()`` is valid without arguments.
    """

    # Pydantic v2 configuration. This replaces the nested ``class Config``,
    # which is the deprecated v1 spelling and emits a deprecation warning
    # under v2. A plain dict is accepted here, so no extra import is needed.
    model_config = {"arbitrary_types_allowed": True}

    llm: LLMConfig = LLMConfig()
    ner: NERConfig = NERConfig()
|
||||
@@ -65,7 +65,7 @@ class ConfigManager:
|
||||
|
||||
# Ensure paths are properly set
|
||||
if "paths" not in config_data:
|
||||
config_data["paths"] = self.default_paths.dict()
|
||||
config_data["paths"] = self.default_paths.model_dump()
|
||||
|
||||
self._config = PipelineConfig(**config_data)
|
||||
return self._config
|
||||
|
||||
@@ -14,10 +14,13 @@ class DataConfig(BaseModel):
|
||||
"evaluation": "names_evaluation.csv",
|
||||
"males": "names_males.csv",
|
||||
"females": "names_females.csv",
|
||||
"ner_data": "names_ner.json",
|
||||
"ner_spacy": "names_ner.spacy"
|
||||
}
|
||||
)
|
||||
split_evaluation: bool = True
|
||||
split_evaluation: bool = False
|
||||
split_by_gender: bool = True
|
||||
split_ner_data: bool = True
|
||||
evaluation_fraction: float = 0.2
|
||||
random_seed: int = 42
|
||||
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class LLMConfig(BaseModel):
    """Settings for the LLM annotation step.

    Every field has a default, so ``LLMConfig()`` is valid without arguments.
    """

    # --- model selection ---------------------------------------------------
    model_name: str = "mistral:7b"

    # --- request rate ------------------------------------------------------
    # NOTE(review): 2 requests/second is 120 requests/minute, which exceeds
    # the per-minute default of 60 -- confirm which limit is meant to apply.
    requests_per_minute: int = 60
    requests_per_second: int = 2

    # --- failure handling --------------------------------------------------
    retry_attempts: int = 3
    timeout_seconds: int = 30

    # --- concurrency -------------------------------------------------------
    max_concurrent_requests: int = 2

    # Rate limiting is switched off by default.
    enable_rate_limiting: bool = False
|
||||
@@ -1,8 +1,8 @@
|
||||
from pydantic import BaseModel
|
||||
|
||||
from core.config.logging_config import LoggingConfig
|
||||
from core.config.annotation_config import AnnotationConfig
|
||||
from core.config.data_config import DataConfig
|
||||
from core.config.llm_config import LLMConfig
|
||||
from core.config.logging_config import LoggingConfig
|
||||
from core.config.processing_config import ProcessingConfig
|
||||
from core.config.project_paths import ProjectPaths
|
||||
|
||||
@@ -17,7 +17,7 @@ class PipelineConfig(BaseModel):
|
||||
paths: ProjectPaths
|
||||
stages: list[str] = []
|
||||
processing: ProcessingConfig = ProcessingConfig()
|
||||
llm: LLMConfig = LLMConfig()
|
||||
annotation: AnnotationConfig = AnnotationConfig()
|
||||
data: DataConfig = DataConfig()
|
||||
logging: LoggingConfig = LoggingConfig()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user