refactoring: add initial pipeline configuration and model classes

2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
+61
@@ -0,0 +1,61 @@
import logging
from pathlib import Path
from typing import Optional, Union

from core.config.config_manager import ConfigManager
from core.config.logging_config import LoggingConfig
from core.config.pipeline_config import PipelineConfig

# Module-level singleton so all pipeline stages share one configuration
config_manager = ConfigManager()


def get_config() -> PipelineConfig:
    """Get the global configuration instance"""
    return config_manager.get_config()


def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig:
    """Load configuration from the specified path, falling back to the default"""
    if config_path:
        return config_manager.load_config(Path(config_path))
    return config_manager.get_config()


def setup_logging(config: PipelineConfig) -> None:
    """Set up logging based on configuration"""
    # Create the logs directory
    log_dir = config.paths.logs_dir
    log_dir.mkdir(parents=True, exist_ok=True)

    # Resolve the configured level name, defaulting to INFO
    log_level = getattr(logging, config.logging.level.upper(), logging.INFO)
    formatter = logging.Formatter(config.logging.format)

    # Configure the root logger, clearing any handlers from earlier setup
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)
    root_logger.handlers.clear()

    # Console handler
    if config.logging.console_logging:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        root_logger.addHandler(console_handler)

    # Rotating file handler
    if config.logging.file_logging:
        from logging.handlers import RotatingFileHandler

        log_file_path = log_dir / config.logging.log_file
        file_handler = RotatingFileHandler(
            log_file_path,
            maxBytes=config.logging.max_log_size,
            backupCount=config.logging.backup_count,
        )
        file_handler.setFormatter(formatter)
        root_logger.addHandler(file_handler)
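
Usage sketch for the module above (assuming it is the package init importable as core.config; the config path and log message are placeholders, not taken from this commit):

from core.config import load_config, setup_logging

config = load_config("config/pipeline.yaml")  # falls back to defaults if the file is missing
setup_logging(config)

import logging
logging.getLogger(__name__).info("Pipeline %s v%s ready", config.name, config.version)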
+145
@@ -0,0 +1,145 @@
import json
import logging
from pathlib import Path
from typing import Optional, Union, Dict, Any

import yaml

from core.config.pipeline_config import PipelineConfig
from core.config.project_paths import ProjectPaths


class ConfigManager:
    """Centralized configuration management"""

    def __init__(self, config_path: Optional[Union[str, Path]] = None):
        self.config_path = Path(config_path) if config_path else self._find_config_file()
        self._config: Optional[PipelineConfig] = None
        self._setup_default_paths()

    @classmethod
    def _find_config_file(cls) -> Path:
        """Find a configuration file in the standard locations"""
        possible_paths = [
            Path.cwd() / "config" / "pipeline.yaml",
            Path.cwd() / "config" / "pipeline.yml",
            Path.cwd() / "pipeline.yaml",
            Path(__file__).parent.parent.parent / "config" / "pipeline.yaml",
        ]
        for path in possible_paths:
            if path.exists():
                return path
        # Return the default path if none was found
        return Path.cwd() / "config" / "pipeline.yaml"

    def _setup_default_paths(self):
        """Set up default project paths relative to the repository root"""
        root_dir = Path(__file__).parent.parent.parent
        self.default_paths = ProjectPaths(
            root_dir=root_dir,
            configs_dir=root_dir / "config",
            data_dir=root_dir / "data" / "dataset",
            models_dir=root_dir / "data" / "models",
            outputs_dir=root_dir / "data" / "outputs",
            logs_dir=root_dir / "data" / "logs",
            checkpoints_dir=root_dir / "data" / "checkpoints",
        )

    def load_config(self, config_path: Optional[Path] = None) -> PipelineConfig:
        """Load configuration from file, falling back to defaults on any error"""
        if config_path:
            self.config_path = config_path
        if not self.config_path.exists():
            logging.warning(f"Config file not found: {self.config_path}. Using defaults.")
            return self._create_default_config()
        try:
            with open(self.config_path, "r") as f:
                if self.config_path.suffix.lower() in [".yaml", ".yml"]:
                    config_data = yaml.safe_load(f)
                else:
                    config_data = json.load(f)
            # Ensure paths are always present, even in a minimal config file
            if "paths" not in config_data:
                config_data["paths"] = self.default_paths.model_dump()
            self._config = PipelineConfig(**config_data)
            return self._config
        except Exception as e:
            logging.error(f"Failed to load config from {self.config_path}: {e}")
            return self._create_default_config()

    def _create_default_config(self) -> PipelineConfig:
        """Create a default configuration"""
        return PipelineConfig(paths=self.default_paths)

    def save_config(self, config: PipelineConfig, path: Optional[Path] = None):
        """Save configuration to file"""
        save_path = path or self.config_path
        save_path.parent.mkdir(parents=True, exist_ok=True)
        config_dict = config.model_dump()
        # Convert Path objects to strings for serialization
        if "paths" in config_dict:
            for key, value in config_dict["paths"].items():
                if isinstance(value, Path):
                    config_dict["paths"][key] = str(value)
        try:
            with open(save_path, "w") as f:
                if save_path.suffix.lower() in [".yaml", ".yml"]:
                    yaml.dump(config_dict, f, default_flow_style=False, indent=2)
                else:
                    json.dump(config_dict, f, indent=2)
            logging.info(f"Configuration saved to {save_path}")
        except Exception as e:
            logging.error(f"Failed to save config to {save_path}: {e}")

    def get_config(self) -> PipelineConfig:
        """Get the current configuration, loading it if necessary"""
        if self._config is None:
            self._config = self.load_config()
        return self._config

    def update_config(self, updates: Dict[str, Any]):
        """Update the configuration with new values"""
        config = self.get_config()
        # Deep-merge the updates into the current configuration
        config_dict = config.model_dump()
        self._deep_update(config_dict, updates)
        self._config = PipelineConfig(**config_dict)

    def _deep_update(self, base_dict: Dict, update_dict: Dict):
        """Recursively update nested dictionaries"""
        for key, value in update_dict.items():
            if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
                self._deep_update(base_dict[key], value)
            else:
                base_dict[key] = value

    def get_environment_config(self, env: str) -> PipelineConfig:
        """Load environment-specific configuration layered over the base config"""
        env_config_path = self.config_path.parent / f"pipeline.{env}.yaml"
        if env_config_path.exists():
            base_config = self.load_config()
            env_config = self.load_config(env_config_path)
            # Merge configurations: environment values override base values
            base_dict = base_config.model_dump()
            self._deep_update(base_dict, env_config.model_dump())
            self._config = PipelineConfig(**base_dict)
            return self._config
        return self.get_config()
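
As a quick illustration of the deep-merge and environment-overlay behavior (the override values and environment name below are made up):

manager = ConfigManager()

# Nested keys deep-merge rather than replacing whole sections
manager.update_config({"llm": {"requests_per_minute": 120}, "debug": False})
config = manager.get_config()
assert config.llm.requests_per_minute == 120
assert config.llm.retry_attempts == 3  # untouched sibling field keeps its default

# Looks for pipeline.production.yaml next to the base config file;
# falls back to the base config if that file does not exist
prod_config = manager.get_environment_config("production")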
+22
@@ -0,0 +1,22 @@
from typing import Dict

from pydantic import BaseModel, Field


class DataConfig(BaseModel):
    """Data handling configuration"""

    input_file: str = "names.csv"
    # Field(default_factory=...) supplies a fresh dict per instance
    output_files: Dict[str, str] = Field(
        default_factory=lambda: {
            "featured": "names_featured.csv",
            "evaluation": "names_evaluation.csv",
            "males": "names_males.csv",
            "females": "names_females.csv",
        }
    )
    split_evaluation: bool = True
    split_by_gender: bool = True
    evaluation_fraction: float = 0.2
    random_seed: int = 42
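
For context, a hedged sketch of how these file names might combine with the project paths (the consuming code is not part of this excerpt, so the join below is an assumption):

from pathlib import Path

data_cfg = DataConfig()
outputs_dir = Path("data/outputs")  # in practice this would be config.paths.outputs_dir
evaluation_path = outputs_dir / data_cfg.output_files["evaluation"]
# -> data/outputs/names_evaluation.csv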
+13
@@ -0,0 +1,13 @@
from pydantic import BaseModel, ConfigDict


class LLMConfig(BaseModel):
    """LLM annotation configuration"""

    # Allow the model_name field despite pydantic v2's reserved "model_" prefix
    model_config = ConfigDict(protected_namespaces=())

    model_name: str = "mistral:7b"
    requests_per_minute: int = 60
    requests_per_second: int = 2
    retry_attempts: int = 3
    timeout_seconds: int = 30
    max_concurrent_requests: int = 2
    enable_rate_limiting: bool = False
+13
@@ -0,0 +1,13 @@
from pydantic import BaseModel


class LoggingConfig(BaseModel):
    """Logging configuration"""

    level: str = "INFO"
    format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    file_logging: bool = True
    console_logging: bool = True
    log_file: str = "pipeline.log"
    max_log_size: int = 10 * 1024 * 1024  # 10 MB
    backup_count: int = 5
+29
@@ -0,0 +1,29 @@
from pydantic import BaseModel, ConfigDict, Field

from core.config.logging_config import LoggingConfig
from core.config.data_config import DataConfig
from core.config.llm_config import LLMConfig
from core.config.processing_config import ProcessingConfig
from core.config.project_paths import ProjectPaths


class PipelineConfig(BaseModel):
    """Main pipeline configuration"""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: str = "drc_names_pipeline"
    version: str = "1.0.0"
    description: str = "DRC Names NLP Processing Pipeline"
    paths: ProjectPaths
    stages: list[str] = Field(default_factory=list)
    processing: ProcessingConfig = Field(default_factory=ProcessingConfig)
    llm: LLMConfig = Field(default_factory=LLMConfig)
    data: DataConfig = Field(default_factory=DataConfig)
    logging: LoggingConfig = Field(default_factory=LoggingConfig)

    # Environment-specific settings
    environment: str = "development"
    debug: bool = True
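
To illustrate the full model, a minimal sketch parsing a hypothetical pipeline.yaml (embedded as a string so the example is self-contained; the values are illustrative, not taken from the repository):

import yaml
from pathlib import Path

raw = """
name: drc_names_pipeline
environment: production
debug: false
paths:
  root_dir: .
  configs_dir: ./config
  data_dir: ./data/dataset
  models_dir: ./data/models
  outputs_dir: ./data/outputs
  logs_dir: ./data/logs
  checkpoints_dir: ./data/checkpoints
llm:
  requests_per_minute: 30
"""

config = PipelineConfig(**yaml.safe_load(raw))
assert isinstance(config.paths.logs_dir, Path)  # strings are coerced to Path
assert config.processing.batch_size == 1000     # unspecified sections keep defaults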
+14
@@ -0,0 +1,14 @@
from pydantic import BaseModel, Field


class ProcessingConfig(BaseModel):
    """Data processing pipeline configuration"""

    batch_size: int = 1000
    max_workers: int = 4
    checkpoint_interval: int = 5
    use_multiprocessing: bool = False
    # Field(default_factory=...) supplies a fresh list per instance
    encoding_options: list[str] = Field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
    chunk_size: int = 100_000
+23
@@ -0,0 +1,23 @@
from pathlib import Path

from pydantic import BaseModel, ConfigDict, field_validator


class ProjectPaths(BaseModel):
    """Project directory structure configuration"""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    root_dir: Path
    data_dir: Path
    models_dir: Path
    outputs_dir: Path
    logs_dir: Path
    configs_dir: Path
    checkpoints_dir: Path

    # field_validator must be the outermost decorator, above @classmethod
    @field_validator("*", mode="before")
    @classmethod
    def convert_to_path(cls, v):
        return Path(v) if not isinstance(v, Path) else v
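
A minimal check of the intended coercion (the paths are placeholders):

paths = ProjectPaths(
    root_dir="/tmp/project",  # plain strings...
    configs_dir="/tmp/project/config",
    data_dir="/tmp/project/data/dataset",
    models_dir="/tmp/project/data/models",
    outputs_dir="/tmp/project/data/outputs",
    logs_dir="/tmp/project/data/logs",
    checkpoints_dir="/tmp/project/data/checkpoints",
)
assert isinstance(paths.logs_dir, Path)  # ...come back as Path objects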