refactoring: add initial pipeline configuration and model classes

2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
+61
@@ -0,0 +1,61 @@
import logging
from pathlib import Path
from typing import Optional, Union

from core.config.config_manager import ConfigManager
from core.config.logging_config import LoggingConfig
from core.config.pipeline_config import PipelineConfig

# Module-level singleton so all pipeline stages share one configuration
config_manager = ConfigManager()


def get_config() -> PipelineConfig:
    """Get the global configuration instance"""
    return config_manager.get_config()


def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig:
    """Load configuration from the specified path, falling back to the default"""
    if config_path:
        return config_manager.load_config(Path(config_path))
    return config_manager.get_config()


def setup_logging(config: PipelineConfig) -> None:
    """Set up logging based on configuration"""
    # Create the logs directory
    log_dir = config.paths.logs_dir
    log_dir.mkdir(parents=True, exist_ok=True)

    # Resolve the configured level name, defaulting to INFO
    log_level = getattr(logging, config.logging.level.upper(), logging.INFO)
    formatter = logging.Formatter(config.logging.format)

    # Configure the root logger, clearing any handlers from earlier setup
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)
    root_logger.handlers.clear()

    # Console handler
    if config.logging.console_logging:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        root_logger.addHandler(console_handler)

    # Rotating file handler
    if config.logging.file_logging:
        from logging.handlers import RotatingFileHandler

        log_file_path = log_dir / config.logging.log_file
        file_handler = RotatingFileHandler(
            log_file_path,
            maxBytes=config.logging.max_log_size,
            backupCount=config.logging.backup_count,
        )
        file_handler.setFormatter(formatter)
        root_logger.addHandler(file_handler)
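
Usage sketch for the module above (assuming it is the package init importable as core.config; the config path and log message are placeholders, not taken from this commit):

from core.config import load_config, setup_logging

config = load_config("config/pipeline.yaml")  # falls back to defaults if the file is missing
setup_logging(config)

import logging
logging.getLogger(__name__).info("Pipeline %s v%s ready", config.name, config.version)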
+145
@@ -0,0 +1,145 @@
import json
import logging
from pathlib import Path
from typing import Optional, Union, Dict, Any

import yaml

from core.config.pipeline_config import PipelineConfig
from core.config.project_paths import ProjectPaths


class ConfigManager:
    """Centralized configuration management"""

    def __init__(self, config_path: Optional[Union[str, Path]] = None):
        self.config_path = Path(config_path) if config_path else self._find_config_file()
        self._config: Optional[PipelineConfig] = None
        self._setup_default_paths()

    @classmethod
    def _find_config_file(cls) -> Path:
        """Find a configuration file in the standard locations"""
        possible_paths = [
            Path.cwd() / "config" / "pipeline.yaml",
            Path.cwd() / "config" / "pipeline.yml",
            Path.cwd() / "pipeline.yaml",
            Path(__file__).parent.parent.parent / "config" / "pipeline.yaml",
        ]
        for path in possible_paths:
            if path.exists():
                return path
        # Return the default path if none was found
        return Path.cwd() / "config" / "pipeline.yaml"

    def _setup_default_paths(self):
        """Set up default project paths relative to the repository root"""
        root_dir = Path(__file__).parent.parent.parent
        self.default_paths = ProjectPaths(
            root_dir=root_dir,
            configs_dir=root_dir / "config",
            data_dir=root_dir / "data" / "dataset",
            models_dir=root_dir / "data" / "models",
            outputs_dir=root_dir / "data" / "outputs",
            logs_dir=root_dir / "data" / "logs",
            checkpoints_dir=root_dir / "data" / "checkpoints",
        )

    def load_config(self, config_path: Optional[Path] = None) -> PipelineConfig:
        """Load configuration from file, falling back to defaults on any error"""
        if config_path:
            self.config_path = config_path
        if not self.config_path.exists():
            logging.warning(f"Config file not found: {self.config_path}. Using defaults.")
            return self._create_default_config()
        try:
            with open(self.config_path, "r") as f:
                if self.config_path.suffix.lower() in [".yaml", ".yml"]:
                    config_data = yaml.safe_load(f)
                else:
                    config_data = json.load(f)
            # Ensure paths are always present, even in a minimal config file
            if "paths" not in config_data:
                config_data["paths"] = self.default_paths.model_dump()
            self._config = PipelineConfig(**config_data)
            return self._config
        except Exception as e:
            logging.error(f"Failed to load config from {self.config_path}: {e}")
            return self._create_default_config()

    def _create_default_config(self) -> PipelineConfig:
        """Create a default configuration"""
        return PipelineConfig(paths=self.default_paths)

    def save_config(self, config: PipelineConfig, path: Optional[Path] = None):
        """Save configuration to file"""
        save_path = path or self.config_path
        save_path.parent.mkdir(parents=True, exist_ok=True)
        config_dict = config.model_dump()
        # Convert Path objects to strings for serialization
        if "paths" in config_dict:
            for key, value in config_dict["paths"].items():
                if isinstance(value, Path):
                    config_dict["paths"][key] = str(value)
        try:
            with open(save_path, "w") as f:
                if save_path.suffix.lower() in [".yaml", ".yml"]:
                    yaml.dump(config_dict, f, default_flow_style=False, indent=2)
                else:
                    json.dump(config_dict, f, indent=2)
            logging.info(f"Configuration saved to {save_path}")
        except Exception as e:
            logging.error(f"Failed to save config to {save_path}: {e}")

    def get_config(self) -> PipelineConfig:
        """Get the current configuration, loading it if necessary"""
        if self._config is None:
            self._config = self.load_config()
        return self._config

    def update_config(self, updates: Dict[str, Any]):
        """Update the configuration with new values"""
        config = self.get_config()
        # Deep-merge the updates into the current configuration
        config_dict = config.model_dump()
        self._deep_update(config_dict, updates)
        self._config = PipelineConfig(**config_dict)

    def _deep_update(self, base_dict: Dict, update_dict: Dict):
        """Recursively update nested dictionaries"""
        for key, value in update_dict.items():
            if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
                self._deep_update(base_dict[key], value)
            else:
                base_dict[key] = value

    def get_environment_config(self, env: str) -> PipelineConfig:
        """Load environment-specific configuration layered over the base config"""
        env_config_path = self.config_path.parent / f"pipeline.{env}.yaml"
        if env_config_path.exists():
            base_config = self.load_config()
            env_config = self.load_config(env_config_path)
            # Merge configurations: environment values override base values
            base_dict = base_config.model_dump()
            self._deep_update(base_dict, env_config.model_dump())
            self._config = PipelineConfig(**base_dict)
            return self._config
        return self.get_config()
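
As a quick illustration of the deep-merge and environment-overlay behavior (the override values and environment name below are made up):

manager = ConfigManager()

# Nested keys deep-merge rather than replacing whole sections
manager.update_config({"llm": {"requests_per_minute": 120}, "debug": False})
config = manager.get_config()
assert config.llm.requests_per_minute == 120
assert config.llm.retry_attempts == 3  # untouched sibling field keeps its default

# Looks for pipeline.production.yaml next to the base config file;
# falls back to the base config if that file does not exist
prod_config = manager.get_environment_config("production")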
+22
@@ -0,0 +1,22 @@
from typing import Dict

from pydantic import BaseModel, Field


class DataConfig(BaseModel):
    """Data handling configuration"""

    input_file: str = "names.csv"
    # Field(default_factory=...) supplies a fresh dict per instance
    output_files: Dict[str, str] = Field(
        default_factory=lambda: {
            "featured": "names_featured.csv",
            "evaluation": "names_evaluation.csv",
            "males": "names_males.csv",
            "females": "names_females.csv",
        }
    )
    split_evaluation: bool = True
    split_by_gender: bool = True
    evaluation_fraction: float = 0.2
    random_seed: int = 42
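
For context, a hedged sketch of how these file names might combine with the project paths (the consuming code is not part of this excerpt, so the join below is an assumption):

from pathlib import Path

data_cfg = DataConfig()
outputs_dir = Path("data/outputs")  # in practice this would be config.paths.outputs_dir
evaluation_path = outputs_dir / data_cfg.output_files["evaluation"]
# -> data/outputs/names_evaluation.csv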
+13
@@ -0,0 +1,13 @@
from pydantic import BaseModel, ConfigDict


class LLMConfig(BaseModel):
    """LLM annotation configuration"""

    # Allow the model_name field despite pydantic v2's reserved "model_" prefix
    model_config = ConfigDict(protected_namespaces=())

    model_name: str = "mistral:7b"
    requests_per_minute: int = 60
    requests_per_second: int = 2
    retry_attempts: int = 3
    timeout_seconds: int = 30
    max_concurrent_requests: int = 2
    enable_rate_limiting: bool = False
+13
@@ -0,0 +1,13 @@
from pydantic import BaseModel


class LoggingConfig(BaseModel):
    """Logging configuration"""

    level: str = "INFO"
    format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    file_logging: bool = True
    console_logging: bool = True
    log_file: str = "pipeline.log"
    max_log_size: int = 10 * 1024 * 1024  # 10 MB
    backup_count: int = 5
+29
@@ -0,0 +1,29 @@
from pydantic import BaseModel, ConfigDict, Field

from core.config.logging_config import LoggingConfig
from core.config.data_config import DataConfig
from core.config.llm_config import LLMConfig
from core.config.processing_config import ProcessingConfig
from core.config.project_paths import ProjectPaths


class PipelineConfig(BaseModel):
    """Main pipeline configuration"""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: str = "drc_names_pipeline"
    version: str = "1.0.0"
    description: str = "DRC Names NLP Processing Pipeline"
    paths: ProjectPaths
    stages: list[str] = Field(default_factory=list)
    processing: ProcessingConfig = Field(default_factory=ProcessingConfig)
    llm: LLMConfig = Field(default_factory=LLMConfig)
    data: DataConfig = Field(default_factory=DataConfig)
    logging: LoggingConfig = Field(default_factory=LoggingConfig)

    # Environment-specific settings
    environment: str = "development"
    debug: bool = True
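
To illustrate the full model, a minimal sketch parsing a hypothetical pipeline.yaml (embedded as a string so the example is self-contained; the values are illustrative, not taken from the repository):

import yaml
from pathlib import Path

raw = """
name: drc_names_pipeline
environment: production
debug: false
paths:
  root_dir: .
  configs_dir: ./config
  data_dir: ./data/dataset
  models_dir: ./data/models
  outputs_dir: ./data/outputs
  logs_dir: ./data/logs
  checkpoints_dir: ./data/checkpoints
llm:
  requests_per_minute: 30
"""

config = PipelineConfig(**yaml.safe_load(raw))
assert isinstance(config.paths.logs_dir, Path)  # strings are coerced to Path
assert config.processing.batch_size == 1000     # unspecified sections keep defaults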
+14
@@ -0,0 +1,14 @@
from pydantic import BaseModel, Field


class ProcessingConfig(BaseModel):
    """Data processing pipeline configuration"""

    batch_size: int = 1000
    max_workers: int = 4
    checkpoint_interval: int = 5
    use_multiprocessing: bool = False
    # Field(default_factory=...) supplies a fresh list per instance
    encoding_options: list[str] = Field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
    chunk_size: int = 100_000
+23
@@ -0,0 +1,23 @@
from pathlib import Path

from pydantic import BaseModel, ConfigDict, field_validator


class ProjectPaths(BaseModel):
    """Project directory structure configuration"""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    root_dir: Path
    data_dir: Path
    models_dir: Path
    outputs_dir: Path
    logs_dir: Path
    configs_dir: Path
    checkpoints_dir: Path

    # field_validator must be the outermost decorator, above @classmethod
    @field_validator("*", mode="before")
    @classmethod
    def convert_to_path(cls, v):
        return Path(v) if not isinstance(v, Path) else v
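
A minimal check of the intended coercion (the paths are placeholders):

paths = ProjectPaths(
    root_dir="/tmp/project",  # plain strings...
    configs_dir="/tmp/project/config",
    data_dir="/tmp/project/data/dataset",
    models_dir="/tmp/project/data/models",
    outputs_dir="/tmp/project/data/outputs",
    logs_dir="/tmp/project/data/logs",
    checkpoints_dir="/tmp/project/data/checkpoints",
)
assert isinstance(paths.logs_dir, Path)  # ...come back as Path objects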