refactor: add initial pipeline configuration and model classes
@@ -0,0 +1,61 @@
import logging
from pathlib import Path
from typing import Optional, Union

from core.config.config_manager import ConfigManager
from core.config.pipeline_config import PipelineConfig

config_manager = ConfigManager()


def get_config() -> PipelineConfig:
    """Get the global configuration instance"""
    return config_manager.get_config()


def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig:
    """Load configuration from the specified path"""
    if config_path:
        return config_manager.load_config(Path(config_path))
    return config_manager.get_config()


def setup_logging(config: PipelineConfig) -> None:
    """Set up logging based on configuration"""

    # Create the logs directory
    log_dir = config.paths.logs_dir
    log_dir.mkdir(parents=True, exist_ok=True)

    # Resolve the configured log level, defaulting to INFO
    log_level = getattr(logging, config.logging.level.upper(), logging.INFO)

    # Create formatter
    formatter = logging.Formatter(config.logging.format)

    # Configure the root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)

    # Clear existing handlers so repeated calls don't duplicate output
    root_logger.handlers.clear()

    # Console handler
    if config.logging.console_logging:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        root_logger.addHandler(console_handler)

    # Rotating file handler
    if config.logging.file_logging:
        from logging.handlers import RotatingFileHandler

        log_file_path = log_dir / config.logging.log_file
        file_handler = RotatingFileHandler(
            log_file_path,
            maxBytes=config.logging.max_log_size,
            backupCount=config.logging.backup_count,
        )
        file_handler.setFormatter(formatter)
        root_logger.addHandler(file_handler)
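A minimal usage sketch of these entry points; the YAML path is illustrative, and defaults apply when no file exists:

import logging

from core.config import load_config, setup_logging

config = load_config("config/pipeline.yaml")   # or load_config() for auto-discovery
setup_logging(config)                          # console + rotating file handlers
logging.getLogger(__name__).info("pipeline configured")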
@@ -0,0 +1,145 @@
import json
import logging
from pathlib import Path
from typing import Optional, Union, Dict, Any

import yaml

from core.config.pipeline_config import PipelineConfig
from core.config.project_paths import ProjectPaths


class ConfigManager:
    """Centralized configuration management"""

    def __init__(self, config_path: Optional[Union[str, Path]] = None):
        # Coerce str inputs to Path so .exists() and .suffix work later
        self.config_path = Path(config_path) if config_path else self._find_config_file()
        self._config: Optional[PipelineConfig] = None
        self._setup_default_paths()

    @classmethod
    def _find_config_file(cls) -> Path:
        """Find a configuration file in standard locations"""
        possible_paths = [
            Path.cwd() / "config" / "pipeline.yaml",
            Path.cwd() / "config" / "pipeline.yml",
            Path.cwd() / "pipeline.yaml",
            Path(__file__).parent.parent.parent / "config" / "pipeline.yaml",
        ]

        for path in possible_paths:
            if path.exists():
                return path

        # Fall back to the default path if none exists
        return Path.cwd() / "config" / "pipeline.yaml"

    def _setup_default_paths(self):
        """Set up default project paths"""
        root_dir = Path(__file__).parent.parent.parent
        self.default_paths = ProjectPaths(
            root_dir=root_dir,
            configs_dir=root_dir / "config",
            data_dir=root_dir / "data" / "dataset",
            models_dir=root_dir / "data" / "models",
            outputs_dir=root_dir / "data" / "outputs",
            logs_dir=root_dir / "data" / "logs",
            checkpoints_dir=root_dir / "data" / "checkpoints",
        )

    def load_config(self, config_path: Optional[Path] = None) -> PipelineConfig:
        """Load configuration from file"""
        if config_path:
            self.config_path = config_path

        if not self.config_path.exists():
            logging.warning(f"Config file not found: {self.config_path}. Using defaults.")
            return self._create_default_config()

        try:
            with open(self.config_path, "r") as f:
                if self.config_path.suffix.lower() in [".yaml", ".yml"]:
                    config_data = yaml.safe_load(f)
                else:
                    config_data = json.load(f)

            # Ensure paths are always present
            if "paths" not in config_data:
                config_data["paths"] = self.default_paths.model_dump()

            self._config = PipelineConfig(**config_data)
            return self._config

        except Exception as e:
            logging.error(f"Failed to load config from {self.config_path}: {e}")
            return self._create_default_config()

    def _create_default_config(self) -> PipelineConfig:
        """Create the default configuration"""
        return PipelineConfig(paths=self.default_paths)

    def save_config(self, config: PipelineConfig, path: Optional[Path] = None):
        """Save configuration to file"""
        save_path = path or self.config_path
        save_path.parent.mkdir(parents=True, exist_ok=True)

        config_dict = config.model_dump()

        # Convert Path objects to strings for serialization
        if "paths" in config_dict:
            for key, value in config_dict["paths"].items():
                if isinstance(value, Path):
                    config_dict["paths"][key] = str(value)

        try:
            with open(save_path, "w") as f:
                if save_path.suffix.lower() in [".yaml", ".yml"]:
                    yaml.dump(config_dict, f, default_flow_style=False, indent=2)
                else:
                    json.dump(config_dict, f, indent=2)

            logging.info(f"Configuration saved to {save_path}")

        except Exception as e:
            logging.error(f"Failed to save config to {save_path}: {e}")

    def get_config(self) -> PipelineConfig:
        """Get the current configuration, loading it if necessary"""
        if self._config is None:
            self._config = self.load_config()
        return self._config

    def update_config(self, updates: Dict[str, Any]):
        """Update configuration with new values"""
        config = self.get_config()

        # Deep-update the configuration dictionary, then re-validate
        config_dict = config.model_dump()
        self._deep_update(config_dict, updates)

        self._config = PipelineConfig(**config_dict)

    def _deep_update(self, base_dict: Dict, update_dict: Dict):
        """Recursively update nested dictionaries"""
        for key, value in update_dict.items():
            if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
                self._deep_update(base_dict[key], value)
            else:
                base_dict[key] = value

    def get_environment_config(self, env: str) -> PipelineConfig:
        """Load an environment-specific configuration overlay"""
        env_config_path = self.config_path.parent / f"pipeline.{env}.yaml"

        if env_config_path.exists():
            base_config = self.load_config()
            env_config = self.load_config(env_config_path)

            # Merge the environment overrides into the base configuration
            base_dict = base_config.model_dump()
            env_dict = env_config.model_dump()
            self._deep_update(base_dict, env_dict)

            return PipelineConfig(**base_dict)

        return self.get_config()
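For context, a short sketch of how ConfigManager is meant to be driven; the override key is illustrative:

from core.config.config_manager import ConfigManager

manager = ConfigManager()                                 # auto-discovers config/pipeline.yaml
config = manager.get_config()                             # lazy-loads, falling back to defaults
manager.update_config({"llm": {"timeout_seconds": 60}})   # deep-merges a nested override
manager.save_config(manager.get_config())                 # writes back to config_path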
@@ -0,0 +1,22 @@
from typing import Dict

from pydantic import BaseModel, Field


class DataConfig(BaseModel):
    """Data handling configuration"""

    input_file: str = "names.csv"
    # pydantic.Field (not dataclasses.field) supplies the default factory on a BaseModel
    output_files: Dict[str, str] = Field(
        default_factory=lambda: {
            "featured": "names_featured.csv",
            "evaluation": "names_evaluation.csv",
            "males": "names_males.csv",
            "females": "names_females.csv",
        }
    )
    split_evaluation: bool = True
    split_by_gender: bool = True
    evaluation_fraction: float = 0.2
    random_seed: int = 42
@@ -0,0 +1,13 @@
from pydantic import BaseModel, ConfigDict


class LLMConfig(BaseModel):
    """LLM annotation configuration"""

    # "model_name" clashes with pydantic v2's protected "model_" namespace,
    # so protected namespaces are cleared to silence the warning
    model_config = ConfigDict(protected_namespaces=())

    model_name: str = "mistral:7b"
    requests_per_minute: int = 60
    requests_per_second: int = 2
    retry_attempts: int = 3
    timeout_seconds: int = 30
    max_concurrent_requests: int = 2
    enable_rate_limiting: bool = False
@@ -0,0 +1,13 @@
from pydantic import BaseModel


class LoggingConfig(BaseModel):
    """Logging configuration"""

    level: str = "INFO"
    format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    file_logging: bool = True
    console_logging: bool = True
    log_file: str = "pipeline.log"
    max_log_size: int = 10 * 1024 * 1024  # 10MB
    backup_count: int = 5
@@ -0,0 +1,29 @@
from pydantic import BaseModel, ConfigDict

from core.config.logging_config import LoggingConfig
from core.config.data_config import DataConfig
from core.config.llm_config import LLMConfig
from core.config.processing_config import ProcessingConfig
from core.config.project_paths import ProjectPaths


class PipelineConfig(BaseModel):
    """Main pipeline configuration"""

    # pydantic v2 style for what was `class Config: arbitrary_types_allowed = True`
    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: str = "drc_names_pipeline"
    version: str = "1.0.0"
    description: str = "DRC Names NLP Processing Pipeline"

    paths: ProjectPaths
    stages: list[str] = []
    processing: ProcessingConfig = ProcessingConfig()
    llm: LLMConfig = LLMConfig()
    data: DataConfig = DataConfig()
    logging: LoggingConfig = LoggingConfig()

    # Environment-specific settings
    environment: str = "development"
    debug: bool = True
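As a reference point, a hypothetical config/pipeline.yaml these models would accept; every key mirrors a field above, only paths has no default, and the directory values are illustrative:

# Hypothetical pipeline.yaml; omitted keys fall back to the model defaults.
name: drc_names_pipeline
environment: production
debug: false
paths:
  root_dir: /srv/drc_names
  configs_dir: /srv/drc_names/config
  data_dir: /srv/drc_names/data/dataset
  models_dir: /srv/drc_names/data/models
  outputs_dir: /srv/drc_names/data/outputs
  logs_dir: /srv/drc_names/data/logs
  checkpoints_dir: /srv/drc_names/data/checkpoints
llm:
  model_name: mistral:7b
  enable_rate_limiting: true
logging:
  level: DEBUG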
@@ -0,0 +1,14 @@
from typing import List

from pydantic import BaseModel, Field


class ProcessingConfig(BaseModel):
    """Data processing pipeline configuration"""

    batch_size: int = 1000
    max_workers: int = 4
    checkpoint_interval: int = 5
    use_multiprocessing: bool = False
    # pydantic.Field (not dataclasses.field) supplies the default factory on a BaseModel
    encoding_options: List[str] = Field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
    chunk_size: int = 100_000
@@ -0,0 +1,23 @@
from pathlib import Path

from pydantic import BaseModel, ConfigDict, field_validator


class ProjectPaths(BaseModel):
    """Project directory structure configuration"""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    root_dir: Path
    data_dir: Path
    models_dir: Path
    outputs_dir: Path
    logs_dir: Path
    configs_dir: Path
    checkpoints_dir: Path

    # In pydantic v2, field_validator must be the outermost decorator
    @field_validator("*", mode="before")
    @classmethod
    def convert_to_path(cls, v):
        return Path(v) if not isinstance(v, Path) else v
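A small sketch of the validator's effect: plain strings, as they arrive from YAML or JSON, are coerced to Path objects at construction time (directory values are illustrative):

from pathlib import Path

paths = ProjectPaths(
    root_dir="/tmp/proj",
    data_dir="/tmp/proj/data",
    models_dir="/tmp/proj/models",
    outputs_dir="/tmp/proj/outputs",
    logs_dir="/tmp/proj/logs",
    configs_dir="/tmp/proj/config",
    checkpoints_dir="/tmp/proj/checkpoints",
)
assert isinstance(paths.data_dir, Path)   # coerced from str by convert_to_path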
@@ -0,0 +1,57 @@
import logging
from contextlib import contextmanager
from pathlib import Path

from core.config import get_config, PipelineConfig


@contextmanager
def temporary_config_override(**overrides):
    """Context manager for temporarily overriding configuration"""
    config = get_config()
    original_values = {}

    # Store the original values, then apply the overrides
    for key, value in overrides.items():
        if hasattr(config, key):
            original_values[key] = getattr(config, key)
            setattr(config, key, value)

    try:
        yield config
    finally:
        # Restore the original values
        for key, value in original_values.items():
            setattr(config, key, value)


def ensure_directories(config: PipelineConfig) -> None:
    """Ensure all required directories exist"""
    directories = [
        config.paths.data_dir,
        config.paths.models_dir,
        config.paths.outputs_dir,
        config.paths.logs_dir,
        config.paths.configs_dir,
        config.paths.checkpoints_dir,
    ]

    for directory in directories:
        Path(directory).mkdir(parents=True, exist_ok=True)

    logging.info("Ensured all required directories exist")


def get_data_file_path(filename: str, config: PipelineConfig) -> Path:
    """Get the full path for a data file"""
    return config.paths.data_dir / filename


def get_model_file_path(filename: str, config: PipelineConfig) -> Path:
    """Get the full path for a model file"""
    return config.paths.models_dir / filename


def get_output_file_path(filename: str, config: PipelineConfig) -> Path:
    """Get the full path for an output file"""
    return config.paths.outputs_dir / filename
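A quick sketch of the override helper; only top-level attributes of PipelineConfig are swapped, and the originals are restored on exit (the import path and run_stage are assumptions):

from core.config.utils import temporary_config_override   # import path assumed

with temporary_config_override(debug=False, environment="testing") as cfg:
    run_stage(cfg)   # run_stage is a placeholder for an actual pipeline stage
# cfg.debug and cfg.environment revert to their original values here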
@@ -0,0 +1,62 @@
import logging
from pathlib import Path
from typing import Optional, Union, Iterator

import pandas as pd

from core.config.pipeline_config import PipelineConfig


class DataLoader:
    """Reusable data loading utilities"""

    def __init__(self, config: PipelineConfig):
        self.config = config

    def load_csv_chunked(
        self, filepath: Union[str, Path], chunk_size: Optional[int] = None
    ) -> Iterator[pd.DataFrame]:
        """Load a CSV file in chunks for memory efficiency"""
        chunk_size = chunk_size or self.config.processing.chunk_size
        encodings = self.config.processing.encoding_options

        filepath = Path(filepath)

        # Try each configured encoding in order. Note: if a decode error
        # surfaces only after some chunks were already yielded, those rows
        # are yielded again under the next encoding; in practice a wrong
        # encoding usually fails on the first chunk.
        for encoding in encodings:
            try:
                logging.info(f"Attempting to read {filepath} with encoding: {encoding}")

                chunk_iter = pd.read_csv(
                    filepath, encoding=encoding, chunksize=chunk_size, on_bad_lines="skip"
                )

                for i, chunk in enumerate(chunk_iter):
                    logging.debug(f"Processing chunk {i + 1}")
                    yield chunk

                logging.info(f"Successfully read {filepath} with encoding: {encoding}")
                return

            except Exception as e:
                logging.warning(f"Failed with encoding {encoding}: {e}")
                continue

        raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")

    def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
        """Load a complete CSV file into memory"""
        chunks = list(self.load_csv_chunked(filepath))
        return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

    @staticmethod
    def save_csv(
        df: pd.DataFrame, filepath: Union[str, Path], create_dirs: bool = True
    ) -> None:
        """Save a DataFrame to CSV with proper handling"""
        filepath = Path(filepath)

        if create_dirs:
            filepath.parent.mkdir(parents=True, exist_ok=True)

        df.to_csv(filepath, index=False, encoding="utf-8")
        logging.info(f"Saved {len(df)} rows to {filepath}")
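A minimal sketch of the chunked loader, assuming the helper functions above (import paths assumed) and the default names.csv input from DataConfig:

from core.config import load_config
from core.config.utils import get_data_file_path   # import path assumed

config = load_config()
loader = DataLoader(config)

# Stream the CSV in configured chunk sizes instead of loading it whole
total_rows = 0
for chunk in loader.load_csv_chunked(get_data_file_path("names.csv", config)):
    total_rows += len(chunk)
print(f"read {total_rows} rows")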
@@ -0,0 +1,3 @@
@@ -0,0 +1,24 @@
from core.config.pipeline_config import PipelineConfig


class PromptManager:
    """Manage prompts for LLM operations"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.prompts_dir = self.config.paths.configs_dir / "prompts"

    def load_prompt(self, prompt_name: str = "default") -> str:
        """Load a prompt template"""
        prompt_file = self.prompts_dir / f"{prompt_name}.txt"

        if not prompt_file.exists():
            # Fall back to prompt.txt in the project root
            fallback_file = self.config.paths.root_dir / "prompt.txt"
            if fallback_file.exists():
                prompt_file = fallback_file
            else:
                raise FileNotFoundError(f"Prompt file not found: {prompt_file}")

        with open(prompt_file, "r", encoding="utf-8") as f:
            return f.read().strip()
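Usage is a two-liner; this assumes a config/prompts/default.txt template, with prompt.txt at the project root as the coded fallback:

from core.config import load_config

config = load_config()
template = PromptManager(config).load_prompt("default")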
@@ -0,0 +1,56 @@
import threading
import time
from dataclasses import dataclass
from queue import Queue


@dataclass
class RateLimitConfig:
    """Configuration for rate limiting LLM requests"""

    requests_per_minute: int = 60
    requests_per_second: int = 2
    burst_limit: int = 5


class RateLimiter:
    """Thread-safe rate limiter for LLM requests"""

    def __init__(self, config: RateLimitConfig):
        self.config = config
        self.request_times = Queue()
        self.lock = threading.Lock()
        self.last_request_time = 0.0

    def wait_if_needed(self):
        """Block the caller as long as needed to respect the rate limits"""
        with self.lock:
            current_time = time.time()

            # Enforce the per-second limit via a minimum inter-request interval
            time_since_last = current_time - self.last_request_time
            min_interval = 1.0 / self.config.requests_per_second

            if time_since_last < min_interval:
                sleep_time = min_interval - time_since_last
                time.sleep(sleep_time)
                current_time = time.time()

            # Drop timestamps older than one minute; peeking .queue is safe
            # here because all access happens under self.lock
            while not self.request_times.empty():
                if current_time - self.request_times.queue[0] > 60:
                    self.request_times.get()
                else:
                    break

            # Enforce the per-minute limit against the sliding window
            if self.request_times.qsize() >= self.config.requests_per_minute:
                oldest_request = self.request_times.queue[0]
                wait_time = 60 - (current_time - oldest_request)
                if wait_time > 0:
                    time.sleep(wait_time)
                    current_time = time.time()

            # Record this request
            self.request_times.put(current_time)
            self.last_request_time = current_time
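A sketch of the limiter guarding request dispatch; call_llm and the inputs are placeholders, not part of this commit:

limiter = RateLimiter(RateLimitConfig(requests_per_minute=60, requests_per_second=2))

for name in ["mbuyi", "tshiala", "kalala"]:   # illustrative inputs
    limiter.wait_if_needed()                  # blocks until a slot is free
    call_llm(name)                            # placeholder for the real request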
@@ -0,0 +1,162 @@
from typing import Optional, Dict, Tuple

import pandas as pd


class RegionMapper:
    """Reusable region mapping utilities"""

    def __init__(self, mapping: Optional[Dict] = None):
        self.mapping = mapping or REGION_MAPPING

    def map_region_to_province(self, region: str) -> str:
        """Map a region to its province"""
        region_lower = str(region).lower().strip()
        return self.mapping.get(region_lower, ("AUTRES", "AUTRES"))[1].lower()

    def map_regions_vectorized(self, regions: pd.Series) -> pd.Series:
        """Vectorized region-to-province mapping"""
        # Normalize the same way as the scalar version: lowercase and strip
        return regions.str.lower().str.strip().map(
            lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
        )

    @staticmethod
    def get_provinces():
        """List the province labels the mapper can produce"""
        return [
            "kinshasa",
            "bas-congo",
            "bandundu",
            "katanga",
            "equateur",
            "province-orientale",
            "maniema",
            "nord-kivu",
            "sud-kivu",
            "kasai-occidental",
            "kasai-oriental",
        ]


# DRC Region to Province Mapping
REGION_MAPPING: Dict[str, Tuple[str, str]] = {
    # Kinshasa
    "kinshasa": ("KINSHASA", "KINSHASA"),
    "kinshasa-centre": ("KINSHASA", "KINSHASA"),
    "kinshasa-est": ("KINSHASA", "KINSHASA"),
    "kinshasa-funa": ("KINSHASA", "KINSHASA"),
    "kinshasa-lukunga": ("KINSHASA", "KINSHASA"),
    "kinshasa-mont-amba": ("KINSHASA", "KINSHASA"),
    "kinshasa-ouest": ("KINSHASA", "KINSHASA"),
    "kinshasa-plateau": ("KINSHASA", "KINSHASA"),
    "kinshasa-tshangu": ("KINSHASA", "KINSHASA"),
    # Bas-Congo / Kongo-Central → BAS-CONGO
    "bas-congo": ("KONGO-CENTRAL", "BAS-CONGO"),
    "bas-congo-1": ("KONGO-CENTRAL", "BAS-CONGO"),
    "bas-congo-2": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central-1": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central-2": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central-3": ("KONGO-CENTRAL", "BAS-CONGO"),
    # Kwilu, Kwango, Mai-Ndombe → BANDUNDU
    "bandundu": ("BANDUNDU", "BANDUNDU"),
    "bandundu-1": ("BANDUNDU", "BANDUNDU"),
    "bandundu-2": ("BANDUNDU", "BANDUNDU"),
    "bandundu-3": ("BANDUNDU", "BANDUNDU"),
    "kwilu": ("KWILU", "BANDUNDU"),
    "kwilu-1": ("KWILU", "BANDUNDU"),
    "kwilu-2": ("KWILU", "BANDUNDU"),
    "kwilu-3": ("KWILU", "BANDUNDU"),
    "kwango": ("KWANGO", "BANDUNDU"),
    "kwango-1": ("KWANGO", "BANDUNDU"),
    "kwango-2": ("KWANGO", "BANDUNDU"),
    "mai-ndombe": ("MAI-NDOMBE", "BANDUNDU"),
    "mai-ndombe-1": ("MAI-NDOMBE", "BANDUNDU"),
    "mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
    "mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
    # Haut-Katanga, Haut-Lomami, Lualaba, Tanganyika → KATANGA
    "haut-katanga": ("HAUT-KATANGA", "KATANGA"),
    "haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
    "haut-katanga-2": ("HAUT-KATANGA", "KATANGA"),
    "haut-lomami": ("HAUT-LOMAMI", "KATANGA"),
    "haut-lomami-1": ("HAUT-LOMAMI", "KATANGA"),
    "haut-lomami-2": ("HAUT-LOMAMI", "KATANGA"),
    "lualaba": ("LUALABA", "KATANGA"),
    "lualaba-1": ("LUALABA", "KATANGA"),
    "lualaba-2": ("LUALABA", "KATANGA"),
    "lualaba-74-corrige-922a": ("LUALABA", "KATANGA"),
    "tanganyika": ("TANGANYIKA", "KATANGA"),
    "tanganyika-1": ("TANGANYIKA", "KATANGA"),
    "tanganyika-2": ("TANGANYIKA", "KATANGA"),
    # Mongala, Nord-Ubangi, Sud-Ubangi, Tshuapa → EQUATEUR
    "equateur": ("EQUATEUR", "EQUATEUR"),
    "equateur-1": ("EQUATEUR", "EQUATEUR"),
    "equateur-2": ("EQUATEUR", "EQUATEUR"),
    "equateur-3": ("EQUATEUR", "EQUATEUR"),
    "equateur-4": ("EQUATEUR", "EQUATEUR"),
    "equateur-5": ("EQUATEUR", "EQUATEUR"),
    "mongala": ("MONGALA", "EQUATEUR"),
    "mongala-1": ("MONGALA", "EQUATEUR"),
    "mongala-2": ("MONGALA", "EQUATEUR"),
    "nord-ubangi": ("NORD-UBANGI", "EQUATEUR"),
    "nord-ubangi-1": ("NORD-UBANGI", "EQUATEUR"),
    "nord-ubangi-2": ("NORD-UBANGI", "EQUATEUR"),
    "sud-ubangi": ("SUD-UBANGI", "EQUATEUR"),
    "sud-ubangi-1": ("SUD-UBANGI", "EQUATEUR"),
    "sud-ubangi-2": ("SUD-UBANGI", "EQUATEUR"),
    "tshuapa": ("TSHUAPA", "EQUATEUR"),
    "tshuapa-1": ("TSHUAPA", "EQUATEUR"),
    "tshuapa-2": ("TSHUAPA", "EQUATEUR"),
    # Province-Orientale
    "province-orientale": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
    "province-orientale-1": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
    "province-orientale-2": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
    "province-orientale-3": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
    "province-orientale-4": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
    "haut-uele": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
    "haut-uele-1": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
    "haut-uele-2": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
    "bas-uele": ("BAS-UELE", "PROVINCE-ORIENTALE"),
    "bas-uele-1": ("BAS-UELE", "PROVINCE-ORIENTALE"),
    "bas-uele-2": ("BAS-UELE", "PROVINCE-ORIENTALE"),
    "ituri": ("ITURI", "PROVINCE-ORIENTALE"),
    "ituri-1": ("ITURI", "PROVINCE-ORIENTALE"),
    "ituri-2": ("ITURI", "PROVINCE-ORIENTALE"),
    "tshopo": ("TSHOPO", "PROVINCE-ORIENTALE"),
    "tshopo-1": ("TSHOPO", "PROVINCE-ORIENTALE"),
    "tshopo-2": ("TSHOPO", "PROVINCE-ORIENTALE"),
    # Maniema
    "maniema": ("MANIEMA", "MANIEMA"),
    "maniema-1": ("MANIEMA", "MANIEMA"),
    "maniema-2": ("MANIEMA", "MANIEMA"),
    # Nord-Kivu
    "nord-kivu": ("NORD-KIVU", "NORD-KIVU"),
    "nord-kivu-1": ("NORD-KIVU", "NORD-KIVU"),
    "nord-kivu-2": ("NORD-KIVU", "NORD-KIVU"),
    "nord-kivu-3": ("NORD-KIVU", "NORD-KIVU"),
    # Sud-Kivu
    "sud-kivu": ("SUD-KIVU", "SUD-KIVU"),
    "sud-kivu-1": ("SUD-KIVU", "SUD-KIVU"),
    "sud-kivu-2": ("SUD-KIVU", "SUD-KIVU"),
    "sud-kivu-3": ("SUD-KIVU", "SUD-KIVU"),
    # Kasai, Kasai-Central → KASAI-OCCIDENTAL
    "kasai-occidental": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
    "kasai-occidental-1": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
    "kasai-occidental-2": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
    "kasai": ("KASAI", "KASAI-OCCIDENTAL"),
    "kasai-1": ("KASAI", "KASAI-OCCIDENTAL"),
    "kasai-2": ("KASAI", "KASAI-OCCIDENTAL"),
    "kasai-central": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    "kasai-central-1": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    "kasai-central-2": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    # Lomami, Sankuru → KASAI-ORIENTAL
    "kasai-oriental": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "kasai-oriental-1": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "kasai-oriental-2": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "kasai-oriental-3": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "lomami": ("LOMAMI", "KASAI-ORIENTAL"),
    "lomami-1": ("LOMAMI", "KASAI-ORIENTAL"),
    "lomami-2": ("LOMAMI", "KASAI-ORIENTAL"),
    "sankuru": ("SANKURU", "KASAI-ORIENTAL"),
    "sankuru-1": ("SANKURU", "KASAI-ORIENTAL"),
    "sankuru-2": ("SANKURU", "KASAI-ORIENTAL"),
}
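A quick check of the mapper on a small Series; unmapped values fall back to "autres":

import pandas as pd

mapper = RegionMapper()
regions = pd.Series(["Kwilu-2", "haut-katanga", "somewhere-else"])
print(mapper.map_regions_vectorized(regions).tolist())
# -> ['bandundu', 'katanga', 'autres']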
@@ -0,0 +1,41 @@
import json
import logging
from typing import Dict, Any

from core.config.pipeline_config import PipelineConfig


class StateManager:
    """Manage pipeline state and checkpoints"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.checkpoints_dir = self.config.paths.checkpoints_dir

    def save_state(self, state: Dict[str, Any], state_name: str) -> None:
        """Save pipeline state"""
        self.checkpoints_dir.mkdir(parents=True, exist_ok=True)
        state_file = self.checkpoints_dir / f"{state_name}.json"

        with open(state_file, "w") as f:
            json.dump(state, f, indent=2, default=str)

        logging.debug(f"Saved state to {state_file}")

    def load_state(self, state_name: str) -> Dict[str, Any]:
        """Load pipeline state"""
        state_file = self.checkpoints_dir / f"{state_name}.json"

        if not state_file.exists():
            return {}

        with open(state_file, "r") as f:
            return json.load(f)

    def clear_state(self, state_name: str) -> None:
        """Clear pipeline state"""
        state_file = self.checkpoints_dir / f"{state_name}.json"

        if state_file.exists():
            state_file.unlink()
            logging.info(f"Cleared state: {state_name}")
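A short resume-from-checkpoint sketch; the state keys and checkpoint name are illustrative:

from core.config import load_config

config = load_config()
state = StateManager(config)

state.save_state({"stage": "annotation", "last_row": 12345}, "llm_annotation")
resume = state.load_state("llm_annotation")   # returns {} when no checkpoint exists
start_row = resume.get("last_row", 0)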
@@ -0,0 +1,38 @@
from typing import Optional, Dict

import pandas as pd


class TextCleaner:
    """Reusable text cleaning utilities"""

    # Patterns listed here are applied with regex=True; all others are
    # literal substring replacements
    REGEX_PATTERNS = {"multiple_spaces", "extra_whitespace"}

    def __init__(self, patterns: Optional[Dict[str, str]] = None):
        self.patterns = patterns or {
            "null_bytes": "\x00",
            "non_breaking_spaces": "\u00a0",
            "multiple_spaces": r" +",
            "extra_whitespace": r"\s+",
        }

    def clean_text_series(self, series: pd.Series) -> pd.Series:
        """Clean a pandas Series of text data"""
        cleaned = series.astype(str)

        # Apply the cleaning patterns; regex patterns collapse whitespace
        # runs to a single space, literal patterns are replaced with a space
        for pattern_name, pattern in self.patterns.items():
            is_regex = pattern_name in self.REGEX_PATTERNS
            cleaned = cleaned.str.replace(pattern, " ", regex=is_regex)

        return cleaned.str.strip().str.lower()

    def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean all text columns in a DataFrame"""
        df = df.copy()
        text_columns = df.select_dtypes(include="object").columns

        for col in text_columns:
            df[col] = self.clean_text_series(df[col])

        return df
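And a quick check of the cleaner: null bytes and non-breaking spaces become plain spaces, whitespace runs collapse, and the result is lowercased:

import pandas as pd

cleaner = TextCleaner()
s = pd.Series(["  MBUYI\x00  Kalala ", "Tshiala\u00a0Marie"])
print(cleaner.clean_text_series(s).tolist())
# -> ['mbuyi kalala', 'tshiala marie']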