refactor: add initial pipeline configuration and model classes
@@ -0,0 +1,61 @@
import logging
from pathlib import Path
from typing import Optional, Union

from core.config.config_manager import ConfigManager
from core.config.pipeline_config import PipelineConfig

config_manager = ConfigManager()


def get_config() -> PipelineConfig:
    """Get the global configuration instance"""
    return config_manager.get_config()


def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig:
    """Load configuration from the specified path"""
    if config_path:
        return config_manager.load_config(Path(config_path))
    return config_manager.get_config()


def setup_logging(config: PipelineConfig) -> None:
    """Set up logging based on configuration"""

    # Create the logs directory
    log_dir = config.paths.logs_dir
    log_dir.mkdir(parents=True, exist_ok=True)

    # Resolve the configured log level, defaulting to INFO
    log_level = getattr(logging, config.logging.level.upper(), logging.INFO)

    # Create formatter
    formatter = logging.Formatter(config.logging.format)

    # Configure the root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)

    # Clear existing handlers so repeated calls don't duplicate output
    root_logger.handlers.clear()

    # Console handler
    if config.logging.console_logging:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        root_logger.addHandler(console_handler)

    # Rotating file handler
    if config.logging.file_logging:
        from logging.handlers import RotatingFileHandler

        log_file_path = log_dir / config.logging.log_file
        file_handler = RotatingFileHandler(
            log_file_path,
            maxBytes=config.logging.max_log_size,
            backupCount=config.logging.backup_count,
        )
        file_handler.setFormatter(formatter)
        root_logger.addHandler(file_handler)
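A minimal usage sketch of these entry points; the YAML path is illustrative, and defaults apply when no file exists:

import logging

from core.config import load_config, setup_logging

config = load_config("config/pipeline.yaml")   # or load_config() for auto-discovery
setup_logging(config)                          # console + rotating file handlers
logging.getLogger(__name__).info("pipeline configured")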
@@ -0,0 +1,145 @@
import json
import logging
from pathlib import Path
from typing import Optional, Union, Dict, Any

import yaml

from core.config.pipeline_config import PipelineConfig
from core.config.project_paths import ProjectPaths


class ConfigManager:
    """Centralized configuration management"""

    def __init__(self, config_path: Optional[Union[str, Path]] = None):
        # Coerce str inputs to Path so .exists() and .suffix work later
        self.config_path = Path(config_path) if config_path else self._find_config_file()
        self._config: Optional[PipelineConfig] = None
        self._setup_default_paths()

    @classmethod
    def _find_config_file(cls) -> Path:
        """Find a configuration file in standard locations"""
        possible_paths = [
            Path.cwd() / "config" / "pipeline.yaml",
            Path.cwd() / "config" / "pipeline.yml",
            Path.cwd() / "pipeline.yaml",
            Path(__file__).parent.parent.parent / "config" / "pipeline.yaml",
        ]

        for path in possible_paths:
            if path.exists():
                return path

        # Fall back to the default path if none exists
        return Path.cwd() / "config" / "pipeline.yaml"

    def _setup_default_paths(self):
        """Set up default project paths"""
        root_dir = Path(__file__).parent.parent.parent
        self.default_paths = ProjectPaths(
            root_dir=root_dir,
            configs_dir=root_dir / "config",
            data_dir=root_dir / "data" / "dataset",
            models_dir=root_dir / "data" / "models",
            outputs_dir=root_dir / "data" / "outputs",
            logs_dir=root_dir / "data" / "logs",
            checkpoints_dir=root_dir / "data" / "checkpoints",
        )

    def load_config(self, config_path: Optional[Path] = None) -> PipelineConfig:
        """Load configuration from file"""
        if config_path:
            self.config_path = config_path

        if not self.config_path.exists():
            logging.warning(f"Config file not found: {self.config_path}. Using defaults.")
            return self._create_default_config()

        try:
            with open(self.config_path, "r") as f:
                if self.config_path.suffix.lower() in [".yaml", ".yml"]:
                    config_data = yaml.safe_load(f)
                else:
                    config_data = json.load(f)

            # Ensure paths are always present
            if "paths" not in config_data:
                config_data["paths"] = self.default_paths.model_dump()

            self._config = PipelineConfig(**config_data)
            return self._config

        except Exception as e:
            logging.error(f"Failed to load config from {self.config_path}: {e}")
            return self._create_default_config()

    def _create_default_config(self) -> PipelineConfig:
        """Create the default configuration"""
        return PipelineConfig(paths=self.default_paths)

    def save_config(self, config: PipelineConfig, path: Optional[Path] = None):
        """Save configuration to file"""
        save_path = path or self.config_path
        save_path.parent.mkdir(parents=True, exist_ok=True)

        config_dict = config.model_dump()

        # Convert Path objects to strings for serialization
        if "paths" in config_dict:
            for key, value in config_dict["paths"].items():
                if isinstance(value, Path):
                    config_dict["paths"][key] = str(value)

        try:
            with open(save_path, "w") as f:
                if save_path.suffix.lower() in [".yaml", ".yml"]:
                    yaml.dump(config_dict, f, default_flow_style=False, indent=2)
                else:
                    json.dump(config_dict, f, indent=2)

            logging.info(f"Configuration saved to {save_path}")

        except Exception as e:
            logging.error(f"Failed to save config to {save_path}: {e}")

    def get_config(self) -> PipelineConfig:
        """Get the current configuration, loading it if necessary"""
        if self._config is None:
            self._config = self.load_config()
        return self._config

    def update_config(self, updates: Dict[str, Any]):
        """Update configuration with new values"""
        config = self.get_config()

        # Deep-update the configuration dictionary, then re-validate
        config_dict = config.model_dump()
        self._deep_update(config_dict, updates)

        self._config = PipelineConfig(**config_dict)

    def _deep_update(self, base_dict: Dict, update_dict: Dict):
        """Recursively update nested dictionaries"""
        for key, value in update_dict.items():
            if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
                self._deep_update(base_dict[key], value)
            else:
                base_dict[key] = value

    def get_environment_config(self, env: str) -> PipelineConfig:
        """Load an environment-specific configuration overlay"""
        env_config_path = self.config_path.parent / f"pipeline.{env}.yaml"

        if env_config_path.exists():
            base_config = self.load_config()
            env_config = self.load_config(env_config_path)

            # Merge the environment overrides into the base configuration
            base_dict = base_config.model_dump()
            env_dict = env_config.model_dump()
            self._deep_update(base_dict, env_dict)

            return PipelineConfig(**base_dict)

        return self.get_config()
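For context, a short sketch of how ConfigManager is meant to be driven; the override key is illustrative:

from core.config.config_manager import ConfigManager

manager = ConfigManager()                                 # auto-discovers config/pipeline.yaml
config = manager.get_config()                             # lazy-loads, falling back to defaults
manager.update_config({"llm": {"timeout_seconds": 60}})   # deep-merges a nested override
manager.save_config(manager.get_config())                 # writes back to config_path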
@@ -0,0 +1,22 @@
from typing import Dict

from pydantic import BaseModel, Field


class DataConfig(BaseModel):
    """Data handling configuration"""

    input_file: str = "names.csv"
    # pydantic.Field (not dataclasses.field) supplies the default factory on a BaseModel
    output_files: Dict[str, str] = Field(
        default_factory=lambda: {
            "featured": "names_featured.csv",
            "evaluation": "names_evaluation.csv",
            "males": "names_males.csv",
            "females": "names_females.csv",
        }
    )
    split_evaluation: bool = True
    split_by_gender: bool = True
    evaluation_fraction: float = 0.2
    random_seed: int = 42
@@ -0,0 +1,13 @@
from pydantic import BaseModel, ConfigDict


class LLMConfig(BaseModel):
    """LLM annotation configuration"""

    # "model_name" clashes with pydantic v2's protected "model_" namespace,
    # so protected namespaces are cleared to silence the warning
    model_config = ConfigDict(protected_namespaces=())

    model_name: str = "mistral:7b"
    requests_per_minute: int = 60
    requests_per_second: int = 2
    retry_attempts: int = 3
    timeout_seconds: int = 30
    max_concurrent_requests: int = 2
    enable_rate_limiting: bool = False
@@ -0,0 +1,13 @@
from pydantic import BaseModel


class LoggingConfig(BaseModel):
    """Logging configuration"""

    level: str = "INFO"
    format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    file_logging: bool = True
    console_logging: bool = True
    log_file: str = "pipeline.log"
    max_log_size: int = 10 * 1024 * 1024  # 10MB
    backup_count: int = 5
@@ -0,0 +1,29 @@
from pydantic import BaseModel, ConfigDict

from core.config.logging_config import LoggingConfig
from core.config.data_config import DataConfig
from core.config.llm_config import LLMConfig
from core.config.processing_config import ProcessingConfig
from core.config.project_paths import ProjectPaths


class PipelineConfig(BaseModel):
    """Main pipeline configuration"""

    # pydantic v2 style for what was `class Config: arbitrary_types_allowed = True`
    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: str = "drc_names_pipeline"
    version: str = "1.0.0"
    description: str = "DRC Names NLP Processing Pipeline"

    paths: ProjectPaths
    stages: list[str] = []
    processing: ProcessingConfig = ProcessingConfig()
    llm: LLMConfig = LLMConfig()
    data: DataConfig = DataConfig()
    logging: LoggingConfig = LoggingConfig()

    # Environment-specific settings
    environment: str = "development"
    debug: bool = True
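As a reference point, a hypothetical config/pipeline.yaml these models would accept; every key mirrors a field above, only paths has no default, and the directory values are illustrative:

# Hypothetical pipeline.yaml; omitted keys fall back to the model defaults.
name: drc_names_pipeline
environment: production
debug: false
paths:
  root_dir: /srv/drc_names
  configs_dir: /srv/drc_names/config
  data_dir: /srv/drc_names/data/dataset
  models_dir: /srv/drc_names/data/models
  outputs_dir: /srv/drc_names/data/outputs
  logs_dir: /srv/drc_names/data/logs
  checkpoints_dir: /srv/drc_names/data/checkpoints
llm:
  model_name: mistral:7b
  enable_rate_limiting: true
logging:
  level: DEBUG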
@@ -0,0 +1,14 @@
from typing import List

from pydantic import BaseModel, Field


class ProcessingConfig(BaseModel):
    """Data processing pipeline configuration"""

    batch_size: int = 1000
    max_workers: int = 4
    checkpoint_interval: int = 5
    use_multiprocessing: bool = False
    # pydantic.Field (not dataclasses.field) supplies the default factory on a BaseModel
    encoding_options: List[str] = Field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
    chunk_size: int = 100_000
@@ -0,0 +1,23 @@
from pathlib import Path

from pydantic import BaseModel, ConfigDict, field_validator


class ProjectPaths(BaseModel):
    """Project directory structure configuration"""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    root_dir: Path
    data_dir: Path
    models_dir: Path
    outputs_dir: Path
    logs_dir: Path
    configs_dir: Path
    checkpoints_dir: Path

    # In pydantic v2, field_validator must be the outermost decorator
    @field_validator("*", mode="before")
    @classmethod
    def convert_to_path(cls, v):
        return Path(v) if not isinstance(v, Path) else v
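A small sketch of the validator's effect: plain strings, as they arrive from YAML or JSON, are coerced to Path objects at construction time (directory values are illustrative):

from pathlib import Path

paths = ProjectPaths(
    root_dir="/tmp/proj",
    data_dir="/tmp/proj/data",
    models_dir="/tmp/proj/models",
    outputs_dir="/tmp/proj/outputs",
    logs_dir="/tmp/proj/logs",
    configs_dir="/tmp/proj/config",
    checkpoints_dir="/tmp/proj/checkpoints",
)
assert isinstance(paths.data_dir, Path)   # coerced from str by convert_to_path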
@@ -0,0 +1,57 @@
import logging
from contextlib import contextmanager
from pathlib import Path

from core.config import get_config, PipelineConfig


@contextmanager
def temporary_config_override(**overrides):
    """Context manager for temporarily overriding configuration"""
    config = get_config()
    original_values = {}

    # Store the original values, then apply the overrides
    for key, value in overrides.items():
        if hasattr(config, key):
            original_values[key] = getattr(config, key)
            setattr(config, key, value)

    try:
        yield config
    finally:
        # Restore the original values
        for key, value in original_values.items():
            setattr(config, key, value)


def ensure_directories(config: PipelineConfig) -> None:
    """Ensure all required directories exist"""
    directories = [
        config.paths.data_dir,
        config.paths.models_dir,
        config.paths.outputs_dir,
        config.paths.logs_dir,
        config.paths.configs_dir,
        config.paths.checkpoints_dir,
    ]

    for directory in directories:
        Path(directory).mkdir(parents=True, exist_ok=True)

    logging.info("Ensured all required directories exist")


def get_data_file_path(filename: str, config: PipelineConfig) -> Path:
    """Get the full path for a data file"""
    return config.paths.data_dir / filename


def get_model_file_path(filename: str, config: PipelineConfig) -> Path:
    """Get the full path for a model file"""
    return config.paths.models_dir / filename


def get_output_file_path(filename: str, config: PipelineConfig) -> Path:
    """Get the full path for an output file"""
    return config.paths.outputs_dir / filename
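A quick sketch of the override helper; only top-level attributes of PipelineConfig are swapped, and the originals are restored on exit (the import path and run_stage are assumptions):

from core.config.utils import temporary_config_override   # import path assumed

with temporary_config_override(debug=False, environment="testing") as cfg:
    run_stage(cfg)   # run_stage is a placeholder for an actual pipeline stage
# cfg.debug and cfg.environment revert to their original values here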
@@ -0,0 +1,62 @@
import logging
from pathlib import Path
from typing import Optional, Union, Iterator

import pandas as pd

from core.config.pipeline_config import PipelineConfig


class DataLoader:
    """Reusable data loading utilities"""

    def __init__(self, config: PipelineConfig):
        self.config = config

    def load_csv_chunked(
        self, filepath: Union[str, Path], chunk_size: Optional[int] = None
    ) -> Iterator[pd.DataFrame]:
        """Load a CSV file in chunks for memory efficiency"""
        chunk_size = chunk_size or self.config.processing.chunk_size
        encodings = self.config.processing.encoding_options

        filepath = Path(filepath)

        # Try each configured encoding in order. Note: if a decode error
        # surfaces only after some chunks were already yielded, those rows
        # are yielded again under the next encoding; in practice a wrong
        # encoding usually fails on the first chunk.
        for encoding in encodings:
            try:
                logging.info(f"Attempting to read {filepath} with encoding: {encoding}")

                chunk_iter = pd.read_csv(
                    filepath, encoding=encoding, chunksize=chunk_size, on_bad_lines="skip"
                )

                for i, chunk in enumerate(chunk_iter):
                    logging.debug(f"Processing chunk {i + 1}")
                    yield chunk

                logging.info(f"Successfully read {filepath} with encoding: {encoding}")
                return

            except Exception as e:
                logging.warning(f"Failed with encoding {encoding}: {e}")
                continue

        raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")

    def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
        """Load a complete CSV file into memory"""
        chunks = list(self.load_csv_chunked(filepath))
        return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

    @staticmethod
    def save_csv(
        df: pd.DataFrame, filepath: Union[str, Path], create_dirs: bool = True
    ) -> None:
        """Save a DataFrame to CSV with proper handling"""
        filepath = Path(filepath)

        if create_dirs:
            filepath.parent.mkdir(parents=True, exist_ok=True)

        df.to_csv(filepath, index=False, encoding="utf-8")
        logging.info(f"Saved {len(df)} rows to {filepath}")
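A minimal sketch of the chunked loader, assuming the helper functions above (import paths assumed) and the default names.csv input from DataConfig:

from core.config import load_config
from core.config.utils import get_data_file_path   # import path assumed

config = load_config()
loader = DataLoader(config)

# Stream the CSV in configured chunk sizes instead of loading it whole
total_rows = 0
for chunk in loader.load_csv_chunked(get_data_file_path("names.csv", config)):
    total_rows += len(chunk)
print(f"read {total_rows} rows")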
@@ -0,0 +1,3 @@
@@ -0,0 +1,24 @@
from core.config.pipeline_config import PipelineConfig


class PromptManager:
    """Manage prompts for LLM operations"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.prompts_dir = self.config.paths.configs_dir / "prompts"

    def load_prompt(self, prompt_name: str = "default") -> str:
        """Load a prompt template"""
        prompt_file = self.prompts_dir / f"{prompt_name}.txt"

        if not prompt_file.exists():
            # Fall back to prompt.txt in the project root
            fallback_file = self.config.paths.root_dir / "prompt.txt"
            if fallback_file.exists():
                prompt_file = fallback_file
            else:
                raise FileNotFoundError(f"Prompt file not found: {prompt_file}")

        with open(prompt_file, "r", encoding="utf-8") as f:
            return f.read().strip()
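Usage is a two-liner; this assumes a config/prompts/default.txt template, with prompt.txt at the project root as the coded fallback:

from core.config import load_config

config = load_config()
template = PromptManager(config).load_prompt("default")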
@@ -0,0 +1,56 @@
import threading
import time
from dataclasses import dataclass
from queue import Queue


@dataclass
class RateLimitConfig:
    """Configuration for rate limiting LLM requests"""

    requests_per_minute: int = 60
    requests_per_second: int = 2
    burst_limit: int = 5


class RateLimiter:
    """Thread-safe rate limiter for LLM requests"""

    def __init__(self, config: RateLimitConfig):
        self.config = config
        self.request_times = Queue()
        self.lock = threading.Lock()
        self.last_request_time = 0.0

    def wait_if_needed(self):
        """Block the caller as long as needed to respect the rate limits"""
        with self.lock:
            current_time = time.time()

            # Enforce the per-second limit via a minimum inter-request interval
            time_since_last = current_time - self.last_request_time
            min_interval = 1.0 / self.config.requests_per_second

            if time_since_last < min_interval:
                sleep_time = min_interval - time_since_last
                time.sleep(sleep_time)
                current_time = time.time()

            # Drop timestamps older than one minute; peeking .queue is safe
            # here because all access happens under self.lock
            while not self.request_times.empty():
                if current_time - self.request_times.queue[0] > 60:
                    self.request_times.get()
                else:
                    break

            # Enforce the per-minute limit against the sliding window
            if self.request_times.qsize() >= self.config.requests_per_minute:
                oldest_request = self.request_times.queue[0]
                wait_time = 60 - (current_time - oldest_request)
                if wait_time > 0:
                    time.sleep(wait_time)
                    current_time = time.time()

            # Record this request
            self.request_times.put(current_time)
            self.last_request_time = current_time
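A sketch of the limiter guarding request dispatch; call_llm and the inputs are placeholders, not part of this commit:

limiter = RateLimiter(RateLimitConfig(requests_per_minute=60, requests_per_second=2))

for name in ["mbuyi", "tshiala", "kalala"]:   # illustrative inputs
    limiter.wait_if_needed()                  # blocks until a slot is free
    call_llm(name)                            # placeholder for the real request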
@@ -0,0 +1,162 @@
from typing import Optional, Dict, Tuple

import pandas as pd


class RegionMapper:
    """Reusable region mapping utilities"""

    def __init__(self, mapping: Optional[Dict] = None):
        self.mapping = mapping or REGION_MAPPING

    def map_region_to_province(self, region: str) -> str:
        """Map a region to its province"""
        region_lower = str(region).lower().strip()
        return self.mapping.get(region_lower, ("AUTRES", "AUTRES"))[1].lower()

    def map_regions_vectorized(self, regions: pd.Series) -> pd.Series:
        """Vectorized region-to-province mapping"""
        # Normalize the same way as the scalar version: lowercase and strip
        return regions.str.lower().str.strip().map(
            lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
        )

    @staticmethod
    def get_provinces():
        """List the province labels the mapper can produce"""
        return [
            "kinshasa",
            "bas-congo",
            "bandundu",
            "katanga",
            "equateur",
            "province-orientale",
            "maniema",
            "nord-kivu",
            "sud-kivu",
            "kasai-occidental",
            "kasai-oriental",
        ]


# DRC Region to Province Mapping
REGION_MAPPING: Dict[str, Tuple[str, str]] = {
    # Kinshasa
    "kinshasa": ("KINSHASA", "KINSHASA"),
    "kinshasa-centre": ("KINSHASA", "KINSHASA"),
    "kinshasa-est": ("KINSHASA", "KINSHASA"),
    "kinshasa-funa": ("KINSHASA", "KINSHASA"),
    "kinshasa-lukunga": ("KINSHASA", "KINSHASA"),
    "kinshasa-mont-amba": ("KINSHASA", "KINSHASA"),
    "kinshasa-ouest": ("KINSHASA", "KINSHASA"),
    "kinshasa-plateau": ("KINSHASA", "KINSHASA"),
    "kinshasa-tshangu": ("KINSHASA", "KINSHASA"),
    # Bas-Congo / Kongo-Central → BAS-CONGO
    "bas-congo": ("KONGO-CENTRAL", "BAS-CONGO"),
    "bas-congo-1": ("KONGO-CENTRAL", "BAS-CONGO"),
    "bas-congo-2": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central-1": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central-2": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central-3": ("KONGO-CENTRAL", "BAS-CONGO"),
    # Kwilu, Kwango, Mai-Ndombe → BANDUNDU
    "bandundu": ("BANDUNDU", "BANDUNDU"),
    "bandundu-1": ("BANDUNDU", "BANDUNDU"),
    "bandundu-2": ("BANDUNDU", "BANDUNDU"),
    "bandundu-3": ("BANDUNDU", "BANDUNDU"),
    "kwilu": ("KWILU", "BANDUNDU"),
    "kwilu-1": ("KWILU", "BANDUNDU"),
    "kwilu-2": ("KWILU", "BANDUNDU"),
    "kwilu-3": ("KWILU", "BANDUNDU"),
    "kwango": ("KWANGO", "BANDUNDU"),
    "kwango-1": ("KWANGO", "BANDUNDU"),
    "kwango-2": ("KWANGO", "BANDUNDU"),
    "mai-ndombe": ("MAI-NDOMBE", "BANDUNDU"),
    "mai-ndombe-1": ("MAI-NDOMBE", "BANDUNDU"),
    "mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
    "mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
    # Haut-Katanga, Haut-Lomami, Lualaba, Tanganyika → KATANGA
    "haut-katanga": ("HAUT-KATANGA", "KATANGA"),
    "haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
    "haut-katanga-2": ("HAUT-KATANGA", "KATANGA"),
    "haut-lomami": ("HAUT-LOMAMI", "KATANGA"),
    "haut-lomami-1": ("HAUT-LOMAMI", "KATANGA"),
    "haut-lomami-2": ("HAUT-LOMAMI", "KATANGA"),
    "lualaba": ("LUALABA", "KATANGA"),
    "lualaba-1": ("LUALABA", "KATANGA"),
    "lualaba-2": ("LUALABA", "KATANGA"),
    "lualaba-74-corrige-922a": ("LUALABA", "KATANGA"),
    "tanganyika": ("TANGANYIKA", "KATANGA"),
    "tanganyika-1": ("TANGANYIKA", "KATANGA"),
    "tanganyika-2": ("TANGANYIKA", "KATANGA"),
    # Mongala, Nord-Ubangi, Sud-Ubangi, Tshuapa → EQUATEUR
    "equateur": ("EQUATEUR", "EQUATEUR"),
    "equateur-1": ("EQUATEUR", "EQUATEUR"),
    "equateur-2": ("EQUATEUR", "EQUATEUR"),
    "equateur-3": ("EQUATEUR", "EQUATEUR"),
    "equateur-4": ("EQUATEUR", "EQUATEUR"),
    "equateur-5": ("EQUATEUR", "EQUATEUR"),
    "mongala": ("MONGALA", "EQUATEUR"),
    "mongala-1": ("MONGALA", "EQUATEUR"),
    "mongala-2": ("MONGALA", "EQUATEUR"),
    "nord-ubangi": ("NORD-UBANGI", "EQUATEUR"),
    "nord-ubangi-1": ("NORD-UBANGI", "EQUATEUR"),
    "nord-ubangi-2": ("NORD-UBANGI", "EQUATEUR"),
    "sud-ubangi": ("SUD-UBANGI", "EQUATEUR"),
    "sud-ubangi-1": ("SUD-UBANGI", "EQUATEUR"),
    "sud-ubangi-2": ("SUD-UBANGI", "EQUATEUR"),
    "tshuapa": ("TSHUAPA", "EQUATEUR"),
    "tshuapa-1": ("TSHUAPA", "EQUATEUR"),
    "tshuapa-2": ("TSHUAPA", "EQUATEUR"),
    # Province-Orientale
    "province-orientale": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
    "province-orientale-1": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
    "province-orientale-2": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
    "province-orientale-3": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
    "province-orientale-4": ("PROVINCE-ORIENTALE", "PROVINCE-ORIENTALE"),
    "haut-uele": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
    "haut-uele-1": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
    "haut-uele-2": ("HAUT-UELE", "PROVINCE-ORIENTALE"),
    "bas-uele": ("BAS-UELE", "PROVINCE-ORIENTALE"),
    "bas-uele-1": ("BAS-UELE", "PROVINCE-ORIENTALE"),
    "bas-uele-2": ("BAS-UELE", "PROVINCE-ORIENTALE"),
    "ituri": ("ITURI", "PROVINCE-ORIENTALE"),
    "ituri-1": ("ITURI", "PROVINCE-ORIENTALE"),
    "ituri-2": ("ITURI", "PROVINCE-ORIENTALE"),
    "tshopo": ("TSHOPO", "PROVINCE-ORIENTALE"),
    "tshopo-1": ("TSHOPO", "PROVINCE-ORIENTALE"),
    "tshopo-2": ("TSHOPO", "PROVINCE-ORIENTALE"),
    # Maniema
    "maniema": ("MANIEMA", "MANIEMA"),
    "maniema-1": ("MANIEMA", "MANIEMA"),
    "maniema-2": ("MANIEMA", "MANIEMA"),
    # Nord-Kivu
    "nord-kivu": ("NORD-KIVU", "NORD-KIVU"),
    "nord-kivu-1": ("NORD-KIVU", "NORD-KIVU"),
    "nord-kivu-2": ("NORD-KIVU", "NORD-KIVU"),
    "nord-kivu-3": ("NORD-KIVU", "NORD-KIVU"),
    # Sud-Kivu
    "sud-kivu": ("SUD-KIVU", "SUD-KIVU"),
    "sud-kivu-1": ("SUD-KIVU", "SUD-KIVU"),
    "sud-kivu-2": ("SUD-KIVU", "SUD-KIVU"),
    "sud-kivu-3": ("SUD-KIVU", "SUD-KIVU"),
    # Kasai, Kasai-Central → KASAI-OCCIDENTAL
    "kasai-occidental": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
    "kasai-occidental-1": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
    "kasai-occidental-2": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
    "kasai": ("KASAI", "KASAI-OCCIDENTAL"),
    "kasai-1": ("KASAI", "KASAI-OCCIDENTAL"),
    "kasai-2": ("KASAI", "KASAI-OCCIDENTAL"),
    "kasai-central": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    "kasai-central-1": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    "kasai-central-2": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    # Lomami, Sankuru → KASAI-ORIENTAL
    "kasai-oriental": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "kasai-oriental-1": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "kasai-oriental-2": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "kasai-oriental-3": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "lomami": ("LOMAMI", "KASAI-ORIENTAL"),
    "lomami-1": ("LOMAMI", "KASAI-ORIENTAL"),
    "lomami-2": ("LOMAMI", "KASAI-ORIENTAL"),
    "sankuru": ("SANKURU", "KASAI-ORIENTAL"),
    "sankuru-1": ("SANKURU", "KASAI-ORIENTAL"),
    "sankuru-2": ("SANKURU", "KASAI-ORIENTAL"),
}
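A quick check of the mapper on a small Series; unmapped values fall back to "autres":

import pandas as pd

mapper = RegionMapper()
regions = pd.Series(["Kwilu-2", "haut-katanga", "somewhere-else"])
print(mapper.map_regions_vectorized(regions).tolist())
# -> ['bandundu', 'katanga', 'autres']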
@@ -0,0 +1,41 @@
import json
import logging
from typing import Dict, Any

from core.config.pipeline_config import PipelineConfig


class StateManager:
    """Manage pipeline state and checkpoints"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.checkpoints_dir = self.config.paths.checkpoints_dir

    def save_state(self, state: Dict[str, Any], state_name: str) -> None:
        """Save pipeline state"""
        self.checkpoints_dir.mkdir(parents=True, exist_ok=True)
        state_file = self.checkpoints_dir / f"{state_name}.json"

        with open(state_file, "w") as f:
            json.dump(state, f, indent=2, default=str)

        logging.debug(f"Saved state to {state_file}")

    def load_state(self, state_name: str) -> Dict[str, Any]:
        """Load pipeline state"""
        state_file = self.checkpoints_dir / f"{state_name}.json"

        if not state_file.exists():
            return {}

        with open(state_file, "r") as f:
            return json.load(f)

    def clear_state(self, state_name: str) -> None:
        """Clear pipeline state"""
        state_file = self.checkpoints_dir / f"{state_name}.json"

        if state_file.exists():
            state_file.unlink()
            logging.info(f"Cleared state: {state_name}")
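A short resume-from-checkpoint sketch; the state keys and checkpoint name are illustrative:

from core.config import load_config

config = load_config()
state = StateManager(config)

state.save_state({"stage": "annotation", "last_row": 12345}, "llm_annotation")
resume = state.load_state("llm_annotation")   # returns {} when no checkpoint exists
start_row = resume.get("last_row", 0)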
@@ -0,0 +1,38 @@
from typing import Optional, Dict

import pandas as pd


class TextCleaner:
    """Reusable text cleaning utilities"""

    # Patterns listed here are applied with regex=True; all others are
    # literal substring replacements
    REGEX_PATTERNS = {"multiple_spaces", "extra_whitespace"}

    def __init__(self, patterns: Optional[Dict[str, str]] = None):
        self.patterns = patterns or {
            "null_bytes": "\x00",
            "non_breaking_spaces": "\u00a0",
            "multiple_spaces": r" +",
            "extra_whitespace": r"\s+",
        }

    def clean_text_series(self, series: pd.Series) -> pd.Series:
        """Clean a pandas Series of text data"""
        cleaned = series.astype(str)

        # Apply the cleaning patterns; regex patterns collapse whitespace
        # runs to a single space, literal patterns are replaced with a space
        for pattern_name, pattern in self.patterns.items():
            is_regex = pattern_name in self.REGEX_PATTERNS
            cleaned = cleaned.str.replace(pattern, " ", regex=is_regex)

        return cleaned.str.strip().str.lower()

    def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean all text columns in a DataFrame"""
        df = df.copy()
        text_columns = df.select_dtypes(include="object").columns

        for col in text_columns:
            df[col] = self.clean_text_series(df[col])

        return df
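And a quick check of the cleaner: null bytes and non-breaking spaces become plain spaces, whitespace runs collapse, and the result is lowercased:

import pandas as pd

cleaner = TextCleaner()
s = pd.Series(["  MBUYI\x00  Kalala ", "Tshiala\u00a0Marie"])
print(cleaner.clean_text_series(s).tolist())
# -> ['mbuyi kalala', 'tshiala marie']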