refactoring: uv

This commit is contained in:
2025-10-05 18:14:15 +02:00
parent f3b06fbd07
commit 9dd4f759b3
120 changed files with 5525 additions and 3366 deletions
+46
View File
@@ -0,0 +1,46 @@
import logging
from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from ners.core.config import PipelineConfig
@contextmanager
def temporary_config_override(**overrides):
    """Context manager for temporarily overriding configuration.

    Only attributes that already exist on the config object are
    overridden; unknown keys are silently ignored. The original values
    are restored on exit, even if the managed block raises.
    """
    from ners.core.config import get_config

    config = get_config()
    # Snapshot only the attributes we are actually allowed to override.
    saved = {
        name: getattr(config, name)
        for name in overrides
        if hasattr(config, name)
    }
    for name in saved:
        setattr(config, name, overrides[name])
    try:
        yield config
    finally:
        # Restore original values
        for name, original in saved.items():
            setattr(config, name, original)
def ensure_directories(config: "PipelineConfig") -> None:
    """Create every directory the pipeline requires, if missing."""
    paths = config.paths
    required = (
        paths.data_dir,
        paths.models_dir,
        paths.outputs_dir,
        paths.logs_dir,
        paths.configs_dir,
        paths.checkpoints_dir,
    )
    for target in required:
        Path(target).mkdir(parents=True, exist_ok=True)
    logging.info("Ensured all required directories exist")
+174
View File
@@ -0,0 +1,174 @@
import gc
import logging
from pathlib import Path
from typing import Optional, Union, Iterator, Dict
import pandas as pd
from ners.core.config.pipeline_config import PipelineConfig
# Memory-optimized pandas dtypes applied when reading the names CSVs.
# Nullable Int* dtypes keep NaN support while shrinking integer columns.
OPTIMIZED_DTYPES = {
    # Numeric columns with appropriate bit-width
    "year": "Int16",  # Years fit in 16-bit integer
    # NOTE(review): Int8 caps at 127 — confirm word counts and the flags
    # below can never exceed that range, otherwise read_csv will fail.
    "words": "Int8",  # Word counts typically < 128
    "length": "Int16",  # Name lengths fit in 16-bit
    "annotated": "Int8",  # Binary flag (0/1)
    "ner_tagged": "Int8",  # Binary flag (0/1)
    # Categorical columns (memory efficient for repeated values)
    "sex": "category",
    "province": "category",
    "region": "category",
    "identified_category": "category",
    "transformation_type": "category",
    # String columns with proper string dtype
    "name": "string",
    "probable_native": "string",
    "probable_surname": "string",
    "identified_name": "string",
    "identified_surname": "string",
    "ner_entities": "string",
}
class DataLoader:
    """Reusable data loading utilities.

    Reads CSVs with memory-optimized dtypes (chunked or complete),
    optionally limits the dataset size with sex-balanced sampling, and
    writes CSVs back out.
    """

    def __init__(self, config: "PipelineConfig", custom_dtypes: Optional[Dict] = None):
        """Store config; custom_dtypes entries override OPTIMIZED_DTYPES."""
        self.config = config
        self.dtypes = {**OPTIMIZED_DTYPES, **(custom_dtypes or {})}

    def load_csv_chunked(
        self, filepath: Union[str, Path], chunk_size: Optional[int] = None
    ) -> Iterator[pd.DataFrame]:
        """Yield the CSV at *filepath* in DataFrame chunks.

        Tries each encoding from config.processing.encoding_options in
        order until one succeeds.

        Raises:
            ValueError: when no configured encoding can read the file.
        """
        chunk_size = chunk_size or self.config.processing.chunk_size
        encodings = self.config.processing.encoding_options
        filepath = Path(filepath)
        for encoding in encodings:
            yielded_any = False  # have chunks already been handed to the caller?
            try:
                logging.info(f"Reading {filepath} with encoding: {encoding}")
                # Read with optimal dtypes
                chunk_iter = pd.read_csv(
                    filepath,
                    encoding=encoding,
                    chunksize=chunk_size,
                    on_bad_lines="skip",
                    dtype=self.dtypes,
                )
                for i, chunk in enumerate(chunk_iter):
                    logging.debug(f"Processing optimized chunk {i + 1}")
                    yielded_any = True
                    yield chunk
                logging.info(f"Successfully read {filepath} with encoding: {encoding}")
                return
            except Exception as e:
                # BUG FIX: if reading fails only after chunks were already
                # yielded, retrying with the next encoding would re-yield
                # those chunks and silently duplicate data downstream —
                # propagate the failure instead of falling through.
                if yielded_any:
                    raise
                logging.warning(f"Failed with encoding {encoding}: {e}")
                continue
        raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")

    def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
        """Load the whole CSV into one DataFrame.

        Returns an empty DataFrame when the file produces no chunks, and
        applies max_dataset_size limiting when configured.
        """
        chunks = list(self.load_csv_chunked(filepath))
        if not chunks:
            return pd.DataFrame()
        logging.info(f"Concatenating {len(chunks)} optimized chunks")
        df = pd.concat(chunks, ignore_index=True, copy=False)
        # Cleanup chunks from memory before any further processing
        del chunks
        gc.collect()
        # Apply dataset size limiting if configured
        if self.config.data.max_dataset_size is not None:
            df = self._limit_dataset_size(df)
        return df

    def _limit_dataset_size(self, df: pd.DataFrame) -> pd.DataFrame:
        """Cap *df* at config.data.max_dataset_size rows (no-op if smaller)."""
        max_size = self.config.data.max_dataset_size
        if max_size is None or len(df) <= max_size:
            return df
        if self.config.data.balance_by_sex and "sex" in df.columns:
            return self._balanced_sample(df, max_size)
        # Simple random sampling
        return df.sample(n=max_size, random_state=self.config.data.random_seed)

    def _balanced_sample(self, df: pd.DataFrame, max_size: int) -> pd.DataFrame:
        """Sample up to *max_size* rows, balanced across 'sex' categories.

        Falls back to plain random sampling when the sex column holds no
        usable values or no balanced sample can be formed.
        """
        sex_values = df["sex"].dropna().unique()
        if len(sex_values) == 0:
            logging.warning(
                "No valid values found in sex column 'sex', using random sampling"
            )
            return df.sample(n=max_size, random_state=self.config.data.random_seed)
        # Calculate samples per sex category; the remainder goes to the
        # first categories so the total adds up to max_size when possible.
        samples_per_sex = max_size // len(sex_values)
        remaining_samples = max_size % len(sex_values)
        balanced_samples = []
        for i, sex in enumerate(sex_values):
            # Boolean indexing avoids materializing per-sex DataFrames
            sex_mask = df["sex"] == sex
            sex_indices = df[sex_mask].index
            # Distribute remaining samples to first categories
            current_samples = samples_per_sex + (1 if i < remaining_samples else 0)
            current_samples = min(current_samples, len(sex_indices))
            if current_samples > 0:
                # Sample index labels instead of DataFrame rows
                sampled_indices = pd.Series(sex_indices).sample(
                    n=current_samples, random_state=self.config.data.random_seed + i
                )
                balanced_samples.extend(sampled_indices.tolist())
                logging.info(f"Sampled {current_samples} records for sex '{sex}'")
        if not balanced_samples:
            logging.warning(
                "No balanced samples could be created, using random sampling"
            )
            return df.sample(n=max_size, random_state=self.config.data.random_seed)
        # BUG FIX: balanced_samples holds index *labels*, so select with
        # .loc rather than .iloc (identical on the default RangeIndex
        # produced by load_csv_complete, but correct for any other index).
        result = df.loc[balanced_samples].copy()
        # Shuffle the final result so categories are interleaved
        result = result.sample(
            frac=1, random_state=self.config.data.random_seed
        ).reset_index(drop=True)
        logging.info(
            f"Created balanced dataset with {len(result)} records from {len(df)} total"
        )
        return result

    @classmethod
    def save_csv(
        cls, df: pd.DataFrame, filepath: Union[str, Path], create_dirs: bool = True
    ) -> None:
        """Write *df* to *filepath* as UTF-8 CSV (quoting=1 is csv.QUOTE_ALL)."""
        filepath = Path(filepath)
        if create_dirs:
            filepath.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(filepath, index=False, encoding="utf-8", sep=",", quoting=1)
        logging.info(f"Saved {len(df)} rows to {filepath}")
+24
View File
@@ -0,0 +1,24 @@
from ners.core.config.pipeline_config import PipelineConfig
class PromptManager:
    """Manage prompts for LLM operations"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        # Prompt templates live under <configs_dir>/prompts by convention.
        self.prompts_dir = self.config.paths.configs_dir / "prompts"

    def load_prompt(self, prompt_name: str = "default") -> str:
        """Load a prompt template.

        Looks for <prompts_dir>/<prompt_name>.txt, falling back to
        <root_dir>/prompt.txt when the named template is missing.

        Raises:
            FileNotFoundError: if neither candidate file exists.
        """
        candidate = self.prompts_dir / f"{prompt_name}.txt"
        if not candidate.exists():
            # Fallback to the project root directory.
            fallback = self.config.paths.root_dir / "prompt.txt"
            if not fallback.exists():
                raise FileNotFoundError(f"Prompt file not found: {candidate}")
            candidate = fallback
        with open(candidate, "r", encoding="utf-8") as f:
            return f.read().strip()
+56
View File
@@ -0,0 +1,56 @@
import threading
import time
from dataclasses import dataclass
from queue import Queue
@dataclass
class RateLimitConfig:
    """Configuration for rate limiting LLM requests"""

    # Maximum requests allowed in any rolling 60-second window.
    requests_per_minute: int = 60
    # Maximum request rate; enforced as a minimum gap of
    # 1/requests_per_second seconds between consecutive requests.
    requests_per_second: int = 2
    # NOTE(review): burst_limit is not read by RateLimiter in this file —
    # confirm it is consumed elsewhere before relying on it.
    burst_limit: int = 5
class RateLimiter:
    """Thread-safe rate limiter for LLM requests"""

    def __init__(self, config: RateLimitConfig):
        self.config = config
        # Timestamps of recent requests, forming a rolling 60s window.
        # NOTE(review): wait_if_needed peeks at the non-public Queue.queue
        # deque; a plain collections.deque guarded by self.lock would be
        # simpler — confirm nothing relies on Queue semantics elsewhere.
        self.request_times = Queue()
        # Serializes all rate-limit bookkeeping across threads.
        self.lock = threading.Lock()
        # time.time() of the most recent request (0 = no request yet).
        self.last_request_time = 0

    def wait_if_needed(self):
        """Wait if necessary to respect rate limits.

        Enforces both the per-second spacing and the per-minute cap.
        Note that any sleeping happens while self.lock is held, so
        concurrent callers are fully serialized through this method.
        """
        with self.lock:
            current_time = time.time()
            # Check requests per second limit: keep at least
            # 1/requests_per_second seconds between consecutive requests.
            time_since_last = current_time - self.last_request_time
            min_interval = 1.0 / self.config.requests_per_second
            if time_since_last < min_interval:
                sleep_time = min_interval - time_since_last
                time.sleep(sleep_time)
                current_time = time.time()
            # Clean old request times (older than 1 minute) so the queue
            # only holds timestamps inside the rolling window.
            while not self.request_times.empty():
                if current_time - self.request_times.queue[0] > 60:
                    self.request_times.get()
                else:
                    break
            # Check requests per minute limit: if the window is full,
            # sleep until the oldest request ages out of it.
            if self.request_times.qsize() >= self.config.requests_per_minute:
                oldest_request = self.request_times.queue[0]
                wait_time = 60 - (current_time - oldest_request)
                if wait_time > 0:
                    time.sleep(wait_time)
                    current_time = time.time()
            # Record this request
            self.request_times.put(current_time)
            self.last_request_time = current_time
+174
View File
@@ -0,0 +1,174 @@
import unicodedata
from typing import Optional, Dict, Tuple
import pandas as pd
class RegionMapper:
    """Reusable region mapping utilities"""

    def __init__(self, mapping: Optional[Dict] = None):
        # Normalize the lookup table once: lower-cased keys mapped to the
        # upper-cased second element of each (province, region) tuple.
        source = mapping or REGION_MAPPING
        self.mapping = {key.lower(): pair[1].upper() for key, pair in source.items()}

    def map(self, series: pd.Series) -> pd.Series:
        """Map raw labels to canonical regions; unknowns become "AUTRES"."""
        lowered = series.str.lower()
        return lowered.map(self.mapping).fillna("AUTRES")

    @staticmethod
    def clean_province(series: pd.Series) -> pd.Series:
        """Upper-case, trim, and strip accents from province labels."""

        def _strip_accents(value):
            # Non-strings (e.g. NaN) pass through untouched.
            if not isinstance(value, str):
                return value
            decomposed = unicodedata.normalize("NFKD", value)
            return decomposed.encode("ascii", errors="ignore").decode("utf-8")

        return series.str.upper().str.strip().apply(_strip_accents)

    @staticmethod
    def get_provinces():
        """Return the canonical list of lower-case province slugs."""
        return [
            "kinshasa",
            "bas-congo",
            "bandundu",
            "katanga",
            "equateur",
            "orientale",
            "maniema",
            "nord-kivu",
            "sud-kivu",
            "kasai-occidental",
            "kasai-oriental",
            "autres",
        ]
# DRC Region to Province Mapping
# Keys are lower-case source labels, often carrying "-1"/"-2" style batch
# suffixes that all collapse to the same target. Values appear to be
# (post-2015 province, pre-2015 province) pairs — TODO confirm the exact
# semantics. Note that RegionMapper consumes only index 1 of each tuple.
REGION_MAPPING: Dict[str, Tuple[str, str]] = {
    "bandundu": ("BANDUNDU", "BANDUNDU"),
    "bandundu-1": ("BANDUNDU", "BANDUNDU"),
    "bandundu-2": ("BANDUNDU", "BANDUNDU"),
    "bandundu-3": ("BANDUNDU", "BANDUNDU"),
    "bas-congo": ("KONGO-CENTRAL", "BAS-CONGO"),
    "bas-congo-1": ("KONGO-CENTRAL", "BAS-CONGO"),
    "bas-congo-2": ("KONGO-CENTRAL", "BAS-CONGO"),
    "bas-fleuve": ("KONGO-CENTRAL", "BAS-CONGO"),
    "bas-uele": ("BAS-UELE", "ORIENTALE"),
    "bas-uele-1": ("BAS-UELE", "ORIENTALE"),
    "bas-uele-2": ("BAS-UELE", "ORIENTALE"),
    "cataractes": ("KONGO-CENTRAL", "BAS-CONGO"),
    "equateur": ("EQUATEUR", "EQUATEUR"),
    "equateur-1": ("EQUATEUR", "EQUATEUR"),
    "equateur-2": ("EQUATEUR", "EQUATEUR"),
    "equateur-3": ("EQUATEUR", "EQUATEUR"),
    "equateur-4": ("EQUATEUR", "EQUATEUR"),
    "equateur-5": ("EQUATEUR", "EQUATEUR"),
    "haut-katanga": ("HAUT-KATANGA", "KATANGA"),
    "haut-katanga-1": ("HAUT-KATANGA", "KATANGA"),
    "haut-katanga-2": ("HAUT-KATANGA", "KATANGA"),
    "haut-lomami": ("HAUT-LOMAMI", "KATANGA"),
    "haut-lomami-1": ("HAUT-LOMAMI", "KATANGA"),
    "haut-lomami-2": ("HAUT-LOMAMI", "KATANGA"),
    "haut-uele": ("HAUT-UELE", "ORIENTALE"),
    "haut-uele-1": ("HAUT-UELE", "ORIENTALE"),
    "haut-uele-2": ("HAUT-UELE", "ORIENTALE"),
    "ituri": ("ITURI", "ORIENTALE"),
    "ituri-1": ("ITURI", "ORIENTALE"),
    "ituri-2": ("ITURI", "ORIENTALE"),
    "ituri-3": ("ITURI", "ORIENTALE"),
    "kasai": ("KASAI", "KASAI-OCCIDENTAL"),
    "kasai-1": ("KASAI", "KASAI-OCCIDENTAL"),
    "kasai-2": ("KASAI", "KASAI-OCCIDENTAL"),
    # "kasai-ce" looks like a truncated form of "kasai-central".
    "kasai-ce": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    "kasai-central": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    "kasai-central-1": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    "kasai-central-2": ("KASAI-CENTRAL", "KASAI-OCCIDENTAL"),
    "kasai-occidental": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
    "kasai-occidental-1": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
    "kasai-occidental-2": ("KASAI-OCCIDENTAL", "KASAI-OCCIDENTAL"),
    "kasai-oriental": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "kasai-oriental-1": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "kasai-oriental-2": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "kasai-oriental-3": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    # Common misspelling with trailing "e".
    "kasai-orientale": ("KASAI-ORIENTAL", "KASAI-ORIENTAL"),
    "katanga": ("KATANGA", "KATANGA"),
    "katanga-1": ("KATANGA", "KATANGA"),
    "katanga-2": ("KATANGA", "KATANGA"),
    "katanga-3": ("KATANGA", "KATANGA"),
    "katanga-4": ("KATANGA", "KATANGA"),
    # Kinshasa districts/communes all collapse to the capital.
    "kinshasa": ("KINSHASA", "KINSHASA"),
    "kinshasa-centre": ("KINSHASA", "KINSHASA"),
    "kinshasa-est": ("KINSHASA", "KINSHASA"),
    "kinshasa-funa": ("KINSHASA", "KINSHASA"),
    "kinshasa-global": ("KINSHASA", "KINSHASA"),
    "kinshasa-lukunga": ("KINSHASA", "KINSHASA"),
    "kinshasa-mont-amba": ("KINSHASA", "KINSHASA"),
    "kinshasa-ouest": ("KINSHASA", "KINSHASA"),
    "kinshasa-plateau": ("KINSHASA", "KINSHASA"),
    "kinshasa-tshangu": ("KINSHASA", "KINSHASA"),
    "kongo-central": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central-1": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central-2": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kongo-central-3": ("KONGO-CENTRAL", "BAS-CONGO"),
    "kwango": ("KWANGO", "BANDUNDU"),
    "kwango-1": ("KWANGO", "BANDUNDU"),
    "kwango-2": ("KWANGO", "BANDUNDU"),
    "kwilu": ("KWILU", "BANDUNDU"),
    "kwilu-1": ("KWILU", "BANDUNDU"),
    "kwilu-2": ("KWILU", "BANDUNDU"),
    "kwilu-3": ("KWILU", "BANDUNDU"),
    "lomami": ("LOMAMI", "KASAI-ORIENTAL"),
    "lomami-1": ("LOMAMI", "KASAI-ORIENTAL"),
    "lomami-2": ("LOMAMI", "KASAI-ORIENTAL"),
    "lualaba": ("LUALABA", "KATANGA"),
    "lualaba-1": ("LUALABA", "KATANGA"),
    "lualaba-2": ("LUALABA", "KATANGA"),
    # NOTE(review): this key looks like a data-correction artifact from
    # the source files — confirm it still occurs in current inputs.
    "lualaba-74-corrige-922a": ("LUALABA", "KATANGA"),
    "lukaya": ("KONGO-CENTRAL", "BAS-CONGO"),
    "mai-ndombe": ("MAI-NDOMBE", "BANDUNDU"),
    "mai-ndombe-1": ("MAI-NDOMBE", "BANDUNDU"),
    "mai-ndombe-2": ("MAI-NDOMBE", "BANDUNDU"),
    "mai-ndombe-3": ("MAI-NDOMBE", "BANDUNDU"),
    "maniema": ("MANIEMA", "MANIEMA"),
    "maniema-1": ("MANIEMA", "MANIEMA"),
    "maniema-2": ("MANIEMA", "MANIEMA"),
    "mongala": ("MONGALA", "EQUATEUR"),
    "mongala-1": ("MONGALA", "EQUATEUR"),
    "mongala-2": ("MONGALA", "EQUATEUR"),
    "nord-kivu": ("NORD-KIVU", "NORD-KIVU"),
    "nord-kivu-1": ("NORD-KIVU", "NORD-KIVU"),
    "nord-kivu-2": ("NORD-KIVU", "NORD-KIVU"),
    "nord-kivu-3": ("NORD-KIVU", "NORD-KIVU"),
    "nord-ubangi": ("NORD-UBANGI", "EQUATEUR"),
    "nord-ubangi-1": ("NORD-UBANGI", "EQUATEUR"),
    "nord-ubangi-2": ("NORD-UBANGI", "EQUATEUR"),
    "province-orientale": ("ORIENTALE", "ORIENTALE"),
    "province-orientale-1": ("ORIENTALE", "ORIENTALE"),
    "province-orientale-2": ("ORIENTALE", "ORIENTALE"),
    "province-orientale-3": ("ORIENTALE", "ORIENTALE"),
    "province-orientale-4": ("ORIENTALE", "ORIENTALE"),
    "sankuru": ("SANKURU", "KASAI-ORIENTAL"),
    "sankuru-1": ("SANKURU", "KASAI-ORIENTAL"),
    "sankuru-2": ("SANKURU", "KASAI-ORIENTAL"),
    "sud-kivu": ("SUD-KIVU", "SUD-KIVU"),
    "sud-kivu-1": ("SUD-KIVU", "SUD-KIVU"),
    "sud-kivu-2": ("SUD-KIVU", "SUD-KIVU"),
    "sud-kivu-3": ("SUD-KIVU", "SUD-KIVU"),
    "sud-ubangi": ("SUD-UBANGI", "EQUATEUR"),
    "sud-ubangi-1": ("SUD-UBANGI", "EQUATEUR"),
    "sud-ubangi-2": ("SUD-UBANGI", "EQUATEUR"),
    "tanganyika": ("TANGANYIKA", "KATANGA"),
    "tanganyika-1": ("TANGANYIKA", "KATANGA"),
    "tanganyika-2": ("TANGANYIKA", "KATANGA"),
    "tshopo": ("TSHOPO", "ORIENTALE"),
    "tshopo-1": ("TSHOPO", "ORIENTALE"),
    "tshopo-2": ("TSHOPO", "ORIENTALE"),
    "tshuapa": ("TSHUAPA", "EQUATEUR"),
    "tshuapa-1": ("TSHUAPA", "EQUATEUR"),
    "tshuapa-2": ("TSHUAPA", "EQUATEUR"),
}
+41
View File
@@ -0,0 +1,41 @@
import json
import logging
from typing import Dict, Any
from ners.core.config.pipeline_config import PipelineConfig
class StateManager:
    """Manage pipeline state and checkpoints.

    Each named state is persisted as <checkpoints_dir>/<name>.json.
    """

    def __init__(self, config: "PipelineConfig"):
        self.config = config
        # All state files live under the configured checkpoints directory.
        self.checkpoints_dir = self.config.paths.checkpoints_dir

    def save_state(self, state: Dict[str, Any], state_name: str) -> None:
        """Save pipeline state to <checkpoints_dir>/<state_name>.json.

        Non-JSON-serializable values are stringified via default=str.
        """
        self.checkpoints_dir.mkdir(parents=True, exist_ok=True)
        state_file = self.checkpoints_dir / f"{state_name}.json"
        # FIX: pin UTF-8 so state files are portable across platforms
        # (the default text encoding is locale-dependent, e.g. on Windows).
        with open(state_file, "w", encoding="utf-8") as f:
            json.dump(state, f, indent=2, default=str)
        logging.debug(f"Saved state to {state_file}")

    def load_state(self, state_name: str) -> Dict[str, Any]:
        """Load pipeline state; returns {} when no checkpoint exists."""
        state_file = self.checkpoints_dir / f"{state_name}.json"
        if not state_file.exists():
            return {}
        with open(state_file, "r", encoding="utf-8") as f:
            return json.load(f)

    def clear_state(self, state_name: str) -> None:
        """Delete the named checkpoint file if it exists."""
        state_file = self.checkpoints_dir / f"{state_name}.json"
        if state_file.exists():
            state_file.unlink()
            logging.info(f"Cleared state: {state_name}")
+37
View File
@@ -0,0 +1,37 @@
from typing import Optional, Dict
import pandas as pd
class TextCleaner:
    """Reusable text cleaning utilities.

    Replaces each configured pattern with a single space, then trims and
    lower-cases the result.
    """

    # Pattern names whose values are regular expressions; every other
    # pattern is treated as a literal substring.
    _REGEX_PATTERN_NAMES = frozenset({"multiple_spaces", "extra_whitespace"})

    def __init__(self, patterns: Optional[Dict[str, str]] = None):
        """patterns: mapping of pattern name -> pattern (literal or regex)."""
        self.patterns = patterns or {
            "null_bytes": "\x00",
            "non_breaking_spaces": "\u00a0",
            "multiple_spaces": r" +",
            "extra_whitespace": r"\s+",
        }

    def clean_text_series(self, series: pd.Series) -> pd.Series:
        """Clean a pandas Series of text data.

        Note: values are cast to str first, so NaN becomes the string
        "nan" before cleaning.
        """
        cleaned = series.astype(str)
        # Apply cleaning patterns in insertion order.
        for pattern_name, pattern in self.patterns.items():
            # BUG FIX: "extra_whitespace" (r"\s+") was previously applied
            # with regex=False, so it searched for the literal text "\s+"
            # and never collapsed tabs/newlines. Known regex patterns are
            # now applied with regex=True.
            is_regex = pattern_name in self._REGEX_PATTERN_NAMES
            cleaned = cleaned.str.replace(pattern, " ", regex=is_regex)
        return cleaned.str.strip().str.lower()

    def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean all object/string columns of *df*; returns a copy."""
        df = df.copy()
        columns = df.select_dtypes(include=["object", "string"]).columns
        for col in columns:
            df[col] = self.clean_text_series(df[col])
        return df