refactoring: add initial pipeline configuration and model classes

This commit is contained in:
2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
+111
View File
@@ -0,0 +1,111 @@
import json
import logging
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional
import pandas as pd
from processing.batch.batch_config import BatchConfig
from core.config.pipeline_config import PipelineConfig
@dataclass
class PipelineState:
    """Tracks the state of pipeline execution.

    ``failed_batches`` defaults to ``None`` in the signature and is replaced
    by a fresh empty list in ``__post_init__``, so instances never share one
    list object and callers may still pass ``None`` explicitly.
    """

    processed_batches: int = 0
    total_batches: int = 0
    # Annotated as Optional because ``None`` is the accepted "not supplied" value.
    failed_batches: Optional[List[int]] = None
    last_checkpoint: Optional[str] = None

    def __post_init__(self):
        """Replace a missing ``failed_batches`` with a new empty list."""
        if self.failed_batches is None:
            self.failed_batches = []
class PipelineStep(ABC):
    """Abstract base class for pipeline steps.

    Subclasses implement :meth:`process_batch`. This base class stores the
    configuration objects and provides checkpoint/state persistence under
    ``pipeline_config.paths.checkpoints_dir / name``.
    """

    def __init__(
        self,
        name: str,
        pipeline_config: "PipelineConfig",
        batch_config: Optional["BatchConfig"] = None,
    ):
        """Store the step name and configuration.

        Args:
            name: Step name; also used as the checkpoint sub-directory name.
            pipeline_config: Pipeline-wide configuration object.
            batch_config: Optional batch settings. When omitted, one is built
                from ``pipeline_config.processing``.
        """
        self.name = name
        self.pipeline_config = pipeline_config

        # Use provided batch_config or create default from pipeline config
        if batch_config is None:
            batch_config = BatchConfig(
                batch_size=pipeline_config.processing.batch_size,
                max_workers=pipeline_config.processing.max_workers,
                checkpoint_interval=pipeline_config.processing.checkpoint_interval,
                use_multiprocessing=pipeline_config.processing.use_multiprocessing,
            )
        self.batch_config = batch_config
        self.state = PipelineState()

    @abstractmethod
    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process a single batch of data"""
        pass

    def get_checkpoint_path(self, batch_id: int) -> str:
        """Get the checkpoint file path for a batch.

        Creates the step's checkpoint directory if it does not exist yet.
        """
        checkpoint_dir = self.pipeline_config.paths.checkpoints_dir / self.name
        checkpoint_dir.mkdir(parents=True, exist_ok=True)
        return str(checkpoint_dir / f"batch_{batch_id:06d}.csv")

    def get_state_path(self) -> str:
        """Get the state file path.

        Creates the step's checkpoint directory if it does not exist yet.
        """
        state_dir = self.pipeline_config.paths.checkpoints_dir / self.name
        state_dir.mkdir(parents=True, exist_ok=True)
        return str(state_dir / "pipeline_state.json")

    def save_state(self):
        """Save pipeline state to disk as JSON.

        The data is written to a temporary sibling file and then moved into
        place with ``os.replace`` so an interrupted write cannot leave a
        truncated state file behind.
        """
        state_file = self.get_state_path()
        payload = {
            "processed_batches": self.state.processed_batches,
            "total_batches": self.state.total_batches,
            "failed_batches": self.state.failed_batches,
            "last_checkpoint": self.state.last_checkpoint,
        }
        tmp_file = f"{state_file}.tmp"
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(payload, f)
        os.replace(tmp_file, state_file)

    def load_state(self) -> bool:
        """Load pipeline state from disk. Returns True if state was loaded.

        A missing state file returns ``False`` silently; an unreadable or
        malformed one logs a warning and returns ``False``. ``self.state`` is
        only modified when the file was parsed successfully.
        """
        state_file = self.get_state_path()
        try:
            with open(state_file, "r", encoding="utf-8") as f:
                state_data = json.load(f)
        except FileNotFoundError:
            return False
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logging.warning(f"Failed to load state: {e}")
            return False

        if not isinstance(state_data, dict):
            logging.warning(
                f"Failed to load state: expected a JSON object, got {type(state_data).__name__}"
            )
            return False

        self.state.processed_batches = state_data.get("processed_batches", 0)
        self.state.total_batches = state_data.get("total_batches", 0)
        # A stored null would otherwise leave failed_batches as None.
        self.state.failed_batches = state_data.get("failed_batches") or []
        self.state.last_checkpoint = state_data.get("last_checkpoint")
        return True

    def batch_exists(self, batch_id: int) -> bool:
        """Check if a batch has already been processed (idempotency)"""
        checkpoint_path = self.get_checkpoint_path(batch_id)
        return os.path.exists(checkpoint_path)

    def save_batch(self, batch: pd.DataFrame, batch_id: int):
        """Save processed batch to checkpoint.

        The CSV is written to a temporary file first and renamed afterwards,
        so :meth:`batch_exists` never sees a partially written checkpoint.
        """
        checkpoint_path = self.get_checkpoint_path(batch_id)
        tmp_path = f"{checkpoint_path}.tmp"
        batch.to_csv(tmp_path, index=False)
        os.replace(tmp_path, checkpoint_path)
        logging.info(f"Saved batch {batch_id} to {checkpoint_path}")

    def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
        """Load processed batch from checkpoint, or return None if absent."""
        checkpoint_path = self.get_checkpoint_path(batch_id)
        if not os.path.exists(checkpoint_path):
            return None
        return pd.read_csv(checkpoint_path)
+28
View File
@@ -0,0 +1,28 @@
import logging
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from core.utils.text_cleaner import TextCleaner
from processing.steps import PipelineStep
class DataCleaningStep(PipelineStep):
    """Pipeline step that removes incomplete rows and cleans text columns.

    Rows lacking any of ``required_columns`` are dropped, then the remaining
    frame is handed to ``TextCleaner.clean_dataframe_text_columns``.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        """Register the step as ``data_cleaning`` and set up its helpers."""
        super().__init__("data_cleaning", pipeline_config)
        # Columns that must be non-null for a row to be kept.
        self.required_columns = ["name", "sex", "region"]
        self.text_cleaner = TextCleaner()

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Return the cleaned version of ``batch``.

        Args:
            batch: Rows to clean.
            batch_id: Identifier of the batch, used for logging only.
        """
        row_count = len(batch)
        logging.info(f"Cleaning batch {batch_id} with {row_count} rows")

        # Discard rows where an essential value is missing before text cleaning.
        complete_rows = batch.dropna(subset=self.required_columns)
        return self.text_cleaner.clean_dataframe_text_columns(complete_rows)
+60
View File
@@ -0,0 +1,60 @@
import numpy as np
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from processing.steps.feature_extraction_step import Gender
from core.utils.data_loader import DataLoader
from processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep
class DataSplittingStep(PipelineStep):
    """Configuration-driven data splitting step.

    Writes the "featured" dataset and, depending on configuration, a separate
    evaluation split and per-gender files.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        """Register the step as ``data_splitting`` with a single-worker batch config."""
        batch_config = BatchConfig(
            batch_size=pipeline_config.processing.batch_size,
            max_workers=1,  # No need for parallelism in splitting
            checkpoint_interval=pipeline_config.processing.checkpoint_interval,
            use_multiprocessing=False,
        )
        super().__init__("data_splitting", pipeline_config, batch_config)
        self.data_loader = DataLoader(pipeline_config)
        self.eval_indices = None

    def determine_eval_indices(self, total_size: int) -> set:
        """Determine evaluation indices consistently across batches.

        The indices are row *positions* in ``range(total_size)``. They are
        drawn once and cached on the instance; later calls return the cached
        set regardless of ``total_size``.

        Args:
            total_size: Number of rows the positions are sampled from.

        Returns:
            Set of sampled row positions.
        """
        if self.eval_indices is None:
            # A private RandomState seeded with the configured seed yields the
            # same draw as seeding the global legacy generator, without
            # clobbering the process-wide random state for other code.
            rng = np.random.RandomState(self.pipeline_config.data.random_seed)
            eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
            self.eval_indices = set(rng.choice(total_size, size=eval_size, replace=False))
        return self.eval_indices

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process batch for data splitting - no modification needed"""
        return batch.copy()

    def save_splits(self, df: pd.DataFrame) -> None:
        """Save the split datasets based on configuration.

        Args:
            df: Full dataset to split and write.
        """
        output_files = self.pipeline_config.data.output_files
        data_dir = self.pipeline_config.paths.data_dir

        if self.pipeline_config.data.split_evaluation:
            eval_indices = self.determine_eval_indices(len(df))
            # The sampled values are positions, so match them against row
            # positions rather than index labels; this stays correct when the
            # frame's index is not the default 0..n-1 range.
            eval_mask = np.isin(np.arange(len(df)), list(eval_indices))
            df_evaluation = df[eval_mask]
            df_featured = df[~eval_mask]
            self.data_loader.save_csv(df_evaluation, data_dir / output_files["evaluation"])
            self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
        else:
            self.data_loader.save_csv(df, data_dir / output_files["featured"])

        if self.pipeline_config.data.split_by_gender:
            df_males = df[df["sex"] == Gender.MALE.value]
            df_females = df[df["sex"] == Gender.FEMALE.value]
            self.data_loader.save_csv(df_males, data_dir / output_files["males"])
            self.data_loader.save_csv(df_females, data_dir / output_files["females"])
@@ -0,0 +1,99 @@
import logging
from enum import Enum
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from core.utils.region_mapper import RegionMapper
from processing.steps import PipelineStep
class Gender(Enum):
    """Gender codes; each member's value is its single-letter code."""

    MALE = "m"
    FEMALE = "f"
class NameCategory(Enum):
    """Name categories; each member's value is its lowercase label."""

    SIMPLE = "simple"
    COMPOSE = "compose"
class FeatureExtractionStep(PipelineStep):
    """Configuration-driven feature extraction step.

    Derives word/length features, probable name parts, a name category, the
    province for each region and a normalised gender code.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        """Register the step as ``feature_extraction`` and create its region mapper."""
        super().__init__("feature_extraction", pipeline_config)
        self.region_mapper = RegionMapper()

    @classmethod
    def validate_gender(cls, gender: str) -> Gender:
        """Validate and normalize gender value.

        Matching is case-insensitive and ignores surrounding whitespace.

        Raises:
            ValueError: If the value is not one of the recognised spellings.
        """
        gender_lower = gender.lower().strip()
        if gender_lower in ("m", "male", "homme", "masculin"):
            return Gender.MALE
        if gender_lower in ("f", "female", "femme", "féminin"):
            return Gender.FEMALE
        raise ValueError(f"Unknown gender: {gender}")

    @classmethod
    def get_name_category(cls, word_count: int) -> NameCategory:
        """Determine name category based on word count.

        Counts of three or fewer are ``SIMPLE``; anything else is ``COMPOSE``.
        """
        if word_count <= 3:
            return NameCategory.SIMPLE
        return NameCategory.COMPOSE

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Extract features from names in batch.

        Args:
            batch: Rows with at least ``name`` and ``region`` columns; ``year``
                and ``sex`` are processed when present.
            batch_id: Identifier of the batch, used for logging only.

        Returns:
            A copy of ``batch`` with the feature columns added.
        """
        logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")
        batch = batch.copy()

        # Tokenise once on whitespace. The word count is taken from these
        # tokens, not from the number of spaces, so repeated or padding
        # whitespace cannot make ``words`` disagree with the tokens that the
        # probable/identified name parts are built from.
        name_splits = batch["name"].str.split()

        # Basic features
        batch["words"] = name_splits.str.len()
        batch["length"] = batch["name"].str.replace(" ", "", regex=False).str.len()

        # Handle year column
        if "year" in batch.columns:
            batch["year"] = pd.to_numeric(batch["year"], errors="coerce").astype("Int64")

        # Probable native name = every token but the last; probable surname =
        # the last token. Names with fewer than two tokens get None for both.
        batch["probable_native"] = name_splits.apply(
            lambda x: " ".join(x[:-1]) if isinstance(x, list) and len(x) >= 2 else None
        )
        batch["probable_surname"] = name_splits.apply(
            lambda x: x[-1] if isinstance(x, list) and len(x) >= 2 else None
        )

        # Initialize annotation columns
        batch["identified_name"] = None
        batch["identified_surname"] = None
        batch["annotated"] = 0

        # Category assignment
        batch["identified_category"] = batch["words"].apply(
            lambda x: self.get_name_category(x).value
        )

        # Auto-assign for 3-word names
        three_word_mask = batch["words"] == 3
        batch.loc[three_word_mask, "identified_name"] = batch.loc[
            three_word_mask, "probable_native"
        ]
        batch.loc[three_word_mask, "identified_surname"] = batch.loc[
            three_word_mask, "probable_surname"
        ]
        batch.loc[three_word_mask, "annotated"] = 1

        # Map regions to provinces
        batch["province"] = self.region_mapper.map_regions_vectorized(batch["region"])

        # Normalize gender
        if "sex" in batch.columns:
            batch["sex"] = batch["sex"].apply(lambda x: self.validate_gender(str(x)).value)

        return batch
+168
View File
@@ -0,0 +1,168 @@
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, Optional
import ollama
import pandas as pd
from pydantic import ValidationError, BaseModel
from core.config.pipeline_config import PipelineConfig
from core.utils.prompt_manager import PromptManager
from core.utils.rate_limiter import RateLimiter
from core.utils.rate_limiter import RateLimitConfig
from processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep
class NameAnnotation(BaseModel):
    """Schema of one name-annotation result.

    Both fields accept a string or ``None`` and are declared without a
    default value.
    """

    identified_name: Optional[str]
    identified_surname: Optional[str]
class LLMAnnotationStep(PipelineStep):
    """Configuration-driven LLM annotation step.

    Sends each not-yet-annotated name to an Ollama chat model and writes the
    structured answer back into the batch.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        """Register the step as ``llm_annotation`` and prepare prompt, limiter and counters."""
        # Create custom batch config for LLM processing
        batch_config = BatchConfig(
            batch_size=pipeline_config.processing.batch_size,
            max_workers=min(
                pipeline_config.llm.max_concurrent_requests, pipeline_config.processing.max_workers
            ),
            checkpoint_interval=pipeline_config.processing.checkpoint_interval,
            use_multiprocessing=pipeline_config.processing.use_multiprocessing,
        )
        super().__init__("llm_annotation", pipeline_config, batch_config)

        self.prompt = PromptManager(pipeline_config).load_prompt()
        self.rate_limiter = (
            self._create_rate_limiter() if pipeline_config.llm.enable_rate_limiting else None
        )

        # Statistics
        # NOTE(review): these counters are incremented from worker threads in
        # process_batch without synchronisation, so totals may be slightly off
        # under heavy concurrency — confirm whether exact counts are needed.
        self.successful_requests = 0
        self.failed_requests = 0
        self.total_retry_attempts = 0

        # Setup logging
        logging.getLogger("httpx").setLevel(logging.WARNING)

    def _create_rate_limiter(self):
        """Create rate limiter based on configuration"""
        rate_config = RateLimitConfig(
            requests_per_minute=self.pipeline_config.llm.requests_per_minute,
            requests_per_second=self.pipeline_config.llm.requests_per_second,
        )
        return RateLimiter(rate_config)

    def analyze_name_with_retry(self, client: ollama.Client, name: str, row_id: int) -> Dict:
        """Analyze a name with retry logic and rate limiting.

        Args:
            client: Ollama client used for the chat request.
            name: The name sent as the user message.
            row_id: Identifier of the row being analysed (currently unused).

        Returns:
            On success, the annotation fields plus ``annotated=1``,
            ``processing_time`` and ``attempts``. After all attempts fail, a
            dict with ``None`` name fields, ``annotated=0`` and ``failed=True``.
        """
        llm_config = self.pipeline_config.llm
        max_attempts = llm_config.retry_attempts

        for attempt in range(max_attempts):
            try:
                # Apply rate limiting if enabled
                if self.rate_limiter:
                    self.rate_limiter.wait_if_needed()

                # Monotonic clock: immune to wall-clock adjustments.
                start_time = time.monotonic()
                response = client.chat(
                    model=llm_config.model_name,
                    messages=[
                        {"role": "system", "content": self.prompt},
                        {"role": "user", "content": name},
                    ],
                    format=NameAnnotation.model_json_schema(),
                )
                elapsed_time = time.monotonic() - start_time

                if elapsed_time > llm_config.timeout_seconds:
                    raise TimeoutError(
                        f"Request took {elapsed_time:.2f}s, exceeding {llm_config.timeout_seconds}s timeout"
                    )

                annotation = NameAnnotation.model_validate_json(response.message.content)
                result = {
                    **annotation.model_dump(),
                    "annotated": 1,
                    "processing_time": elapsed_time,
                    "attempts": attempt + 1,
                }
                self.successful_requests += 1
                self.total_retry_attempts += attempt
                return result
            except Exception as e:
                # Deliberately broad: any failure (validation, timeout,
                # transport) counts as a failed attempt and is retried.
                logging.warning(
                    f"Error analyzing '{name}' (attempt {attempt + 1}/{max_attempts}): {e}"
                )
                # Exponential backoff with jitter
                if attempt < max_attempts - 1:
                    wait_time = (2**attempt) + (time.time() % 1)
                    time.sleep(min(wait_time, 10))

        self.failed_requests += 1
        return {
            "identified_name": None,
            "identified_surname": None,
            "annotated": 0,
            "processing_time": 0,
            "attempts": max_attempts,
            "failed": True,
        }

    @staticmethod
    def _apply_result(batch: pd.DataFrame, idx, result: Dict) -> None:
        """Write one annotation result into row ``idx``, skipping the ``failed`` marker."""
        for field, value in result.items():
            if field != "failed":
                batch.loc[idx, field] = value

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process batch with LLM annotation.

        Rows whose ``annotated`` value is 0, missing or non-numeric are sent
        to the model; a batch without an ``annotated`` column is treated as
        entirely unannotated.

        Args:
            batch: Rows with a ``name`` column.
            batch_id: Identifier of the batch, used for logging only.

        Returns:
            ``batch`` itself when nothing needs annotating, otherwise an
            annotated copy with ``annotated`` cast to ``Int8``.
        """
        has_annotated_column = "annotated" in batch.columns
        if has_annotated_column:
            annotated = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0)
            unannotated_mask = annotated == 0
        else:
            # Without the column there is nothing marking rows as done.
            unannotated_mask = pd.Series(True, index=batch.index)
        unannotated_entries = batch[unannotated_mask]

        if unannotated_entries.empty:
            logging.info(f"Batch {batch_id}: No entries to annotate")
            return batch

        logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries")

        batch = batch.copy()
        if not has_annotated_column:
            batch["annotated"] = 0

        # Give the HTTP client the configured timeout so a slow request is
        # aborted instead of only being detected after it has completed.
        client = ollama.Client(timeout=self.pipeline_config.llm.timeout_seconds)

        # Process with controlled concurrency
        max_workers = self.pipeline_config.llm.max_concurrent_requests

        if len(unannotated_entries) == 1 or max_workers <= 1:
            # Sequential processing
            for idx, row in unannotated_entries.iterrows():
                result = self.analyze_name_with_retry(client, row["name"], idx)
                self._apply_result(batch, idx, result)
        else:
            # Concurrent processing
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                future_to_idx = {
                    executor.submit(self.analyze_name_with_retry, client, row["name"], idx): idx
                    for idx, row in unannotated_entries.iterrows()
                }

                # Results are applied in the calling thread only, so the
                # DataFrame is never written from the workers.
                for future in as_completed(future_to_idx):
                    idx = future_to_idx[future]
                    try:
                        self._apply_result(batch, idx, future.result())
                    except Exception as e:
                        logging.error(f"Failed to process row {idx}: {e}")
                        batch.loc[idx, "annotated"] = 0

        # Ensure proper data types
        batch["annotated"] = (
            pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
        )
        return batch