refactoring: add initial pipeline configuration and model classes

2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
@@ -0,0 +1,111 @@
+import json
+import logging
+import os
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import List, Optional
+
+import pandas as pd
+
+from processing.batch.batch_config import BatchConfig
+from core.config.pipeline_config import PipelineConfig
+
+
+@dataclass
+class PipelineState:
+    """Tracks the state of pipeline execution.
+
+    Attributes:
+        processed_batches: Counter of processed batches (starts at 0).
+        total_batches: Total number of batches (starts at 0).
+        failed_batches: Ids of failed batches. ``None`` is accepted as a
+            sentinel and replaced by a fresh empty list in ``__post_init__``
+            so that instances never share one mutable default.
+        last_checkpoint: Optional string describing the last checkpoint.
+    """
+
+    processed_batches: int = 0
+    total_batches: int = 0
+    # Annotated Optional because ``None`` is the declared default; the real
+    # list is created per instance in __post_init__.
+    failed_batches: Optional[List[int]] = None
+    last_checkpoint: Optional[str] = None
+
+    def __post_init__(self):
+        """Replace the ``None`` sentinel with a new, instance-owned list."""
+        if self.failed_batches is None:
+            self.failed_batches = []
+
+
+class PipelineStep(ABC):
+    """Abstract base class for pipeline steps.
+
+    A step holds a name, the pipeline configuration, a batch configuration
+    and a ``PipelineState``. Subclasses implement ``process_batch``; this base
+    class provides per-batch CSV checkpoints and JSON persistence of the
+    state, both stored under ``<checkpoints_dir>/<step name>/``.
+    """
+
+    def __init__(
+        self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
+    ):
+        """Create the step.
+
+        Args:
+            name: Step name; also used as the checkpoint sub-directory name.
+            pipeline_config: Pipeline configuration (``processing`` and
+                ``paths`` sections are read by this class).
+            batch_config: Batch settings for this step. When omitted, one is
+                built from ``pipeline_config.processing``.
+        """
+        self.name = name
+        self.pipeline_config = pipeline_config
+
+        # Use provided batch_config or create default from pipeline config
+        if batch_config is None:
+            batch_config = BatchConfig(
+                batch_size=pipeline_config.processing.batch_size,
+                max_workers=pipeline_config.processing.max_workers,
+                checkpoint_interval=pipeline_config.processing.checkpoint_interval,
+                use_multiprocessing=pipeline_config.processing.use_multiprocessing,
+            )
+        self.batch_config = batch_config
+        self.state = PipelineState()
+
+    @abstractmethod
+    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
+        """Process a single batch of data and return the resulting frame."""
+        pass
+
+    def _ensure_checkpoint_dir(self):
+        """Return this step's checkpoint directory, creating it if needed."""
+        checkpoint_dir = self.pipeline_config.paths.checkpoints_dir / self.name
+        checkpoint_dir.mkdir(parents=True, exist_ok=True)
+        return checkpoint_dir
+
+    def get_checkpoint_path(self, batch_id: int) -> str:
+        """Get the checkpoint file path for a batch.
+
+        The step's checkpoint directory is created as a side effect.
+        """
+        return str(self._ensure_checkpoint_dir() / f"batch_{batch_id:06d}.csv")
+
+    def get_state_path(self) -> str:
+        """Get the state file path.
+
+        The step's checkpoint directory is created as a side effect.
+        """
+        return str(self._ensure_checkpoint_dir() / "pipeline_state.json")
+
+    def save_state(self):
+        """Save pipeline state to disk as JSON.
+
+        The data is first written to a temporary sibling file and then moved
+        over the real state file with ``os.replace``, so an interrupted write
+        cannot leave a truncated state file behind.
+        """
+        state_file = self.get_state_path()
+        tmp_file = f"{state_file}.tmp"
+        payload = {
+            "processed_batches": self.state.processed_batches,
+            "total_batches": self.state.total_batches,
+            "failed_batches": self.state.failed_batches,
+            "last_checkpoint": self.state.last_checkpoint,
+        }
+        with open(tmp_file, "w", encoding="utf-8") as f:
+            json.dump(payload, f)
+        os.replace(tmp_file, state_file)
+
+    def load_state(self) -> bool:
+        """Load pipeline state from disk. Returns True if state was loaded.
+
+        Loading is best-effort: a missing file returns False, and any error
+        while reading or parsing is logged as a warning and also returns
+        False.
+        """
+        state_file = self.get_state_path()
+        if not os.path.exists(state_file):
+            return False
+
+        try:
+            with open(state_file, "r", encoding="utf-8") as f:
+                state_data = json.load(f)
+            self.state.processed_batches = state_data.get("processed_batches", 0)
+            self.state.total_batches = state_data.get("total_batches", 0)
+            # A stored JSON ``null`` must not turn the list into None.
+            self.state.failed_batches = state_data.get("failed_batches") or []
+            self.state.last_checkpoint = state_data.get("last_checkpoint")
+            return True
+        except Exception as e:
+            logging.warning(f"Failed to load state: {e}")
+        return False
+
+    def batch_exists(self, batch_id: int) -> bool:
+        """Check if a batch has already been processed (idempotency).
+
+        A batch counts as processed when its checkpoint file exists.
+        """
+        checkpoint_path = self.get_checkpoint_path(batch_id)
+        return os.path.exists(checkpoint_path)
+
+    def save_batch(self, batch: pd.DataFrame, batch_id: int):
+        """Save processed batch to its CSV checkpoint (index not written)."""
+        checkpoint_path = self.get_checkpoint_path(batch_id)
+        batch.to_csv(checkpoint_path, index=False)
+        logging.info(f"Saved batch {batch_id} to {checkpoint_path}")
+
+    def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
+        """Load processed batch from checkpoint, or None if there is none."""
+        checkpoint_path = self.get_checkpoint_path(batch_id)
+        if os.path.exists(checkpoint_path):
+            return pd.read_csv(checkpoint_path)
+        return None
@@ -0,0 +1,28 @@
+import logging
+
+import pandas as pd
+
+from core.config.pipeline_config import PipelineConfig
+from core.utils.text_cleaner import TextCleaner
+from processing.steps import PipelineStep
+
+
+class DataCleaningStep(PipelineStep):
+    """Pipeline step that discards incomplete rows and cleans text columns."""
+
+    def __init__(self, pipeline_config: PipelineConfig):
+        """Register the step under the name ``data_cleaning``."""
+        super().__init__("data_cleaning", pipeline_config)
+        self.text_cleaner = TextCleaner()
+        # A row missing a value in any of these columns is dropped.
+        self.required_columns = ["name", "sex", "region"]
+
+    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
+        """Clean one batch.
+
+        Rows with a missing value in a required column are removed first,
+        then the remaining frame is passed through the text cleaner.
+        """
+        logging.info(f"Cleaning batch {batch_id} with {len(batch)} rows")
+
+        complete_rows = batch.dropna(subset=self.required_columns)
+        return self.text_cleaner.clean_dataframe_text_columns(complete_rows)
@@ -0,0 +1,60 @@
+import numpy as np
+import pandas as pd
+
+from core.config.pipeline_config import PipelineConfig
+from processing.steps.feature_extraction_step import Gender
+from core.utils.data_loader import DataLoader
+
+from processing.batch.batch_config import BatchConfig
+from processing.steps import PipelineStep
+
+
+class DataSplittingStep(PipelineStep):
+    """Configuration-driven data splitting step.
+
+    Batches pass through unchanged; the actual work happens in
+    ``save_splits``, which writes the evaluation/featured split and the
+    optional per-gender files named in ``pipeline_config.data.output_files``.
+    """
+
+    def __init__(self, pipeline_config: PipelineConfig):
+        """Create the step with a single-worker, non-multiprocessing batch config."""
+        batch_config = BatchConfig(
+            batch_size=pipeline_config.processing.batch_size,
+            max_workers=1,  # No need for parallelism in splitting
+            checkpoint_interval=pipeline_config.processing.checkpoint_interval,
+            use_multiprocessing=False,
+        )
+        super().__init__("data_splitting", pipeline_config, batch_config)
+        self.data_loader = DataLoader(pipeline_config)
+        self.eval_indices = None
+
+    def determine_eval_indices(self, total_size: int) -> set:
+        """Determine evaluation indices consistently across batches.
+
+        The indices are row positions drawn without replacement from
+        ``range(total_size)``. They are computed on the first call and cached
+        on the instance; later calls return the cached set regardless of
+        ``total_size``.
+        """
+        if self.eval_indices is None:
+            # A local legacy generator yields the same draw as seeding the
+            # global NumPy RNG, without disturbing that global state.
+            rng = np.random.RandomState(self.pipeline_config.data.random_seed)
+            eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
+            self.eval_indices = set(rng.choice(total_size, size=eval_size, replace=False))
+        return self.eval_indices
+
+    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
+        """Process batch for data splitting - returns an unmodified copy."""
+        return batch.copy()
+
+    def save_splits(self, df: pd.DataFrame) -> None:
+        """Save the split datasets based on configuration.
+
+        With ``split_evaluation`` enabled, rows at the sampled positions go
+        to the ``evaluation`` file and all others to the ``featured`` file;
+        otherwise the whole frame is written as ``featured``. With
+        ``split_by_gender`` enabled, the full frame is additionally written
+        to the ``males`` and ``females`` files according to the ``sex``
+        column.
+        """
+        output_files = self.pipeline_config.data.output_files
+        data_dir = self.pipeline_config.paths.data_dir
+
+        if self.pipeline_config.data.split_evaluation:
+            eval_positions = self.determine_eval_indices(len(df))
+            # The sampled values are row positions, not index labels, so the
+            # mask is built positionally. This stays correct when the index
+            # is not 0..n-1 (e.g. after rows were dropped or frames were
+            # concatenated).
+            eval_mask = np.isin(np.arange(len(df)), list(eval_positions))
+
+            df_evaluation = df[eval_mask]
+            df_featured = df[~eval_mask]
+
+            self.data_loader.save_csv(df_evaluation, data_dir / output_files["evaluation"])
+            self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
+        else:
+            self.data_loader.save_csv(df, data_dir / output_files["featured"])
+
+        if self.pipeline_config.data.split_by_gender:
+            df_males = df[df["sex"] == Gender.MALE.value]
+            df_females = df[df["sex"] == Gender.FEMALE.value]
+
+            self.data_loader.save_csv(df_males, data_dir / output_files["males"])
+            self.data_loader.save_csv(df_females, data_dir / output_files["females"])
@@ -0,0 +1,99 @@
+import logging
+from enum import Enum
+
+import pandas as pd
+
+from core.config.pipeline_config import PipelineConfig
+from core.utils.region_mapper import RegionMapper
+from processing.steps import PipelineStep
+
+
+class Gender(Enum):
+    """Canonical gender codes; each member's value is the one-letter code written to the ``sex`` column."""
+
+    MALE = "m"
+    FEMALE = "f"
+
+
+class NameCategory(Enum):
+    """Name category labels; the member value is the string stored in ``identified_category``."""
+
+    # Assigned by FeatureExtractionStep.get_name_category for word counts <= 3.
+    SIMPLE = "simple"
+    # Assigned for every other word count.
+    COMPOSE = "compose"
+
+
+class FeatureExtractionStep(PipelineStep):
+    """Configuration-driven feature extraction step.
+
+    Derives word/length features, a name category, probable name parts, a
+    province and a normalized gender code for every row of a batch.
+    """
+
+    def __init__(self, pipeline_config: PipelineConfig):
+        """Register the step under the name ``feature_extraction``."""
+        super().__init__("feature_extraction", pipeline_config)
+        self.region_mapper = RegionMapper()
+
+    @classmethod
+    def validate_gender(cls, gender: str) -> Gender:
+        """Validate and normalize gender value.
+
+        Matching is case-insensitive and ignores surrounding whitespace.
+
+        Raises:
+            ValueError: If the value is not one of the recognised spellings.
+        """
+        gender_lower = gender.lower().strip()
+        if gender_lower in ("m", "male", "homme", "masculin"):
+            return Gender.MALE
+        if gender_lower in ("f", "female", "femme", "féminin"):
+            return Gender.FEMALE
+        raise ValueError(f"Unknown gender: {gender}")
+
+    @classmethod
+    def get_name_category(cls, word_count: int) -> NameCategory:
+        """Determine name category based on word count (<= 3 is SIMPLE)."""
+        if word_count <= 3:
+            return NameCategory.SIMPLE
+        return NameCategory.COMPOSE
+
+    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
+        """Extract features from names in batch.
+
+        Works on a copy of ``batch`` and adds the columns ``words``,
+        ``length``, ``probable_native``, ``probable_surname``,
+        ``identified_name``, ``identified_surname``, ``annotated``,
+        ``identified_category`` and ``province``. ``year`` and ``sex`` are
+        normalized when present.
+
+        Raises:
+            ValueError: If a ``sex`` value is not a recognised gender.
+        """
+        logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")
+
+        batch = batch.copy()
+
+        # Split on whitespace once and derive the word count from the same
+        # parts used below. Counting single spaces would over-count names
+        # with repeated, leading or trailing spaces and could wrongly flag
+        # them as 3-word names.
+        name_splits = batch["name"].str.split()
+
+        # Basic features
+        batch["words"] = name_splits.str.len()
+        batch["length"] = batch["name"].str.replace(" ", "", regex=False).str.len()
+
+        # Handle year column
+        if "year" in batch.columns:
+            batch["year"] = pd.to_numeric(batch["year"], errors="coerce").astype("Int64")
+
+        # Initialize new columns (this also fixes their order in the output)
+        batch["probable_native"] = None
+        batch["probable_surname"] = None
+        batch["identified_name"] = None
+        batch["identified_surname"] = None
+        batch["annotated"] = 0
+
+        # Category assignment, one call per row
+        batch["identified_category"] = batch["words"].apply(
+            lambda x: self.get_name_category(x).value
+        )
+
+        # Assign probable_native and probable_surname for all names with at
+        # least two words: everything but the last word / the last word.
+        batch["probable_native"] = name_splits.apply(
+            lambda x: " ".join(x[:-1]) if isinstance(x, list) and len(x) >= 2 else None
+        )
+        batch["probable_surname"] = name_splits.apply(
+            lambda x: x[-1] if isinstance(x, list) and len(x) >= 2 else None
+        )
+
+        # Auto-assign for 3-word names: the probable parts are accepted as
+        # the identified ones and the row is marked as annotated.
+        three_word_mask = batch["words"] == 3
+        batch.loc[three_word_mask, "identified_name"] = batch.loc[
+            three_word_mask, "probable_native"
+        ]
+        batch.loc[three_word_mask, "identified_surname"] = batch.loc[
+            three_word_mask, "probable_surname"
+        ]
+        batch.loc[three_word_mask, "annotated"] = 1
+
+        # Map regions to provinces
+        batch["province"] = self.region_mapper.map_regions_vectorized(batch["region"])
+
+        # Normalize gender
+        if "sex" in batch.columns:
+            batch["sex"] = batch["sex"].apply(lambda x: self.validate_gender(str(x)).value)
+
+        return batch
@@ -0,0 +1,168 @@
+import logging
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict, Optional
+
+import ollama
+import pandas as pd
+from pydantic import ValidationError, BaseModel
+
+from core.config.pipeline_config import PipelineConfig
+from core.utils.prompt_manager import PromptManager
+from core.utils.rate_limiter import RateLimiter
+from core.utils.rate_limiter import RateLimitConfig
+from processing.batch.batch_config import BatchConfig
+from processing.steps import PipelineStep
+
+
+class NameAnnotation(BaseModel):
+    """Model for name annotation results"""
+
+    # Its JSON schema is passed as ``format`` to the chat call and the reply
+    # is validated against it in LLMAnnotationStep.analyze_name_with_retry.
+    identified_name: Optional[str]
+    identified_surname: Optional[str]
+
+
+class LLMAnnotationStep(PipelineStep):
+    """Configuration-driven LLM annotation step.
+
+    Rows whose ``annotated`` flag is 0 (or missing) are sent to an Ollama
+    chat model, and the structured reply is written back into the batch.
+    """
+
+    def __init__(self, pipeline_config: PipelineConfig):
+        """Build the step, its prompt, optional rate limiter and counters."""
+        # Create custom batch config for LLM processing
+        batch_config = BatchConfig(
+            batch_size=pipeline_config.processing.batch_size,
+            max_workers=min(
+                pipeline_config.llm.max_concurrent_requests, pipeline_config.processing.max_workers
+            ),
+            checkpoint_interval=pipeline_config.processing.checkpoint_interval,
+            use_multiprocessing=pipeline_config.processing.use_multiprocessing,
+        )
+        super().__init__("llm_annotation", pipeline_config, batch_config)
+
+        self.prompt = PromptManager(pipeline_config).load_prompt()
+        self.rate_limiter = (
+            self._create_rate_limiter() if pipeline_config.llm.enable_rate_limiting else None
+        )
+
+        # Statistics
+        # NOTE(review): these counters are incremented from worker threads
+        # without synchronisation, so treat them as approximate when
+        # requests run concurrently.
+        self.successful_requests = 0
+        self.failed_requests = 0
+        self.total_retry_attempts = 0
+
+        # Setup logging
+        logging.getLogger("httpx").setLevel(logging.WARNING)
+
+    def _create_rate_limiter(self):
+        """Create rate limiter based on configuration"""
+        rate_config = RateLimitConfig(
+            requests_per_minute=self.pipeline_config.llm.requests_per_minute,
+            requests_per_second=self.pipeline_config.llm.requests_per_second,
+        )
+        return RateLimiter(rate_config)
+
+    def analyze_name_with_retry(self, client: ollama.Client, name: str, row_id: int) -> Dict:
+        """Analyze a name with retry logic and rate limiting.
+
+        Args:
+            client: Ollama client used for the chat request.
+            name: The name sent as the user message.
+            row_id: Identifier of the originating row (not used here).
+
+        Returns:
+            On success, the validated annotation fields plus ``annotated=1``,
+            ``processing_time`` and ``attempts``. After all attempts fail, a
+            dict with empty annotation fields, ``annotated=0`` and
+            ``failed=True``. This method does not raise on request errors.
+        """
+        llm_config = self.pipeline_config.llm
+
+        for attempt in range(llm_config.retry_attempts):
+            try:
+                # Apply rate limiting if enabled
+                if self.rate_limiter:
+                    self.rate_limiter.wait_if_needed()
+
+                start_time = time.time()
+                response = client.chat(
+                    model=llm_config.model_name,
+                    messages=[
+                        {"role": "system", "content": self.prompt},
+                        {"role": "user", "content": name},
+                    ],
+                    format=NameAnnotation.model_json_schema(),
+                )
+                elapsed_time = time.time() - start_time
+
+                # The limit is checked after the call returns, so a slow
+                # reply is discarded and retried rather than interrupted.
+                if elapsed_time > llm_config.timeout_seconds:
+                    raise TimeoutError(
+                        f"Request took {elapsed_time:.2f}s, exceeding {llm_config.timeout_seconds}s timeout"
+                    )
+
+                annotation = NameAnnotation.model_validate_json(response.message.content)
+                result = {
+                    **annotation.model_dump(),
+                    "annotated": 1,
+                    "processing_time": elapsed_time,
+                    "attempts": attempt + 1,
+                }
+
+                self.successful_requests += 1
+                if attempt > 0:
+                    self.total_retry_attempts += attempt
+
+                return result
+
+            # Validation and timeout errors are Exception subclasses, so one
+            # clause covers every failure that should trigger a retry.
+            except Exception as e:
+                logging.warning(
+                    f"Error analyzing '{name}' (attempt {attempt + 1}/{llm_config.retry_attempts}): {e}"
+                )
+
+                # Exponential backoff with jitter, capped at 10 seconds
+                if attempt < llm_config.retry_attempts - 1:
+                    wait_time = (2**attempt) + (time.time() % 1)
+                    time.sleep(min(wait_time, 10))
+
+        self.failed_requests += 1
+        return {
+            "identified_name": None,
+            "identified_surname": None,
+            "annotated": 0,
+            "processing_time": 0,
+            "attempts": llm_config.retry_attempts,
+            "failed": True,
+        }
+
+    @staticmethod
+    def _apply_result(batch: pd.DataFrame, idx, result: Dict) -> None:
+        """Write one annotation result into row ``idx``, skipping the ``failed`` marker."""
+        for field, value in result.items():
+            if field != "failed":
+                batch.loc[idx, field] = value
+
+    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
+        """Process batch with LLM annotation.
+
+        Only rows whose ``annotated`` flag is 0 or missing are sent to the
+        model; if the column does not exist, every row is treated as
+        unannotated. When nothing needs annotation the input frame is
+        returned as is, otherwise a modified copy is returned with
+        ``annotated`` cast to ``Int8``.
+        """
+        if "annotated" in batch.columns:
+            # Missing or non-numeric flags count as 0, matching the final
+            # cast at the end of this method.
+            annotated_flags = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0)
+            unannotated_mask = annotated_flags == 0
+        else:
+            # Comparing a scalar default would give a plain bool, and
+            # indexing the frame with it raises KeyError; build a real mask.
+            unannotated_mask = pd.Series(True, index=batch.index, dtype=bool)
+        unannotated_entries = batch[unannotated_mask]
+
+        if unannotated_entries.empty:
+            logging.info(f"Batch {batch_id}: No entries to annotate")
+            return batch
+
+        logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries")
+
+        batch = batch.copy()
+        client = ollama.Client()
+
+        # Process with controlled concurrency
+        max_workers = self.pipeline_config.llm.max_concurrent_requests
+
+        if len(unannotated_entries) == 1 or max_workers == 1:
+            # Sequential processing
+            for idx, row in unannotated_entries.iterrows():
+                result = self.analyze_name_with_retry(client, row["name"], idx)
+                self._apply_result(batch, idx, result)
+        else:
+            # Concurrent processing: requests run in worker threads, while
+            # the frame is only modified here in the calling thread.
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                future_to_idx = {}
+
+                for idx, row in unannotated_entries.iterrows():
+                    future = executor.submit(self.analyze_name_with_retry, client, row["name"], idx)
+                    future_to_idx[future] = idx
+
+                for future in as_completed(future_to_idx):
+                    idx = future_to_idx[future]
+                    try:
+                        self._apply_result(batch, idx, future.result())
+                    except Exception as e:
+                        logging.error(f"Failed to process row {idx}: {e}")
+                        batch.loc[idx, "annotated"] = 0
+
+        # Ensure proper data types
+        batch["annotated"] = (
+            pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
+        )
+
+        return batch