refactoring: add initial pipeline configuration and model classes

This commit is contained in:
2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
View File
-109
View File
@@ -1,109 +0,0 @@
import os
import argparse
import ollama
import pandas as pd
from pydantic import BaseModel, ValidationError
from tqdm import tqdm
from typing import Optional
from misc import load_prompt, load_csv_dataset, DATA_DIR, logging
class NameAnalysis(BaseModel):
    """Structured LLM response schema: the name/surname split of a full name."""

    # Optional[...] without a default: both fields are required in the JSON
    # payload but may be explicitly null when the model cannot identify them.
    identified_name: Optional[str]
    identified_surname: Optional[str]
def analyze_name(client: ollama.Client, model: str, prompt: str, name: str) -> dict:
    """
    Analyze a name using the specified model and prompt.

    Args:
        client: Ollama client used to issue the chat request.
        model: Name of the Ollama model to query.
        prompt: System prompt instructing the model how to split names.
        name: Raw full name to analyze.

    Returns:
        A dictionary with keys ``identified_name`` and ``identified_surname``.
        (The original docstring also promised a "category" key, but none is
        ever produced.) Both values are None when the request fails or the
        response does not validate against the NameAnalysis schema.
    """
    try:
        response = client.chat(
            model=model,
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": name},
            ],
            # Constrain the model output to the NameAnalysis JSON schema.
            format=NameAnalysis.model_json_schema(),
        )
        analysis = NameAnalysis.model_validate_json(response.message.content)
        return analysis.model_dump()
    except ValidationError as ve:
        logging.warning(f"Validation error: {ve}")
    except Exception as e:
        logging.error(f"Unexpected error: {e}")
    # Fall-through default: request failed or response was invalid.
    return {"identified_name": None, "identified_surname": None}
def save_checkpoint(df: pd.DataFrame):
    """Persist the full annotated DataFrame to names_featured.csv in DATA_DIR."""
    df.to_csv(os.path.join(DATA_DIR, "names_featured.csv"), index=False)
    # INFO, not CRITICAL: saving a checkpoint is routine progress, not an
    # emergency; the original also used an f-string with no placeholders.
    logging.info("Checkpoint saved")
def build_updates(llm_model: str, df: pd.DataFrame, entries: pd.DataFrame) -> pd.DataFrame:
    """
    Annotate every row of *entries* with the LLM and fold results back into *df*.

    Args:
        llm_model: Ollama model name passed through to analyze_name.
        df: Full dataset; updated in place via DataFrame.update and returned.
        entries: Subset of df rows (annotated == 0) to analyze.

    Returns:
        The updated df. Progress is checkpointed every BATCH_SIZE rows.
    """
    BATCH_SIZE = 10
    client = ollama.Client()
    prompt = load_prompt()
    updates = []
    # Reduce noise from the HTTP client used internally by Ollama.
    logging.getLogger("httpx").setLevel(logging.WARNING)
    for idx, (row_idx, row) in enumerate(entries.iterrows(), 1):
        try:
            entry = analyze_name(client, llm_model, prompt, row["name"])
            entry["annotated"] = 1
            updates.append((row_idx, entry))
            logging.info(f"Analyzed: {row['name']} - {entry}")
        except Exception as e:
            logging.warning(f"Failed to analyze '{row['name']}': {e}")
        # Flush periodically and at the very end. Two fixes vs the original:
        # (1) the old `continue` in the except branch skipped the final flush
        #     whenever the LAST entry failed, losing up to BATCH_SIZE-1 results;
        # (2) flushing an empty `updates` built a DataFrame with no columns,
        #     so update_df["annotated"] raised KeyError. Guard on `updates`.
        if updates and (idx % BATCH_SIZE == 0 or idx == len(entries)):
            update_df = pd.DataFrame.from_dict(dict(updates), orient="index")
            update_df["annotated"] = pd.to_numeric(update_df["annotated"], errors="coerce").fillna(0).astype("Int8")
            df.update(update_df)
            save_checkpoint(df)
            updates.clear()  # avoid re-applying same updates
    return df
def main(llm_model: str = "llama3.2:3b"):
    """
    Annotate all not-yet-annotated names in names_featured.csv with the LLM.

    Args:
        llm_model: Ollama model name. NOTE(review): this default differs from
            the argparse default below ("mistral:7b") — confirm which one is
            actually intended.
    """
    df = pd.DataFrame(load_csv_dataset(os.path.join(DATA_DIR, "names_featured.csv")))
    # Safely cast 'annotated' column to Int8, handling float-like strings (e.g., '1.0')
    df["annotated"] = pd.to_numeric(df["annotated"], errors="coerce").fillna(0).astype(float).astype("Int8")
    # Only rows that have not yet been annotated are sent to the LLM.
    entries = df[df["annotated"] == 0]
    if entries.empty:
        logging.info("No names to analyze.")
        return
    logging.info(f"Found {len(entries)} names to analyze.")
    df = build_updates(llm_model, df, entries)
    save_checkpoint(df)
    logging.info("Analysis complete.")
if __name__ == "__main__":
    # CLI entry point: choose the Ollama model, run the annotation, and log
    # (rather than crash on) any fatal error.
    arg_parser = argparse.ArgumentParser(description="Analyze names using an LLM model.")
    arg_parser.add_argument(
        "--llm_model",
        type=str,
        default="mistral:7b",
        help="Ollama model name to use (default: mistral:7b)",
    )
    cli_args = arg_parser.parse_args()
    try:
        main(llm_model=cli_args.llm_model)
    except Exception as e:
        logging.error(f"Fatal error: {e}", exc_info=True)
View File
+11
View File
@@ -0,0 +1,11 @@
from dataclasses import dataclass
@dataclass
class BatchConfig:
    """Configuration for batch processing"""

    batch_size: int = 1000  # rows per batch
    max_workers: int = 4  # worker count; 1 selects the sequential code path
    checkpoint_interval: int = 5  # Save checkpoint every N batches
    use_multiprocessing: bool = False  # Use ProcessPoolExecutor instead of ThreadPoolExecutor
+102
View File
@@ -0,0 +1,102 @@
import logging
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from typing import Iterator
import pandas as pd
from processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep
class BatchProcessor:
    """Handles batch processing with concurrency and checkpointing

    Splits a DataFrame into fixed-size batches, runs a PipelineStep over
    them (sequentially or with an executor), skips batches whose checkpoint
    already exists, and records failures on the step's state.
    """

    def __init__(self, config: BatchConfig):
        # Batch size, worker count, checkpoint cadence and executor choice
        # all come from this config.
        self.config = config

    def create_batches(self, df: pd.DataFrame) -> Iterator[tuple[pd.DataFrame, int]]:
        """Create batches from DataFrame

        Yields (batch, batch_id) pairs. batch_id is the 0-based positional
        slice index, so it is stable across runs for the same input. Each
        batch is a copy, so steps may mutate it freely.
        """
        total_rows = len(df)
        batch_size = self.config.batch_size
        for i in range(0, total_rows, batch_size):
            batch = df.iloc[i : i + batch_size].copy()
            batch_id = i // batch_size
            yield batch, batch_id

    def process_sequential(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
        """Process batches sequentially

        Already-checkpointed batches are loaded instead of recomputed
        (idempotent resume). Failed batches are recorded in
        step.state.failed_batches and their rows are absent from the result.
        """
        results = []
        for batch, batch_id in self.create_batches(df):
            if step.batch_exists(batch_id):
                logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
                processed_batch = step.load_batch(batch_id)
            else:
                try:
                    processed_batch = step.process_batch(batch, batch_id)
                    step.save_batch(processed_batch, batch_id)
                    step.state.processed_batches += 1
                except Exception as e:
                    logging.error(f"Failed to process batch {batch_id}: {e}")
                    step.state.failed_batches.append(batch_id)
                    continue
            results.append(processed_batch)
            # Save state periodically (also fires at batch_id 0).
            if batch_id % self.config.checkpoint_interval == 0:
                step.save_state()
        return pd.concat(results, ignore_index=True) if results else pd.DataFrame()

    def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
        """Process batches concurrently

        Uses a process pool when config.use_multiprocessing is set, otherwise
        a thread pool. Results are reassembled in batch-id order, so output
        ordering matches the sequential path. NOTE(review): worker callbacks
        mutate step.state from the collecting thread only, so no lock is
        taken — confirm process_batch itself does not touch shared state.
        """
        executor_class = (
            ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
        )
        results = {}
        with executor_class(max_workers=self.config.max_workers) as executor:
            # Submit all batches
            future_to_batch = {}
            for batch, batch_id in self.create_batches(df):
                if step.batch_exists(batch_id):
                    logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
                    results[batch_id] = step.load_batch(batch_id)
                else:
                    future = executor.submit(step.process_batch, batch, batch_id)
                    future_to_batch[future] = (batch_id, batch)
            # Collect results as they complete
            for future in as_completed(future_to_batch):
                batch_id, batch = future_to_batch[future]
                try:
                    processed_batch = future.result()
                    step.save_batch(processed_batch, batch_id)
                    results[batch_id] = processed_batch
                    step.state.processed_batches += 1
                    logging.info(f"Completed batch {batch_id}")
                except Exception as e:
                    logging.error(f"Failed to process batch {batch_id}: {e}")
                    step.state.failed_batches.append(batch_id)
        # Reassemble results in order
        ordered_results = []
        for batch_id in sorted(results.keys()):
            ordered_results.append(results[batch_id])
        step.save_state()
        return pd.concat(ordered_results, ignore_index=True) if ordered_results else pd.DataFrame()

    def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
        """Process data using the configured strategy

        Chooses the sequential path when max_workers == 1, otherwise the
        concurrent path. NOTE(review): total_batches is assigned before
        load_state(), which may overwrite it with a stale persisted value —
        confirm the intended ordering.
        """
        step.state.total_batches = (len(df) + self.config.batch_size - 1) // self.config.batch_size
        step.load_state()
        logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
        if self.config.max_workers == 1:
            return self.process_sequential(step, df)
        else:
            return self.process_concurrent(step, df)
View File
+80
View File
@@ -0,0 +1,80 @@
import logging
from typing import Dict
import pandas as pd
class DatasetAnalyzer:
    """Analyze dataset statistics and quality"""

    def __init__(self, filepath: str):
        # Path of the CSV to analyze; data is loaded lazily via load_data().
        self.filepath = filepath
        self.df = None

    def load_data(self) -> bool:
        """Read the CSV into memory; return True on success, False on failure."""
        try:
            self.df = pd.read_csv(self.filepath)
        except Exception as e:
            logging.error(f"Failed to load {self.filepath}: {e}")
            return False
        return True

    def analyze_completion(self) -> Dict:
        """Summarize annotation completion; empty dict when no data is loaded."""
        if self.df is None:
            return {}
        row_count = len(self.df)
        # Annotation status: a missing 'annotated' column means nothing done.
        if "annotated" in self.df.columns:
            done = (self.df["annotated"] == 1).sum()
            pending = (self.df["annotated"] == 0).sum()
        else:
            done, pending = 0, row_count
        # A name is "complete" when both identified fields are present.
        have_both = 0
        if "identified_name" in self.df.columns and "identified_surname" in self.df.columns:
            have_both = (
                self.df["identified_name"].notna() & self.df["identified_surname"].notna()
            ).sum()

        def pct(part):
            # Percentage of total rows, guarding the empty-dataset case.
            return (part / row_count * 100) if row_count > 0 else 0

        return {
            "total_rows": row_count,
            "annotated_rows": done,
            "unannotated_rows": pending,
            "annotation_percentage": pct(done),
            "complete_names": have_both,
            "completeness_percentage": pct(have_both),
        }

    def analyze_quality(self) -> Dict:
        """Compute data-quality metrics; empty dict when no data is loaded."""
        if self.df is None:
            return {}
        # Per-column missing-value counts.
        metrics = {"missing_values": self.df.isnull().sum().to_dict()}
        # Name length distribution (character counts, spaces included).
        if "name" in self.df.columns:
            lengths = self.df["name"].str.len()
            metrics["name_length"] = {
                "mean": lengths.mean(),
                "median": lengths.median(),
                "min": lengths.min(),
                "max": lengths.max(),
            }
        # Word count distribution, keyed by word count in ascending order.
        if "words" in self.df.columns:
            metrics["word_distribution"] = self.df["words"].value_counts().sort_index().to_dict()
        return metrics
+179
View File
@@ -0,0 +1,179 @@
import json
import logging
import shutil
from datetime import datetime
from typing import Optional, Dict
from core.config.config_manager import ConfigManager
from core.config.project_paths import ProjectPaths
class PipelineMonitor:
    """Monitor and manage pipeline execution

    Reads per-step pipeline_state.json files and batch CSV checkpoints from
    the checkpoints directory to report progress, and offers maintenance
    helpers to clean or reset a step's checkpoints.
    """

    def __init__(self, paths: Optional[ProjectPaths] = None):
        if paths is None:
            # Use default configuration if none provided
            config_manager = ConfigManager()
            paths = config_manager.default_paths
        self.paths = paths
        self.checkpoint_dir = paths.checkpoints_dir
        # Ordered list of the step names this monitor knows about.
        self.steps = ["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"]

    def get_step_status(self, step_name: str) -> Dict:
        """Get status of a specific pipeline step

        Derives one of: not_started, in_progress, completed,
        completed_with_errors, or error (state file unreadable) from
        <checkpoint_dir>/<step_name>/pipeline_state.json.
        """
        step_dir = self.checkpoint_dir / step_name
        state_file = step_dir / "pipeline_state.json"
        if not state_file.exists():
            # No state file: the step never ran.
            return {
                "step": step_name,
                "status": "not_started",
                "processed_batches": 0,
                "total_batches": 0,
                "failed_batches": 0,
                "completion_percentage": 0.0,
            }
        try:
            with open(state_file, "r") as f:
                state = json.load(f)
            processed = state.get("processed_batches", 0)
            total = state.get("total_batches", 0)
            failed = len(state.get("failed_batches", []))
            if total == 0:
                completion = 0.0
                status = "not_started"
            elif processed >= total:
                completion = 100.0
                status = "completed" if failed == 0 else "completed_with_errors"
            else:
                completion = (processed / total) * 100
                status = "in_progress"
            return {
                "step": step_name,
                "status": status,
                "processed_batches": processed,
                "total_batches": total,
                "failed_batches": failed,
                "completion_percentage": completion,
                "last_checkpoint": state.get("last_checkpoint"),
                "failed_batch_ids": state.get("failed_batches", []),
            }
        except Exception as e:
            logging.error(f"Error reading state for {step_name}: {e}")
            return {"step": step_name, "status": "error", "error": str(e)}

    def get_pipeline_status(self) -> Dict:
        """Get overall pipeline status

        Aggregates per-step statuses and averages their completion.
        NOTE(review): overall_status is reassigned per step in order, so a
        later step's "in_progress" can overwrite an earlier step's "error"
        — confirm this precedence is intended.
        """
        step_statuses = {}
        overall_status = "not_started"
        total_completion = 0.0
        for step in self.steps:
            status = self.get_step_status(step)
            step_statuses[step] = status
            if status["status"] == "error":
                overall_status = "error"
            elif status["status"] in ["in_progress"]:
                overall_status = "in_progress"
            elif status["status"] == "completed_with_errors":
                overall_status = "completed_with_errors"
            total_completion += status.get("completion_percentage", 0)
        avg_completion = total_completion / len(self.steps)
        if avg_completion >= 100 and overall_status not in ["error", "completed_with_errors"]:
            overall_status = "completed"
        return {
            "overall_status": overall_status,
            "overall_completion": avg_completion,
            "steps": step_statuses,
            "timestamp": datetime.now().isoformat(),
        }

    def print_status(self, detailed: bool = False):
        """Print pipeline status in a human-readable format

        Args:
            detailed: when True, also lists the IDs of failed batches.
        """
        status = self.get_pipeline_status()
        print("\n=== Pipeline Status ===")
        print(f"Overall Status: {status['overall_status'].upper()}")
        print(f"Overall Completion: {status['overall_completion']:.1f}%")
        print(f"Last Updated: {status['timestamp']}")
        print()
        for step_name, step_status in status["steps"].items():
            print(f"{step_name.replace('_', ' ').title()}:")
            print(f"  Status: {step_status['status']}")
            print(f"  Progress: {step_status['completion_percentage']:.1f}%")
            print(f"  Batches: {step_status['processed_batches']}/{step_status['total_batches']}")
            if step_status["failed_batches"] > 0:
                print(f"  Failed Batches: {step_status['failed_batches']}")
                if detailed and "failed_batch_ids" in step_status:
                    print(f"  Failed Batch IDs: {step_status['failed_batch_ids']}")
            print()

    def count_checkpoint_files(self) -> Dict:
        """Count checkpoint files for each step

        Returns per-step file counts and sizes in MB, plus a top-level
        'total_size_mb' entry covering all steps.
        """
        counts = {}
        total_size = 0
        for step in self.steps:
            step_dir = self.checkpoint_dir / step
            if step_dir.exists():
                csv_files = list(step_dir.glob("*.csv"))
                step_size = sum(f.stat().st_size for f in csv_files)
                counts[step] = {"files": len(csv_files), "size_mb": step_size / (1024 * 1024)}
                total_size += step_size
            else:
                counts[step] = {"files": 0, "size_mb": 0}
        counts["total_size_mb"] = total_size / (1024 * 1024)
        return counts

    def clean_step_checkpoints(self, step_name: str, keep_last: int = 1):
        """Clean checkpoint files for a specific step

        Deletes all but the last `keep_last` batch_*.csv files. Relies on
        lexicographic sort of the filenames — correct here presumably because
        batch files are zero-padded; verify against the step's naming scheme.
        """
        step_dir = self.checkpoint_dir / step_name
        if not step_dir.exists():
            logging.info(f"No checkpoints found for {step_name}")
            return
        csv_files = sorted(step_dir.glob("batch_*.csv"))
        if len(csv_files) <= keep_last:
            logging.info(f"Only {len(csv_files)} checkpoint files for {step_name}, keeping all")
            return
        # keep_last <= 0 deletes everything.
        files_to_delete = csv_files[:-keep_last] if keep_last > 0 else csv_files
        for file_path in files_to_delete:
            try:
                file_path.unlink()
                logging.info(f"Deleted {file_path}")
            except Exception as e:
                logging.error(f"Failed to delete {file_path}: {e}")

    def reset_step(self, step_name: str):
        """Reset a pipeline step by removing its checkpoints and state"""
        step_dir = self.checkpoint_dir / step_name
        if step_dir.exists():
            try:
                # Removes batch CSVs and pipeline_state.json in one sweep.
                shutil.rmtree(step_dir)
                logging.info(f"Reset step: {step_name}")
            except Exception as e:
                logging.error(f"Failed to reset {step_name}: {e}")
        else:
            logging.info(f"Step {step_name} has no checkpoints to reset")
+57
View File
@@ -0,0 +1,57 @@
import logging
import pandas as pd
from typing import Dict, Any
import time
from processing.batch.batch_config import BatchConfig
from processing.batch.batch_processor import BatchProcessor
from processing.steps import PipelineStep
class Pipeline:
    """Main pipeline orchestrator

    Holds an ordered list of PipelineStep instances and feeds the output
    DataFrame of each step into the next via a BatchProcessor.
    """

    def __init__(self, config: BatchConfig):
        self.config = config
        self.processor = BatchProcessor(config)
        self.steps = []

    def add_step(self, step: PipelineStep):
        """Add a processing step to the pipeline"""
        self.steps.append(step)

    def run(self, input_data: pd.DataFrame) -> pd.DataFrame:
        """Run every registered step in order and return the final DataFrame."""
        data = input_data.copy()
        for step in self.steps:
            logging.info(f"Running pipeline step: {step.name}")
            started = time.time()
            data = self.processor.process(step, data)
            elapsed_time = time.time() - started
            logging.info(f"Completed {step.name} in {elapsed_time:.2f} seconds")
            failures = step.state.failed_batches
            if failures:
                logging.warning(
                    f"Step {step.name} had {len(failures)} failed batches"
                )
        return data

    def get_progress(self) -> Dict[str, Any]:
        """Return per-step batch counts and completion percentage."""
        return {
            step.name: {
                "processed_batches": step.state.processed_batches,
                "total_batches": step.state.total_batches,
                "failed_batches": len(step.state.failed_batches),
                # max(1, ...) avoids division by zero before a run starts.
                "completion_percentage": (
                    step.state.processed_batches / max(1, step.state.total_batches)
                )
                * 100,
            }
            for step in self.steps
        }
-119
View File
@@ -1,119 +0,0 @@
import os
import argparse
import pandas as pd
from misc import DATA_DIR, REGION_MAPPING, logging
def clean(filepath) -> pd.DataFrame:
    """
    Clean the CSV file by removing null bytes, non-breaking spaces, and extra spaces.
    Also attempts to read the file with different encodings to handle potential
    encoding issues; the cleaned data is written back to *filepath* as UTF-8.

    Args:
        filepath: Path to the CSV file (must contain name/sex/region columns).

    Returns:
        The cleaned DataFrame.

    Raises:
        ValueError: if none of the attempted encodings can read the file.
            (The original raised UnicodeDecodeError with one argument, but
            that constructor requires five arguments — encoding, object,
            start, end, reason — so the raise itself crashed with TypeError.)
    """
    encodings = ['utf-8', 'utf-16', 'latin1']
    for enc in encodings:
        try:
            logging.info(f"Trying to read {filepath} with encoding: {enc}")
            # Use chunked reading to handle large files
            chunks = pd.read_csv(filepath, encoding=enc, chunksize=100_000, on_bad_lines='skip')
            cleaned_chunks = []
            for chunk in chunks:
                # Drop rows with essential missing values early
                chunk = chunk.dropna(subset=['name', 'sex', 'region'])
                # Clean string columns in-place: strip NULs and NBSPs,
                # collapse runs of spaces, trim, and lowercase.
                for col in chunk.select_dtypes(include='object').columns:
                    chunk[col] = (
                        chunk[col]
                        .astype(str)
                        .str.replace('\x00', ' ', regex=False)
                        .str.replace('\u00a0', ' ', regex=False)
                        .str.replace(' +', ' ', regex=True)
                        .str.strip()
                        .str.lower()
                    )
                cleaned_chunks.append(chunk)
            df = pd.concat(cleaned_chunks, ignore_index=True)
            # Normalize the on-disk file to UTF-8 for subsequent runs.
            df.to_csv(filepath, index=False, encoding='utf-8')
            logging.info(f"Successfully read with encoding: {enc}")
            return df
        except Exception:
            # Best-effort: try the next encoding on any read/parse failure.
            continue
    raise ValueError(f"Unable to decode {filepath} with common encodings.")
def process(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process the DataFrame to extract features and clean data.

    Adds word/length counts, probable native name and surname, a simple/compose
    category, auto-annotation for 3-word names, and a province column derived
    from REGION_MAPPING.
    """
    logging.info("Preprocessing names")
    df['words'] = df['name'].str.count(' ') + 1
    df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()
    df['year'] = df['year'].astype(int)
    # Probable split: everything but the last word vs. the last word.
    tokens = df['name'].str.split()
    df['probable_native'] = tokens.apply(lambda parts: ' '.join(parts[:-1]) if len(parts) > 1 else '')
    df['probable_surname'] = tokens.apply(lambda parts: parts[-1] if parts else '')
    df['identified_category'] = df['words'].apply(lambda n: 'compose' if n > 3 else 'simple')
    df['identified_name'] = None
    df['identified_surname'] = None
    df['annotated'] = 0
    # Three-word names follow a common Congolese pattern: the first two words
    # are the native name and the last is the surname, so annotate them now.
    mask = df['words'] == 3
    df.loc[mask, 'identified_name'] = df.loc[mask, 'probable_native']
    df.loc[mask, 'identified_surname'] = df.loc[mask, 'probable_surname']
    df.loc[mask, 'annotated'] = 1
    logging.info("Mapping regions to provinces")
    # Unknown regions fall back to the ('AUTRES', 'AUTRES') bucket.
    provinces = df['region'].map(lambda r: REGION_MAPPING.get(r, ('AUTRES', 'AUTRES'))[1])
    df['province'] = provinces.str.lower()
    return df
def save_artifacts(df: pd.DataFrame, split_eval: bool = True, split_by_sex: bool = True) -> None:
    """
    Write the processed dataset to CSV artifacts in DATA_DIR.

    Args:
        df: Processed dataset.
        split_eval: when True, hold out a fixed 20% sample (seed 42) as
            names_evaluation.csv and write the rest as names_featured.csv;
            otherwise write everything to names_featured.csv.
        split_by_sex: when True, also write names_males.csv / names_females.csv.
    """
    featured_path = os.path.join(DATA_DIR, 'names_featured.csv')
    if not split_eval:
        df.to_csv(featured_path, index=False)
    else:
        logging.info("Saving evaluation and featured datasets")
        # random_state=42 keeps the evaluation split reproducible.
        eval_idx = df.sample(frac=0.2, random_state=42).index
        df.loc[eval_idx].to_csv(os.path.join(DATA_DIR, 'names_evaluation.csv'), index=False)
        df.drop(index=eval_idx).to_csv(featured_path, index=False)
    if split_by_sex:
        logging.info("Saving by sex")
        for sex_value, filename in (('m', 'names_males.csv'), ('f', 'names_females.csv')):
            df[df['sex'] == sex_value].to_csv(os.path.join(DATA_DIR, filename), index=False)
def main(split_eval: bool = True, split_by_sex: bool = True):
    """Clean the raw names CSV, extract features, and write all CSV artifacts."""
    cleaned = clean(os.path.join(DATA_DIR, 'names.csv'))
    featured = process(cleaned)
    save_artifacts(featured, split_eval=split_eval, split_by_sex=split_by_sex)
if __name__ == '__main__':
    # CLI entry point with paired --flag / --no-flag switches for each split.
    parser = argparse.ArgumentParser(description="Prepare name datasets with optional splits.")
    toggles = [
        ('split_eval',
         "Split into evaluation and featured datasets (default: True)",
         "Do not split into evaluation and featured datasets"),
        ('split_by_sex',
         "Split by sex into male/female datasets (default: True)",
         "Do not split by sex into male/female datasets"),
    ]
    for dest, on_help, off_help in toggles:
        parser.add_argument(f'--{dest}', action='store_true', default=True, help=on_help)
        parser.add_argument(f'--no-{dest}', action='store_false', dest=dest, help=off_help)
    opts = parser.parse_args()
    main(split_eval=opts.split_eval, split_by_sex=opts.split_by_sex)
+111
View File
@@ -0,0 +1,111 @@
import json
import logging
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional
import pandas as pd
from processing.batch.batch_config import BatchConfig
from core.config.pipeline_config import PipelineConfig
@dataclass
class PipelineState:
    """Tracks the state of pipeline execution"""

    processed_batches: int = 0  # batches successfully processed so far
    total_batches: int = 0  # total batches expected for the current run
    # default_factory replaces the None-placeholder idiom: the field was
    # annotated List[int] yet defaulted to None, relying on __post_init__
    # to patch it. Each instance now gets its own fresh list.
    failed_batches: List[int] = field(default_factory=list)
    last_checkpoint: Optional[str] = None  # persisted/restored by PipelineStep save/load_state

    def __post_init__(self):
        # Preserve the old contract: an explicit failed_batches=None is
        # normalized to an empty list.
        if self.failed_batches is None:
            self.failed_batches = []
class PipelineStep(ABC):
    """Abstract base class for pipeline steps

    A step processes data batch-by-batch and persists per-batch CSV
    checkpoints plus a pipeline_state.json file under
    <checkpoints_dir>/<step name>/, enabling idempotent resume.
    Subclasses implement process_batch only.
    """

    def __init__(
        self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
    ):
        self.name = name
        self.pipeline_config = pipeline_config
        # Use provided batch_config or create default from pipeline config
        if batch_config is None:
            batch_config = BatchConfig(
                batch_size=pipeline_config.processing.batch_size,
                max_workers=pipeline_config.processing.max_workers,
                checkpoint_interval=pipeline_config.processing.checkpoint_interval,
                use_multiprocessing=pipeline_config.processing.use_multiprocessing,
            )
        self.batch_config = batch_config
        self.state = PipelineState()

    @abstractmethod
    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process a single batch of data"""
        pass

    def get_checkpoint_path(self, batch_id: int) -> str:
        """Get the checkpoint file path for a batch

        Zero-pads the batch id (batch_000001.csv) so lexicographic file
        ordering matches numeric batch order. Creates the step's checkpoint
        directory as a side effect.
        """
        checkpoint_dir = self.pipeline_config.paths.checkpoints_dir / self.name
        checkpoint_dir.mkdir(parents=True, exist_ok=True)
        return str(checkpoint_dir / f"batch_{batch_id:06d}.csv")

    def get_state_path(self) -> str:
        """Get the state file path (creates the directory as a side effect)."""
        state_dir = self.pipeline_config.paths.checkpoints_dir / self.name
        state_dir.mkdir(parents=True, exist_ok=True)
        return str(state_dir / "pipeline_state.json")

    def save_state(self):
        """Save pipeline state to disk as JSON."""
        state_file = self.get_state_path()
        with open(state_file, "w") as f:
            json.dump(
                {
                    "processed_batches": self.state.processed_batches,
                    "total_batches": self.state.total_batches,
                    "failed_batches": self.state.failed_batches,
                    "last_checkpoint": self.state.last_checkpoint,
                },
                f,
            )

    def load_state(self) -> bool:
        """Load pipeline state from disk. Returns True if state was loaded."""
        state_file = self.get_state_path()
        if os.path.exists(state_file):
            try:
                with open(state_file, "r") as f:
                    state_data = json.load(f)
                self.state.processed_batches = state_data.get("processed_batches", 0)
                self.state.total_batches = state_data.get("total_batches", 0)
                self.state.failed_batches = state_data.get("failed_batches", [])
                self.state.last_checkpoint = state_data.get("last_checkpoint")
                return True
            except Exception as e:
                # A corrupt state file is not fatal: report no usable state.
                logging.warning(f"Failed to load state: {e}")
        return False

    def batch_exists(self, batch_id: int) -> bool:
        """Check if a batch has already been processed (idempotency)"""
        checkpoint_path = self.get_checkpoint_path(batch_id)
        return os.path.exists(checkpoint_path)

    def save_batch(self, batch: pd.DataFrame, batch_id: int):
        """Save processed batch to checkpoint"""
        checkpoint_path = self.get_checkpoint_path(batch_id)
        batch.to_csv(checkpoint_path, index=False)
        logging.info(f"Saved batch {batch_id} to {checkpoint_path}")

    def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
        """Load processed batch from checkpoint

        Returns None when no checkpoint exists. NOTE(review): CSV
        round-tripping may not preserve dtypes (e.g. Int8/Int64) exactly —
        confirm downstream steps tolerate that.
        """
        checkpoint_path = self.get_checkpoint_path(batch_id)
        if os.path.exists(checkpoint_path):
            return pd.read_csv(checkpoint_path)
        return None
+28
View File
@@ -0,0 +1,28 @@
import logging
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from core.utils.text_cleaner import TextCleaner
from processing.steps import PipelineStep
class DataCleaningStep(PipelineStep):
    """Configuration-driven data cleaning step

    Drops rows missing essential columns, then normalizes all text columns
    through the shared TextCleaner.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("data_cleaning", pipeline_config)
        self.text_cleaner = TextCleaner()
        # Rows missing any of these fields are unusable downstream.
        self.required_columns = ["name", "sex", "region"]

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Drop rows with missing required fields, then clean text columns."""
        logging.info(f"Cleaning batch {batch_id} with {len(batch)} rows")
        without_missing = batch.dropna(subset=self.required_columns)
        return self.text_cleaner.clean_dataframe_text_columns(without_missing)
+60
View File
@@ -0,0 +1,60 @@
import numpy as np
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from processing.steps.feature_extraction_step import Gender
from core.utils.data_loader import DataLoader
from processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep
class DataSplittingStep(PipelineStep):
    """Configuration-driven data splitting step

    Pass-through during batch processing; the actual evaluation/gender
    splits are written by save_splits on the full DataFrame.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        batch_config = BatchConfig(
            batch_size=pipeline_config.processing.batch_size,
            max_workers=1,  # No need for parallelism in splitting
            checkpoint_interval=pipeline_config.processing.checkpoint_interval,
            use_multiprocessing=False,
        )
        super().__init__("data_splitting", pipeline_config, batch_config)
        self.data_loader = DataLoader(pipeline_config)
        # Cached set of row positions chosen for the evaluation split.
        self.eval_indices = None

    def determine_eval_indices(self, total_size: int) -> set:
        """Determine evaluation indices consistently across batches

        Seeds numpy's global RNG with the configured seed so repeated runs
        select the same rows; the result is computed once and cached for the
        lifetime of this step instance.
        """
        if self.eval_indices is None:
            np.random.seed(self.pipeline_config.data.random_seed)
            eval_size = int(total_size * self.pipeline_config.data.evaluation_fraction)
            self.eval_indices = set(np.random.choice(total_size, size=eval_size, replace=False))
        return self.eval_indices

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process batch for data splitting - no modification needed"""
        return batch.copy()

    def save_splits(self, df: pd.DataFrame) -> None:
        """Save the split datasets based on configuration

        NOTE(review): determine_eval_indices draws POSITIONS in
        range(len(df)), but the mask below matches them against df.index
        LABELS. If df's index is not a clean 0..n-1 RangeIndex (e.g. after
        upstream row drops), the evaluation fraction will be off — confirm
        the index is reset before this step runs.
        """
        output_files = self.pipeline_config.data.output_files
        data_dir = self.pipeline_config.paths.data_dir
        if self.pipeline_config.data.split_evaluation:
            eval_indices = self.determine_eval_indices(len(df))
            eval_mask = df.index.isin(eval_indices)
            df_evaluation = df[eval_mask]
            df_featured = df[~eval_mask]
            self.data_loader.save_csv(df_evaluation, data_dir / output_files["evaluation"])
            self.data_loader.save_csv(df_featured, data_dir / output_files["featured"])
        else:
            self.data_loader.save_csv(df, data_dir / output_files["featured"])
        if self.pipeline_config.data.split_by_gender:
            # 'sex' values are the Gender enum values ("m"/"f").
            df_males = df[df["sex"] == Gender.MALE.value]
            df_females = df[df["sex"] == Gender.FEMALE.value]
            self.data_loader.save_csv(df_males, data_dir / output_files["males"])
            self.data_loader.save_csv(df_females, data_dir / output_files["females"])
@@ -0,0 +1,99 @@
import logging
from enum import Enum
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from core.utils.region_mapper import RegionMapper
from processing.steps import PipelineStep
class Gender(Enum):
    """Normalized gender codes used throughout the dataset."""

    MALE = "m"
    FEMALE = "f"


class NameCategory(Enum):
    """Name complexity bucket: 'simple' (3 words or fewer) or 'compose' (more)."""

    SIMPLE = "simple"
    COMPOSE = "compose"
class FeatureExtractionStep(PipelineStep):
    """Configuration-driven feature extraction step

    Derives per-name features (word count, letter count, probable native
    name/surname, category), auto-annotates 3-word names, maps regions to
    provinces, and normalizes gender codes.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("feature_extraction", pipeline_config)
        self.region_mapper = RegionMapper()

    @classmethod
    def validate_gender(cls, gender: str) -> Gender:
        """Validate and normalize gender value

        Accepts common English and French spellings, case-insensitively.

        Raises:
            ValueError: for any unrecognized value; this propagates out of
                process_batch and fails the whole batch.
        """
        gender_lower = gender.lower().strip()
        if gender_lower in ["m", "male", "homme", "masculin"]:
            return Gender.MALE
        elif gender_lower in ["f", "female", "femme", "féminin"]:
            return Gender.FEMALE
        else:
            raise ValueError(f"Unknown gender: {gender}")

    @classmethod
    def get_name_category(cls, word_count: int) -> NameCategory:
        """Determine name category based on word count: <= 3 words is SIMPLE,
        more is COMPOSE."""
        if word_count <= 3:
            return NameCategory.SIMPLE
        else:
            return NameCategory.COMPOSE

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Extract features from names in batch

        Works on a copy, so the caller's batch is not mutated.
        """
        logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")
        batch = batch.copy()
        # Basic features: word count and letter count (spaces excluded).
        batch["words"] = batch["name"].str.count(" ") + 1
        batch["length"] = batch["name"].str.replace(" ", "", regex=False).str.len()
        # Handle year column: coerce invalid values to <NA> instead of raising.
        if "year" in batch.columns:
            batch["year"] = pd.to_numeric(batch["year"], errors="coerce").astype("Int64")
        # Initialize new columns
        batch["probable_native"] = None
        batch["probable_surname"] = None
        batch["identified_name"] = None
        batch["identified_surname"] = None
        batch["annotated"] = 0
        # Vectorized category assignment
        batch["identified_category"] = batch["words"].apply(
            lambda x: self.get_name_category(x).value
        )
        # Assign probable_native and probable_surname for all names:
        # everything but the last word vs. the last word; single-word names
        # get None for both.
        name_splits = batch["name"].str.split()
        batch["probable_native"] = name_splits.apply(
            lambda x: " ".join(x[:-1]) if isinstance(x, list) and len(x) >= 2 else None
        )
        batch["probable_surname"] = name_splits.apply(
            lambda x: x[-1] if isinstance(x, list) and len(x) >= 2 else None
        )
        # Auto-assign for 3-word names: first two words are treated as the
        # native name and the last as the surname; the row is marked
        # annotated so the LLM annotation step skips it.
        three_word_mask = batch["words"] == 3
        batch.loc[three_word_mask, "identified_name"] = batch.loc[
            three_word_mask, "probable_native"
        ]
        batch.loc[three_word_mask, "identified_surname"] = batch.loc[
            three_word_mask, "probable_surname"
        ]
        batch.loc[three_word_mask, "annotated"] = 1
        # Map regions to provinces
        batch["province"] = self.region_mapper.map_regions_vectorized(batch["region"])
        # Normalize gender. NOTE(review): validate_gender raises on unknown
        # values, which aborts this entire batch — confirm that is intended.
        if "sex" in batch.columns:
            batch["sex"] = batch["sex"].apply(lambda x: self.validate_gender(str(x)).value)
        return batch
+168
View File
@@ -0,0 +1,168 @@
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, Optional
import ollama
import pandas as pd
from pydantic import ValidationError, BaseModel
from core.config.pipeline_config import PipelineConfig
from core.utils.prompt_manager import PromptManager
from core.utils.rate_limiter import RateLimiter
from core.utils.rate_limiter import RateLimitConfig
from processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep
class NameAnnotation(BaseModel):
    """Model for name annotation results"""

    # Optional[...] without a default: the fields are required in the JSON
    # payload but may be explicitly null when the model cannot identify them.
    identified_name: Optional[str]
    identified_surname: Optional[str]
class LLMAnnotationStep(PipelineStep):
    """Configuration-driven LLM annotation step

    Sends each unannotated name to an Ollama model and writes the structured
    result (identified_name / identified_surname) back into the batch, with
    retries, optional rate limiting, and optional thread-level concurrency.
    """

    def __init__(self, pipeline_config: PipelineConfig):
        # Create custom batch config for LLM processing: never exceed the
        # configured LLM concurrency even if the pipeline allows more workers.
        batch_config = BatchConfig(
            batch_size=pipeline_config.processing.batch_size,
            max_workers=min(
                pipeline_config.llm.max_concurrent_requests, pipeline_config.processing.max_workers
            ),
            checkpoint_interval=pipeline_config.processing.checkpoint_interval,
            use_multiprocessing=pipeline_config.processing.use_multiprocessing,
        )
        super().__init__("llm_annotation", pipeline_config, batch_config)
        self.prompt = PromptManager(pipeline_config).load_prompt()
        self.rate_limiter = (
            self._create_rate_limiter() if pipeline_config.llm.enable_rate_limiting else None
        )
        # Statistics. NOTE(review): these counters are incremented from worker
        # threads without a lock — fine for rough stats, not exact counts.
        self.successful_requests = 0
        self.failed_requests = 0
        self.total_retry_attempts = 0
        # Reduce noise from the HTTP client used internally by Ollama.
        logging.getLogger("httpx").setLevel(logging.WARNING)

    def _create_rate_limiter(self):
        """Create rate limiter based on configuration"""
        rate_config = RateLimitConfig(
            requests_per_minute=self.pipeline_config.llm.requests_per_minute,
            requests_per_second=self.pipeline_config.llm.requests_per_second,
        )
        return RateLimiter(rate_config)

    def analyze_name_with_retry(self, client: ollama.Client, name: str, row_id: int) -> Dict:
        """Analyze a name with retry logic and rate limiting

        Args:
            client: Shared Ollama client.
            name: Raw name string to split.
            row_id: DataFrame index of the row (currently unused; kept for
                call-site symmetry).

        Returns:
            Dict with the identified fields, annotated=1, timing and attempt
            count on success; a failure record (annotated=0, failed=True)
            after all retries are exhausted.
        """
        for attempt in range(self.pipeline_config.llm.retry_attempts):
            try:
                # Apply rate limiting if enabled
                if self.rate_limiter:
                    self.rate_limiter.wait_if_needed()
                start_time = time.time()
                response = client.chat(
                    model=self.pipeline_config.llm.model_name,
                    messages=[
                        {"role": "system", "content": self.prompt},
                        {"role": "user", "content": name},
                    ],
                    # Constrain output to the NameAnnotation JSON schema.
                    format=NameAnnotation.model_json_schema(),
                )
                elapsed_time = time.time() - start_time
                # NOTE(review): this is a post-hoc timeout — the request has
                # already completed; it only turns a slow success into a
                # retryable failure rather than cancelling the request.
                if elapsed_time > self.pipeline_config.llm.timeout_seconds:
                    raise TimeoutError(
                        f"Request took {elapsed_time:.2f}s, exceeding {self.pipeline_config.llm.timeout_seconds}s timeout"
                    )
                annotation = NameAnnotation.model_validate_json(response.message.content)
                result = {
                    **annotation.model_dump(),
                    "annotated": 1,
                    "processing_time": elapsed_time,
                    "attempts": attempt + 1,
                }
                self.successful_requests += 1
                if attempt > 0:
                    self.total_retry_attempts += attempt
                return result
            # The original caught (ValidationError, TimeoutError, Exception),
            # a redundant tuple — Exception already subsumes the other two.
            # Behavior is unchanged: any failure triggers a retry.
            except Exception as e:
                logging.warning(
                    f"Error analyzing '{name}' (attempt {attempt + 1}/{self.pipeline_config.llm.retry_attempts}): {e}"
                )
                # Exponential backoff with jitter, capped at 10 seconds.
                if attempt < self.pipeline_config.llm.retry_attempts - 1:
                    wait_time = (2**attempt) + (time.time() % 1)
                    time.sleep(min(wait_time, 10))
        self.failed_requests += 1
        return {
            "identified_name": None,
            "identified_surname": None,
            "annotated": 0,
            "processing_time": 0,
            "attempts": self.pipeline_config.llm.retry_attempts,
            "failed": True,
        }

    def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
        """Process batch with LLM annotation

        Only rows with annotated == 0 are sent to the model; results are
        written back into a copy of the batch.
        """
        # Guard against a missing 'annotated' column: the original
        # `batch.get("annotated", 0) == 0` returned a scalar True in that
        # case and `batch[True]` raised KeyError. Treat all rows as
        # unannotated instead.
        if "annotated" in batch.columns:
            unannotated_entries = batch[batch["annotated"] == 0]
        else:
            unannotated_entries = batch
        if unannotated_entries.empty:
            logging.info(f"Batch {batch_id}: No entries to annotate")
            return batch
        logging.info(f"Batch {batch_id}: Annotating {len(unannotated_entries)} entries")
        batch = batch.copy()
        client = ollama.Client()
        # Process with controlled concurrency
        max_workers = self.pipeline_config.llm.max_concurrent_requests
        if len(unannotated_entries) == 1 or max_workers == 1:
            # Sequential processing
            for idx, row in unannotated_entries.iterrows():
                result = self.analyze_name_with_retry(client, row["name"], idx)
                # 'failed' is an internal marker, not a DataFrame column.
                for field, value in result.items():
                    if field not in ["failed"]:
                        batch.loc[idx, field] = value
        else:
            # Concurrent processing
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                future_to_idx = {}
                for idx, row in unannotated_entries.iterrows():
                    future = executor.submit(self.analyze_name_with_retry, client, row["name"], idx)
                    future_to_idx[future] = idx
                for future in as_completed(future_to_idx):
                    idx = future_to_idx[future]
                    try:
                        result = future.result()
                        for field, value in result.items():
                            if field not in ["failed"]:
                                batch.loc[idx, field] = value
                    except Exception as e:
                        logging.error(f"Failed to process row {idx}: {e}")
                        batch.loc[idx, "annotated"] = 0
        # Ensure proper data types
        batch["annotated"] = (
            pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
        )
        return batch
-26
View File
@@ -1,26 +0,0 @@
import ollama
from pydantic import BaseModel
from misc import load_prompt
class NameAnalysis(BaseModel):
    """Structured response schema for the LLM: the name/surname split."""

    identified_name: str | None
    identified_surname: str | None


# Interactive one-shot: read a name from stdin, ask the model to split it
# into name and surname using the shared prompt, and print the result.
user_name = input("Enter name: ")
ollama_client = ollama.Client()
chat_response = ollama_client.chat(
    model="mistral:7b",
    messages=[
        {"role": "system", "content": load_prompt()},
        {"role": "user", "content": user_name},
    ],
    format=NameAnalysis.model_json_schema(),
)
parsed = NameAnalysis.model_validate_json(chat_response.message.content)
print(parsed.model_dump())