feat: enhance logging and memory management across modules
This commit is contained in:
@@ -9,6 +9,7 @@ import pandas as pd
|
||||
from pydantic import BaseModel
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
|
||||
|
||||
@@ -37,10 +38,11 @@ class PipelineStep(ABC):
|
||||
"""Abstract base class for pipeline steps"""
|
||||
|
||||
def __init__(
|
||||
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
|
||||
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
|
||||
):
|
||||
self.name = name
|
||||
self.pipeline_config = pipeline_config
|
||||
self.data_loader = DataLoader(pipeline_config)
|
||||
|
||||
# Use provided batch_config or create default from pipeline config
|
||||
if batch_config is None:
|
||||
@@ -53,6 +55,11 @@ class PipelineStep(ABC):
|
||||
self.batch_config = batch_config
|
||||
self.state = PipelineState()
|
||||
|
||||
@property
|
||||
def requires_batch_mutation(self) -> bool:
    """Indicate whether this step modifies the batch data.

    Returns:
        ``False`` in this base implementation.
    """
    return False
|
||||
|
||||
@abstractmethod
|
||||
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
|
||||
"""Process a single batch of data"""
|
||||
@@ -108,12 +115,12 @@ class PipelineStep(ABC):
|
||||
def save_batch(self, batch: pd.DataFrame, batch_id: int):
    """Save a processed batch to its checkpoint file.

    Args:
        batch: Processed rows to persist.
        batch_id: Identifier passed to ``get_checkpoint_path`` to resolve
            the destination.
    """
    checkpoint_path = self.get_checkpoint_path(batch_id)
    # Write exactly once, through the step's DataLoader, so the checkpoint
    # is produced by the same component that ``load_batch`` reads it with.
    self.data_loader.save_csv(batch, checkpoint_path)
    logging.info(f"Saved batch {batch_id} to {checkpoint_path}")
|
||||
|
||||
def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
    """Load a processed batch from its checkpoint file.

    Args:
        batch_id: Identifier passed to ``get_checkpoint_path`` to resolve
            the source file.

    Returns:
        The checkpointed frame, or ``None`` when no checkpoint file exists.
    """
    checkpoint_path = self.get_checkpoint_path(batch_id)
    if not os.path.exists(checkpoint_path):
        return None
    # Read through the step's DataLoader, mirroring ``save_batch``.
    return self.data_loader.load_csv_complete(checkpoint_path)
|
||||
|
||||
@@ -2,11 +2,10 @@ import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from processing.steps.feature_extraction_step import Gender
|
||||
from core.utils.data_loader import DataLoader
|
||||
|
||||
from core.utils.region_mapper import RegionMapper
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
from processing.steps import PipelineStep
|
||||
from processing.steps.feature_extraction_step import Gender
|
||||
|
||||
|
||||
class DataSplittingStep(PipelineStep):
|
||||
@@ -20,7 +19,6 @@ class DataSplittingStep(PipelineStep):
|
||||
use_multiprocessing=False,
|
||||
)
|
||||
super().__init__("data_splitting", pipeline_config, batch_config)
|
||||
self.data_loader = DataLoader(pipeline_config)
|
||||
self.eval_indices = None
|
||||
|
||||
def determine_eval_indices(self, total_size: int) -> set:
|
||||
@@ -33,9 +31,9 @@ class DataSplittingStep(PipelineStep):
|
||||
|
||||
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
    """Process a batch for data splitting; no modification is needed.

    Args:
        batch: Rows belonging to this batch.
        batch_id: Identifier of the batch (unused here).

    Returns:
        The very same ``batch`` object. Nothing is changed, so no copy is
        made.
    """
    return batch
|
||||
|
||||
def save_splits(self, df: pd.DataFrame) -> None:
|
||||
def split(self, df: pd.DataFrame) -> None:
|
||||
"""Save the split datasets based on configuration"""
|
||||
output_files = self.pipeline_config.data.output_files
|
||||
data_dir = self.pipeline_config.paths.data_dir
|
||||
@@ -52,9 +50,14 @@ class DataSplittingStep(PipelineStep):
|
||||
else:
|
||||
self.data_loader.save_csv(df, data_dir / output_files["featured"])
|
||||
|
||||
if self.pipeline_config.data.split_by_province:
|
||||
for province in RegionMapper.get_provinces():
|
||||
df_region = df[df.province == province]
|
||||
self.data_loader.save_csv(df_region, data_dir / "provinces" / f"{province}.csv")
|
||||
|
||||
if self.pipeline_config.data.split_by_gender:
|
||||
df_males = df[df["sex"] == Gender.MALE.value]
|
||||
df_females = df[df["sex"] == Gender.FEMALE.value]
|
||||
df_males = df[df.sex == Gender.MALE.value]
|
||||
df_females = df[df.sex == Gender.FEMALE.value]
|
||||
|
||||
self.data_loader.save_csv(df_males, data_dir / output_files["males"])
|
||||
self.data_loader.save_csv(df_females, data_dir / output_files["females"])
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import gc
|
||||
import logging
|
||||
from enum import Enum
|
||||
from typing import Dict, Any
|
||||
|
||||
import pandas as pd
|
||||
|
||||
@@ -27,10 +29,15 @@ class FeatureExtractionStep(PipelineStep):
|
||||
self.region_mapper = RegionMapper()
|
||||
self.name_tagger = NERNameTagger()
|
||||
|
||||
@property
def requires_batch_mutation(self) -> bool:
    """This step creates new columns, so mutation is required.

    Declared as a property to match ``PipelineStep.requires_batch_mutation``:
    attribute access then yields a real ``bool`` rather than a bound method,
    which would be truthy regardless of the value it returns.
    """
    return True
|
||||
|
||||
@classmethod
|
||||
def validate_gender(cls, gender: str) -> Gender:
|
||||
"""Validate and normalize gender value"""
|
||||
gender_lower = gender.lower().strip()
|
||||
gender_lower = str(gender).lower().strip()
|
||||
if gender_lower in ["m", "male", "homme", "masculin"]:
|
||||
return Gender.MALE
|
||||
elif gender_lower in ["f", "female", "femme", "féminin"]:
|
||||
@@ -41,68 +48,144 @@ class FeatureExtractionStep(PipelineStep):
|
||||
@classmethod
|
||||
def get_name_category(cls, word_count: int) -> NameCategory:
    """Determine the name category from the number of words.

    Args:
        word_count: Number of words in the name.

    Returns:
        ``NameCategory.SIMPLE`` for exactly three words, otherwise
        ``NameCategory.COMPOSE``.
    """
    return NameCategory.SIMPLE if word_count == 3 else NameCategory.COMPOSE
|
||||
|
||||
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
    """Extract features from the names in a batch.

    Works on a copy of ``batch``; the caller's frame is left untouched.

    Args:
        batch: Input rows; must contain a ``name`` column. The ``year``,
            ``region`` and ``sex`` columns are processed when present.
        batch_id: Identifier of the batch, used for logging and to decide
            when to trigger garbage collection.

    Returns:
        A new frame with the derived feature columns added.
    """
    logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")

    result = batch.copy()
    numeric_features = self._compute_numeric_features(result["name"])
    result = result.assign(**numeric_features)

    # Initialize features columns with optimal dtypes
    features_columns = self._initialize_features_columns(len(result))
    result = result.assign(**features_columns)

    # These helpers fill the freshly created columns in place.
    self._assign_probable_names(result)
    self._process_simple_names(result)
    result["identified_category"] = self._assign_identified_category(result["words"])

    if "year" in result.columns:
        # Unparseable years become <NA> instead of raising.
        result["year"] = pd.to_numeric(result["year"], errors="coerce").astype("Int16")

    if "region" in result.columns:
        result["province"] = self.region_mapper.map(result["region"])
        result["province"] = result["province"].astype("category")

    if "sex" in result.columns:
        result["sex"] = self._normalize_gender(result["sex"])

    # Apply final dtype optimizations
    result = self._optimize_dtypes(result)

    # Cleanup: drop the intermediate dicts, and collect every tenth batch.
    del numeric_features, features_columns
    if batch_id % 10 == 0:  # Periodic cleanup
        gc.collect()

    return result
|
||||
|
||||
@classmethod
|
||||
def _compute_numeric_features(cls, series: pd.Series) -> Dict[str, pd.Series]:
|
||||
"""Calculate basic features in vectorized manner"""
|
||||
return {
|
||||
"words": (series.str.count(" ") + 1).astype("Int8"),
|
||||
"length": series.str.len().astype("Int16"),
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def _initialize_features_columns(cls, size: int) -> Dict[str, Any]:
|
||||
"""Initialize new columns with optimal dtypes"""
|
||||
return {
|
||||
"probable_native": pd.Series([None] * size, dtype="string"),
|
||||
"probable_surname": pd.Series([None] * size, dtype="string"),
|
||||
"identified_name": pd.Series([None] * size, dtype="string"),
|
||||
"identified_surname": pd.Series([None] * size, dtype="string"),
|
||||
"ner_entities": pd.Series([None] * size, dtype="string"),
|
||||
"ner_tagged": pd.Series([0] * size, dtype="Int8"),
|
||||
"annotated": pd.Series([0] * size, dtype="Int8"),
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def _assign_probable_names(cls, df: pd.DataFrame) -> None:
|
||||
"""Assign probable native and surname names efficiently"""
|
||||
|
||||
name_splits = df["name"].str.split()
|
||||
mask = name_splits.str.len() >= 2
|
||||
|
||||
df.loc[mask, "probable_native"] = name_splits[mask].apply(
|
||||
lambda x: " ".join(x[:-1]) if isinstance(x, list) else None
|
||||
)
|
||||
df.loc[mask, "probable_surname"] = name_splits[mask].apply(
|
||||
lambda x: x[-1] if isinstance(x, list) else None
|
||||
)
|
||||
|
||||
# Assign probable_native and probable_surname for all names
|
||||
name_splits = batch["name"].str.split()
|
||||
batch["probable_native"] = name_splits.apply(
|
||||
lambda x: " ".join(x[:-1]) if isinstance(x, list) and len(x) >= 2 else None
|
||||
)
|
||||
batch["probable_surname"] = name_splits.apply(
|
||||
lambda x: x[-1] if isinstance(x, list) and len(x) >= 2 else None
|
||||
)
|
||||
def _assign_identified_category(self, series: pd.Series) -> pd.Series:
    """Assign the identified category based on word count.

    Args:
        series: Word counts; may contain missing values.

    Returns:
        A ``category`` series holding the ``.value`` of the category that
        ``get_name_category`` returns for each count. Missing counts stay
        missing.
    """

    def category_value(word_count):
        # A missing count cannot be compared (``pd.NA == 3`` is ambiguous
        # in a boolean context), so pass it through as missing.
        if pd.isna(word_count):
            return None
        return self.get_name_category(word_count).value

    return series.map(category_value).astype("category")
|
||||
def _process_simple_names(self, df: pd.DataFrame) -> None:
|
||||
"""Process 3-word names efficiently with vectorized operations"""
|
||||
mask = df["words"] == 3
|
||||
|
||||
# Tag names with NER entities
|
||||
three_word_rows = batch[three_word_mask]
|
||||
if not mask.any():
|
||||
return
|
||||
|
||||
df.loc[mask, "identified_name"] = df.loc[mask, "probable_native"]
|
||||
df.loc[mask, "identified_surname"] = df.loc[mask, "probable_surname"]
|
||||
df.loc[mask, "annotated"] = 1
|
||||
|
||||
# NER tagging for 3-word names
|
||||
three_word_rows = df[mask]
|
||||
for idx, row in three_word_rows.iterrows():
|
||||
entity = self.name_tagger.tag_name(row['name'], row['identified_name'], row['identified_surname'])
|
||||
try:
|
||||
entity = self.name_tagger.tag_name(
|
||||
row["name"], row["identified_name"], row["identified_surname"]
|
||||
)
|
||||
|
||||
if entity:
|
||||
batch.at[idx, "ner_entities"] = entity["entities"]
|
||||
batch.at[idx, "ner_tagged"] = 1
|
||||
if entity:
|
||||
df.at[idx, "ner_entities"] = str(entity["entities"])
|
||||
df.at[idx, "ner_tagged"] = 1
|
||||
except Exception as e:
|
||||
logging.warning(f"NER tagging failed for row {idx}: {e}")
|
||||
|
||||
# Map regions to provinces
|
||||
batch["province"] = self.region_mapper.map_regions_vectorized(batch["region"])
|
||||
def _normalize_gender(self, series: pd.Series) -> pd.Series:
|
||||
gender_mapping = {
|
||||
"m": "m",
|
||||
"male": "m",
|
||||
"homme": "m",
|
||||
"masculin": "m",
|
||||
"f": "f",
|
||||
"female": "f",
|
||||
"femme": "f",
|
||||
"féminin": "f",
|
||||
}
|
||||
|
||||
# Normalize gender
|
||||
if "sex" in batch.columns:
|
||||
batch["sex"] = batch["sex"].apply(lambda x: self.validate_gender(str(x)).value)
|
||||
# Apply mapping with error handling
|
||||
normalized = series.astype(str).str.lower().str.strip().map(gender_mapping)
|
||||
return normalized.astype("category")
|
||||
|
||||
return batch
|
||||
@classmethod
|
||||
def _optimize_dtypes(cls, df: pd.DataFrame) -> pd.DataFrame:
|
||||
categories = ["province", "identified_category", "sex"]
|
||||
|
||||
for col in categories:
|
||||
if col in df.columns and df[col].dtype != "category":
|
||||
df[col] = df[col].astype("category")
|
||||
|
||||
# Ensure string columns are proper string dtype
|
||||
string_cols = [
|
||||
"name",
|
||||
"probable_native",
|
||||
"probable_surname",
|
||||
"identified_name",
|
||||
"identified_surname",
|
||||
"ner_entities",
|
||||
]
|
||||
|
||||
for col in string_cols:
|
||||
if col in df.columns and df[col].dtype == "object":
|
||||
df[col] = df[col].astype("string")
|
||||
|
||||
return df
|
||||
|
||||
@@ -24,8 +24,7 @@ class LLMAnnotationStep(PipelineStep):
|
||||
batch_config = BatchConfig(
|
||||
batch_size=pipeline_config.processing.batch_size,
|
||||
max_workers=min(
|
||||
self.llm_config.max_concurrent_requests,
|
||||
pipeline_config.processing.max_workers
|
||||
self.llm_config.max_concurrent_requests, pipeline_config.processing.max_workers
|
||||
),
|
||||
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
|
||||
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
|
||||
@@ -98,7 +97,7 @@ class LLMAnnotationStep(PipelineStep):
|
||||
|
||||
# Exponential backoff with jitter
|
||||
if attempt < self.llm_config.retry_attempts - 1:
|
||||
wait_time = (2 ** attempt) + (time.time() % 1)
|
||||
wait_time = (2**attempt) + (time.time() % 1)
|
||||
time.sleep(min(wait_time, 10))
|
||||
|
||||
self.failed_requests += 1
|
||||
@@ -156,6 +155,8 @@ class LLMAnnotationStep(PipelineStep):
|
||||
batch.loc[idx, "annotated"] = 0
|
||||
|
||||
# Ensure proper data types
|
||||
batch["annotated"] = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
|
||||
batch["annotated"] = (
|
||||
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
|
||||
)
|
||||
|
||||
return batch
|
||||
|
||||
@@ -6,8 +6,8 @@ from typing import Dict
|
||||
import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from processing.steps import PipelineStep, NameAnnotation
|
||||
from processing.ner.ner_name_model import NERNameModel
|
||||
from processing.steps import PipelineStep, NameAnnotation
|
||||
|
||||
|
||||
class NERAnnotationStep(PipelineStep):
|
||||
@@ -63,7 +63,7 @@ class NERAnnotationStep(PipelineStep):
|
||||
|
||||
# Get NER predictions
|
||||
prediction = self.ner_trainer.predict(name.lower())
|
||||
entities = prediction.get('entities', [])
|
||||
entities = prediction.get("entities", [])
|
||||
|
||||
elapsed_time = time.time() - start_time
|
||||
|
||||
@@ -72,15 +72,15 @@ class NERAnnotationStep(PipelineStep):
|
||||
surname_parts = []
|
||||
|
||||
for entity in entities:
|
||||
if entity['label'] == 'NATIVE':
|
||||
native_parts.append(entity['text'])
|
||||
elif entity['label'] == 'SURNAME':
|
||||
surname_parts.append(entity['text'])
|
||||
if entity["label"] == "NATIVE":
|
||||
native_parts.append(entity["text"])
|
||||
elif entity["label"] == "SURNAME":
|
||||
surname_parts.append(entity["text"])
|
||||
|
||||
# Create annotation result in same format as LLM step
|
||||
annotation = NameAnnotation(
|
||||
identified_name=" ".join(native_parts) if native_parts else None,
|
||||
identified_surname=" ".join(surname_parts) if surname_parts else None
|
||||
identified_surname=" ".join(surname_parts) if surname_parts else None,
|
||||
)
|
||||
|
||||
result = {
|
||||
@@ -159,6 +159,8 @@ class NERAnnotationStep(PipelineStep):
|
||||
batch.loc[idx, "annotated"] = 0
|
||||
|
||||
# Ensure proper data types
|
||||
batch["annotated"] = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
|
||||
batch["annotated"] = (
|
||||
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
|
||||
)
|
||||
|
||||
return batch
|
||||
|
||||
Reference in New Issue
Block a user