# Listing metadata (file-viewer residue): 169 lines, 6.1 KiB, Python
import gc
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Optional, Union, Iterator, Dict
|
|
|
|
import pandas as pd
|
|
|
|
from core.config.pipeline_config import PipelineConfig
|
|
|
|
# Column name -> dtype string, grouped by the kind of storage requested.
# Insertion order matches the original listing: integers, categoricals, strings.
OPTIMIZED_DTYPES = {
    # Nullable integer columns, sized to the smallest width expected to fit.
    **{
        "year": "Int16",  # years fit in a 16-bit integer
        "words": "Int8",  # word counts typically < 128 (Int8 tops out at 127)
        "length": "Int16",  # name lengths fit in 16 bits
        "annotated": "Int8",  # binary flag (0/1)
        "ner_tagged": "Int8",  # binary flag (0/1)
    },
    # Categorical columns: memory efficient when values repeat.
    **dict.fromkeys(
        (
            "sex",
            "province",
            "region",
            "identified_category",
            "transformation_type",
        ),
        "category",
    ),
    # Text columns stored with the dedicated string dtype.
    **dict.fromkeys(
        (
            "name",
            "probable_native",
            "probable_surname",
            "identified_name",
            "identified_surname",
            "ner_entities",
        ),
        "string",
    ),
}
|
|
|
|
|
|
class DataLoader:
    """Reusable CSV loading, size-limiting and saving utilities.

    Attributes:
        config: Pipeline configuration. This class reads
            ``config.processing.chunk_size``,
            ``config.processing.encoding_options``,
            ``config.data.max_dataset_size``, ``config.data.balance_by_sex``
            and ``config.data.random_seed``.
        dtypes: Column-name -> dtype mapping passed to ``pandas.read_csv``;
            ``OPTIMIZED_DTYPES`` overlaid with any caller-supplied overrides.
    """

    # The annotation is quoted so defining the class does not require the
    # project-level PipelineConfig import to be resolvable.
    def __init__(self, config: "PipelineConfig", custom_dtypes: Optional[Dict] = None):
        """Store the configuration and build the effective dtype mapping.

        Args:
            config: Pipeline configuration object.
            custom_dtypes: Optional per-column dtype overrides; entries here
                win over ``OPTIMIZED_DTYPES`` for the same column name.
        """
        self.config = config
        self.dtypes = {**OPTIMIZED_DTYPES, **(custom_dtypes or {})}

    @staticmethod
    def _check_decodable(filepath: Path, encoding: str, block_size: int = 1 << 20) -> None:
        """Stream through ``filepath`` and raise if ``encoding`` cannot decode it."""
        with open(filepath, "r", encoding=encoding, newline="") as handle:
            # Read fixed-size blocks so memory use stays bounded on large files.
            while handle.read(block_size):
                pass

    def load_csv_chunked(
        self, filepath: Union[str, Path], chunk_size: Optional[int] = None
    ) -> Iterator[pd.DataFrame]:
        """Yield the CSV at ``filepath`` as a sequence of DataFrame chunks.

        Each configured encoding is tried in order. Before parsing, the file
        is streamed once with the candidate encoding to confirm that it
        decodes completely. Without that check a decoding error in a late
        chunk would make the loop start over with the next encoding and yield
        the early rows a second time.

        Args:
            filepath: Path of the CSV file to read.
            chunk_size: Rows per chunk; falls back to
                ``config.processing.chunk_size`` when falsy.

        Yields:
            DataFrames parsed with ``self.dtypes``; malformed lines are
            skipped (``on_bad_lines="skip"``).

        Raises:
            ValueError: If no configured encoding can read the file, or if
                reading fails after chunks have already been yielded (at
                which point restarting would duplicate data).
        """
        chunk_size = chunk_size or self.config.processing.chunk_size
        encodings = self.config.processing.encoding_options
        filepath = Path(filepath)

        last_error: Optional[Exception] = None

        for encoding in encodings:
            emitted = 0
            try:
                logging.info(f"Reading {filepath} with encoding: {encoding}")

                # Reject the encoding up front instead of midway through.
                self._check_decodable(filepath, encoding)

                # The context manager closes the underlying file handle even
                # when the consumer stops iterating early.
                with pd.read_csv(
                    filepath,
                    encoding=encoding,
                    chunksize=chunk_size,
                    on_bad_lines="skip",
                    dtype=self.dtypes,
                ) as chunk_iter:
                    for chunk in chunk_iter:
                        emitted += 1
                        logging.debug(f"Processing optimized chunk {emitted}")
                        yield chunk

                logging.info(f"Successfully read {filepath} with encoding: {encoding}")
                return

            except Exception as e:
                if emitted:
                    # Data has already reached the consumer; trying another
                    # encoding from the top of the file would repeat it.
                    raise ValueError(
                        f"Failed reading {filepath} with encoding {encoding} "
                        f"after {emitted} chunk(s) were already yielded: {e}"
                    ) from e
                logging.warning(f"Failed with encoding {encoding}: {e}")
                last_error = e
                continue

        raise ValueError(
            f"Unable to decode {filepath} with any encoding: {encodings}"
        ) from last_error

    def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
        """Load the whole CSV into one DataFrame, honouring the size limit.

        Args:
            filepath: Path of the CSV file to read.

        Returns:
            The concatenated chunks with a fresh 0..n-1 index, reduced via
            ``_limit_dataset_size`` when ``config.data.max_dataset_size`` is
            set. An empty DataFrame is returned when no chunk was produced.

        Raises:
            ValueError: Propagated from ``load_csv_chunked``.
        """
        chunks = list(self.load_csv_chunked(filepath))

        if not chunks:
            return pd.DataFrame()

        logging.info(f"Concatenating {len(chunks)} optimized chunks")
        df = pd.concat(chunks, ignore_index=True, copy=False)

        # Drop the per-chunk frames before any further work to lower peak memory.
        del chunks
        gc.collect()

        # Apply dataset size limiting if configured
        if self.config.data.max_dataset_size is not None:
            df = self._limit_dataset_size(df)

        return df

    def _limit_dataset_size(self, df: pd.DataFrame) -> pd.DataFrame:
        """Reduce ``df`` to at most ``config.data.max_dataset_size`` rows.

        Args:
            df: Frame to limit.

        Returns:
            ``df`` itself when no limit is set or it already fits; otherwise a
            sample, balanced across the ``sex`` column when
            ``config.data.balance_by_sex`` is truthy and that column exists,
            and a plain random sample if not.
        """
        max_size = self.config.data.max_dataset_size

        if max_size is None or len(df) <= max_size:
            return df

        if self.config.data.balance_by_sex and "sex" in df.columns:
            return self._balanced_sample(df, max_size)

        # Simple random sampling
        return df.sample(n=max_size, random_state=self.config.data.random_seed)

    def _balanced_sample(self, df: pd.DataFrame, max_size: int) -> pd.DataFrame:
        """Sample up to ``max_size`` rows with an even split across ``sex`` values.

        The quota is ``max_size // n_categories`` per category, with the
        remainder handed out one each to the first categories. A category
        with fewer rows than its quota contributes all of its rows, so the
        result may be smaller than ``max_size``. Rows whose ``sex`` is missing
        are never selected.

        Args:
            df: Frame containing a ``sex`` column; any index is accepted.
            max_size: Upper bound on the number of rows returned.

        Returns:
            A shuffled copy of the selected rows with a fresh 0..n-1 index.
        """
        seed = self.config.data.random_seed
        sex_column = df["sex"]
        sex_values = sex_column.dropna().unique()

        if len(sex_values) == 0:
            logging.warning("No valid values found in sex column 'sex', using random sampling")
            return df.sample(n=max_size, random_state=seed)

        samples_per_sex, remaining_samples = divmod(max_size, len(sex_values))

        # Work with row positions rather than index labels: the final
        # selection uses ``iloc``, and labels only coincide with positions
        # when the frame carries a default RangeIndex.
        all_positions = pd.Series(range(len(df)))
        selected_positions = []

        for i, sex in enumerate(sex_values):
            # Missing comparisons (nullable dtypes) count as "no match".
            mask = (sex_column == sex).fillna(False).to_numpy(dtype=bool)
            candidates = all_positions[mask]

            # Distribute remaining samples to first categories
            quota = samples_per_sex + (1 if i < remaining_samples else 0)
            quota = min(quota, len(candidates))

            if quota > 0:
                # Vary the seed per category; a None seed stays None so the
                # unseeded configuration works here as it does for df.sample.
                category_seed = None if seed is None else seed + i
                picked = candidates.sample(n=quota, random_state=category_seed)
                selected_positions.extend(picked.tolist())
                logging.info(f"Sampled {quota} records for sex '{sex}'")

        if not selected_positions:
            logging.warning("No balanced samples could be created, using random sampling")
            return df.sample(n=max_size, random_state=seed)

        result = df.iloc[selected_positions].copy()

        # Shuffle so rows are not grouped by category in the output.
        result = result.sample(frac=1, random_state=seed).reset_index(drop=True)

        logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total")
        return result

    @classmethod
    def save_csv(
        cls, df: pd.DataFrame, filepath: Union[str, Path], create_dirs: bool = True
    ) -> None:
        """Write ``df`` to ``filepath`` as UTF-8 CSV without the index column.

        Args:
            df: Frame to write.
            filepath: Destination path.
            create_dirs: When true, create missing parent directories first.
        """
        filepath = Path(filepath)

        if create_dirs:
            filepath.parent.mkdir(parents=True, exist_ok=True)

        df.to_csv(filepath, index=False, encoding="utf-8")
        logging.info(f"Saved {len(df)} rows to {filepath}")
|