feat: enhance logging and memory management across modules
This commit is contained in:
+60
-16
@@ -1,17 +1,41 @@
|
||||
import gc
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union, Iterator
|
||||
from typing import Optional, Union, Iterator, Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
|
||||
# Column name -> pandas dtype, chosen to keep loaded frames small.
# The capitalised "Int*" names are pandas' nullable integer dtypes, so a
# missing value in one of these columns does not force it to float.
OPTIMIZED_DTYPES = {
    # Numeric columns with appropriate bit-width
    "year": "Int16",  # Years fit in 16-bit integer
    "words": "Int8",  # Word counts typically < 128
    "length": "Int16",  # Name lengths fit in 16-bit
    "annotated": "Int8",  # Binary flag (0/1)
    "ner_tagged": "Int8",  # Binary flag (0/1)
    # Categorical columns (memory efficient for repeated values)
    "sex": "category",
    "province": "category",
    "region": "category",
    "identified_category": "category",
    "transformation_type": "category",
    # String columns with proper string dtype
    "name": "string",
    "probable_native": "string",
    "probable_surname": "string",
    "identified_name": "string",
    "identified_surname": "string",
    "ner_entities": "string",
}
|
||||
|
||||
|
||||
class DataLoader:
|
||||
"""Reusable data loading utilities"""
|
||||
|
||||
def __init__(self, config: PipelineConfig, custom_dtypes: Optional[Dict] = None):
    """Store the pipeline configuration and build the dtype map for CSV reads.

    Args:
        config: Pipeline configuration, kept on the instance as ``self.config``.
        custom_dtypes: Optional column -> dtype overrides. An entry here
            replaces the ``OPTIMIZED_DTYPES`` entry for the same column.
    """
    self.config = config
    # Later unpacking wins, so caller-supplied dtypes override the defaults.
    self.dtypes = {**OPTIMIZED_DTYPES, **(custom_dtypes or {})}
|
||||
|
||||
def load_csv_chunked(
|
||||
self, filepath: Union[str, Path], chunk_size: Optional[int] = None
|
||||
@@ -19,19 +43,23 @@ class DataLoader:
|
||||
"""Load CSV file in chunks for memory efficiency"""
|
||||
chunk_size = chunk_size or self.config.processing.chunk_size
|
||||
encodings = self.config.processing.encoding_options
|
||||
|
||||
filepath = Path(filepath)
|
||||
|
||||
for encoding in encodings:
|
||||
try:
|
||||
logging.info(f"Attempting to read {filepath} with encoding: {encoding}")
|
||||
logging.info(f"Reading {filepath} with encoding: {encoding}")
|
||||
|
||||
# Read with optimal dtypes
|
||||
chunk_iter = pd.read_csv(
|
||||
filepath, encoding=encoding, chunksize=chunk_size, on_bad_lines="skip"
|
||||
filepath,
|
||||
encoding=encoding,
|
||||
chunksize=chunk_size,
|
||||
on_bad_lines="skip",
|
||||
dtype=self.dtypes,
|
||||
)
|
||||
|
||||
for i, chunk in enumerate(chunk_iter):
|
||||
logging.debug(f"Processing chunk {i+1}")
|
||||
logging.debug(f"Processing optimized chunk {i + 1}")
|
||||
yield chunk
|
||||
|
||||
logging.info(f"Successfully read {filepath} with encoding: {encoding}")
|
||||
@@ -44,12 +72,20 @@ class DataLoader:
|
||||
raise ValueError(f"Unable to decode {filepath} with any encoding: {encodings}")
|
||||
|
||||
def load_csv_complete(self, filepath: Union[str, Path]) -> pd.DataFrame:
|
||||
"""Load complete CSV file into memory with size limiting and balancing"""
|
||||
chunks = list(self.load_csv_chunked(filepath))
|
||||
"""Load complete CSV with memory optimization"""
|
||||
chunks = []
|
||||
for chunk in self.load_csv_chunked(filepath):
|
||||
chunks.append(chunk)
|
||||
|
||||
if not chunks:
|
||||
return pd.DataFrame()
|
||||
|
||||
df = pd.concat(chunks, ignore_index=True)
|
||||
logging.info(f"Concatenating {len(chunks)} optimized chunks")
|
||||
df = pd.concat(chunks, ignore_index=True, copy=False)
|
||||
|
||||
# Cleanup chunks from memory
|
||||
del chunks
|
||||
gc.collect()
|
||||
|
||||
# Apply dataset size limiting if configured
|
||||
if self.config.data.max_dataset_size is not None:
|
||||
@@ -87,27 +123,35 @@ class DataLoader:
|
||||
balanced_samples = []
|
||||
|
||||
for i, sex in enumerate(sex_values):
|
||||
sex_df = df[df["sex"] == sex]
|
||||
# Use boolean indexing instead of creating temporary DataFrames
|
||||
sex_mask = df["sex"] == sex
|
||||
sex_indices = df[sex_mask].index
|
||||
|
||||
# Distribute remaining samples to first categories
|
||||
current_samples = samples_per_sex + (1 if i < remaining_samples else 0)
|
||||
current_samples = min(current_samples, len(sex_df))
|
||||
current_samples = min(current_samples, len(sex_indices))
|
||||
|
||||
if current_samples > 0:
|
||||
sample = sex_df.sample(n=current_samples, random_state=self.config.data.random_seed + i)
|
||||
balanced_samples.append(sample)
|
||||
# Sample indices instead of DataFrame
|
||||
sampled_indices = pd.Series(sex_indices).sample(
|
||||
n=current_samples, random_state=self.config.data.random_seed + i
|
||||
)
|
||||
balanced_samples.extend(sampled_indices.tolist())
|
||||
logging.info(f"Sampled {current_samples} records for sex '{sex}'")
|
||||
|
||||
if not balanced_samples:
|
||||
logging.warning("No balanced samples could be created, using random sampling")
|
||||
return df.sample(n=max_size, random_state=self.config.data.random_seed)
|
||||
|
||||
result = pd.concat(balanced_samples, ignore_index=True)
|
||||
# Create result using iloc with indices (no copying until final step)
|
||||
result = df.iloc[balanced_samples].copy()
|
||||
|
||||
# Shuffle the final result
|
||||
result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(drop=True)
|
||||
result = result.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
|
||||
drop=True
|
||||
)
|
||||
|
||||
logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total records")
|
||||
logging.info(f"Created balanced dataset with {len(result)} records from {len(df)} total")
|
||||
return result
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -1,3 +1 @@
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -9,14 +9,9 @@ class RegionMapper:
|
||||
def __init__(self, mapping: Optional[Dict] = None):
    """Keep the region lookup table on the instance.

    Args:
        mapping: Lookup table to use. Any falsy value (``None`` or an empty
            dict) falls back to the module-level ``REGION_MAPPING``.
    """
    self.mapping = mapping or REGION_MAPPING
|
||||
|
||||
def map_region_to_province(self, region: str) -> str:
    """Map a region to its province.

    The region is converted with ``str()``, lower-cased and stripped before
    being looked up in ``self.mapping``. The second element of the matched
    entry is returned in lower case; a region with no entry yields
    ``"autres"``.

    Args:
        region: Region name to look up.

    Returns:
        The lower-cased province name, or ``"autres"`` when unmapped.
    """
    region_lower = str(region).lower().strip()
    return self.mapping.get(region_lower, ("AUTRES", "AUTRES"))[1].lower()
|
||||
|
||||
def map(self, series: pd.Series) -> pd.Series:
    """Vectorized region to province mapping.

    Each value is lower-cased and stripped of surrounding whitespace, then
    looked up in ``self.mapping``. The second element of the matched entry
    is returned in lower case; values with no entry (including missing
    values) yield ``"autres"``.

    Args:
        series: Region names; must support the pandas ``.str`` accessor.

    Returns:
        A Series of lower-cased province names aligned with ``series``.
    """
    fallback = ("AUTRES", "AUTRES")
    # Strip as well as lower-case so padded values such as " Kinshasa "
    # resolve the same way the single-value lookup normalizes its input.
    normalized = series.str.lower().str.strip()
    return normalized.map(lambda r: self.mapping.get(r, fallback)[1].lower())

# Previous name of the method, kept so existing callers keep working.
map_regions_vectorized = map
|
||||
|
||||
@@ -34,6 +29,7 @@ class RegionMapper:
|
||||
"sud-kivu",
|
||||
"kasai-occidental",
|
||||
"kasai-oriental",
|
||||
"autres",
|
||||
]
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user