39 lines
1.2 KiB
Python
39 lines
1.2 KiB
Python
from typing import Optional, Dict
|
|
|
|
import pandas as pd
|
|
|
|
|
|
class TextCleaner:
    """Reusable text cleaning utilities.

    A cleaner holds a mapping of ``pattern name -> pattern``. Every match of
    every pattern is replaced with a single space, after which the text is
    stripped of leading/trailing whitespace and lower-cased.

    Patterns whose *name* is listed in ``_REGEX_PATTERN_NAMES`` are applied
    as regular expressions; all other patterns are matched literally.
    """

    # Names of the patterns whose values are regular expressions. Both of
    # the default whitespace patterns (r" +" and r"\s+") only make sense as
    # regexes; matching r"\s+" literally would look for the four characters
    # backslash, "s", "+" and never collapse real whitespace.
    _REGEX_PATTERN_NAMES = frozenset({"multiple_spaces", "extra_whitespace"})

    def __init__(self, patterns: Optional[Dict[str, str]] = None):
        """Create a cleaner.

        Args:
            patterns: Mapping of pattern name to pattern text. When omitted
                (or empty), the default set below is used.
        """
        self.patterns = patterns or {
            "null_bytes": "\x00",
            "non_breaking_spaces": "\u00a0",
            "multiple_spaces": r" +",
            "extra_whitespace": r"\s+",
        }

    def clean_text_series(self, series: pd.Series) -> pd.Series:
        """Clean a pandas Series of text data.

        Each value is converted to ``str``, every configured pattern is
        replaced with a single space (in the mapping's iteration order), and
        the result is stripped and lower-cased.

        Missing values (as reported by ``Series.isna``) are kept missing in
        the result instead of being turned into text such as ``"nan"``.

        Args:
            series: The values to clean.

        Returns:
            A new Series with the same index holding the cleaned text.
        """
        # Remember where the input was missing before astype(str) can turn
        # those entries into ordinary strings.
        missing = series.isna()
        cleaned = series.astype(str)

        # Apply cleaning patterns
        for pattern_name, pattern in self.patterns.items():
            is_regex = pattern_name in self._REGEX_PATTERN_NAMES
            cleaned = cleaned.str.replace(pattern, " ", regex=is_regex)

        cleaned = cleaned.str.strip().str.lower()

        # Restore the original missing entries.
        return cleaned.mask(missing)

    def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean all text columns in a DataFrame.

        Columns selected by ``select_dtypes(include="object")`` are passed
        through ``clean_text_series``; other columns are left untouched.

        Args:
            df: The input frame. It is not modified.

        Returns:
            A copy of ``df`` with its object-dtype columns cleaned.
        """
        df = df.copy()
        text_columns = df.select_dtypes(include="object").columns

        for col in text_columns:
            df[col] = self.clean_text_series(df[col])

        return df
|