Files
drc-ners-nlp/core/utils/text_cleaner.py
T

39 lines
1.2 KiB
Python

from typing import Optional, Dict
import pandas as pd
class TextCleaner:
    """Reusable text cleaning utilities.

    ``patterns`` maps a descriptive name to a pattern; every match of each
    pattern is replaced by a single space, in dictionary order. Patterns
    whose name is listed in ``_REGEX_PATTERN_NAMES`` are interpreted as
    regular expressions, all others as literal substrings.
    """

    # Names of the patterns whose value is a regular expression. The default
    # "extra_whitespace" pattern (r"\s+") must be here: matched literally it
    # would only ever replace the four characters backslash-s-plus.
    _REGEX_PATTERN_NAMES = frozenset({"multiple_spaces", "extra_whitespace"})

    def __init__(self, patterns: Optional[Dict[str, str]] = None):
        """Store the cleaning patterns.

        Args:
            patterns: Mapping of pattern name to pattern. A falsy value
                (``None`` or an empty dict) selects the built-in defaults.
        """
        self.patterns = patterns or {
            "null_bytes": "\x00",
            "non_breaking_spaces": "\u00a0",
            "multiple_spaces": r" +",
            "extra_whitespace": r"\s+",
        }

    def clean_text_series(self, series: pd.Series) -> pd.Series:
        """Clean a pandas Series of text data.

        Values are converted with ``astype(str)``, every configured pattern
        is replaced by a single space, then the result is stripped of
        leading/trailing whitespace and lower-cased.

        NOTE(review): because of ``astype(str)``, missing values may come out
        as literal text (e.g. "nan") depending on the pandas version --
        confirm whether callers rely on that.

        Args:
            series: The Series to clean.

        Returns:
            A new Series with the cleaned text; ``series`` is not modified.
        """
        cleaned = series.astype(str)
        for pattern_name, pattern in self.patterns.items():
            is_regex = pattern_name in self._REGEX_PATTERN_NAMES
            cleaned = cleaned.str.replace(pattern, " ", regex=is_regex)
        return cleaned.str.strip().str.lower()

    def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean all object-dtype columns in a DataFrame.

        Args:
            df: The DataFrame whose ``object`` columns should be cleaned.

        Returns:
            A copy of ``df`` in which each ``object`` column has been passed
            through :meth:`clean_text_series`; other columns are unchanged.
        """
        df = df.copy()  # never mutate the caller's frame
        text_columns = df.select_dtypes(include="object").columns
        for col in text_columns:
            df[col] = self.clean_text_series(df[col])
        return df