refactoring: add initial pipeline configuration and model classes
This commit is contained in:
@@ -0,0 +1,38 @@
|
||||
from typing import Optional, Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class TextCleaner:
|
||||
"""Reusable text cleaning utilities"""
|
||||
|
||||
def __init__(self, patterns: Optional[Dict[str, str]] = None):
|
||||
self.patterns = patterns or {
|
||||
"null_bytes": "\x00",
|
||||
"non_breaking_spaces": "\u00a0",
|
||||
"multiple_spaces": r" +",
|
||||
"extra_whitespace": r"\s+",
|
||||
}
|
||||
|
||||
def clean_text_series(self, series: pd.Series) -> pd.Series:
|
||||
"""Clean a pandas Series of text data"""
|
||||
cleaned = series.astype(str)
|
||||
|
||||
# Apply cleaning patterns
|
||||
for pattern_name, pattern in self.patterns.items():
|
||||
if pattern_name == "multiple_spaces":
|
||||
cleaned = cleaned.str.replace(pattern, " ", regex=True)
|
||||
else:
|
||||
cleaned = cleaned.str.replace(pattern, " ", regex=False)
|
||||
|
||||
return cleaned.str.strip().str.lower()
|
||||
|
||||
def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Clean all text columns in a DataFrame"""
|
||||
df = df.copy()
|
||||
text_columns = df.select_dtypes(include="object").columns
|
||||
|
||||
for col in text_columns:
|
||||
df[col] = self.clean_text_series(df[col])
|
||||
|
||||
return df
|
||||
Reference in New Issue
Block a user