feat: add osm data
This commit is contained in:
@@ -19,6 +19,7 @@ class DataConfig(BaseModel):
|
||||
"ner_spacy": "names_ner.spacy",
|
||||
}
|
||||
)
|
||||
selected_columns: list[str] = field(default=["name", "sex", "region"])
|
||||
split_evaluation: bool = False
|
||||
split_by_province: bool = True
|
||||
split_by_gender: bool = True
|
||||
|
||||
@@ -8,12 +8,10 @@ class RegionMapper:
|
||||
|
||||
def __init__(self, mapping: Optional[Dict] = None):
|
||||
self.mapping = mapping or REGION_MAPPING
|
||||
self.mapping = {k.lower(): v[1].upper() for k, v in self.mapping.items()}
|
||||
|
||||
def map(self, series: pd.Series) -> pd.Series:
|
||||
"""Vectorized region to province mapping"""
|
||||
return series.str.lower().map(
|
||||
lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
|
||||
)
|
||||
return series.str.lower().map(self.mapping).fillna("AUTRES")
|
||||
|
||||
@staticmethod
|
||||
def get_provinces():
|
||||
|
||||
@@ -30,9 +30,8 @@ class TextCleaner:
|
||||
def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Clean all text columns in a DataFrame"""
|
||||
df = df.copy()
|
||||
text_columns = df.select_dtypes(include="object").columns
|
||||
|
||||
for col in text_columns:
|
||||
columns = df.select_dtypes(include=["object", "string"]).columns
|
||||
for col in columns:
|
||||
df[col] = self.clean_text_series(df[col])
|
||||
|
||||
return df
|
||||
|
||||
Reference in New Issue
Block a user