feat: add osm data

This commit is contained in:
2025-09-21 16:23:44 +02:00
parent 63e23d6600
commit c1b502c878
39 changed files with 955 additions and 338 deletions
+1
View File
@@ -19,6 +19,7 @@ class DataConfig(BaseModel):
"ner_spacy": "names_ner.spacy",
}
)
selected_columns: list[str] = field(default=["name", "sex", "region"])
split_evaluation: bool = False
split_by_province: bool = True
split_by_gender: bool = True
+2 -4
View File
@@ -8,12 +8,10 @@ class RegionMapper:
def __init__(self, mapping: Optional[Dict] = None):
# Start from the caller-supplied mapping; a missing or falsy (e.g. empty)
# mapping falls back to the module-level REGION_MAPPING.
self.mapping = mapping or REGION_MAPPING
# Normalise once up front: keys are lower-cased and each value is reduced to
# its element at index 1, upper-cased.
# NOTE(review): index 1 is presumably the province name - confirm against
# the structure of REGION_MAPPING, which is not visible in this view.
self.mapping = {k.lower(): v[1].upper() for k, v in self.mapping.items()}
def map(self, series: pd.Series) -> pd.Series:
"""Vectorized region to province mapping"""
# NOTE(review): this diff hunk shows two return statements with the +/-
# markers lost. Presumably the lambda-based return below is the removed
# version and the dict-based return after it is the added one - confirm
# against the repository before relying on either.
return series.str.lower().map(
lambda r: self.mapping.get(r, ("AUTRES", "AUTRES"))[1].lower()
)
# Lower-case the input, look each value up in self.mapping, and replace
# values that have no entry (NaN after the lookup) with "AUTRES".
return series.str.lower().map(self.mapping).fillna("AUTRES")
@staticmethod
def get_provinces():
+2 -3
View File
@@ -30,9 +30,8 @@ class TextCleaner:
def clean_dataframe_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
"""Clean all text columns in a DataFrame"""
# Work on a copy so the caller's DataFrame is left untouched.
df = df.copy()
# NOTE(review): this diff hunk shows two selection/loop pairs with the +/-
# markers lost. Presumably the `text_columns` pair is the removed version
# and the `columns` pair is the added one - confirm against the repository.
# The first variant selects only object-dtype columns.
text_columns = df.select_dtypes(include="object").columns
for col in text_columns:
# The second variant also selects columns with the pandas "string" dtype.
columns = df.select_dtypes(include=["object", "string"]).columns
for col in columns:
# Replace each selected column with its cleaned version.
df[col] = self.clean_text_series(df[col])
return df