import os
import argparse

import pandas as pd

from misc import DATA_DIR, REGION_MAPPING, logging


def _clean_chunk(chunk: pd.DataFrame) -> pd.DataFrame:
    """Drop rows lacking essential fields and normalise the text columns of one chunk."""
    # Rows without a name, sex or region cannot be used by the later steps.
    # Copy so the column assignments below never write into a view of the reader's chunk.
    chunk = chunk.dropna(subset=['name', 'sex', 'region']).copy()

    for col in chunk.select_dtypes(include='object').columns:
        present = chunk[col].notna()
        normalised = (
            chunk[col]
            .astype(str)
            .str.replace('\x00', ' ', regex=False)    # null bytes
            .str.replace('\u00a0', ' ', regex=False)  # non-breaking spaces
            .str.replace(r' +', ' ', regex=True)      # collapse runs of spaces
            .str.strip()
            .str.lower()
        )
        # astype(str) turns missing values into the literal text 'nan';
        # keep those cells missing instead of inventing a value.
        chunk[col] = normalised.where(present)

    return chunk


def clean(filepath) -> pd.DataFrame:
    """
    Read, clean and rewrite the CSV file at *filepath*.

    The file is read in chunks with each candidate encoding in turn
    (utf-8, utf-16, latin1). For the first encoding that works, rows missing
    'name', 'sex' or 'region' are dropped and every text column has null
    bytes and non-breaking spaces replaced by spaces, repeated spaces
    collapsed, surrounding whitespace stripped and letters lower-cased.

    NOTE: the cleaned data is written back to *filepath* as UTF-8,
    overwriting the original file.

    Returns:
        The cleaned DataFrame.

    Raises:
        UnicodeDecodeError: if the file could not be read and cleaned with
            any of the candidate encodings. The last underlying error is
            attached as ``__cause__``.
    """
    encodings = ['utf-8', 'utf-16', 'latin1']
    last_error = None

    for enc in encodings:
        logging.info(f"Trying to read {filepath} with encoding: {enc}")
        try:
            # Chunked reading keeps memory bounded on large files; the context
            # manager closes the file handle before the path is overwritten below.
            with pd.read_csv(filepath, encoding=enc, chunksize=100_000,
                             on_bad_lines='skip') as reader:
                cleaned_chunks = [_clean_chunk(chunk) for chunk in reader]
            df = pd.concat(cleaned_chunks, ignore_index=True)
        except Exception as exc:
            # A wrong encoding can surface as a decode error, a parser error or
            # as missing columns, so stay broad here, but record why it failed.
            logging.warning(f"Could not read {filepath} with encoding {enc}: {exc!r}")
            last_error = exc
            continue

        # Written outside the try block: a failed write must not be mistaken
        # for an encoding problem and retried against a half-written file.
        df.to_csv(filepath, index=False, encoding='utf-8')
        logging.info(f"Successfully read with encoding: {enc}")
        return df

    # UnicodeDecodeError requires (encoding, object, start, end, reason);
    # passing only a message would raise TypeError instead.
    raise UnicodeDecodeError(
        ', '.join(encodings),
        b'',
        0,
        1,
        f"Unable to decode {filepath} with common encodings.",
    ) from last_error


def process(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add derived feature columns to *df* and return it.

    The DataFrame is modified in place. Columns added:

    - words: number of space-separated words in 'name'
    - length: number of characters in 'name', spaces excluded
    - probable_native: every word of the name except the last
    - probable_surname: the last word of the name
    - identified_category: 'compose' when the name has more than three
      words, otherwise 'simple'
    - identified_name / identified_surname: placeholders set to None
    - province: lower-cased second element of the REGION_MAPPING entry for
      'region' ('autres' when the region is not in the mapping)
    - annotated: 0
    """
    logging.info("Preprocessing names")
    df['words'] = df['name'].str.count(' ') + 1
    df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()

    name_split = df['name'].str.split()
    df['probable_native'] = name_split.apply(
        lambda x: ' '.join(x[:-1]) if len(x) > 1 else ''
    )
    df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')

    df['identified_category'] = df['words'].apply(
        lambda x: 'compose' if x > 3 else 'simple'
    )
    df['identified_name'] = None
    df['identified_surname'] = None

    logging.info("Mapping regions to provinces")
    df['province'] = df['region'].map(
        lambda r: REGION_MAPPING.get(r, ('AUTRES', 'AUTRES'))[1]
    )
    df['province'] = df['province'].str.lower()
    df['annotated'] = 0

    return df


def save_artifacts(df: pd.DataFrame, split_eval: bool = True, split_by_sex: bool = True) -> None:
    """
    Write the processed dataset to CSV files inside DATA_DIR.

    When *split_eval* is true, a reproducible 20% sample (random_state=42) is
    saved as 'names_evaluation.csv' and the remaining rows as
    'names_featured.csv'; otherwise the whole DataFrame is saved as
    'names_featured.csv'.

    When *split_by_sex* is true, rows of the full DataFrame with sex 'm' and
    'f' are additionally saved as 'names_males.csv' and 'names_females.csv'.
    """
    if split_eval:
        logging.info("Saving evaluation and featured datasets")
        eval_idx = df.sample(frac=0.2, random_state=42).index
        df_evaluation = df.loc[eval_idx]
        df_featured = df.drop(index=eval_idx)

        df_evaluation.to_csv(os.path.join(DATA_DIR, 'names_evaluation.csv'), index=False)
        df_featured.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
    else:
        df.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)

    if split_by_sex:
        logging.info("Saving by sex")
        df[df['sex'] == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
        df[df['sex'] == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)


def main(split_eval: bool = True, split_by_sex: bool = True):
    """Clean 'names.csv' in DATA_DIR, derive features and save the resulting datasets."""
    df = process(clean(os.path.join(DATA_DIR, 'names.csv')))
    save_artifacts(df, split_eval=split_eval, split_by_sex=split_by_sex)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Prepare name datasets with optional splits.")
    # Both options default to True; the --no-* flags switch them off.
    parser.add_argument('--split_eval', action='store_true', default=True,
                        help="Split into evaluation and featured datasets (default: True)")
    parser.add_argument('--no-split_eval', action='store_false', dest='split_eval',
                        help="Do not split into evaluation and featured datasets")
    parser.add_argument('--split_by_sex', action='store_true', default=True,
                        help="Split by sex into male/female datasets (default: True)")
    parser.add_argument('--no-split_by_sex', action='store_false', dest='split_by_sex',
                        help="Do not split by sex into male/female datasets")
    args = parser.parse_args()

    main(split_eval=args.split_eval, split_by_sex=args.split_by_sex)