import os
import argparse

import pandas as pd

from misc import DATA_DIR, REGION_MAPPING, logging


def _clean_chunk(chunk: pd.DataFrame) -> pd.DataFrame:
    """Drop rows lacking name/sex/region and normalise every text column of one chunk."""
    # Drop rows with essential missing values early
    chunk = chunk.dropna(subset=['name', 'sex', 'region'])

    for col in chunk.select_dtypes(include='object').columns:
        present = chunk[col].notna()
        normalised = (
            chunk[col]
            .astype(str)
            .str.replace('\x00', ' ', regex=False)
            .str.replace('\u00a0', ' ', regex=False)
            .str.replace(' +', ' ', regex=True)
            .str.strip()
            .str.lower()
        )
        # astype(str) turns missing values into the literal text 'nan';
        # put them back to missing so they are not mistaken for real data.
        chunk[col] = normalised.where(present)

    return chunk


def clean(filepath) -> pd.DataFrame:
    """
    Clean the CSV file by removing null bytes, non-breaking spaces, and extra spaces.
    Also, it attempts to read the file with different encodings to handle potential encoding issues.

    The encodings 'utf-8', 'utf-16' and 'latin1' are tried in that order. For the
    first one that works, rows missing 'name', 'sex' or 'region' are dropped, every
    text column is whitespace-normalised and lower-cased, and the result is written
    back to ``filepath`` as UTF-8 (the input file is overwritten).

    Args:
        filepath: Path of the CSV file to clean in place.

    Returns:
        The cleaned DataFrame.

    Raises:
        UnicodeDecodeError: If the file could not be processed with any of the
            encodings. The failure from the last attempt is attached as the cause.
    """
    encodings = ['utf-8', 'utf-16', 'latin1']
    last_error = None

    for enc in encodings:
        try:
            logging.info(f"Trying to read {filepath} with encoding: {enc}")

            # Use chunked reading to handle large files. The context manager
            # guarantees the reader is closed, even when a chunk fails, before
            # the same path is overwritten below.
            with pd.read_csv(
                filepath, encoding=enc, chunksize=100_000, on_bad_lines='skip'
            ) as reader:
                cleaned_chunks = [_clean_chunk(chunk) for chunk in reader]

            df = pd.concat(cleaned_chunks, ignore_index=True)
            df.to_csv(filepath, index=False, encoding='utf-8')

            logging.info(f"Successfully read with encoding: {enc}")
            return df
        except Exception as exc:
            # Best effort: fall through to the next encoding, but keep the reason
            # visible instead of discarding it.
            logging.warning(f"Failed to process {filepath} with encoding {enc}: {exc!r}")
            last_error = exc

    # UnicodeDecodeError takes (encoding, object, start, end, reason); calling it
    # with a single message raises TypeError instead of the intended error.
    raise UnicodeDecodeError(
        '/'.join(encodings),
        b'',
        0,
        1,
        f"Unable to decode {filepath} with common encodings.",
    ) from last_error


def process(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process the DataFrame to extract features and clean data.
    This includes counting words, calculating name length, and extracting probable native names and surnames.
    Also maps regions to provinces based on REGION_MAPPING.

    The frame is modified in place and also returned. It must contain the
    columns 'name', 'year' and 'region'.

    Added columns: 'words', 'length', 'probable_native', 'probable_surname',
    'identified_category', 'identified_name', 'identified_surname', 'annotated'
    and 'province'. 'year' is converted to int.
    """
    logging.info("Preprocessing names")

    df['words'] = df['name'].str.count(' ') + 1
    df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()
    df['year'] = df['year'].astype(int)

    # Calculate probable_native and probable_surname:
    # everything but the last word, and the last word, respectively.
    name_split = df['name'].str.split()
    df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
    df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')

    df['identified_category'] = df['words'].apply(lambda x: 'compose' if x > 3 else 'simple')
    df['identified_name'] = None
    df['identified_surname'] = None
    df['annotated'] = 0

    # We can assume that if a name has exactly 3 words, the first two are the native name and the last is the surname
    # This is a common pattern in Congolese names
    three_word_mask = df['words'] == 3
    df.loc[three_word_mask, 'identified_name'] = df.loc[three_word_mask, 'probable_native']
    df.loc[three_word_mask, 'identified_surname'] = df.loc[three_word_mask, 'probable_surname']
    df.loc[three_word_mask, 'annotated'] = 1

    logging.info("Mapping regions to provinces")
    # Regions absent from REGION_MAPPING fall back to 'AUTRES'; the second
    # element of the mapped tuple is used as the province.
    df['province'] = df['region'].map(lambda r: REGION_MAPPING.get(r, ('AUTRES', 'AUTRES'))[1])
    df['province'] = df['province'].str.lower()

    return df


def save_artifacts(df: pd.DataFrame, split_eval: bool = True, split_by_sex: bool = True) -> None:
    """
    Splits the input DataFrame into evaluation and featured datasets, saves them as CSV files,
    and additionally saves separate CSV files for male and female entries if requested.

    Args:
        df: Processed dataset; needs a 'sex' column when ``split_by_sex`` is set.
        split_eval: When True, a 20% sample (random_state=42) is written to
            'names_evaluation.csv' and the remaining rows to 'names_featured.csv'.
            When False, the whole frame goes to 'names_featured.csv'.
        split_by_sex: When True, rows with sex 'm' and 'f' are also written to
            'names_males.csv' and 'names_females.csv'.

    All files are written inside DATA_DIR.
    """
    if split_eval:
        logging.info("Saving evaluation and featured datasets")
        eval_idx = df.sample(frac=0.2, random_state=42).index
        df_evaluation = df.loc[eval_idx]
        df_featured = df.drop(index=eval_idx)
        df_evaluation.to_csv(os.path.join(DATA_DIR, 'names_evaluation.csv'), index=False)
        df_featured.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
    else:
        df.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)

    if split_by_sex:
        logging.info("Saving by sex")
        df[df['sex'] == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
        df[df['sex'] == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)


def main(split_eval: bool = True, split_by_sex: bool = True):
    """Clean and process DATA_DIR/names.csv, then save the derived datasets."""
    df = process(clean(os.path.join(DATA_DIR, 'names.csv')))
    save_artifacts(df, split_eval=split_eval, split_by_sex=split_by_sex)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Prepare name datasets with optional splits.")
    parser.add_argument('--split_eval', action='store_true', default=True,
                        help="Split into evaluation and featured datasets (default: True)")
    parser.add_argument('--no-split_eval', action='store_false', dest='split_eval',
                        help="Do not split into evaluation and featured datasets")
    parser.add_argument('--split_by_sex', action='store_true', default=True,
                        help="Split by sex into male/female datasets (default: True)")
    parser.add_argument('--no-split_by_sex', action='store_false', dest='split_by_sex',
                        help="Do not split by sex into male/female datasets")
    args = parser.parse_args()

    main(split_eval=args.split_eval, split_by_sex=args.split_by_sex)