Files
drc-ners-nlp/processing/prepare.py
T

111 lines
4.8 KiB
Python

import os
import argparse
import pandas as pd
from misc import DATA_DIR, REGION_MAPPING, logging
def clean(filepath) -> pd.DataFrame:
"""
Clean the CSV file by removing null bytes, non-breaking spaces, and extra spaces.
Also, it attempts to read the file with different encodings to handle potential encoding issues.
"""
encodings = ['utf-8', 'utf-16', 'latin1']
for enc in encodings:
try:
logging.info(f"Trying to read {filepath} with encoding: {enc}")
# Use chunked reading to handle large files
chunks = pd.read_csv(filepath, encoding=enc, chunksize=100_000, on_bad_lines='skip')
cleaned_chunks = []
for chunk in chunks:
# Drop rows with essential missing values early
chunk = chunk.dropna(subset=['name', 'sex', 'region'])
# Clean string columns in-place
for col in chunk.select_dtypes(include='object').columns:
chunk[col] = (
chunk[col]
.astype(str)
.str.replace('\x00', ' ', regex=False)
.str.replace('\u00a0', ' ', regex=False)
.str.replace(' +', ' ', regex=True)
.str.strip()
.str.lower()
)
cleaned_chunks.append(chunk)
df = pd.concat(cleaned_chunks, ignore_index=True)
df.to_csv(filepath, index=False, encoding='utf-8')
logging.info(f"Successfully read with encoding: {enc}")
return df
except Exception:
continue
raise UnicodeDecodeError(f"Unable to decode {filepath} with common encodings.")
def process(df: pd.DataFrame) -> pd.DataFrame:
"""
Process the DataFrame to extract features and clean data.
This includes counting words, calculating name length, and extracting probable native names and surnames.
Also maps regions to provinces based on REGION_MAPPING.
"""
logging.info("Preprocessing names")
df['words'] = df['name'].str.count(' ') + 1
df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()
name_split = df['name'].str.split()
df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')
df['identified_category'] = df['words'].apply(lambda x: 'compose' if x > 3 else 'simple')
df['identified_name'] = None
df['identified_surname'] = None
logging.info("Mapping regions to provinces")
df['province'] = df['region'].map(lambda r: REGION_MAPPING.get(r, ('AUTRES', 'AUTRES'))[1])
df['province'] = df['province'].str.lower()
df['annotated'] = 0
return df
def save_artifacts(df: pd.DataFrame, split_eval: bool = True, split_by_sex: bool = True) -> None:
"""
Splits the input DataFrame into evaluation and featured datasets, saves them as CSV files,
and additionally saves separate CSV files for male and female entries if requested.
"""
if split_eval:
logging.info("Saving evaluation and featured datasets")
eval_idx = df.sample(frac=0.2, random_state=42).index
df_evaluation = df.loc[eval_idx]
df_featured = df.drop(index=eval_idx)
df_evaluation.to_csv(os.path.join(DATA_DIR, 'names_evaluation.csv'), index=False)
df_featured.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
else:
df.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
if split_by_sex:
logging.info("Saving by sex")
df[df['sex'] == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
df[df['sex'] == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)
def main(split_eval: bool = True, split_by_sex: bool = True):
df = process(clean(os.path.join(DATA_DIR, 'names.csv')))
save_artifacts(df, split_eval=split_eval, split_by_sex=split_by_sex)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Prepare name datasets with optional splits.")
parser.add_argument('--split_eval', action='store_true', default=True, help="Split into evaluation and featured datasets (default: True)")
parser.add_argument('--no-split_eval', action='store_false', dest='split_eval', help="Do not split into evaluation and featured datasets")
parser.add_argument('--split_by_sex', action='store_true', default=True, help="Split by sex into male/female datasets (default: True)")
parser.add_argument('--no-split_by_sex', action='store_false', dest='split_by_sex', help="Do not split by sex into male/female datasets")
args = parser.parse_args()
main(split_eval=args.split_eval, split_by_sex=args.split_by_sex)