From eacbb94a48b7eae4b99cc661a73b3d1f31d7eb66 Mon Sep 17 00:00:00 2001
From: bernard-ng
Date: Fri, 18 Jul 2025 22:49:45 +0200
Subject: [PATCH] experiment: using LLM for initial annotation

---
 README.md                        |  1 +
 misc/__init__.py                 |  6 ++-
 processing/annotation/prepare.py | 72 ++++++++++++++++++++++++++++++++
 processing/annotation/test.py    | 27 ++++++++++++
 processing/gender/prepare.py     | 71 ++++++++++++++++++++-----------
 prompt.txt                       | 31 ++++++++++++++
 6 files changed, 182 insertions(+), 26 deletions(-)
 create mode 100644 processing/annotation/prepare.py
 create mode 100644 processing/annotation/test.py
 create mode 100644 prompt.txt

diff --git a/README.md b/README.md
index ac62e28..c400102 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ pip install -r requirements.txt
 ### 1. Dataset Preparation
 ```bash
 python -m processing.gender.prepare
+python -m processing.annotation.prepare
 ```
 
 ### 2. Training
diff --git a/misc/__init__.py b/misc/__init__.py
index e393613..731f6d9 100644
--- a/misc/__init__.py
+++ b/misc/__init__.py
@@ -3,7 +3,6 @@ import io
 import json
 import os
 import pickle
-from typing import Optional
 from typing import List, Dict
 
 # Paths
@@ -78,3 +77,8 @@ def save_pickle(obj, path):
 def load_pickle(path: str):
     with open(path, "rb") as f:
         return pickle.load(f)
+
+
+def load_prompt() -> str:
+    with open(os.path.join(ROOT_DIR, 'prompt.txt'), 'r') as f:
+        return f.read()
diff --git a/processing/annotation/prepare.py b/processing/annotation/prepare.py
new file mode 100644
index 0000000..4991b68
--- /dev/null
+++ b/processing/annotation/prepare.py
@@ -0,0 +1,72 @@
+import os
+
+import ollama
+import pandas as pd
+from pydantic import BaseModel, ValidationError
+from tqdm import tqdm
+
+from misc import load_prompt, load_csv_dataset, DATA_DIR
+
+
+class NameAnalysis(BaseModel):
+    identified_name: str | None
+    identified_surname: str | None
+    identified_category: str | None
+
+
+def main():
+    dataset = pd.DataFrame(load_csv_dataset('names_featured.csv'))
+    prompt = load_prompt()
+
+    print(">> Filtering dataset for names that need analysis...")
+    to_analyze = dataset[dataset['llm_annotated'] == 0].copy()
+    if to_analyze.empty:
+        print(">> No names to analyze.")
+        return
+
+    client = ollama.Client()
+    updates = []
+
+    print(">> Starting name analysis with LLM...")
+    for row in tqdm(to_analyze.itertuples(index=True), total=len(to_analyze)):
+        name = row.name
+        try:
+            response = client.chat(
+                model="llama3.2:3b",
+                messages=[
+                    {"role": "system", "content": prompt},
+                    {"role": "user", "content": name}
+                ],
+                format=NameAnalysis.model_json_schema()
+            )
+            analysis = NameAnalysis.model_validate_json(response.message.content)
+            result = analysis.model_dump()
+        except (ValidationError, Exception):
+            result = {
+                "identified_name": None,
+                "identified_surname": None,
+                "identified_category": None
+            }
+
+        updates.append({
+            "index": row.Index,
+            "identified_name": result["identified_name"],
+            "identified_surname": result["identified_surname"],
+            "identified_category": result["identified_category"],
+            "llm_annotated": 1
+        })
+
+    print(">> Updating dataset with results...")
+    updates_df = pd.DataFrame(updates).set_index("index")
+    dataset.update(updates_df)
+
+    print(">> Saving updated dataset...")
+    dataset.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
+    print(">> Done.")
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except Exception as e:
+        print(f">> Fatal error: {e}")
diff --git a/processing/annotation/test.py b/processing/annotation/test.py
new file mode 100644
index 0000000..541d730
--- /dev/null
+++ b/processing/annotation/test.py
@@ -0,0 +1,27 @@
+import ollama
+from pydantic import BaseModel
+
+from misc import load_prompt
+
+
+class NameAnalysis(BaseModel):
+    identified_name: str | None
+    identified_surname: str | None
+    identified_category: str | None
+
+
+name = input("Enter name: ")
+
+client = ollama.Client()
+response = client.chat(
+    model="mistral:7b",
+    messages=[
+        {"role": "system", "content": load_prompt()},
+        {"role": "user", "content": name}
+    ],
+    format=NameAnalysis.model_json_schema()
+)
+analysis = NameAnalysis.model_validate_json(response.message.content)
+result = analysis.model_dump()
+
+print(result)
diff --git a/processing/gender/prepare.py b/processing/gender/prepare.py
index 39d7e9f..35873dc 100644
--- a/processing/gender/prepare.py
+++ b/processing/gender/prepare.py
@@ -1,7 +1,5 @@
 import os
-
 import pandas as pd
-
 from misc import DATA_DIR
 
 
@@ -10,48 +8,71 @@ def clean(filepath):
     for enc in encodings:
         try:
             print(f">> Trying to read {filepath} with encoding: {enc}")
-            df = pd.read_csv(filepath, encoding=enc, on_bad_lines='skip')
+            # Use chunked reading to handle large files
+            chunks = pd.read_csv(filepath, encoding=enc, chunksize=100_000, on_bad_lines='skip')
+            cleaned_chunks = []
 
-            print(">> Remove null bytes and non-breaking spaces from all string columns")
-            for col in df.select_dtypes(include=['object']).columns:
-                df[col] = df[col].astype(str).str.replace('\x00', ' ', regex=False)
-                df[col] = df[col].str.replace('\u00a0', ' ', regex=False)
-                df[col] = df[col].str.replace(' +', ' ', regex=True)
+            for chunk in chunks:
+                # Drop rows with essential missing values early
+                chunk = chunk.dropna(subset=['name', 'sex', 'region'])
 
-            print(f">> Successfully read with encoding: {enc}")
-            df = df.dropna(subset=['name', 'sex', 'region'])
+                # Clean string columns in-place
+                for col in chunk.select_dtypes(include='object').columns:
+                    chunk[col] = (
+                        chunk[col]
+                        .astype(str)
+                        .str.replace('\x00', ' ', regex=False)
+                        .str.replace('\u00a0', ' ', regex=False)
+                        .str.replace(' +', ' ', regex=True)
+                    )
+
+                cleaned_chunks.append(chunk)
+
+            df = pd.concat(cleaned_chunks, ignore_index=True)
             df.to_csv(filepath, index=False, encoding='utf-8')
+            print(f">> Successfully read with encoding: {enc}")
             return df
         except Exception:
            continue
 
    raise UnicodeDecodeError(f"Unable to decode {filepath} with common encodings.")
 
 
-def main():
-    df = clean(os.path.join(DATA_DIR, 'names.csv'))
-
+def process(df: pd.DataFrame):
+    print(">> Preprocessing names")
     df['name'] = df['name'].str.strip().str.lower()
-    df['words'] = df['name'].str.split().apply(len)
+
+    df['words'] = df['name'].str.count(' ') + 1
     df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()
-    df['probable_native'] = df['name'].str.split().apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
-    df['probable_surname'] = df['name'].str.split().apply(lambda x: x[-1] if len(x) > 0 else '')
 
-    print(f">> Arranging columns")
-    cols = [c for c in df.columns if c != 'sex'] + ['sex']
-    df = df[cols]
+    name_split = df['name'].str.split()
+    df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
+    df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')
+    df['llm_annotated'] = 0
+
+    return df
+
+
+def split_and_save(df: pd.DataFrame):
+    print(">> Saving evaluation and featured datasets")
+    eval_idx = df.sample(frac=0.2, random_state=42).index
+
+    df_evaluation = df.loc[eval_idx]
+    df_featured = df.drop(index=eval_idx)
-    print(f">> Saving evaluation dataset")
-    df_evaluation = df.sample(frac=0.2, random_state=42)
     df_evaluation.to_csv(os.path.join(DATA_DIR, 'names_evaluation.csv'), index=False)
-
-    print(f">> Saving featured dataset")
-    df_featured = df.drop(df_evaluation.index)
     df_featured.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
 
-    print(f">> Splitting dataset by sex")
+    print(">> Saving by sex")
     df[df['sex'].str.lower() == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
     df[df['sex'].str.lower() == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)
 
 
+def main():
+    filepath = os.path.join(DATA_DIR, 'names.csv')
+    df = clean(filepath)
+    df = process(df)
+    split_and_save(df)
+
+
 if __name__ == '__main__':
     main()
diff --git a/prompt.txt b/prompt.txt
new file mode 100644
index 0000000..dca4352
--- /dev/null
+++ b/prompt.txt
@@ -0,0 +1,31 @@
+## Instructions:
+You are analyzing Congolese full names. For each input, return:
+
+- "identified_name": the native name part of the full name
+- "identified_surname": the French or English part, usually the last part of the full name (it can also be composed of multiple words)
+- "identified_category":
+  - "simple" if the native name has no connector
+  - "compose" if it includes connectors like "wa", "ya", etc.
+
+If you cannot identify a field, return null for that field.
+Do not alter the original name; just identify the parts.
+Do not add any additional information or explanations.
+
+## Example:
+- "tshabu ngandu bernard"
+```json
+{
+  "identified_name": "tshabu ngandu",
+  "identified_surname": "bernard",
+  "identified_category": "simple"
+}
+```
+
+- "ilunga wa ilunga albert"
+```json
+{
+  "identified_name": "ilunga wa ilunga",
+  "identified_surname": "albert",
+  "identified_category": "compose"
+}
+```