From eacbb94a48b7eae4b99cc661a73b3d1f31d7eb66 Mon Sep 17 00:00:00 2001
From: bernard-ng
Date: Fri, 18 Jul 2025 22:49:45 +0200
Subject: [PATCH] experiment: using LLM for initial annotation

---
 README.md                        |  1 +
 misc/__init__.py                 |  6 ++-
 processing/annotation/prepare.py | 72 ++++++++++++++++++++++++++++++++
 processing/annotation/test.py    | 27 ++++++++++++
 processing/gender/prepare.py     | 71 ++++++++++++++++++++-----------
 prompt.txt                       | 31 ++++++++++++++
 6 files changed, 182 insertions(+), 26 deletions(-)
 create mode 100644 processing/annotation/prepare.py
 create mode 100644 processing/annotation/test.py
 create mode 100644 prompt.txt

diff --git a/README.md b/README.md
index ac62e28..c400102 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ pip install -r requirements.txt
 ### 1. Dataset Preparation
 ```bash
 python -m processing.gender.prepare
+python -m processing.annotation.prepare
 ```
 
 ### 2. Training
diff --git a/misc/__init__.py b/misc/__init__.py
index e393613..731f6d9 100644
--- a/misc/__init__.py
+++ b/misc/__init__.py
@@ -3,7 +3,6 @@ import io
 import json
 import os
 import pickle
-from typing import Optional
 from typing import List, Dict
 
 # Paths
@@ -78,3 +77,8 @@ def save_pickle(obj, path):
 def load_pickle(path: str):
     with open(path, "rb") as f:
         return pickle.load(f)
+
+
+def load_prompt() -> str:
+    with open(os.path.join(ROOT_DIR, 'prompt.txt'), 'r') as f:
+        return f.read()
diff --git a/processing/annotation/prepare.py b/processing/annotation/prepare.py
new file mode 100644
index 0000000..4991b68
--- /dev/null
+++ b/processing/annotation/prepare.py
@@ -0,0 +1,72 @@
+import os
+
+import ollama
+import pandas as pd
+from pydantic import BaseModel, ValidationError
+from tqdm import tqdm
+
+from misc import load_prompt, load_csv_dataset, DATA_DIR
+
+
+class NameAnalysis(BaseModel):
+    identified_name: str | None
+    identified_surname: str | None
+    identified_category: str | None
+
+
+def main():
+    dataset = pd.DataFrame(load_csv_dataset('names_featured.csv'))
+    prompt = load_prompt()
+
+    print(">> Filtering dataset for names that need analysis...")
+    to_analyze = dataset[dataset['llm_annotated'] == 0].copy()
+    if to_analyze.empty:
+        print(">> No names to analyze.")
+        return
+
+    client = ollama.Client()
+    updates = []
+
+    print(">> Starting name analysis with LLM...")
+    for row in tqdm(to_analyze.itertuples(index=True), total=len(to_analyze)):
+        name = row.name
+        try:
+            response = client.chat(
+                model="llama3.2:3b",
+                messages=[
+                    {"role": "system", "content": prompt},
+                    {"role": "user", "content": name}
+                ],
+                format=NameAnalysis.model_json_schema()
+            )
+            analysis = NameAnalysis.model_validate_json(response.message.content)
+            result = analysis.model_dump()
+        except (ValidationError, Exception):
+            result = {
+                "identified_name": None,
+                "identified_surname": None,
+                "identified_category": None
+            }
+
+        updates.append({
+            "index": row.Index,
+            "identified_name": result["identified_name"],
+            "identified_surname": result["identified_surname"],
+            "identified_category": result["identified_category"],
+            "llm_annotated": 1
+        })
+
+    print(">> Updating dataset with results...")
+    updates_df = pd.DataFrame(updates).set_index("index")
+    dataset.update(updates_df)
+
+    print(">> Saving updated dataset...")
+    dataset.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
+    print(">> Done.")
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except Exception as e:
+        print(f">> Fatal error: {e}")
diff --git a/processing/annotation/test.py b/processing/annotation/test.py
new file mode 100644
index 0000000..541d730
--- /dev/null
+++ b/processing/annotation/test.py
@@ -0,0 +1,27 @@
+import ollama
+from pydantic import BaseModel
+
+from misc import load_prompt
+
+
+class NameAnalysis(BaseModel):
+    identified_name: str | None
+    identified_surname: str | None
+    identified_category: str | None
+
+
+name = input("Enter name: ")
+
+client = ollama.Client()
+response = client.chat(
+    model="mistral:7b",
+    messages=[
+        {"role": "system", "content": load_prompt()},
+        {"role": "user", "content": name}
+    ],
+    format=NameAnalysis.model_json_schema()
+)
+analysis = NameAnalysis.model_validate_json(response.message.content)
+result = analysis.model_dump()
+
+print(result)
diff --git a/processing/gender/prepare.py b/processing/gender/prepare.py
index 39d7e9f..35873dc 100644
--- a/processing/gender/prepare.py
+++ b/processing/gender/prepare.py
@@ -1,7 +1,5 @@
 import os
-
 import pandas as pd
-
 from misc import DATA_DIR
 
 
@@ -10,48 +8,71 @@ def clean(filepath):
     for enc in encodings:
         try:
             print(f">> Trying to read {filepath} with encoding: {enc}")
-            df = pd.read_csv(filepath, encoding=enc, on_bad_lines='skip')
+            # Use chunked reading to handle large files
+            chunks = pd.read_csv(filepath, encoding=enc, chunksize=100_000, on_bad_lines='skip')
+            cleaned_chunks = []
 
-            print(">> Remove null bytes and non-breaking spaces from all string columns")
-            for col in df.select_dtypes(include=['object']).columns:
-                df[col] = df[col].astype(str).str.replace('\x00', ' ', regex=False)
-                df[col] = df[col].str.replace('\u00a0', ' ', regex=False)
-                df[col] = df[col].str.replace(' +', ' ', regex=True)
+            for chunk in chunks:
+                # Drop rows with essential missing values early
+                chunk = chunk.dropna(subset=['name', 'sex', 'region'])
 
-            print(f">> Successfully read with encoding: {enc}")
-            df = df.dropna(subset=['name', 'sex', 'region'])
+                # Clean string columns in-place
+                for col in chunk.select_dtypes(include='object').columns:
+                    chunk[col] = (
+                        chunk[col]
+                        .astype(str)
+                        .str.replace('\x00', ' ', regex=False)
+                        .str.replace('\u00a0', ' ', regex=False)
+                        .str.replace(' +', ' ', regex=True)
+                    )
+
+                cleaned_chunks.append(chunk)
+
+            df = pd.concat(cleaned_chunks, ignore_index=True)
             df.to_csv(filepath, index=False, encoding='utf-8')
+            print(f">> Successfully read with encoding: {enc}")
             return df
         except Exception:
            continue
 
    raise UnicodeDecodeError(f"Unable to decode {filepath} with common encodings.")
 
 
-def main():
-    df = clean(os.path.join(DATA_DIR, 'names.csv'))
-
+def process(df: pd.DataFrame):
+    print(">> Preprocessing names")
     df['name'] = df['name'].str.strip().str.lower()
-    df['words'] = df['name'].str.split().apply(len)
+
+    df['words'] = df['name'].str.count(' ') + 1
     df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()
-    df['probable_native'] = df['name'].str.split().apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
-    df['probable_surname'] = df['name'].str.split().apply(lambda x: x[-1] if len(x) > 0 else '')
 
-    print(f">> Arranging columns")
-    cols = [c for c in df.columns if c != 'sex'] + ['sex']
-    df = df[cols]
+    name_split = df['name'].str.split()
+    df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
+    df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')
+    df['llm_annotated'] = 0
+
+    return df
+
+
+def split_and_save(df: pd.DataFrame):
+    print(">> Saving evaluation and featured datasets")
+    eval_idx = df.sample(frac=0.2, random_state=42).index
+
+    df_evaluation = df.loc[eval_idx]
+    df_featured = df.drop(index=eval_idx)
-    print(f">> Saving evaluation dataset")
-    df_evaluation = df.sample(frac=0.2, random_state=42)
     df_evaluation.to_csv(os.path.join(DATA_DIR, 'names_evaluation.csv'), index=False)
-
-    print(f">> Saving featured dataset")
-    df_featured = df.drop(df_evaluation.index)
     df_featured.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
 
-    print(f">> Splitting dataset by sex")
+    print(">> Saving by sex")
     df[df['sex'].str.lower() == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
     df[df['sex'].str.lower() == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)
 
 
+def main():
+    filepath = os.path.join(DATA_DIR, 'names.csv')
+    df = clean(filepath)
+    df = process(df)
+    split_and_save(df)
+
+
 if __name__ == '__main__':
     main()
diff --git a/prompt.txt b/prompt.txt
new file mode 100644
index 0000000..dca4352
--- /dev/null
+++ b/prompt.txt
@@ -0,0 +1,31 @@
+## Instructions:
+You are analyzing Congolese full names. For each input, return:
+
+- "identified_name": the native name part of the full name
+- "identified_surname": the French or English part, usually the last part of the full name (it can also be composed of multiple words)
+- "identified_category":
+  - "simple" if the native name has no connector
+  - "compose" if it includes connectors like "wa", "ya", etc.
+
+If you cannot identify a field, return null for that field.
+Do not alter the original name; just identify the parts.
+Do not add any additional information or explanations.
+
+## Example:
+- "tshabu ngandu bernard"
+```json
+{
+  "identified_name": "tshabu ngandu",
+  "identified_surname": "bernard",
+  "identified_category": "simple"
+}
+```
+
+- "ilunga wa ilunga albert"
+```json
+{
+  "identified_name": "ilunga wa ilunga",
+  "identified_surname": "albert",
+  "identified_category": "compose"
+}
+```