experiment: using LLM for initial annotation
This commit is contained in:
@@ -28,6 +28,7 @@ pip install -r requirements.txt
|
|||||||
### 1. Dataset Preparation
|
### 1. Dataset Preparation
|
||||||
```bash
|
```bash
|
||||||
python -m processing.gender.prepare
|
python -m processing.gender.prepare
|
||||||
|
python -m processing.annotation.prepare
|
||||||
```
|
```
|
||||||
|
|
||||||
### 2. Training
|
### 2. Training
|
||||||
|
|||||||
+5
-1
@@ -3,7 +3,6 @@ import io
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
from typing import Optional
|
|
||||||
from typing import List, Dict
|
from typing import List, Dict
|
||||||
|
|
||||||
# Paths
|
# Paths
|
||||||
@@ -78,3 +77,8 @@ def save_pickle(obj, path):
|
|||||||
def load_pickle(path: str):
    """Deserialize and return the object stored in the pickle file at *path*.

    NOTE(review): ``pickle.load`` executes arbitrary code when fed untrusted
    data — only call this on files this project wrote itself.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)
|
||||||
|
|
||||||
|
|
||||||
|
def load_prompt() -> str:
    """Read and return the LLM system prompt stored at <ROOT_DIR>/prompt.txt.

    Fix: the file is now opened with an explicit UTF-8 encoding so the prompt
    decodes identically on every platform; the original call relied on the
    locale's default encoding, which can garble non-ASCII prompt text.
    NOTE(review): ``ROOT_DIR`` is defined elsewhere in this module.
    """
    with open(os.path.join(ROOT_DIR, 'prompt.txt'), 'r', encoding='utf-8') as f:
        return f.read()
|
||||||
|
|||||||
@@ -0,0 +1,72 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
import ollama
|
||||||
|
import pandas as pd
|
||||||
|
from pydantic import BaseModel, ValidationError
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from misc import load_prompt, load_csv_dataset, DATA_DIR
|
||||||
|
|
||||||
|
|
||||||
|
class NameAnalysis(BaseModel):
    """Structured response schema the LLM must fill for one full name.

    Every field is ``None`` when the model could not identify that part.
    """

    identified_name: str | None        # native part of the full name
    identified_surname: str | None     # French/English part, usually last
    identified_category: str | None    # "simple" or "compose" (see prompt.txt)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Annotate unprocessed rows of names_featured.csv with a local LLM.

    Loads the featured dataset, sends every row whose ``llm_annotated`` flag
    is 0 to a local Ollama model, validates the model's JSON reply against
    ``NameAnalysis``, merges the results back, and rewrites the CSV.
    """
    dataset = pd.DataFrame(load_csv_dataset('names_featured.csv'))
    prompt = load_prompt()

    print(">> Filtering dataset for names that need analysis...")
    to_analyze = dataset[dataset['llm_annotated'] == 0].copy()
    if to_analyze.empty:
        print(">> No names to analyze.")
        return

    client = ollama.Client()
    updates = []

    print(">> Starting name analysis with LLM...")
    for row in tqdm(to_analyze.itertuples(index=True), total=len(to_analyze)):
        name = row.name
        try:
            response = client.chat(
                model="llama3.2:3b",
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": name}
                ],
                # Constrain the model's output to the NameAnalysis JSON schema
                format=NameAnalysis.model_json_schema()
            )
            analysis = NameAnalysis.model_validate_json(response.message.content)
            result = analysis.model_dump()
        except Exception as exc:
            # Fix: was `except (ValidationError, Exception)` — the tuple was
            # redundant (Exception already covers ValidationError) and the
            # failure was swallowed silently. Keep the best-effort null
            # default, but surface the cause so bad rows can be diagnosed.
            print(f">> Failed to analyze {name!r}: {exc}")
            result = {
                "identified_name": None,
                "identified_surname": None,
                "identified_category": None
            }

        updates.append({
            "index": row.Index,
            "identified_name": result["identified_name"],
            "identified_surname": result["identified_surname"],
            "identified_category": result["identified_category"],
            "llm_annotated": 1
        })

    print(">> Updating dataset with results...")
    updates_df = pd.DataFrame(updates).set_index("index")
    # NOTE(review): DataFrame.update skips NaN values, so None results leave
    # the original cells untouched while llm_annotated is still set to 1 —
    # confirm that is the intended behavior for failed rows.
    dataset.update(updates_df)

    print(">> Saving updated dataset...")
    dataset.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
    print(">> Done.")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the annotation pass, reporting any fatal error
# as a single line instead of dumping a traceback.
if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        print(f">> Fatal error: {e}")
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
import ollama
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from misc import load_prompt
|
||||||
|
|
||||||
|
|
||||||
|
class NameAnalysis(BaseModel):
    """Expected structured output for a single analyzed full name.

    Fields are ``None`` when the model cannot identify that part.
    """

    identified_name: str | None        # native part of the full name
    identified_surname: str | None     # French/English part, usually last
    identified_category: str | None    # "simple" or "compose" (see prompt.txt)
|
||||||
|
|
||||||
|
|
||||||
|
# Interactive smoke test: send one manually-entered name to the local
# Ollama model and print the structured analysis it returns.
name = input("Enter name: ")

client = ollama.Client()
response = client.chat(
    model="mistral:7b",
    messages=[
        {"role": "system", "content": load_prompt()},
        {"role": "user", "content": name}
    ],
    # Constrain the model's reply to the NameAnalysis JSON schema
    format=NameAnalysis.model_json_schema()
)

# Validate the raw JSON reply, then dump it to a plain dict for display.
analysis = NameAnalysis.model_validate_json(response.message.content)
result = analysis.model_dump()

print(result)
|
||||||
@@ -1,7 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from misc import DATA_DIR
|
from misc import DATA_DIR
|
||||||
|
|
||||||
|
|
||||||
def clean(filepath):
    """Read *filepath* as CSV, trying common encodings, and clean it in chunks.

    For the first encoding that succeeds: the file is read in 100k-row chunks
    (keeps memory bounded on large files), rows missing 'name'/'sex'/'region'
    are dropped, null bytes and non-breaking spaces are scrubbed from string
    columns, repeated spaces are collapsed, and the cleaned data is written
    back to *filepath* as UTF-8.

    Returns the cleaned DataFrame; raises UnicodeError when every candidate
    encoding fails.

    NOTE(review): ``encodings`` is a module-level sequence defined above this
    function (outside this hunk) — confirm its contents there.
    """
    for enc in encodings:
        try:
            print(f">> Trying to read {filepath} with encoding: {enc}")
            # Use chunked reading to handle large files
            chunks = pd.read_csv(filepath, encoding=enc, chunksize=100_000, on_bad_lines='skip')
            cleaned_chunks = []

            for chunk in chunks:
                # Drop rows with essential missing values early
                chunk = chunk.dropna(subset=['name', 'sex', 'region'])

                # Clean string columns in-place
                for col in chunk.select_dtypes(include='object').columns:
                    chunk[col] = (
                        chunk[col]
                        .astype(str)
                        .str.replace('\x00', ' ', regex=False)
                        .str.replace('\u00a0', ' ', regex=False)
                        .str.replace(' +', ' ', regex=True)
                    )

                cleaned_chunks.append(chunk)

            df = pd.concat(cleaned_chunks, ignore_index=True)
            df.to_csv(filepath, index=False, encoding='utf-8')
            print(f">> Successfully read with encoding: {enc}")
            return df
        except Exception:
            # Deliberately broad: any decode/parse failure means "try the
            # next encoding". NOTE(review): this also hides non-encoding
            # errors (e.g. a missing 'name' column) — consider narrowing.
            continue
    # Bug fix: UnicodeDecodeError requires 5 positional arguments
    # (encoding, object, start, end, reason), so constructing it with a
    # single message raised TypeError at this raise site. UnicodeError (its
    # parent) accepts a plain message and is still a unicode-related error.
    raise UnicodeError(f"Unable to decode {filepath} with common encodings.")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def process(df: pd.DataFrame):
    """Derive per-name features on *df* in place and return it.

    Adds: 'words' (token count), 'length' (characters excluding spaces),
    'probable_native' (everything but the last token), 'probable_surname'
    (the last token), and an 'llm_annotated' flag initialized to 0.
    """
    print(">> Preprocessing names")

    df['name'] = df['name'].str.strip().str.lower()

    # Split once and reuse the result for every token-derived feature.
    name_split = df['name'].str.split()

    # Bug fix: the previous `str.count(' ') + 1` over-counted words when a
    # name contained consecutive spaces; the token count of the whitespace
    # split is correct regardless of spacing.
    df['words'] = name_split.str.len()
    df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()

    df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
    df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')

    # Mark every row as not yet annotated by the LLM pass.
    df['llm_annotated'] = 0

    return df
|
||||||
|
|
||||||
|
|
||||||
|
def split_and_save(df: pd.DataFrame):
    """Persist the dataset splits under DATA_DIR.

    Writes a reproducible 80/20 featured/evaluation split (the sample is
    seeded) plus one CSV per sex.
    """
    print(">> Saving evaluation and featured datasets")
    # Seeded sample -> identical split on every run.
    eval_idx = df.sample(frac=0.2, random_state=42).index

    splits = {
        'names_evaluation.csv': df.loc[eval_idx],
        'names_featured.csv': df.drop(index=eval_idx),
    }
    for filename, frame in splits.items():
        frame.to_csv(os.path.join(DATA_DIR, filename), index=False)

    print(">> Saving by sex")
    sex = df['sex'].str.lower()
    df[sex == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
    df[sex == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Full preparation pipeline: clean -> feature-engineer -> split/save."""
    source = os.path.join(DATA_DIR, 'names.csv')
    cleaned = clean(source)
    featured = process(cleaned)
    split_and_save(featured)
|
||||||
|
|
||||||
|
|
||||||
# Run the preparation pipeline when executed as a script.
if __name__ == '__main__':
    main()
|
||||||
|
|||||||
+31
@@ -0,0 +1,31 @@
|
|||||||
|
## Instructions:
|
||||||
|
You are analyzing Congolese full names. For each input, return:
|
||||||
|
|
||||||
|
- "identified_name": the native name part of the full name
|
||||||
|
- "identified_surname": the French or English part of the full name, usually the last word (it may consist of multiple words)
|
||||||
|
- "identified_category":
|
||||||
|
- "simple" if the native name has no connector
|
||||||
|
- "compose" if it includes connectors like "wa", "ya", etc.
|
||||||
|
|
||||||
|
If you cannot identify a field, return null for that field.
Do not alter the original name; only identify its parts.
Do not add any additional information or explanations.
|
||||||
|
|
||||||
|
## Example:
|
||||||
|
- "tshabu ngandu bernard"
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"identified_name": "tshabu ngandu",
|
||||||
|
"identified_surname": "bernard",
|
||||||
|
"identified_category": "simple"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
- "ilunga wa ilunga albert"
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"identified_name": "ilunga wa ilunga",
|
||||||
|
"identified_surname": "albert",
|
||||||
|
"identified_category": "compose"
|
||||||
|
}
|
||||||
|
```
|
||||||
Reference in New Issue
Block a user