experiment: using LLM for initial annotation

This commit is contained in:
2025-07-18 22:49:45 +02:00
parent 78355eb1d1
commit eacbb94a48
6 changed files with 182 additions and 26 deletions
+1
View File
@@ -28,6 +28,7 @@ pip install -r requirements.txt
### 1. Dataset Preparation
```bash
python -m processing.gender.prepare
python -m processing.annotation.prepare
```
### 2. Training
+5 -1
View File
@@ -3,7 +3,6 @@ import io
import json
import os
import pickle
from typing import Optional
from typing import List, Dict
# Paths
@@ -78,3 +77,8 @@ def save_pickle(obj, path):
def load_pickle(path: str):
with open(path, "rb") as f:
return pickle.load(f)
def load_prompt() -> str:
    """Return the contents of the project-level ``prompt.txt``.

    The path is resolved against ROOT_DIR so the prompt is found regardless
    of the current working directory.
    """
    # Explicit UTF-8: the prompt contains example names and may hold
    # non-ASCII characters; the platform default encoding is not guaranteed.
    with open(os.path.join(ROOT_DIR, 'prompt.txt'), 'r', encoding='utf-8') as f:
        return f.read()
+72
View File
@@ -0,0 +1,72 @@
import os
import ollama
import pandas as pd
from pydantic import BaseModel, ValidationError
from tqdm import tqdm
from misc import load_prompt, load_csv_dataset, DATA_DIR
class NameAnalysis(BaseModel):
    """Structured LLM output: the parts identified in a full name.

    Used as a JSON schema constraint for the Ollama response; each field is
    None when the model could not identify that part of the name.
    """
    identified_name: str | None
    identified_surname: str | None
    identified_category: str | None
def main():
    """Annotate unprocessed names with a local LLM and persist the results.

    Reads ``names_featured.csv``, sends every row with ``llm_annotated == 0``
    to an Ollama model constrained to the NameAnalysis JSON schema, and writes
    the identified parts back to the same CSV, marking each processed row
    with ``llm_annotated = 1``.
    """
    dataset = pd.DataFrame(load_csv_dataset('names_featured.csv'))
    prompt = load_prompt()

    print(">> Filtering dataset for names that need analysis...")
    to_analyze = dataset[dataset['llm_annotated'] == 0].copy()
    if to_analyze.empty:
        print(">> No names to analyze.")
        return

    client = ollama.Client()
    updates = []
    print(">> Starting name analysis with LLM...")
    for row in tqdm(to_analyze.itertuples(index=True), total=len(to_analyze)):
        name = row.name
        try:
            response = client.chat(
                model="llama3.2:3b",
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": name}
                ],
                # Constrain the model to emit JSON matching NameAnalysis.
                format=NameAnalysis.model_json_schema()
            )
            analysis = NameAnalysis.model_validate_json(response.message.content)
            result = analysis.model_dump()
        except Exception as e:
            # Exception already subsumes ValidationError, so catching the
            # tuple was redundant. Log which name failed (previously the
            # error was swallowed silently), then fall back to empty fields
            # so the row is still marked as annotated.
            tqdm.write(f">> Failed to analyze '{name}': {e}")
            result = {
                "identified_name": None,
                "identified_surname": None,
                "identified_category": None
            }
        updates.append({
            "index": row.Index,
            "identified_name": result["identified_name"],
            "identified_surname": result["identified_surname"],
            "identified_category": result["identified_category"],
            "llm_annotated": 1
        })

    print(">> Updating dataset with results...")
    updates_df = pd.DataFrame(updates).set_index("index")
    # NOTE: DataFrame.update() skips NA values and only writes columns that
    # already exist in the target, so the identified_* columns (not present
    # in the prepared CSV) and None results would be silently dropped.
    # Assign by label instead so all columns and values land in the dataset.
    for col in updates_df.columns:
        dataset.loc[updates_df.index, col] = updates_df[col]

    print(">> Saving updated dataset...")
    dataset.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
    print(">> Done.")
if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        # Report the failure AND exit non-zero: previously the script
        # printed the message but still exited 0, which hides failures
        # from shells, cron jobs, and CI.
        print(f">> Fatal error: {e}")
        raise SystemExit(1) from e
+27
View File
@@ -0,0 +1,27 @@
import ollama
from pydantic import BaseModel
from misc import load_prompt
class NameAnalysis(BaseModel):
    """Structured LLM output: the parts identified in a full name.

    Used as a JSON schema constraint for the Ollama response; each field is
    None when the model could not identify that part of the name.
    """
    identified_name: str | None
    identified_surname: str | None
    identified_category: str | None
# Interactive one-off check: read a single name from stdin, ask the local
# Ollama model for a schema-constrained analysis, and print the parsed result.
full_name = input("Enter name: ")

client = ollama.Client()
messages = [
    {"role": "system", "content": load_prompt()},
    {"role": "user", "content": full_name},
]
response = client.chat(
    model="mistral:7b",
    messages=messages,
    format=NameAnalysis.model_json_schema(),
)

analysis = NameAnalysis.model_validate_json(response.message.content)
print(analysis.model_dump())
+46 -25
View File
@@ -1,7 +1,5 @@
import os
import pandas as pd
from misc import DATA_DIR
@@ -10,48 +8,71 @@ def clean(filepath):
for enc in encodings:
try:
print(f">> Trying to read {filepath} with encoding: {enc}")
df = pd.read_csv(filepath, encoding=enc, on_bad_lines='skip')
# Use chunked reading to handle large files
chunks = pd.read_csv(filepath, encoding=enc, chunksize=100_000, on_bad_lines='skip')
cleaned_chunks = []
print(">> Remove null bytes and non-breaking spaces from all string columns")
for col in df.select_dtypes(include=['object']).columns:
df[col] = df[col].astype(str).str.replace('\x00', ' ', regex=False)
df[col] = df[col].str.replace('\u00a0', ' ', regex=False)
df[col] = df[col].str.replace(' +', ' ', regex=True)
for chunk in chunks:
# Drop rows with essential missing values early
chunk = chunk.dropna(subset=['name', 'sex', 'region'])
print(f">> Successfully read with encoding: {enc}")
df = df.dropna(subset=['name', 'sex', 'region'])
# Clean string columns in-place
for col in chunk.select_dtypes(include='object').columns:
chunk[col] = (
chunk[col]
.astype(str)
.str.replace('\x00', ' ', regex=False)
.str.replace('\u00a0', ' ', regex=False)
.str.replace(' +', ' ', regex=True)
)
cleaned_chunks.append(chunk)
df = pd.concat(cleaned_chunks, ignore_index=True)
df.to_csv(filepath, index=False, encoding='utf-8')
print(f">> Successfully read with encoding: {enc}")
return df
except Exception:
continue
raise UnicodeDecodeError(f"Unable to decode {filepath} with common encodings.")
def main():
df = clean(os.path.join(DATA_DIR, 'names.csv'))
def process(df: pd.DataFrame):
print(">> Preprocessing names")
df['name'] = df['name'].str.strip().str.lower()
df['words'] = df['name'].str.split().apply(len)
df['words'] = df['name'].str.count(' ') + 1
df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()
df['probable_native'] = df['name'].str.split().apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
df['probable_surname'] = df['name'].str.split().apply(lambda x: x[-1] if len(x) > 0 else '')
print(f">> Arranging columns")
cols = [c for c in df.columns if c != 'sex'] + ['sex']
df = df[cols]
name_split = df['name'].str.split()
df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')
df['llm_annotated'] = 0
return df
def split_and_save(df: pd.DataFrame):
print(">> Saving evaluation and featured datasets")
eval_idx = df.sample(frac=0.2, random_state=42).index
df_evaluation = df.loc[eval_idx]
df_featured = df.drop(index=eval_idx)
print(f">> Saving evaluation dataset")
df_evaluation = df.sample(frac=0.2, random_state=42)
df_evaluation.to_csv(os.path.join(DATA_DIR, 'names_evaluation.csv'), index=False)
print(f">> Saving featured dataset")
df_featured = df.drop(df_evaluation.index)
df_featured.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
print(f">> Splitting dataset by sex")
print(">> Saving by sex")
df[df['sex'].str.lower() == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
df[df['sex'].str.lower() == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)
def main():
    """Run the full preparation pipeline: clean, feature-engineer, split."""
    filepath = os.path.join(DATA_DIR, 'names.csv')
    df = clean(filepath)
    df = process(df)
    split_and_save(df)
if __name__ == '__main__':
    main()
+31
View File
@@ -0,0 +1,31 @@
## Instructions:
You are analyzing Congolese full names. For each input, return:
- "identified_name": the native name part of the full name
- "identified_surname": the French or English part, usually the last part of the full name (it can also consist of multiple words)
- "identified_category":
- "simple" if the native name has no connector
- "compose" if it includes connectors like "wa", "ya", etc.
If you cannot identify a field, return null for that field.
Do not alter the original name; only identify its parts.
Do not add any additional information or explanations.
## Example:
- "tshabu ngandu bernard"
```json
{
"identified_name": "tshabu ngandu",
"identified_surname": "bernard",
"identified_category": "simple"
}
```
- "ilunga wa ilunga albert"
```json
{
"identified_name": "ilunga wa ilunga",
"identified_surname": "albert",
"identified_category": "compose"
}
```