experiment: using LLM for initial annotation
This commit is contained in:
@@ -28,6 +28,7 @@ pip install -r requirements.txt
|
|||||||
### 1. Dataset Preparation
|
### 1. Dataset Preparation
|
||||||
```bash
|
```bash
|
||||||
python -m processing.gender.prepare
|
python -m processing.gender.prepare
|
||||||
|
python -m processing.annotation.prepare
|
||||||
```
|
```
|
||||||
|
|
||||||
### 2. Training
|
### 2. Training
|
||||||
|
|||||||
+5
-1
@@ -3,7 +3,6 @@ import io
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
from typing import Optional
|
|
||||||
from typing import List, Dict
|
from typing import List, Dict
|
||||||
|
|
||||||
# Paths
|
# Paths
|
||||||
@@ -78,3 +77,8 @@ def save_pickle(obj, path):
|
|||||||
def load_pickle(path: str):
    """Deserialize and return the object stored in the pickle file at *path*.

    NOTE(review): ``pickle.load`` executes arbitrary code when fed untrusted
    data — only call this on files this project wrote itself.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)
|
||||||
|
|
||||||
|
|
||||||
|
def load_prompt() -> str:
    """Read and return the LLM system prompt stored at <ROOT_DIR>/prompt.txt.

    Fix: the file is now opened with an explicit UTF-8 encoding so the prompt
    decodes identically on every platform; the original call relied on the
    locale's default encoding, which can garble non-ASCII prompt text.
    NOTE(review): ``ROOT_DIR`` is defined elsewhere in this module.
    """
    with open(os.path.join(ROOT_DIR, 'prompt.txt'), 'r', encoding='utf-8') as f:
        return f.read()
|
||||||
|
|||||||
@@ -0,0 +1,72 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
import ollama
|
||||||
|
import pandas as pd
|
||||||
|
from pydantic import BaseModel, ValidationError
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from misc import load_prompt, load_csv_dataset, DATA_DIR
|
||||||
|
|
||||||
|
|
||||||
|
class NameAnalysis(BaseModel):
    """Structured response schema the LLM must fill for one full name.

    Every field is ``None`` when the model could not identify that part.
    """

    identified_name: str | None        # native part of the full name
    identified_surname: str | None     # French/English part, usually last
    identified_category: str | None    # "simple" or "compose" (see prompt.txt)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Annotate unprocessed rows of names_featured.csv with a local LLM.

    Loads the featured dataset, sends every row whose ``llm_annotated`` flag
    is 0 to a local Ollama model, validates the model's JSON reply against
    ``NameAnalysis``, merges the results back, and rewrites the CSV.
    """
    dataset = pd.DataFrame(load_csv_dataset('names_featured.csv'))
    prompt = load_prompt()

    print(">> Filtering dataset for names that need analysis...")
    to_analyze = dataset[dataset['llm_annotated'] == 0].copy()
    if to_analyze.empty:
        print(">> No names to analyze.")
        return

    client = ollama.Client()
    updates = []

    print(">> Starting name analysis with LLM...")
    for row in tqdm(to_analyze.itertuples(index=True), total=len(to_analyze)):
        name = row.name
        try:
            response = client.chat(
                model="llama3.2:3b",
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": name}
                ],
                # Constrain the model's output to the NameAnalysis JSON schema
                format=NameAnalysis.model_json_schema()
            )
            analysis = NameAnalysis.model_validate_json(response.message.content)
            result = analysis.model_dump()
        except Exception as exc:
            # Fix: was `except (ValidationError, Exception)` — the tuple was
            # redundant (Exception already covers ValidationError) and the
            # failure was swallowed silently. Keep the best-effort null
            # default, but surface the cause so bad rows can be diagnosed.
            print(f">> Failed to analyze {name!r}: {exc}")
            result = {
                "identified_name": None,
                "identified_surname": None,
                "identified_category": None
            }

        updates.append({
            "index": row.Index,
            "identified_name": result["identified_name"],
            "identified_surname": result["identified_surname"],
            "identified_category": result["identified_category"],
            "llm_annotated": 1
        })

    print(">> Updating dataset with results...")
    updates_df = pd.DataFrame(updates).set_index("index")
    # NOTE(review): DataFrame.update skips NaN values, so None results leave
    # the original cells untouched while llm_annotated is still set to 1 —
    # confirm that is the intended behavior for failed rows.
    dataset.update(updates_df)

    print(">> Saving updated dataset...")
    dataset.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
    print(">> Done.")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the annotation pass, reporting any fatal error
# as a single line instead of dumping a traceback.
if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        print(f">> Fatal error: {e}")
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
import ollama
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from misc import load_prompt
|
||||||
|
|
||||||
|
|
||||||
|
class NameAnalysis(BaseModel):
    """Expected structured output for a single analyzed full name.

    Fields are ``None`` when the model cannot identify that part.
    """

    identified_name: str | None        # native part of the full name
    identified_surname: str | None     # French/English part, usually last
    identified_category: str | None    # "simple" or "compose" (see prompt.txt)
|
||||||
|
|
||||||
|
|
||||||
|
# Interactive smoke test: send one manually-entered name to the local
# Ollama model and print the structured analysis it returns.
name = input("Enter name: ")

client = ollama.Client()
response = client.chat(
    model="mistral:7b",
    messages=[
        {"role": "system", "content": load_prompt()},
        {"role": "user", "content": name}
    ],
    # Constrain the model's reply to the NameAnalysis JSON schema
    format=NameAnalysis.model_json_schema()
)

# Validate the raw JSON reply, then dump it to a plain dict for display.
analysis = NameAnalysis.model_validate_json(response.message.content)
result = analysis.model_dump()

print(result)
|
||||||
@@ -1,7 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from misc import DATA_DIR
|
from misc import DATA_DIR
|
||||||
|
|
||||||
|
|
||||||
def clean(filepath):
    """Read *filepath* as CSV, trying common encodings, and clean it in chunks.

    For the first encoding that succeeds: the file is read in 100k-row chunks
    (keeps memory bounded on large files), rows missing 'name'/'sex'/'region'
    are dropped, null bytes and non-breaking spaces are scrubbed from string
    columns, repeated spaces are collapsed, and the cleaned data is written
    back to *filepath* as UTF-8.

    Returns the cleaned DataFrame; raises UnicodeError when every candidate
    encoding fails.

    NOTE(review): ``encodings`` is a module-level sequence defined above this
    function (outside this hunk) — confirm its contents there.
    """
    for enc in encodings:
        try:
            print(f">> Trying to read {filepath} with encoding: {enc}")
            # Use chunked reading to handle large files
            chunks = pd.read_csv(filepath, encoding=enc, chunksize=100_000, on_bad_lines='skip')
            cleaned_chunks = []

            for chunk in chunks:
                # Drop rows with essential missing values early
                chunk = chunk.dropna(subset=['name', 'sex', 'region'])

                # Clean string columns in-place
                for col in chunk.select_dtypes(include='object').columns:
                    chunk[col] = (
                        chunk[col]
                        .astype(str)
                        .str.replace('\x00', ' ', regex=False)
                        .str.replace('\u00a0', ' ', regex=False)
                        .str.replace(' +', ' ', regex=True)
                    )

                cleaned_chunks.append(chunk)

            df = pd.concat(cleaned_chunks, ignore_index=True)
            df.to_csv(filepath, index=False, encoding='utf-8')
            print(f">> Successfully read with encoding: {enc}")
            return df
        except Exception:
            # Deliberately broad: any decode/parse failure means "try the
            # next encoding". NOTE(review): this also hides non-encoding
            # errors (e.g. a missing 'name' column) — consider narrowing.
            continue
    # Bug fix: UnicodeDecodeError requires 5 positional arguments
    # (encoding, object, start, end, reason), so constructing it with a
    # single message raised TypeError at this raise site. UnicodeError (its
    # parent) accepts a plain message and is still a unicode-related error.
    raise UnicodeError(f"Unable to decode {filepath} with common encodings.")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def process(df: pd.DataFrame):
    """Derive per-name features on *df* in place and return it.

    Adds: 'words' (token count), 'length' (characters excluding spaces),
    'probable_native' (everything but the last token), 'probable_surname'
    (the last token), and an 'llm_annotated' flag initialized to 0.
    """
    print(">> Preprocessing names")

    df['name'] = df['name'].str.strip().str.lower()

    # Split once and reuse the result for every token-derived feature.
    name_split = df['name'].str.split()

    # Bug fix: the previous `str.count(' ') + 1` over-counted words when a
    # name contained consecutive spaces; the token count of the whitespace
    # split is correct regardless of spacing.
    df['words'] = name_split.str.len()
    df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()

    df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
    df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')

    # Mark every row as not yet annotated by the LLM pass.
    df['llm_annotated'] = 0

    return df
|
||||||
|
|
||||||
|
|
||||||
|
def split_and_save(df: pd.DataFrame):
    """Persist the dataset splits under DATA_DIR.

    Writes a reproducible 80/20 featured/evaluation split (the sample is
    seeded) plus one CSV per sex.
    """
    print(">> Saving evaluation and featured datasets")
    # Seeded sample -> identical split on every run.
    eval_idx = df.sample(frac=0.2, random_state=42).index

    splits = {
        'names_evaluation.csv': df.loc[eval_idx],
        'names_featured.csv': df.drop(index=eval_idx),
    }
    for filename, frame in splits.items():
        frame.to_csv(os.path.join(DATA_DIR, filename), index=False)

    print(">> Saving by sex")
    sex = df['sex'].str.lower()
    df[sex == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
    df[sex == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Full preparation pipeline: clean -> feature-engineer -> split/save."""
    source = os.path.join(DATA_DIR, 'names.csv')
    cleaned = clean(source)
    featured = process(cleaned)
    split_and_save(featured)
|
||||||
|
|
||||||
|
|
||||||
# Run the preparation pipeline when executed as a script.
if __name__ == '__main__':
    main()
|
||||||
|
|||||||
+31
@@ -0,0 +1,31 @@
|
|||||||
|
## Instructions:
|
||||||
|
You are analyzing Congolese full names. For each input, return:
|
||||||
|
|
||||||
|
- "identified_name": the native name part of the full name
|
||||||
|
- "identified_surname": the French or English part of the full name, usually the last word (it may consist of multiple words)
|
||||||
|
- "identified_category":
|
||||||
|
- "simple" if the native name has no connector
|
||||||
|
- "compose" if it includes connectors like "wa", "ya", etc.
|
||||||
|
|
||||||
|
If you cannot identify a field, return null for that field.
Do not alter the original name; only identify its parts.
Do not add any additional information or explanations.
|
||||||
|
|
||||||
|
## Example:
|
||||||
|
- "tshabu ngandu bernard"
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"identified_name": "tshabu ngandu",
|
||||||
|
"identified_surname": "bernard",
|
||||||
|
"identified_category": "simple"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
- "ilunga wa ilunga albert"
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"identified_name": "ilunga wa ilunga",
|
||||||
|
"identified_surname": "albert",
|
||||||
|
"identified_category": "compose"
|
||||||
|
}
|
||||||
|
```
|
||||||
Reference in New Issue
Block a user