experiment: using an LLM for initial annotation

2025-07-18 22:49:45 +02:00
parent 78355eb1d1
commit eacbb94a48
6 changed files with 182 additions and 26 deletions
+72
@@ -0,0 +1,72 @@
import os

import ollama
import pandas as pd
from pydantic import BaseModel
from tqdm import tqdm

from misc import load_prompt, load_csv_dataset, DATA_DIR


class NameAnalysis(BaseModel):
    identified_name: str | None
    identified_surname: str | None
    identified_category: str | None


def main():
    dataset = pd.DataFrame(load_csv_dataset('names_featured.csv'))
    prompt = load_prompt()

    print(">> Filtering dataset for names that need analysis...")
    to_analyze = dataset[dataset['llm_annotated'] == 0].copy()
    if to_analyze.empty:
        print(">> No names to analyze.")
        return

    client = ollama.Client()
    updates = []

    print(">> Starting name analysis with LLM...")
    for row in tqdm(to_analyze.itertuples(index=True), total=len(to_analyze)):
        name = row.name
        try:
            response = client.chat(
                model="llama3.2:3b",
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": name}
                ],
                # Constrain decoding to JSON that matches the Pydantic schema
                format=NameAnalysis.model_json_schema()
            )
            analysis = NameAnalysis.model_validate_json(response.message.content)
            result = analysis.model_dump()
        except Exception:
            # Any failure (invalid JSON, pydantic ValidationError, client error)
            # yields an empty result; the row is still marked as annotated.
            result = {
                "identified_name": None,
                "identified_surname": None,
                "identified_category": None
            }
        updates.append({
            "index": row.Index,
            "identified_name": result["identified_name"],
            "identified_surname": result["identified_surname"],
            "identified_category": result["identified_category"],
            "llm_annotated": 1
        })

    print(">> Updating dataset with results...")
    updates_df = pd.DataFrame(updates).set_index("index")
    # DataFrame.update ignores NaN values in updates_df, so failed rows keep
    # their identified_* columns untouched but still get llm_annotated = 1.
    dataset.update(updates_df)

    print(">> Saving updated dataset...")
    dataset.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
    print(">> Done.")


if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        print(f">> Fatal error: {e}")
+27
@@ -0,0 +1,27 @@
import ollama
from pydantic import BaseModel

from misc import load_prompt


class NameAnalysis(BaseModel):
    identified_name: str | None
    identified_surname: str | None
    identified_category: str | None


name = input("Enter name: ")

client = ollama.Client()
response = client.chat(
    model="mistral:7b",
    messages=[
        {"role": "system", "content": load_prompt()},
        {"role": "user", "content": name}
    ],
    format=NameAnalysis.model_json_schema()
)

analysis = NameAnalysis.model_validate_json(response.message.content)
result = analysis.model_dump()
print(result)
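An illustrative session (the output below is made up; actual fields depend on the model and on whatever `load_prompt()` returns):

Enter name: anna maria kowalska
{'identified_name': 'anna maria', 'identified_surname': 'kowalska', 'identified_category': 'person'}

Because `format=NameAnalysis.model_json_schema()` is passed, Ollama constrains generation to JSON matching the schema, so `model_validate_json` normally succeeds; unlike the batch script, this one-off test deliberately lets any validation error propagate.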
+46 -25
@@ -1,7 +1,5 @@
 import os
 import pandas as pd
 from misc import DATA_DIR
@@ -10,48 +8,71 @@ def clean(filepath):
     for enc in encodings:
         try:
             print(f">> Trying to read {filepath} with encoding: {enc}")
-            df = pd.read_csv(filepath, encoding=enc, on_bad_lines='skip')
+            # Use chunked reading to handle large files
+            chunks = pd.read_csv(filepath, encoding=enc, chunksize=100_000, on_bad_lines='skip')
+            cleaned_chunks = []
-            print(">> Remove null bytes and non-breaking spaces from all string columns")
-            for col in df.select_dtypes(include=['object']).columns:
-                df[col] = df[col].astype(str).str.replace('\x00', ' ', regex=False)
-                df[col] = df[col].str.replace('\u00a0', ' ', regex=False)
-                df[col] = df[col].str.replace(' +', ' ', regex=True)
+            for chunk in chunks:
+                # Drop rows with essential missing values early
+                chunk = chunk.dropna(subset=['name', 'sex', 'region'])
-            print(f">> Successfully read with encoding: {enc}")
-            df = df.dropna(subset=['name', 'sex', 'region'])
+                # Clean string columns in-place
+                for col in chunk.select_dtypes(include='object').columns:
+                    chunk[col] = (
+                        chunk[col]
+                        .astype(str)
+                        .str.replace('\x00', ' ', regex=False)
+                        .str.replace('\u00a0', ' ', regex=False)
+                        .str.replace(' +', ' ', regex=True)
+                    )
+                cleaned_chunks.append(chunk)
+            df = pd.concat(cleaned_chunks, ignore_index=True)
             df.to_csv(filepath, index=False, encoding='utf-8')
+            print(f">> Successfully read with encoding: {enc}")
             return df
         except Exception:
             continue
     # UnicodeDecodeError requires five positional arguments, so raise ValueError
     raise ValueError(f"Unable to decode {filepath} with common encodings.")
-def main():
-    df = clean(os.path.join(DATA_DIR, 'names.csv'))
+def process(df: pd.DataFrame):
     print(">> Preprocessing names")
     df['name'] = df['name'].str.strip().str.lower()
-    df['words'] = df['name'].str.split().apply(len)
+    df['words'] = df['name'].str.count(' ') + 1
     df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()
-    df['probable_native'] = df['name'].str.split().apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
-    df['probable_surname'] = df['name'].str.split().apply(lambda x: x[-1] if len(x) > 0 else '')
-    print(f">> Arranging columns")
-    cols = [c for c in df.columns if c != 'sex'] + ['sex']
-    df = df[cols]
+    name_split = df['name'].str.split()
+    df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')
+    df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')
+    df['llm_annotated'] = 0
+    return df
+
+
+def split_and_save(df: pd.DataFrame):
+    print(">> Saving evaluation and featured datasets")
+    eval_idx = df.sample(frac=0.2, random_state=42).index
+    df_evaluation = df.loc[eval_idx]
+    df_featured = df.drop(index=eval_idx)
-    print(f">> Saving evaluation dataset")
-    df_evaluation = df.sample(frac=0.2, random_state=42)
     df_evaluation.to_csv(os.path.join(DATA_DIR, 'names_evaluation.csv'), index=False)
-    print(f">> Saving featured dataset")
-    df_featured = df.drop(df_evaluation.index)
     df_featured.to_csv(os.path.join(DATA_DIR, 'names_featured.csv'), index=False)
-    print(f">> Splitting dataset by sex")
+    print(">> Saving by sex")
     df[df['sex'].str.lower() == 'm'].to_csv(os.path.join(DATA_DIR, 'names_males.csv'), index=False)
     df[df['sex'].str.lower() == 'f'].to_csv(os.path.join(DATA_DIR, 'names_females.csv'), index=False)
+
+
+def main():
+    filepath = os.path.join(DATA_DIR, 'names.csv')
+    df = clean(filepath)
+    df = process(df)
+    split_and_save(df)
+
+
 if __name__ == '__main__':
     main()
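As a quick sanity check of the new feature logic (a sketch, not part of the commit; note that `str.count(' ') + 1` only equals the word count because `clean()` has already collapsed repeated spaces):

import pandas as pd

df = pd.DataFrame({'name': ['  Ana Maria Silva ', 'Madonna']})
df['name'] = df['name'].str.strip().str.lower()
df['words'] = df['name'].str.count(' ') + 1  # -> 3, 1
df['length'] = df['name'].str.replace(' ', '', regex=False).str.len()  # -> 13, 7
name_split = df['name'].str.split()
df['probable_native'] = name_split.apply(lambda x: ' '.join(x[:-1]) if len(x) > 1 else '')  # -> 'ana maria', ''
df['probable_surname'] = name_split.apply(lambda x: x[-1] if x else '')  # -> 'silva', 'madonna'
print(df)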