fix: datatype

This commit is contained in:
2025-07-25 10:42:02 +02:00
parent 14fc302b28
commit 19c66fd0ee
3 changed files with 722 additions and 675 deletions
+716 -670
View File
File diff suppressed because one or more lines are too long
+6 -4
View File
@@ -60,15 +60,14 @@ def build_updates(llm_model: str, df: pd.DataFrame, entries: pd.DataFrame) -> pd
entry = analyze_name(client, llm_model, prompt, row["name"]) entry = analyze_name(client, llm_model, prompt, row["name"])
entry["annotated"] = 1 entry["annotated"] = 1
updates.append((row_idx, entry)) updates.append((row_idx, entry))
logging.info(f"Analyzed : {row['name']} - {entry}") logging.info(f"Analyzed: {row['name']} - {entry}")
except Exception as e: except Exception as e:
logging.warning(f"Failed to analyze '{row['name']}': {e}") logging.warning(f"Failed to analyze '{row['name']}': {e}")
continue continue
if idx % BATCH_SIZE == 0 or idx == len(entries): if idx % BATCH_SIZE == 0 or idx == len(entries):
update_df = pd.DataFrame.from_dict(dict(updates), orient="index") update_df = pd.DataFrame.from_dict(dict(updates), orient="index")
update_df = update_df['annotated'].astype('Int8').fillna(0) update_df["annotated"] = pd.to_numeric(update_df["annotated"], errors="coerce").fillna(0).astype("Int8")
df.update(update_df) df.update(update_df)
save_checkpoint(df) save_checkpoint(df)
@@ -80,7 +79,10 @@ def build_updates(llm_model: str, df: pd.DataFrame, entries: pd.DataFrame) -> pd
def main(llm_model: str = "llama3.2:3b"): def main(llm_model: str = "llama3.2:3b"):
df = pd.DataFrame(load_csv_dataset(os.path.join(DATA_DIR, "names_featured.csv"))) df = pd.DataFrame(load_csv_dataset(os.path.join(DATA_DIR, "names_featured.csv")))
entries = df[df["annotated"].astype("Int8") == 0] # Safely cast 'annotated' column to Int8, handling float-like strings (e.g., '1.0')
df["annotated"] = pd.to_numeric(df["annotated"], errors="coerce").fillna(0).astype(float).astype("Int8")
entries = df[df["annotated"] == 0]
if entries.empty: if entries.empty:
logging.info("No names to analyze.") logging.info("No names to analyze.")
return return
-1
View File
@@ -65,7 +65,6 @@ def process(df: pd.DataFrame) -> pd.DataFrame:
df['identified_name'] = None df['identified_name'] = None
df['identified_surname'] = None df['identified_surname'] = None
df['annotated'] = 0 df['annotated'] = 0
df['annotated'] = df['annotated'].astype('Int8')
# We can assume that if a name has exactly 3 words, the first two are the native name and the last is the surname # We can assume that if a name has exactly 3 words, the first two are the native name and the last is the surname
# This is a common pattern in Congolese names # This is a common pattern in Congolese names