fix: datatype

This commit is contained in:
2025-07-25 10:42:02 +02:00
parent 14fc302b28
commit 19c66fd0ee
3 changed files with 722 additions and 675 deletions
+716 -670
View File
File diff suppressed because one or more lines are too long
+5 -3
View File
@@ -65,10 +65,9 @@ def build_updates(llm_model: str, df: pd.DataFrame, entries: pd.DataFrame) -> pd
logging.warning(f"Failed to analyze '{row['name']}': {e}") logging.warning(f"Failed to analyze '{row['name']}': {e}")
continue continue
if idx % BATCH_SIZE == 0 or idx == len(entries): if idx % BATCH_SIZE == 0 or idx == len(entries):
update_df = pd.DataFrame.from_dict(dict(updates), orient="index") update_df = pd.DataFrame.from_dict(dict(updates), orient="index")
update_df = update_df['annotated'].astype('Int8').fillna(0) update_df["annotated"] = pd.to_numeric(update_df["annotated"], errors="coerce").fillna(0).astype("Int8")
df.update(update_df) df.update(update_df)
save_checkpoint(df) save_checkpoint(df)
@@ -80,7 +79,10 @@ def build_updates(llm_model: str, df: pd.DataFrame, entries: pd.DataFrame) -> pd
def main(llm_model: str = "llama3.2:3b"): def main(llm_model: str = "llama3.2:3b"):
df = pd.DataFrame(load_csv_dataset(os.path.join(DATA_DIR, "names_featured.csv"))) df = pd.DataFrame(load_csv_dataset(os.path.join(DATA_DIR, "names_featured.csv")))
entries = df[df["annotated"].astype("Int8") == 0] # Safely cast 'annotated' column to Int8, handling float-like strings (e.g., '1.0')
df["annotated"] = pd.to_numeric(df["annotated"], errors="coerce").fillna(0).astype(float).astype("Int8")
entries = df[df["annotated"] == 0]
if entries.empty: if entries.empty:
logging.info("No names to analyze.") logging.info("No names to analyze.")
return return
-1
View File
@@ -65,7 +65,6 @@ def process(df: pd.DataFrame) -> pd.DataFrame:
df['identified_name'] = None df['identified_name'] = None
df['identified_surname'] = None df['identified_surname'] = None
df['annotated'] = 0 df['annotated'] = 0
df['annotated'] = df['annotated'].astype('Int8')
# We can assume that if a name has exactly 3 words, the first two are the native name and the last is the surname # We can assume that if a name has exactly 3 words, the first two are the native name and the last is the surname
# This is a common pattern in Congolese names # This is a common pattern in Congolese names