fix: datatype

2025-07-25 10:42:02 +02:00
parent 14fc302b28
commit 19c66fd0ee
3 changed files with 722 additions and 675 deletions
@@ -60,15 +60,14 @@ def build_updates(llm_model: str, df: pd.DataFrame, entries: pd.DataFrame) -> pd
            entry = analyze_name(client, llm_model, prompt, row["name"])
            entry["annotated"] = 1
            updates.append((row_idx, entry))
-            logging.info(f"Analyzed : {row['name']} - {entry}")
+            logging.info(f"Analyzed: {row['name']} - {entry}")
        except Exception as e:
            logging.warning(f"Failed to analyze '{row['name']}': {e}")
            continue
        if idx % BATCH_SIZE == 0 or idx == len(entries):
            update_df = pd.DataFrame.from_dict(dict(updates), orient="index")
-            update_df = update_df['annotated'].astype('Int8').fillna(0)
+            update_df["annotated"] = pd.to_numeric(update_df["annotated"], errors="coerce").fillna(0).astype("Int8")
            df.update(update_df)
            save_checkpoint(df)
@@ -80,7 +79,10 @@ def build_updates(llm_model: str, df: pd.DataFrame, entries: pd.DataFrame) -> pd
 def main(llm_model: str = "llama3.2:3b"):
    df = pd.DataFrame(load_csv_dataset(os.path.join(DATA_DIR, "names_featured.csv")))
-    entries = df[df["annotated"].astype("Int8") == 0]
+    # Safely cast 'annotated' column to Int8, handling float-like strings (e.g., '1.0')
    df["annotated"] = pd.to_numeric(df["annotated"], errors="coerce").fillna(0).astype(float).astype("Int8")
    entries = df[df["annotated"] == 0]
    if entries.empty:
        logging.info("No names to analyze.")
        return
@@ -65,7 +65,6 @@ def process(df: pd.DataFrame) -> pd.DataFrame:
    df['identified_name'] = None
    df['identified_surname'] = None
    df['annotated'] = 0
    df['annotated'] = df['annotated'].astype('Int8')
    # We can assume that if a name has exactly 3 words, the first two are the native name and the last is the surname
    # This is a common pattern in Congolese names