fix: datatype
This commit is contained in:
+716
-670
File diff suppressed because one or more lines are too long
@@ -60,15 +60,14 @@ def build_updates(llm_model: str, df: pd.DataFrame, entries: pd.DataFrame) -> pd
|
|||||||
entry = analyze_name(client, llm_model, prompt, row["name"])
|
entry = analyze_name(client, llm_model, prompt, row["name"])
|
||||||
entry["annotated"] = 1
|
entry["annotated"] = 1
|
||||||
updates.append((row_idx, entry))
|
updates.append((row_idx, entry))
|
||||||
logging.info(f"Analyzed : {row['name']} - {entry}")
|
logging.info(f"Analyzed: {row['name']} - {entry}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Failed to analyze '{row['name']}': {e}")
|
logging.warning(f"Failed to analyze '{row['name']}': {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
||||||
if idx % BATCH_SIZE == 0 or idx == len(entries):
|
if idx % BATCH_SIZE == 0 or idx == len(entries):
|
||||||
update_df = pd.DataFrame.from_dict(dict(updates), orient="index")
|
update_df = pd.DataFrame.from_dict(dict(updates), orient="index")
|
||||||
update_df = update_df['annotated'].astype('Int8').fillna(0)
|
update_df["annotated"] = pd.to_numeric(update_df["annotated"], errors="coerce").fillna(0).astype("Int8")
|
||||||
|
|
||||||
df.update(update_df)
|
df.update(update_df)
|
||||||
save_checkpoint(df)
|
save_checkpoint(df)
|
||||||
@@ -80,7 +79,10 @@ def build_updates(llm_model: str, df: pd.DataFrame, entries: pd.DataFrame) -> pd
|
|||||||
def main(llm_model: str = "llama3.2:3b"):
|
def main(llm_model: str = "llama3.2:3b"):
|
||||||
df = pd.DataFrame(load_csv_dataset(os.path.join(DATA_DIR, "names_featured.csv")))
|
df = pd.DataFrame(load_csv_dataset(os.path.join(DATA_DIR, "names_featured.csv")))
|
||||||
|
|
||||||
entries = df[df["annotated"].astype("Int8") == 0]
|
# Safely cast 'annotated' column to Int8, handling float-like strings (e.g., '1.0')
|
||||||
|
df["annotated"] = pd.to_numeric(df["annotated"], errors="coerce").fillna(0).astype(float).astype("Int8")
|
||||||
|
|
||||||
|
entries = df[df["annotated"] == 0]
|
||||||
if entries.empty:
|
if entries.empty:
|
||||||
logging.info("No names to analyze.")
|
logging.info("No names to analyze.")
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -65,7 +65,6 @@ def process(df: pd.DataFrame) -> pd.DataFrame:
|
|||||||
df['identified_name'] = None
|
df['identified_name'] = None
|
||||||
df['identified_surname'] = None
|
df['identified_surname'] = None
|
||||||
df['annotated'] = 0
|
df['annotated'] = 0
|
||||||
df['annotated'] = df['annotated'].astype('Int8')
|
|
||||||
|
|
||||||
# We can assume that if a name has exactly 3 words, the first two are the native name and the last is the surname
|
# We can assume that if a name has exactly 3 words, the first two are the native name and the last is the surname
|
||||||
# This is a common pattern in Congolese names
|
# This is a common pattern in Congolese names
|
||||||
|
|||||||
Reference in New Issue
Block a user