fix: nn models pad_sequences

This commit is contained in:
2025-10-06 00:37:29 +02:00
parent cb22c06628
commit d3b3840278
7 changed files with 211 additions and 92 deletions
+6 -1
View File
@@ -23,6 +23,7 @@ class BiGRUModel(NeuralNetworkModel):
input_dim=vocab_size, input_dim=vocab_size,
output_dim=params.get("embedding_dim", 64), output_dim=params.get("embedding_dim", 64),
mask_zero=True, mask_zero=True,
input_length=params.get("max_len", 6),
), ),
# First recurrent block returns full sequences to allow stacking. # First recurrent block returns full sequences to allow stacking.
# Moderate dropout + optional recurrent_dropout to reduce overfitting # Moderate dropout + optional recurrent_dropout to reduce overfitting
@@ -69,4 +70,8 @@ class BiGRUModel(NeuralNetworkModel):
sequences = self.tokenizer.texts_to_sequences(text_data) sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6) max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post") # Ensure padding and truncation are applied on the right to keep
# contiguous non-zero tokens on the left, matching RNN mask expectations.
return pad_sequences(
sequences, maxlen=max_len, padding="post", truncating="post"
)
+4 -1
View File
@@ -83,4 +83,7 @@ class CNNModel(NeuralNetworkModel):
"max_len", 20 "max_len", 20
) # Longer for character level ) # Longer for character level
return pad_sequences(sequences, maxlen=max_len, padding="post") # Right-side padding and truncation ensure contiguous non-zero tokens on the left
return pad_sequences(
sequences, maxlen=max_len, padding="post", truncating="post"
)
+4 -1
View File
@@ -68,4 +68,7 @@ class LSTMModel(NeuralNetworkModel):
sequences = self.tokenizer.texts_to_sequences(text_data) sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6) max_len = self.config.model_params.get("max_len", 6)
return pad_sequences(sequences, maxlen=max_len, padding="post") # Right-side padding and truncation to preserve contiguous non-zero tokens
return pad_sequences(
sequences, maxlen=max_len, padding="post", truncating="post"
)
@@ -88,4 +88,7 @@ class TransformerModel(NeuralNetworkModel):
sequences = self.tokenizer.texts_to_sequences(text_data) sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = int(self.config.model_params.get("max_len", 6)) max_len = int(self.config.model_params.get("max_len", 6))
return pad_sequences(sequences, maxlen=max_len, padding="post") # Right-side padding and truncation for consistent masking/shape
return pad_sequences(
sequences, maxlen=max_len, padding="post", truncating="post"
)
+23
View File
@@ -149,6 +149,29 @@ class NeuralNetworkModel(BaseModel):
if invalid_mask.any(): if invalid_mask.any():
arr[invalid_mask] = oov_index arr[invalid_mask] = oov_index
# Enforce strictly right-padded masks for RNN/cuDNN compatibility.
# Any zero that appears before the last non-zero token of a row is
# replaced with the OOV index, so each row's mask stays contiguous
# (True ... True, False ... False).
try:
    nz = arr != 0  # True where the token is not padding
    if nz.ndim == 2 and arr.shape[1] > 0:
        indices = np.arange(arr.shape[1], dtype=np.int64)
        # Column of the last non-zero token per row. Rows with no
        # non-zero token get -1, which leaves their left region empty.
        last_pos = np.where(nz, indices, -1).max(axis=1)
        # Columns up to and including the last non-zero token.
        left_region = indices <= last_pos[:, None]
        # Zeros inside that region are interior padding -> set to OOV.
        zero_inside = (~nz) & left_region
        if zero_inside.any():
            arr[zero_inside] = oov_index
except Exception:
    # Best-effort: carry on with the array as it is, but leave a trace
    # instead of silently discarding the failure.
    import logging

    logging.getLogger(__name__).warning(
        "Could not enforce right-padded mask; continuing without it",
        exc_info=True,
    )
# Use int32 for TF embedding ops compatibility # Use int32 for TF embedding ops compatibility
return arr.astype(np.int32, copy=False) return arr.astype(np.int32, copy=False)
except Exception as e: except Exception as e:
+113 -33
View File
@@ -74,21 +74,34 @@
"\n", "\n",
" cm = exp.get(\"confusion_matrix\")\n", " cm = exp.get(\"confusion_matrix\")\n",
" tn = fp = fn = tp = np.nan\n", " tn = fp = fn = tp = np.nan\n",
" if isinstance(cm, list) and len(cm)==2 and all(isinstance(r, list) and len(r)==2 for r in cm):\n", " if (\n",
" isinstance(cm, list)\n",
" and len(cm) == 2\n",
" and all(isinstance(r, list) and len(r) == 2 for r in cm)\n",
" ):\n",
" # By inspection of the provided metrics, mapping is:\n", " # By inspection of the provided metrics, mapping is:\n",
" # rows = true [f, m]; cols = pred [f, m]\n", " # rows = true [f, m]; cols = pred [f, m]\n",
" tn, fp = cm[0][0], cm[0][1] # true negatives and false positives for positive class 'm'\n", " tn, fp = (\n",
" cm[0][0],\n",
" cm[0][1],\n",
" ) # true negatives and false positives for positive class 'm'\n",
" fn, tp = cm[1][0], cm[1][1]\n", " fn, tp = cm[1][0], cm[1][1]\n",
"\n", "\n",
" # Derived metrics from confusion matrix (where present)\n", " # Derived metrics from confusion matrix (where present)\n",
" def safe_div(a, b):\n", " def safe_div(a, b):\n",
" return float(a)/float(b) if (b not in (0, None) and not pd.isna(b)) else np.nan\n", " return (\n",
" float(a) / float(b) if (b not in (0, None) and not pd.isna(b)) else np.nan\n",
" )\n",
"\n", "\n",
" sensitivity = safe_div(tp, tp + fn) # TPR for 'm'\n", " sensitivity = safe_div(tp, tp + fn) # TPR for 'm'\n",
" specificity = safe_div(tn, tn + fp) # TNR for 'm'\n", " specificity = safe_div(tn, tn + fp) # TNR for 'm'\n",
" balanced_acc = np.nanmean([sensitivity, specificity])\n", " balanced_acc = np.nanmean([sensitivity, specificity])\n",
" mcc_num = (tp*tn - fp*fn)\n", " mcc_num = tp * tn - fp * fn\n",
" mcc_den = sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)) if all(x==x for x in [tp+fp, tp+fn, tn+fp, tn+fn]) else np.nan\n", " mcc_den = (\n",
" sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))\n",
" if all(x == x for x in [tp + fp, tp + fn, tn + fp, tn + fn])\n",
" else np.nan\n",
" )\n",
" mcc = safe_div(mcc_num, mcc_den)\n", " mcc = safe_div(mcc_num, mcc_den)\n",
"\n", "\n",
" n_test = exp.get(\"test_size\") or np.nansum([tn, fp, fn, tp])\n", " n_test = exp.get(\"test_size\") or np.nansum([tn, fp, fn, tp])\n",
@@ -101,7 +114,8 @@
" else:\n", " else:\n",
" acc_ci_lo = acc_ci_hi = np.nan\n", " acc_ci_lo = acc_ci_hi = np.nan\n",
"\n", "\n",
" rows.append({\n", " rows.append(\n",
" {\n",
" \"experiment_id\": exp_id,\n", " \"experiment_id\": exp_id,\n",
" \"model\": name or model_type,\n", " \"model\": name or model_type,\n",
" \"model_family\": (model_type or \"\").upper(),\n", " \"model_family\": (model_type or \"\").upper(),\n",
@@ -114,7 +128,10 @@
" \"test_f1\": te.get(\"f1\", np.nan),\n", " \"test_f1\": te.get(\"f1\", np.nan),\n",
" \"cv_f1_mean\": cv.get(\"f1\", np.nan),\n", " \"cv_f1_mean\": cv.get(\"f1\", np.nan),\n",
" \"cv_f1_std\": cv.get(\"f1_std\", np.nan),\n", " \"cv_f1_std\": cv.get(\"f1_std\", np.nan),\n",
" \"TP\": tp, \"FP\": fp, \"TN\": tn, \"FN\": fn,\n", " \"TP\": tp,\n",
" \"FP\": fp,\n",
" \"TN\": tn,\n",
" \"FN\": fn,\n",
" \"sensitivity_TPR_m\": sensitivity,\n", " \"sensitivity_TPR_m\": sensitivity,\n",
" \"specificity_TNR_m\": specificity,\n", " \"specificity_TNR_m\": specificity,\n",
" \"balanced_accuracy\": balanced_acc,\n", " \"balanced_accuracy\": balanced_acc,\n",
@@ -122,11 +139,16 @@
" \"n_test\": n_test,\n", " \"n_test\": n_test,\n",
" \"acc_95ci_lo\": acc_ci_lo,\n", " \"acc_95ci_lo\": acc_ci_lo,\n",
" \"acc_95ci_hi\": acc_ci_hi,\n", " \"acc_95ci_hi\": acc_ci_hi,\n",
" \"train_minus_test_gap\": (tr.get(\"accuracy\", np.nan) - test_acc) if pd.notna(tr.get(\"accuracy\", np.nan)) and pd.notna(test_acc) else np.nan,\n", " \"train_minus_test_gap\": (tr.get(\"accuracy\", np.nan) - test_acc)\n",
" \"test_minus_cv_gap\": (test_acc - cv.get(\"accuracy\", np.nan)) if pd.notna(test_acc) and pd.notna(cv.get(\"accuracy\", np.nan)) else np.nan,\n", " if pd.notna(tr.get(\"accuracy\", np.nan)) and pd.notna(test_acc)\n",
" else np.nan,\n",
" \"test_minus_cv_gap\": (test_acc - cv.get(\"accuracy\", np.nan))\n",
" if pd.notna(test_acc) and pd.notna(cv.get(\"accuracy\", np.nan))\n",
" else np.nan,\n",
" \"start_time\": exp.get(\"start_time\"),\n", " \"start_time\": exp.get(\"start_time\"),\n",
" \"end_time\": exp.get(\"end_time\")\n", " \"end_time\": exp.get(\"end_time\"),\n",
" })\n", " }\n",
" )\n",
"\n", "\n",
"df = pd.DataFrame(rows)" "df = pd.DataFrame(rows)"
] ]
@@ -139,23 +161,53 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# Clean and order categorical fields\n", "# Clean and order categorical fields\n",
"df[\"feature_set\"] = df[\"feature_set\"].replace({\"full_name\":\"Full name\",\"native_name\":\"Native\",\"surname\":\"Surname\"})\n", "df[\"feature_set\"] = df[\"feature_set\"].replace(\n",
" {\"full_name\": \"Full name\", \"native_name\": \"Native\", \"surname\": \"Surname\"}\n",
")\n",
"order_features = [\"Full name\", \"Surname\", \"Native\"]\n", "order_features = [\"Full name\", \"Surname\", \"Native\"]\n",
"df[\"feature_set\"] = pd.Categorical(df[\"feature_set\"], categories=order_features, ordered=True)\n", "df[\"feature_set\"] = pd.Categorical(\n",
" df[\"feature_set\"], categories=order_features, ordered=True\n",
")\n",
"\n", "\n",
"order_family = [\"LOGISTIC_REGRESSION\",\"LIGHTGBM\",\"LSTM\",\"CNN\",\"BIGRU\", \"RANDOM_FOREST\", \"TRANSFORMER\", \"NAIVE_BAYES\", \"XGBOOST\"]\n", "order_family = [\n",
"df[\"model_family\"] = pd.Categorical(df[\"model_family\"], categories=order_family, ordered=True)\n", " \"LOGISTIC_REGRESSION\",\n",
" \"LIGHTGBM\",\n",
" \"LSTM\",\n",
" \"CNN\",\n",
" \"BIGRU\",\n",
" \"RANDOM_FOREST\",\n",
" \"TRANSFORMER\",\n",
" \"NAIVE_BAYES\",\n",
" \"XGBOOST\",\n",
"]\n",
"df[\"model_family\"] = pd.Categorical(\n",
" df[\"model_family\"], categories=order_family, ordered=True\n",
")\n",
"\n", "\n",
"# Summary table (subset of most relevant columns)\n", "# Summary table (subset of most relevant columns)\n",
"summary_cols = [\n", "summary_cols = [\n",
" \"experiment_id\",\"model_family\",\"feature_set\",\n", " \"experiment_id\",\n",
" \"train_accuracy\",\"test_accuracy\",\"cv_accuracy_mean\",\"cv_accuracy_std\",\n", " \"model_family\",\n",
" \"acc_95ci_lo\",\"acc_95ci_hi\",\n", " \"feature_set\",\n",
" \"balanced_accuracy\",\"MCC\",\n", " \"train_accuracy\",\n",
" \"train_minus_test_gap\",\"test_minus_cv_gap\",\n", " \"test_accuracy\",\n",
" \"n_test\"\n", " \"cv_accuracy_mean\",\n",
" \"cv_accuracy_std\",\n",
" \"acc_95ci_lo\",\n",
" \"acc_95ci_hi\",\n",
" \"balanced_accuracy\",\n",
" \"MCC\",\n",
" \"train_minus_test_gap\",\n",
" \"test_minus_cv_gap\",\n",
" \"n_test\",\n",
"]\n", "]\n",
"summary = df[summary_cols].sort_values([\"model_family\",\"feature_set\",\"test_accuracy\"], ascending=[True, True, False]).reset_index(drop=True)\n", "summary = (\n",
" df[summary_cols]\n",
" .sort_values(\n",
" [\"model_family\", \"feature_set\", \"test_accuracy\"], ascending=[True, True, False]\n",
" )\n",
" .reset_index(drop=True)\n",
")\n",
"\n", "\n",
"# Display the master summary table\n", "# Display the master summary table\n",
"display(summary)" "display(summary)"
@@ -171,8 +223,12 @@
"# Build a pivot for plotting\n", "# Build a pivot for plotting\n",
"plot_df = df.dropna(subset=[\"test_accuracy\"]).copy()\n", "plot_df = df.dropna(subset=[\"test_accuracy\"]).copy()\n",
"# Prepare positions\n", "# Prepare positions\n",
"families = [f for f in order_family if f in plot_df[\"model_family\"].astype(str).unique()]\n", "families = [\n",
"features = [f for f in order_features if f in plot_df[\"feature_set\"].astype(str).unique()]\n", " f for f in order_family if f in plot_df[\"model_family\"].astype(str).unique()\n",
"]\n",
"features = [\n",
" f for f in order_features if f in plot_df[\"feature_set\"].astype(str).unique()\n",
"]\n",
"\n", "\n",
"# Bar positions\n", "# Bar positions\n",
"x = np.arange(len(families))\n", "x = np.arange(len(families))\n",
@@ -188,8 +244,16 @@
" row = sub[sub[\"model_family\"].astype(str) == fam]\n", " row = sub[sub[\"model_family\"].astype(str) == fam]\n",
" if len(row):\n", " if len(row):\n",
" val = float(row.iloc[0][\"test_accuracy\"])\n", " val = float(row.iloc[0][\"test_accuracy\"])\n",
" lo = float(row.iloc[0][\"acc_95ci_lo\"]) if pd.notna(row.iloc[0][\"acc_95ci_lo\"]) else np.nan\n", " lo = (\n",
" hi = float(row.iloc[0][\"acc_95ci_hi\"]) if pd.notna(row.iloc[0][\"acc_95ci_hi\"]) else np.nan\n", " float(row.iloc[0][\"acc_95ci_lo\"])\n",
" if pd.notna(row.iloc[0][\"acc_95ci_lo\"])\n",
" else np.nan\n",
" )\n",
" hi = (\n",
" float(row.iloc[0][\"acc_95ci_hi\"])\n",
" if pd.notna(row.iloc[0][\"acc_95ci_hi\"])\n",
" else np.nan\n",
" )\n",
" else:\n", " else:\n",
" val, lo, hi = np.nan, np.nan, np.nan\n", " val, lo, hi = np.nan, np.nan, np.nan\n",
" y.append(val)\n", " y.append(val)\n",
@@ -201,7 +265,14 @@
" yerr[0].append(np.nan)\n", " yerr[0].append(np.nan)\n",
" yerr[1].append(np.nan)\n", " yerr[1].append(np.nan)\n",
"\n", "\n",
" plt.bar(x + i*width - (len(features)-1)*width/2, y, width, label=feat, yerr=yerr, capsize=4)\n", " plt.bar(\n",
" x + i * width - (len(features) - 1) * width / 2,\n",
" y,\n",
" width,\n",
" label=feat,\n",
" yerr=yerr,\n",
" capsize=4,\n",
" )\n",
"\n", "\n",
"plt.xticks(x, families, rotation=0)\n", "plt.xticks(x, families, rotation=0)\n",
"plt.ylabel(\"Test accuracy\")\n", "plt.ylabel(\"Test accuracy\")\n",
@@ -250,9 +321,13 @@
" sub = df[df[\"feature_set\"].astype(str) == feat]\n", " sub = df[df[\"feature_set\"].astype(str) == feat]\n",
" plt.scatter(sub[\"train_accuracy\"], sub[\"test_accuracy\"], label=feat)\n", " plt.scatter(sub[\"train_accuracy\"], sub[\"test_accuracy\"], label=feat)\n",
"# y=x reference\n", "# y=x reference\n",
"lims = [min(df[\"train_accuracy\"].min(), df[\"test_accuracy\"].min())-0.02, max(df[\"train_accuracy\"].max(), df[\"test_accuracy\"].max())+0.02]\n", "lims = [\n",
" min(df[\"train_accuracy\"].min(), df[\"test_accuracy\"].min()) - 0.02,\n",
" max(df[\"train_accuracy\"].max(), df[\"test_accuracy\"].max()) + 0.02,\n",
"]\n",
"plt.plot(lims, lims, linestyle=\"--\")\n", "plt.plot(lims, lims, linestyle=\"--\")\n",
"plt.xlim(lims); plt.ylim(lims)\n", "plt.xlim(lims)\n",
"plt.ylim(lims)\n",
"plt.xlabel(\"Train accuracy\")\n", "plt.xlabel(\"Train accuracy\")\n",
"plt.ylabel(\"Test accuracy\")\n", "plt.ylabel(\"Test accuracy\")\n",
"plt.title(\"Overfitting analysis: Train vs Test accuracy\")\n", "plt.title(\"Overfitting analysis: Train vs Test accuracy\")\n",
@@ -268,7 +343,9 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"best_rows = df.sort_values(\"test_accuracy\", ascending=False).groupby(\"feature_set\").head(1)\n", "best_rows = (\n",
" df.sort_values(\"test_accuracy\", ascending=False).groupby(\"feature_set\").head(1)\n",
")\n",
"for _, row in best_rows.iterrows():\n", "for _, row in best_rows.iterrows():\n",
" cm = np.array([[row[\"TN\"], row[\"FP\"]], [row[\"FN\"], row[\"TP\"]]], dtype=float)\n", " cm = np.array([[row[\"TN\"], row[\"FP\"]], [row[\"FN\"], row[\"TP\"]]], dtype=float)\n",
" if np.isnan(cm).any():\n", " if np.isnan(cm).any():\n",
@@ -305,11 +382,14 @@
" for feat in [\"Full name\", \"Surname\"]:\n", " for feat in [\"Full name\", \"Surname\"]:\n",
" tgt = fam_rows[fam_rows[\"feature_set\"] == feat]\n", " tgt = fam_rows[fam_rows[\"feature_set\"] == feat]\n",
" if len(tgt):\n", " if len(tgt):\n",
" deltas.append({\n", " deltas.append(\n",
" {\n",
" \"model_family\": fam,\n", " \"model_family\": fam,\n",
" \"comparison\": f\"{feat} minus Native\",\n", " \"comparison\": f\"{feat} minus Native\",\n",
" \"delta_accuracy\": float(tgt.iloc[0][\"test_accuracy\"]) - base_acc\n", " \"delta_accuracy\": float(tgt.iloc[0][\"test_accuracy\"])\n",
" })\n", " - base_acc,\n",
" }\n",
" )\n",
"\n", "\n",
"deltas_df = pd.DataFrame(deltas)\n", "deltas_df = pd.DataFrame(deltas)\n",
"display(deltas_df)\n", "display(deltas_df)\n",
+3 -1
View File
@@ -113,7 +113,9 @@
"df_name_categories.head(12)\n", "df_name_categories.head(12)\n",
"\n", "\n",
"# save data\n", "# save data\n",
"df_name_categories.to_csv(\"../../assets/identified_category_distribution.csv\", index=False)" "df_name_categories.to_csv(\n",
" \"../../assets/identified_category_distribution.csv\", index=False\n",
")"
] ]
}, },
{ {