{ "cells": [
 { "cell_type": "markdown", "id": "95dcf546dfd256ab", "metadata": {}, "source": [ "# Overview" ] },
 { "cell_type": "code", "execution_count": null, "id": "80feb4d5", "metadata": {}, "outputs": [], "source": [
  "import sys\n",
  "import os\n",
  "import json\n",
  "from pathlib import Path\n",
  "import numpy as np\n",
  "import pandas as pd\n",
  "import matplotlib.pyplot as plt\n",
  "from math import sqrt" ] },
 { "cell_type": "code", "execution_count": null, "id": "initial_id", "metadata": { "ExecuteTime": { "end_time": "2025-10-03T09:49:28.626603Z", "start_time": "2025-10-03T09:49:28.450965Z" }, "collapsed": true }, "outputs": [], "source": [ "sys.path.append(os.path.abspath(\"..\"))" ] },
 { "cell_type": "code", "execution_count": null, "id": "136812ec", "metadata": {}, "outputs": [], "source": [
  "p = Path(\"../../data/outputs/experiments/experiments.json\")\n",
  "with p.open(\"r\", encoding=\"utf-8\") as f:\n",
  "    experiments = json.load(f)" ] },
 { "cell_type": "code", "execution_count": null, "id": "601c8bc2", "metadata": {}, "outputs": [], "source": [
  "rows = []\n",
  "for exp_id, exp in experiments.items():\n",
  "    cfg = exp.get(\"config\", {})\n",
  "    name = cfg.get(\"name\")\n",
  "    model_type = cfg.get(\"model_type\")\n",
  "    features = \",\".join(cfg.get(\"features\", []))\n",
  "    # metrics\n",
  "    tr = exp.get(\"train_metrics\", {}) or {}\n",
  "    te = exp.get(\"test_metrics\", {}) or {}\n",
  "    cv = exp.get(\"cv_metrics\", {}) or {}\n",
  "\n",
  "    cm = exp.get(\"confusion_matrix\")\n",
  "    tn = fp = fn = tp = np.nan\n",
  "    if (\n",
  "        isinstance(cm, list)\n",
  "        and len(cm) == 2\n",
  "        and all(isinstance(r, list) and len(r) == 2 for r in cm)\n",
  "    ):\n",
  "        # By inspection of the provided metrics, mapping is:\n",
  "        # rows = true [f, m]; cols = pred [f, m]\n",
  "        tn, fp = (\n",
  "            cm[0][0],\n",
  "            cm[0][1],\n",
  "        )  # true negatives and false positives for positive class 'm'\n",
  "        fn, tp = cm[1][0], cm[1][1]\n",
  "\n",
  "    # Derived metrics from confusion matrix (where 
present)\n",
  "    def _safe_ratio(num, den):\n",
  "        # NaN-safe division: NaN for zero/None/NaN denominators.\n",
  "        if den in (0, None) or pd.isna(den):\n",
  "            return np.nan\n",
  "        return float(num) / float(den)\n",
  "\n",
  "    sensitivity = _safe_ratio(tp, tp + fn)  # TPR for 'm'\n",
  "    specificity = _safe_ratio(tn, tn + fp)  # TNR for 'm'\n",
  "    balanced_acc = np.nanmean([sensitivity, specificity])\n",
  "    # Matthews correlation coefficient from the four confusion-matrix cells.\n",
  "    mcc_num = tp * tn - fp * fn\n",
  "    margins = [tp + fp, tp + fn, tn + fp, tn + fn]\n",
  "    if any(pd.isna(m) for m in margins):\n",
  "        mcc_den = np.nan\n",
  "    else:\n",
  "        mcc_den = sqrt(margins[0] * margins[1] * margins[2] * margins[3])\n",
  "    mcc = _safe_ratio(mcc_num, mcc_den)\n",
  "\n",
  "    n_test = exp.get(\"test_size\") or np.nansum([tn, fp, fn, tp])\n",
  "    test_acc = te.get(\"accuracy\", np.nan)\n",
  "    # 95% CI for accuracy via normal approximation (ok for n=2000)\n",
  "    if pd.notna(test_acc) and pd.notna(n_test) and n_test > 0:\n",
  "        half_width = 1.96 * np.sqrt(test_acc * (1 - test_acc) / n_test)\n",
  "        acc_ci_lo = test_acc - half_width\n",
  "        acc_ci_hi = test_acc + half_width\n",
  "    else:\n",
  "        acc_ci_lo = acc_ci_hi = np.nan\n",
  "\n",
  "    train_acc = tr.get(\"accuracy\", np.nan)\n",
  "    cv_acc = cv.get(\"accuracy\", np.nan)\n",
  "    rows.append(\n",
  "        {\n",
  "            \"experiment_id\": exp_id,\n",
  "            \"model\": name or model_type,\n",
  "            \"model_family\": (model_type or \"\").upper(),\n",
  "            \"feature_set\": features,\n",
  "            \"train_accuracy\": train_acc,\n",
  "            \"test_accuracy\": test_acc,\n",
  "            \"cv_accuracy_mean\": cv_acc,\n",
  "            \"cv_accuracy_std\": cv.get(\"accuracy_std\", np.nan),\n",
  "            \"train_f1\": tr.get(\"f1\", np.nan),\n",
  "            \"test_f1\": te.get(\"f1\", np.nan),\n",
  "            \"cv_f1_mean\": cv.get(\"f1\", np.nan),\n",
  "            \"cv_f1_std\": cv.get(\"f1_std\", np.nan),\n",
  "            \"TP\": tp,\n",
  "            \"FP\": fp,\n",
  "            \"TN\": tn,\n",
  "            \"FN\": fn,\n",
  "            \"sensitivity_TPR_m\": sensitivity,\n",
  "            \"specificity_TNR_m\": specificity,\n",
  "            \"balanced_accuracy\": balanced_acc,\n",
  "            \"MCC\": mcc,\n",
  "            \"n_test\": n_test,\n",
  "            \"acc_95ci_lo\": acc_ci_lo,\n",
  "            \"acc_95ci_hi\": acc_ci_hi,\n",
  "            \"train_minus_test_gap\": train_acc - test_acc\n",
  "            if pd.notna(train_acc) and pd.notna(test_acc)\n",
  "            else np.nan,\n",
  "            \"test_minus_cv_gap\": test_acc - cv_acc\n",
  "            if pd.notna(test_acc) and pd.notna(cv_acc)\n",
  "            else np.nan,\n",
  "            \"start_time\": exp.get(\"start_time\"),\n",
  "            \"end_time\": exp.get(\"end_time\"),\n",
  "        }\n",
  "    )\n",
  "\n",
  "df = pd.DataFrame(rows)" ] },
 { "cell_type": "code", "execution_count": null, "id": "9859c4d8", "metadata": {}, "outputs": [], "source": [
  "# Clean and order categorical fields\n",
  "feature_labels = {\"full_name\": \"Full name\", \"native_name\": \"Native\", \"surname\": \"Surname\"}\n",
  "df[\"feature_set\"] = df[\"feature_set\"].replace(feature_labels)\n",
  "order_features = [\"Full name\", \"Surname\", \"Native\"]\n",
  "df[\"feature_set\"] = pd.Categorical(\n",
  "    df[\"feature_set\"], categories=order_features, ordered=True\n",
  ")\n",
  "\n",
  "order_family = [\n",
  "    \"LOGISTIC_REGRESSION\",\n",
  "    \"LIGHTGBM\",\n",
  "    \"LSTM\",\n",
  "    \"CNN\",\n",
  "    \"BIGRU\",\n",
  "    \"RANDOM_FOREST\",\n",
  "    \"TRANSFORMER\",\n",
  "    \"NAIVE_BAYES\",\n",
  "    \"XGBOOST\",\n",
  "]\n",
  "df[\"model_family\"] = pd.Categorical(\n",
  "    df[\"model_family\"], categories=order_family, ordered=True\n",
  ")\n",
  "\n",
  "# Summary table (subset of most relevant columns)\n",
  "key_columns = [\n",
  "    \"experiment_id\",\n",
  "    \"model_family\",\n",
  "    \"feature_set\",\n",
  "    \"train_accuracy\",\n",
  "    \"test_accuracy\",\n",
  "    \"cv_accuracy_mean\",\n",
  "    \"cv_accuracy_std\",\n",
  "    \"acc_95ci_lo\",\n",
  "    \"acc_95ci_hi\",\n",
  "    \"balanced_accuracy\",\n",
  "    \"MCC\",\n",
  "    \"train_minus_test_gap\",\n",
  "    \"test_minus_cv_gap\",\n",
  "    \"n_test\",\n",
  "]\n",
  "summary = (\n",
  "    df[key_columns]\n",
  "    .sort_values(\n",
  "        [\"model_family\", \"feature_set\", \"test_accuracy\"], ascending=[True, True, False]\n",
  "    )\n",
  "    .reset_index(drop=True)\n",
  ")\n",
  "\n",
  "# Display the master summary table\n",
  "display(summary)" ] },
 { "cell_type": "code", "execution_count": null, "id": "8189c6e1", "metadata":
{}, "outputs": [], "source": [
  "# Build a pivot for plotting\n",
  "plot_df = df.dropna(subset=[\"test_accuracy\"]).copy()\n",
  "# Prepare positions\n",
  "families = [\n",
  "    f for f in order_family if f in plot_df[\"model_family\"].astype(str).unique()\n",
  "]\n",
  "features = [\n",
  "    f for f in order_features if f in plot_df[\"feature_set\"].astype(str).unique()\n",
  "]\n",
  "\n",
  "# Bar positions\n",
  "x = np.arange(len(families))\n",
  "width = 0.8 / max(1, len(features))  # total width split by features\n",
  "\n",
  "fig1 = plt.figure(figsize=(10, 6))\n",
  "for idx, feature_name in enumerate(features):\n",
  "    feature_rows = plot_df[plot_df[\"feature_set\"].astype(str) == feature_name]\n",
  "    # Align to families\n",
  "    heights = []\n",
  "    lower_err = []  # asymmetric CI: distance below the estimate\n",
  "    upper_err = []  # asymmetric CI: distance above the estimate\n",
  "    for family in families:\n",
  "        match = feature_rows[feature_rows[\"model_family\"].astype(str) == family]\n",
  "        if len(match):\n",
  "            rec = match.iloc[0]\n",
  "            acc = float(rec[\"test_accuracy\"])\n",
  "            ci_lo = float(rec[\"acc_95ci_lo\"]) if pd.notna(rec[\"acc_95ci_lo\"]) else np.nan\n",
  "            ci_hi = float(rec[\"acc_95ci_hi\"]) if pd.notna(rec[\"acc_95ci_hi\"]) else np.nan\n",
  "        else:\n",
  "            acc = ci_lo = ci_hi = np.nan\n",
  "        heights.append(acc)\n",
  "        # error bars about the point estimate\n",
  "        if np.isnan(ci_lo) or np.isnan(ci_hi) or np.isnan(acc):\n",
  "            lower_err.append(np.nan)\n",
  "            upper_err.append(np.nan)\n",
  "        else:\n",
  "            lower_err.append(acc - ci_lo)\n",
  "            upper_err.append(ci_hi - acc)\n",
  "\n",
  "    plt.bar(\n",
  "        x + idx * width - (len(features) - 1) * width / 2,\n",
  "        heights,\n",
  "        width,\n",
  "        label=feature_name,\n",
  "        yerr=[lower_err, upper_err],\n",
  "        capsize=4,\n",
  "    )\n",
  "\n",
  "plt.xticks(x, families, rotation=0)\n",
  "plt.ylabel(\"Test accuracy\")\n",
  "plt.title(\"Test accuracy by model family and feature set (95% CI)\")\n",
  "plt.ylim(0.45, 1.0)\n",
  "plt.legend(title=\"Feature set\")\n",
  "plt.tight_layout()\n",
  "plt.show()" ] },
 { "cell_type": "code", "execution_count": null, "id": "e5200154", "metadata": {},
"outputs": [], "source": [
  "# Reuses x, width, features, families defined in the previous figure cell.\n",
  "fig2 = plt.figure(figsize=(10, 6))\n",
  "for k, fs in enumerate(features):\n",
  "    subset = plot_df[plot_df[\"feature_set\"].astype(str) == fs]\n",
  "    f1_vals = []\n",
  "    for fm in families:\n",
  "        hit = subset[subset[\"model_family\"].astype(str) == fm]\n",
  "        f1_vals.append(float(hit.iloc[0][\"test_f1\"]) if len(hit) else np.nan)\n",
  "    plt.bar(x + k * width - (len(features) - 1) * width / 2, f1_vals, width, label=fs)\n",
  "\n",
  "plt.xticks(x, families, rotation=0)\n",
  "plt.ylabel(\"Test F1\")\n",
  "plt.title(\"Test F1 by model family and feature set\")\n",
  "plt.ylim(0.45, 1.0)\n",
  "plt.legend(title=\"Feature set\")\n",
  "plt.tight_layout()\n",
  "plt.show()" ] },
 { "cell_type": "code", "execution_count": null, "id": "ee2b10c5", "metadata": {}, "outputs": [], "source": [
  "fig3 = plt.figure(figsize=(7, 7))\n",
  "for fs in features:\n",
  "    pts = df[df[\"feature_set\"].astype(str) == fs]\n",
  "    plt.scatter(pts[\"train_accuracy\"], pts[\"test_accuracy\"], label=fs)\n",
  "# y=x reference\n",
  "axis_range = [\n",
  "    min(df[\"train_accuracy\"].min(), df[\"test_accuracy\"].min()) - 0.02,\n",
  "    max(df[\"train_accuracy\"].max(), df[\"test_accuracy\"].max()) + 0.02,\n",
  "]\n",
  "plt.plot(axis_range, axis_range, linestyle=\"--\")\n",
  "plt.xlim(axis_range)\n",
  "plt.ylim(axis_range)\n",
  "plt.xlabel(\"Train accuracy\")\n",
  "plt.ylabel(\"Test accuracy\")\n",
  "plt.title(\"Overfitting analysis: Train vs Test accuracy\")\n",
  "plt.legend(title=\"Feature set\", loc=\"lower right\")\n",
  "plt.tight_layout()\n",
  "plt.show()" ] },
 { "cell_type": "code", "execution_count": null, "id": "e879dbe3", "metadata": {}, "outputs": [], "source": [
  "# Best model per feature set, then its confusion matrix as a heatmap.\n",
  "best_rows = (\n",
  "    df.sort_values(\"test_accuracy\", ascending=False).groupby(\"feature_set\").head(1)\n",
  ")\n",
  "for _, row in best_rows.iterrows():\n",
  "    cm = np.array([[row[\"TN\"], row[\"FP\"]], [row[\"FN\"], row[\"TP\"]]], dtype=float)\n",
  "    if np.isnan(cm).any():\n",
  "        continue\n",
  "    fig = plt.figure(figsize=(5, 5))\n",
  "    im = plt.imshow(cm, 
interpolation=\"nearest\")\n",
  "    plt.title(f\"Confusion Matrix — {row['model_family']} ({row['feature_set']})\")\n",
  "    plt.xticks([0, 1], [\"Pred: f\", \"Pred: m\"])\n",
  "    plt.yticks([0, 1], [\"True: f\", \"True: m\"])\n",
  "    # Annotate counts and rates\n",
  "    total = cm.sum()\n",
  "    for r in range(2):\n",
  "        for c in range(2):\n",
  "            count = cm[r, c]\n",
  "            plt.text(c, r, f\"{int(count)}\\n({count / total:.2%})\", ha=\"center\", va=\"center\")\n",
  "    plt.colorbar(im, fraction=0.046, pad=0.04)\n",
  "    plt.tight_layout()\n",
  "    plt.show()" ] },
 { "cell_type": "code", "execution_count": null, "id": "588d43f2", "metadata": {}, "outputs": [], "source": [
  "# Accuracy deltas of Full name / Surname relative to the Native feature set.\n",
  "delta_records = []\n",
  "for family in families:\n",
  "    family_rows = df[df[\"model_family\"].astype(str) == family]\n",
  "    native = family_rows[family_rows[\"feature_set\"] == \"Native\"]\n",
  "    if len(native):\n",
  "        native_acc = float(native.iloc[0][\"test_accuracy\"])\n",
  "        for fs in [\"Full name\", \"Surname\"]:\n",
  "            candidate = family_rows[family_rows[\"feature_set\"] == fs]\n",
  "            if len(candidate):\n",
  "                delta_records.append(\n",
  "                    {\n",
  "                        \"model_family\": family,\n",
  "                        \"comparison\": f\"{fs} minus Native\",\n",
  "                        \"delta_accuracy\": float(candidate.iloc[0][\"test_accuracy\"])\n",
  "                        - native_acc,\n",
  "                    }\n",
  "                )\n",
  "\n",
  "deltas_df = pd.DataFrame(delta_records)\n",
  "display(deltas_df)\n",
  "\n",
  "fig5 = plt.figure(figsize=(10, 6))\n",
  "# Make bars grouped by model_family\n",
  "comp_types = deltas_df[\"comparison\"].unique().tolist() if not deltas_df.empty else []\n",
  "x2 = np.arange(len(families))\n",
  "width2 = 0.8 / max(1, len(comp_types))\n",
  "for k, comp in enumerate(comp_types):\n",
  "    comp_rows = deltas_df[deltas_df[\"comparison\"] == comp]\n",
  "    diffs = []\n",
  "    for family in families:\n",
  "        hit = comp_rows[comp_rows[\"model_family\"] == family]\n",
  "        diffs.append(float(hit.iloc[0][\"delta_accuracy\"]) if len(hit) else np.nan)\n",
  "    plt.bar(x2 + k * width2 - (len(comp_types) - 1) * width2 / 2, diffs, width2, label=comp)\n",
  "\n",
  "plt.xticks(x2, families)\n",
  "plt.axhline(0, linestyle=\"--\")\n",
  "plt.ylabel(\"Δ Accuracy vs Native\")\n",
  "plt.title(\"Effect of feature set: Full name/Surname vs Native\")\n",
  "plt.legend()\n",
  "plt.tight_layout()\n",
  "plt.show()" ] }
 ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 5 }