{ "cells": [
 { "cell_type": "markdown", "id": "95dcf546dfd256ab", "metadata": {}, "source": [ "# Overview" ] },
 { "cell_type": "code", "execution_count": null, "id": "80feb4d5", "metadata": {}, "outputs": [], "source": [
  "import sys\n",
  "import os\n",
  "import json\n",
  "from pathlib import Path\n",
  "import numpy as np\n",
  "import pandas as pd\n",
  "import matplotlib.pyplot as plt\n",
  "from math import sqrt" ] },
 { "cell_type": "code", "execution_count": null, "id": "initial_id", "metadata": { "ExecuteTime": { "end_time": "2025-10-03T09:49:28.626603Z", "start_time": "2025-10-03T09:49:28.450965Z" }, "collapsed": true }, "outputs": [], "source": [ "sys.path.append(os.path.abspath(\"..\"))" ] },
 { "cell_type": "code", "execution_count": null, "id": "136812ec", "metadata": {}, "outputs": [], "source": [
  "p = Path(\"../../data/outputs/experiments/experiments.json\")\n",
  "with p.open(\"r\", encoding=\"utf-8\") as f:\n",
  "    experiments = json.load(f)" ] },
 { "cell_type": "code", "execution_count": null, "id": "601c8bc2", "metadata": {}, "outputs": [], "source": [
  "rows = []\n",
  "for exp_id, exp in experiments.items():\n",
  "    cfg = exp.get(\"config\", {})\n",
  "    name = cfg.get(\"name\")\n",
  "    model_type = cfg.get(\"model_type\")\n",
  "    features = \",\".join(cfg.get(\"features\", []))\n",
  "    # metrics\n",
  "    tr = exp.get(\"train_metrics\", {}) or {}\n",
  "    te = exp.get(\"test_metrics\", {}) or {}\n",
  "    cv = exp.get(\"cv_metrics\", {}) or {}\n",
  "\n",
  "    cm = exp.get(\"confusion_matrix\")\n",
  "    tn = fp = fn = tp = np.nan\n",
  "    if (\n",
  "        isinstance(cm, list)\n",
  "        and len(cm) == 2\n",
  "        and all(isinstance(r, list) and len(r) == 2 for r in cm)\n",
  "    ):\n",
  "        # By inspection of the provided metrics, mapping is:\n",
  "        # rows = true [f, m]; cols = pred [f, m]\n",
  "        tn, fp = (\n",
  "            cm[0][0],\n",
  "            cm[0][1],\n",
  "        )  # true negatives and false positives for positive class 'm'\n",
  "        fn, tp = cm[1][0], cm[1][1]\n",
  "\n",
  "    # Derived metrics from confusion matrix (where 
present)\n",
  "    def _safe_ratio(num, den):\n",
  "        # NaN-safe division: NaN for zero/None/NaN denominators.\n",
  "        if den in (0, None) or pd.isna(den):\n",
  "            return np.nan\n",
  "        return float(num) / float(den)\n",
  "\n",
  "    sensitivity = _safe_ratio(tp, tp + fn)  # TPR for 'm'\n",
  "    specificity = _safe_ratio(tn, tn + fp)  # TNR for 'm'\n",
  "    balanced_acc = np.nanmean([sensitivity, specificity])\n",
  "    # Matthews correlation coefficient from the four confusion-matrix cells.\n",
  "    mcc_num = tp * tn - fp * fn\n",
  "    margins = [tp + fp, tp + fn, tn + fp, tn + fn]\n",
  "    if any(pd.isna(m) for m in margins):\n",
  "        mcc_den = np.nan\n",
  "    else:\n",
  "        mcc_den = sqrt(margins[0] * margins[1] * margins[2] * margins[3])\n",
  "    mcc = _safe_ratio(mcc_num, mcc_den)\n",
  "\n",
  "    n_test = exp.get(\"test_size\") or np.nansum([tn, fp, fn, tp])\n",
  "    test_acc = te.get(\"accuracy\", np.nan)\n",
  "    # 95% CI for accuracy via normal approximation (ok for n=2000)\n",
  "    if pd.notna(test_acc) and pd.notna(n_test) and n_test > 0:\n",
  "        half_width = 1.96 * np.sqrt(test_acc * (1 - test_acc) / n_test)\n",
  "        acc_ci_lo = test_acc - half_width\n",
  "        acc_ci_hi = test_acc + half_width\n",
  "    else:\n",
  "        acc_ci_lo = acc_ci_hi = np.nan\n",
  "\n",
  "    train_acc = tr.get(\"accuracy\", np.nan)\n",
  "    cv_acc = cv.get(\"accuracy\", np.nan)\n",
  "    rows.append(\n",
  "        {\n",
  "            \"experiment_id\": exp_id,\n",
  "            \"model\": name or model_type,\n",
  "            \"model_family\": (model_type or \"\").upper(),\n",
  "            \"feature_set\": features,\n",
  "            \"train_accuracy\": train_acc,\n",
  "            \"test_accuracy\": test_acc,\n",
  "            \"cv_accuracy_mean\": cv_acc,\n",
  "            \"cv_accuracy_std\": cv.get(\"accuracy_std\", np.nan),\n",
  "            \"train_f1\": tr.get(\"f1\", np.nan),\n",
  "            \"test_f1\": te.get(\"f1\", np.nan),\n",
  "            \"cv_f1_mean\": cv.get(\"f1\", np.nan),\n",
  "            \"cv_f1_std\": cv.get(\"f1_std\", np.nan),\n",
  "            \"TP\": tp,\n",
  "            \"FP\": fp,\n",
  "            \"TN\": tn,\n",
  "            \"FN\": fn,\n",
  "            \"sensitivity_TPR_m\": sensitivity,\n",
  "            \"specificity_TNR_m\": specificity,\n",
  "            \"balanced_accuracy\": balanced_acc,\n",
  "            \"MCC\": mcc,\n",
  "            \"n_test\": n_test,\n",
  "            \"acc_95ci_lo\": acc_ci_lo,\n",
  "            \"acc_95ci_hi\": acc_ci_hi,\n",
  "            \"train_minus_test_gap\": train_acc - test_acc\n",
  "            if pd.notna(train_acc) and pd.notna(test_acc)\n",
  "            else np.nan,\n",
  "            \"test_minus_cv_gap\": test_acc - cv_acc\n",
  "            if pd.notna(test_acc) and pd.notna(cv_acc)\n",
  "            else np.nan,\n",
  "            \"start_time\": exp.get(\"start_time\"),\n",
  "            \"end_time\": exp.get(\"end_time\"),\n",
  "        }\n",
  "    )\n",
  "\n",
  "df = pd.DataFrame(rows)" ] },
 { "cell_type": "code", "execution_count": null, "id": "9859c4d8", "metadata": {}, "outputs": [], "source": [
  "# Clean and order categorical fields\n",
  "feature_labels = {\"full_name\": \"Full name\", \"native_name\": \"Native\", \"surname\": \"Surname\"}\n",
  "df[\"feature_set\"] = df[\"feature_set\"].replace(feature_labels)\n",
  "order_features = [\"Full name\", \"Surname\", \"Native\"]\n",
  "df[\"feature_set\"] = pd.Categorical(\n",
  "    df[\"feature_set\"], categories=order_features, ordered=True\n",
  ")\n",
  "\n",
  "order_family = [\n",
  "    \"LOGISTIC_REGRESSION\",\n",
  "    \"LIGHTGBM\",\n",
  "    \"LSTM\",\n",
  "    \"CNN\",\n",
  "    \"BIGRU\",\n",
  "    \"RANDOM_FOREST\",\n",
  "    \"TRANSFORMER\",\n",
  "    \"NAIVE_BAYES\",\n",
  "    \"XGBOOST\",\n",
  "]\n",
  "df[\"model_family\"] = pd.Categorical(\n",
  "    df[\"model_family\"], categories=order_family, ordered=True\n",
  ")\n",
  "\n",
  "# Summary table (subset of most relevant columns)\n",
  "key_columns = [\n",
  "    \"experiment_id\",\n",
  "    \"model_family\",\n",
  "    \"feature_set\",\n",
  "    \"train_accuracy\",\n",
  "    \"test_accuracy\",\n",
  "    \"cv_accuracy_mean\",\n",
  "    \"cv_accuracy_std\",\n",
  "    \"acc_95ci_lo\",\n",
  "    \"acc_95ci_hi\",\n",
  "    \"balanced_accuracy\",\n",
  "    \"MCC\",\n",
  "    \"train_minus_test_gap\",\n",
  "    \"test_minus_cv_gap\",\n",
  "    \"n_test\",\n",
  "]\n",
  "summary = (\n",
  "    df[key_columns]\n",
  "    .sort_values(\n",
  "        [\"model_family\", \"feature_set\", \"test_accuracy\"], ascending=[True, True, False]\n",
  "    )\n",
  "    .reset_index(drop=True)\n",
  ")\n",
  "\n",
  "# Display the master summary table\n",
  "display(summary)" ] },
 { "cell_type": "code", "execution_count": null, "id": "8189c6e1", "metadata":
{}, "outputs": [], "source": [
  "# Build a pivot for plotting\n",
  "plot_df = df.dropna(subset=[\"test_accuracy\"]).copy()\n",
  "# Prepare positions\n",
  "families = [\n",
  "    f for f in order_family if f in plot_df[\"model_family\"].astype(str).unique()\n",
  "]\n",
  "features = [\n",
  "    f for f in order_features if f in plot_df[\"feature_set\"].astype(str).unique()\n",
  "]\n",
  "\n",
  "# Bar positions\n",
  "x = np.arange(len(families))\n",
  "width = 0.8 / max(1, len(features))  # total width split by features\n",
  "\n",
  "fig1 = plt.figure(figsize=(10, 6))\n",
  "for idx, feature_name in enumerate(features):\n",
  "    feature_rows = plot_df[plot_df[\"feature_set\"].astype(str) == feature_name]\n",
  "    # Align to families\n",
  "    heights = []\n",
  "    lower_err = []  # asymmetric CI: distance below the estimate\n",
  "    upper_err = []  # asymmetric CI: distance above the estimate\n",
  "    for family in families:\n",
  "        match = feature_rows[feature_rows[\"model_family\"].astype(str) == family]\n",
  "        if len(match):\n",
  "            rec = match.iloc[0]\n",
  "            acc = float(rec[\"test_accuracy\"])\n",
  "            ci_lo = float(rec[\"acc_95ci_lo\"]) if pd.notna(rec[\"acc_95ci_lo\"]) else np.nan\n",
  "            ci_hi = float(rec[\"acc_95ci_hi\"]) if pd.notna(rec[\"acc_95ci_hi\"]) else np.nan\n",
  "        else:\n",
  "            acc = ci_lo = ci_hi = np.nan\n",
  "        heights.append(acc)\n",
  "        # error bars about the point estimate\n",
  "        if np.isnan(ci_lo) or np.isnan(ci_hi) or np.isnan(acc):\n",
  "            lower_err.append(np.nan)\n",
  "            upper_err.append(np.nan)\n",
  "        else:\n",
  "            lower_err.append(acc - ci_lo)\n",
  "            upper_err.append(ci_hi - acc)\n",
  "\n",
  "    plt.bar(\n",
  "        x + idx * width - (len(features) - 1) * width / 2,\n",
  "        heights,\n",
  "        width,\n",
  "        label=feature_name,\n",
  "        yerr=[lower_err, upper_err],\n",
  "        capsize=4,\n",
  "    )\n",
  "\n",
  "plt.xticks(x, families, rotation=0)\n",
  "plt.ylabel(\"Test accuracy\")\n",
  "plt.title(\"Test accuracy by model family and feature set (95% CI)\")\n",
  "plt.ylim(0.45, 1.0)\n",
  "plt.legend(title=\"Feature set\")\n",
  "plt.tight_layout()\n",
  "plt.show()" ] },
 { "cell_type": "code", "execution_count": null, "id": "e5200154", "metadata": {},
"outputs": [], "source": [
  "# Reuses x, width, features, families defined in the previous figure cell.\n",
  "fig2 = plt.figure(figsize=(10, 6))\n",
  "for k, fs in enumerate(features):\n",
  "    subset = plot_df[plot_df[\"feature_set\"].astype(str) == fs]\n",
  "    f1_vals = []\n",
  "    for fm in families:\n",
  "        hit = subset[subset[\"model_family\"].astype(str) == fm]\n",
  "        f1_vals.append(float(hit.iloc[0][\"test_f1\"]) if len(hit) else np.nan)\n",
  "    plt.bar(x + k * width - (len(features) - 1) * width / 2, f1_vals, width, label=fs)\n",
  "\n",
  "plt.xticks(x, families, rotation=0)\n",
  "plt.ylabel(\"Test F1\")\n",
  "plt.title(\"Test F1 by model family and feature set\")\n",
  "plt.ylim(0.45, 1.0)\n",
  "plt.legend(title=\"Feature set\")\n",
  "plt.tight_layout()\n",
  "plt.show()" ] },
 { "cell_type": "code", "execution_count": null, "id": "ee2b10c5", "metadata": {}, "outputs": [], "source": [
  "fig3 = plt.figure(figsize=(7, 7))\n",
  "for fs in features:\n",
  "    pts = df[df[\"feature_set\"].astype(str) == fs]\n",
  "    plt.scatter(pts[\"train_accuracy\"], pts[\"test_accuracy\"], label=fs)\n",
  "# y=x reference\n",
  "axis_range = [\n",
  "    min(df[\"train_accuracy\"].min(), df[\"test_accuracy\"].min()) - 0.02,\n",
  "    max(df[\"train_accuracy\"].max(), df[\"test_accuracy\"].max()) + 0.02,\n",
  "]\n",
  "plt.plot(axis_range, axis_range, linestyle=\"--\")\n",
  "plt.xlim(axis_range)\n",
  "plt.ylim(axis_range)\n",
  "plt.xlabel(\"Train accuracy\")\n",
  "plt.ylabel(\"Test accuracy\")\n",
  "plt.title(\"Overfitting analysis: Train vs Test accuracy\")\n",
  "plt.legend(title=\"Feature set\", loc=\"lower right\")\n",
  "plt.tight_layout()\n",
  "plt.show()" ] },
 { "cell_type": "code", "execution_count": null, "id": "e879dbe3", "metadata": {}, "outputs": [], "source": [
  "# Best model per feature set, then its confusion matrix as a heatmap.\n",
  "best_rows = (\n",
  "    df.sort_values(\"test_accuracy\", ascending=False).groupby(\"feature_set\").head(1)\n",
  ")\n",
  "for _, row in best_rows.iterrows():\n",
  "    cm = np.array([[row[\"TN\"], row[\"FP\"]], [row[\"FN\"], row[\"TP\"]]], dtype=float)\n",
  "    if np.isnan(cm).any():\n",
  "        continue\n",
  "    fig = plt.figure(figsize=(5, 5))\n",
  "    im = plt.imshow(cm, 
interpolation=\"nearest\")\n",
  "    plt.title(f\"Confusion Matrix — {row['model_family']} ({row['feature_set']})\")\n",
  "    plt.xticks([0, 1], [\"Pred: f\", \"Pred: m\"])\n",
  "    plt.yticks([0, 1], [\"True: f\", \"True: m\"])\n",
  "    # Annotate counts and rates\n",
  "    total = cm.sum()\n",
  "    for r in range(2):\n",
  "        for c in range(2):\n",
  "            count = cm[r, c]\n",
  "            plt.text(c, r, f\"{int(count)}\\n({count / total:.2%})\", ha=\"center\", va=\"center\")\n",
  "    plt.colorbar(im, fraction=0.046, pad=0.04)\n",
  "    plt.tight_layout()\n",
  "    plt.show()" ] },
 { "cell_type": "code", "execution_count": null, "id": "588d43f2", "metadata": {}, "outputs": [], "source": [
  "# Accuracy deltas of Full name / Surname relative to the Native feature set.\n",
  "delta_records = []\n",
  "for family in families:\n",
  "    family_rows = df[df[\"model_family\"].astype(str) == family]\n",
  "    native = family_rows[family_rows[\"feature_set\"] == \"Native\"]\n",
  "    if len(native):\n",
  "        native_acc = float(native.iloc[0][\"test_accuracy\"])\n",
  "        for fs in [\"Full name\", \"Surname\"]:\n",
  "            candidate = family_rows[family_rows[\"feature_set\"] == fs]\n",
  "            if len(candidate):\n",
  "                delta_records.append(\n",
  "                    {\n",
  "                        \"model_family\": family,\n",
  "                        \"comparison\": f\"{fs} minus Native\",\n",
  "                        \"delta_accuracy\": float(candidate.iloc[0][\"test_accuracy\"])\n",
  "                        - native_acc,\n",
  "                    }\n",
  "                )\n",
  "\n",
  "deltas_df = pd.DataFrame(delta_records)\n",
  "display(deltas_df)\n",
  "\n",
  "fig5 = plt.figure(figsize=(10, 6))\n",
  "# Make bars grouped by model_family\n",
  "comp_types = deltas_df[\"comparison\"].unique().tolist() if not deltas_df.empty else []\n",
  "x2 = np.arange(len(families))\n",
  "width2 = 0.8 / max(1, len(comp_types))\n",
  "for k, comp in enumerate(comp_types):\n",
  "    comp_rows = deltas_df[deltas_df[\"comparison\"] == comp]\n",
  "    diffs = []\n",
  "    for family in families:\n",
  "        hit = comp_rows[comp_rows[\"model_family\"] == family]\n",
  "        diffs.append(float(hit.iloc[0][\"delta_accuracy\"]) if len(hit) else np.nan)\n",
  "    plt.bar(x2 + k * width2 - (len(comp_types) - 1) * width2 / 2, diffs, width2, label=comp)\n",
  "\n",
  "plt.xticks(x2, families)\n",
  "plt.axhline(0, linestyle=\"--\")\n",
  "plt.ylabel(\"Δ Accuracy vs Native\")\n",
  "plt.title(\"Effect of feature set: Full name/Surname vs Native\")\n",
  "plt.legend()\n",
  "plt.tight_layout()\n",
  "plt.show()" ] }
 ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 5 }