From 14fc302b280e70782ed1b8c76c91c38066fe17da Mon Sep 17 00:00:00 2001 From: bernard-ng Date: Thu, 24 Jul 2025 19:32:44 +0200 Subject: [PATCH] fix: eda with latest dataset --- notebooks/analysis.ipynb | 367 ++++++++++++++++++++------------------- notebooks/eda.ipynb | 124 +++++++------ 2 files changed, 250 insertions(+), 241 deletions(-) diff --git a/notebooks/analysis.ipynb b/notebooks/analysis.ipynb index 3d7f3db..dfe37ad 100644 --- a/notebooks/analysis.ipynb +++ b/notebooks/analysis.ipynb @@ -2,91 +2,85 @@ "cells": [ { "cell_type": "code", + "execution_count": 2, "id": "initial_id", "metadata": { - "collapsed": true, "ExecuteTime": { "end_time": "2025-07-08T15:58:03.741302Z", "start_time": "2025-07-08T15:58:03.737218Z" - } + }, + "collapsed": true }, + "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", + "import sys\n", + "import os\n", + "import importlib\n", "\n", + "sys.path.append(os.path.abspath(\"..\"))\n", + "import misc\n", + "importlib.reload(misc)\n", "from misc import load_csv_dataset" - ], - "outputs": [], - "execution_count": 1 + ] }, { + "cell_type": "code", + "execution_count": 3, + "id": "74a8a262bc55d976", "metadata": { "ExecuteTime": { "end_time": "2025-07-08T15:58:27.047640Z", "start_time": "2025-07-08T15:58:06.842257Z" } }, - "cell_type": "code", - "source": [ - "data = load_csv_dataset(\"names_featured.csv\")\n", - "df = pd.DataFrame(data)" - ], - "id": "74a8a262bc55d976", "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ ">> Loading CSV dataset from names_featured.csv\n", - ">> Detected fieldnames: ['name', 'region', 'year', 'words', 'length', 'probable_native', 'probable_surname', 'sex']\n", + ">> Detected fieldnames: ['name', 'sex', 'region', 'year', 'words', 'length', 'probable_native', 'probable_surname', 'identified_category', 'identified_name', 'identified_surname', 'annotated', 'province']\n", ">> Successfully loaded with UTF-8 encoding\n" ] } ], - "execution_count": 2 + "source": [ + "df = pd.DataFrame(load_csv_dataset(\"names_featured.csv\"))" + ] }, { + "cell_type": "code", + "execution_count": 4, + "id": "ebb18cabd40b011f", "metadata": { "ExecuteTime": { "end_time": "2025-07-08T15:58:50.742471Z", "start_time": "2025-07-08T15:58:45.080832Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "df['year'] = pd.to_numeric(df['year'], errors='coerce')\n", "df['words'] = pd.to_numeric(df['words'], errors='coerce')\n", "df['length'] = pd.to_numeric(df['length'], errors='coerce')" - ], - "id": "ebb18cabd40b011f", - "outputs": [], - "execution_count": 3 + ] }, { + "cell_type": "code", + "execution_count": 5, + "id": "295eda175081129c", "metadata": { "ExecuteTime": { "end_time": "2025-07-08T15:59:02.783458Z", "start_time": "2025-07-08T15:59:02.297971Z" } }, - "cell_type": "code", - "source": "df.describe().T", - "id": "295eda175081129c", "outputs": [ { "data": { - "text/plain": [ - " count mean std min 25% 50% 75% \\\n", - "year 4418337.0 2017.038495 4.656899 2008.0 2013.0 2018.0 2021.0 \n", - "words 4418337.0 2.862385 0.473601 1.0 3.0 3.0 3.0 \n", - "length 4418337.0 18.216613 3.498679 1.0 16.0 18.0 21.0 \n", - "\n", - " max \n", - "year 2023.0 \n", - "words 10.0 \n", - "length 49.0 " - ], "text/html": [ "
\n", "