{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "initial_id", "metadata": { "ExecuteTime": { "end_time": "2025-07-08T15:58:03.741302Z", "start_time": "2025-07-08T15:58:03.737218Z" }, "collapsed": true }, "outputs": [], "source": [ "import importlib\n", "import os\n", "import sys\n", "\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import seaborn as sns\n", "\n", "# Put the parent directory on the import path before importing `misc`.\n", "# Guarded so that re-running this cell does not append duplicate entries.\n", "_parent_dir = os.path.abspath(\"..\")\n", "if _parent_dir not in sys.path:\n", "    sys.path.append(_parent_dir)\n", "\n", "import misc\n", "\n", "# Reload so edits to the already-imported module take effect in this kernel.\n", "importlib.reload(misc)\n", "from misc import load_csv_dataset" ] }, { "cell_type": "code", "execution_count": 3, "id": "74a8a262bc55d976", "metadata": { "ExecuteTime": { "end_time": "2025-07-08T15:58:27.047640Z", "start_time": "2025-07-08T15:58:06.842257Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ">> Loading CSV dataset from names_featured.csv\n", ">> Detected fieldnames: ['name', 'sex', 'region', 'year', 'words', 'length', 'probable_native', 'probable_surname', 'identified_category', 'identified_name', 'identified_surname', 'annotated', 'province']\n", ">> Successfully loaded with UTF-8 encoding\n" ] } ], "source": [ "# Load the CSV through the project helper and wrap the result in a DataFrame.\n", "# The helper logs the detected field names and encoding to stderr (see output).\n", "df = pd.DataFrame(load_csv_dataset(\"names_featured.csv\"))" ] }, { "cell_type": "code", "execution_count": 4, "id": "ebb18cabd40b011f", "metadata": { "ExecuteTime": { "end_time": "2025-07-08T15:58:50.742471Z", "start_time": "2025-07-08T15:58:45.080832Z" } }, "outputs": [], "source": [ "# Convert these columns to numeric dtypes; errors='coerce' turns values that\n", "# cannot be parsed into NaN instead of raising.\n", "for column in ('year', 'words', 'length'):\n", "    df[column] = pd.to_numeric(df[column], errors='coerce')" ] }, { "cell_type": "code", "execution_count": 5, "id": "295eda175081129c", "metadata": { "ExecuteTime": { "end_time": "2025-07-08T15:59:02.783458Z", "start_time": "2025-07-08T15:59:02.297971Z" } }, "outputs": [ { "data": { "text/html": [ "
| \n", " | count | \n", "mean | \n", "std | \n", "min | \n", "25% | \n", "50% | \n", "75% | \n", "max | \n", "
|---|---|---|---|---|---|---|---|---|
| year | \n", "5522918.0 | \n", "2017.037288 | \n", "4.657671 | \n", "2008.0 | \n", "2013.0 | \n", "2018.0 | \n", "2021.0 | \n", "2023.0 | \n", "
| words | \n", "5522918.0 | \n", "2.862474 | \n", "0.473505 | \n", "1.0 | \n", "3.0 | \n", "3.0 | \n", "3.0 | \n", "10.0 | \n", "
| length | \n", "5522918.0 | \n", "18.217497 | \n", "3.498697 | \n", "1.0 | \n", "16.0 | \n", "18.0 | \n", "21.0 | \n", "49.0 | \n", "