{ "cells": [ { "cell_type": "code", "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2025-07-08T15:58:03.741302Z", "start_time": "2025-07-08T15:58:03.737218Z" } }, "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "from misc import load_csv_dataset" ], "outputs": [], "execution_count": 1 }, { "metadata": { "ExecuteTime": { "end_time": "2025-07-08T15:58:27.047640Z", "start_time": "2025-07-08T15:58:06.842257Z" } }, "cell_type": "code", "source": [ "# Load the featured-names CSV and build a DataFrame from it\n", "data = load_csv_dataset(\"names_featured.csv\")\n", "df = pd.DataFrame(data)" ], "id": "74a8a262bc55d976", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">> Loading CSV dataset from names_featured.csv\n", ">> Detected fieldnames: ['name', 'region', 'year', 'words', 'length', 'probable_native', 'probable_surname', 'sex']\n", ">> Successfully loaded with UTF-8 encoding\n" ] } ], "execution_count": 2 }, { "metadata": { "ExecuteTime": { "end_time": "2025-07-08T15:58:50.742471Z", "start_time": "2025-07-08T15:58:45.080832Z" } }, "cell_type": "code", "source": [ "# Cast the numeric columns to numbers; values that fail to parse become NaN\n", "for column in ('year', 'words', 'length'):\n", "    df[column] = pd.to_numeric(df[column], errors='coerce')" ], "id": "ebb18cabd40b011f", "outputs": [], "execution_count": 3 }, { "metadata": { "ExecuteTime": { "end_time": "2025-07-08T15:59:02.783458Z", "start_time": "2025-07-08T15:59:02.297971Z" } }, "cell_type": "code", "source": [ "# Summary statistics, one row per numeric column\n", "df.describe().transpose()" ], "id": "295eda175081129c", "outputs": [ { "data": { "text/plain": [ " count mean std min 25% 50% 75% \\\n", "year 4418337.0 2017.038495 4.656899 2008.0 2013.0 2018.0 2021.0 \n", "words 4418337.0 2.862385 0.473601 1.0 3.0 3.0 3.0 \n", "length 4418337.0 18.216613 3.498679 1.0 16.0 18.0 21.0 \n", "\n", " max \n", "year 2023.0 \n", "words 10.0 \n", "length 49.0 " ], "text/html": [ "
| \n", " | count | \n", "mean | \n", "std | \n", "min | \n", "25% | \n", "50% | \n", "75% | \n", "max | \n", "
|---|---|---|---|---|---|---|---|---|
| year | \n", "4418337.0 | \n", "2017.038495 | \n", "4.656899 | \n", "2008.0 | \n", "2013.0 | \n", "2018.0 | \n", "2021.0 | \n", "2023.0 | \n", "
| words | \n", "4418337.0 | \n", "2.862385 | \n", "0.473601 | \n", "1.0 | \n", "3.0 | \n", "3.0 | \n", "3.0 | \n", "10.0 | \n", "
| length | \n", "4418337.0 | \n", "18.216613 | \n", "3.498679 | \n", "1.0 | \n", "16.0 | \n", "18.0 | \n", "21.0 | \n", "49.0 | \n", "