From 773ebf32c61f2ff0302cffc7a8814d62e6c110fe Mon Sep 17 00:00:00 2001 From: amaury Date: Fri, 26 Sep 2025 13:20:37 +0200 Subject: [PATCH] =?UTF-8?q?Adding=20surname=20transition=20analysis=20with?= =?UTF-8?q?=20Markov=20models,=20frequency=20studies,=20and=20visualizatio?= =?UTF-8?q?ns,=20including=20cleaned=20surname=20preprocessing,=20province?= =?UTF-8?q?=20sampling,=20bigram/trigram=20stats,=20and=20male=E2=80=93fem?= =?UTF-8?q?ale=20transition=20comparisons?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- notebooks/names.ipynb | 1095 ++++++++++++++++++++++++++--------------- 1 file changed, 694 insertions(+), 401 deletions(-) diff --git a/notebooks/names.ipynb b/notebooks/names.ipynb index a338089..ac8a20c 100644 --- a/notebooks/names.ipynb +++ b/notebooks/names.ipynb @@ -4,10 +4,13 @@ "cell_type": "markdown", "id": "b6cc3c6b-85c7-4a04-9aef-8ffc055aa93c", "metadata": {}, - "source": "# Names Analysis & Modeling" + "source": [ + "# Names Analysis & Modeling" + ] }, { "cell_type": "code", + "execution_count": 1, "id": "initial_id", "metadata": { "ExecuteTime": { @@ -15,6 +18,7 @@ "start_time": "2025-09-25T19:54:31.689969Z" } }, + "outputs": [], "source": [ "import pandas as pd \n", "import unicodedata \n", @@ -33,25 +37,26 @@ "from core.utils.data_loader import DataLoader\n", "from core.utils.region_mapper import RegionMapper\n", "from core.config.pipeline_config import PipelineConfig" - ], - "outputs": [], - "execution_count": 116 + ] }, { + "cell_type": "code", + "execution_count": 2, + "id": "16647cc71aea7594", "metadata": { "ExecuteTime": { "end_time": "2025-09-25T18:26:50.162866Z", "start_time": "2025-09-25T18:26:50.159601Z" } }, - "cell_type": "code", - "source": "LETTERS = 'abcdefghijklmnopqrstuvwxyz'", - "id": "16647cc71aea7594", "outputs": [], - "execution_count": 47 + "source": [ + "LETTERS = 'abcdefghijklmnopqrstuvwxyz'" + ] }, { "cell_type": "code", + "execution_count": 3, "id": "f1a69290-a9c0-40d0-9fe8-a06d8a466671", "metadata": { "ExecuteTime": { @@ -59,6 +64,7 @@ "start_time": "2025-09-25T17:39:49.304876Z" } }, + "outputs": [], "source": [ "config = PipelineConfig(\n", " paths={\n", @@ -82,36 +88,34 @@ " s = s.lower()\n", " s = re.sub(r\"[^a-z]\", \"\", s)\n", " return s" - ], - "outputs": [], - "execution_count": 3 + ] }, { + "cell_type": "code", + "execution_count": 4, + "id": "e48c6fd9a213bcd2", "metadata": { "ExecuteTime": { "end_time": "2025-09-25T17:59:34.598256Z", "start_time": "2025-09-25T17:58:54.210735Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "df = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")\n", "df['province'] = RegionMapper.clean_province(df['province'])" - ], - "id": "e48c6fd9a213bcd2", - "outputs": [], - "execution_count": 23 + ] }, { + "cell_type": "code", + "execution_count": 5, + "id": "2715f291947f5158", "metadata": { "ExecuteTime": { "end_time": "2025-09-25T17:59:38.255948Z", "start_time": "2025-09-25T17:59:38.249016Z" } }, - "cell_type": "code", - "source": "df.columns", - "id": "2715f291947f5158", "outputs": [ { "data": { @@ -123,33 +127,28 @@ " dtype='object')" ] }, - "execution_count": 24, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 24 + "source": [ + "df.columns" + ] }, { + "cell_type": "code", + "execution_count": 6, + "id": "93f8859e3e9c4350", "metadata": { "ExecuteTime": { "end_time": "2025-09-25T18:01:22.242283Z", "start_time": "2025-09-25T18:01:21.580597Z" } }, - "cell_type": "code", - "source": "df.describe().T", - "id": "93f8859e3e9c4350", "outputs": [ { "data": { - "text/plain": [ - " count mean std min 25% 50% 75% max\n", - "words 6467942.0 2.869578 0.46841 1.0 3.0 3.0 3.0 11.0\n", - "length 6467942.0 20.141236 3.796574 0.0 18.0 21.0 23.0 60.0\n", - "ner_tagged 5018124.0 0.997939 0.045348 0.0 1.0 1.0 1.0 1.0\n", - "annotated 5018124.0 0.997939 0.045348 0.0 1.0 1.0 1.0 1.0" - ], "text/html": [ "
\n", "