{ "cells": [ { "cell_type": "code", "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2025-09-21T12:06:07.309139Z", "start_time": "2025-09-21T12:06:07.086638Z" } }, "source": [ "import os\n", "import sys\n", "\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import seaborn as sns\n", "\n", "# Add the parent directory to sys.path; the `core` imports below presumably rely on it.\n", "# The membership test keeps re-runs of this cell from appending duplicate entries.\n", "PROJECT_ROOT = os.path.abspath(\"..\")\n", "if PROJECT_ROOT not in sys.path:\n", "    sys.path.append(PROJECT_ROOT)\n", "\n", "from core.utils.data_loader import DataLoader\n", "from core.config.pipeline_config import PipelineConfig" ], "outputs": [], "execution_count": 1 }, { "cell_type": "code", "id": "74a8a262bc55d976", "metadata": { "ExecuteTime": { "end_time": "2025-09-21T12:06:14.601682Z", "start_time": "2025-09-21T12:06:14.598078Z" } }, "source": [ "# Relative locations handed to the pipeline configuration.\n", "config = PipelineConfig(\n", "    paths={\n", "        \"root_dir\": \"../data\",\n", "        \"data_dir\": \"../data/dataset\",\n", "        \"models_dir\": \"../models\",\n", "        \"outputs_dir\": \"../data/processed\",\n", "        \"logs_dir\": \"../logs\",\n", "        \"configs_dir\": \"../configs\",\n", "        \"checkpoints_dir\": \"../checkpoints\"\n", "    }\n", ")\n", "\n", "loader = DataLoader(config)" ], "outputs": [], "execution_count": 2 }, { "metadata": { "ExecuteTime": { "end_time": "2025-09-21T12:07:02.954766Z", "start_time": "2025-09-21T12:06:29.300866Z" } }, "cell_type": "code", "source": "# Slow step: the recorded run of this cell took roughly half a minute.\ndf = loader.load_csv_complete(config.paths.data_dir / \"names_featured.csv\")", "id": "171054a786856d23", "outputs": [], "execution_count": 3 }, { "cell_type": "code", "id": "295eda175081129c", "metadata": { "ExecuteTime": { "end_time": "2025-09-21T12:08:17.955798Z", "start_time": "2025-09-21T12:08:17.165693Z" } }, "source": [ "# Summary statistics, transposed so each described column becomes a row.\n", "df.describe().T" ], "outputs": [ { "data": { "text/plain": [ " count mean std min 25% 50% 75% max\n", "words 7901815.0 2.872912 0.466004 1.0 3.0 3.0 3.0 11.0\n", "length 7901815.0 20.161653 3.796664 0.0 18.0 21.0 23.0 60.0\n", "ner_tagged 6150395.0 0.998319 0.04097 0.0 1.0 1.0 1.0 1.0\n", "annotated 6150395.0 0.998319 0.04097 0.0 1.0 1.0 1.0 1.0" ], "text/html": [ "
| \n", " | count | \n", "mean | \n", "std | \n", "min | \n", "25% | \n", "50% | \n", "75% | \n", "max | \n", "
|---|---|---|---|---|---|---|---|---|
| words | \n", "7901815.0 | \n", "2.872912 | \n", "0.466004 | \n", "1.0 | \n", "3.0 | \n", "3.0 | \n", "3.0 | \n", "11.0 | \n", "
| length | \n", "7901815.0 | \n", "20.161653 | \n", "3.796664 | \n", "0.0 | \n", "18.0 | \n", "21.0 | \n", "23.0 | \n", "60.0 | \n", "
| ner_tagged | \n", "6150395.0 | \n", "0.998319 | \n", "0.04097 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "
| annotated | \n", "6150395.0 | \n", "0.998319 | \n", "0.04097 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "