From 33c7aceb0c64dbb42c092a5004e664a280dc0e10 Mon Sep 17 00:00:00 2001
From: bernard-ng <ngandubernard@gmail.com>
Date: Sun, 17 Aug 2025 16:03:46 +0200
Subject: [PATCH] feat: remove data-heavy visualizations

---
 web/interfaces/dashboard.py | 162 +-----------------------------------
 1 file changed, 2 insertions(+), 160 deletions(-)

diff --git a/web/interfaces/dashboard.py b/web/interfaces/dashboard.py
index e4af4da..804c66b 100644
--- a/web/interfaces/dashboard.py
+++ b/web/interfaces/dashboard.py
@@ -1,6 +1,4 @@
 import pandas as pd
-import plotly.express as px
-import plotly.graph_objects as go
 import streamlit as st
 
 from core.utils.data_loader import OPTIMIZED_DTYPES
@@ -21,101 +19,9 @@ class Dashboard:
         self.experiment_tracker = experiment_tracker
         self.experiment_runner = experiment_runner
 
-    def _create_gender_distribution_chart(self, df: pd.DataFrame):
-        """Create gender distribution pie chart"""
-        if "sex" in df.columns:
-            gender_counts = df["sex"].value_counts()
-            fig = px.pie(
-                values=gender_counts.values,
-                names=gender_counts.index,
-                title="Gender Distribution",
-                color_discrete_map={"m": "#3498db", "f": "#e74c3c"},
-            )
-            fig.update_traces(textposition="inside", textinfo="percent+label")
-            return fig
-        return None
-
-    def _create_province_distribution_chart(self, df: pd.DataFrame):
-        """Create province distribution bar chart"""
-        if "province" in df.columns:
-            province_counts = df["province"].value_counts().head(15)  # Top 15 provinces
-            fig = px.bar(
-                x=province_counts.index,
-                y=province_counts.values,
-                title="Top 15 Provinces by Name Count",
-                labels={"x": "Province", "y": "Number of Names"},
-            )
-            fig.update_layout(xaxis_tickangle=-45)
-            return fig
-        return None
-
-    def _create_name_length_distribution(self, df: pd.DataFrame):
-        """Create name length distribution histogram"""
-        if "length" in df.columns:
-            fig = px.histogram(
-                df,
-                x="length",
-                title="Name Length Distribution",
-                labels={"length": "Name Length (characters)", "count": "Frequency"},
-                nbins=30,
-            )
-            fig.update_layout(bargap=0.1)
-            return fig
-        return None
-
-    def _create_annotation_progress_chart(self, df: pd.DataFrame):
-        """Create annotation progress chart"""
-        if "annotated" in df.columns and "ner_tagged" in df.columns:
-            annotation_data = {
-                "Not Annotated": (df["annotated"] == 0).sum(),
-                "Annotated": (df["annotated"] == 1).sum(),
-                "NER Tagged": (df["ner_tagged"] == 1).sum(),
-            }
-
-            fig = go.Figure(
-                data=[
-                    go.Bar(
-                        x=list(annotation_data.keys()),
-                        y=list(annotation_data.values()),
-                        marker_color=["#95a5a6", "#2ecc71", "#9b59b6"],
-                    )
-                ]
-            )
-            fig.update_layout(
-                title="Annotation Progress",
-                xaxis_title="Status",
-                yaxis_title="Number of Names",
-            )
-            return fig
-        return None
-
-    def _create_regional_analysis(self, df: pd.DataFrame):
-        """Create regional analysis chart"""
-        if "region" in df.columns and "sex" in df.columns:
-            regional_gender = pd.crosstab(df["region"], df["sex"])
-            fig = px.bar(
-                regional_gender,
-                title="Gender Distribution by Region",
-                labels={"value": "Count", "index": "Region"},
-            )
-            fig.update_layout(xaxis_tickangle=-45)
-            return fig
-        return None
-
-    def _create_words_distribution(self, df: pd.DataFrame):
-        """Create word count distribution"""
-        if "words" in df.columns:
-            fig = px.box(
-                df,
-                y="words",
-                title="Word Count Distribution in Names",
-                labels={"words": "Number of Words"},
-            )
-            return fig
-        return None
-
     def index(self):
         st.title("Dashboard")
+        col1, col2, col3, col4 = st.columns(4)
 
         # Load basic statistics
         try:
@@ -123,9 +29,6 @@ class Dashboard:
             if data_path.exists():
                 df = load_dataset(str(data_path))
 
-                # Metrics row
-                col1, col2, col3, col4 = st.columns(4)
-
                 with col1:
                     st.metric("Total Names", f"{len(df):,}")
 
@@ -142,67 +45,6 @@ class Dashboard:
                         gender_dist = df["sex"].value_counts()
                         ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
                         st.metric("F/M Ratio", f"{ratio:.2f}")
-
-                # First row of charts
-                col1, col2 = st.columns(2)
-
-                with col1:
-                    gender_chart = self._create_gender_distribution_chart(df)
-                    if gender_chart:
-                        st.plotly_chart(gender_chart, use_container_width=True)
-
-                with col2:
-                    annotation_chart = self._create_annotation_progress_chart(df)
-                    if annotation_chart:
-                        st.plotly_chart(annotation_chart, use_container_width=True)
-
-                # Second row of charts
-                col1, col2 = st.columns(2)
-
-                with col1:
-                    length_chart = self._create_name_length_distribution(df)
-                    if length_chart:
-                        st.plotly_chart(length_chart, use_container_width=True)
-
-                with col2:
-                    words_chart = self._create_words_distribution(df)
-                    if words_chart:
-                        st.plotly_chart(words_chart, use_container_width=True)
-
-                # Full-width charts
-                province_chart = self._create_province_distribution_chart(df)
-                if province_chart:
-                    st.plotly_chart(province_chart, use_container_width=True)
-
-                regional_chart = self._create_regional_analysis(df)
-                if regional_chart:
-                    st.plotly_chart(regional_chart, use_container_width=True)
-
-                # Data insights section
-                st.header("🔍 Key Insights")
-                insights_col1, insights_col2 = st.columns(2)
-
-                with insights_col1:
-                    st.subheader("Dataset Overview")
-                    total_names = len(df)
-                    unique_provinces = df["province"].nunique() if "province" in df.columns else 0
-                    avg_length = df["length"].mean() if "length" in df.columns else 0
-
-                    st.write(f"• **{total_names:,}** total names in the dataset")
-                    st.write(f"• **{unique_provinces}** provinces represented")
-                    if avg_length > 0:
-                        st.write(f"• Average name length: **{avg_length:.1f}** characters")
-
-                with insights_col2:
-                    st.subheader("Processing Status")
-                    if "annotated" in df.columns:
-                        annotated_pct = (df["annotated"] == 1).mean() * 100
-                        st.write(f"• **{annotated_pct:.1f}%** of names are annotated")
-
-                    if "ner_tagged" in df.columns:
-                        ner_pct = (df["ner_tagged"] == 1).mean() * 100
-                        st.write(f"• **{ner_pct:.1f}%** of names have NER tags")
-
             else:
                 st.warning("No processed data found. Please run data processing first.")
 
@@ -210,7 +52,7 @@ class Dashboard:
             st.error(f"Error loading dashboard data: {e}")
 
         # Recent experiments
-        st.header("Recent Experiments")
+        st.subheader("Recent Experiments")
         experiments = self.experiment_tracker.list_experiments()[:5]
 
         if experiments: