From 33c7aceb0c64dbb42c092a5004e664a280dc0e10 Mon Sep 17 00:00:00 2001 From: bernard-ng Date: Sun, 17 Aug 2025 16:03:46 +0200 Subject: [PATCH] feat: remove data heavy viz --- web/interfaces/dashboard.py | 162 +----------------------------------- 1 file changed, 2 insertions(+), 160 deletions(-) diff --git a/web/interfaces/dashboard.py b/web/interfaces/dashboard.py index e4af4da..804c66b 100644 --- a/web/interfaces/dashboard.py +++ b/web/interfaces/dashboard.py @@ -1,6 +1,4 @@ import pandas as pd -import plotly.express as px -import plotly.graph_objects as go import streamlit as st from core.utils.data_loader import OPTIMIZED_DTYPES @@ -21,101 +19,9 @@ class Dashboard: self.experiment_tracker = experiment_tracker self.experiment_runner = experiment_runner - def _create_gender_distribution_chart(self, df: pd.DataFrame): - """Create gender distribution pie chart""" - if "sex" in df.columns: - gender_counts = df["sex"].value_counts() - fig = px.pie( - values=gender_counts.values, - names=gender_counts.index, - title="Gender Distribution", - color_discrete_map={"m": "#3498db", "f": "#e74c3c"}, - ) - fig.update_traces(textposition="inside", textinfo="percent+label") - return fig - return None - - def _create_province_distribution_chart(self, df: pd.DataFrame): - """Create province distribution bar chart""" - if "province" in df.columns: - province_counts = df["province"].value_counts().head(15) # Top 15 provinces - fig = px.bar( - x=province_counts.index, - y=province_counts.values, - title="Top 15 Provinces by Name Count", - labels={"x": "Province", "y": "Number of Names"}, - ) - fig.update_layout(xaxis_tickangle=-45) - return fig - return None - - def _create_name_length_distribution(self, df: pd.DataFrame): - """Create name length distribution histogram""" - if "length" in df.columns: - fig = px.histogram( - df, - x="length", - title="Name Length Distribution", - labels={"length": "Name Length (characters)", "count": "Frequency"}, - nbins=30, - ) - fig.update_layout(bargap=0.1) - return fig - return None - - def _create_annotation_progress_chart(self, df: pd.DataFrame): - """Create annotation progress chart""" - if "annotated" in df.columns and "ner_tagged" in df.columns: - annotation_data = { - "Not Annotated": (df["annotated"] == 0).sum(), - "Annotated": (df["annotated"] == 1).sum(), - "NER Tagged": (df["ner_tagged"] == 1).sum(), - } - - fig = go.Figure( - data=[ - go.Bar( - x=list(annotation_data.keys()), - y=list(annotation_data.values()), - marker_color=["#95a5a6", "#2ecc71", "#9b59b6"], - ) - ] - ) - fig.update_layout( - title="Annotation Progress", - xaxis_title="Status", - yaxis_title="Number of Names", - ) - return fig - return None - - def _create_regional_analysis(self, df: pd.DataFrame): - """Create regional analysis chart""" - if "region" in df.columns and "sex" in df.columns: - regional_gender = pd.crosstab(df["region"], df["sex"]) - fig = px.bar( - regional_gender, - title="Gender Distribution by Region", - labels={"value": "Count", "index": "Region"}, - ) - fig.update_layout(xaxis_tickangle=-45) - return fig - return None - - def _create_words_distribution(self, df: pd.DataFrame): - """Create word count distribution""" - if "words" in df.columns: - fig = px.box( - df, - y="words", - title="Word Count Distribution in Names", - labels={"words": "Number of Words"}, - ) - return fig - return None - def index(self): st.title("Dashboard") + col1, col2, col3, col4 = st.columns(4) # Load basic statistics try: @@ -123,9 +29,6 @@ class Dashboard: if data_path.exists(): df = load_dataset(str(data_path)) - # Metrics row - col1, col2, col3, col4 = st.columns(4) - with col1: st.metric("Total Names", f"{len(df):,}") @@ -142,67 +45,6 @@ class Dashboard: gender_dist = df["sex"].value_counts() ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1) st.metric("F/M Ratio", f"{ratio:.2f}") - - # First row of charts - col1, col2 = st.columns(2) - - with col1: - gender_chart = self._create_gender_distribution_chart(df) - if gender_chart: - st.plotly_chart(gender_chart, use_container_width=True) - - with col2: - annotation_chart = self._create_annotation_progress_chart(df) - if annotation_chart: - st.plotly_chart(annotation_chart, use_container_width=True) - - # Second row of charts - col1, col2 = st.columns(2) - - with col1: - length_chart = self._create_name_length_distribution(df) - if length_chart: - st.plotly_chart(length_chart, use_container_width=True) - - with col2: - words_chart = self._create_words_distribution(df) - if words_chart: - st.plotly_chart(words_chart, use_container_width=True) - - # Full-width charts - province_chart = self._create_province_distribution_chart(df) - if province_chart: - st.plotly_chart(province_chart, use_container_width=True) - - regional_chart = self._create_regional_analysis(df) - if regional_chart: - st.plotly_chart(regional_chart, use_container_width=True) - - # Data insights section - st.header("🔍 Key Insights") - insights_col1, insights_col2 = st.columns(2) - - with insights_col1: - st.subheader("Dataset Overview") - total_names = len(df) - unique_provinces = df["province"].nunique() if "province" in df.columns else 0 - avg_length = df["length"].mean() if "length" in df.columns else 0 - - st.write(f"• **{total_names:,}** total names in the dataset") - st.write(f"• **{unique_provinces}** provinces represented") - if avg_length > 0: - st.write(f"• Average name length: **{avg_length:.1f}** characters") - - with insights_col2: - st.subheader("Processing Status") - if "annotated" in df.columns: - annotated_pct = (df["annotated"] == 1).mean() * 100 - st.write(f"• **{annotated_pct:.1f}%** of names are annotated") - - if "ner_tagged" in df.columns: - ner_pct = (df["ner_tagged"] == 1).mean() * 100 - st.write(f"• **{ner_pct:.1f}%** of names have NER tags") - else: st.warning("No processed data found. Please run data processing first.") @@ -210,7 +52,7 @@ class Dashboard: st.error(f"Error loading dashboard data: {e}") # Recent experiments - st.header("Recent Experiments") + st.subheader("Recent Experiments") experiments = self.experiment_tracker.list_experiments()[:5] if experiments: