def _create_gender_distribution_chart(self, df: pd.DataFrame) -> "go.Figure | None":
    """Build a pie chart showing the share of each value in the ``sex`` column.

    Args:
        df: Dataset to summarise.

    Returns:
        A Plotly figure, or ``None`` when ``df`` has no ``sex`` column.
    """
    if "sex" not in df.columns:
        return None

    gender_counts = df["sex"].value_counts()
    fig = px.pie(
        values=gender_counts.values,
        names=gender_counts.index,
        # color_discrete_map is keyed by the values passed as ``color``.
        # Without ``color`` Plotly Express ignores the map and falls back to
        # the default palette, so the m/f colours below were never applied.
        color=gender_counts.index,
        title="Gender Distribution",
        color_discrete_map={"m": "#3498db", "f": "#e74c3c"},
    )
    fig.update_traces(textposition="inside", textinfo="percent+label")
    return fig


def _create_province_distribution_chart(self, df: pd.DataFrame) -> "go.Figure | None":
    """Build a bar chart of the 15 most frequent values in the ``province`` column.

    Args:
        df: Dataset to summarise.

    Returns:
        A Plotly figure, or ``None`` when ``df`` has no ``province`` column.
    """
    if "province" not in df.columns:
        return None

    # value_counts() sorts by descending frequency, so head(15) is the top 15.
    province_counts = df["province"].value_counts().head(15)
    fig = px.bar(
        x=province_counts.index,
        y=province_counts.values,
        title="Top 15 Provinces by Name Count",
        labels={"x": "Province", "y": "Number of Names"},
    )
    fig.update_layout(xaxis_tickangle=-45)
    return fig


def _create_name_length_distribution(self, df: pd.DataFrame) -> "go.Figure | None":
    """Build a 30-bin histogram of the ``length`` column.

    Args:
        df: Dataset to summarise.

    Returns:
        A Plotly figure, or ``None`` when ``df`` has no ``length`` column.
    """
    if "length" not in df.columns:
        return None

    fig = px.histogram(
        df,
        x="length",
        title="Name Length Distribution",
        labels={"length": "Name Length (characters)", "count": "Frequency"},
        nbins=30,
    )
    fig.update_layout(bargap=0.1)
    return fig


def _create_annotation_progress_chart(self, df: pd.DataFrame) -> "go.Figure | None":
    """Build a bar chart counting rows by annotation / NER-tagging flag.

    Rows are counted where ``annotated`` equals 0, where ``annotated`` equals 1
    and where ``ner_tagged`` equals 1.

    Args:
        df: Dataset to summarise.

    Returns:
        A Plotly figure, or ``None`` unless ``df`` has both the ``annotated``
        and the ``ner_tagged`` columns.
    """
    if "annotated" not in df.columns or "ner_tagged" not in df.columns:
        return None

    annotation_data = {
        "Not Annotated": (df["annotated"] == 0).sum(),
        "Annotated": (df["annotated"] == 1).sum(),
        "NER Tagged": (df["ner_tagged"] == 1).sum(),
    }

    fig = go.Figure(
        data=[
            go.Bar(
                x=list(annotation_data.keys()),
                y=list(annotation_data.values()),
                # One colour per bar, in the same order as annotation_data.
                marker_color=["#95a5a6", "#2ecc71", "#9b59b6"],
            )
        ]
    )
    fig.update_layout(
        title="Annotation Progress",
        xaxis_title="Status",
        yaxis_title="Number of Names",
    )
    return fig


def _create_regional_analysis(self, df: pd.DataFrame) -> "go.Figure | None":
    """Build a bar chart of ``sex`` counts for every value of ``region``.

    Args:
        df: Dataset to summarise.

    Returns:
        A Plotly figure, or ``None`` unless ``df`` has both the ``region`` and
        the ``sex`` columns.
    """
    if "region" not in df.columns or "sex" not in df.columns:
        return None

    regional_gender = pd.crosstab(df["region"], df["sex"])
    fig = px.bar(
        regional_gender,
        title="Gender Distribution by Region",
        # crosstab names the index after the "region" series and wide-form
        # Plotly Express labels the axis with that name, so the relabel has to
        # be keyed on "region". "index" is kept for an unnamed index.
        labels={"value": "Count", "region": "Region", "index": "Region"},
    )
    fig.update_layout(xaxis_tickangle=-45)
    return fig


def _create_words_distribution(self, df: pd.DataFrame) -> "go.Figure | None":
    """Build a box plot of the ``words`` column.

    Args:
        df: Dataset to summarise.

    Returns:
        A Plotly figure, or ``None`` when ``df`` has no ``words`` column.
    """
    if "words" not in df.columns:
        return None

    return px.box(
        df,
        y="words",
        title="Word Count Distribution in Names",
        labels={"words": "Number of Words"},
    )
with col1: + gender_chart = self._create_gender_distribution_chart(df) + if gender_chart: + st.plotly_chart(gender_chart, use_container_width=True) + + with col2: + annotation_chart = self._create_annotation_progress_chart(df) + if annotation_chart: + st.plotly_chart(annotation_chart, use_container_width=True) + + # Second row of charts + col1, col2 = st.columns(2) + + with col1: + length_chart = self._create_name_length_distribution(df) + if length_chart: + st.plotly_chart(length_chart, use_container_width=True) + + with col2: + words_chart = self._create_words_distribution(df) + if words_chart: + st.plotly_chart(words_chart, use_container_width=True) + + # Full-width charts + province_chart = self._create_province_distribution_chart(df) + if province_chart: + st.plotly_chart(province_chart, use_container_width=True) + + regional_chart = self._create_regional_analysis(df) + if regional_chart: + st.plotly_chart(regional_chart, use_container_width=True) + + # Data insights section + st.header("🔍 Key Insights") + insights_col1, insights_col2 = st.columns(2) + + with insights_col1: + st.subheader("Dataset Overview") + total_names = len(df) + unique_provinces = df["province"].nunique() if "province" in df.columns else 0 + avg_length = df["length"].mean() if "length" in df.columns else 0 + + st.write(f"• **{total_names:,}** total names in the dataset") + st.write(f"• **{unique_provinces}** provinces represented") + if avg_length > 0: + st.write(f"• Average name length: **{avg_length:.1f}** characters") + + with insights_col2: + st.subheader("Processing Status") + if "annotated" in df.columns: + annotated_pct = (df["annotated"] == 1).mean() * 100 + st.write(f"• **{annotated_pct:.1f}%** of names are annotated") + + if "ner_tagged" in df.columns: + ner_pct = (df["ner_tagged"] == 1).mean() * 100 + st.write(f"• **{ner_pct:.1f}%** of names have NER tags") + else: st.warning("No processed data found. 
Please run data processing first.") @@ -52,7 +210,7 @@ class Dashboard: st.error(f"Error loading dashboard data: {e}") # Recent experiments - st.subheader("Recent Experiments") + st.header("Recent Experiments") experiments = self.experiment_tracker.list_experiments()[:5] if experiments: