feat: add visualizations for gender, province, and name length distributions in dashboard
This commit is contained in:
+160
-2
@@ -1,4 +1,6 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import plotly.express as px
|
||||||
|
import plotly.graph_objects as go
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||||
@@ -19,9 +21,101 @@ class Dashboard:
|
|||||||
self.experiment_tracker = experiment_tracker
|
self.experiment_tracker = experiment_tracker
|
||||||
self.experiment_runner = experiment_runner
|
self.experiment_runner = experiment_runner
|
||||||
|
|
||||||
|
def _create_gender_distribution_chart(self, df: pd.DataFrame):
    """Create a pie chart of the gender split found in ``df``.

    Args:
        df: Dataset to summarize; the chart is built from its ``sex`` column.

    Returns:
        A Plotly pie figure with one slice per distinct ``sex`` value, or
        ``None`` when ``df`` has no ``sex`` column.
    """
    if "sex" not in df.columns:
        return None

    gender_counts = df["sex"].value_counts()
    fig = px.pie(
        values=gender_counts.values,
        names=gender_counts.index,
        # color_discrete_map is looked up by the values passed as `color`;
        # without this argument the explicit m/f palette below is ignored
        # and the default colorway is used instead.
        color=gender_counts.index,
        title="Gender Distribution",
        color_discrete_map={"m": "#3498db", "f": "#e74c3c"},
    )
    # Print the percentage and the category label inside each slice.
    fig.update_traces(textposition="inside", textinfo="percent+label")
    return fig
|
||||||
|
|
||||||
|
def _create_province_distribution_chart(self, df: pd.DataFrame):
    """Build a bar chart of the most frequent provinces in ``df``.

    Args:
        df: Dataset to summarize; counts are taken from its ``province``
            column.

    Returns:
        A Plotly bar figure covering the 15 most frequent provinces, or
        ``None`` when ``df`` has no ``province`` column.
    """
    if "province" not in df.columns:
        return None

    # value_counts() sorts by frequency (descending), so head(15) keeps the
    # 15 most common provinces.
    top_provinces = df["province"].value_counts().head(15)
    axis_labels = {"x": "Province", "y": "Number of Names"}

    chart = px.bar(
        x=top_provinces.index,
        y=top_provinces.values,
        title="Top 15 Provinces by Name Count",
        labels=axis_labels,
    )
    # Slant the tick labels so that long province names do not overlap.
    chart.update_layout(xaxis_tickangle=-45)
    return chart
|
||||||
|
|
||||||
|
def _create_name_length_distribution(self, df: pd.DataFrame):
    """Create a histogram of name lengths.

    Args:
        df: Dataset to summarize; the histogram is built from its ``length``
            column.

    Returns:
        A Plotly histogram figure (at most 30 bins requested), or ``None``
        when ``df`` has no ``length`` column.
    """
    if "length" not in df.columns:
        return None

    fig = px.histogram(
        df,
        x="length",
        title="Name Length Distribution",
        labels={"length": "Name Length (characters)", "count": "Frequency"},
        nbins=30,
    )
    # The histogram's y-axis title is generated from the aggregation
    # ("count") rather than looked up in `labels`, so the "count" entry above
    # does not rename it; set the axis title explicitly.
    fig.update_layout(bargap=0.1, yaxis_title="Frequency")
    return fig
|
||||||
|
|
||||||
|
def _create_annotation_progress_chart(self, df: pd.DataFrame):
    """Build a bar chart summarizing annotation and NER-tagging progress.

    Args:
        df: Dataset to summarize; both the ``annotated`` and ``ner_tagged``
            columns are required.

    Returns:
        A Plotly figure with three bars (rows where ``annotated == 0``, rows
        where ``annotated == 1``, rows where ``ner_tagged == 1``), or ``None``
        when either column is missing.
    """
    if "annotated" not in df.columns or "ner_tagged" not in df.columns:
        return None

    annotated = df["annotated"]
    statuses = ["Not Annotated", "Annotated", "NER Tagged"]
    # Summing a boolean mask counts the rows that match the condition.
    totals = [
        (annotated == 0).sum(),
        (annotated == 1).sum(),
        (df["ner_tagged"] == 1).sum(),
    ]

    bars = go.Bar(
        x=statuses,
        y=totals,
        marker_color=["#95a5a6", "#2ecc71", "#9b59b6"],
    )
    chart = go.Figure(data=[bars])
    chart.update_layout(
        title="Annotation Progress",
        xaxis_title="Status",
        yaxis_title="Number of Names",
    )
    return chart
|
||||||
|
|
||||||
|
def _create_regional_analysis(self, df: pd.DataFrame):
    """Create a bar chart of gender counts per region.

    Args:
        df: Dataset to summarize; both the ``region`` and ``sex`` columns
            are required.

    Returns:
        A Plotly bar figure built from the region-by-sex contingency table
        (one bar series per ``sex`` value), or ``None`` when either column
        is missing.
    """
    if "region" not in df.columns or "sex" not in df.columns:
        return None

    # Rows are regions, columns are sex values, cells are row counts.
    regional_gender = pd.crosstab(df["region"], df["sex"])
    fig = px.bar(
        regional_gender,
        title="Gender Distribution by Region",
        # The crosstab index is named "region", and wide-form input uses the
        # index name (not "index") as the x column, so the label must be
        # keyed on "region". "index" is kept as a fallback for an unnamed
        # index.
        labels={"value": "Count", "index": "Region", "region": "Region"},
    )
    # Slant the tick labels so that long region names do not overlap.
    fig.update_layout(xaxis_tickangle=-45)
    return fig
|
||||||
|
|
||||||
|
def _create_words_distribution(self, df: pd.DataFrame):
    """Build a box plot of the per-name word counts.

    Args:
        df: Dataset to summarize; the plot is built from its ``words``
            column.

    Returns:
        A Plotly box-plot figure, or ``None`` when ``df`` has no ``words``
        column.
    """
    if "words" not in df.columns:
        return None

    return px.box(
        df,
        y="words",
        title="Word Count Distribution in Names",
        labels={"words": "Number of Words"},
    )
|
||||||
|
|
||||||
def index(self):
|
def index(self):
|
||||||
st.title("Dashboard")
|
st.title("Dashboard")
|
||||||
col1, col2, col3, col4 = st.columns(4)
|
|
||||||
|
|
||||||
# Load basic statistics
|
# Load basic statistics
|
||||||
try:
|
try:
|
||||||
@@ -29,6 +123,9 @@ class Dashboard:
|
|||||||
if data_path.exists():
|
if data_path.exists():
|
||||||
df = load_dataset(str(data_path))
|
df = load_dataset(str(data_path))
|
||||||
|
|
||||||
|
# Metrics row
|
||||||
|
col1, col2, col3, col4 = st.columns(4)
|
||||||
|
|
||||||
with col1:
|
with col1:
|
||||||
st.metric("Total Names", f"{len(df):,}")
|
st.metric("Total Names", f"{len(df):,}")
|
||||||
|
|
||||||
@@ -45,6 +142,67 @@ class Dashboard:
|
|||||||
gender_dist = df["sex"].value_counts()
|
gender_dist = df["sex"].value_counts()
|
||||||
ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
|
ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
|
||||||
st.metric("F/M Ratio", f"{ratio:.2f}")
|
st.metric("F/M Ratio", f"{ratio:.2f}")
|
||||||
|
|
||||||
|
# First row of charts
|
||||||
|
col1, col2 = st.columns(2)
|
||||||
|
|
||||||
|
with col1:
|
||||||
|
gender_chart = self._create_gender_distribution_chart(df)
|
||||||
|
if gender_chart:
|
||||||
|
st.plotly_chart(gender_chart, use_container_width=True)
|
||||||
|
|
||||||
|
with col2:
|
||||||
|
annotation_chart = self._create_annotation_progress_chart(df)
|
||||||
|
if annotation_chart:
|
||||||
|
st.plotly_chart(annotation_chart, use_container_width=True)
|
||||||
|
|
||||||
|
# Second row of charts
|
||||||
|
col1, col2 = st.columns(2)
|
||||||
|
|
||||||
|
with col1:
|
||||||
|
length_chart = self._create_name_length_distribution(df)
|
||||||
|
if length_chart:
|
||||||
|
st.plotly_chart(length_chart, use_container_width=True)
|
||||||
|
|
||||||
|
with col2:
|
||||||
|
words_chart = self._create_words_distribution(df)
|
||||||
|
if words_chart:
|
||||||
|
st.plotly_chart(words_chart, use_container_width=True)
|
||||||
|
|
||||||
|
# Full-width charts
|
||||||
|
province_chart = self._create_province_distribution_chart(df)
|
||||||
|
if province_chart:
|
||||||
|
st.plotly_chart(province_chart, use_container_width=True)
|
||||||
|
|
||||||
|
regional_chart = self._create_regional_analysis(df)
|
||||||
|
if regional_chart:
|
||||||
|
st.plotly_chart(regional_chart, use_container_width=True)
|
||||||
|
|
||||||
|
# Data insights section
|
||||||
|
st.header("🔍 Key Insights")
|
||||||
|
insights_col1, insights_col2 = st.columns(2)
|
||||||
|
|
||||||
|
with insights_col1:
|
||||||
|
st.subheader("Dataset Overview")
|
||||||
|
total_names = len(df)
|
||||||
|
unique_provinces = df["province"].nunique() if "province" in df.columns else 0
|
||||||
|
avg_length = df["length"].mean() if "length" in df.columns else 0
|
||||||
|
|
||||||
|
st.write(f"• **{total_names:,}** total names in the dataset")
|
||||||
|
st.write(f"• **{unique_provinces}** provinces represented")
|
||||||
|
if avg_length > 0:
|
||||||
|
st.write(f"• Average name length: **{avg_length:.1f}** characters")
|
||||||
|
|
||||||
|
with insights_col2:
|
||||||
|
st.subheader("Processing Status")
|
||||||
|
if "annotated" in df.columns:
|
||||||
|
annotated_pct = (df["annotated"] == 1).mean() * 100
|
||||||
|
st.write(f"• **{annotated_pct:.1f}%** of names are annotated")
|
||||||
|
|
||||||
|
if "ner_tagged" in df.columns:
|
||||||
|
ner_pct = (df["ner_tagged"] == 1).mean() * 100
|
||||||
|
st.write(f"• **{ner_pct:.1f}%** of names have NER tags")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
st.warning("No processed data found. Please run data processing first.")
|
st.warning("No processed data found. Please run data processing first.")
|
||||||
|
|
||||||
@@ -52,7 +210,7 @@ class Dashboard:
|
|||||||
st.error(f"Error loading dashboard data: {e}")
|
st.error(f"Error loading dashboard data: {e}")
|
||||||
|
|
||||||
# Recent experiments
|
# Recent experiments
|
||||||
st.subheader("Recent Experiments")
|
st.header("Recent Experiments")
|
||||||
experiments = self.experiment_tracker.list_experiments()[:5]
|
experiments = self.experiment_tracker.list_experiments()[:5]
|
||||||
|
|
||||||
if experiments:
|
if experiments:
|
||||||
|
|||||||
Reference in New Issue
Block a user