import pandas as pd import plotly.express as px import plotly.graph_objects as go import streamlit as st from core.utils.data_loader import OPTIMIZED_DTYPES @st.cache_data def load_dataset(file_path: str) -> pd.DataFrame: try: return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES) except Exception as e: st.error(f"Error loading dataset: {e}") return pd.DataFrame() class Dashboard: def __init__(self, config, experiment_tracker, experiment_runner): self.config = config self.experiment_tracker = experiment_tracker self.experiment_runner = experiment_runner def _create_gender_distribution_chart(self, df: pd.DataFrame): """Create gender distribution pie chart""" if "sex" in df.columns: gender_counts = df["sex"].value_counts() fig = px.pie( values=gender_counts.values, names=gender_counts.index, title="Gender Distribution", color_discrete_map={"m": "#3498db", "f": "#e74c3c"}, ) fig.update_traces(textposition="inside", textinfo="percent+label") return fig return None def _create_province_distribution_chart(self, df: pd.DataFrame): """Create province distribution bar chart""" if "province" in df.columns: province_counts = df["province"].value_counts().head(15) # Top 15 provinces fig = px.bar( x=province_counts.index, y=province_counts.values, title="Top 15 Provinces by Name Count", labels={"x": "Province", "y": "Number of Names"}, ) fig.update_layout(xaxis_tickangle=-45) return fig return None def _create_name_length_distribution(self, df: pd.DataFrame): """Create name length distribution histogram""" if "length" in df.columns: fig = px.histogram( df, x="length", title="Name Length Distribution", labels={"length": "Name Length (characters)", "count": "Frequency"}, nbins=30, ) fig.update_layout(bargap=0.1) return fig return None def _create_annotation_progress_chart(self, df: pd.DataFrame): """Create annotation progress chart""" if "annotated" in df.columns and "ner_tagged" in df.columns: annotation_data = { "Not Annotated": (df["annotated"] == 0).sum(), "Annotated": (df["annotated"] == 1).sum(), "NER Tagged": (df["ner_tagged"] == 1).sum(), } fig = go.Figure( data=[ go.Bar( x=list(annotation_data.keys()), y=list(annotation_data.values()), marker_color=["#95a5a6", "#2ecc71", "#9b59b6"], ) ] ) fig.update_layout( title="Annotation Progress", xaxis_title="Status", yaxis_title="Number of Names", ) return fig return None def _create_regional_analysis(self, df: pd.DataFrame): """Create regional analysis chart""" if "region" in df.columns and "sex" in df.columns: regional_gender = pd.crosstab(df["region"], df["sex"]) fig = px.bar( regional_gender, title="Gender Distribution by Region", labels={"value": "Count", "index": "Region"}, ) fig.update_layout(xaxis_tickangle=-45) return fig return None def _create_words_distribution(self, df: pd.DataFrame): """Create word count distribution""" if "words" in df.columns: fig = px.box( df, y="words", title="Word Count Distribution in Names", labels={"words": "Number of Words"}, ) return fig return None def index(self): st.title("Dashboard") # Load basic statistics try: data_path = self.config.paths.get_data_path(self.config.data.output_files["featured"]) if data_path.exists(): df = load_dataset(str(data_path)) # Metrics row col1, col2, col3, col4 = st.columns(4) with col1: st.metric("Total Names", f"{len(df):,}") with col2: annotated = (df.get("annotated", 0) == 1).sum() st.metric("Annotated Names", f"{annotated:,}") with col3: provinces = df["province"].nunique() if "province" in df.columns else 0 st.metric("Provinces", provinces) with col4: if "sex" in df.columns: gender_dist = df["sex"].value_counts() ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1) st.metric("F/M Ratio", f"{ratio:.2f}") # First row of charts col1, col2 = st.columns(2) with col1: gender_chart = self._create_gender_distribution_chart(df) if gender_chart: st.plotly_chart(gender_chart, use_container_width=True) with col2: annotation_chart = self._create_annotation_progress_chart(df) if annotation_chart: st.plotly_chart(annotation_chart, use_container_width=True) # Second row of charts col1, col2 = st.columns(2) with col1: length_chart = self._create_name_length_distribution(df) if length_chart: st.plotly_chart(length_chart, use_container_width=True) with col2: words_chart = self._create_words_distribution(df) if words_chart: st.plotly_chart(words_chart, use_container_width=True) # Full-width charts province_chart = self._create_province_distribution_chart(df) if province_chart: st.plotly_chart(province_chart, use_container_width=True) regional_chart = self._create_regional_analysis(df) if regional_chart: st.plotly_chart(regional_chart, use_container_width=True) # Data insights section st.header("🔍 Key Insights") insights_col1, insights_col2 = st.columns(2) with insights_col1: st.subheader("Dataset Overview") total_names = len(df) unique_provinces = df["province"].nunique() if "province" in df.columns else 0 avg_length = df["length"].mean() if "length" in df.columns else 0 st.write(f"• **{total_names:,}** total names in the dataset") st.write(f"• **{unique_provinces}** provinces represented") if avg_length > 0: st.write(f"• Average name length: **{avg_length:.1f}** characters") with insights_col2: st.subheader("Processing Status") if "annotated" in df.columns: annotated_pct = (df["annotated"] == 1).mean() * 100 st.write(f"• **{annotated_pct:.1f}%** of names are annotated") if "ner_tagged" in df.columns: ner_pct = (df["ner_tagged"] == 1).mean() * 100 st.write(f"• **{ner_pct:.1f}%** of names have NER tags") else: st.warning("No processed data found. Please run data processing first.") except Exception as e: st.error(f"Error loading dashboard data: {e}") # Recent experiments st.header("Recent Experiments") experiments = self.experiment_tracker.list_experiments()[:5] if experiments: exp_data = [] for exp in experiments: exp_data.append( { "Name": exp.config.name, "Model": exp.config.model_type, "Status": exp.status.value, "Accuracy": ( f"{exp.test_metrics.get('accuracy', 0):.3f}" if exp.test_metrics else "N/A" ), "Date": exp.start_time.strftime("%Y-%m-%d %H:%M"), } ) st.dataframe(pd.DataFrame(exp_data), use_container_width=True) else: st.info("No experiments found. Create your first experiment in the Experiments tab!")