"""Streamlit "Data Overview" page: dataset selection, charts, preview, export."""

from datetime import datetime

import pandas as pd
import plotly.express as px
import streamlit as st

from core.utils import get_data_file_path

# Columns pre-selected in the sample table when every one of them is present.
_DEFAULT_SAMPLE_COLUMNS = ["name", "sex", "province", "words"]

# Bounds for the "rows to display" slider in the sample table.
_MIN_SAMPLE_ROWS = 10
_MAX_SAMPLE_ROWS = 1000
_DEFAULT_SAMPLE_ROWS = 50


def load_dataset(file_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    Any exception raised while reading is reported in the UI through
    ``st.error`` and an empty DataFrame is returned instead, so callers can
    detect failure with ``df.empty``.
    """
    try:
        return pd.read_csv(file_path)
    except Exception as e:  # UI boundary: surface the problem, do not crash the page
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame()


class DataOverview:
    """Page that lets the user pick a dataset and inspect it."""

    def __init__(self, config):
        # ``config`` must expose ``data.input_file`` and ``data.output_files``
        # (see ``index``); it is also forwarded to ``get_data_file_path``.
        self.config = config

    def index(self):
        """Render the page: selector, metrics, charts, sample table, export."""
        st.header("Data Overview")

        data_files = {
            "Names": self.config.data.input_file,
            "Featured Dataset": self.config.data.output_files["featured"],
            "Evaluation Dataset": self.config.data.output_files["evaluation"],
            "Male Names": self.config.data.output_files["males"],
            "Female Names": self.config.data.output_files["females"],
        }

        selected_file = st.selectbox("Select Dataset", list(data_files.keys()))
        file_path = get_data_file_path(data_files[selected_file], self.config)

        if not file_path.exists():
            st.warning(f"Dataset not found: {file_path}")
            st.warning("Please run data processing first to generate datasets.")
            return

        # Load and display data
        df = load_dataset(str(file_path))
        if df.empty:
            # Covers both a read failure (already reported by load_dataset)
            # and a file that parsed to zero rows.
            st.error("Failed to load dataset")
            return

        self._render_summary_metrics(df)
        self._render_quality_analysis(df)
        self._render_name_structure(df)
        self._render_sample(df)
        self._render_export(df, selected_file)

    @staticmethod
    def _render_summary_metrics(df: pd.DataFrame) -> None:
        """Show headline metrics; optional ones appear only if their column exists."""
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Total Records", f"{len(df):,}")

        with col2:
            if "annotated" in df.columns:
                # Share of rows whose ``annotated`` value equals 1.
                annotated_pct = (df["annotated"] == 1).mean() * 100
                st.metric("Annotated", f"{annotated_pct:.1f}%")

        with col3:
            if "words" in df.columns:
                avg_words = df["words"].mean()
                st.metric("Avg Words", f"{avg_words:.1f}")

        with col4:
            if "length" in df.columns:
                avg_length = df["length"].mean()
                st.metric("Avg Length", f"{avg_length:.0f}")

    @staticmethod
    def _render_quality_analysis(df: pd.DataFrame) -> None:
        """Show missing-value counts per column and the ``sex`` distribution."""
        st.subheader("Data Quality Analysis")
        col1, col2 = st.columns(2)

        with col1:
            # Missing values
            missing_data = df.isnull().sum()
            if missing_data.sum() > 0:
                fig = px.bar(
                    x=missing_data.index,
                    y=missing_data.values,
                    title="Missing Values by Column",
                )
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.success("No missing values found")

        with col2:
            # Gender distribution
            if "sex" in df.columns:
                gender_counts = df["sex"].value_counts()
                fig = px.pie(
                    values=gender_counts.values,
                    names=gender_counts.index,
                    title="Gender Distribution",
                )
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)

    @staticmethod
    def _render_name_structure(df: pd.DataFrame) -> None:
        """Show word-count and top-province charts; skipped without a ``words`` column."""
        if "words" not in df.columns:
            return

        st.subheader("Name Structure Analysis")
        col1, col2 = st.columns(2)

        with col1:
            # Word count distribution
            word_dist = df["words"].value_counts().sort_index()
            fig = px.bar(
                x=word_dist.index,
                y=word_dist.values,
                title="Distribution of Word Count in Names",
            )
            fig.update_layout(height=400)
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            # Province distribution
            if "province" in df.columns:
                province_counts = df["province"].value_counts().head(10)
                fig = px.bar(
                    x=province_counts.values,
                    y=province_counts.index,
                    orientation="h",
                    title="Top 10 Provinces by Name Count",
                )
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)

    @staticmethod
    def _render_sample(df: pd.DataFrame) -> None:
        """Show the first rows of the user-selected columns.

        ``df`` is expected to be non-empty (``index`` returns early otherwise).
        """
        st.subheader("Sample Data")

        # Display columns selector
        if all(col in df.columns for col in _DEFAULT_SAMPLE_COLUMNS):
            default_columns = list(_DEFAULT_SAMPLE_COLUMNS)
        else:
            default_columns = df.columns[:5].tolist()

        columns_to_show = st.multiselect(
            "Select columns to display",
            df.columns.tolist(),
            default=default_columns,
        )
        if not columns_to_show:
            return

        total_rows = len(df)
        if total_rows > _MIN_SAMPLE_ROWS:
            # Clamp the default to the row count: a fixed default of 50 is
            # outside the slider range for datasets with fewer than 50 rows.
            sample_size = st.slider(
                "Number of rows to display",
                _MIN_SAMPLE_ROWS,
                min(_MAX_SAMPLE_ROWS, total_rows),
                min(_DEFAULT_SAMPLE_ROWS, total_rows),
            )
        else:
            # Too few rows for a meaningful (or even valid) slider range:
            # show everything.
            sample_size = total_rows

        st.dataframe(df[columns_to_show].head(sample_size), use_container_width=True)

    @staticmethod
    def _render_export(df: pd.DataFrame, selected_file: str) -> None:
        """Offer the full dataset as a CSV download named after the selection."""
        st.subheader("Export Data")

        # The CSV is only serialised once the user asks for it; the actual
        # download button appears after this first click.
        if st.button("Download as CSV"):
            csv = df.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name=f"{selected_file.lower().replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.csv",
                mime="text/csv",
            )