# Source: drc-ners-nlp/interface/data_overview.py (155 lines, 5.2 KiB, Python)

from datetime import datetime
import pandas as pd
import plotly.express as px
import streamlit as st
from core.utils import get_data_file_path
def load_dataset(file_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame, reporting failures in the UI.

    Args:
        file_path: Location of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The parsed DataFrame, or an empty DataFrame when reading raised.
        In the failure case the exception text is shown via ``st.error``.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as exc:
        # Surface the problem on the page instead of propagating it; callers
        # detect the failure through the empty frame.
        st.error(f"Error loading dataset: {exc}")
        frame = pd.DataFrame()
    return frame
class DataOverview:
    """Streamlit page that summarises one of the configured datasets.

    After the user picks a dataset, the page renders headline metrics,
    data-quality charts, name-structure charts, a row sample and a CSV
    export control.
    """

    # Preferred bounds for the "Number of rows to display" slider. They are
    # clamped to the dataset size by ``_sample_slider_bounds``.
    _MIN_SAMPLE_ROWS = 10
    _MAX_SAMPLE_ROWS = 1000
    _DEFAULT_SAMPLE_ROWS = 50

    def __init__(self, config):
        """Keep a reference to the application configuration.

        Args:
            config: Object exposing ``data.input_file`` and the
                ``data.output_files`` mapping (keys ``featured``,
                ``evaluation``, ``males``, ``females``). It is also passed
                to ``get_data_file_path``.
        """
        self.config = config

    @classmethod
    def _sample_slider_bounds(cls, row_count: int) -> tuple:
        """Return ``(lower, upper, default)`` slider bounds for a dataset.

        The preferred bounds are clamped so that
        ``lower <= default <= upper <= row_count`` always holds. When
        ``lower == upper`` there is nothing to choose and the caller should
        not draw a slider.

        Args:
            row_count: Number of rows in the dataset being displayed.

        Returns:
            A 3-tuple of ints ``(lower, upper, default)``.
        """
        upper = min(cls._MAX_SAMPLE_ROWS, row_count)
        lower = min(cls._MIN_SAMPLE_ROWS, upper)
        default = max(lower, min(cls._DEFAULT_SAMPLE_ROWS, upper))
        return lower, upper, default

    def index(self):
        """Render the "Data Overview" page.

        Returns early (after showing a warning or error) when the selected
        dataset file does not exist or loads as an empty DataFrame.
        """
        st.header("Data Overview")

        data_files = {
            "Names": self.config.data.input_file,
            "Featured Dataset": self.config.data.output_files["featured"],
            "Evaluation Dataset": self.config.data.output_files["evaluation"],
            "Male Names": self.config.data.output_files["males"],
            "Female Names": self.config.data.output_files["females"],
        }

        selected_file = st.selectbox("Select Dataset", list(data_files.keys()))
        file_path = get_data_file_path(data_files[selected_file], self.config)

        if not file_path.exists():
            st.warning(f"Dataset not found: {file_path}")
            st.warning("Please run data processing first to generate datasets.")
            return

        # Load and display data
        df = load_dataset(str(file_path))
        if df.empty:
            st.error("Failed to load dataset")
            return

        # Basic statistics: each metric is shown only when its column exists.
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Total Records", f"{len(df):,}")

        with col2:
            if "annotated" in df.columns:
                # Share of rows whose "annotated" value equals 1.
                annotated_pct = (df["annotated"] == 1).mean() * 100
                st.metric("Annotated", f"{annotated_pct:.1f}%")

        with col3:
            if "words" in df.columns:
                avg_words = df["words"].mean()
                st.metric("Avg Words", f"{avg_words:.1f}")

        with col4:
            if "length" in df.columns:
                avg_length = df["length"].mean()
                st.metric("Avg Length", f"{avg_length:.0f}")

        # Data quality analysis
        st.subheader("Data Quality Analysis")

        col1, col2 = st.columns(2)

        with col1:
            # Missing values per column; chart only when at least one exists.
            missing_data = df.isnull().sum()
            if missing_data.sum() > 0:
                fig = px.bar(
                    x=missing_data.index, y=missing_data.values, title="Missing Values by Column"
                )
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.success("No missing values found")

        with col2:
            # Gender distribution
            if "sex" in df.columns:
                gender_counts = df["sex"].value_counts()
                fig = px.pie(
                    values=gender_counts.values,
                    names=gender_counts.index,
                    title="Gender Distribution",
                )
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)

        # Word count distribution
        if "words" in df.columns:
            st.subheader("Name Structure Analysis")

            col1, col2 = st.columns(2)

            with col1:
                # sort_index orders the bars by word count, not by frequency.
                word_dist = df["words"].value_counts().sort_index()
                fig = px.bar(
                    x=word_dist.index,
                    y=word_dist.values,
                    title="Distribution of Word Count in Names",
                )
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)

            with col2:
                # Province distribution
                if "province" in df.columns:
                    province_counts = df["province"].value_counts().head(10)
                    fig = px.bar(
                        x=province_counts.values,
                        y=province_counts.index,
                        orientation="h",
                        title="Top 10 Provinces by Name Count",
                    )
                    fig.update_layout(height=400)
                    st.plotly_chart(fig, use_container_width=True)

        # Sample data
        st.subheader("Sample Data")

        # Display columns selector. df is known to be non-empty here because
        # the empty case returned above.
        preferred_columns = ["name", "sex", "province", "words"]
        columns_to_show = st.multiselect(
            "Select columns to display",
            df.columns.tolist(),
            default=(
                preferred_columns
                if all(col in df.columns for col in preferred_columns)
                else df.columns[:5].tolist()
            ),
        )

        if columns_to_show:
            lower, upper, default = self._sample_slider_bounds(len(df))
            if lower < upper:
                sample_size = st.slider("Number of rows to display", lower, upper, default)
            else:
                # Dataset too small for a meaningful range: show every row
                # rather than drawing a slider whose bounds coincide.
                sample_size = upper
            st.dataframe(df[columns_to_show].head(sample_size), use_container_width=True)

        # Data export. The CSV is serialised only after the first button is
        # pressed, so the whole dataset is not re-encoded on every rerun.
        st.subheader("Export Data")
        if st.button("Download as CSV"):
            csv = df.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name=f"{selected_file.lower().replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.csv",
                mime="text/csv",
            )