feat: web application multipage support

This commit is contained in:
2025-08-16 19:05:24 +02:00
parent 7b652d6999
commit 84f7d41a84
38 changed files with 765 additions and 507 deletions
+374
View File
@@ -0,0 +1,374 @@
from datetime import datetime
from typing import Optional
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from core.utils.data_loader import OPTIMIZED_DTYPES
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
class Predictions:
def __init__(
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
def index(self):
st.title("Predictions")
# Load available models
experiments = self.experiment_tracker.list_experiments()
completed_experiments = [
e for e in experiments if e.status.value == "completed" and e.model_path
]
if not completed_experiments:
st.warning("No trained models available. Please run some experiments first.")
return
# Model selection
model_options = {
f"{exp.config.name} (Acc: {exp.test_metrics.get('accuracy', 0):.3f})": exp
for exp in completed_experiments
if exp.test_metrics
}
selected_model_name = st.selectbox("Select Model", list(model_options.keys()))
if not selected_model_name:
return
selected_experiment = model_options[selected_model_name]
# Prediction modes
prediction_mode = st.radio(
"Prediction Mode", ["Single Name", "Batch Upload", "Dataset Prediction"]
)
if prediction_mode == "Single Name":
self.show_single_prediction(selected_experiment)
elif prediction_mode == "Batch Upload":
self.show_batch_prediction(selected_experiment)
elif prediction_mode == "Dataset Prediction":
self.show_dataset_prediction(selected_experiment)
def show_single_prediction(self, experiment):
"""Show single name prediction interface"""
st.subheader("Single Name Prediction")
name_input = st.text_input("Enter a name:", placeholder="e.g., Jean Baptiste Mukendi")
if name_input and st.button("Predict Gender"):
try:
# Load the model
model = self.experiment_runner.load_experiment_model(experiment.experiment_id)
if model is None:
st.error("Failed to load model")
return
# Create a DataFrame with the input
input_df = self._prepare_single_input(name_input)
# Make prediction
prediction = model.predict(input_df)[0]
# Get prediction probability if available
confidence = self._get_prediction_confidence(model, input_df)
# Display results
self._display_single_prediction_results(
prediction, confidence, experiment, name_input
)
except Exception as e:
st.error(f"Error making prediction: {e}")
def _prepare_single_input(self, name_input: str) -> pd.DataFrame:
"""Prepare single name input for prediction"""
return pd.DataFrame(
{
"name": [name_input],
"words": [len(name_input.split())],
"length": [len(name_input.replace(" ", ""))],
"province": ["unknown"], # Default values
"identified_name": [None],
"identified_surname": [None],
"probable_native": [None],
"probable_surname": [None],
}
)
def _get_prediction_confidence(self, model, input_df: pd.DataFrame) -> Optional[float]:
"""Get prediction confidence if available"""
try:
probabilities = model.predict_proba(input_df)[0]
return max(probabilities)
except:
return None
def _display_single_prediction_results(
self, prediction: str, confidence: Optional[float], experiment, name_input: str
):
"""Display single prediction results"""
col1, col2 = st.columns(2)
with col1:
gender_label = "Female" if prediction == "f" else "Male"
st.success(f"**Predicted Gender:** {gender_label}")
with col2:
if confidence:
st.metric("Confidence", f"{confidence:.2%}")
# Additional info
st.info(f"Model used: {experiment.config.name}")
st.info(f"Features used: {', '.join([f.value for f in experiment.config.features])}")
def show_batch_prediction(self, experiment):
"""Show batch prediction interface"""
st.subheader("Batch Prediction")
uploaded_file = st.file_uploader("Upload CSV file with names", type="csv")
if uploaded_file is not None:
try:
df = pd.read_csv(uploaded_file, dtype=OPTIMIZED_DTYPES)
st.write("**Uploaded Data Preview:**")
st.dataframe(df.head(), use_container_width=True)
# Column selection
df = self._prepare_batch_data(df)
if st.button("Run Batch Prediction"):
self._run_batch_prediction(df, experiment)
except Exception as e:
st.error(f"Error processing file: {e}")
def _prepare_batch_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""Prepare batch data for prediction"""
# Column selection
if "name" not in df.columns:
name_column = st.selectbox("Select the name column:", df.columns)
df = df.rename(columns={name_column: "name"})
# Add missing columns with defaults
required_columns = [
"words",
"length",
"province",
"identified_name",
"identified_surname",
"probable_native",
"probable_surname",
]
for col in required_columns:
if col not in df.columns:
if col == "words":
df[col] = df["name"].str.split().str.len()
elif col == "length":
df[col] = df["name"].str.replace(" ", "").str.len()
else:
df[col] = None
return df
def _run_batch_prediction(self, df: pd.DataFrame, experiment):
"""Run batch prediction and display results"""
with st.spinner("Making predictions..."):
# Load model
model = self.experiment_runner.load_experiment_model(experiment.experiment_id)
if model is None:
st.error("Failed to load model")
return
# Make predictions
predictions = model.predict(df)
df["predicted_gender"] = predictions
df["gender_label"] = df["predicted_gender"].map({"f": "Female", "m": "Male"})
# Try to get probabilities
try:
probabilities = model.predict_proba(df)
df["confidence"] = np.max(probabilities, axis=1)
except:
df["confidence"] = None
st.success("Predictions completed!")
# Show results
self._display_batch_results(df)
def _display_batch_results(self, df: pd.DataFrame):
"""Display batch prediction results"""
result_columns = ["name", "gender_label", "predicted_gender"]
if "confidence" in df.columns:
result_columns.append("confidence")
st.dataframe(df[result_columns], use_container_width=True)
# Download results
csv = df.to_csv(index=False)
st.download_button(
label="Download Predictions",
data=csv,
file_name=f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
mime="text/csv",
)
# Summary statistics
self._display_batch_summary(df)
def _display_batch_summary(self, df: pd.DataFrame):
"""Display batch prediction summary"""
st.subheader("Prediction Summary")
gender_counts = df["gender_label"].value_counts()
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Total Predictions", len(df))
with col2:
st.metric("Female", gender_counts.get("Female", 0))
with col3:
st.metric("Male", gender_counts.get("Male", 0))
# Gender distribution chart
fig = px.pie(
values=gender_counts.values,
names=gender_counts.index,
title="Predicted Gender Distribution",
)
st.plotly_chart(fig, use_container_width=True)
def show_dataset_prediction(self, experiment):
"""Show dataset prediction interface"""
st.subheader("Dataset Prediction")
st.write("Apply the model to existing datasets")
# Dataset selection
dataset_options = {
"Featured Dataset": self.config.data.output_files["featured"],
"Evaluation Dataset": self.config.data.output_files["evaluation"],
}
selected_dataset = st.selectbox("Select Dataset", list(dataset_options.keys()))
file_path = self.config.paths.get_data_path(dataset_options[selected_dataset])
if not file_path.exists():
st.warning(f"Dataset not found: {file_path}")
return
# Load and show dataset info
df = self._load_dataset(str(file_path))
if df.empty:
return
st.write(f"Dataset contains {len(df):,} records")
# Prediction options
col1, col2 = st.columns(2)
with col1:
sample_size = st.number_input(
"Sample size (0 = all data)", 0, len(df), min(1000, len(df))
)
with col2:
compare_with_actual = False
if "sex" in df.columns:
compare_with_actual = st.checkbox("Compare with actual labels", value=True)
if st.button("Run Dataset Prediction"):
self._run_dataset_prediction(df, experiment, sample_size, compare_with_actual)
def _load_dataset(self, file_path: str) -> pd.DataFrame:
"""Load dataset with error handling"""
try:
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
def _run_dataset_prediction(
self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
):
"""Run dataset prediction and display results"""
with st.spinner("Running predictions..."):
# Sample data if requested
if sample_size > 0:
df_sample = df.sample(n=sample_size, random_state=42)
else:
df_sample = df
# Load model and make predictions
model = self.experiment_runner.load_experiment_model(experiment.experiment_id)
if model is None:
st.error("Failed to load model")
return
predictions = model.predict(df_sample)
df_sample["predicted_gender"] = predictions
# Show results
if compare_with_actual and "sex" in df_sample.columns:
self._display_dataset_comparison(df_sample)
else:
self._display_dataset_predictions(df_sample)
def _display_dataset_comparison(self, df_sample: pd.DataFrame):
"""Display dataset predictions with actual comparison"""
# Calculate accuracy
accuracy = (df_sample["sex"] == df_sample["predicted_gender"]).mean()
st.metric("Accuracy on Selected Data", f"{accuracy:.4f}")
# Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(df_sample["sex"], df_sample["predicted_gender"])
fig = px.imshow(cm, text_auto=True, aspect="auto", title="Confusion Matrix")
st.plotly_chart(fig, use_container_width=True)
# Sample of correct and incorrect predictions
correct_mask = df_sample["sex"] == df_sample["predicted_gender"]
col1, col2 = st.columns(2)
with col1:
st.write("**Sample Correct Predictions**")
correct_sample = df_sample[correct_mask][["name", "sex", "predicted_gender"]].head(10)
st.dataframe(correct_sample, use_container_width=True)
with col2:
st.write("**Sample Incorrect Predictions**")
incorrect_sample = df_sample[~correct_mask][["name", "sex", "predicted_gender"]].head(
10
)
st.dataframe(incorrect_sample, use_container_width=True)
def _display_dataset_predictions(self, df_sample: pd.DataFrame):
"""Display dataset predictions without comparison"""
# Just show predictions
st.write("**Sample Predictions**")
sample_results = df_sample[["name", "predicted_gender"]].head(20)
st.dataframe(sample_results, use_container_width=True)
# Gender distribution
gender_counts = df_sample["predicted_gender"].value_counts()
fig = px.pie(
values=gender_counts.values,
names=gender_counts.index,
title="Predicted Gender Distribution",
)
st.plotly_chart(fig, use_container_width=True)