diff --git a/app.py b/app.py index bc462f4..0112522 100644 --- a/app.py +++ b/app.py @@ -1,26 +1,18 @@ #!.venv/bin/python3 -from datetime import datetime - -import numpy as np -import pandas as pd -import plotly.express as px -import plotly.graph_objects as go import streamlit as st from core.config import get_config -from core.utils import get_data_file_path from core.utils.data_loader import DataLoader -from core.utils.region_mapper import RegionMapper +from interface.configuration import Configuration +from interface.dashboard import Dashboard +from interface.data_overview import DataOverview +from interface.data_processing import DataProcessing +from interface.experiments import Experiments +from interface.predictions import Predictions +from interface.results_analysis import ResultsAnalysis from processing.monitoring.pipeline_monitor import PipelineMonitor -from research.experiment import ExperimentConfig -from research.experiment.experiment_builder import ExperimentBuilder from research.experiment.experiment_runner import ExperimentRunner from research.experiment.experiment_tracker import ExperimentTracker -from research.experiment.feature_extractor import FeatureType -from research.model_registry import list_available_models -from web.dashboard import Dashboard -from web.data_overview import DataOverview -from web.data_processing import DataProcessing # Page configuration st.set_page_config( @@ -37,16 +29,6 @@ def load_config(): return get_config() -@st.cache_data -def load_dataset(file_path: str) -> pd.DataFrame: - """Load dataset with caching""" - try: - return pd.read_csv(file_path) - except Exception as e: - st.error(f"Error loading dataset: {e}") - return pd.DataFrame() - - class StreamlitApp: """Main Streamlit application class""" @@ -57,10 +39,14 @@ class StreamlitApp: self.experiment_runner = ExperimentRunner(self.config) self.pipeline_monitor = PipelineMonitor() - # Initialize web components + # Initialize interface components self.dashboard = Dashboard(self.config, self.experiment_tracker, self.experiment_runner) self.data_overview = DataOverview(self.config) self.data_processing = DataProcessing(self.config, self.pipeline_monitor) + self.experiments = Experiments(self.config, self.experiment_tracker, self.experiment_runner) + self.results_analysis = ResultsAnalysis(self.config, self.experiment_tracker, self.experiment_runner) + self.predictions = Predictions(self.config, self.experiment_tracker, self.experiment_runner) + self.configuration = Configuration(self.config) # Initialize session state if "current_experiment" not in st.session_state: @@ -87,980 +73,16 @@ class StreamlitApp: ) # Route to appropriate page - if page == "Dashboard": - self.dashboard.index() - elif page == "Dataset Overview": - self.data_overview.index() - elif page == "Data Processing": - self.data_processing.index() - elif page == "Experiments": - self.show_experiments() - elif page == "Results & Analysis": - self.show_results_analysis() - elif page == "Predictions": - self.show_predictions() - elif page == "Configuration": - self.show_configuration() - - def show_experiments(self): - """Show experiment management interface""" - st.header("Experiment Management") - tab1, tab2, tab3 = st.tabs(["New Experiment", "Experiment List", "Batch Experiments"]) - - with tab1: - self.show_experiment_creation() - - with tab2: - self.show_experiment_list() - - with tab3: - self.show_batch_experiments() - - def show_experiment_creation(self): - """Show interface for creating new experiments""" - 
st.subheader("Create New Experiment") - - with st.form("new_experiment"): - col1, col2 = st.columns(2) - - with col1: - exp_name = st.text_input("Experiment Name", placeholder="e.g., native_name_gender_prediction") - description = st.text_area("Description", placeholder="Brief description of the experiment") - model_type = st.selectbox("Model Type", list_available_models()) - - # Feature selection - feature_options = [f.value for f in FeatureType] - selected_features = st.multiselect("Features to Use", feature_options, default=["full_name"]) - - with col2: - # Model parameters - st.write("**Model Parameters**") - if model_type == "logistic_regression": - ngram_min = st.number_input("N-gram Min", 1, 5, 2) - ngram_max = st.number_input("N-gram Max", 2, 8, 5) - max_features = st.number_input("Max Features", 1000, 50000, 10000) - elif model_type == "random_forest": - n_estimators = st.number_input("Number of Trees", 10, 500, 100) - max_depth = st.number_input("Max Depth", 1, 20, 10) - - # Training parameters - st.write("**Training Parameters**") - test_size = st.slider("Test Set Size", 0.1, 0.5, 0.2) - cv_folds = st.number_input("Cross-Validation Folds", 3, 10, 5) - - tags = st.text_input("Tags (comma-separated)", placeholder="e.g., baseline, feature_study") - - # Advanced options - with st.expander("Advanced Options"): - # Data filters - st.write("**Data Filters**") - filter_province = st.selectbox( - "Filter by Province (optional)", - ["None"] + RegionMapper().get_provinces(), - ) - - min_words = st.number_input("Minimum Word Count", 0, 10, 0) - max_words = st.number_input("Maximum Word Count (0 = no limit)", 0, 20, 0) - - submitted = st.form_submit_button("Create and Run Experiment", type="primary") - - if submitted: - if not exp_name: - st.error("Please provide an experiment name") - return - - if not selected_features: - st.error("Please select at least one feature") - return - - # Build experiment configuration - try: - # Prepare model parameters - model_params = {} - if model_type == "logistic_regression": - model_params = { - "ngram_range": [ngram_min, ngram_max], - "max_features": max_features, - } - elif model_type == "random_forest": - model_params = { - "n_estimators": n_estimators, - "max_depth": max_depth if max_depth > 0 else None, - } - - # Prepare data filters - train_filter = {} - if filter_province != "None": - train_filter["province"] = filter_province - if min_words > 0: - train_filter["words"] = {"min": min_words} - if max_words > 0: - if "words" in train_filter: - train_filter["words"]["max"] = max_words - else: - train_filter["words"] = {"max": max_words} - - # Create experiment config - features = [FeatureType(f) for f in selected_features] - tag_list = [tag.strip() for tag in tags.split(",") if tag.strip()] - - config = ExperimentConfig( - name=exp_name, - description=description, - tags=tag_list, - model_type=model_type, - model_params=model_params, - features=features, - train_data_filter=train_filter if train_filter else None, - test_size=test_size, - cross_validation_folds=cv_folds, - ) - - # Run experiment - with st.spinner("Running experiment..."): - experiment_id = self.experiment_runner.run_experiment(config) - - st.success(f"Experiment completed successfully!") - st.info(f"Experiment ID: `{experiment_id}`") - - # Show results - experiment = self.experiment_tracker.get_experiment(experiment_id) - if experiment and experiment.test_metrics: - st.write("**Results:**") - for metric, value in experiment.test_metrics.items(): - st.metric(metric.title(), 
f"{value:.4f}") - - except Exception as e: - st.error(f"Error running experiment: {e}") - - def show_experiment_list(self): - """Show list of all experiments with filtering""" - st.subheader("All Experiments") - - # Filters - col1, col2, col3 = st.columns(3) - - with col1: - status_filter = st.selectbox( - "Filter by Status", ["All", "completed", "running", "failed", "pending"] - ) - - with col2: - model_filter = st.selectbox("Filter by Model", ["All"] + list_available_models()) - - with col3: - tag_filter = st.text_input("Filter by Tags (comma-separated)") - - # Get experiments - experiments = self.experiment_tracker.list_experiments() - - # Apply filters - if status_filter != "All": - from research.experiment import ExperimentStatus - - experiments = [e for e in experiments if e.status == ExperimentStatus(status_filter)] - - if model_filter != "All": - experiments = [e for e in experiments if e.config.model_type == model_filter] - - if tag_filter: - tags = [tag.strip() for tag in tag_filter.split(",")] - experiments = [e for e in experiments if any(tag in e.config.tags for tag in tags)] - - if not experiments: - st.info("No experiments found matching the filters.") - return - - # Display experiments - for i, exp in enumerate(experiments): - with st.expander( - f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}" - ): - col1, col2, col3 = st.columns(3) - - with col1: - st.write(f"**Model:** {exp.config.model_type}") - st.write(f"**Features:** {', '.join([f.value for f in exp.config.features])}") - st.write(f"**Tags:** {', '.join(exp.config.tags)}") - - with col2: - if exp.test_metrics: - for metric, value in exp.test_metrics.items(): - st.metric(metric.title(), f"{value:.4f}") - - with col3: - st.write(f"**Train Size:** {exp.train_size:,}") - st.write(f"**Test Size:** {exp.test_size:,}") - - if st.button(f"View Details", key=f"details_{i}"): - st.session_state.selected_experiment = exp.experiment_id - st.rerun() - - if exp.config.description: - st.write(f"**Description:** {exp.config.description}") - - def show_batch_experiments(self): - """Show interface for running batch experiments""" - st.subheader("Batch Experiments") - st.write("Run multiple experiments with different parameter combinations.") - - # Parameter sweep configuration - with st.form("batch_experiments"): - st.write("**Parameter Sweep Configuration**") - - col1, col2 = st.columns(2) - - with col1: - base_name = st.text_input("Base Experiment Name", "parameter_sweep") - model_types = st.multiselect( - "Model Types", list_available_models(), default=["logistic_regression"] - ) - - # N-gram ranges for logistic regression - st.write("**Logistic Regression Parameters**") - ngram_ranges = st.text_area( - "N-gram Ranges (one per line, format: min,max)", "2,4\n2,5\n3,6" - ) - - with col2: - feature_combinations = st.multiselect( - "Feature Combinations", - [f.value for f in FeatureType], - default=["full_name", "native_name", "surname"], - ) - - test_sizes = st.text_input("Test Sizes (comma-separated)", "0.15,0.2,0.25") - - tags = st.text_input("Common Tags", "parameter_sweep,batch") - - if st.form_submit_button("🚀 Run Batch Experiments"): - self.run_batch_experiments( - base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags - ) - - def show_results_analysis(self): - """Show experiment results and analysis""" - st.header("Results & Analysis") - tab1, tab2, tab3 = st.tabs(["Experiment Comparison", "Performance Analysis", "Model Analysis"]) - - with tab1: - 
self.show_experiment_comparison() - - with tab2: - self.show_performance_analysis() - - with tab3: - self.show_model_analysis() - - def show_experiment_comparison(self): - """Show experiment comparison interface""" - st.subheader("Compare Experiments") - - experiments = self.experiment_tracker.list_experiments() - completed_experiments = [e for e in experiments if e.status.value == "completed"] - - if not completed_experiments: - st.warning("No completed experiments found.") - return - - # Experiment selection - exp_options = { - f"{exp.config.name} ({exp.experiment_id[:8]})": exp.experiment_id - for exp in completed_experiments + page_map = { + "Dashboard": self.dashboard.index, + "Dataset Overview": self.data_overview.index, + "Data Processing": self.data_processing.index, + "Experiments": self.experiments.index, + "Results & Analysis": self.results_analysis.index, + "Predictions": self.predictions.index, + "Configuration": self.configuration.index, } - - selected_exp_names = st.multiselect( - "Select Experiments to Compare", - list(exp_options.keys()), - default=list(exp_options.keys())[: min(5, len(exp_options))], - ) - - if not selected_exp_names: - st.info("Please select experiments to compare.") - return - - selected_exp_ids = [exp_options[name] for name in selected_exp_names] - - # Generate comparison - comparison_df = self.experiment_runner.compare_experiments(selected_exp_ids) - - if comparison_df.empty: - st.error("No data available for comparison.") - return - - # Display comparison table - st.write("**Experiment Comparison Table**") - - # Select columns to display - metric_columns = [ - col for col in comparison_df.columns if col.startswith("test_") or col.startswith("cv_") - ] - display_columns = ["name", "model_type", "features"] + metric_columns - available_columns = [col for col in display_columns if col in comparison_df.columns] - - st.dataframe(comparison_df[available_columns], use_container_width=True) - - # Visualization - st.write("**Performance Comparison**") - - if "test_accuracy" in comparison_df.columns: - fig = px.bar( - comparison_df, - x="name", - y="test_accuracy", - color="model_type", - title="Test Accuracy Comparison", - ) - fig.update_layout(xaxis_tickangle=-45) - st.plotly_chart(fig, use_container_width=True) - - # Metric comparison across multiple metrics - if len(metric_columns) > 1: - metric_to_plot = st.selectbox("Select Metric for Detailed Comparison", metric_columns) - - if metric_to_plot in comparison_df.columns: - fig = px.bar( - comparison_df, - x="name", - y=metric_to_plot, - color="model_type", - title=f"{metric_to_plot.replace('_', ' ').title()} Comparison", - ) - fig.update_layout(xaxis_tickangle=-45) - st.plotly_chart(fig, use_container_width=True) - - def show_performance_analysis(self): - """Show performance analysis across experiments""" - st.subheader("Performance Analysis") - - experiments = self.experiment_tracker.list_experiments() - completed_experiments = [ - e for e in experiments if e.status.value == "completed" and e.test_metrics - ] - - if not completed_experiments: - st.warning("No completed experiments with metrics found.") - return - - # Prepare data for analysis - analysis_data = [] - for exp in completed_experiments: - row = { - "experiment_id": exp.experiment_id, - "name": exp.config.name, - "model_type": exp.config.model_type, - "feature_count": len(exp.config.features), - "features": ", ".join([f.value for f in exp.config.features]), - "train_size": exp.train_size, - "test_size": exp.test_size, - **exp.test_metrics, - } 
- analysis_data.append(row) - - analysis_df = pd.DataFrame(analysis_data) - - # Performance trends - col1, col2 = st.columns(2) - - with col1: - # Accuracy vs Training Size - if "accuracy" in analysis_df.columns and "train_size" in analysis_df.columns: - fig = px.scatter( - analysis_df, - x="train_size", - y="accuracy", - color="model_type", - hover_data=["name"], - title="Accuracy vs Training Size", - ) - st.plotly_chart(fig, use_container_width=True) - - with col2: - # Feature Count vs Performance - if "accuracy" in analysis_df.columns and "feature_count" in analysis_df.columns: - fig = px.scatter( - analysis_df, - x="feature_count", - y="accuracy", - color="model_type", - hover_data=["name"], - title="Accuracy vs Number of Features", - ) - st.plotly_chart(fig, use_container_width=True) - - # Model type comparison - if "accuracy" in analysis_df.columns: - model_performance = ( - analysis_df.groupby("model_type")["accuracy"] - .agg(["mean", "std", "count"]) - .reset_index() - ) - - fig = go.Figure() - fig.add_trace( - go.Bar( - x=model_performance["model_type"], - y=model_performance["mean"], - error_y=dict(type="data", array=model_performance["std"]), - name="Average Accuracy", - ) - ) - fig.update_layout(title="Average Accuracy by Model Type", yaxis_title="Accuracy") - st.plotly_chart(fig, use_container_width=True) - - # Best experiments summary - st.subheader("Top Performing Experiments") - - if "accuracy" in analysis_df.columns: - top_experiments = analysis_df.nlargest(5, "accuracy")[ - ["name", "model_type", "features", "accuracy", "precision", "recall", "f1"] - ] - st.dataframe(top_experiments, use_container_width=True) - - def show_model_analysis(self): - """Show detailed model analysis""" - st.subheader("Model Analysis") - - experiments = self.experiment_tracker.list_experiments() - completed_experiments = [e for e in experiments if e.status.value == "completed"] - - if not completed_experiments: - st.warning("No completed experiments found.") - return - - # Select experiment for detailed analysis - exp_options = { - f"{exp.config.name} ({exp.experiment_id[:8]})": exp for exp in completed_experiments - } - - selected_exp_name = st.selectbox( - "Select Experiment for Detailed Analysis", list(exp_options.keys()) - ) - - if not selected_exp_name: - return - - selected_exp = exp_options[selected_exp_name] - - # Experiment details - col1, col2 = st.columns(2) - - with col1: - st.write("**Experiment Configuration**") - st.json( - { - "name": selected_exp.config.name, - "model_type": selected_exp.config.model_type, - "features": [f.value for f in selected_exp.config.features], - "model_params": selected_exp.config.model_params, - } - ) - - with col2: - st.write("**Performance Metrics**") - if selected_exp.test_metrics: - for metric, value in selected_exp.test_metrics.items(): - st.metric(metric.title(), f"{value:.4f}") - - # Confusion matrix - if selected_exp.confusion_matrix: - st.write("**Confusion Matrix**") - cm = np.array(selected_exp.confusion_matrix) - - fig = px.imshow(cm, text_auto=True, aspect="auto", title="Confusion Matrix") - st.plotly_chart(fig, use_container_width=True) - - # Feature importance - if selected_exp.feature_importance: - st.write("**Feature Importance**") - - importance_data = sorted( - selected_exp.feature_importance.items(), key=lambda x: x[1], reverse=True - )[:20] - - features, importances = zip(*importance_data) - - fig = px.bar( - x=list(importances), - y=list(features), - orientation="h", - title="Top 20 Feature Importances", - ) - 
fig.update_layout(height=600) - st.plotly_chart(fig, use_container_width=True) - - # Prediction examples - if selected_exp.prediction_examples: - st.write("**Prediction Examples**") - - examples_df = pd.DataFrame(selected_exp.prediction_examples) - - # Separate correct and incorrect predictions - correct_examples = examples_df[examples_df["correct"] == True] - incorrect_examples = examples_df[examples_df["correct"] == False] - - col1, col2 = st.columns(2) - - with col1: - st.write("**Correct Predictions**") - if not correct_examples.empty: - st.dataframe( - correct_examples[["name", "true_label", "predicted_label"]], - use_container_width=True, - ) - - with col2: - st.write("**Incorrect Predictions**") - if not incorrect_examples.empty: - st.dataframe( - incorrect_examples[["name", "true_label", "predicted_label"]], - use_container_width=True, - ) - - def show_predictions(self): - """Show prediction interface""" - st.header("Make Predictions") - - # Load available models - experiments = self.experiment_tracker.list_experiments() - completed_experiments = [ - e for e in experiments if e.status.value == "completed" and e.model_path - ] - - if not completed_experiments: - st.warning("No trained models available. Please run some experiments first.") - return - - # Model selection - model_options = { - f"{exp.config.name} (Acc: {exp.test_metrics.get('accuracy', 0):.3f})": exp - for exp in completed_experiments - if exp.test_metrics - } - - selected_model_name = st.selectbox("Select Model", list(model_options.keys())) - - if not selected_model_name: - return - - selected_experiment = model_options[selected_model_name] - - # Prediction modes - prediction_mode = st.radio( - "Prediction Mode", ["Single Name", "Batch Upload", "Dataset Prediction"] - ) - - if prediction_mode == "Single Name": - self.show_single_prediction(selected_experiment) - elif prediction_mode == "Batch Upload": - self.show_batch_prediction(selected_experiment) - elif prediction_mode == "Dataset Prediction": - self.show_dataset_prediction(selected_experiment) - - def show_single_prediction(self, experiment): - """Show single name prediction interface""" - st.subheader("Single Name Prediction") - - name_input = st.text_input("Enter a name:", placeholder="e.g., Jean Baptiste Mukendi") - - if name_input and st.button("Predict Gender"): - try: - # Load the model - model = self.experiment_runner.load_experiment_model(experiment.experiment_id) - - if model is None: - st.error("Failed to load model") - return - - # Create a DataFrame with the input - input_df = pd.DataFrame( - { - "name": [name_input], - "words": [len(name_input.split())], - "length": [len(name_input.replace(" ", ""))], - "province": ["unknown"], # Default values - "identified_name": [None], - "identified_surname": [None], - "probable_native": [None], - "probable_surname": [None], - } - ) - - # Make prediction - prediction = model.predict(input_df)[0] - - # Get prediction probability if available - try: - probabilities = model.predict_proba(input_df)[0] - confidence = max(probabilities) - except: - confidence = None - - # Display results - col1, col2 = st.columns(2) - - with col1: - gender_label = "Female" if prediction == "f" else "Male" - st.success(f"**Predicted Gender:** {gender_label}") - - with col2: - if confidence: - st.metric("Confidence", f"{confidence:.2%}") - - # Additional info - st.info(f"Model used: {experiment.batch_config.name}") - st.info( - f"Features used: {', '.join([f.value for f in experiment.batch_config.features])}" - ) - - except Exception as 
e: - st.error(f"Error making prediction: {e}") - - def show_batch_prediction(self, experiment): - """Show batch prediction interface""" - st.subheader("Batch Prediction") - - uploaded_file = st.file_uploader("Upload CSV file with names", type="csv") - - if uploaded_file is not None: - try: - df = pd.read_csv(uploaded_file) - - st.write("**Uploaded Data Preview:**") - st.dataframe(df.head(), use_container_width=True) - - # Column selection - if "name" not in df.columns: - name_column = st.selectbox("Select the name column:", df.columns) - df = df.rename(columns={name_column: "name"}) - - if st.button("Run Batch Prediction"): - with st.spinner("Making predictions..."): - # Load model - model = self.experiment_runner.load_experiment_model( - experiment.experiment_id - ) - - if model is None: - st.error("Failed to load model") - return - - # Prepare data (add missing columns with defaults) - required_columns = [ - "words", - "length", - "province", - "identified_name", - "identified_surname", - "probable_native", - "probable_surname", - ] - - for col in required_columns: - if col not in df.columns: - if col == "words": - df[col] = df["name"].str.split().str.len() - elif col == "length": - df[col] = df["name"].str.replace(" ", "").str.len() - else: - df[col] = None - - # Make predictions - predictions = model.predict(df) - df["predicted_gender"] = predictions - df["gender_label"] = df["predicted_gender"].map( - {"f": "Female", "m": "Male"} - ) - - # Try to get probabilities - try: - probabilities = model.predict_proba(df) - df["confidence"] = np.max(probabilities, axis=1) - except: - df["confidence"] = None - - st.success("Predictions completed!") - - # Show results - result_columns = ["name", "gender_label", "predicted_gender"] - if "confidence" in df.columns: - result_columns.append("confidence") - - st.dataframe(df[result_columns], use_container_width=True) - - # Download results - csv = df.to_csv(index=False) - st.download_button( - label="Download Predictions", - data=csv, - file_name=f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv", - mime="text/csv", - ) - - # Summary statistics - st.subheader("Prediction Summary") - gender_counts = df["gender_label"].value_counts() - - col1, col2, col3 = st.columns(3) - with col1: - st.metric("Total Predictions", len(df)) - with col2: - st.metric("Female", gender_counts.get("Female", 0)) - with col3: - st.metric("Male", gender_counts.get("Male", 0)) - - # Gender distribution chart - fig = px.pie( - values=gender_counts.values, - names=gender_counts.index, - title="Predicted Gender Distribution", - ) - st.plotly_chart(fig, use_container_width=True) - - except Exception as e: - st.error(f"Error processing file: {e}") - - def show_dataset_prediction(self, experiment): - """Show dataset prediction interface""" - st.subheader("Dataset Prediction") - st.write("Apply the model to existing datasets") - - # Dataset selection - dataset_options = { - "Featured Dataset": self.config.data.output_files["featured"], - "Evaluation Dataset": self.config.data.output_files["evaluation"], - } - - selected_dataset = st.selectbox("Select Dataset", list(dataset_options.keys())) - file_path = get_data_file_path(dataset_options[selected_dataset], self.config) - - if not file_path.exists(): - st.warning(f"Dataset not found: {file_path}") - return - - # Load and show dataset info - df = load_dataset(str(file_path)) - st.write(f"Dataset contains {len(df):,} records") - - # Prediction options - col1, col2 = st.columns(2) - - with col1: - sample_size = st.number_input( 
- "Sample size (0 = all data)", 0, len(df), min(1000, len(df)) - ) - - with col2: - if "sex" in df.columns: - compare_with_actual = st.checkbox("Compare with actual labels", value=True) - else: - compare_with_actual = False - - if st.button("Run Dataset Prediction"): - with st.spinner("Running predictions..."): - # Sample data if requested - if sample_size > 0: - df_sample = df.sample(n=sample_size, random_state=42) - else: - df_sample = df - - # Load model and make predictions - model = self.experiment_runner.load_experiment_model(experiment.experiment_id) - - if model is None: - st.error("Failed to load model") - return - - predictions = model.predict(df_sample) - df_sample["predicted_gender"] = predictions - - # Show results - if compare_with_actual and "sex" in df_sample.columns: - # Calculate accuracy - accuracy = (df_sample["sex"] == df_sample["predicted_gender"]).mean() - st.metric("Accuracy on Selected Data", f"{accuracy:.4f}") - - # Confusion matrix - from sklearn.metrics import confusion_matrix - - cm = confusion_matrix(df_sample["sex"], df_sample["predicted_gender"]) - - fig = px.imshow(cm, text_auto=True, aspect="auto", title="Confusion Matrix") - st.plotly_chart(fig, use_container_width=True) - - # Sample of correct and incorrect predictions - correct_mask = df_sample["sex"] == df_sample["predicted_gender"] - - col1, col2 = st.columns(2) - - with col1: - st.write("**Sample Correct Predictions**") - correct_sample = df_sample[correct_mask][ - ["name", "sex", "predicted_gender"] - ].head(10) - st.dataframe(correct_sample, use_container_width=True) - - with col2: - st.write("**Sample Incorrect Predictions**") - incorrect_sample = df_sample[~correct_mask][ - ["name", "sex", "predicted_gender"] - ].head(10) - st.dataframe(incorrect_sample, use_container_width=True) - - else: - # Just show predictions - st.write("**Sample Predictions**") - sample_results = df_sample[["name", "predicted_gender"]].head(20) - st.dataframe(sample_results, use_container_width=True) - - # Gender distribution - gender_counts = df_sample["predicted_gender"].value_counts() - fig = px.pie( - values=gender_counts.values, - names=gender_counts.index, - title="Predicted Gender Distribution", - ) - st.plotly_chart(fig, use_container_width=True) - - def show_configuration(self): - st.header("Current Configuration") - st.json(self.config.model_dump()) - - def run_baseline_experiments(self): - """Run baseline experiments""" - with st.spinner("Running baseline experiments..."): - try: - builder = ExperimentBuilder() - experiments = builder.create_baseline_experiments() - experiment_ids = self.experiment_runner.run_experiment_batch(experiments) - - st.success(f"Completed {len(experiment_ids)} baseline experiments") - - # Show quick comparison - if experiment_ids: - comparison = self.experiment_runner.compare_experiments(experiment_ids) - st.write("**Results Summary:**") - st.dataframe( - comparison[["name", "model_type", "test_accuracy"]], - use_container_width=True, - ) - - except Exception as e: - st.error(f"Error running baseline experiments: {e}") - - def run_ablation_study(self): - """Run feature ablation study""" - with st.spinner("Running ablation study..."): - try: - builder = ExperimentBuilder() - experiments = builder.create_feature_ablation_study() - experiment_ids = self.experiment_runner.run_experiment_batch(experiments) - - st.success(f"Completed {len(experiment_ids)} ablation experiments") - - except Exception as e: - st.error(f"Error running ablation study: {e}") - - def run_component_study(self): - 
"""Run name component study""" - with st.spinner("Running component study..."): - try: - builder = ExperimentBuilder() - experiments = builder.create_name_component_study() - experiment_ids = self.experiment_runner.run_experiment_batch(experiments) - - st.success(f"Completed {len(experiment_ids)} component experiments") - - except Exception as e: - st.error(f"Error running component study: {e}") - - def run_province_study(self): - """Run province-specific study""" - with st.spinner("Running province study..."): - try: - builder = ExperimentBuilder() - experiments = builder.create_province_specific_study() - experiment_ids = self.experiment_runner.run_experiment_batch(experiments) - - st.success(f"Completed {len(experiment_ids)} province experiments") - - except Exception as e: - st.error(f"Error running province study: {e}") - - def clean_checkpoints(self): - """Clean pipeline checkpoints""" - for step in ["data_cleaning", "feature_extraction", "llm_annotation", "data_splitting"]: - self.pipeline_monitor.clean_step_checkpoints(step, keep_last=1) - st.success("Checkpoints cleaned!") - - def run_batch_experiments( - self, base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags - ): - """Run batch experiments with parameter combinations""" - with st.spinner("Running batch experiments..."): - try: - experiments = [] - - # Parse parameters - ngram_list = [] - for line in ngram_ranges.strip().split("\n"): - if "," in line: - min_val, max_val = map(int, line.split(",")) - ngram_list.append([min_val, max_val]) - - test_size_list = [float(x.strip()) for x in test_sizes.split(",")] - tag_list = [tag.strip() for tag in tags.split(",") if tag.strip()] - - # Generate experiment combinations - exp_count = 0 - for model_type in model_types: - for feature_combo in feature_combinations: - for test_size in test_size_list: - if model_type == "logistic_regression": - for ngram_range in ngram_list: - exp_name = f"{base_name}_{model_type}_{feature_combo}_{ngram_range[0]}_{ngram_range[1]}_{test_size}" - - config = ExperimentConfig( - name=exp_name, - description=f"Batch experiment: {model_type} with {feature_combo}", - model_type=model_type, - features=[FeatureType(feature_combo)], - model_params={"ngram_range": ngram_range}, - test_size=test_size, - tags=tag_list, - ) - experiments.append(config) - exp_count += 1 - else: - exp_name = f"{base_name}_{model_type}_{feature_combo}_{test_size}" - - config = ExperimentConfig( - name=exp_name, - description=f"Batch experiment: {model_type} with {feature_combo}", - model_type=model_type, - features=[FeatureType(feature_combo)], - test_size=test_size, - tags=tag_list, - ) - experiments.append(config) - exp_count += 1 - - # Run experiments - experiment_ids = self.experiment_runner.run_experiment_batch(experiments) - - st.success(f"Completed {len(experiment_ids)} batch experiments") - - # Show summary - if experiment_ids: - comparison = self.experiment_runner.compare_experiments(experiment_ids) - st.write("**Batch Results Summary:**") - st.dataframe( - comparison[["name", "model_type", "test_accuracy"]], - use_container_width=True, - ) - - except Exception as e: - st.error(f"Error running batch experiments: {e}") + page_map.get(page, lambda: None)() def main(): diff --git a/cli.py b/cli.py index 54b99b2..3363c9f 100755 --- a/cli.py +++ b/cli.py @@ -1,159 +1,14 @@ #!.venv/bin/python3 import argparse +import logging import sys from pathlib import Path -import json + import pandas as pd -import logging from core.config import get_config, setup_logging 
-from research.experiment import ExperimentConfig -from research.experiment.experiment_tracker import ExperimentTracker -from research.experiment.feature_extractor import FeatureType -from research.experiment.experiment_builder import ExperimentBuilder from research.experiment.experiment_runner import ExperimentRunner -from research.model_registry import list_available_models - - -def create_experiment_from_args(args) -> ExperimentConfig: - """Create experiment configuration from command line arguments""" - - features = [] - if args.features: - for feature_name in args.features: - try: - features.append(FeatureType(feature_name)) - except ValueError: - logging.warning(f"Unknown feature type '{feature_name}', skipping") - - if not features: - features = [FeatureType.FULL_NAME] # Default - - # Parse model parameters - model_params = {} - if args.model_params: - try: - model_params = json.loads(args.model_params) - except json.JSONDecodeError: - logging.warning("Invalid JSON for model parameters, using defaults") - - # Parse feature parameters - feature_params = {} - if args.feature_params: - try: - feature_params = json.loads(args.feature_params) - except json.JSONDecodeError: - logging.warning("Invalid JSON for feature parameters, using defaults") - - # Parse data filters - train_filter = None - if args.train_filter: - try: - train_filter = json.loads(args.train_filter) - except json.JSONDecodeError: - logging.warning("Invalid JSON for train filter, ignoring") - - return ExperimentConfig( - name=args.name, - description=args.description or "", - tags=args.tags or [], - model_type=args.model_type, - model_params=model_params, - features=features, - feature_params=feature_params, - train_data_filter=train_filter, - target_column=args.target, - test_size=args.test_size, - random_seed=args.seed, - cross_validation_folds=args.cv_folds, - metrics=args.metrics or ["accuracy", "precision", "recall", "f1"], - ) - - -def run_single_experiment(args): - """Run a single experiment""" - - config = create_experiment_from_args(args) - runner = ExperimentRunner() - experiment_id = runner.run_experiment(config) - - logging.info(f"Experiment completed: {experiment_id}") - - # Show results - experiment = runner.tracker.get_experiment(experiment_id) - if experiment: - logging.info("Results:") - for metric, value in experiment.test_metrics.items(): - logging.info(f" Test {metric}: {value:.4f}") - - if experiment.cv_metrics: - logging.info("Cross-validation:") - for metric, value in experiment.cv_metrics.items(): - if not metric.endswith("_std"): - std_key = f"{metric}_std" - std_val = experiment.cv_metrics.get(std_key, 0) - logging.info(f" CV {metric}: {value:.4f} ± {std_val:.4f}") - - -def run_baseline_experiments(args): - """Run baseline experiments""" - logger = logging.getLogger(__name__) - - builder = ExperimentBuilder() - experiments = builder.create_baseline_experiments() - - runner = ExperimentRunner() - experiment_ids = runner.run_experiment_batch(experiments) - - logging.info(f"Completed {len(experiment_ids)} baseline experiments") - - # Show comparison - if experiment_ids: - comparison = runner.compare_experiments(experiment_ids) - logging.info("Baseline Results Comparison:") - logging.info( - comparison[["name", "model_type", "features", "test_accuracy"]].to_string(index=False) - ) - - -def run_ablation_study(args): - """Run feature ablation study""" - - builder = ExperimentBuilder() - experiments = builder.create_feature_ablation_study() - - runner = ExperimentRunner() - experiment_ids = 
runner.run_experiment_batch(experiments) - - logging.info(f"Completed {len(experiment_ids)} ablation experiments") - - # Show results - if experiment_ids: - comparison = runner.compare_experiments(experiment_ids) - logging.info("Ablation Study Results:") - logging.info(comparison[["name", "test_accuracy", "test_f1"]].to_string(index=False)) - - -def run_component_study(args): - """Run name component study""" - - builder = ExperimentBuilder() - experiments = builder.create_name_component_study() - - runner = ExperimentRunner() - experiment_ids = runner.run_experiment_batch(experiments) - - logging.info(f"Completed {len(experiment_ids)} component study experiments") - - # Show results - if experiment_ids: - comparison = runner.compare_experiments(experiment_ids) - logging.info("Name Component Study Results:") - logging.info( - comparison[["name", "test_accuracy", "test_precision", "test_recall"]].to_string( - index=False - ) - ) +from research.experiment.experiment_tracker import ExperimentTracker def list_experiments(args): @@ -249,7 +104,7 @@ def show_experiment_details(args): def compare_experiments_cmd(args): """Compare multiple experiments""" - runner = ExperimentRunner() + runner = ExperimentRunner(get_config()) comparison = runner.compare_experiments(args.experiment_ids) if comparison.empty: @@ -285,43 +140,9 @@ def main(): parser.add_argument("--verbose", "-v", action="store_true", help="Verbose logging") subparsers = parser.add_subparsers(dest="command", help="Available commands") - # Single experiment command - exp_parser = subparsers.add_parser("run", help="Run a single experiment") - exp_parser.add_argument("--name", required=True, help="Experiment name") - exp_parser.add_argument("--description", help="Experiment description") - exp_parser.add_argument( - "--model-type", - default="logistic_regression", - choices=list_available_models(), - help="Model type", - ) - exp_parser.add_argument( - "--features", nargs="+", choices=[f.value for f in FeatureType], help="Features to use" - ) - exp_parser.add_argument("--model-params", help="Model parameters as JSON") - exp_parser.add_argument("--feature-params", help="Feature parameters as JSON") - exp_parser.add_argument("--train-filter", help="Training data filter as JSON") - exp_parser.add_argument("--target", default="sex", help="Target column") - exp_parser.add_argument("--test-size", type=float, default=0.2, help="Test set size") - exp_parser.add_argument("--seed", type=int, default=42, help="Random seed") - exp_parser.add_argument("--cv-folds", type=int, default=5, help="CV folds") - exp_parser.add_argument( - "--metrics", - nargs="+", - choices=["accuracy", "precision", "recall", "f1"], - help="Metrics to calculate", - ) - exp_parser.add_argument("--tags", nargs="+", help="Experiment tags") - - # Batch experiment commands - subparsers.add_parser("baseline", help="Run baseline experiments") - subparsers.add_parser("ablation", help="Run feature ablation study") - subparsers.add_parser("components", help="Run name component study") - # List experiments list_parser = subparsers.add_parser("list", help="List experiments") list_parser.add_argument("--status", choices=["pending", "running", "completed", "failed"]) - list_parser.add_argument("--model-type", choices=list_available_models()) list_parser.add_argument("--tags", nargs="+", help="Filter by tags") # Show experiment details @@ -350,22 +171,15 @@ def main(): # Execute command try: - if args.command == "run": - run_single_experiment(args) - elif args.command == "baseline": - 
run_baseline_experiments(args) - elif args.command == "ablation": - run_ablation_study(args) - elif args.command == "components": - run_component_study(args) - elif args.command == "list": - list_experiments(args) - elif args.command == "show": - show_experiment_details(args) - elif args.command == "compare": - compare_experiments_cmd(args) - elif args.command == "export": - export_results(args) + command_map = { + "list": list_experiments, + "show": show_experiment_details, + "compare": compare_experiments_cmd, + "export": export_results, + } + handler = command_map.get(args.command) + if handler: + handler(args) return 0 diff --git a/web/__init__.py b/interface/__init__.py similarity index 100% rename from web/__init__.py rename to interface/__init__.py diff --git a/interface/configuration.py b/interface/configuration.py new file mode 100644 index 0000000..bc843e9 --- /dev/null +++ b/interface/configuration.py @@ -0,0 +1,12 @@ +import streamlit as st + + +class Configuration: + """Handles configuration display and management""" + + def __init__(self, config): + self.config = config + + def index(self): + st.header("Current Configuration") + st.json(self.config.model_dump()) diff --git a/web/dashboard.py b/interface/dashboard.py similarity index 100% rename from web/dashboard.py rename to interface/dashboard.py diff --git a/web/data_overview.py b/interface/data_overview.py similarity index 100% rename from web/data_overview.py rename to interface/data_overview.py diff --git a/web/data_processing.py b/interface/data_processing.py similarity index 99% rename from web/data_processing.py rename to interface/data_processing.py index 9831666..8b586da 100644 --- a/web/data_processing.py +++ b/interface/data_processing.py @@ -2,7 +2,7 @@ import pandas as pd import plotly.express as px import streamlit as st -from web.log_reader import LogReader +from interface.log_reader import LogReader def load_dataset(file_path: str) -> pd.DataFrame: diff --git a/interface/experiments.py b/interface/experiments.py new file mode 100644 index 0000000..25fbd96 --- /dev/null +++ b/interface/experiments.py @@ -0,0 +1,398 @@ +from typing import List, Dict, Any + +import streamlit as st + +from core.utils.region_mapper import RegionMapper +from research.experiment import ExperimentConfig, ExperimentStatus +from research.experiment.experiment_builder import ExperimentBuilder +from research.experiment.experiment_runner import ExperimentRunner +from research.experiment.experiment_tracker import ExperimentTracker +from research.experiment.feature_extractor import FeatureType +from research.model_registry import list_available_models + + +class Experiments: + """Handles experiment management interface""" + + def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner): + self.config = config + self.experiment_tracker = experiment_tracker + self.experiment_runner = experiment_runner + + def index(self): + """Main experiments page""" + st.header("Experiment Management") + tab1, tab2, tab3 = st.tabs(["New Experiment", "Experiment List", "Batch Experiments"]) + + with tab1: + self.show_experiment_creation() + + with tab2: + self.show_experiment_list() + + with tab3: + self.show_batch_experiments() + + def show_experiment_creation(self): + """Show interface for creating new experiments""" + st.subheader("Create New Experiment") + + with st.form("new_experiment"): + col1, col2 = st.columns(2) + + with col1: + exp_name = st.text_input("Experiment Name", placeholder="e.g., 
native_name_gender_prediction") + description = st.text_area("Description", placeholder="Brief description of the experiment") + model_type = st.selectbox("Model Type", list_available_models()) + + # Feature selection + feature_options = [f.value for f in FeatureType] + selected_features = st.multiselect("Features to Use", feature_options, default=["full_name"]) + + with col2: + # Model parameters + st.write("**Model Parameters**") + model_params = {} + if model_type == "logistic_regression": + ngram_min = st.number_input("N-gram Min", 1, 5, 2) + ngram_max = st.number_input("N-gram Max", 2, 8, 5) + max_features = st.number_input("Max Features", 1000, 50000, 10000) + model_params = { + "ngram_range": [ngram_min, ngram_max], + "max_features": max_features, + } + elif model_type == "random_forest": + n_estimators = st.number_input("Number of Trees", 10, 500, 100) + max_depth = st.number_input("Max Depth", 1, 20, 10) + model_params = { + "n_estimators": n_estimators, + "max_depth": max_depth if max_depth > 0 else None, + } + + # Training parameters + st.write("**Training Parameters**") + test_size = st.slider("Test Set Size", 0.1, 0.5, 0.2) + cv_folds = st.number_input("Cross-Validation Folds", 3, 10, 5) + + tags = st.text_input("Tags (comma-separated)", placeholder="e.g., baseline, feature_study") + + # Advanced options + with st.expander("Advanced Options"): + # Data filters + st.write("**Data Filters**") + filter_province = st.selectbox( + "Filter by Province (optional)", + ["None"] + RegionMapper().get_provinces(), + ) + + min_words = st.number_input("Minimum Word Count", 0, 10, 0) + max_words = st.number_input("Maximum Word Count (0 = no limit)", 0, 20, 0) + + submitted = st.form_submit_button("Create and Run Experiment", type="primary") + + if submitted: + self._handle_experiment_submission( + exp_name, description, model_type, selected_features, model_params, + test_size, cv_folds, tags, filter_province, min_words, max_words + ) + + def _handle_experiment_submission(self, exp_name: str, description: str, model_type: str, + selected_features: List[str], model_params: Dict[str, Any], + test_size: float, cv_folds: int, tags: str, + filter_province: str, min_words: int, max_words: int): + """Handle experiment form submission""" + if not exp_name: + st.error("Please provide an experiment name") + return + + if not selected_features: + st.error("Please select at least one feature") + return + + try: + # Prepare data filters + train_filter = {} + if filter_province != "None": + train_filter["province"] = filter_province + if min_words > 0: + train_filter["words"] = {"min": min_words} + if max_words > 0: + if "words" in train_filter: + train_filter["words"]["max"] = max_words + else: + train_filter["words"] = {"max": max_words} + + # Create experiment config + features = [FeatureType(f) for f in selected_features] + tag_list = [tag.strip() for tag in tags.split(",") if tag.strip()] + + config = ExperimentConfig( + name=exp_name, + description=description, + tags=tag_list, + model_type=model_type, + model_params=model_params, + features=features, + train_data_filter=train_filter if train_filter else None, + test_size=test_size, + cross_validation_folds=cv_folds, + ) + + # Run experiment + with st.spinner("Running experiment..."): + experiment_id = self.experiment_runner.run_experiment(config) + + st.success(f"Experiment completed successfully!") + st.info(f"Experiment ID: `{experiment_id}`") + + # Show results + experiment = self.experiment_tracker.get_experiment(experiment_id) + if experiment 
and experiment.test_metrics: + st.write("**Results:**") + for metric, value in experiment.test_metrics.items(): + st.metric(metric.title(), f"{value:.4f}") + + except Exception as e: + st.error(f"Error running experiment: {e}") + + def show_experiment_list(self): + """Show list of all experiments with filtering""" + st.subheader("All Experiments") + + # Filters + col1, col2, col3 = st.columns(3) + + with col1: + status_filter = st.selectbox( + "Filter by Status", ["All", "completed", "running", "failed", "pending"] + ) + + with col2: + model_filter = st.selectbox("Filter by Model", ["All"] + list_available_models()) + + with col3: + tag_filter = st.text_input("Filter by Tags (comma-separated)") + + # Get and filter experiments + experiments = self._get_filtered_experiments(status_filter, model_filter, tag_filter) + + if not experiments: + st.info("No experiments found matching the filters.") + return + + # Display experiments + for i, exp in enumerate(experiments): + with st.expander( + f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}" + ): + self._display_experiment_details(exp, i) + + def _get_filtered_experiments(self, status_filter: str, model_filter: str, tag_filter: str): + """Get experiments with applied filters""" + experiments = self.experiment_tracker.list_experiments() + + # Apply filters + if status_filter != "All": + experiments = [e for e in experiments if e.status == ExperimentStatus(status_filter)] + + if model_filter != "All": + experiments = [e for e in experiments if e.config.model_type == model_filter] + + if tag_filter: + tags = [tag.strip() for tag in tag_filter.split(",")] + experiments = [e for e in experiments if any(tag in e.config.tags for tag in tags)] + + return experiments + + def _display_experiment_details(self, exp, index: int): + """Display details for a single experiment""" + col1, col2, col3 = st.columns(3) + + with col1: + st.write(f"**Model:** {exp.config.model_type}") + st.write(f"**Features:** {', '.join([f.value for f in exp.config.features])}") + st.write(f"**Tags:** {', '.join(exp.config.tags)}") + + with col2: + if exp.test_metrics: + for metric, value in exp.test_metrics.items(): + st.metric(metric.title(), f"{value:.4f}") + + with col3: + st.write(f"**Train Size:** {exp.train_size:,}") + st.write(f"**Test Size:** {exp.test_size:,}") + + if st.button(f"View Details", key=f"details_{index}"): + st.session_state.selected_experiment = exp.experiment_id + st.rerun() + + if exp.config.description: + st.write(f"**Description:** {exp.config.description}") + + def show_batch_experiments(self): + """Show interface for running batch experiments""" + st.subheader("Batch Experiments") + st.write("Run multiple experiments with different parameter combinations.") + + # Parameter sweep configuration + with st.form("batch_experiments"): + st.write("**Parameter Sweep Configuration**") + + col1, col2 = st.columns(2) + + with col1: + base_name = st.text_input("Base Experiment Name", "parameter_sweep") + model_types = st.multiselect( + "Model Types", list_available_models(), default=["logistic_regression"] + ) + + # N-gram ranges for logistic regression + st.write("**Logistic Regression Parameters**") + ngram_ranges = st.text_area( + "N-gram Ranges (one per line, format: min,max)", "2,4\n2,5\n3,6" + ) + + with col2: + feature_combinations = st.multiselect( + "Feature Combinations", + [f.value for f in FeatureType], + default=["full_name", "native_name", "surname"], + ) + + test_sizes = st.text_input("Test Sizes 
(comma-separated)", "0.15,0.2,0.25") + + tags = st.text_input("Common Tags", "parameter_sweep,batch") + + if st.form_submit_button("🚀 Run Batch Experiments"): + self.run_batch_experiments( + base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags + ) + + def run_batch_experiments(self, base_name: str, model_types: List[str], ngram_ranges: str, + feature_combinations: List[str], test_sizes: str, tags: str): + """Run batch experiments with parameter combinations""" + with st.spinner("Running batch experiments..."): + try: + experiments = [] + + # Parse parameters + ngram_list = [] + for line in ngram_ranges.strip().split("\n"): + if "," in line: + min_val, max_val = map(int, line.split(",")) + ngram_list.append([min_val, max_val]) + + test_size_list = [float(x.strip()) for x in test_sizes.split(",")] + tag_list = [tag.strip() for tag in tags.split(",") if tag.strip()] + + # Generate experiment combinations + exp_count = 0 + for model_type in model_types: + for feature_combo in feature_combinations: + for test_size in test_size_list: + if model_type == "logistic_regression": + for ngram_range in ngram_list: + exp_name = f"{base_name}_{model_type}_{feature_combo}_{ngram_range[0]}_{ngram_range[1]}_{test_size}" + + config = ExperimentConfig( + name=exp_name, + description=f"Batch experiment: {model_type} with {feature_combo}", + model_type=model_type, + features=[FeatureType(feature_combo)], + model_params={"ngram_range": ngram_range}, + test_size=test_size, + tags=tag_list, + ) + experiments.append(config) + exp_count += 1 + else: + exp_name = f"{base_name}_{model_type}_{feature_combo}_{test_size}" + + config = ExperimentConfig( + name=exp_name, + description=f"Batch experiment: {model_type} with {feature_combo}", + model_type=model_type, + features=[FeatureType(feature_combo)], + test_size=test_size, + tags=tag_list, + ) + experiments.append(config) + exp_count += 1 + + # Run experiments + experiment_ids = self.experiment_runner.run_experiment_batch(experiments) + + st.success(f"Completed {len(experiment_ids)} batch experiments") + + # Show summary + if experiment_ids: + comparison = self.experiment_runner.compare_experiments(experiment_ids) + st.write("**Batch Results Summary:**") + st.dataframe( + comparison[["name", "model_type", "test_accuracy"]], + use_container_width=True, + ) + + except Exception as e: + st.error(f"Error running batch experiments: {e}") + + def run_baseline_experiments(self): + """Run baseline experiments""" + with st.spinner("Running baseline experiments..."): + try: + builder = ExperimentBuilder() + experiments = builder.create_baseline_experiments() + experiment_ids = self.experiment_runner.run_experiment_batch(experiments) + + st.success(f"Completed {len(experiment_ids)} baseline experiments") + + # Show quick comparison + if experiment_ids: + comparison = self.experiment_runner.compare_experiments(experiment_ids) + st.write("**Results Summary:**") + st.dataframe( + comparison[["name", "model_type", "test_accuracy"]], + use_container_width=True, + ) + + except Exception as e: + st.error(f"Error running baseline experiments: {e}") + + def run_ablation_study(self): + """Run feature ablation study""" + with st.spinner("Running ablation study..."): + try: + builder = ExperimentBuilder() + experiments = builder.create_feature_ablation_study() + experiment_ids = self.experiment_runner.run_experiment_batch(experiments) + + st.success(f"Completed {len(experiment_ids)} ablation experiments") + + except Exception as e: + st.error(f"Error running ablation 
study: {e}") + + def run_component_study(self): + """Run name component study""" + with st.spinner("Running component study..."): + try: + builder = ExperimentBuilder() + experiments = builder.create_name_component_study() + experiment_ids = self.experiment_runner.run_experiment_batch(experiments) + + st.success(f"Completed {len(experiment_ids)} component experiments") + + except Exception as e: + st.error(f"Error running component study: {e}") + + def run_province_study(self): + """Run province-specific study""" + with st.spinner("Running province study..."): + try: + builder = ExperimentBuilder() + experiments = builder.create_province_specific_study() + experiment_ids = self.experiment_runner.run_experiment_batch(experiments) + + st.success(f"Completed {len(experiment_ids)} province experiments") + + except Exception as e: + st.error(f"Error running province study: {e}") diff --git a/web/log_reader.py b/interface/log_reader.py similarity index 98% rename from web/log_reader.py rename to interface/log_reader.py index b1427f6..b11bbb2 100644 --- a/web/log_reader.py +++ b/interface/log_reader.py @@ -37,7 +37,7 @@ class LogReader: # Parse log entries from the end entries = [] - for line in reversed(lines[-count*2:]): # Read more lines in case some don't match + for line in reversed(lines[-count * 2:]): # Read more lines in case some don't match entry = self._parse_log_line(line.strip()) if entry: entries.append(entry) diff --git a/interface/predictions.py b/interface/predictions.py new file mode 100644 index 0000000..76b8707 --- /dev/null +++ b/interface/predictions.py @@ -0,0 +1,373 @@ +"""Predictions interface for the Streamlit app""" + +from datetime import datetime +from typing import Optional + +import numpy as np +import pandas as pd +import plotly.express as px +import streamlit as st + +from core.utils import get_data_file_path +from research.experiment.experiment_runner import ExperimentRunner +from research.experiment.experiment_tracker import ExperimentTracker + + +class Predictions: + """Handles prediction interface""" + + def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner): + self.config = config + self.experiment_tracker = experiment_tracker + self.experiment_runner = experiment_runner + + def index(self): + """Main predictions page""" + st.header("Make Predictions") + + # Load available models + experiments = self.experiment_tracker.list_experiments() + completed_experiments = [ + e for e in experiments if e.status.value == "completed" and e.model_path + ] + + if not completed_experiments: + st.warning("No trained models available. 
Please run some experiments first.")
+            return
+
+        # Model selection
+        model_options = {
+            f"{exp.config.name} (Acc: {exp.test_metrics.get('accuracy', 0):.3f})": exp
+            for exp in completed_experiments
+            if exp.test_metrics
+        }
+
+        selected_model_name = st.selectbox("Select Model", list(model_options.keys()))
+
+        if not selected_model_name:
+            return
+
+        selected_experiment = model_options[selected_model_name]
+
+        # Prediction modes
+        prediction_mode = st.radio(
+            "Prediction Mode", ["Single Name", "Batch Upload", "Dataset Prediction"]
+        )
+
+        if prediction_mode == "Single Name":
+            self.show_single_prediction(selected_experiment)
+        elif prediction_mode == "Batch Upload":
+            self.show_batch_prediction(selected_experiment)
+        elif prediction_mode == "Dataset Prediction":
+            self.show_dataset_prediction(selected_experiment)
+
+    def show_single_prediction(self, experiment):
+        """Show single name prediction interface"""
+        st.subheader("Single Name Prediction")
+
+        name_input = st.text_input("Enter a name:", placeholder="e.g., Jean Baptiste Mukendi")
+
+        if name_input and st.button("Predict Gender"):
+            try:
+                # Load the model
+                model = self.experiment_runner.load_experiment_model(experiment.experiment_id)
+
+                if model is None:
+                    st.error("Failed to load model")
+                    return
+
+                # Create a DataFrame with the input
+                input_df = self._prepare_single_input(name_input)
+
+                # Make prediction
+                prediction = model.predict(input_df)[0]
+
+                # Get prediction probability if available
+                confidence = self._get_prediction_confidence(model, input_df)
+
+                # Display results
+                self._display_single_prediction_results(prediction, confidence, experiment, name_input)
+
+            except Exception as e:
+                st.error(f"Error making prediction: {e}")
+
+    def _prepare_single_input(self, name_input: str) -> pd.DataFrame:
+        """Prepare single name input for prediction"""
+        return pd.DataFrame(
+            {
+                "name": [name_input],
+                "words": [len(name_input.split())],
+                "length": [len(name_input.replace(" ", ""))],
+                "province": ["unknown"],  # Default values
+                "identified_name": [None],
+                "identified_surname": [None],
+                "probable_native": [None],
+                "probable_surname": [None],
+            }
+        )
+
+    def _get_prediction_confidence(self, model, input_df: pd.DataFrame) -> Optional[float]:
+        """Get prediction confidence if available"""
+        try:
+            probabilities = model.predict_proba(input_df)[0]
+            return max(probabilities)
+        except Exception:
+            return None
+
+    def _display_single_prediction_results(self, prediction: str, confidence: Optional[float],
+                                           experiment, name_input: str):
+        """Display single prediction results"""
+        col1, col2 = st.columns(2)
+
+        with col1:
+            gender_label = "Female" if prediction == "f" else "Male"
+            st.success(f"**Predicted Gender:** {gender_label}")
+
+        with col2:
+            if confidence is not None:
+                st.metric("Confidence", f"{confidence:.2%}")
+
+        # Additional info
+        st.info(f"Model used: {experiment.config.name}")
+        st.info(
+            f"Features used: {', '.join([f.value for f in experiment.config.features])}"
+        )
+
+    def show_batch_prediction(self, experiment):
+        """Show batch prediction interface"""
+        st.subheader("Batch Prediction")
+
+        uploaded_file = st.file_uploader("Upload CSV file with names", type="csv")
+
+        if uploaded_file is not None:
+            try:
+                df = pd.read_csv(uploaded_file)
+
+                st.write("**Uploaded Data Preview:**")
+                st.dataframe(df.head(), use_container_width=True)
+
+                # Column selection
+                df = self._prepare_batch_data(df)
+
+                if st.button("Run Batch Prediction"):
+                    self._run_batch_prediction(df, experiment)
+
+            except Exception as e:
+                st.error(f"Error processing file: {e}")
+
+    def _prepare_batch_data(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Prepare batch data for prediction"""
+        # Column selection
+        if "name" not in df.columns:
+            name_column = st.selectbox("Select the name column:", df.columns)
+            df = df.rename(columns={name_column: "name"})
+
+        # Add missing columns with defaults
+        required_columns = [
+            "words",
+            "length",
+            "province",
+            "identified_name",
+            "identified_surname",
+            "probable_native",
+            "probable_surname",
+        ]
+
+        for col in required_columns:
+            if col not in df.columns:
+                if col == "words":
+                    df[col] = df["name"].str.split().str.len()
+                elif col == "length":
+                    df[col] = df["name"].str.replace(" ", "").str.len()
+                else:
+                    df[col] = None
+
+        return df
+
+    def _run_batch_prediction(self, df: pd.DataFrame, experiment):
+        """Run batch prediction and display results"""
+        with st.spinner("Making predictions..."):
+            # Load model
+            model = self.experiment_runner.load_experiment_model(experiment.experiment_id)
+
+            if model is None:
+                st.error("Failed to load model")
+                return
+
+            # Make predictions
+            predictions = model.predict(df)
+            df["predicted_gender"] = predictions
+            df["gender_label"] = df["predicted_gender"].map({"f": "Female", "m": "Male"})
+
+            # Try to get probabilities
+            try:
+                probabilities = model.predict_proba(df)
+                df["confidence"] = np.max(probabilities, axis=1)
+            except Exception:
+                df["confidence"] = None
+
+            st.success("Predictions completed!")
+
+            # Show results
+            self._display_batch_results(df)
+
+    def _display_batch_results(self, df: pd.DataFrame):
+        """Display batch prediction results"""
+        result_columns = ["name", "gender_label", "predicted_gender"]
+        if "confidence" in df.columns:
+            result_columns.append("confidence")
+
+        st.dataframe(df[result_columns], use_container_width=True)
+
+        # Download results
+        csv = df.to_csv(index=False)
+        st.download_button(
+            label="Download Predictions",
+            data=csv,
+            file_name=f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
+            mime="text/csv",
+        )
+
+        # Summary statistics
+        self._display_batch_summary(df)
+
+    def _display_batch_summary(self, df: pd.DataFrame):
+        """Display batch prediction summary"""
+        st.subheader("Prediction Summary")
+        gender_counts = df["gender_label"].value_counts()
+
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            st.metric("Total Predictions", len(df))
+        with col2:
+            st.metric("Female", gender_counts.get("Female", 0))
+        with col3:
+            st.metric("Male", gender_counts.get("Male", 0))
+
+        # Gender distribution chart
+        fig = px.pie(
+            values=gender_counts.values,
+            names=gender_counts.index,
+            title="Predicted Gender Distribution",
+        )
+        st.plotly_chart(fig, use_container_width=True)
+
+    def show_dataset_prediction(self, experiment):
+        """Show dataset prediction interface"""
+        st.subheader("Dataset Prediction")
+        st.write("Apply the model to existing datasets")
+
+        # Dataset selection
+        dataset_options = {
+            "Featured Dataset": self.config.data.output_files["featured"],
+            "Evaluation Dataset": self.config.data.output_files["evaluation"],
+        }
+
+        selected_dataset = st.selectbox("Select Dataset", list(dataset_options.keys()))
+        file_path = get_data_file_path(dataset_options[selected_dataset], self.config)
+
+        if not file_path.exists():
+            st.warning(f"Dataset not found: {file_path}")
+            return
+
+        # Load and show dataset info
+        df = self._load_dataset(str(file_path))
+        if df.empty:
+            return
+
+        st.write(f"Dataset contains {len(df):,} records")
+
+        # Prediction options
+        col1, col2 = st.columns(2)
+
+        with col1:
+            sample_size = st.number_input(
+                "Sample 
size (0 = all data)", 0, len(df), min(1000, len(df)) + ) + + with col2: + compare_with_actual = False + if "sex" in df.columns: + compare_with_actual = st.checkbox("Compare with actual labels", value=True) + + if st.button("Run Dataset Prediction"): + self._run_dataset_prediction(df, experiment, sample_size, compare_with_actual) + + def _load_dataset(self, file_path: str) -> pd.DataFrame: + """Load dataset with error handling""" + try: + return pd.read_csv(file_path) + except Exception as e: + st.error(f"Error loading dataset: {e}") + return pd.DataFrame() + + def _run_dataset_prediction(self, df: pd.DataFrame, experiment, sample_size: int, + compare_with_actual: bool): + """Run dataset prediction and display results""" + with st.spinner("Running predictions..."): + # Sample data if requested + if sample_size > 0: + df_sample = df.sample(n=sample_size, random_state=42) + else: + df_sample = df + + # Load model and make predictions + model = self.experiment_runner.load_experiment_model(experiment.experiment_id) + + if model is None: + st.error("Failed to load model") + return + + predictions = model.predict(df_sample) + df_sample["predicted_gender"] = predictions + + # Show results + if compare_with_actual and "sex" in df_sample.columns: + self._display_dataset_comparison(df_sample) + else: + self._display_dataset_predictions(df_sample) + + def _display_dataset_comparison(self, df_sample: pd.DataFrame): + """Display dataset predictions with actual comparison""" + # Calculate accuracy + accuracy = (df_sample["sex"] == df_sample["predicted_gender"]).mean() + st.metric("Accuracy on Selected Data", f"{accuracy:.4f}") + + # Confusion matrix + from sklearn.metrics import confusion_matrix + + cm = confusion_matrix(df_sample["sex"], df_sample["predicted_gender"]) + + fig = px.imshow(cm, text_auto=True, aspect="auto", title="Confusion Matrix") + st.plotly_chart(fig, use_container_width=True) + + # Sample of correct and incorrect predictions + correct_mask = df_sample["sex"] == df_sample["predicted_gender"] + + col1, col2 = st.columns(2) + + with col1: + st.write("**Sample Correct Predictions**") + correct_sample = df_sample[correct_mask][["name", "sex", "predicted_gender"]].head(10) + st.dataframe(correct_sample, use_container_width=True) + + with col2: + st.write("**Sample Incorrect Predictions**") + incorrect_sample = df_sample[~correct_mask][["name", "sex", "predicted_gender"]].head(10) + st.dataframe(incorrect_sample, use_container_width=True) + + def _display_dataset_predictions(self, df_sample: pd.DataFrame): + """Display dataset predictions without comparison""" + # Just show predictions + st.write("**Sample Predictions**") + sample_results = df_sample[["name", "predicted_gender"]].head(20) + st.dataframe(sample_results, use_container_width=True) + + # Gender distribution + gender_counts = df_sample["predicted_gender"].value_counts() + fig = px.pie( + values=gender_counts.values, + names=gender_counts.index, + title="Predicted Gender Distribution", + ) + st.plotly_chart(fig, use_container_width=True) diff --git a/interface/results_analysis.py b/interface/results_analysis.py new file mode 100644 index 0000000..4f39823 --- /dev/null +++ b/interface/results_analysis.py @@ -0,0 +1,332 @@ +from typing import List + +import numpy as np +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +import streamlit as st + +from research.experiment.experiment_runner import ExperimentRunner +from research.experiment.experiment_tracker import ExperimentTracker + + +class 
ResultsAnalysis: + """Handles experiment results and analysis interface""" + + def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner): + self.config = config + self.experiment_tracker = experiment_tracker + self.experiment_runner = experiment_runner + + def index(self): + """Main results analysis page""" + st.header("Results & Analysis") + tab1, tab2, tab3 = st.tabs(["Experiment Comparison", "Performance Analysis", "Model Analysis"]) + + with tab1: + self.show_experiment_comparison() + + with tab2: + self.show_performance_analysis() + + with tab3: + self.show_model_analysis() + + def show_experiment_comparison(self): + """Show experiment comparison interface""" + st.subheader("Compare Experiments") + + experiments = self.experiment_tracker.list_experiments() + completed_experiments = [e for e in experiments if e.status.value == "completed"] + + if not completed_experiments: + st.warning("No completed experiments found.") + return + + # Experiment selection + exp_options = { + f"{exp.config.name} ({exp.experiment_id[:8]})": exp.experiment_id + for exp in completed_experiments + } + + selected_exp_names = st.multiselect( + "Select Experiments to Compare", + list(exp_options.keys()), + default=list(exp_options.keys())[: min(5, len(exp_options))], + ) + + if not selected_exp_names: + st.info("Please select experiments to compare.") + return + + selected_exp_ids = [exp_options[name] for name in selected_exp_names] + + # Generate comparison + comparison_df = self.experiment_runner.compare_experiments(selected_exp_ids) + + if comparison_df.empty: + st.error("No data available for comparison.") + return + + self._display_comparison_table(comparison_df) + self._display_comparison_charts(comparison_df) + + def _display_comparison_table(self, comparison_df: pd.DataFrame): + """Display comparison table""" + st.write("**Experiment Comparison Table**") + + # Select columns to display + metric_columns = [ + col for col in comparison_df.columns if col.startswith("test_") or col.startswith("cv_") + ] + display_columns = ["name", "model_type", "features"] + metric_columns + available_columns = [col for col in display_columns if col in comparison_df.columns] + + st.dataframe(comparison_df[available_columns], use_container_width=True) + + def _display_comparison_charts(self, comparison_df: pd.DataFrame): + """Display comparison charts""" + st.write("**Performance Comparison**") + + if "test_accuracy" in comparison_df.columns: + fig = px.bar( + comparison_df, + x="name", + y="test_accuracy", + color="model_type", + title="Test Accuracy Comparison", + ) + fig.update_layout(xaxis_tickangle=-45) + st.plotly_chart(fig, use_container_width=True) + + # Metric comparison across multiple metrics + metric_columns = [ + col for col in comparison_df.columns if col.startswith("test_") or col.startswith("cv_") + ] + + if len(metric_columns) > 1: + metric_to_plot = st.selectbox("Select Metric for Detailed Comparison", metric_columns) + + if metric_to_plot in comparison_df.columns: + fig = px.bar( + comparison_df, + x="name", + y=metric_to_plot, + color="model_type", + title=f"{metric_to_plot.replace('_', ' ').title()} Comparison", + ) + fig.update_layout(xaxis_tickangle=-45) + st.plotly_chart(fig, use_container_width=True) + + def show_performance_analysis(self): + """Show performance analysis across experiments""" + st.subheader("Performance Analysis") + + experiments = self.experiment_tracker.list_experiments() + completed_experiments = [ + e for e in experiments if 
e.status.value == "completed" and e.test_metrics + ] + + if not completed_experiments: + st.warning("No completed experiments with metrics found.") + return + + # Prepare data for analysis + analysis_data = self._prepare_analysis_data(completed_experiments) + analysis_df = pd.DataFrame(analysis_data) + + self._display_performance_trends(analysis_df) + self._display_model_comparison(analysis_df) + self._display_top_experiments(analysis_df) + + def _prepare_analysis_data(self, completed_experiments: List) -> List[dict]: + """Prepare data for performance analysis""" + analysis_data = [] + for exp in completed_experiments: + row = { + "experiment_id": exp.experiment_id, + "name": exp.config.name, + "model_type": exp.config.model_type, + "feature_count": len(exp.config.features), + "features": ", ".join([f.value for f in exp.config.features]), + "train_size": exp.train_size, + "test_size": exp.test_size, + **exp.test_metrics, + } + analysis_data.append(row) + return analysis_data + + def _display_performance_trends(self, analysis_df: pd.DataFrame): + """Display performance trend charts""" + col1, col2 = st.columns(2) + + with col1: + # Accuracy vs Training Size + if "accuracy" in analysis_df.columns and "train_size" in analysis_df.columns: + fig = px.scatter( + analysis_df, + x="train_size", + y="accuracy", + color="model_type", + hover_data=["name"], + title="Accuracy vs Training Size", + ) + st.plotly_chart(fig, use_container_width=True) + + with col2: + # Feature Count vs Performance + if "accuracy" in analysis_df.columns and "feature_count" in analysis_df.columns: + fig = px.scatter( + analysis_df, + x="feature_count", + y="accuracy", + color="model_type", + hover_data=["name"], + title="Accuracy vs Number of Features", + ) + st.plotly_chart(fig, use_container_width=True) + + def _display_model_comparison(self, analysis_df: pd.DataFrame): + """Display model type comparison""" + if "accuracy" in analysis_df.columns: + model_performance = ( + analysis_df.groupby("model_type")["accuracy"] + .agg(["mean", "std", "count"]) + .reset_index() + ) + + fig = go.Figure() + fig.add_trace( + go.Bar( + x=model_performance["model_type"], + y=model_performance["mean"], + error_y=dict(type="data", array=model_performance["std"]), + name="Average Accuracy", + ) + ) + fig.update_layout(title="Average Accuracy by Model Type", yaxis_title="Accuracy") + st.plotly_chart(fig, use_container_width=True) + + def _display_top_experiments(self, analysis_df: pd.DataFrame): + """Display top performing experiments""" + st.subheader("Top Performing Experiments") + + if "accuracy" in analysis_df.columns: + display_columns = ["name", "model_type", "features", "accuracy"] + + # Add other metrics if available + for metric in ["precision", "recall", "f1"]: + if metric in analysis_df.columns: + display_columns.append(metric) + + top_experiments = analysis_df.nlargest(5, "accuracy")[display_columns] + st.dataframe(top_experiments, use_container_width=True) + + def show_model_analysis(self): + """Show detailed model analysis""" + st.subheader("Model Analysis") + + experiments = self.experiment_tracker.list_experiments() + completed_experiments = [e for e in experiments if e.status.value == "completed"] + + if not completed_experiments: + st.warning("No completed experiments found.") + return + + # Select experiment for detailed analysis + exp_options = { + f"{exp.config.name} ({exp.experiment_id[:8]})": exp for exp in completed_experiments + } + + selected_exp_name = st.selectbox( + "Select Experiment for Detailed Analysis", 
list(exp_options.keys()) + ) + + if not selected_exp_name: + return + + selected_exp = exp_options[selected_exp_name] + + self._display_experiment_details(selected_exp) + self._display_confusion_matrix(selected_exp) + self._display_feature_importance(selected_exp) + self._display_prediction_examples(selected_exp) + + def _display_experiment_details(self, experiment): + """Display experiment configuration and metrics""" + col1, col2 = st.columns(2) + + with col1: + st.write("**Experiment Configuration**") + st.json( + { + "name": experiment.config.name, + "model_type": experiment.config.model_type, + "features": [f.value for f in experiment.config.features], + "model_params": experiment.config.model_params, + } + ) + + with col2: + st.write("**Performance Metrics**") + if experiment.test_metrics: + for metric, value in experiment.test_metrics.items(): + st.metric(metric.title(), f"{value:.4f}") + + def _display_confusion_matrix(self, experiment): + """Display confusion matrix if available""" + if experiment.confusion_matrix: + st.write("**Confusion Matrix**") + cm = np.array(experiment.confusion_matrix) + + fig = px.imshow(cm, text_auto=True, aspect="auto", title="Confusion Matrix") + st.plotly_chart(fig, use_container_width=True) + + def _display_feature_importance(self, experiment): + """Display feature importance if available""" + if experiment.feature_importance: + st.write("**Feature Importance**") + + importance_data = sorted( + experiment.feature_importance.items(), key=lambda x: x[1], reverse=True + )[:20] + + features, importances = zip(*importance_data) + + fig = px.bar( + x=list(importances), + y=list(features), + orientation="h", + title="Top 20 Feature Importances", + ) + fig.update_layout(height=600) + st.plotly_chart(fig, use_container_width=True) + + def _display_prediction_examples(self, experiment): + """Display prediction examples if available""" + if experiment.prediction_examples: + st.write("**Prediction Examples**") + + examples_df = pd.DataFrame(experiment.prediction_examples) + + # Separate correct and incorrect predictions + correct_examples = examples_df[examples_df["correct"] == True] + incorrect_examples = examples_df[examples_df["correct"] == False] + + col1, col2 = st.columns(2) + + with col1: + st.write("**Correct Predictions**") + if not correct_examples.empty: + st.dataframe( + correct_examples[["name", "true_label", "predicted_label"]], + use_container_width=True, + ) + + with col2: + st.write("**Incorrect Predictions**") + if not incorrect_examples.empty: + st.dataframe( + incorrect_examples[["name", "true_label", "predicted_label"]], + use_container_width=True, + ) diff --git a/monitor.py b/monitor.py index fc0e158..fe29e83 100755 --- a/monitor.py +++ b/monitor.py @@ -3,8 +3,8 @@ import argparse import sys from core.config.config_manager import ConfigManager -from processing.monitoring.pipeline_monitor import PipelineMonitor from processing.monitoring.data_analyzer import DatasetAnalyzer +from processing.monitoring.pipeline_monitor import PipelineMonitor def main(): @@ -112,29 +112,15 @@ def main(): return 1 completion_stats = analyzer.analyze_completion() - quality_stats = analyzer.analyze_quality() print(f"\n=== Dataset Analysis: {args.file} ===") print(f"Total rows: {completion_stats['total_rows']:,}") - print( - f"Annotated: {completion_stats['annotated_rows']:,} ({completion_stats['annotation_percentage']:.1f}%)" - ) + print(f"Annotated: {completion_stats['annotated_rows']:,} ({completion_stats['annotation_percentage']:.1f}%)") print(f"Unannotated: 
{completion_stats['unannotated_rows']:,}") print( f"Complete names: {completion_stats['complete_names']:,} ({completion_stats['completeness_percentage']:.1f}%)" ) - if "name_length" in quality_stats: - length_stats = quality_stats["name_length"] - print(f"\nName length statistics:") - print(f" Average: {length_stats['mean']:.1f} characters") - print(f" Range: {length_stats['min']}-{length_stats['max']} characters") - - if "word_distribution" in quality_stats: - print(f"\nWord count distribution:") - for words, count in quality_stats["word_distribution"].items(): - print(f" {words} words: {count:,} names") - elif args.command == "info": checkpoint_info = monitor.count_checkpoint_files() diff --git a/processing/monitoring/data_analyzer.py b/processing/monitoring/data_analyzer.py index d4c7487..f3c22aa 100644 --- a/processing/monitoring/data_analyzer.py +++ b/processing/monitoring/data_analyzer.py @@ -50,31 +50,3 @@ class DatasetAnalyzer: "complete_names": complete_names, "completeness_percentage": (complete_names / total_rows * 100) if total_rows > 0 else 0, } - - def analyze_quality(self) -> Dict: - """Analyze data quality metrics""" - if self.df is None: - return {} - - quality_metrics = {} - - # Missing values - missing_data = self.df.isnull().sum() - quality_metrics["missing_values"] = missing_data.to_dict() - - # Name length distribution - if "name" in self.df.columns: - name_lengths = self.df["name"].str.len() - quality_metrics["name_length"] = { - "mean": name_lengths.mean(), - "median": name_lengths.median(), - "min": name_lengths.min(), - "max": name_lengths.max(), - } - - # Word count distribution - if "words" in self.df.columns: - word_counts = self.df["words"].value_counts().sort_index() - quality_metrics["word_distribution"] = word_counts.to_dict() - - return quality_metrics diff --git a/processing/steps/feature_extraction_step.py b/processing/steps/feature_extraction_step.py index 7634d64..3a4d520 100644 --- a/processing/steps/feature_extraction_step.py +++ b/processing/steps/feature_extraction_step.py @@ -39,7 +39,7 @@ class FeatureExtractionStep(PipelineStep): @classmethod def get_name_category(cls, word_count: int) -> NameCategory: """Determine name category based on word count""" - if word_count <= 3: + if word_count == 3: return NameCategory.SIMPLE else: return NameCategory.COMPOSE diff --git a/research/model_trainer.py b/research/model_trainer.py index 41ab268..cf50e22 100644 --- a/research/model_trainer.py +++ b/research/model_trainer.py @@ -11,6 +11,7 @@ from core.utils.data_loader import DataLoader from research.experiment import FeatureType, ExperimentConfig from research.experiment.experiment_runner import ExperimentRunner from research.experiment.experiment_tracker import ExperimentTracker +from research.model_registry import MODEL_REGISTRY class ModelTrainer: @@ -21,25 +22,24 @@ class ModelTrainer: self.data_loader = DataLoader(self.config) self.experiment_runner = ExperimentRunner(self.config) self.experiment_tracker = ExperimentTracker(self.config) - self.logger = logging.getLogger(__name__) # Setup model artifacts directory self.models_dir = self.config.paths.models_dir self.models_dir.mkdir(parents=True, exist_ok=True) def train_single_model( - self, - model_name: str, - model_type: str = "logistic_regression", - features: List[str] = None, - model_params: Dict[str, Any] = None, - save_artifacts: bool = True, + self, + model_name: str, + model_type: str = "logistic_regression", + features: List[str] = None, + model_params: Dict[str, Any] = None, + 
save_artifacts: bool = True, ) -> str: """ Train a single model and save its artifacts. Returns the experiment ID. """ - self.logger.info(f"Training {model_type} model: {model_name}") + logging.info(f"Training {model_type} model: {model_name}") if features is None: features = ["full_name"] @@ -60,10 +60,10 @@ class ModelTrainer: experiment = self.experiment_tracker.get_experiment(experiment_id) if experiment and experiment.test_metrics: - self.logger.info("Training completed successfully!") - self.logger.info(f" Experiment ID: {experiment_id}") - self.logger.info(f" Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}") - self.logger.info(f" Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}") + logging.info("Training completed successfully!") + logging.info(f"Experiment ID: {experiment_id}") + logging.info(f"Test Accuracy: {experiment.test_metrics.get('accuracy', 0):.4f}") + logging.info(f"Test F1-Score: {experiment.test_metrics.get('f1', 0):.4f}") if save_artifacts: self.save_model_artifacts(experiment_id) @@ -71,12 +71,15 @@ class ModelTrainer: return experiment_id def train_multiple_models( - self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True + self, + base_name: str, + model_configs: List[Dict[str, Any]], + save_all: bool = True ) -> List[str]: """ Train multiple models with different configurations. """ - self.logger.info(f"Training {len(model_configs)} models...") + logging.info(f"Training {len(model_configs)} models...") experiment_ids = [] @@ -94,10 +97,10 @@ class ModelTrainer: experiment_ids.append(exp_id) except Exception as e: - self.logger.error(f"Failed to train {model_name}: {e}") + logging.error(f"Failed to train {model_name}: {e}") continue - self.logger.info(f"Completed training {len(experiment_ids)} models successfully") + logging.info(f"Completed training {len(experiment_ids)} models successfully") return experiment_ids def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]: @@ -145,7 +148,7 @@ class ModelTrainer: df = self.data_loader.load_csv_complete(data_path) # Generate learning curve - self.logger.info("Generating learning curve...") + logging.info("Generating learning curve...") trained_model.generate_learning_curve(df, df[experiment.config.target_column]) # Plot and save learning curve @@ -169,7 +172,7 @@ class ModelTrainer: json.dump(trained_model.training_history, f, indent=2) except Exception as e: - self.logger.warning(f"Could not generate learning curves: {e}") + logging.warning(f"Could not generate learning curves: {e}") # Save artifacts metadata metadata = { @@ -193,17 +196,17 @@ class ModelTrainer: with open(metadata_path, "w") as f: json.dump(metadata, f, indent=2) - self.logger.info(f"Model artifacts saved to: {model_dir}") - self.logger.info(f" - Complete model: {model_path.name}") - self.logger.info(f" - Configuration: {config_path.name}") - self.logger.info(f" - Results: {results_path.name}") - self.logger.info(f" - Metadata: {metadata_path.name}") + logging.info(f"Model artifacts saved to: {model_dir}") + logging.info(f" - Complete model: {model_path.name}") + logging.info(f" - Configuration: {config_path.name}") + logging.info(f" - Results: {results_path.name}") + logging.info(f" - Metadata: {metadata_path.name}") if learning_curve_path and learning_curve_path.exists(): - self.logger.info(f" - Learning curve: {learning_curve_path.name}") + logging.info(f" - Learning curve: {learning_curve_path.name}") if training_history_path and training_history_path.exists(): - self.logger.info(f" - 
Training history: {training_history_path.name}") + logging.info(f" - Training history: {training_history_path.name}") return { "model_dir": str(model_dir), @@ -231,16 +234,14 @@ class ModelTrainer: metadata = json.load(f) model_type = metadata["model_type"] - from research.model_registry import MODEL_REGISTRY - model_class = MODEL_REGISTRY[model_type] # Load the complete model loaded_model = model_class.load(str(model_path)) - self.logger.info(f"Loaded model: {metadata['model_name']}") - self.logger.info(f" Type: {model_type}") - self.logger.info(f" Accuracy: {metadata['test_accuracy']:.4f}") + logging.info(f"Loaded model: {metadata['model_name']}") + logging.info(f" Type: {model_type}") + logging.info(f" Accuracy: {metadata['test_accuracy']:.4f}") return loaded_model @@ -259,10 +260,10 @@ class ModelTrainer: metadata = json.load(f) models_data.append(metadata) except Exception as e: - self.logger.warning(f"Could not read metadata for {model_dir.name}: {e}") + logging.warning(f"Could not read metadata for {model_dir.name}: {e}") if not models_data: - self.logger.info("No saved models found.") + logging.info("No saved models found.") return pd.DataFrame() df = pd.DataFrame(models_data) diff --git a/research/models/lightgbm_model.py b/research/models/lightgbm_model.py index 07d3881..6efc4c6 100644 --- a/research/models/lightgbm_model.py +++ b/research/models/lightgbm_model.py @@ -22,7 +22,7 @@ class LightGBMModel(TraditionalModel): subsample=params.get("subsample", 0.8), colsample_bytree=params.get("colsample_bytree", 0.8), random_state=self.config.random_seed, - verbose=-1, + verbose=2, ) def prepare_features(self, X: pd.DataFrame) -> np.ndarray: diff --git a/research/models/logistic_regression_model.py b/research/models/logistic_regression_model.py index 5d53859..4bc146c 100644 --- a/research/models/logistic_regression_model.py +++ b/research/models/logistic_regression_model.py @@ -20,7 +20,9 @@ class LogisticRegressionModel(TraditionalModel): ) classifier = LogisticRegression( - max_iter=params.get("max_iter", 1000), random_state=self.config.random_seed + max_iter=params.get("max_iter", 1000), + random_state=self.config.random_seed, + verbose=2 ) return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)]) diff --git a/research/models/random_forest_model.py b/research/models/random_forest_model.py index 12abcbb..a2d9aac 100644 --- a/research/models/random_forest_model.py +++ b/research/models/random_forest_model.py @@ -18,6 +18,7 @@ class RandomForestModel(TraditionalModel): n_estimators=params.get("n_estimators", 100), max_depth=params.get("max_depth", None), random_state=self.config.random_seed, + verbose=2 ) def prepare_features(self, X: pd.DataFrame) -> np.ndarray: diff --git a/research/models/svm_model.py b/research/models/svm_model.py index 762e0c6..03ef3cc 100644 --- a/research/models/svm_model.py +++ b/research/models/svm_model.py @@ -25,6 +25,7 @@ class SVMModel(TraditionalModel): gamma=params.get("gamma", "scale"), probability=True, # Enable probability prediction random_state=self.config.random_seed, + verbose=2 ) return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)]) diff --git a/research/models/xgboost_model.py b/research/models/xgboost_model.py index 3f3fe70..cefc703 100644 --- a/research/models/xgboost_model.py +++ b/research/models/xgboost_model.py @@ -22,6 +22,7 @@ class XGBoostModel(TraditionalModel): colsample_bytree=params.get("colsample_bytree", 0.8), random_state=self.config.random_seed, eval_metric="logloss", + verbosity=2 ) def 
prepare_features(self, X: pd.DataFrame) -> np.ndarray:
diff --git a/research/neural_network_model.py b/research/neural_network_model.py
index b454453..624648d 100644
--- a/research/neural_network_model.py
+++ b/research/neural_network_model.py
@@ -49,6 +49,7 @@ class NeuralNetworkModel(BaseModel):
 
         # Now we can build the model with known vocab size
         vocab_size = len(self.tokenizer.word_index) + 1 if self.tokenizer else 1000
+        logging.info(f"Vocabulary size: {vocab_size}")
 
         # Get additional model parameters
         max_len = self.config.model_params.get("max_len", 6)
@@ -58,16 +59,18 @@ class NeuralNetworkModel(BaseModel):
         )
 
         # Train the neural network
+        logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
         history = self.model.fit(
             X_prepared,
             y_encoded,
             epochs=self.config.model_params.get("epochs", 10),
             batch_size=self.config.model_params.get("batch_size", 64),
             validation_split=0.1,
-            verbose=1,
+            verbose=2,
         )
 
         # Store training history
+
         self.training_history = {
             "accuracy": history.history["accuracy"],
             "loss": history.history["loss"],
diff --git a/research/traditional_model.py b/research/traditional_model.py
index 6e415d0..89dc56d 100644
--- a/research/traditional_model.py
+++ b/research/traditional_model.py
@@ -50,7 +50,8 @@ class TraditionalModel(BaseModel):
         y_encoded = self.label_encoder.transform(y)
 
         # Train model
+        logging.info(f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features")
         self.model.fit(X_prepared, y_encoded)
         self.is_fitted = True
 
         return self
diff --git a/train.py b/train.py
old mode 100644
new mode 100755
index 19dbf24..cb4407e
--- a/train.py
+++ b/train.py
@@ -1,151 +1,26 @@
 #!.venv/bin/python3
-import logging
 import argparse
 
+from core.config import setup_logging, get_config
 from research.model_trainer import ModelTrainer
 
 
-def train_baseline_models():
-    """
-    Quick function to train all baseline models and save artifacts.
-    """
-    logger = logging.getLogger(__name__)
-    logger.info("Training Baseline Models with Artifact Saving")
-
-    trainer = ModelTrainer()
-
-    # Define baseline model configurations
-    baseline_configs = [
-        {
-            "model_type": "logistic_regression",
-            "features": ["full_name"],
-            "model_params": {"ngram_range": [2, 5], "max_features": 10000},
-        },
-        {
-            "model_type": "logistic_regression",
-            "features": ["native_name"],
-            "model_params": {"ngram_range": [2, 4], "max_features": 5000},
-        },
-        {
-            "model_type": "logistic_regression",
-            "features": ["surname"],
-            "model_params": {"ngram_range": [2, 4], "max_features": 5000},
-        },
-        {
-            "model_type": "random_forest",
-            "features": ["name_length", "word_count", "province"],
-            "model_params": {"n_estimators": 100, "max_depth": 10},
-        },
-        {
-            "model_type": "svm",
-            "features": ["full_name"],
-            "model_params": {"kernel": "rbf", "C": 1.0},
-        },
-        {"model_type": "naive_bayes", "features": ["full_name"], "model_params": {"alpha": 1.0}},
-    ]
-
-    # Train all baseline models
-    experiment_ids = trainer.train_multiple_models("baseline", baseline_configs)
-
-    # Show summary
-    logger.info(f"\n Training Summary:")
-    for exp_id in experiment_ids:
-        experiment = trainer.experiment_tracker.get_experiment(exp_id)
-        if experiment:
-            acc = experiment.test_metrics.get("accuracy", 0)
-            logger.info(f" {experiment.config.name}: {acc:.4f} accuracy")
-
-    return experiment_ids
-
-
-def train_neural_networks():
-    """
-    Train neural network models with proper parameters.
-    """
-
-    logging.info("Training Neural Network Models")
-
-    trainer = ModelTrainer()
-
-    neural_configs = [
-        {
-            "model_type": "lstm",
-            "features": ["full_name"],
-            "model_params": {
-                "embedding_dim": 64,
-                "lstm_units": 32,
-                "epochs": 10,
-                "batch_size": 64,
-                "max_len": 6,
-            },
-        },
-        {
-            "model_type": "cnn",
-            "features": ["full_name"],
-            "model_params": {
-                "embedding_dim": 64,
-                "filters": 64,
-                "kernel_size": 3,
-                "epochs": 10,
-                "batch_size": 64,
-                "max_len": 20,  # Character level
-            },
-        },
-        {
-            "model_type": "transformer",
-            "features": ["full_name"],
-            "model_params": {
-                "embedding_dim": 64,
-                "transformer_num_heads": 2,
-                "epochs": 10,
-                "batch_size": 64,
-                "max_len": 6,
-            },
-        },
-    ]
-
-    experiment_ids = trainer.train_multiple_models("neural_networks", neural_configs)
-    return experiment_ids
-
-
 def main():
-    """
-    Main training script with different options.
-    """
-
+    setup_logging(get_config())
     parser = argparse.ArgumentParser(description="Train DRC Names Models")
-    parser.add_argument(
-        "--mode",
-        choices=["baseline", "neural", "list"],
-        default="list",
-        help="Training mode",
-    )
-    parser.add_argument("--model-type", type=str, help="Specific model type to train")
-    parser.add_argument("--name", type=str, help="Model name")
+    parser.add_argument("--type", type=str, required=True, help="Specific model type to train")
+    parser.add_argument("--name", type=str, required=True, help="Model name")
 
     args = parser.parse_args()
     trainer = ModelTrainer()
 
-    if args.mode == "baseline":
-        train_baseline_models()
-
-    elif args.mode == "neural":
-        train_neural_networks()
-
-    elif args.mode == "list":
-        logging.info("📋 Saved Models:")
-        saved_models = trainer.list_saved_models()
-        if not saved_models.empty:
-            logging.info(saved_models.to_string(index=False))
-        else:
-            logging.info("No saved models found.")
-
-    elif args.model_type and args.name:
-        # Train specific model
-        trainer.train_single_model(
-            model_name=args.name, model_type=args.model_type, features=["full_name"]
-        )
+    # Train specific model
+    trainer.train_single_model(
+        model_name=args.name,
+        model_type=args.type,
+        features=["full_name"]
+    )
 
 
 if __name__ == "__main__":