refactoring: add initial pipeline configuration and model classes

2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
@@ -0,0 +1,76 @@
import pandas as pd
import streamlit as st
from core.utils import get_data_file_path
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class Dashboard:
def __init__(self, config, experiment_tracker, experiment_runner):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
def index(self):
st.header("Dashboard")
col1, col2, col3, col4 = st.columns(4)
# Load basic statistics
try:
data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
if data_path.exists():
df = load_dataset(str(data_path))
with col1:
st.metric("Total Names", f"{len(df):,}")
with col2:
                    # Count annotated rows; guard against a missing column
                    annotated = (df["annotated"] == 1).sum() if "annotated" in df.columns else 0
st.metric("Annotated Names", f"{annotated:,}")
with col3:
provinces = df["province"].nunique() if "province" in df.columns else 0
st.metric("Provinces", provinces)
with col4:
if "sex" in df.columns:
gender_dist = df["sex"].value_counts()
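                        # max(..., 1) guards against division by zero when no male names exist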
ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
st.metric("F/M Ratio", f"{ratio:.2f}")
else:
st.warning("No processed data found. Please run data processing first.")
except Exception as e:
st.error(f"Error loading dashboard data: {e}")
# Recent experiments
st.subheader("Recent Experiments")
experiments = self.experiment_tracker.list_experiments()[:5]
if experiments:
exp_data = []
for exp in experiments:
exp_data.append(
{
"Name": exp.config.name,
"Model": exp.config.model_type,
"Status": exp.status.value,
"Accuracy": (
f"{exp.test_metrics.get('accuracy', 0):.3f}"
if exp.test_metrics
else "N/A"
),
"Date": exp.start_time.strftime("%Y-%m-%d %H:%M"),
}
)
st.dataframe(pd.DataFrame(exp_data), use_container_width=True)
else:
st.info("No experiments found. Create your first experiment in the Experiments tab!")
@@ -0,0 +1,154 @@
from datetime import datetime
import pandas as pd
import plotly.express as px
import streamlit as st
from core.utils import get_data_file_path
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class DataOverview:
def __init__(self, config):
self.config = config
def index(self):
st.header("Data Overview")
data_files = {
"Names": self.config.data.input_file,
"Featured Dataset": self.config.data.output_files["featured"],
"Evaluation Dataset": self.config.data.output_files["evaluation"],
"Male Names": self.config.data.output_files["males"],
"Female Names": self.config.data.output_files["females"],
}
selected_file = st.selectbox("Select Dataset", list(data_files.keys()))
file_path = get_data_file_path(data_files[selected_file], self.config)
if not file_path.exists():
st.warning(f"Dataset not found: {file_path}")
st.warning("Please run data processing first to generate datasets.")
return
# Load and display data
df = load_dataset(str(file_path))
if df.empty:
st.error("Failed to load dataset")
return
# Basic statistics
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Records", f"{len(df):,}")
with col2:
if "annotated" in df.columns:
annotated_pct = (df["annotated"] == 1).mean() * 100
st.metric("Annotated", f"{annotated_pct:.1f}%")
with col3:
if "words" in df.columns:
avg_words = df["words"].mean()
st.metric("Avg Words", f"{avg_words:.1f}")
with col4:
if "length" in df.columns:
avg_length = df["length"].mean()
st.metric("Avg Length", f"{avg_length:.0f}")
# Data quality analysis
st.subheader("Data Quality Analysis")
col1, col2 = st.columns(2)
with col1:
# Missing values
missing_data = df.isnull().sum()
if missing_data.sum() > 0:
fig = px.bar(
x=missing_data.index, y=missing_data.values, title="Missing Values by Column"
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
else:
st.success("No missing values found")
with col2:
# Gender distribution
if "sex" in df.columns:
gender_counts = df["sex"].value_counts()
fig = px.pie(
values=gender_counts.values,
names=gender_counts.index,
title="Gender Distribution",
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
# Word count distribution
if "words" in df.columns:
st.subheader("Name Structure Analysis")
col1, col2 = st.columns(2)
with col1:
word_dist = df["words"].value_counts().sort_index()
fig = px.bar(
x=word_dist.index,
y=word_dist.values,
title="Distribution of Word Count in Names",
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
with col2:
# Province distribution
if "province" in df.columns:
province_counts = df["province"].value_counts().head(10)
fig = px.bar(
x=province_counts.values,
y=province_counts.index,
orientation="h",
title="Top 10 Provinces by Name Count",
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
# Sample data
st.subheader("Sample Data")
# Display columns selector
if not df.empty:
columns_to_show = st.multiselect(
"Select columns to display",
df.columns.tolist(),
default=(
["name", "sex", "province", "words"]
if all(col in df.columns for col in ["name", "sex", "province", "words"])
else df.columns[:5].tolist()
),
)
if columns_to_show:
                max_rows = max(10, min(1000, len(df)))
                sample_size = st.slider("Number of rows to display", 10, max_rows, min(50, max_rows))
st.dataframe(df[columns_to_show].head(sample_size), use_container_width=True)
# Data export
st.subheader("Export Data")
if st.button("Download as CSV"):
csv = df.to_csv(index=False)
st.download_button(
label="Download CSV",
data=csv,
file_name=f"{selected_file.lower().replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.csv",
mime="text/csv",
)
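Both pages above resolve paths through `config.data.input_file` and `config.data.output_files[...]`. The config classes are not in these hunks; a rough sketch of the shape those accesses imply (all names and defaults are inferred, not taken from this commit):

from dataclasses import dataclass, field
from typing import Dict

# Inferred config shape; the real classes live elsewhere in this commit
@dataclass
class DataConfig:
    input_file: str = "names.csv"  # assumed default
    output_files: Dict[str, str] = field(default_factory=lambda: {
        "featured": "featured.csv",
        "evaluation": "evaluation.csv",
        "males": "males.csv",
        "females": "females.csv",
    })

@dataclass
class AppConfig:
    data: DataConfig = field(default_factory=DataConfig)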
@@ -0,0 +1,127 @@
import pandas as pd
import plotly.express as px
import streamlit as st
from web.log_reader import LogReader
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class DataProcessing:
def __init__(self, config, pipeline_monitor):
self.config = config
self.pipeline_monitor = pipeline_monitor
def index(self):
st.header("Data Processing Pipeline")
status = self.pipeline_monitor.get_pipeline_status()
# Overall progress
overall_progress = status["overall_completion"] / 100
st.progress(overall_progress)
st.write(f"Overall Progress: {status['overall_completion']:.1f}%")
# Step details
for step_name, step_status in status["steps"].items():
with st.expander(f"{step_name.replace('_', ' ').title()} - {step_status['status']}"):
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Processed Batches", step_status["processed_batches"])
with col2:
st.metric("Total Batches", step_status["total_batches"])
with col3:
st.metric("Failed Batches", step_status["failed_batches"])
if step_status["completion_percentage"] > 0:
st.progress(step_status["completion_percentage"] / 100)
# Read actual log entries from the log file
st.subheader("Recent Processing Logs")
try:
log_file_path = self.config.paths.logs_dir / "pipeline.development.log"
log_reader = LogReader(log_file_path)
# Options for filtering logs
col1, col2 = st.columns(2)
with col1:
log_level_filter = st.selectbox(
"Filter by Level",
["All", "INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"],
key="log_level_filter"
)
with col2:
num_entries = st.number_input(
"Number of entries",
min_value=5,
max_value=50,
value=10,
key="num_log_entries"
)
# Get log entries based on filter
if log_level_filter == "All":
log_entries = log_reader.read_last_entries(num_entries)
else:
log_entries = log_reader.read_entries_by_level(log_level_filter, num_entries)
if log_entries:
for entry in log_entries:
if entry.level == "ERROR":
st.error(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
elif entry.level == "WARNING":
st.warning(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
elif entry.level == "INFO":
st.info(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
else:
st.text(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
# Show log statistics
st.subheader("Log Statistics")
log_stats = log_reader.get_log_stats()
if log_stats:
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Lines", log_stats.get('total_lines', 0))
with col2:
st.metric("INFO", log_stats.get('INFO', 0))
with col3:
st.metric("WARNING", log_stats.get('WARNING', 0))
with col4:
st.metric("ERROR", log_stats.get('ERROR', 0))
# Log level distribution chart
levels = ['INFO', 'WARNING', 'ERROR', 'DEBUG', 'CRITICAL']
counts = [log_stats.get(level, 0) for level in levels]
if sum(counts) > 0:
fig = px.bar(
x=levels,
y=counts,
title="Log Entries by Level",
color=levels,
color_discrete_map={
'INFO': 'blue',
'WARNING': 'orange',
'ERROR': 'red',
'DEBUG': 'gray',
'CRITICAL': 'darkred'
}
)
st.plotly_chart(fig, use_container_width=True)
else:
st.info("No log entries found or log file is empty.")
except Exception as e:
st.error(f"Error reading log file: {e}")
@@ -0,0 +1,185 @@
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
@dataclass
class LogEntry:
"""Represents a single log entry."""
timestamp: datetime
logger: str
level: str
message: str
raw_line: str
class LogReader:
"""Utility class for reading and parsing log files."""
def __init__(self, log_file_path: Path):
"""Initialize the log reader with a log file path."""
self.log_file_path = Path(log_file_path)
# Pattern to match Python logging format: timestamp - logger - level - message
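        # e.g. "2025-08-04 16:12:25,123 - core.pipeline - INFO - Processing batch 3/8"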
self.log_pattern = re.compile(
r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (.+?) - (\w+) - (.+)'
)
def read_last_entries(self, count: int = 10) -> List[LogEntry]:
"""Read the last N entries from the log file."""
if not self.log_file_path.exists():
return []
try:
with open(self.log_file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
# Parse log entries from the end
entries = []
for line in reversed(lines[-count*2:]): # Read more lines in case some don't match
entry = self._parse_log_line(line.strip())
if entry:
entries.append(entry)
if len(entries) >= count:
break
# Return entries in chronological order (oldest first of the last N)
return list(reversed(entries))
except Exception as e:
print(f"Error reading log file: {e}")
return []
def read_entries_by_level(self, level: str, count: int = 50) -> List[LogEntry]:
"""Read entries filtered by log level."""
if not self.log_file_path.exists():
return []
try:
with open(self.log_file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
entries = []
for line in reversed(lines):
entry = self._parse_log_line(line.strip())
if entry and entry.level.upper() == level.upper():
entries.append(entry)
if len(entries) >= count:
break
return list(reversed(entries))
except Exception as e:
print(f"Error reading log file: {e}")
return []
def read_entries_since(self, since: datetime, count: int = 100) -> List[LogEntry]:
"""Read entries since a specific datetime."""
if not self.log_file_path.exists():
return []
try:
with open(self.log_file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
entries = []
for line in reversed(lines):
entry = self._parse_log_line(line.strip())
if entry:
if entry.timestamp >= since:
entries.append(entry)
else:
# Stop reading if we've gone past the since time
break
if len(entries) >= count:
break
return list(reversed(entries))
except Exception as e:
print(f"Error reading log file: {e}")
return []
def get_log_stats(self) -> Dict[str, int]:
"""Get statistics about the log file."""
if not self.log_file_path.exists():
return {}
try:
with open(self.log_file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
stats = {
'total_lines': len(lines),
'INFO': 0,
'WARNING': 0,
'ERROR': 0,
'DEBUG': 0,
'CRITICAL': 0
}
for line in lines:
entry = self._parse_log_line(line.strip())
if entry:
level = entry.level.upper()
if level in stats:
stats[level] += 1
return stats
except Exception as e:
print(f"Error reading log file: {e}")
return {}
def _parse_log_line(self, line: str) -> Optional[LogEntry]:
"""Parse a single log line into a LogEntry object."""
if not line:
return None
match = self.log_pattern.match(line)
if not match:
return None
try:
timestamp_str, logger, level, message = match.groups()
timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S,%f')
return LogEntry(
timestamp=timestamp,
logger=logger,
level=level,
message=message,
raw_line=line
)
except ValueError:
return None
class MultiLogReader:
"""Reader for multiple log files."""
def __init__(self, log_directory: Path):
"""Initialize with a directory containing log files."""
self.log_directory = Path(log_directory)
def get_available_log_files(self) -> List[Path]:
"""Get list of available log files."""
if not self.log_directory.exists():
return []
return list(self.log_directory.glob('*.log'))
def read_from_all_files(self, count: int = 10) -> List[LogEntry]:
"""Read entries from all log files and merge them chronologically."""
all_entries = []
for log_file in self.get_available_log_files():
reader = LogReader(log_file)
entries = reader.read_last_entries(count)
all_entries.extend(entries)
# Sort by timestamp
all_entries.sort(key=lambda x: x.timestamp, reverse=True)
return all_entries[:count]
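A short usage sketch for the readers above; the log path mirrors the one used by the DataProcessing page, and the print calls are illustrative:

from pathlib import Path

# Example usage (assumes logs/pipeline.development.log exists)
reader = LogReader(Path("logs/pipeline.development.log"))
for entry in reader.read_entries_by_level("ERROR", count=5):
    print(entry.timestamp.isoformat(), entry.message)

# Merge the most recent entries across every *.log file in the directory
multi = MultiLogReader(Path("logs"))
for entry in multi.read_from_all_files(count=10):
    print(entry.logger, entry.level, entry.message)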