feat: enhance logging and memory management across modules

2025-08-13 23:09:05 +02:00
parent 47e52d130c
commit 9601c5e44d
48 changed files with 1004 additions and 773 deletions
+2 -1
@@ -2,11 +2,12 @@ import pandas as pd
 import streamlit as st
 from core.utils import get_data_file_path
+from core.utils.data_loader import OPTIMIZED_DTYPES

 def load_dataset(file_path: str) -> pd.DataFrame:
     try:
-        return pd.read_csv(file_path)
+        return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
     except Exception as e:
         st.error(f"Error loading dataset: {e}")
         return pd.DataFrame()
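
The OPTIMIZED_DTYPES mapping itself lives in core/utils/data_loader.py and is not part of this diff. A hypothetical sketch of what such a mapping plausibly looks like (column names borrowed from elsewhere in this commit; the actual contents may differ):

import pandas as pd

# Hypothetical stand-in for core.utils.data_loader.OPTIMIZED_DTYPES.
OPTIMIZED_DTYPES = {
    "name": "string",        # dedicated string dtype instead of generic object
    "sex": "category",       # low-cardinality column; category cuts memory sharply
    "province": "category",
}

# Passing the mapping to read_csv applies the narrow dtypes at parse time,
# avoiding a full object-dtype intermediate DataFrame.
df = pd.read_csv("names.csv", dtype=OPTIMIZED_DTYPES)
print(df.memory_usage(deep=True))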
+2 -1
@@ -5,11 +5,12 @@ import plotly.express as px
 import streamlit as st
 from core.utils import get_data_file_path
+from core.utils.data_loader import OPTIMIZED_DTYPES

 def load_dataset(file_path: str) -> pd.DataFrame:
     try:
-        return pd.read_csv(file_path)
+        return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
     except Exception as e:
         st.error(f"Error loading dataset: {e}")
         return pd.DataFrame()
+27 -22
@@ -2,12 +2,13 @@ import pandas as pd
 import plotly.express as px
 import streamlit as st
+from core.utils.data_loader import OPTIMIZED_DTYPES
 from interface.log_reader import LogReader

 def load_dataset(file_path: str) -> pd.DataFrame:
     try:
-        return pd.read_csv(file_path)
+        return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
     except Exception as e:
         st.error(f"Error loading dataset: {e}")
         return pd.DataFrame()
@@ -56,16 +57,12 @@ class DataProcessing:
             log_level_filter = st.selectbox(
                 "Filter by Level",
                 ["All", "INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"],
-                key="log_level_filter"
+                key="log_level_filter",
             )
         with col2:
             num_entries = st.number_input(
-                "Number of entries",
-                min_value=5,
-                max_value=50,
-                value=10,
-                key="num_log_entries"
+                "Number of entries", min_value=5, max_value=50, value=10, key="num_log_entries"
             )

         # Get log entries based on filter
@@ -77,13 +74,21 @@ class DataProcessing:
         if log_entries:
             for entry in log_entries:
                 if entry.level == "ERROR":
-                    st.error(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
+                    st.error(
+                        f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
+                    )
                 elif entry.level == "WARNING":
-                    st.warning(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
+                    st.warning(
+                        f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
+                    )
                 elif entry.level == "INFO":
-                    st.info(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
+                    st.info(
+                        f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
+                    )
                 else:
-                    st.text(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
+                    st.text(
+                        f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
+                    )

         # Show log statistics
         st.subheader("Log Statistics")
@@ -93,16 +98,16 @@ class DataProcessing:
         col1, col2, col3, col4 = st.columns(4)
         with col1:
-            st.metric("Total Lines", log_stats.get('total_lines', 0))
+            st.metric("Total Lines", log_stats.get("total_lines", 0))
         with col2:
-            st.metric("INFO", log_stats.get('INFO', 0))
+            st.metric("INFO", log_stats.get("INFO", 0))
         with col3:
-            st.metric("WARNING", log_stats.get('WARNING', 0))
+            st.metric("WARNING", log_stats.get("WARNING", 0))
         with col4:
-            st.metric("ERROR", log_stats.get('ERROR', 0))
+            st.metric("ERROR", log_stats.get("ERROR", 0))

         # Log level distribution chart
-        levels = ['INFO', 'WARNING', 'ERROR', 'DEBUG', 'CRITICAL']
+        levels = ["INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"]
         counts = [log_stats.get(level, 0) for level in levels]

         if sum(counts) > 0:
@@ -112,12 +117,12 @@ class DataProcessing:
                 title="Log Entries by Level",
                 color=levels,
                 color_discrete_map={
-                    'INFO': 'blue',
-                    'WARNING': 'orange',
-                    'ERROR': 'red',
-                    'DEBUG': 'gray',
-                    'CRITICAL': 'darkred'
-                }
+                    "INFO": "blue",
+                    "WARNING": "orange",
+                    "ERROR": "red",
+                    "DEBUG": "gray",
+                    "CRITICAL": "darkred",
+                },
             )
             st.plotly_chart(fig, use_container_width=True)
         else:
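
The chart constructor itself is cut off by the hunk boundary above, so whether the app uses px.pie or px.bar is not visible here. A minimal sketch assuming px.pie, with stand-in counts:

import plotly.express as px

levels = ["INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"]
counts = [12, 3, 1, 0, 0]  # stand-in for [log_stats.get(level, 0) for level in levels]

# px.pie is an assumption; the color_discrete_map mirrors the diff above.
fig = px.pie(
    values=counts,
    names=levels,
    title="Log Entries by Level",
    color=levels,
    color_discrete_map={
        "INFO": "blue",
        "WARNING": "orange",
        "ERROR": "red",
        "DEBUG": "gray",
        "CRITICAL": "darkred",
    },
)
fig.show()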
+50 -14
@@ -14,7 +14,9 @@ from research.model_registry import list_available_models
 class Experiments:
     """Handles experiment management interface"""

-    def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
+    def __init__(
+        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
+    ):
         self.config = config
         self.experiment_tracker = experiment_tracker
         self.experiment_runner = experiment_runner
@@ -41,13 +43,19 @@ class Experiments:
         col1, col2 = st.columns(2)

         with col1:
-            exp_name = st.text_input("Experiment Name", placeholder="e.g., native_name_gender_prediction")
-            description = st.text_area("Description", placeholder="Brief description of the experiment")
+            exp_name = st.text_input(
+                "Experiment Name", placeholder="e.g., native_name_gender_prediction"
+            )
+            description = st.text_area(
+                "Description", placeholder="Brief description of the experiment"
+            )
             model_type = st.selectbox("Model Type", list_available_models())

             # Feature selection
             feature_options = [f.value for f in FeatureType]
-            selected_features = st.multiselect("Features to Use", feature_options, default=["full_name"])
+            selected_features = st.multiselect(
+                "Features to Use", feature_options, default=["full_name"]
+            )

         with col2:
             # Model parameters
@@ -74,7 +82,9 @@ class Experiments:
             test_size = st.slider("Test Set Size", 0.1, 0.5, 0.2)
             cv_folds = st.number_input("Cross-Validation Folds", 3, 10, 5)
-            tags = st.text_input("Tags (comma-separated)", placeholder="e.g., baseline, feature_study")
+            tags = st.text_input(
+                "Tags (comma-separated)", placeholder="e.g., baseline, feature_study"
+            )

         # Advanced options
         with st.expander("Advanced Options"):
@@ -92,14 +102,33 @@ class Experiments:
         if submitted:
             self._handle_experiment_submission(
-                exp_name, description, model_type, selected_features, model_params,
-                test_size, cv_folds, tags, filter_province, min_words, max_words
+                exp_name,
+                description,
+                model_type,
+                selected_features,
+                model_params,
+                test_size,
+                cv_folds,
+                tags,
+                filter_province,
+                min_words,
+                max_words,
             )

-    def _handle_experiment_submission(self, exp_name: str, description: str, model_type: str,
-                                      selected_features: List[str], model_params: Dict[str, Any],
-                                      test_size: float, cv_folds: int, tags: str,
-                                      filter_province: str, min_words: int, max_words: int):
+    def _handle_experiment_submission(
+        self,
+        exp_name: str,
+        description: str,
+        model_type: str,
+        selected_features: List[str],
+        model_params: Dict[str, Any],
+        test_size: float,
+        cv_folds: int,
+        tags: str,
+        filter_province: str,
+        min_words: int,
+        max_words: int,
+    ):
         """Handle experiment form submission"""
         if not exp_name:
             st.error("Please provide an experiment name")
@@ -183,7 +212,7 @@ class Experiments:
         # Display experiments
         for i, exp in enumerate(experiments):
             with st.expander(
-                    f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
+                f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
             ):
                 self._display_experiment_details(exp, i)
@@ -268,8 +297,15 @@ class Experiments:
             base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags
         )

-    def run_batch_experiments(self, base_name: str, model_types: List[str], ngram_ranges: str,
-                              feature_combinations: List[str], test_sizes: str, tags: str):
+    def run_batch_experiments(
+        self,
+        base_name: str,
+        model_types: List[str],
+        ngram_ranges: str,
+        feature_combinations: List[str],
+        test_sizes: str,
+        tags: str,
+    ):
         """Run batch experiments with parameter combinations"""
         with st.spinner("Running batch experiments..."):
             try:
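
Note that ngram_ranges, test_sizes, and tags arrive as comma-separated strings from the form; the parsing inside run_batch_experiments is not shown in this diff. A hedged sketch of what it plausibly does:

# Hypothetical parsing; the real implementation may differ.
test_sizes_input = "0.2, 0.3"
tags_input = "baseline, feature_study"

test_sizes = [float(s) for s in test_sizes_input.split(",")]
tags = [t.strip() for t in tags_input.split(",") if t.strip()]
print(test_sizes, tags)  # [0.2, 0.3] ['baseline', 'feature_study']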
+16 -19
@@ -8,6 +8,7 @@ from typing import List, Dict, Optional
 @dataclass
 class LogEntry:
     """Represents a single log entry."""
+
     timestamp: datetime
     logger: str
     level: str
@@ -23,7 +24,7 @@ class LogReader:
         self.log_file_path = Path(log_file_path)
         # Pattern to match Python logging format: timestamp - logger - level - message
         self.log_pattern = re.compile(
-            r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (.+?) - (\w+) - (.+)'
+            r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (.+?) - (\w+) - (.+)"
         )

     def read_last_entries(self, count: int = 10) -> List[LogEntry]:
@@ -32,12 +33,12 @@
             return []

         try:
-            with open(self.log_file_path, 'r', encoding='utf-8') as file:
+            with open(self.log_file_path, "r", encoding="utf-8") as file:
                 lines = file.readlines()

             # Parse log entries from the end
             entries = []
-            for line in reversed(lines[-count * 2:]):  # Read more lines in case some don't match
+            for line in reversed(lines[-count * 2 :]):  # Read more lines in case some don't match
                 entry = self._parse_log_line(line.strip())
                 if entry:
                     entries.append(entry)
@@ -57,7 +58,7 @@
             return []

         try:
-            with open(self.log_file_path, 'r', encoding='utf-8') as file:
+            with open(self.log_file_path, "r", encoding="utf-8") as file:
                 lines = file.readlines()

             entries = []
@@ -80,7 +81,7 @@
             return []

         try:
-            with open(self.log_file_path, 'r', encoding='utf-8') as file:
+            with open(self.log_file_path, "r", encoding="utf-8") as file:
                 lines = file.readlines()

             entries = []
@@ -107,16 +108,16 @@
             return {}

         try:
-            with open(self.log_file_path, 'r', encoding='utf-8') as file:
+            with open(self.log_file_path, "r", encoding="utf-8") as file:
                 lines = file.readlines()

             stats = {
-                'total_lines': len(lines),
-                'INFO': 0,
-                'WARNING': 0,
-                'ERROR': 0,
-                'DEBUG': 0,
-                'CRITICAL': 0
+                "total_lines": len(lines),
+                "INFO": 0,
+                "WARNING": 0,
+                "ERROR": 0,
+                "DEBUG": 0,
+                "CRITICAL": 0,
             }

             for line in lines:
@@ -143,14 +144,10 @@
         try:
             timestamp_str, logger, level, message = match.groups()
-            timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S,%f')
+            timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S,%f")
             return LogEntry(
-                timestamp=timestamp,
-                logger=logger,
-                level=level,
-                message=message,
-                raw_line=line
+                timestamp=timestamp, logger=logger, level=level, message=message, raw_line=line
             )
         except ValueError:
             return None
@@ -168,7 +165,7 @@ class MultiLogReader:
         if not self.log_directory.exists():
             return []

-        return list(self.log_directory.glob('*.log'))
+        return list(self.log_directory.glob("*.log"))

     def read_from_all_files(self, count: int = 10) -> List[LogEntry]:
         """Read entries from all log files and merge them chronologically."""
+19 -12
@@ -9,6 +9,7 @@ import plotly.express as px
 import streamlit as st
 from core.utils import get_data_file_path
+from core.utils.data_loader import OPTIMIZED_DTYPES
 from research.experiment.experiment_runner import ExperimentRunner
 from research.experiment.experiment_tracker import ExperimentTracker
@@ -16,7 +17,9 @@ from research.experiment.experiment_tracker import ExperimentTracker
 class Predictions:
     """Handles prediction interface"""

-    def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
+    def __init__(
+        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
+    ):
         self.config = config
         self.experiment_tracker = experiment_tracker
         self.experiment_runner = experiment_runner
@@ -86,7 +89,9 @@ class Predictions:
                     confidence = self._get_prediction_confidence(model, input_df)

                     # Display results
-                    self._display_single_prediction_results(prediction, confidence, experiment, name_input)
+                    self._display_single_prediction_results(
+                        prediction, confidence, experiment, name_input
+                    )

                 except Exception as e:
                     st.error(f"Error making prediction: {e}")
@@ -114,8 +119,9 @@ class Predictions:
         except:
             return None

-    def _display_single_prediction_results(self, prediction: str, confidence: Optional[float],
-                                           experiment, name_input: str):
+    def _display_single_prediction_results(
+        self, prediction: str, confidence: Optional[float], experiment, name_input: str
+    ):
         """Display single prediction results"""
         col1, col2 = st.columns(2)
@@ -129,9 +135,7 @@ class Predictions:
         # Additional info
         st.info(f"Model used: {experiment.config.name}")
-        st.info(
-            f"Features used: {', '.join([f.value for f in experiment.config.features])}"
-        )
+        st.info(f"Features used: {', '.join([f.value for f in experiment.config.features])}")

     def show_batch_prediction(self, experiment):
         """Show batch prediction interface"""
@@ -141,7 +145,7 @@ class Predictions:
         if uploaded_file is not None:
             try:
-                df = pd.read_csv(uploaded_file)
+                df = pd.read_csv(uploaded_file, dtype=OPTIMIZED_DTYPES)

                 st.write("**Uploaded Data Preview:**")
                 st.dataframe(df.head(), use_container_width=True)
@@ -296,13 +300,14 @@ class Predictions:
     def _load_dataset(self, file_path: str) -> pd.DataFrame:
         """Load dataset with error handling"""
         try:
-            return pd.read_csv(file_path)
+            return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
         except Exception as e:
             st.error(f"Error loading dataset: {e}")
             return pd.DataFrame()

-    def _run_dataset_prediction(self, df: pd.DataFrame, experiment, sample_size: int,
-                                compare_with_actual: bool):
+    def _run_dataset_prediction(
+        self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
+    ):
         """Run dataset prediction and display results"""
         with st.spinner("Running predictions..."):
             # Sample data if requested
@@ -353,7 +358,9 @@ class Predictions:
             with col2:
                 st.write("**Sample Incorrect Predictions**")
-                incorrect_sample = df_sample[~correct_mask][["name", "sex", "predicted_gender"]].head(10)
+                incorrect_sample = df_sample[~correct_mask][["name", "sex", "predicted_gender"]].head(
+                    10
+                )
                 st.dataframe(incorrect_sample, use_container_width=True)

     def _display_dataset_predictions(self, df_sample: pd.DataFrame):
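
correct_mask is defined outside this hunk; it is presumably the equality of the actual and predicted columns. A tiny hedged sketch of that comparison, with invented illustrative rows:

import pandas as pd

df_sample = pd.DataFrame(
    {"name": ["A. Example", "B. Example"], "sex": ["F", "M"], "predicted_gender": ["F", "F"]}
)
correct_mask = df_sample["sex"] == df_sample["predicted_gender"]  # assumed definition
incorrect_sample = df_sample[~correct_mask][["name", "sex", "predicted_gender"]].head(10)
print(incorrect_sample)  # rows where prediction and label disagree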
+6 -2
@@ -13,7 +13,9 @@ from research.experiment.experiment_tracker import ExperimentTracker
 class ResultsAnalysis:
     """Handles experiment results and analysis interface"""

-    def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
+    def __init__(
+        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
+    ):
         self.config = config
         self.experiment_tracker = experiment_tracker
         self.experiment_runner = experiment_runner
@@ -21,7 +23,9 @@ class ResultsAnalysis:
     def index(self):
         """Main results analysis page"""
         st.header("Results & Analysis")

-        tab1, tab2, tab3 = st.tabs(["Experiment Comparison", "Performance Analysis", "Model Analysis"])
+        tab1, tab2, tab3 = st.tabs(
+            ["Experiment Comparison", "Performance Analysis", "Model Analysis"]
+        )

         with tab1:
             self.show_experiment_comparison()