feat: enhance logging and memory management across modules

2025-08-13 23:09:05 +02:00
parent 47e52d130c
commit 9601c5e44d
48 changed files with 1004 additions and 773 deletions
+2 -1
@@ -2,11 +2,12 @@ import pandas as pd
 import streamlit as st
 from core.utils import get_data_file_path
+from core.utils.data_loader import OPTIMIZED_DTYPES

 def load_dataset(file_path: str) -> pd.DataFrame:
     try:
-        return pd.read_csv(file_path)
+        return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
     except Exception as e:
         st.error(f"Error loading dataset: {e}")
         return pd.DataFrame()
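
The OPTIMIZED_DTYPES mapping itself lives in core/utils/data_loader.py and is not part of this diff. A hypothetical sketch of what such a mapping plausibly looks like (column names borrowed from elsewhere in this commit; the actual contents may differ):

import pandas as pd

# Hypothetical stand-in for core.utils.data_loader.OPTIMIZED_DTYPES.
OPTIMIZED_DTYPES = {
    "name": "string",        # dedicated string dtype instead of generic object
    "sex": "category",       # low-cardinality column; category cuts memory sharply
    "province": "category",
}

# Passing the mapping to read_csv applies the narrow dtypes at parse time,
# avoiding a full object-dtype intermediate DataFrame.
df = pd.read_csv("names.csv", dtype=OPTIMIZED_DTYPES)
print(df.memory_usage(deep=True))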
+2 -1
@@ -5,11 +5,12 @@ import plotly.express as px
 import streamlit as st
 from core.utils import get_data_file_path
+from core.utils.data_loader import OPTIMIZED_DTYPES

 def load_dataset(file_path: str) -> pd.DataFrame:
     try:
-        return pd.read_csv(file_path)
+        return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
     except Exception as e:
         st.error(f"Error loading dataset: {e}")
         return pd.DataFrame()
+27 -22
@@ -2,12 +2,13 @@ import pandas as pd
 import plotly.express as px
 import streamlit as st
+from core.utils.data_loader import OPTIMIZED_DTYPES
 from interface.log_reader import LogReader

 def load_dataset(file_path: str) -> pd.DataFrame:
     try:
-        return pd.read_csv(file_path)
+        return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
     except Exception as e:
         st.error(f"Error loading dataset: {e}")
         return pd.DataFrame()
@@ -56,16 +57,12 @@ class DataProcessing:
             log_level_filter = st.selectbox(
                 "Filter by Level",
                 ["All", "INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"],
-                key="log_level_filter"
+                key="log_level_filter",
             )
         with col2:
             num_entries = st.number_input(
-                "Number of entries",
-                min_value=5,
-                max_value=50,
-                value=10,
-                key="num_log_entries"
+                "Number of entries", min_value=5, max_value=50, value=10, key="num_log_entries"
             )

         # Get log entries based on filter
@@ -77,13 +74,21 @@ class DataProcessing:
         if log_entries:
             for entry in log_entries:
                 if entry.level == "ERROR":
-                    st.error(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
+                    st.error(
+                        f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
+                    )
                 elif entry.level == "WARNING":
-                    st.warning(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
+                    st.warning(
+                        f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
+                    )
                 elif entry.level == "INFO":
-                    st.info(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
+                    st.info(
+                        f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
+                    )
                 else:
-                    st.text(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
+                    st.text(
+                        f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
+                    )

         # Show log statistics
         st.subheader("Log Statistics")
@@ -93,16 +98,16 @@ class DataProcessing:
         col1, col2, col3, col4 = st.columns(4)
         with col1:
-            st.metric("Total Lines", log_stats.get('total_lines', 0))
+            st.metric("Total Lines", log_stats.get("total_lines", 0))
         with col2:
-            st.metric("INFO", log_stats.get('INFO', 0))
+            st.metric("INFO", log_stats.get("INFO", 0))
         with col3:
-            st.metric("WARNING", log_stats.get('WARNING', 0))
+            st.metric("WARNING", log_stats.get("WARNING", 0))
         with col4:
-            st.metric("ERROR", log_stats.get('ERROR', 0))
+            st.metric("ERROR", log_stats.get("ERROR", 0))

         # Log level distribution chart
-        levels = ['INFO', 'WARNING', 'ERROR', 'DEBUG', 'CRITICAL']
+        levels = ["INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"]
         counts = [log_stats.get(level, 0) for level in levels]

         if sum(counts) > 0:
@@ -112,12 +117,12 @@ class DataProcessing:
                 title="Log Entries by Level",
                 color=levels,
                 color_discrete_map={
-                    'INFO': 'blue',
-                    'WARNING': 'orange',
-                    'ERROR': 'red',
-                    'DEBUG': 'gray',
-                    'CRITICAL': 'darkred'
-                }
+                    "INFO": "blue",
+                    "WARNING": "orange",
+                    "ERROR": "red",
+                    "DEBUG": "gray",
+                    "CRITICAL": "darkred",
+                },
             )
             st.plotly_chart(fig, use_container_width=True)
         else:
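
The chart constructor itself is cut off by the hunk boundary above, so whether the app uses px.pie or px.bar is not visible here. A minimal sketch assuming px.pie, with stand-in counts:

import plotly.express as px

levels = ["INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"]
counts = [12, 3, 1, 0, 0]  # stand-in for [log_stats.get(level, 0) for level in levels]

# px.pie is an assumption; the color_discrete_map mirrors the diff above.
fig = px.pie(
    values=counts,
    names=levels,
    title="Log Entries by Level",
    color=levels,
    color_discrete_map={
        "INFO": "blue",
        "WARNING": "orange",
        "ERROR": "red",
        "DEBUG": "gray",
        "CRITICAL": "darkred",
    },
)
fig.show()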
+50 -14
@@ -14,7 +14,9 @@ from research.model_registry import list_available_models
 class Experiments:
     """Handles experiment management interface"""

-    def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
+    def __init__(
+        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
+    ):
         self.config = config
         self.experiment_tracker = experiment_tracker
         self.experiment_runner = experiment_runner
@@ -41,13 +43,19 @@ class Experiments:
         col1, col2 = st.columns(2)

         with col1:
-            exp_name = st.text_input("Experiment Name", placeholder="e.g., native_name_gender_prediction")
-            description = st.text_area("Description", placeholder="Brief description of the experiment")
+            exp_name = st.text_input(
+                "Experiment Name", placeholder="e.g., native_name_gender_prediction"
+            )
+            description = st.text_area(
+                "Description", placeholder="Brief description of the experiment"
+            )
             model_type = st.selectbox("Model Type", list_available_models())

             # Feature selection
             feature_options = [f.value for f in FeatureType]
-            selected_features = st.multiselect("Features to Use", feature_options, default=["full_name"])
+            selected_features = st.multiselect(
+                "Features to Use", feature_options, default=["full_name"]
+            )

         with col2:
             # Model parameters
@@ -74,7 +82,9 @@ class Experiments:
             test_size = st.slider("Test Set Size", 0.1, 0.5, 0.2)
             cv_folds = st.number_input("Cross-Validation Folds", 3, 10, 5)
-            tags = st.text_input("Tags (comma-separated)", placeholder="e.g., baseline, feature_study")
+            tags = st.text_input(
+                "Tags (comma-separated)", placeholder="e.g., baseline, feature_study"
+            )

         # Advanced options
         with st.expander("Advanced Options"):
@@ -92,14 +102,33 @@ class Experiments:
         if submitted:
             self._handle_experiment_submission(
-                exp_name, description, model_type, selected_features, model_params,
-                test_size, cv_folds, tags, filter_province, min_words, max_words
+                exp_name,
+                description,
+                model_type,
+                selected_features,
+                model_params,
+                test_size,
+                cv_folds,
+                tags,
+                filter_province,
+                min_words,
+                max_words,
             )

-    def _handle_experiment_submission(self, exp_name: str, description: str, model_type: str,
-                                      selected_features: List[str], model_params: Dict[str, Any],
-                                      test_size: float, cv_folds: int, tags: str,
-                                      filter_province: str, min_words: int, max_words: int):
+    def _handle_experiment_submission(
+        self,
+        exp_name: str,
+        description: str,
+        model_type: str,
+        selected_features: List[str],
+        model_params: Dict[str, Any],
+        test_size: float,
+        cv_folds: int,
+        tags: str,
+        filter_province: str,
+        min_words: int,
+        max_words: int,
+    ):
         """Handle experiment form submission"""
         if not exp_name:
             st.error("Please provide an experiment name")
@@ -183,7 +212,7 @@ class Experiments:
         # Display experiments
         for i, exp in enumerate(experiments):
             with st.expander(
-                    f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
+                f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
             ):
                 self._display_experiment_details(exp, i)
@@ -268,8 +297,15 @@ class Experiments:
             base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags
         )

-    def run_batch_experiments(self, base_name: str, model_types: List[str], ngram_ranges: str,
-                              feature_combinations: List[str], test_sizes: str, tags: str):
+    def run_batch_experiments(
+        self,
+        base_name: str,
+        model_types: List[str],
+        ngram_ranges: str,
+        feature_combinations: List[str],
+        test_sizes: str,
+        tags: str,
+    ):
         """Run batch experiments with parameter combinations"""
         with st.spinner("Running batch experiments..."):
             try:
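
Note that ngram_ranges, test_sizes, and tags arrive as comma-separated strings from the form; the parsing inside run_batch_experiments is not shown in this diff. A hedged sketch of what it plausibly does:

# Hypothetical parsing; the real implementation may differ.
test_sizes_input = "0.2, 0.3"
tags_input = "baseline, feature_study"

test_sizes = [float(s) for s in test_sizes_input.split(",")]
tags = [t.strip() for t in tags_input.split(",") if t.strip()]
print(test_sizes, tags)  # [0.2, 0.3] ['baseline', 'feature_study']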
+16 -19
@@ -8,6 +8,7 @@ from typing import List, Dict, Optional
 @dataclass
 class LogEntry:
     """Represents a single log entry."""
+
     timestamp: datetime
     logger: str
     level: str
@@ -23,7 +24,7 @@ class LogReader:
         self.log_file_path = Path(log_file_path)
         # Pattern to match Python logging format: timestamp - logger - level - message
         self.log_pattern = re.compile(
-            r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (.+?) - (\w+) - (.+)'
+            r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (.+?) - (\w+) - (.+)"
         )

     def read_last_entries(self, count: int = 10) -> List[LogEntry]:
@@ -32,12 +33,12 @@
             return []

         try:
-            with open(self.log_file_path, 'r', encoding='utf-8') as file:
+            with open(self.log_file_path, "r", encoding="utf-8") as file:
                 lines = file.readlines()

             # Parse log entries from the end
             entries = []
-            for line in reversed(lines[-count * 2:]):  # Read more lines in case some don't match
+            for line in reversed(lines[-count * 2 :]):  # Read more lines in case some don't match
                 entry = self._parse_log_line(line.strip())
                 if entry:
                     entries.append(entry)
@@ -57,7 +58,7 @@
             return []

         try:
-            with open(self.log_file_path, 'r', encoding='utf-8') as file:
+            with open(self.log_file_path, "r", encoding="utf-8") as file:
                 lines = file.readlines()

             entries = []
@@ -80,7 +81,7 @@
             return []

         try:
-            with open(self.log_file_path, 'r', encoding='utf-8') as file:
+            with open(self.log_file_path, "r", encoding="utf-8") as file:
                 lines = file.readlines()

             entries = []
@@ -107,16 +108,16 @@
             return {}

         try:
-            with open(self.log_file_path, 'r', encoding='utf-8') as file:
+            with open(self.log_file_path, "r", encoding="utf-8") as file:
                 lines = file.readlines()

             stats = {
-                'total_lines': len(lines),
-                'INFO': 0,
-                'WARNING': 0,
-                'ERROR': 0,
-                'DEBUG': 0,
-                'CRITICAL': 0
+                "total_lines": len(lines),
+                "INFO": 0,
+                "WARNING": 0,
+                "ERROR": 0,
+                "DEBUG": 0,
+                "CRITICAL": 0,
             }

             for line in lines:
@@ -143,14 +144,10 @@
         try:
             timestamp_str, logger, level, message = match.groups()
-            timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S,%f')
+            timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S,%f")
             return LogEntry(
-                timestamp=timestamp,
-                logger=logger,
-                level=level,
-                message=message,
-                raw_line=line
+                timestamp=timestamp, logger=logger, level=level, message=message, raw_line=line
             )
         except ValueError:
             return None
@@ -168,7 +165,7 @@ class MultiLogReader:
         if not self.log_directory.exists():
             return []

-        return list(self.log_directory.glob('*.log'))
+        return list(self.log_directory.glob("*.log"))

     def read_from_all_files(self, count: int = 10) -> List[LogEntry]:
         """Read entries from all log files and merge them chronologically."""
+19 -12
@@ -9,6 +9,7 @@ import plotly.express as px
 import streamlit as st
 from core.utils import get_data_file_path
+from core.utils.data_loader import OPTIMIZED_DTYPES
 from research.experiment.experiment_runner import ExperimentRunner
 from research.experiment.experiment_tracker import ExperimentTracker
@@ -16,7 +17,9 @@ from research.experiment.experiment_tracker import ExperimentTracker
 class Predictions:
     """Handles prediction interface"""

-    def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
+    def __init__(
+        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
+    ):
         self.config = config
         self.experiment_tracker = experiment_tracker
         self.experiment_runner = experiment_runner
@@ -86,7 +89,9 @@ class Predictions:
                     confidence = self._get_prediction_confidence(model, input_df)

                     # Display results
-                    self._display_single_prediction_results(prediction, confidence, experiment, name_input)
+                    self._display_single_prediction_results(
+                        prediction, confidence, experiment, name_input
+                    )

                 except Exception as e:
                     st.error(f"Error making prediction: {e}")
@@ -114,8 +119,9 @@ class Predictions:
         except:
             return None

-    def _display_single_prediction_results(self, prediction: str, confidence: Optional[float],
-                                           experiment, name_input: str):
+    def _display_single_prediction_results(
+        self, prediction: str, confidence: Optional[float], experiment, name_input: str
+    ):
         """Display single prediction results"""
         col1, col2 = st.columns(2)
@@ -129,9 +135,7 @@ class Predictions:
         # Additional info
         st.info(f"Model used: {experiment.config.name}")
-        st.info(
-            f"Features used: {', '.join([f.value for f in experiment.config.features])}"
-        )
+        st.info(f"Features used: {', '.join([f.value for f in experiment.config.features])}")

     def show_batch_prediction(self, experiment):
         """Show batch prediction interface"""
@@ -141,7 +145,7 @@ class Predictions:
         if uploaded_file is not None:
             try:
-                df = pd.read_csv(uploaded_file)
+                df = pd.read_csv(uploaded_file, dtype=OPTIMIZED_DTYPES)

                 st.write("**Uploaded Data Preview:**")
                 st.dataframe(df.head(), use_container_width=True)
@@ -296,13 +300,14 @@ class Predictions:
     def _load_dataset(self, file_path: str) -> pd.DataFrame:
         """Load dataset with error handling"""
         try:
-            return pd.read_csv(file_path)
+            return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
         except Exception as e:
             st.error(f"Error loading dataset: {e}")
             return pd.DataFrame()

-    def _run_dataset_prediction(self, df: pd.DataFrame, experiment, sample_size: int,
-                                compare_with_actual: bool):
+    def _run_dataset_prediction(
+        self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
+    ):
         """Run dataset prediction and display results"""
         with st.spinner("Running predictions..."):
             # Sample data if requested
@@ -353,7 +358,9 @@ class Predictions:
             with col2:
                 st.write("**Sample Incorrect Predictions**")
-                incorrect_sample = df_sample[~correct_mask][["name", "sex", "predicted_gender"]].head(10)
+                incorrect_sample = df_sample[~correct_mask][["name", "sex", "predicted_gender"]].head(
+                    10
+                )
                 st.dataframe(incorrect_sample, use_container_width=True)

     def _display_dataset_predictions(self, df_sample: pd.DataFrame):
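
correct_mask is defined outside this hunk; it is presumably the equality of the actual and predicted columns. A tiny hedged sketch of that comparison, with invented illustrative rows:

import pandas as pd

df_sample = pd.DataFrame(
    {"name": ["A. Example", "B. Example"], "sex": ["F", "M"], "predicted_gender": ["F", "F"]}
)
correct_mask = df_sample["sex"] == df_sample["predicted_gender"]  # assumed definition
incorrect_sample = df_sample[~correct_mask][["name", "sex", "predicted_gender"]].head(10)
print(incorrect_sample)  # rows where prediction and label disagree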
+6 -2
@@ -13,7 +13,9 @@ from research.experiment.experiment_tracker import ExperimentTracker
 class ResultsAnalysis:
     """Handles experiment results and analysis interface"""

-    def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
+    def __init__(
+        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
+    ):
         self.config = config
         self.experiment_tracker = experiment_tracker
         self.experiment_runner = experiment_runner
@@ -21,7 +23,9 @@ class ResultsAnalysis:
     def index(self):
         """Main results analysis page"""
         st.header("Results & Analysis")

-        tab1, tab2, tab3 = st.tabs(["Experiment Comparison", "Performance Analysis", "Model Analysis"])
+        tab1, tab2, tab3 = st.tabs(
+            ["Experiment Comparison", "Performance Analysis", "Model Analysis"]
+        )

         with tab1:
             self.show_experiment_comparison()