feat: enhance logging and memory management across modules
This commit is contained in:
@@ -2,11 +2,12 @@ import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
from core.utils import get_data_file_path
|
||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||
|
||||
|
||||
def load_dataset(file_path: str) -> pd.DataFrame:
|
||||
try:
|
||||
return pd.read_csv(file_path)
|
||||
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
||||
except Exception as e:
|
||||
st.error(f"Error loading dataset: {e}")
|
||||
return pd.DataFrame()
|
||||
|
||||
@@ -5,11 +5,12 @@ import plotly.express as px
|
||||
import streamlit as st
|
||||
|
||||
from core.utils import get_data_file_path
|
||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||
|
||||
|
||||
def load_dataset(file_path: str) -> pd.DataFrame:
|
||||
try:
|
||||
return pd.read_csv(file_path)
|
||||
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
||||
except Exception as e:
|
||||
st.error(f"Error loading dataset: {e}")
|
||||
return pd.DataFrame()
|
||||
|
||||
@@ -2,12 +2,13 @@ import pandas as pd
|
||||
import plotly.express as px
|
||||
import streamlit as st
|
||||
|
||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||
from interface.log_reader import LogReader
|
||||
|
||||
|
||||
def load_dataset(file_path: str) -> pd.DataFrame:
|
||||
try:
|
||||
return pd.read_csv(file_path)
|
||||
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
||||
except Exception as e:
|
||||
st.error(f"Error loading dataset: {e}")
|
||||
return pd.DataFrame()
|
||||
@@ -56,16 +57,12 @@ class DataProcessing:
|
||||
log_level_filter = st.selectbox(
|
||||
"Filter by Level",
|
||||
["All", "INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"],
|
||||
key="log_level_filter"
|
||||
key="log_level_filter",
|
||||
)
|
||||
|
||||
with col2:
|
||||
num_entries = st.number_input(
|
||||
"Number of entries",
|
||||
min_value=5,
|
||||
max_value=50,
|
||||
value=10,
|
||||
key="num_log_entries"
|
||||
"Number of entries", min_value=5, max_value=50, value=10, key="num_log_entries"
|
||||
)
|
||||
|
||||
# Get log entries based on filter
|
||||
@@ -77,13 +74,21 @@ class DataProcessing:
|
||||
if log_entries:
|
||||
for entry in log_entries:
|
||||
if entry.level == "ERROR":
|
||||
st.error(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
|
||||
st.error(
|
||||
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
|
||||
)
|
||||
elif entry.level == "WARNING":
|
||||
st.warning(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
|
||||
st.warning(
|
||||
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
|
||||
)
|
||||
elif entry.level == "INFO":
|
||||
st.info(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
|
||||
st.info(
|
||||
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
|
||||
)
|
||||
else:
|
||||
st.text(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
|
||||
st.text(
|
||||
f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}"
|
||||
)
|
||||
|
||||
# Show log statistics
|
||||
st.subheader("Log Statistics")
|
||||
@@ -93,16 +98,16 @@ class DataProcessing:
|
||||
col1, col2, col3, col4 = st.columns(4)
|
||||
|
||||
with col1:
|
||||
st.metric("Total Lines", log_stats.get('total_lines', 0))
|
||||
st.metric("Total Lines", log_stats.get("total_lines", 0))
|
||||
with col2:
|
||||
st.metric("INFO", log_stats.get('INFO', 0))
|
||||
st.metric("INFO", log_stats.get("INFO", 0))
|
||||
with col3:
|
||||
st.metric("WARNING", log_stats.get('WARNING', 0))
|
||||
st.metric("WARNING", log_stats.get("WARNING", 0))
|
||||
with col4:
|
||||
st.metric("ERROR", log_stats.get('ERROR', 0))
|
||||
st.metric("ERROR", log_stats.get("ERROR", 0))
|
||||
|
||||
# Log level distribution chart
|
||||
levels = ['INFO', 'WARNING', 'ERROR', 'DEBUG', 'CRITICAL']
|
||||
levels = ["INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"]
|
||||
counts = [log_stats.get(level, 0) for level in levels]
|
||||
|
||||
if sum(counts) > 0:
|
||||
@@ -112,12 +117,12 @@ class DataProcessing:
|
||||
title="Log Entries by Level",
|
||||
color=levels,
|
||||
color_discrete_map={
|
||||
'INFO': 'blue',
|
||||
'WARNING': 'orange',
|
||||
'ERROR': 'red',
|
||||
'DEBUG': 'gray',
|
||||
'CRITICAL': 'darkred'
|
||||
}
|
||||
"INFO": "blue",
|
||||
"WARNING": "orange",
|
||||
"ERROR": "red",
|
||||
"DEBUG": "gray",
|
||||
"CRITICAL": "darkred",
|
||||
},
|
||||
)
|
||||
st.plotly_chart(fig, use_container_width=True)
|
||||
else:
|
||||
|
||||
+50
-14
@@ -14,7 +14,9 @@ from research.model_registry import list_available_models
|
||||
class Experiments:
|
||||
"""Handles experiment management interface"""
|
||||
|
||||
def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
|
||||
def __init__(
|
||||
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
||||
):
|
||||
self.config = config
|
||||
self.experiment_tracker = experiment_tracker
|
||||
self.experiment_runner = experiment_runner
|
||||
@@ -41,13 +43,19 @@ class Experiments:
|
||||
col1, col2 = st.columns(2)
|
||||
|
||||
with col1:
|
||||
exp_name = st.text_input("Experiment Name", placeholder="e.g., native_name_gender_prediction")
|
||||
description = st.text_area("Description", placeholder="Brief description of the experiment")
|
||||
exp_name = st.text_input(
|
||||
"Experiment Name", placeholder="e.g., native_name_gender_prediction"
|
||||
)
|
||||
description = st.text_area(
|
||||
"Description", placeholder="Brief description of the experiment"
|
||||
)
|
||||
model_type = st.selectbox("Model Type", list_available_models())
|
||||
|
||||
# Feature selection
|
||||
feature_options = [f.value for f in FeatureType]
|
||||
selected_features = st.multiselect("Features to Use", feature_options, default=["full_name"])
|
||||
selected_features = st.multiselect(
|
||||
"Features to Use", feature_options, default=["full_name"]
|
||||
)
|
||||
|
||||
with col2:
|
||||
# Model parameters
|
||||
@@ -74,7 +82,9 @@ class Experiments:
|
||||
test_size = st.slider("Test Set Size", 0.1, 0.5, 0.2)
|
||||
cv_folds = st.number_input("Cross-Validation Folds", 3, 10, 5)
|
||||
|
||||
tags = st.text_input("Tags (comma-separated)", placeholder="e.g., baseline, feature_study")
|
||||
tags = st.text_input(
|
||||
"Tags (comma-separated)", placeholder="e.g., baseline, feature_study"
|
||||
)
|
||||
|
||||
# Advanced options
|
||||
with st.expander("Advanced Options"):
|
||||
@@ -92,14 +102,33 @@ class Experiments:
|
||||
|
||||
if submitted:
|
||||
self._handle_experiment_submission(
|
||||
exp_name, description, model_type, selected_features, model_params,
|
||||
test_size, cv_folds, tags, filter_province, min_words, max_words
|
||||
exp_name,
|
||||
description,
|
||||
model_type,
|
||||
selected_features,
|
||||
model_params,
|
||||
test_size,
|
||||
cv_folds,
|
||||
tags,
|
||||
filter_province,
|
||||
min_words,
|
||||
max_words,
|
||||
)
|
||||
|
||||
def _handle_experiment_submission(self, exp_name: str, description: str, model_type: str,
|
||||
selected_features: List[str], model_params: Dict[str, Any],
|
||||
test_size: float, cv_folds: int, tags: str,
|
||||
filter_province: str, min_words: int, max_words: int):
|
||||
def _handle_experiment_submission(
|
||||
self,
|
||||
exp_name: str,
|
||||
description: str,
|
||||
model_type: str,
|
||||
selected_features: List[str],
|
||||
model_params: Dict[str, Any],
|
||||
test_size: float,
|
||||
cv_folds: int,
|
||||
tags: str,
|
||||
filter_province: str,
|
||||
min_words: int,
|
||||
max_words: int,
|
||||
):
|
||||
"""Handle experiment form submission"""
|
||||
if not exp_name:
|
||||
st.error("Please provide an experiment name")
|
||||
@@ -183,7 +212,7 @@ class Experiments:
|
||||
# Display experiments
|
||||
for i, exp in enumerate(experiments):
|
||||
with st.expander(
|
||||
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
|
||||
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
|
||||
):
|
||||
self._display_experiment_details(exp, i)
|
||||
|
||||
@@ -268,8 +297,15 @@ class Experiments:
|
||||
base_name, model_types, ngram_ranges, feature_combinations, test_sizes, tags
|
||||
)
|
||||
|
||||
def run_batch_experiments(self, base_name: str, model_types: List[str], ngram_ranges: str,
|
||||
feature_combinations: List[str], test_sizes: str, tags: str):
|
||||
def run_batch_experiments(
|
||||
self,
|
||||
base_name: str,
|
||||
model_types: List[str],
|
||||
ngram_ranges: str,
|
||||
feature_combinations: List[str],
|
||||
test_sizes: str,
|
||||
tags: str,
|
||||
):
|
||||
"""Run batch experiments with parameter combinations"""
|
||||
with st.spinner("Running batch experiments..."):
|
||||
try:
|
||||
|
||||
+16
-19
@@ -8,6 +8,7 @@ from typing import List, Dict, Optional
|
||||
@dataclass
|
||||
class LogEntry:
|
||||
"""Represents a single log entry."""
|
||||
|
||||
timestamp: datetime
|
||||
logger: str
|
||||
level: str
|
||||
@@ -23,7 +24,7 @@ class LogReader:
|
||||
self.log_file_path = Path(log_file_path)
|
||||
# Pattern to match Python logging format: timestamp - logger - level - message
|
||||
self.log_pattern = re.compile(
|
||||
r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (.+?) - (\w+) - (.+)'
|
||||
r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (.+?) - (\w+) - (.+)"
|
||||
)
|
||||
|
||||
def read_last_entries(self, count: int = 10) -> List[LogEntry]:
|
||||
@@ -32,12 +33,12 @@ class LogReader:
|
||||
return []
|
||||
|
||||
try:
|
||||
with open(self.log_file_path, 'r', encoding='utf-8') as file:
|
||||
with open(self.log_file_path, "r", encoding="utf-8") as file:
|
||||
lines = file.readlines()
|
||||
|
||||
# Parse log entries from the end
|
||||
entries = []
|
||||
for line in reversed(lines[-count * 2:]): # Read more lines in case some don't match
|
||||
for line in reversed(lines[-count * 2 :]): # Read more lines in case some don't match
|
||||
entry = self._parse_log_line(line.strip())
|
||||
if entry:
|
||||
entries.append(entry)
|
||||
@@ -57,7 +58,7 @@ class LogReader:
|
||||
return []
|
||||
|
||||
try:
|
||||
with open(self.log_file_path, 'r', encoding='utf-8') as file:
|
||||
with open(self.log_file_path, "r", encoding="utf-8") as file:
|
||||
lines = file.readlines()
|
||||
|
||||
entries = []
|
||||
@@ -80,7 +81,7 @@ class LogReader:
|
||||
return []
|
||||
|
||||
try:
|
||||
with open(self.log_file_path, 'r', encoding='utf-8') as file:
|
||||
with open(self.log_file_path, "r", encoding="utf-8") as file:
|
||||
lines = file.readlines()
|
||||
|
||||
entries = []
|
||||
@@ -107,16 +108,16 @@ class LogReader:
|
||||
return {}
|
||||
|
||||
try:
|
||||
with open(self.log_file_path, 'r', encoding='utf-8') as file:
|
||||
with open(self.log_file_path, "r", encoding="utf-8") as file:
|
||||
lines = file.readlines()
|
||||
|
||||
stats = {
|
||||
'total_lines': len(lines),
|
||||
'INFO': 0,
|
||||
'WARNING': 0,
|
||||
'ERROR': 0,
|
||||
'DEBUG': 0,
|
||||
'CRITICAL': 0
|
||||
"total_lines": len(lines),
|
||||
"INFO": 0,
|
||||
"WARNING": 0,
|
||||
"ERROR": 0,
|
||||
"DEBUG": 0,
|
||||
"CRITICAL": 0,
|
||||
}
|
||||
|
||||
for line in lines:
|
||||
@@ -143,14 +144,10 @@ class LogReader:
|
||||
|
||||
try:
|
||||
timestamp_str, logger, level, message = match.groups()
|
||||
timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S,%f')
|
||||
timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S,%f")
|
||||
|
||||
return LogEntry(
|
||||
timestamp=timestamp,
|
||||
logger=logger,
|
||||
level=level,
|
||||
message=message,
|
||||
raw_line=line
|
||||
timestamp=timestamp, logger=logger, level=level, message=message, raw_line=line
|
||||
)
|
||||
except ValueError:
|
||||
return None
|
||||
@@ -168,7 +165,7 @@ class MultiLogReader:
|
||||
if not self.log_directory.exists():
|
||||
return []
|
||||
|
||||
return list(self.log_directory.glob('*.log'))
|
||||
return list(self.log_directory.glob("*.log"))
|
||||
|
||||
def read_from_all_files(self, count: int = 10) -> List[LogEntry]:
|
||||
"""Read entries from all log files and merge them chronologically."""
|
||||
|
||||
+19
-12
@@ -9,6 +9,7 @@ import plotly.express as px
|
||||
import streamlit as st
|
||||
|
||||
from core.utils import get_data_file_path
|
||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||
from research.experiment.experiment_runner import ExperimentRunner
|
||||
from research.experiment.experiment_tracker import ExperimentTracker
|
||||
|
||||
@@ -16,7 +17,9 @@ from research.experiment.experiment_tracker import ExperimentTracker
|
||||
class Predictions:
|
||||
"""Handles prediction interface"""
|
||||
|
||||
def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
|
||||
def __init__(
|
||||
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
||||
):
|
||||
self.config = config
|
||||
self.experiment_tracker = experiment_tracker
|
||||
self.experiment_runner = experiment_runner
|
||||
@@ -86,7 +89,9 @@ class Predictions:
|
||||
confidence = self._get_prediction_confidence(model, input_df)
|
||||
|
||||
# Display results
|
||||
self._display_single_prediction_results(prediction, confidence, experiment, name_input)
|
||||
self._display_single_prediction_results(
|
||||
prediction, confidence, experiment, name_input
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"Error making prediction: {e}")
|
||||
@@ -114,8 +119,9 @@ class Predictions:
|
||||
except:
|
||||
return None
|
||||
|
||||
def _display_single_prediction_results(self, prediction: str, confidence: Optional[float],
|
||||
experiment, name_input: str):
|
||||
def _display_single_prediction_results(
|
||||
self, prediction: str, confidence: Optional[float], experiment, name_input: str
|
||||
):
|
||||
"""Display single prediction results"""
|
||||
col1, col2 = st.columns(2)
|
||||
|
||||
@@ -129,9 +135,7 @@ class Predictions:
|
||||
|
||||
# Additional info
|
||||
st.info(f"Model used: {experiment.config.name}")
|
||||
st.info(
|
||||
f"Features used: {', '.join([f.value for f in experiment.config.features])}"
|
||||
)
|
||||
st.info(f"Features used: {', '.join([f.value for f in experiment.config.features])}")
|
||||
|
||||
def show_batch_prediction(self, experiment):
|
||||
"""Show batch prediction interface"""
|
||||
@@ -141,7 +145,7 @@ class Predictions:
|
||||
|
||||
if uploaded_file is not None:
|
||||
try:
|
||||
df = pd.read_csv(uploaded_file)
|
||||
df = pd.read_csv(uploaded_file, dtype=OPTIMIZED_DTYPES)
|
||||
|
||||
st.write("**Uploaded Data Preview:**")
|
||||
st.dataframe(df.head(), use_container_width=True)
|
||||
@@ -296,13 +300,14 @@ class Predictions:
|
||||
def _load_dataset(self, file_path: str) -> pd.DataFrame:
|
||||
"""Load dataset with error handling"""
|
||||
try:
|
||||
return pd.read_csv(file_path)
|
||||
return pd.read_csv(file_path, dtype=OPTIMIZED_DTYPES)
|
||||
except Exception as e:
|
||||
st.error(f"Error loading dataset: {e}")
|
||||
return pd.DataFrame()
|
||||
|
||||
def _run_dataset_prediction(self, df: pd.DataFrame, experiment, sample_size: int,
|
||||
compare_with_actual: bool):
|
||||
def _run_dataset_prediction(
|
||||
self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
|
||||
):
|
||||
"""Run dataset prediction and display results"""
|
||||
with st.spinner("Running predictions..."):
|
||||
# Sample data if requested
|
||||
@@ -353,7 +358,9 @@ class Predictions:
|
||||
|
||||
with col2:
|
||||
st.write("**Sample Incorrect Predictions**")
|
||||
incorrect_sample = df_sample[~correct_mask][["name", "sex", "predicted_gender"]].head(10)
|
||||
incorrect_sample = df_sample[~correct_mask][["name", "sex", "predicted_gender"]].head(
|
||||
10
|
||||
)
|
||||
st.dataframe(incorrect_sample, use_container_width=True)
|
||||
|
||||
def _display_dataset_predictions(self, df_sample: pd.DataFrame):
|
||||
|
||||
@@ -13,7 +13,9 @@ from research.experiment.experiment_tracker import ExperimentTracker
|
||||
class ResultsAnalysis:
|
||||
"""Handles experiment results and analysis interface"""
|
||||
|
||||
def __init__(self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner):
|
||||
def __init__(
|
||||
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
||||
):
|
||||
self.config = config
|
||||
self.experiment_tracker = experiment_tracker
|
||||
self.experiment_runner = experiment_runner
|
||||
@@ -21,7 +23,9 @@ class ResultsAnalysis:
|
||||
def index(self):
|
||||
"""Main results analysis page"""
|
||||
st.header("Results & Analysis")
|
||||
tab1, tab2, tab3 = st.tabs(["Experiment Comparison", "Performance Analysis", "Model Analysis"])
|
||||
tab1, tab2, tab3 = st.tabs(
|
||||
["Experiment Comparison", "Performance Analysis", "Model Analysis"]
|
||||
)
|
||||
|
||||
with tab1:
|
||||
self.show_experiment_comparison()
|
||||
|
||||
Reference in New Issue
Block a user