refactoring: add initial pipeline configuration and model classes

2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
@@ -0,0 +1,76 @@
import pandas as pd
import streamlit as st
from core.utils import get_data_file_path
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class Dashboard:
def __init__(self, config, experiment_tracker, experiment_runner):
self.config = config
self.experiment_tracker = experiment_tracker
self.experiment_runner = experiment_runner
def index(self):
st.header("Dashboard")
col1, col2, col3, col4 = st.columns(4)
# Load basic statistics
try:
data_path = get_data_file_path(self.config.data.output_files["featured"], self.config)
if data_path.exists():
df = load_dataset(str(data_path))
with col1:
st.metric("Total Names", f"{len(df):,}")
with col2:
                    # Count annotated rows; guard against a missing column
                    annotated = (df["annotated"] == 1).sum() if "annotated" in df.columns else 0
st.metric("Annotated Names", f"{annotated:,}")
with col3:
provinces = df["province"].nunique() if "province" in df.columns else 0
st.metric("Provinces", provinces)
with col4:
if "sex" in df.columns:
gender_dist = df["sex"].value_counts()
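                        # max(..., 1) guards against division by zero when no male names exist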
ratio = gender_dist.get("f", 0) / max(gender_dist.get("m", 1), 1)
st.metric("F/M Ratio", f"{ratio:.2f}")
else:
st.warning("No processed data found. Please run data processing first.")
except Exception as e:
st.error(f"Error loading dashboard data: {e}")
# Recent experiments
st.subheader("Recent Experiments")
experiments = self.experiment_tracker.list_experiments()[:5]
if experiments:
exp_data = []
for exp in experiments:
exp_data.append(
{
"Name": exp.config.name,
"Model": exp.config.model_type,
"Status": exp.status.value,
"Accuracy": (
f"{exp.test_metrics.get('accuracy', 0):.3f}"
if exp.test_metrics
else "N/A"
),
"Date": exp.start_time.strftime("%Y-%m-%d %H:%M"),
}
)
st.dataframe(pd.DataFrame(exp_data), use_container_width=True)
else:
st.info("No experiments found. Create your first experiment in the Experiments tab!")
@@ -0,0 +1,154 @@
from datetime import datetime
import pandas as pd
import plotly.express as px
import streamlit as st
from core.utils import get_data_file_path
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class DataOverview:
def __init__(self, config):
self.config = config
def index(self):
st.header("Data Overview")
data_files = {
"Names": self.config.data.input_file,
"Featured Dataset": self.config.data.output_files["featured"],
"Evaluation Dataset": self.config.data.output_files["evaluation"],
"Male Names": self.config.data.output_files["males"],
"Female Names": self.config.data.output_files["females"],
}
selected_file = st.selectbox("Select Dataset", list(data_files.keys()))
file_path = get_data_file_path(data_files[selected_file], self.config)
if not file_path.exists():
st.warning(f"Dataset not found: {file_path}")
st.warning("Please run data processing first to generate datasets.")
return
# Load and display data
df = load_dataset(str(file_path))
if df.empty:
st.error("Failed to load dataset")
return
# Basic statistics
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Records", f"{len(df):,}")
with col2:
if "annotated" in df.columns:
annotated_pct = (df["annotated"] == 1).mean() * 100
st.metric("Annotated", f"{annotated_pct:.1f}%")
with col3:
if "words" in df.columns:
avg_words = df["words"].mean()
st.metric("Avg Words", f"{avg_words:.1f}")
with col4:
if "length" in df.columns:
avg_length = df["length"].mean()
st.metric("Avg Length", f"{avg_length:.0f}")
# Data quality analysis
st.subheader("Data Quality Analysis")
col1, col2 = st.columns(2)
with col1:
# Missing values
missing_data = df.isnull().sum()
if missing_data.sum() > 0:
fig = px.bar(
x=missing_data.index, y=missing_data.values, title="Missing Values by Column"
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
else:
st.success("No missing values found")
with col2:
# Gender distribution
if "sex" in df.columns:
gender_counts = df["sex"].value_counts()
fig = px.pie(
values=gender_counts.values,
names=gender_counts.index,
title="Gender Distribution",
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
# Word count distribution
if "words" in df.columns:
st.subheader("Name Structure Analysis")
col1, col2 = st.columns(2)
with col1:
word_dist = df["words"].value_counts().sort_index()
fig = px.bar(
x=word_dist.index,
y=word_dist.values,
title="Distribution of Word Count in Names",
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
with col2:
# Province distribution
if "province" in df.columns:
province_counts = df["province"].value_counts().head(10)
fig = px.bar(
x=province_counts.values,
y=province_counts.index,
orientation="h",
title="Top 10 Provinces by Name Count",
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
# Sample data
st.subheader("Sample Data")
# Display columns selector
if not df.empty:
columns_to_show = st.multiselect(
"Select columns to display",
df.columns.tolist(),
default=(
["name", "sex", "province", "words"]
if all(col in df.columns for col in ["name", "sex", "province", "words"])
else df.columns[:5].tolist()
),
)
if columns_to_show:
                max_rows = max(10, min(1000, len(df)))
                sample_size = st.slider("Number of rows to display", 10, max_rows, min(50, max_rows))
st.dataframe(df[columns_to_show].head(sample_size), use_container_width=True)
# Data export
st.subheader("Export Data")
if st.button("Download as CSV"):
csv = df.to_csv(index=False)
st.download_button(
label="Download CSV",
data=csv,
file_name=f"{selected_file.lower().replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.csv",
mime="text/csv",
)
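Both pages above resolve paths through `config.data.input_file` and `config.data.output_files[...]`. The config classes are not in these hunks; a rough sketch of the shape those accesses imply (all names and defaults are inferred, not taken from this commit):

from dataclasses import dataclass, field
from typing import Dict

# Inferred config shape; the real classes live elsewhere in this commit
@dataclass
class DataConfig:
    input_file: str = "names.csv"  # assumed default
    output_files: Dict[str, str] = field(default_factory=lambda: {
        "featured": "featured.csv",
        "evaluation": "evaluation.csv",
        "males": "males.csv",
        "females": "females.csv",
    })

@dataclass
class AppConfig:
    data: DataConfig = field(default_factory=DataConfig)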
@@ -0,0 +1,127 @@
import pandas as pd
import plotly.express as px
import streamlit as st
from web.log_reader import LogReader
def load_dataset(file_path: str) -> pd.DataFrame:
try:
return pd.read_csv(file_path)
except Exception as e:
st.error(f"Error loading dataset: {e}")
return pd.DataFrame()
class DataProcessing:
def __init__(self, config, pipeline_monitor):
self.config = config
self.pipeline_monitor = pipeline_monitor
def index(self):
st.header("Data Processing Pipeline")
status = self.pipeline_monitor.get_pipeline_status()
# Overall progress
overall_progress = status["overall_completion"] / 100
st.progress(overall_progress)
st.write(f"Overall Progress: {status['overall_completion']:.1f}%")
# Step details
for step_name, step_status in status["steps"].items():
with st.expander(f"{step_name.replace('_', ' ').title()} - {step_status['status']}"):
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Processed Batches", step_status["processed_batches"])
with col2:
st.metric("Total Batches", step_status["total_batches"])
with col3:
st.metric("Failed Batches", step_status["failed_batches"])
if step_status["completion_percentage"] > 0:
st.progress(step_status["completion_percentage"] / 100)
# Read actual log entries from the log file
st.subheader("Recent Processing Logs")
try:
log_file_path = self.config.paths.logs_dir / "pipeline.development.log"
log_reader = LogReader(log_file_path)
# Options for filtering logs
col1, col2 = st.columns(2)
with col1:
log_level_filter = st.selectbox(
"Filter by Level",
["All", "INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"],
key="log_level_filter"
)
with col2:
num_entries = st.number_input(
"Number of entries",
min_value=5,
max_value=50,
value=10,
key="num_log_entries"
)
# Get log entries based on filter
if log_level_filter == "All":
log_entries = log_reader.read_last_entries(num_entries)
else:
log_entries = log_reader.read_entries_by_level(log_level_filter, num_entries)
if log_entries:
for entry in log_entries:
if entry.level == "ERROR":
st.error(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
elif entry.level == "WARNING":
st.warning(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
elif entry.level == "INFO":
st.info(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
else:
st.text(f"[{entry.timestamp.strftime('%Y-%m-%d %H:%M:%S')}] {entry.level}: {entry.message}")
# Show log statistics
st.subheader("Log Statistics")
log_stats = log_reader.get_log_stats()
if log_stats:
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Lines", log_stats.get('total_lines', 0))
with col2:
st.metric("INFO", log_stats.get('INFO', 0))
with col3:
st.metric("WARNING", log_stats.get('WARNING', 0))
with col4:
st.metric("ERROR", log_stats.get('ERROR', 0))
# Log level distribution chart
levels = ['INFO', 'WARNING', 'ERROR', 'DEBUG', 'CRITICAL']
counts = [log_stats.get(level, 0) for level in levels]
if sum(counts) > 0:
fig = px.bar(
x=levels,
y=counts,
title="Log Entries by Level",
color=levels,
color_discrete_map={
'INFO': 'blue',
'WARNING': 'orange',
'ERROR': 'red',
'DEBUG': 'gray',
'CRITICAL': 'darkred'
}
)
st.plotly_chart(fig, use_container_width=True)
else:
st.info("No log entries found or log file is empty.")
except Exception as e:
st.error(f"Error reading log file: {e}")
@@ -0,0 +1,185 @@
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
@dataclass
class LogEntry:
"""Represents a single log entry."""
timestamp: datetime
logger: str
level: str
message: str
raw_line: str
class LogReader:
"""Utility class for reading and parsing log files."""
def __init__(self, log_file_path: Path):
"""Initialize the log reader with a log file path."""
self.log_file_path = Path(log_file_path)
# Pattern to match Python logging format: timestamp - logger - level - message
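        # e.g. "2025-08-04 16:12:25,123 - core.pipeline - INFO - Processing batch 3/8"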
self.log_pattern = re.compile(
r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (.+?) - (\w+) - (.+)'
)
def read_last_entries(self, count: int = 10) -> List[LogEntry]:
"""Read the last N entries from the log file."""
if not self.log_file_path.exists():
return []
try:
with open(self.log_file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
# Parse log entries from the end
entries = []
for line in reversed(lines[-count*2:]): # Read more lines in case some don't match
entry = self._parse_log_line(line.strip())
if entry:
entries.append(entry)
if len(entries) >= count:
break
# Return entries in chronological order (oldest first of the last N)
return list(reversed(entries))
except Exception as e:
print(f"Error reading log file: {e}")
return []
def read_entries_by_level(self, level: str, count: int = 50) -> List[LogEntry]:
"""Read entries filtered by log level."""
if not self.log_file_path.exists():
return []
try:
with open(self.log_file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
entries = []
for line in reversed(lines):
entry = self._parse_log_line(line.strip())
if entry and entry.level.upper() == level.upper():
entries.append(entry)
if len(entries) >= count:
break
return list(reversed(entries))
except Exception as e:
print(f"Error reading log file: {e}")
return []
def read_entries_since(self, since: datetime, count: int = 100) -> List[LogEntry]:
"""Read entries since a specific datetime."""
if not self.log_file_path.exists():
return []
try:
with open(self.log_file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
entries = []
for line in reversed(lines):
entry = self._parse_log_line(line.strip())
if entry:
if entry.timestamp >= since:
entries.append(entry)
else:
# Stop reading if we've gone past the since time
break
if len(entries) >= count:
break
return list(reversed(entries))
except Exception as e:
print(f"Error reading log file: {e}")
return []
def get_log_stats(self) -> Dict[str, int]:
"""Get statistics about the log file."""
if not self.log_file_path.exists():
return {}
try:
with open(self.log_file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
stats = {
'total_lines': len(lines),
'INFO': 0,
'WARNING': 0,
'ERROR': 0,
'DEBUG': 0,
'CRITICAL': 0
}
for line in lines:
entry = self._parse_log_line(line.strip())
if entry:
level = entry.level.upper()
if level in stats:
stats[level] += 1
return stats
except Exception as e:
print(f"Error reading log file: {e}")
return {}
def _parse_log_line(self, line: str) -> Optional[LogEntry]:
"""Parse a single log line into a LogEntry object."""
if not line:
return None
match = self.log_pattern.match(line)
if not match:
return None
try:
timestamp_str, logger, level, message = match.groups()
timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S,%f')
return LogEntry(
timestamp=timestamp,
logger=logger,
level=level,
message=message,
raw_line=line
)
except ValueError:
return None
class MultiLogReader:
"""Reader for multiple log files."""
def __init__(self, log_directory: Path):
"""Initialize with a directory containing log files."""
self.log_directory = Path(log_directory)
def get_available_log_files(self) -> List[Path]:
"""Get list of available log files."""
if not self.log_directory.exists():
return []
return list(self.log_directory.glob('*.log'))
def read_from_all_files(self, count: int = 10) -> List[LogEntry]:
"""Read entries from all log files and merge them chronologically."""
all_entries = []
for log_file in self.get_available_log_files():
reader = LogReader(log_file)
entries = reader.read_last_entries(count)
all_entries.extend(entries)
# Sort by timestamp
all_entries.sort(key=lambda x: x.timestamp, reverse=True)
return all_entries[:count]
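A short usage sketch for the readers above; the log path mirrors the one used by the DataProcessing page, and the print calls are illustrative:

from pathlib import Path

# Example usage (assumes logs/pipeline.development.log exists)
reader = LogReader(Path("logs/pipeline.development.log"))
for entry in reader.read_entries_by_level("ERROR", count=5):
    print(entry.timestamp.isoformat(), entry.message)

# Merge the most recent entries across every *.log file in the directory
multi = MultiLogReader(Path("logs"))
for entry in multi.read_from_all_files(count=10):
    print(entry.logger, entry.level, entry.message)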