feat: enhance logging and memory management across modules
This commit is contained in:
@@ -1,52 +0,0 @@
|
||||
import logging
|
||||
from typing import Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class DatasetAnalyzer:
    """Analyze dataset statistics and quality.

    The dataset is read from a CSV file into ``self.df`` by ``load_data``;
    ``analyze_completion`` then summarises annotation progress from it.
    """

    def __init__(self, filepath: str):
        """Store the CSV path; nothing is read until ``load_data`` is called."""
        self.filepath = filepath
        # Filled in by load_data(); None means no dataset has been loaded yet.
        self.df = None

    def load_data(self) -> bool:
        """Load the dataset for analysis.

        Returns:
            True when the CSV at ``self.filepath`` was read into ``self.df``,
            False when reading failed. Failures are logged, never raised.
        """
        try:
            self.df = pd.read_csv(self.filepath)
        except Exception as e:
            # Deliberately broad: any read/parse failure is reported to the
            # caller through the boolean result instead of an exception.
            logging.error("Failed to load %s: %s", self.filepath, e)
            return False
        return True

    def analyze_completion(self) -> Dict:
        """Analyze annotation completion status.

        Returns:
            An empty dict when no dataset is loaded. Otherwise a dict with the
            keys ``total_rows``, ``annotated_rows``, ``unannotated_rows``,
            ``annotation_percentage``, ``complete_names`` and
            ``completeness_percentage``. All values are plain Python numbers
            (not NumPy scalars), so the result can be passed to ``json.dumps``.
        """
        if self.df is None:
            return {}

        total_rows = len(self.df)
        columns = self.df.columns

        # Check annotation status. Rows whose "annotated" value is neither 1
        # nor 0 (e.g. missing) are counted in neither bucket.
        if "annotated" in columns:
            annotated = self.df["annotated"]
            # .sum() yields a NumPy integer; convert so both branches agree.
            annotated_count = int((annotated == 1).sum())
            unannotated_count = int((annotated == 0).sum())
        else:
            annotated_count = 0
            unannotated_count = total_rows

        # Analyze name completeness: a row is complete when both the name and
        # the surname are non-null.
        complete_names = 0
        if "identified_name" in columns and "identified_surname" in columns:
            has_both = self.df["identified_name"].notna() & self.df["identified_surname"].notna()
            complete_names = int(has_both.sum())

        def percentage(count: int):
            # Guard against division by zero for an empty dataset.
            return (count / total_rows * 100) if total_rows > 0 else 0

        return {
            "total_rows": total_rows,
            "annotated_rows": annotated_count,
            "unannotated_rows": unannotated_count,
            "annotation_percentage": percentage(annotated_count),
            "complete_names": complete_names,
            "completeness_percentage": percentage(complete_names),
        }
|
||||
@@ -19,7 +19,13 @@ class PipelineMonitor:
|
||||
|
||||
self.paths = paths
|
||||
self.checkpoint_dir = paths.checkpoints_dir
|
||||
self.steps = ["data_cleaning", "feature_extraction", "ner_annotation", "llm_annotation", "data_splitting"]
|
||||
self.steps = [
|
||||
"data_cleaning",
|
||||
"feature_extraction",
|
||||
"ner_annotation",
|
||||
"llm_annotation",
|
||||
"data_splitting",
|
||||
]
|
||||
|
||||
def get_step_status(self, step_name: str) -> Dict:
|
||||
"""Get status of a specific pipeline step"""
|
||||
|
||||
Reference in New Issue
Block a user