feat: enhance logging and memory management across modules

2025-08-13 23:09:05 +02:00
parent 47e52d130c
commit 9601c5e44d
48 changed files with 1004 additions and 773 deletions
@@ -1,52 +0,0 @@
-import logging
-from typing import Dict
-
-import pandas as pd
-
-
-class DatasetAnalyzer:
-    """Analyze dataset statistics and quality"""
-
-    def __init__(self, filepath: str):
-        self.filepath = filepath
-        self.df = None
-
-    def load_data(self) -> bool:
-        """Load dataset for analysis"""
-        try:
-            self.df = pd.read_csv(self.filepath)
-            return True
-        except Exception as e:
-            logging.error(f"Failed to load {self.filepath}: {e}")
-            return False
-
-    def analyze_completion(self) -> Dict:
-        """Analyze annotation completion status"""
-        if self.df is None:
-            return {}
-
-        total_rows = len(self.df)
-
-        # Check annotation status
-        if "annotated" in self.df.columns:
-            annotated_count = (self.df["annotated"] == 1).sum()
-            unannotated_count = (self.df["annotated"] == 0).sum()
-        else:
-            annotated_count = 0
-            unannotated_count = total_rows
-
-        # Analyze name completeness
-        complete_names = 0
-        if "identified_name" in self.df.columns and "identified_surname" in self.df.columns:
-            complete_names = (
-                (self.df["identified_name"].notna()) & (self.df["identified_surname"].notna())
-            ).sum()
-
-        return {
-            "total_rows": total_rows,
-            "annotated_rows": annotated_count,
-            "unannotated_rows": unannotated_count,
-            "annotation_percentage": (annotated_count / total_rows * 100) if total_rows > 0 else 0,
-            "complete_names": complete_names,
-            "completeness_percentage": (complete_names / total_rows * 100) if total_rows > 0 else 0,
-        }
@@ -19,7 +19,13 @@ class PipelineMonitor:

        self.paths = paths
        self.checkpoint_dir = paths.checkpoints_dir
-        self.steps = ["data_cleaning", "feature_extraction", "ner_annotation", "llm_annotation", "data_splitting"]
+        self.steps = [
+            "data_cleaning",
+            "feature_extraction",
+            "ner_annotation",
+            "llm_annotation",
+            "data_splitting",
+        ]

    def get_step_status(self, step_name: str) -> Dict:
        """Get status of a specific pipeline step"""