refactoring: add initial pipeline configuration and model classes

2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
@@ -0,0 +1,80 @@
import logging
from typing import Any, Dict, Optional

import pandas as pd


class DatasetAnalyzer:
    """Analyze dataset statistics and quality.

    ``load_data`` reads the CSV at ``filepath`` into ``self.df``; the
    ``analyze_*`` methods summarise that frame and return an empty dict
    while no frame is loaded. Counts in the returned dicts are built-in
    Python numbers rather than NumPy scalars.
    """

    def __init__(self, filepath: str):
        """Remember the CSV path; nothing is read until ``load_data``."""
        self.filepath = filepath
        # None means "nothing loaded"; the analyze_* methods check for it.
        self.df: Optional[pd.DataFrame] = None

    @staticmethod
    def _native(value: Any) -> Any:
        """Return the built-in Python equivalent of a NumPy scalar.

        Values without an ``item()`` method (already native) pass through.
        """
        return value.item() if hasattr(value, "item") else value

    def load_data(self) -> bool:
        """Load dataset for analysis.

        Returns:
            True when the CSV was read into ``self.df``. False when reading
            failed; the error is logged and ``self.df`` is reset to None so
            a frame from an earlier successful load is never analysed by
            mistake.
        """
        try:
            self.df = pd.read_csv(self.filepath)
        except Exception as e:
            # Deliberately broad: this is a best-effort boundary that reports
            # failure through the return value instead of raising.
            self.df = None
            logging.error("Failed to load %s: %s", self.filepath, e)
            return False
        return True

    def analyze_completion(self) -> Dict[str, Any]:
        """Analyze annotation completion status.

        Returns:
            An empty dict when no data is loaded, otherwise a dict with the
            keys ``total_rows``, ``annotated_rows``, ``unannotated_rows``,
            ``annotation_percentage``, ``complete_names`` and
            ``completeness_percentage``. Both percentages are 0 for a frame
            without rows.
        """
        if self.df is None:
            return {}

        df = self.df
        total_rows = len(df)

        # Check annotation status. Without the column every row counts as
        # unannotated. int() turns the numpy.int64 from .sum() into a plain
        # int, which json.dumps can serialise.
        if "annotated" in df.columns:
            annotated_count = int((df["annotated"] == 1).sum())
            unannotated_count = int((df["annotated"] == 0).sum())
        else:
            annotated_count = 0
            unannotated_count = total_rows

        # Analyze name completeness: rows where both identified parts are
        # non-null. Stays 0 when either column is absent.
        complete_names = 0
        if "identified_name" in df.columns and "identified_surname" in df.columns:
            both_present = df["identified_name"].notna() & df["identified_surname"].notna()
            complete_names = int(both_present.sum())

        has_rows = total_rows > 0
        return {
            "total_rows": total_rows,
            "annotated_rows": annotated_count,
            "unannotated_rows": unannotated_count,
            "annotation_percentage": (annotated_count / total_rows * 100) if has_rows else 0,
            "complete_names": complete_names,
            "completeness_percentage": (complete_names / total_rows * 100) if has_rows else 0,
        }

    def analyze_quality(self) -> Dict[str, Any]:
        """Analyze data quality metrics.

        Returns:
            An empty dict when no data is loaded, otherwise a dict with:

            * ``missing_values``: null count per column.
            * ``name_length``: mean/median/min/max length of the non-null
              ``name`` values (only when that column exists). Non-string
              values are measured by their ``str()`` form; the statistics
              are NaN when the column has no non-null values.
            * ``word_distribution``: row count per distinct ``words``
              value, ordered by value (only when that column exists).
        """
        if self.df is None:
            return {}

        df = self.df
        quality_metrics: Dict[str, Any] = {}

        # Missing values
        quality_metrics["missing_values"] = df.isnull().sum().to_dict()

        # Name length distribution
        if "name" in df.columns:
            # The .str accessor raises AttributeError on non-string columns
            # (e.g. an all-empty column parsed as float64), so drop nulls and
            # cast to str before measuring.
            name_lengths = df["name"].dropna().astype(str).str.len()
            quality_metrics["name_length"] = {
                "mean": self._native(name_lengths.mean()),
                "median": self._native(name_lengths.median()),
                "min": self._native(name_lengths.min()),
                "max": self._native(name_lengths.max()),
            }

        # Word count distribution
        if "words" in df.columns:
            word_counts = df["words"].value_counts().sort_index()
            quality_metrics["word_distribution"] = word_counts.to_dict()

        return quality_metrics