import logging
from typing import Any, Dict, Optional

import pandas as pd


class DatasetAnalyzer:
    """Analyze dataset statistics and quality.

    The dataset is read from a CSV file by :meth:`load_data`; the
    ``analyze_*`` methods return an empty dict until a frame is loaded.
    All scalar statistics are returned as built-in ``int``/``float`` values
    so the result dictionaries can be passed to ``json.dumps`` directly.
    """

    def __init__(self, filepath: str):
        """Remember the CSV path; no I/O happens until ``load_data``."""
        self.filepath = filepath
        self.df: Optional[pd.DataFrame] = None

    @staticmethod
    def _as_float(value: Any) -> float:
        """Convert a pandas/NumPy scalar to ``float``; missing becomes NaN."""
        return float("nan") if pd.isna(value) else float(value)

    def load_data(self) -> bool:
        """Load the dataset for analysis.

        Returns:
            ``True`` when the CSV was read into ``self.df``; ``False`` when
            reading failed (the error is logged, not raised).
        """
        try:
            self.df = pd.read_csv(self.filepath)
        except Exception as e:
            # Deliberate best-effort boundary: callers rely on the boolean
            # result instead of handling pandas/OS exceptions themselves.
            logging.error("Failed to load %s: %s", self.filepath, e)
            return False
        return True

    def analyze_completion(self) -> Dict:
        """Analyze annotation completion status.

        Returns:
            An empty dict when no data is loaded. Otherwise a dict with:
            ``total_rows``, ``annotated_rows`` (rows where ``annotated == 1``),
            ``unannotated_rows`` (rows where ``annotated == 0``),
            ``annotation_percentage``, ``complete_names`` (rows where both
            ``identified_name`` and ``identified_surname`` are non-null) and
            ``completeness_percentage``. Percentages are ``0`` for an empty
            frame. When the ``annotated`` column is absent, every row counts
            as unannotated.
        """
        if self.df is None:
            return {}

        df = self.df
        total_rows = len(df)

        # ``.sum()`` on a boolean Series yields numpy.int64, which json.dumps
        # rejects, so every count is converted to a built-in int.
        if "annotated" in df.columns:
            annotated_count = int((df["annotated"] == 1).sum())
            unannotated_count = int((df["annotated"] == 0).sum())
        else:
            annotated_count = 0
            unannotated_count = total_rows

        complete_names = 0
        if "identified_name" in df.columns and "identified_surname" in df.columns:
            both_present = (
                df["identified_name"].notna() & df["identified_surname"].notna()
            )
            complete_names = int(both_present.sum())

        def percentage(count: int) -> float:
            # Guard against division by zero on an empty frame.
            return (count / total_rows * 100) if total_rows > 0 else 0

        return {
            "total_rows": total_rows,
            "annotated_rows": annotated_count,
            "unannotated_rows": unannotated_count,
            "annotation_percentage": percentage(annotated_count),
            "complete_names": complete_names,
            "completeness_percentage": percentage(complete_names),
        }

    def analyze_quality(self) -> Dict:
        """Analyze data quality metrics.

        Returns:
            An empty dict when no data is loaded. Otherwise a dict with:
            ``missing_values`` (null count per column); ``name_length``
            (``mean``/``median``/``min``/``max`` of the string length of the
            ``name`` column, as floats) when a ``name`` column exists; and
            ``word_distribution`` (value -> row count for the ``words``
            column, ordered by value) when a ``words`` column exists.
            The ``name_length`` statistics are NaN when the ``name`` column
            holds no string values.
        """
        if self.df is None:
            return {}

        df = self.df
        quality_metrics: Dict[str, Any] = {}

        # Missing values
        quality_metrics["missing_values"] = df.isnull().sum().to_dict()

        # Name length distribution
        if "name" in df.columns:
            try:
                name_lengths = df["name"].str.len()
            except AttributeError:
                # The ``.str`` accessor is unavailable on non-string columns
                # (e.g. an all-empty CSV column parsed as float64). There are
                # no string lengths to measure, so fall back to an empty
                # series whose statistics are all NaN.
                name_lengths = pd.Series(dtype="float64")
            quality_metrics["name_length"] = {
                "mean": self._as_float(name_lengths.mean()),
                "median": self._as_float(name_lengths.median()),
                "min": self._as_float(name_lengths.min()),
                "max": self._as_float(name_lengths.max()),
            }

        # Word count distribution
        if "words" in df.columns:
            word_counts = df["words"].value_counts().sort_index()
            quality_metrics["word_distribution"] = word_counts.to_dict()

        return quality_metrics