# drc-ners-nlp/processing/monitoring/data_analyzer.py

import logging
from typing import Dict

import pandas as pd


class DatasetAnalyzer:
    """Analyze dataset statistics and quality.

    The dataset is read from a CSV file by :meth:`load_data`. Until a load
    succeeds ``self.df`` is ``None`` and every ``analyze_*`` method returns
    an empty dict.
    """

    def __init__(self, filepath: str):
        """Remember the CSV location; no I/O happens until ``load_data()``.

        Args:
            filepath: Path handed to ``pandas.read_csv``.
        """
        self.filepath = filepath
        # Set by load_data(); None means "nothing loaded yet".
        self.df = None

    @staticmethod
    def _to_native(value):
        """Return *value* as a plain Python scalar.

        pandas reductions hand back NumPy scalars (``numpy.int64`` etc.),
        which ``json.dumps`` rejects. NumPy scalars expose ``.item()``;
        anything without it is returned unchanged.
        """
        return value.item() if hasattr(value, "item") else value

    def load_data(self) -> bool:
        """Load dataset for analysis.

        Returns:
            ``True`` when the CSV was read into ``self.df``; ``False`` when
            reading failed (the error is logged and ``self.df`` keeps its
            previous value).
        """
        try:
            self.df = pd.read_csv(self.filepath)
        except Exception as e:
            # Deliberately broad: this is a best-effort boundary and the
            # caller only needs a success flag; the cause goes to the log.
            logging.error("Failed to load %s: %s", self.filepath, e)
            return False
        return True

    def analyze_completion(self) -> Dict:
        """Analyze annotation completion status.

        Returns:
            An empty dict when no data is loaded, otherwise a dict with:

            * ``total_rows``: number of rows in the dataset.
            * ``annotated_rows`` / ``unannotated_rows``: rows whose
              ``annotated`` value equals 1 / 0. When the column is absent
              every row counts as unannotated.
            * ``annotation_percentage``: annotated share of all rows.
            * ``complete_names``: rows where both ``identified_name`` and
              ``identified_surname`` are present (0 if either column is
              missing).
            * ``completeness_percentage``: complete share of all rows.

            Counts are plain ``int`` values. Percentages are 0 for an empty
            dataset.
        """
        if self.df is None:
            return {}

        total_rows = len(self.df)

        # Check annotation status
        if "annotated" in self.df.columns:
            annotated = self.df["annotated"]
            # int(): .sum() yields numpy.int64, which is not JSON-serializable.
            annotated_count = int((annotated == 1).sum())
            unannotated_count = int((annotated == 0).sum())
        else:
            annotated_count = 0
            unannotated_count = total_rows

        # Analyze name completeness
        complete_names = 0
        if "identified_name" in self.df.columns and "identified_surname" in self.df.columns:
            both_present = (
                self.df["identified_name"].notna() & self.df["identified_surname"].notna()
            )
            complete_names = int(both_present.sum())

        return {
            "total_rows": total_rows,
            "annotated_rows": annotated_count,
            "unannotated_rows": unannotated_count,
            "annotation_percentage": (annotated_count / total_rows * 100) if total_rows > 0 else 0,
            "complete_names": complete_names,
            "completeness_percentage": (complete_names / total_rows * 100) if total_rows > 0 else 0,
        }

    def analyze_quality(self) -> Dict:
        """Analyze data quality metrics.

        Returns:
            An empty dict when no data is loaded, otherwise a dict with:

            * ``missing_values``: per-column count of null cells.
            * ``name_length`` (only if a ``name`` column exists): ``mean``,
              ``median``, ``min`` and ``max`` of the character length of the
              non-missing names. All four are NaN when no name is present.
            * ``word_distribution`` (only if a ``words`` column exists):
              mapping of each distinct ``words`` value to its row count,
              ordered by value.
        """
        if self.df is None:
            return {}

        quality_metrics = {}

        # Missing values
        missing_data = self.df.isnull().sum()
        quality_metrics["missing_values"] = missing_data.to_dict()

        # Name length distribution
        if "name" in self.df.columns:
            # The .str accessor raises AttributeError on non-string dtypes
            # (e.g. an all-empty column parsed as float64). Dropping missing
            # entries and casting the rest to str keeps the statistics
            # computable; the reductions ignore missing values anyway.
            present_names = self.df["name"].dropna()
            name_lengths = present_names.astype(str).str.len()
            quality_metrics["name_length"] = {
                "mean": self._to_native(name_lengths.mean()),
                "median": self._to_native(name_lengths.median()),
                "min": self._to_native(name_lengths.min()),
                "max": self._to_native(name_lengths.max()),
            }

        # Word count distribution
        if "words" in self.df.columns:
            word_counts = self.df["words"].value_counts().sort_index()
            quality_metrics["word_distribution"] = word_counts.to_dict()

        return quality_metrics