Files
drc-ners-nlp/processing/monitoring/data_analyzer.py
T

81 lines
2.6 KiB
Python

import logging
from typing import Dict
import pandas as pd
class DatasetAnalyzer:
    """Compute completion and quality statistics for a CSV dataset.

    Typical use: construct with a file path, call ``load_data()``, then call
    ``analyze_completion()`` and/or ``analyze_quality()``. Both analysis
    methods return an empty dict while no data is loaded.
    """

    def __init__(self, filepath: str):
        """Remember *filepath*; nothing is read until ``load_data()`` is called."""
        self.filepath = filepath
        # Holds the loaded pandas DataFrame, or None when nothing is loaded.
        self.df = None

    def load_data(self) -> bool:
        """Read the CSV at ``self.filepath`` into ``self.df``.

        Returns:
            True on success. On any failure the error is logged, ``self.df``
            is reset to None, and False is returned.
        """
        try:
            self.df = pd.read_csv(self.filepath)
        except Exception as e:
            # Deliberately broad: this is a best-effort boundary that reports
            # failure through its return value (missing file, parse errors,
            # encoding errors, ... all end up here).
            logging.error("Failed to load %s: %s", self.filepath, e)
            # Do not keep a DataFrame from an earlier successful load around,
            # otherwise later analysis would silently describe stale data.
            self.df = None
            return False
        return True

    @staticmethod
    def _percentage(part: int, total: int) -> float:
        """Return ``part / total`` as a percentage, or 0.0 when *total* is 0."""
        return (part / total * 100) if total > 0 else 0.0

    def analyze_completion(self) -> Dict:
        """Summarise annotation and name-identification progress.

        Returns:
            An empty dict when no data is loaded, otherwise a dict with:

            - ``total_rows``: number of rows in the DataFrame.
            - ``annotated_rows`` / ``unannotated_rows``: rows whose
              ``annotated`` column equals 1 / equals 0. When the column is
              absent every row counts as unannotated.
            - ``annotation_percentage``: ``annotated_rows`` over
              ``total_rows`` as a percentage (0 for an empty DataFrame).
            - ``complete_names``: rows where both ``identified_name`` and
              ``identified_surname`` are non-null (0 when either column is
              absent).
            - ``completeness_percentage``: ``complete_names`` over
              ``total_rows`` as a percentage (0 for an empty DataFrame).

            Counts are plain ``int`` and percentages plain ``float``.
        """
        if self.df is None:
            return {}

        columns = self.df.columns
        total_rows = len(self.df)

        # Annotation status. Rows whose flag is neither 1 nor 0 (e.g. NaN)
        # are counted in neither bucket.
        if "annotated" in columns:
            flags = self.df["annotated"]
            # int(...) turns the NumPy scalar from .sum() into a native int.
            annotated_count = int((flags == 1).sum())
            unannotated_count = int((flags == 0).sum())
        else:
            annotated_count = 0
            unannotated_count = total_rows

        # Name completeness: both identified parts must be present.
        complete_names = 0
        if "identified_name" in columns and "identified_surname" in columns:
            has_both = (
                self.df["identified_name"].notna()
                & self.df["identified_surname"].notna()
            )
            complete_names = int(has_both.sum())

        return {
            "total_rows": total_rows,
            "annotated_rows": annotated_count,
            "unannotated_rows": unannotated_count,
            "annotation_percentage": self._percentage(annotated_count, total_rows),
            "complete_names": complete_names,
            "completeness_percentage": self._percentage(complete_names, total_rows),
        }

    def analyze_quality(self) -> Dict:
        """Collect basic data-quality metrics.

        Returns:
            An empty dict when no data is loaded, otherwise a dict with:

            - ``missing_values``: per-column count of null cells.
            - ``name_length`` (only if a ``name`` column exists): ``mean``,
              ``median``, ``min`` and ``max`` of the character length of the
              non-null names. All four are NaN when there is no non-null
              name.
            - ``word_distribution`` (only if a ``words`` column exists):
              mapping of each distinct ``words`` value to its row count,
              ordered by value.
        """
        if self.df is None:
            return {}

        quality_metrics = {}

        # Missing values per column.
        quality_metrics["missing_values"] = self.df.isnull().sum().to_dict()

        # Name length distribution.
        if "name" in self.df.columns:
            names = self.df["name"].dropna()
            if names.empty:
                # Nothing to measure (empty frame or an all-null column).
                nan = float("nan")
                quality_metrics["name_length"] = {
                    "mean": nan,
                    "median": nan,
                    "min": nan,
                    "max": nan,
                }
            else:
                # Cast to str first: the .str accessor raises AttributeError
                # on non-string columns (e.g. purely numeric values).
                lengths = names.astype(str).str.len()
                quality_metrics["name_length"] = {
                    "mean": float(lengths.mean()),
                    "median": float(lengths.median()),
                    "min": int(lengths.min()),
                    "max": int(lengths.max()),
                }

        # Word count distribution.
        if "words" in self.df.columns:
            word_counts = self.df["words"].value_counts().sort_index()
            quality_metrics["word_distribution"] = word_counts.to_dict()

        return quality_metrics