81 lines
2.6 KiB
Python
81 lines
2.6 KiB
Python
import logging
|
|
from typing import Dict
|
|
|
|
import pandas as pd
|
|
|
|
|
|
class DatasetAnalyzer:
    """Analyze dataset statistics and quality.

    The dataset is read from a CSV file into a pandas DataFrame by
    ``load_data``. Until a load has succeeded, every ``analyze_*`` method
    returns an empty dict.
    """

    def __init__(self, filepath: str):
        """Store the CSV path; nothing is read until ``load_data`` is called."""
        self.filepath = filepath
        # Loaded pandas DataFrame, or None while no dataset is available.
        self.df = None

    @staticmethod
    def _to_native(value):
        """Convert a numpy scalar to the matching built-in Python number.

        Values without an ``item`` method are returned unchanged.
        """
        return value.item() if hasattr(value, "item") else value

    def load_data(self) -> bool:
        """Load dataset for analysis.

        Returns:
            True when the CSV at ``self.filepath`` was read into ``self.df``.
            False when reading failed; the error is logged and ``self.df`` is
            reset to None so a previously loaded dataset is not analyzed by
            mistake.
        """
        try:
            self.df = pd.read_csv(self.filepath)
        except Exception as e:
            # Deliberately broad: this is a best-effort boundary that reports
            # any read/parse failure through the boolean return value.
            self.df = None
            logging.error("Failed to load %s: %s", self.filepath, e)
            return False
        return True

    def analyze_completion(self) -> Dict:
        """Analyze annotation completion status.

        Returns:
            An empty dict when no dataset is loaded. Otherwise a dict with:

            * ``total_rows``: number of rows in the dataset.
            * ``annotated_rows``: rows whose "annotated" value equals 1
              (0 when the column is absent).
            * ``unannotated_rows``: rows whose "annotated" value equals 0
              (``total_rows`` when the column is absent). Rows holding any
              other value, including missing ones, fall in neither count.
            * ``annotation_percentage``: annotated rows as a percentage of
              all rows (0 for an empty dataset).
            * ``complete_names``: rows where both "identified_name" and
              "identified_surname" are non-null (0 when either column is
              absent).
            * ``completeness_percentage``: complete names as a percentage of
              all rows (0 for an empty dataset).

            Counts are built-in ``int`` values.
        """
        if self.df is None:
            return {}

        total_rows = len(self.df)

        # Check annotation status
        if "annotated" in self.df.columns:
            flags = self.df["annotated"]
            # int() turns the numpy scalars returned by sum() into plain ints.
            annotated_count = int((flags == 1).sum())
            unannotated_count = int((flags == 0).sum())
        else:
            annotated_count = 0
            unannotated_count = total_rows

        # Analyze name completeness
        complete_names = 0
        if "identified_name" in self.df.columns and "identified_surname" in self.df.columns:
            has_name = self.df["identified_name"].notna()
            has_surname = self.df["identified_surname"].notna()
            complete_names = int((has_name & has_surname).sum())

        def percentage(part):
            # Guard against division by zero on an empty dataset.
            return (part / total_rows * 100) if total_rows > 0 else 0

        return {
            "total_rows": total_rows,
            "annotated_rows": annotated_count,
            "unannotated_rows": unannotated_count,
            "annotation_percentage": percentage(annotated_count),
            "complete_names": complete_names,
            "completeness_percentage": percentage(complete_names),
        }

    def analyze_quality(self) -> Dict:
        """Analyze data quality metrics.

        Returns:
            An empty dict when no dataset is loaded. Otherwise a dict with:

            * ``missing_values``: null count per column.
            * ``name_length``: mean, median, min and max string length of the
              "name" column. Present only when that column exists and
              supports string operations; otherwise a warning is logged and
              the key is omitted.
            * ``word_distribution``: number of rows per distinct value of the
              "words" column, ordered by value. Present only when that column
              exists.
        """
        if self.df is None:
            return {}

        quality_metrics = {}

        # Missing values
        quality_metrics["missing_values"] = self.df.isnull().sum().to_dict()

        # Name length distribution
        if "name" in self.df.columns:
            try:
                name_lengths = self.df["name"].str.len()
            except AttributeError:
                # pandas raises AttributeError from the .str accessor when the
                # column is not string-like (e.g. an all-missing column read
                # as float64); skip this metric rather than abort the report.
                logging.warning(
                    "Column 'name' is not string-like (dtype %s); "
                    "skipping name length metrics",
                    self.df["name"].dtype,
                )
            else:
                quality_metrics["name_length"] = {
                    "mean": self._to_native(name_lengths.mean()),
                    "median": self._to_native(name_lengths.median()),
                    "min": self._to_native(name_lengths.min()),
                    "max": self._to_native(name_lengths.max()),
                }

        # Word count distribution
        if "words" in self.df.columns:
            word_counts = self.df["words"].value_counts().sort_index()
            quality_metrics["word_distribution"] = word_counts.to_dict()

        return quality_metrics
|