feat: statistics tests

This commit is contained in:
2025-09-28 23:50:40 +02:00
parent 9039e9a4cf
commit 9e35f95107
18 changed files with 7645 additions and 4872 deletions
+66 -7
View File
@@ -23,7 +23,7 @@ def normalize_letters(s):
return s
def identified_category_dist(df: pd.DataFrame) -> pd.DataFrame:
def build_category_distribution(df: pd.DataFrame) -> pd.DataFrame:
return (
df.groupby("province")["identified_category"]
.value_counts(normalize=True) # get proportions
@@ -31,7 +31,7 @@ def identified_category_dist(df: pd.DataFrame) -> pd.DataFrame:
)
def explode_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
# Normalize + split once (vectorized)
s = df[source].fillna('').astype(str)
s = (
@@ -55,12 +55,26 @@ def explode_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFr
def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
s = series.astype(str).str.lower().str.replace(r'[^a-z]', '', regex=True).str.cat(sep='')
out = (
s.value_counts(normalize=False)
.reindex(list(LETTERS), fill_value=0)
.rename_axis("letter").reset_index(name="count")
# Normalize: lowercase, remove non-letters, concatenate all into one string
s = (
series.astype(str)
.str.lower()
.str.replace(r'[^a-z]', '', regex=True)
.str.cat(sep='')
)
# Convert string into Series of characters
chars = pd.Series(list(s))
# Count letters and ensure all letters are present
out = (
chars.value_counts(normalize=False)
.reindex(list(LETTERS), fill_value=0)
.rename_axis("letter")
.reset_index(name="count")
)
# Relative frequency
total = out["count"].sum()
out["freq"] = out["count"] / (total if total > 0 else 1)
return out
@@ -209,3 +223,48 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra
}, index=["names", "surnames"])
return out
import pandas as pd
from collections import Counter
from typing import Literal
def build_ngrams_count(
df: pd.DataFrame,
n: int,
where: Literal["any", "prefix", "suffix"] = "any",
) -> pd.DataFrame:
# Normalize and clean to az
names = (
df["name"].astype(str)
.str.lower()
.str.replace(r"[^a-z]", "", regex=True)
)
ngrams = []
if where == "any":
for s in names:
L = len(s)
if L >= n:
ngrams.extend(s[i:i+n] for i in range(L - n + 1))
elif where == "prefix":
for s in names:
if len(s) >= n:
ngrams.append(s[:n])
elif where == "suffix":
for s in names:
if len(s) >= n:
ngrams.append(s[-n:])
else:
raise ValueError("where must be one of: 'any', 'prefix', 'suffix'")
counter = Counter(ngrams)
out = (
pd.DataFrame(counter.items(), columns=[f"{n}-gram", "count"])
.sort_values("count", ascending=False, kind="mergesort")
.reset_index(drop=True)
)
total = out["count"].sum()
out["freq"] = out["count"] / (total if total > 0 else 1)
return out