feat: statistics tests

This commit is contained in:
2025-09-28 23:50:40 +02:00
parent 9039e9a4cf
commit 9e35f95107
18 changed files with 7645 additions and 4872 deletions
+37 -2
View File
@@ -1,6 +1,9 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from research.statistics.utils import LETTERS
from research.statistics.utils import LETTERS, build_letter_frequencies
def plot_transition_matrix(ax, df_probs, title=""):
@@ -12,4 +15,36 @@ def plot_transition_matrix(ax, df_probs, title=""):
ax=ax
)
ax.set_title(title, fontsize=12)
return hm
return hm
def plot_letter_frequencies(males, females, sort_values=False, title=None):
    """Plot male vs. female letter frequencies as side-by-side bars.

    The relative frequency of each letter is taken from the ``freq`` column
    that ``build_letter_frequencies`` returns for the ``name`` column of each
    input. The combined table is also written as CSV under ``../assets/``
    (relative to the current working directory) before any sorting is
    applied, and the figure is displayed with ``plt.show()``.

    Args:
        males: Table with a ``name`` column holding the male names.
        females: Table with a ``name`` column holding the female names.
        sort_values: When true, order the bars by descending male frequency
            instead of keeping the default row order.
        title: Optional label used as the prefix of both the chart title and
            the CSV file name. When omitted or empty, neither gets a prefix.
    """
    # Compute frequencies, indexed by letter so both series align on it
    male_freq = build_letter_frequencies(males["name"]).set_index("letter")["freq"]
    female_freq = build_letter_frequencies(females["name"]).set_index("letter")["freq"]

    # Combine into one DataFrame; a letter missing on one side counts as 0
    df_plot = (
        pd.DataFrame({"Male": male_freq, "Female": female_freq})
        .fillna(0)
        .reset_index()
    )

    # Only add the prefix when a title was given: formatting None straight
    # into the f-strings would yield "None_letter_frequencies.csv" and
    # "None - Letter Frequencies".
    file_prefix = f"{title}_" if title else ""
    heading = f"{title} - Letter Frequencies" if title else "Letter Frequencies"

    df_plot.to_csv(f"../assets/{file_prefix}letter_frequencies.csv", index=False)

    # Optional sorting (affects the plot only; the CSV is already written)
    if sort_values:
        df_plot = df_plot.sort_values("Male", ascending=False)

    # Plot side-by-side bars: each pair is centred on its tick position
    x = np.arange(len(df_plot))
    w = 0.4

    fig, ax = plt.subplots(figsize=(16, 6))
    ax.bar(x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8)
    ax.bar(x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8)

    ax.set_xticks(x)
    ax.set_xticklabels(df_plot["letter"])
    ax.set_ylabel("Frequency")
    ax.set_xlabel("Letter")
    ax.set_title(heading)
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

    plt.tight_layout()
    plt.show()
+66 -7
View File
@@ -23,7 +23,7 @@ def normalize_letters(s):
return s
def identified_category_dist(df: pd.DataFrame) -> pd.DataFrame:
def build_category_distribution(df: pd.DataFrame) -> pd.DataFrame:
return (
df.groupby("province")["identified_category"]
.value_counts(normalize=True) # get proportions
@@ -31,7 +31,7 @@ def identified_category_dist(df: pd.DataFrame) -> pd.DataFrame:
)
def explode_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
# Normalize + split once (vectorized)
s = df[source].fillna('').astype(str)
s = (
@@ -55,12 +55,26 @@ def explode_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFr
def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
    """Count how often each letter of ``LETTERS`` occurs across ``series``.

    Every value is lowercased and stripped of all characters outside
    ``a-z`` before counting. Missing values are ignored.

    Args:
        series: Values whose letters should be counted.

    Returns:
        DataFrame with one row per entry of ``LETTERS`` (in that order) and
        the columns ``letter``, ``count`` and ``freq``. ``freq`` is ``count``
        divided by the total count, or all zeros when nothing was counted.
    """
    # Normalize: lowercase, remove non-letters, concatenate all into one string.
    # Missing values are dropped first; otherwise astype(str) can turn them
    # into the text "nan"/"None", whose letters would be counted.
    s = (
        series.dropna()
        .astype(str)
        .str.lower()
        .str.replace(r'[^a-z]', '', regex=True)
        .str.cat(sep='')
    )

    # Convert string into Series of characters (a plain str has no
    # value_counts, so the characters must be wrapped first)
    chars = pd.Series(list(s))

    # Count letters and ensure all letters are present
    out = (
        chars.value_counts(normalize=False)
        .reindex(list(LETTERS), fill_value=0)
        .rename_axis("letter")
        .reset_index(name="count")
    )

    # Relative frequency; guard against division by zero on empty input
    total = out["count"].sum()
    out["freq"] = out["count"] / (total if total > 0 else 1)

    return out
@@ -209,3 +223,48 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra
}, index=["names", "surnames"])
return out
import pandas as pd
from collections import Counter
from typing import Literal
def build_ngrams_count(
    df: pd.DataFrame,
    n: int,
    where: Literal["any", "prefix", "suffix"] = "any",
) -> pd.DataFrame:
    """Count character n-grams in the ``name`` column of ``df``.

    Names are lowercased and stripped of every character outside ``a-z``
    before n-grams are extracted. Missing names and names shorter than ``n``
    after cleaning contribute nothing.

    Args:
        df: Table with a ``name`` column.
        n: Length of the n-grams; must be at least 1.
        where: ``"any"`` counts every n-gram in each name, ``"prefix"`` only
            the first ``n`` characters, ``"suffix"`` only the last ``n``.

    Returns:
        DataFrame with the columns ``"{n}-gram"``, ``count`` and ``freq``
        (``count`` divided by the total count), ordered by descending count.
        Ties keep the order in which the n-grams were first seen.

    Raises:
        ValueError: If ``n`` is smaller than 1 or ``where`` is not one of the
            supported modes.
    """
    # A non-positive n would silently count empty or mis-sliced strings.
    if n < 1:
        raise ValueError("n must be a positive integer")
    if where not in ("any", "prefix", "suffix"):
        raise ValueError("where must be one of: 'any', 'prefix', 'suffix'")

    # Normalize and clean to a-z. Missing values are dropped first; otherwise
    # astype(str) can turn them into the text "nan"/"None" and they would be
    # counted as if they were names.
    names = (
        df["name"].dropna()
        .astype(str)
        .str.lower()
        .str.replace(r"[^a-z]", "", regex=True)
    )

    counter = Counter()
    for s in names:
        if len(s) < n:
            continue
        if where == "any":
            counter.update(s[i:i + n] for i in range(len(s) - n + 1))
        elif where == "prefix":
            counter[s[:n]] += 1
        else:  # "suffix"
            counter[s[-n:]] += 1

    # most_common() sorts by descending count and is stable, so equal counts
    # stay in first-seen order.
    out = pd.DataFrame(counter.most_common(), columns=[f"{n}-gram", "count"])

    # Relative frequency; guard against division by zero when nothing matched
    total = out["count"].sum()
    out["freq"] = out["count"] / (total if total > 0 else 1)
    return out