feat: statistics tests

2025-09-28 23:50:40 +02:00
parent 9039e9a4cf
commit 9e35f95107
18 changed files with 7645 additions and 4872 deletions
@@ -1,6 +1,9 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
 import seaborn as sns

-from research.statistics.utils import LETTERS
+from research.statistics.utils import LETTERS, build_letter_frequencies


 def plot_transition_matrix(ax, df_probs, title=""):
@@ -12,4 +15,36 @@ def plot_transition_matrix(ax, df_probs, title=""):
        ax=ax
    )
    ax.set_title(title, fontsize=12)
-    return hm
+    return hm
+
+
def plot_letter_frequencies(males, females, sort_values=False, title=None):
    """Plot male vs. female letter frequencies as side-by-side bars.

    Letter frequencies are computed with ``build_letter_frequencies`` from the
    ``"name"`` column of each input, exported to a CSV file under
    ``../assets/`` and then displayed as a grouped bar chart.

    Args:
        males: Table exposing a ``"name"`` column (male names).
        females: Table exposing a ``"name"`` column (female names).
        sort_values: When true, bars are ordered by descending male
            frequency. The CSV is written before this sorting happens, so
            the exported table is never reordered by this flag.
        title: Optional label used as prefix of both the CSV file name and
            the chart title. When omitted, no prefix is used.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Per-letter relative frequencies, indexed by letter
    male_freq = build_letter_frequencies(males["name"]).set_index("letter")["freq"]
    female_freq = build_letter_frequencies(females["name"]).set_index("letter")["freq"]

    # Align both series on the letter index; a letter missing on one side is 0
    df_plot = (
        pd.DataFrame({"Male": male_freq, "Female": female_freq})
        .fillna(0)
        .reset_index()
    )

    # Without a title, formatting ``None`` into the f-strings would leak the
    # literal text "None" into the file name and the chart title.
    if title is None:
        csv_name = "letter_frequencies.csv"
        chart_title = "Letter Frequencies"
    else:
        csv_name = f"{title}_letter_frequencies.csv"
        chart_title = f"{title} - Letter Frequencies"
    df_plot.to_csv(f"../assets/{csv_name}", index=False)

    # Optional sorting (affects the plot only, the CSV is already written)
    if sort_values:
        df_plot = df_plot.sort_values("Male", ascending=False)

    # Two bars per letter, shifted half a bar width left/right of the tick
    x = np.arange(len(df_plot))
    w = 0.4
    fig, ax = plt.subplots(figsize=(16, 6))
    ax.bar(x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8)
    ax.bar(x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8)

    ax.set_xticks(x)
    ax.set_xticklabels(df_plot["letter"])
    ax.set_ylabel("Frequency")
    ax.set_xlabel("Letter")
    ax.set_title(chart_title)
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

    plt.tight_layout()
    plt.show()
@@ -23,7 +23,7 @@ def normalize_letters(s):
    return s


-def identified_category_dist(df: pd.DataFrame) -> pd.DataFrame:
+def build_category_distribution(df: pd.DataFrame) -> pd.DataFrame:
    return (
        df.groupby("province")["identified_category"]
        .value_counts(normalize=True)  # get proportions
@@ -31,7 +31,7 @@ def identified_category_dist(df: pd.DataFrame) -> pd.DataFrame:
    )


-def explode_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
+def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
    # Normalize + split once (vectorized)
    s = df[source].fillna('').astype(str)
    s = (
@@ -55,12 +55,26 @@ def explode_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFr


 def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
-    s = series.astype(str).str.lower().str.replace(r'[^a-z]', '', regex=True).str.cat(sep='')
-    out = (
-        s.value_counts(normalize=False)
-        .reindex(list(LETTERS), fill_value=0)
-        .rename_axis("letter").reset_index(name="count")
+    # Normalize: lowercase, remove non-letters, concatenate all into one string
+    s = (
+        series.astype(str)
+        .str.lower()
+        .str.replace(r'[^a-z]', '', regex=True)
+        .str.cat(sep='')
    )
+
+    # Convert string into Series of characters
+    chars = pd.Series(list(s))
+
+    # Count letters and ensure all letters are present
+    out = (
+        chars.value_counts(normalize=False)
+        .reindex(list(LETTERS), fill_value=0)
+        .rename_axis("letter")
+        .reset_index(name="count")
+    )
+
+    # Relative frequency
    total = out["count"].sum()
    out["freq"] = out["count"] / (total if total > 0 else 1)
    return out
@@ -209,3 +223,48 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra
    }, index=["names", "surnames"])

    return out
+
+import pandas as pd
+from collections import Counter
+from typing import Literal
+
+
def build_ngrams_count(
        df: pd.DataFrame,
        n: int,
        where: Literal["any", "prefix", "suffix"] = "any",
) -> pd.DataFrame:
    """Count character n-grams found in the ``"name"`` column of ``df``.

    Names are lowercased and reduced to the characters ``a-z`` before
    n-grams are extracted. Missing names are ignored, and names shorter
    than ``n`` after cleaning contribute nothing.

    Args:
        df: Table with a ``"name"`` column.
        n: Length of the n-grams; must be at least 1.
        where: ``"any"`` counts every n-gram of each name, ``"prefix"`` only
            the first ``n`` characters, ``"suffix"`` only the last ``n``.

    Returns:
        DataFrame with the columns ``"{n}-gram"``, ``count`` and ``freq``
        (``count`` divided by the total count), sorted by descending count.
        Ties keep first-seen order because the sort is stable.

    Raises:
        ValueError: If ``where`` is not one of the supported modes or ``n``
            is smaller than 1.
    """
    # Validate up front so bad arguments fail before any work is done
    if where not in ("any", "prefix", "suffix"):
        raise ValueError("where must be one of: 'any', 'prefix', 'suffix'")
    # n == 0 would make ``s[-n:]`` return the whole name and the other modes
    # emit empty strings; negative n yields meaningless slices.
    if n < 1:
        raise ValueError("n must be a positive integer")

    # Drop missing values first: ``astype(str)`` would turn them into the
    # text "nan"/"None" and count n-grams of that text.
    names = (
        df["name"].dropna()
        .astype(str)
        .str.lower()
        .str.replace(r"[^a-z]", "", regex=True)
    )

    # Names shorter than n cannot contribute any n-gram
    usable = [s for s in names if len(s) >= n]

    if where == "prefix":
        counter = Counter(s[:n] for s in usable)
    elif where == "suffix":
        counter = Counter(s[-n:] for s in usable)
    else:
        counter = Counter(
            s[i:i + n] for s in usable for i in range(len(s) - n + 1)
        )

    # Explicit integer dtype keeps ``count`` numeric even when nothing was found
    out = pd.DataFrame({
        f"{n}-gram": list(counter.keys()),
        "count": pd.Series(list(counter.values()), dtype="int64"),
    })
    out = (
        out.sort_values("count", ascending=False, kind="mergesort")
        .reset_index(drop=True)
    )

    # Relative frequency; guard against division by zero on empty results
    total = out["count"].sum()
    out["freq"] = out["count"] / (total if total > 0 else 1)
    return out