feat: statistics tests
This commit is contained in:
@@ -1,6 +1,9 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
from research.statistics.utils import LETTERS
|
||||
from research.statistics.utils import LETTERS, build_letter_frequencies
|
||||
|
||||
|
||||
def plot_transition_matrix(ax, df_probs, title=""):
|
||||
@@ -12,4 +15,36 @@ def plot_transition_matrix(ax, df_probs, title=""):
|
||||
ax=ax
|
||||
)
|
||||
ax.set_title(title, fontsize=12)
|
||||
return hm
|
||||
return hm
|
||||
|
||||
|
||||
def plot_letter_frequencies(males, females, sort_values=False, title=None):
    """Plot male vs. female letter frequencies as side-by-side bars.

    The relative frequency of every letter is computed separately for the
    ``"name"`` column of ``males`` and of ``females`` via
    ``build_letter_frequencies``. The combined table is written to a CSV
    file under ``../assets/`` and then drawn as a grouped bar chart.

    Args:
        males: Table with a ``"name"`` column holding the male names.
        females: Table with a ``"name"`` column holding the female names.
        sort_values: When true, order the bars by descending male frequency.
            The CSV is written before sorting, so it keeps the unsorted order.
        title: Optional label used as the CSV file-name prefix and as the
            plot heading prefix. When ``None`` the prefix is omitted.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Per-letter relative frequencies, indexed by letter.
    male_freq = build_letter_frequencies(males["name"]).set_index("letter")["freq"]
    female_freq = build_letter_frequencies(females["name"]).set_index("letter")["freq"]

    # Align both series on the letter index and bring "letter" back as a column.
    df_plot = (
        pd.DataFrame({"Male": male_freq, "Female": female_freq})
        .fillna(0)
        .reset_index()
    )

    # Formatting ``None`` into an f-string would leak the literal text "None"
    # into the file name and the heading, so the prefix is dropped instead.
    if title is None:
        csv_name = "letter_frequencies.csv"
        heading = "Letter Frequencies"
    else:
        csv_name = f"{title}_letter_frequencies.csv"
        heading = f"{title} - Letter Frequencies"

    df_plot.to_csv(f"../assets/{csv_name}", index=False)

    # Optional sorting (affects the plot only; the CSV is already written).
    if sort_values:
        df_plot = df_plot.sort_values("Male", ascending=False)

    # Two bars per letter, each shifted half a bar width off the tick.
    x = np.arange(len(df_plot))
    w = 0.4
    fig, ax = plt.subplots(figsize=(16, 6))
    ax.bar(x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8)
    ax.bar(x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8)

    ax.set_xticks(x)
    ax.set_xticklabels(df_plot["letter"])
    ax.set_ylabel("Frequency")
    ax.set_xlabel("Letter")
    ax.set_title(heading)
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

    plt.tight_layout()
    plt.show()
|
||||
|
||||
@@ -23,7 +23,7 @@ def normalize_letters(s):
|
||||
return s
|
||||
|
||||
|
||||
def identified_category_dist(df: pd.DataFrame) -> pd.DataFrame:
|
||||
def build_category_distribution(df: pd.DataFrame) -> pd.DataFrame:
|
||||
return (
|
||||
df.groupby("province")["identified_category"]
|
||||
.value_counts(normalize=True) # get proportions
|
||||
@@ -31,7 +31,7 @@ def identified_category_dist(df: pd.DataFrame) -> pd.DataFrame:
|
||||
)
|
||||
|
||||
|
||||
def explode_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
|
||||
def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
|
||||
# Normalize + split once (vectorized)
|
||||
s = df[source].fillna('').astype(str)
|
||||
s = (
|
||||
@@ -55,12 +55,26 @@ def explode_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFr
|
||||
|
||||
|
||||
def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
    """Count how often each letter of ``LETTERS`` occurs across ``series``.

    Every value is lower-cased and stripped of all characters outside
    ``a``-``z``; the remaining characters of all values are pooled and
    counted together.

    Args:
        series: Values to analyse. Missing values (NaN/None) are ignored.

    Returns:
        A DataFrame with one row per entry of ``LETTERS`` (in that order) and
        the columns ``letter``, ``count`` (absolute occurrences) and ``freq``
        (share of all counted letters; all zeros when nothing was counted).
    """
    # Drop missing values first: ``astype(str)`` would otherwise turn them
    # into the literal text "nan"/"None" and those letters would be counted.
    text = (
        series.dropna()
        .astype(str)
        .str.lower()
        .str.replace(r"[^a-z]", "", regex=True)
        .str.cat(sep="")
    )

    # One element per character so that value_counts can tally the letters
    # (a plain ``str`` has no value_counts method).
    chars = pd.Series(list(text), dtype=object)

    # Count letters and make sure every letter is present, even with count 0.
    out = (
        chars.value_counts(normalize=False)
        .reindex(list(LETTERS), fill_value=0)
        .rename_axis("letter")
        .reset_index(name="count")
    )

    # Relative frequency; the fallback divisor avoids 0/0 on empty input.
    total = out["count"].sum()
    out["freq"] = out["count"] / (total if total > 0 else 1)
    return out
|
||||
@@ -209,3 +223,48 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra
|
||||
}, index=["names", "surnames"])
|
||||
|
||||
return out
|
||||
|
||||
import pandas as pd
|
||||
from collections import Counter
|
||||
from typing import Literal
|
||||
|
||||
|
||||
def build_ngrams_count(
    df: pd.DataFrame,
    n: int,
    where: Literal["any", "prefix", "suffix"] = "any",
) -> pd.DataFrame:
    """Count character n-grams in the ``"name"`` column of ``df``.

    Names are lower-cased and reduced to the characters ``a``-``z`` before
    counting. Names shorter than ``n`` after cleaning contribute nothing, and
    missing values (NaN/None) are ignored.

    Args:
        df: Table with a ``"name"`` column.
        n: Length of the n-grams; must be at least 1.
        where: ``"any"`` counts every window of length ``n`` in each name,
            ``"prefix"`` only the first ``n`` characters, ``"suffix"`` only
            the last ``n`` characters.

    Returns:
        A DataFrame with the columns ``"{n}-gram"``, ``count`` and ``freq``
        (share of all counted n-grams), sorted by descending count. Ties keep
        the order in which the n-grams were first seen.

    Raises:
        ValueError: If ``where`` is not a supported mode or ``n`` is below 1.
    """
    if where not in ("any", "prefix", "suffix"):
        raise ValueError("where must be one of: 'any', 'prefix', 'suffix'")
    if n < 1:
        # n == 0 would count empty strings and a negative n would slice from
        # the wrong end, so reject both instead of returning nonsense.
        raise ValueError("n must be a positive integer")

    # Drop missing values first: ``astype(str)`` would otherwise turn them
    # into the literal text "nan"/"None" and count n-grams of that text.
    names = (
        df["name"].dropna().astype(str)
        .str.lower()
        .str.replace(r"[^a-z]", "", regex=True)
    )

    # Count directly instead of materialising every n-gram in a list first.
    counter = Counter()
    for s in names:
        length = len(s)
        if length < n:
            continue
        if where == "any":
            counter.update(s[i:i + n] for i in range(length - n + 1))
        elif where == "prefix":
            counter[s[:n]] += 1
        else:  # "suffix"
            counter[s[-n:]] += 1

    out = (
        pd.DataFrame(list(counter.items()), columns=[f"{n}-gram", "count"])
        # Keeps ``count`` an integer column even when no n-gram was found.
        .astype({"count": "int64"})
        # mergesort is stable, so equal counts keep first-seen order.
        .sort_values("count", ascending=False, kind="mergesort")
        .reset_index(drop=True)
    )

    # Relative frequency; the fallback divisor avoids 0/0 on empty input.
    total = out["count"].sum()
    out["freq"] = out["count"] / (total if total > 0 else 1)
    return out
|
||||
|
||||
Reference in New Issue
Block a user