feat: support gpu

This commit is contained in:
2025-09-29 21:07:23 +02:00
parent 9e35f95107
commit a1d500830b
15 changed files with 661 additions and 85 deletions
+1 -1
View File
@@ -48,7 +48,7 @@ class BiGRUModel(NeuralNetworkModel):
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary gender classification.
Dense(2, activation="softmax"),
Dense(2, activation="softmax", dtype="float32"),
]
)
+1 -1
View File
@@ -54,7 +54,7 @@ class CNNModel(NeuralNetworkModel):
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary classification.
Dense(2, activation="softmax"),
Dense(2, activation="softmax", dtype="float32"),
]
)
+9
View File
@@ -20,6 +20,12 @@ class LightGBMModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Optional GPU acceleration
use_gpu = bool(params.get("use_gpu", False))
device = params.get("device", "gpu" if use_gpu else "cpu")
gpu_platform_id = params.get("gpu_platform_id", None)
gpu_device_id = params.get("gpu_device_id", None)
# Leaf-wise boosted trees excel on sparse/categorical mixes; binary objective
# and parallelism improve training speed for this task.
return lgb.LGBMClassifier(
@@ -33,6 +39,9 @@ class LightGBMModel(TraditionalModel):
objective=params.get("objective", "binary"),
n_jobs=params.get("n_jobs", -1),
verbose=2,
device=device,
gpu_platform_id=gpu_platform_id,
gpu_device_id=gpu_device_id,
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
+1 -1
View File
@@ -45,7 +45,7 @@ class LSTMModel(NeuralNetworkModel):
Dense(64, activation="relu"),
Dropout(params.get("dropout", 0.5)),
# Two-way softmax for binary classification.
Dense(2, activation="softmax"),
Dense(2, activation="softmax", dtype="float32"),
]
)
+1 -1
View File
@@ -45,7 +45,7 @@ class TransformerModel(NeuralNetworkModel):
x = GlobalAveragePooling1D()(x)
x = Dense(32, activation="relu")(x)
x = Dropout(params.get("dropout", 0.1))(x)
outputs = Dense(2, activation="softmax")(x)
outputs = Dense(2, activation="softmax", dtype="float32")(x)
model = Model(inputs, outputs)
model.compile(
+10 -1
View File
@@ -20,6 +20,14 @@ class XGBoostModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Optional GPU acceleration
use_gpu = bool(params.get("use_gpu", False))
default_tree_method = "gpu_hist" if use_gpu else "hist"
tree_method = params.get("tree_method", default_tree_method)
predictor = params.get(
"predictor", "gpu_predictor" if tree_method.startswith("gpu") else "auto"
)
# Histogram-based trees and parallelism provide fast training; default
# logloss metric suits binary classification of gender.
return xgb.XGBClassifier(
@@ -31,7 +39,8 @@ class XGBoostModel(TraditionalModel):
random_state=self.config.random_seed,
eval_metric="logloss",
n_jobs=params.get("n_jobs", -1),
tree_method=params.get("tree_method", "hist"),
tree_method=tree_method,
predictor=predictor,
verbosity=2,
)
+85
View File
@@ -30,6 +30,38 @@ class NeuralNetworkModel(BaseModel):
"""Fit the neural network model with deferred building"""
logging.info(f"Training {self.__class__.__name__}")
# Best-effort GPU configuration for TensorFlow when available
# - Enables memory growth to avoid pre-allocating all VRAM
# - Optionally enables mixed precision if requested via model params
try:
import tensorflow as tf # Imported lazily to avoid dependency for non-NN runs
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
gpus = tf.config.list_physical_devices("GPU")
if gpus:
for gpu in gpus:
try:
tf.config.experimental.set_memory_growth(gpu, True)
except Exception:
pass
if enable_mixed:
try:
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy("mixed_float16")
logging.info("Enabled TensorFlow mixed precision (float16)")
except Exception as e:
logging.warning(f"Could not enable mixed precision: {e}")
else:
if requested_gpu:
logging.warning("Requested GPU but no TensorFlow GPU device is available.")
except Exception as e:
# Keep silent in non-TF environments / non-NN workflows
logging.debug(f"TensorFlow GPU setup skipped: {e}")
# Setup feature extraction
if self.feature_extractor is None:
self.feature_extractor = FeatureExtractor(
@@ -105,6 +137,32 @@ class NeuralNetworkModel(BaseModel):
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> dict[str, np.floating[Any]]:
# Ensure TF GPU/mixed-precision config also applies to CV runs
try:
import tensorflow as tf
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
gpus = tf.config.list_physical_devices("GPU")
if gpus:
for gpu in gpus:
try:
tf.config.experimental.set_memory_growth(gpu, True)
except Exception:
pass
if enable_mixed:
try:
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy("mixed_float16")
except Exception:
pass
else:
if requested_gpu:
logging.warning("Requested GPU for CV but none is available.")
except Exception:
pass
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
y_encoded = self.label_encoder.transform(y)
@@ -165,6 +223,33 @@ class NeuralNetworkModel(BaseModel):
"""Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}")
# Ensure TF GPU/mixed-precision config also applies here
try:
import tensorflow as tf
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
gpus = tf.config.list_physical_devices("GPU")
if gpus:
for gpu in gpus:
try:
tf.config.experimental.set_memory_growth(gpu, True)
except Exception:
pass
if enable_mixed:
try:
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy("mixed_float16")
except Exception:
pass
else:
if requested_gpu:
logging.warning("Requested GPU for learning curve but none is available.")
except Exception:
pass
if train_sizes is None:
train_sizes = [0.1, 0.3, 0.5, 0.7, 1.0]
+1 -1
View File
@@ -1 +1 @@
LETTERS = 'abcdefghijklmnopqrstuvwxyz'
LETTERS = "abcdefghijklmnopqrstuvwxyz"
+5 -9
View File
@@ -8,11 +8,7 @@ from research.statistics.utils import LETTERS, build_letter_frequencies
def plot_transition_matrix(ax, df_probs, title=""):
hm = sns.heatmap(
df_probs.loc[list(LETTERS), list(LETTERS)],
cmap="Reds",
annot=False,
cbar=False,
ax=ax
df_probs.loc[list(LETTERS), list(LETTERS)], cmap="Reds", annot=False, cbar=False, ax=ax
)
ax.set_title(title, fontsize=12)
return hm
@@ -20,8 +16,8 @@ def plot_transition_matrix(ax, df_probs, title=""):
def plot_letter_frequencies(males, females, sort_values=False, title=None):
# Compute frequencies
L_m = build_letter_frequencies(males['name']).set_index("letter")["freq"]
L_f = build_letter_frequencies(females['name']).set_index("letter")["freq"]
L_m = build_letter_frequencies(males["name"]).set_index("letter")["freq"]
L_f = build_letter_frequencies(females["name"]).set_index("letter")["freq"]
# Combine into one DataFrame
df_plot = pd.DataFrame({"Male": L_m, "Female": L_f}).fillna(0).reset_index()
@@ -35,8 +31,8 @@ def plot_letter_frequencies(males, females, sort_values=False, title=None):
x = np.arange(len(df_plot))
w = 0.4
fig, ax = plt.subplots(figsize=(16, 6))
ax.bar(x - w/2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8)
ax.bar(x + w/2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8)
ax.bar(x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8)
ax.bar(x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8)
ax.set_xticks(x)
ax.set_xticklabels(df_plot["letter"])
+60 -65
View File
@@ -9,9 +9,10 @@ from scipy.spatial.distance import euclidean
from scipy.stats import entropy
from typing import Dict, Any
LETTERS = 'abcdefghijklmnopqrstuvwxyz'
START_TOKEN = '^'
END_TOKEN = '$'
LETTERS = "abcdefghijklmnopqrstuvwxyz"
START_TOKEN = "^"
END_TOKEN = "$"
def normalize_letters(s):
"""Normalize accents -> ascii, lowercase, keep only a-z."""
@@ -27,41 +28,28 @@ def build_category_distribution(df: pd.DataFrame) -> pd.DataFrame:
return (
df.groupby("province")["identified_category"]
.value_counts(normalize=True) # get proportions
.unstack(fill_value=0) # reshape into columns per word count
.unstack(fill_value=0) # reshape into columns per word count
)
def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
# Normalize + split once (vectorized)
s = df[source].fillna('').astype(str)
s = (
s.str.lower()
.str.replace(r"[^\w'\-]+", " ", regex=True)
.str.strip()
.str.split()
)
s = df[source].fillna("").astype(str)
s = s.str.lower().str.replace(r"[^\w'\-]+", " ", regex=True).str.strip().str.split()
# Explode the token list into rows under `target`
out = (
df.assign(**{target: s})
.explode(target, ignore_index=True)
)
out = df.assign(**{target: s}).explode(target, ignore_index=True)
# Drop NA/empty tokens and strip whitespace
out[target] = out[target].astype(str).str.strip()
out = out[out[target].ne('')].dropna(subset=[target]).reset_index(drop=True)
out = out[out[target].ne("")].dropna(subset=[target]).reset_index(drop=True)
return out
def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
# Normalize: lowercase, remove non-letters, concatenate all into one string
s = (
series.astype(str)
.str.lower()
.str.replace(r'[^a-z]', '', regex=True)
.str.cat(sep='')
)
s = series.astype(str).str.lower().str.replace(r"[^a-z]", "", regex=True).str.cat(sep="")
# Convert string into Series of characters
chars = pd.Series(list(s))
@@ -82,11 +70,7 @@ def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
def build_transition_probabilities(names: pd.Series, alpha: float = 0.0) -> dict:
# 1) Normalize
names = (
names.astype(str)
.str.lower()
.str.replace(fr"[^{LETTERS}]", "", regex=True)
)
names = names.astype(str).str.lower().str.replace(rf"[^{LETTERS}]", "", regex=True)
names = names[names.str.len() > 0]
# 2) Prepare sequences
@@ -130,7 +114,7 @@ def build_transition_probabilities(names: pd.Series, alpha: float = 0.0) -> dict
# 11) DataFrames
df_counts = pd.DataFrame(counts, index=tokens, columns=tokens)
df_probs = pd.DataFrame(probs, index=tokens, columns=tokens)
df_probs = pd.DataFrame(probs, index=tokens, columns=tokens)
return {
"tokens": tokens,
@@ -142,7 +126,11 @@ def build_transition_probabilities(names: pd.Series, alpha: float = 0.0) -> dict
}
def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_transitions: Dict[str, Any], n_permutations: int = 1000) -> pd.DataFrame:
def build_transition_comparisons(
names_transitions: Dict[str, Any],
surnames_transitions: Dict[str, Any],
n_permutations: int = 1000,
) -> pd.DataFrame:
"""
Compares letter transition probability matrices for names and surnames using
various distance metrics and a permutation test for statistical significance.
@@ -150,23 +138,20 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra
# Helper function to flatten the male/female probability matrices (the 1e-12 smoothing is added later, at the entropy calls)
def prepare_data(data):
return {
'm': data['m']['probs'].flatten(),
'f': data['f']['probs'].flatten()
}
return {"m": data["m"]["probs"].flatten(), "f": data["f"]["probs"].flatten()}
prepared_names = prepare_data(names_transitions)
prepared_surnames = prepare_data(surnames_transitions)
# Distance Metrics
names_l2 = euclidean(prepared_names['m'], prepared_names['f'])
surnames_l2 = euclidean(prepared_surnames['m'], prepared_surnames['f'])
names_l2 = euclidean(prepared_names["m"], prepared_names["f"])
surnames_l2 = euclidean(prepared_surnames["m"], prepared_surnames["f"])
kl_names_mf = entropy(prepared_names['m'] + 1e-12, prepared_names['f'] + 1e-12)
kl_names_fm = entropy(prepared_names['f'] + 1e-12, prepared_names['m'] + 1e-12)
kl_names_mf = entropy(prepared_names["m"] + 1e-12, prepared_names["f"] + 1e-12)
kl_names_fm = entropy(prepared_names["f"] + 1e-12, prepared_names["m"] + 1e-12)
kl_surnames_mf = entropy(prepared_surnames['m'] + 1e-12, prepared_surnames['f'] + 1e-12)
kl_surnames_fm = entropy(prepared_surnames['f'] + 1e-12, prepared_surnames['m'] + 1e-12)
kl_surnames_mf = entropy(prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12)
kl_surnames_fm = entropy(prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12)
jsd_names = 0.5 * (kl_names_mf + kl_names_fm)
jsd_surnames = 0.5 * (kl_surnames_mf + kl_surnames_fm)
@@ -174,15 +159,15 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra
# Permutation Test
def run_permutation_test(transitions):
# Flattened probabilities for male and female
P_m = transitions['m']['probs'].flatten()
P_f = transitions['f']['probs'].flatten()
P_m = transitions["m"]["probs"].flatten()
P_f = transitions["f"]["probs"].flatten()
# Calculate the observed JSD (our test statistic)
observed_jsd = 0.5 * (entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12))
# Concatenate male and female counts
counts_m = transitions['m']['counts']
counts_f = transitions['f']['counts']
counts_m = transitions["m"]["counts"]
counts_f = transitions["f"]["counts"]
all_counts = np.concatenate((counts_m, counts_f), axis=1)
total_counts = counts_m.shape[1] + counts_f.shape[1]
@@ -194,17 +179,27 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra
# Note: This is a simplified approach, assuming counts are
# structured per name. A more robust implementation would
# shuffle the actual names themselves.
permuted_counts_m = all_counts[:, shuffled_indices[:counts_m.shape[1]]]
permuted_counts_f = all_counts[:, shuffled_indices[counts_m.shape[1]:]]
permuted_counts_m = all_counts[:, shuffled_indices[: counts_m.shape[1]]]
permuted_counts_f = all_counts[:, shuffled_indices[counts_m.shape[1] :]]
# Re-calculate probabilities and JSD for the permuted groups
# Add a small epsilon to the denominator to prevent division by zero
epsilon = 1e-12
permuted_probs_m = permuted_counts_m / (permuted_counts_m.sum(axis=0, keepdims=True) + epsilon)
permuted_probs_f = permuted_counts_f / (permuted_counts_f.sum(axis=0, keepdims=True) + epsilon)
permuted_probs_m = permuted_counts_m / (
permuted_counts_m.sum(axis=0, keepdims=True) + epsilon
)
permuted_probs_f = permuted_counts_f / (
permuted_counts_f.sum(axis=0, keepdims=True) + epsilon
)
permuted_jsd = 0.5 * (entropy(permuted_probs_m.mean(axis=1) + 1e-12, permuted_probs_f.mean(axis=1) + 1e-12) +
entropy(permuted_probs_f.mean(axis=1) + 1e-12, permuted_probs_m.mean(axis=1) + 1e-12))
permuted_jsd = 0.5 * (
entropy(
permuted_probs_m.mean(axis=1) + 1e-12, permuted_probs_f.mean(axis=1) + 1e-12
)
+ entropy(
permuted_probs_f.mean(axis=1) + 1e-12, permuted_probs_m.mean(axis=1) + 1e-12
)
)
permuted_jsds.append(permuted_jsd)
# Calculate the p-value
@@ -214,39 +209,39 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra
names_p_value = run_permutation_test(names_transitions)
surnames_p_value = run_permutation_test(surnames_transitions)
out = pd.DataFrame({
"l2": [names_l2, surnames_l2],
"kl_mf": [kl_names_mf, kl_surnames_mf],
"kl_fm": [kl_names_fm, kl_surnames_fm],
"jsd": [jsd_names, jsd_surnames],
"permutation_p_value": [names_p_value, surnames_p_value]
}, index=["names", "surnames"])
out = pd.DataFrame(
{
"l2": [names_l2, surnames_l2],
"kl_mf": [kl_names_mf, kl_surnames_mf],
"kl_fm": [kl_names_fm, kl_surnames_fm],
"jsd": [jsd_names, jsd_surnames],
"permutation_p_value": [names_p_value, surnames_p_value],
},
index=["names", "surnames"],
)
return out
import pandas as pd
from collections import Counter
from typing import Literal
def build_ngrams_count(
df: pd.DataFrame,
n: int,
where: Literal["any", "prefix", "suffix"] = "any",
df: pd.DataFrame,
n: int,
where: Literal["any", "prefix", "suffix"] = "any",
) -> pd.DataFrame:
# Normalize: lowercase and keep only the letters a-z
names = (
df["name"].astype(str)
.str.lower()
.str.replace(r"[^a-z]", "", regex=True)
)
names = df["name"].astype(str).str.lower().str.replace(r"[^a-z]", "", regex=True)
ngrams = []
if where == "any":
for s in names:
L = len(s)
if L >= n:
ngrams.extend(s[i:i+n] for i in range(L - n + 1))
ngrams.extend(s[i : i + n] for i in range(L - n + 1))
elif where == "prefix":
for s in names:
if len(s) >= n: