feat: support gpu
This commit is contained in:
@@ -166,6 +166,57 @@ experiments and make predictions without needing to understand the underlying co
|
|||||||
streamlit run web/app.py
|
streamlit run web/app.py
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## GPU Acceleration
|
||||||
|
|
||||||
|
This project can leverage GPUs for faster training when supported libraries and hardware are available.
|
||||||
|
|
||||||
|
- TensorFlow/Keras models (BiGRU, LSTM, CNN, Transformer)
|
||||||
|
- Uses GPU automatically if a TensorFlow GPU build is installed.
|
||||||
|
- The code enables safe GPU memory growth by default; optionally enable mixed precision for additional speed:
|
||||||
|
- Add `mixed_precision: true` in the experiment `model_params` (e.g., in `config/research_templates.yaml`).
|
||||||
|
- The final layer outputs are set to float32 for numerical stability under mixed precision.
|
||||||
|
|
||||||
|
- spaCy NER
|
||||||
|
- Automatically prefers GPU if available; otherwise falls back to CPU.
|
||||||
|
- Ensure a compatible CUDA-enabled spaCy/thinc stack is installed to use GPU.
|
||||||
|
|
||||||
|
- XGBoost
|
||||||
|
- Enable GPU by adding to the experiment `model_params`:
|
||||||
|
- `use_gpu: true` (sets `tree_method: gpu_hist` and `predictor: gpu_predictor`).
|
||||||
|
|
||||||
|
- LightGBM
|
||||||
|
- Enable GPU by adding to the experiment `model_params`:
|
||||||
|
- `use_gpu: true` (sets `device: gpu`). Optional: `gpu_platform_id`, `gpu_device_id`.
|
||||||
|
|
||||||
|
Example template snippet (GPU on):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- name: "lstm_gpu"
|
||||||
|
description: "LSTM with GPU + mixed precision"
|
||||||
|
model_type: "lstm"
|
||||||
|
features: ["full_name"]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 128
|
||||||
|
lstm_units: 64
|
||||||
|
epochs: 5
|
||||||
|
batch_size: 128
|
||||||
|
use_gpu: true
|
||||||
|
mixed_precision: true
|
||||||
|
tags: ["gpu", "mixed_precision"]
|
||||||
|
|
||||||
|
- name: "xgboost_gpu"
|
||||||
|
description: "XGBoost with GPU"
|
||||||
|
model_type: "xgboost"
|
||||||
|
features: ["full_name"]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 200
|
||||||
|
use_gpu: true
|
||||||
|
```
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Install CUDA-enabled binaries for TensorFlow/spaCy/LightGBM/XGBoost to actually use the GPU.
|
||||||
|
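- For TensorFlow models, if GPU is requested but no GPU device is available, training will proceed on CPU with a warning. For XGBoost and LightGBM, `use_gpu: true` is passed straight to the library, so a GPU-enabled build is required.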
|
||||||
|
|
||||||
## Contributors
|
## Contributors
|
||||||
|
|
||||||
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
|
<a href="https://github.com/bernard-ng/drc-ners-nlp/graphs/contributors" title="show all contributors">
|
||||||
|
|||||||
@@ -0,0 +1,412 @@
|
|||||||
|
baseline_experiments:
|
||||||
|
# BiGRU Models (GPU-enabled)
|
||||||
|
- name: "bigru"
|
||||||
|
description: "Baseline BiGRU with full name features (GPU)"
|
||||||
|
model_type: "bigru"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 64
|
||||||
|
gru_units: 32
|
||||||
|
epochs: 2
|
||||||
|
batch_size: 32
|
||||||
|
use_gpu: true
|
||||||
|
mixed_precision: true
|
||||||
|
tags: [ "baseline", "neural", "bigru", "gpu" ]
|
||||||
|
|
||||||
|
- name: "bigru_native"
|
||||||
|
description: "Baseline BiGRU with native name features (GPU)"
|
||||||
|
model_type: "bigru"
|
||||||
|
features: [ "native_name" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 64
|
||||||
|
gru_units: 32
|
||||||
|
epochs: 2
|
||||||
|
batch_size: 32
|
||||||
|
use_gpu: true
|
||||||
|
mixed_precision: true
|
||||||
|
tags: [ "baseline", "neural", "bigru", "native", "gpu" ]
|
||||||
|
|
||||||
|
- name: "bigru_surname"
|
||||||
|
description: "Baseline BiGRU with surname features (GPU)"
|
||||||
|
model_type: "bigru"
|
||||||
|
features: [ "surname" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 64
|
||||||
|
gru_units: 32
|
||||||
|
epochs: 2
|
||||||
|
batch_size: 32
|
||||||
|
use_gpu: true
|
||||||
|
mixed_precision: true
|
||||||
|
tags: [ "baseline", "neural", "bigru", "surname", "gpu" ]
|
||||||
|
|
||||||
|
# CNN Models (GPU-enabled)
|
||||||
|
- name: "cnn"
|
||||||
|
description: "Baseline CNN with character patterns (GPU)"
|
||||||
|
model_type: "cnn"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 64
|
||||||
|
filters: 64
|
||||||
|
kernel_size: 3
|
||||||
|
dropout: 0.5
|
||||||
|
epochs: 2
|
||||||
|
batch_size: 32
|
||||||
|
use_gpu: true
|
||||||
|
mixed_precision: true
|
||||||
|
tags: [ "baseline", "neural", "cnn", "gpu" ]
|
||||||
|
|
||||||
|
- name: "cnn_native"
|
||||||
|
description: "Baseline CNN with native name character patterns (GPU)"
|
||||||
|
model_type: "cnn"
|
||||||
|
features: [ "native_name" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 64
|
||||||
|
filters: 64
|
||||||
|
kernel_size: 3
|
||||||
|
dropout: 0.5
|
||||||
|
epochs: 2
|
||||||
|
batch_size: 32
|
||||||
|
use_gpu: true
|
||||||
|
mixed_precision: true
|
||||||
|
tags: [ "baseline", "neural", "cnn", "native", "gpu" ]
|
||||||
|
|
||||||
|
- name: "cnn_surname"
|
||||||
|
description: "Baseline CNN with surname character patterns (GPU)"
|
||||||
|
model_type: "cnn"
|
||||||
|
features: [ "surname" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 64
|
||||||
|
filters: 64
|
||||||
|
kernel_size: 3
|
||||||
|
dropout: 0.5
|
||||||
|
epochs: 2
|
||||||
|
batch_size: 32
|
||||||
|
use_gpu: true
|
||||||
|
mixed_precision: true
|
||||||
|
tags: [ "baseline", "neural", "cnn", "surname", "gpu" ]
|
||||||
|
|
||||||
|
# Ensemble Models (CPU)
|
||||||
|
- name: "ensemble"
|
||||||
|
description: "Baseline Ensemble with multiple models"
|
||||||
|
model_type: "ensemble"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
base_models: [ "logistic_regression", "random_forest", "xgboost" ]
|
||||||
|
voting: "soft"
|
||||||
|
cv_folds: 5
|
||||||
|
tags: [ "baseline", "ensemble" ]
|
||||||
|
|
||||||
|
- name: "ensemble_native"
|
||||||
|
description: "Baseline Ensemble with native name"
|
||||||
|
model_type: "ensemble"
|
||||||
|
features: [ "native_name" ]
|
||||||
|
model_params:
|
||||||
|
base_models: [ "logistic_regression", "random_forest", "xgboost" ]
|
||||||
|
voting: "soft"
|
||||||
|
cv_folds: 5
|
||||||
|
tags: [ "baseline", "ensemble", "native" ]
|
||||||
|
|
||||||
|
- name: "ensemble_surname"
|
||||||
|
description: "Baseline Ensemble with surname"
|
||||||
|
model_type: "ensemble"
|
||||||
|
features: [ "surname" ]
|
||||||
|
model_params:
|
||||||
|
base_models: [ "logistic_regression", "random_forest", "xgboost" ]
|
||||||
|
voting: "soft"
|
||||||
|
cv_folds: 5
|
||||||
|
tags: [ "baseline", "ensemble", "surname" ]
|
||||||
|
|
||||||
|
# LightGBM Models (GPU-enabled)
|
||||||
|
- name: "lightgbm"
|
||||||
|
description: "Baseline LightGBM with engineered features (GPU)"
|
||||||
|
model_type: "lightgbm"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 100
|
||||||
|
max_depth: -1
|
||||||
|
learning_rate: 0.1
|
||||||
|
num_leaves: 31
|
||||||
|
subsample: 0.8
|
||||||
|
colsample_bytree: 0.8
|
||||||
|
use_gpu: true
|
||||||
|
tags: [ "baseline", "lightgbm", "gpu" ]
|
||||||
|
|
||||||
|
- name: "lightgbm_native"
|
||||||
|
description: "Baseline LightGBM with native name features (GPU)"
|
||||||
|
model_type: "lightgbm"
|
||||||
|
features: [ "native_name" ]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 100
|
||||||
|
max_depth: -1
|
||||||
|
learning_rate: 0.1
|
||||||
|
num_leaves: 31
|
||||||
|
subsample: 0.8
|
||||||
|
colsample_bytree: 0.8
|
||||||
|
use_gpu: true
|
||||||
|
tags: [ "baseline", "lightgbm", "native", "gpu" ]
|
||||||
|
|
||||||
|
- name: "lightgbm_surname"
|
||||||
|
description: "Baseline LightGBM with surname features (GPU)"
|
||||||
|
model_type: "lightgbm"
|
||||||
|
features: [ "surname" ]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 100
|
||||||
|
max_depth: -1
|
||||||
|
learning_rate: 0.1
|
||||||
|
num_leaves: 31
|
||||||
|
subsample: 0.8
|
||||||
|
colsample_bytree: 0.8
|
||||||
|
use_gpu: true
|
||||||
|
tags: [ "baseline", "lightgbm", "surname", "gpu" ]
|
||||||
|
|
||||||
|
# Logistic Regression Models (CPU)
|
||||||
|
- name: "logistic_regression"
|
||||||
|
description: "Baseline logistic regression with full name"
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
max_features: 10000
|
||||||
|
tags: [ "baseline", "logistic_regression", "fullname" ]
|
||||||
|
|
||||||
|
- name: "logistic_regression_native"
|
||||||
|
description: "Logistic regression with native name only"
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: [ "native_name" ]
|
||||||
|
model_params:
|
||||||
|
max_features: 5000
|
||||||
|
tags: [ "baseline", "logistic_regression", "native" ]
|
||||||
|
|
||||||
|
- name: "logistic_regression_surname"
|
||||||
|
description: "Logistic regression with surname only"
|
||||||
|
model_type: "logistic_regression"
|
||||||
|
features: [ "surname" ]
|
||||||
|
model_params:
|
||||||
|
max_features: 5000
|
||||||
|
tags: [ "baseline", "logistic_regression", "surname" ]
|
||||||
|
|
||||||
|
# LSTM Models (GPU-enabled)
|
||||||
|
- name: "lstm"
|
||||||
|
description: "Baseline LSTM with full name features (GPU)"
|
||||||
|
model_type: "lstm"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 128
|
||||||
|
lstm_units: 64
|
||||||
|
epochs: 2
|
||||||
|
batch_size: 64
|
||||||
|
use_gpu: true
|
||||||
|
mixed_precision: true
|
||||||
|
tags: [ "baseline", "neural", "lstm", "gpu" ]
|
||||||
|
|
||||||
|
- name: "lstm_native"
|
||||||
|
description: "Baseline LSTM with native name features (GPU)"
|
||||||
|
model_type: "lstm"
|
||||||
|
features: [ "native_name" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 128
|
||||||
|
lstm_units: 64
|
||||||
|
epochs: 2
|
||||||
|
batch_size: 64
|
||||||
|
use_gpu: true
|
||||||
|
mixed_precision: true
|
||||||
|
tags: [ "baseline", "neural", "lstm", "native", "gpu" ]
|
||||||
|
|
||||||
|
- name: "lstm_surname"
|
||||||
|
description: "Baseline LSTM with surname features (GPU)"
|
||||||
|
model_type: "lstm"
|
||||||
|
features: [ "surname" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 128
|
||||||
|
lstm_units: 64
|
||||||
|
epochs: 2
|
||||||
|
batch_size: 64
|
||||||
|
use_gpu: true
|
||||||
|
mixed_precision: true
|
||||||
|
tags: [ "baseline", "neural", "lstm", "surname", "gpu" ]
|
||||||
|
|
||||||
|
# Naive Bayes Models (CPU)
|
||||||
|
- name: "naive_bayes"
|
||||||
|
description: "Baseline Naive Bayes with full name features"
|
||||||
|
model_type: "naive_bayes"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
max_features: 5000
|
||||||
|
tags: [ "baseline", "naive_bayes" ]
|
||||||
|
|
||||||
|
- name: "naive_bayes_native"
|
||||||
|
description: "Baseline Naive Bayes with native name features"
|
||||||
|
model_type: "naive_bayes"
|
||||||
|
features: [ "native_name" ]
|
||||||
|
model_params:
|
||||||
|
max_features: 5000
|
||||||
|
tags: [ "baseline", "naive_bayes", "native" ]
|
||||||
|
|
||||||
|
- name: "naive_bayes_surname"
|
||||||
|
description: "Baseline Naive Bayes with surname features"
|
||||||
|
model_type: "naive_bayes"
|
||||||
|
features: [ "surname" ]
|
||||||
|
model_params:
|
||||||
|
max_features: 5000
|
||||||
|
tags: [ "baseline", "naive_bayes", "surname" ]
|
||||||
|
|
||||||
|
# Random Forest Models (CPU)
|
||||||
|
- name: "random_forest"
|
||||||
|
description: "Baseline Random Forest with engineered features"
|
||||||
|
model_type: "random_forest"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 100
|
||||||
|
max_depth: 10
|
||||||
|
min_samples_split: 2
|
||||||
|
min_samples_leaf: 1
|
||||||
|
tags: [ "baseline", "random_forest", "engineered" ]
|
||||||
|
|
||||||
|
- name: "random_forest_native"
|
||||||
|
description: "Baseline Random Forest with native name engineered features"
|
||||||
|
model_type: "random_forest"
|
||||||
|
features: [ "native_name" ]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 100
|
||||||
|
max_depth: 10
|
||||||
|
min_samples_split: 2
|
||||||
|
min_samples_leaf: 1
|
||||||
|
tags: [ "baseline", "random_forest", "engineered", "native" ]
|
||||||
|
|
||||||
|
- name: "random_forest_surname"
|
||||||
|
description: "Baseline Random Forest with surname engineered features"
|
||||||
|
model_type: "random_forest"
|
||||||
|
features: [ "surname" ]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 100
|
||||||
|
max_depth: 10
|
||||||
|
min_samples_split: 2
|
||||||
|
min_samples_leaf: 1
|
||||||
|
tags: [ "baseline", "random_forest", "engineered", "surname" ]
|
||||||
|
|
||||||
|
# SVM Models (CPU)
|
||||||
|
- name: "svm"
|
||||||
|
description: "Baseline SVM with full name features"
|
||||||
|
model_type: "svm"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
C: 1.0
|
||||||
|
kernel: "rbf"
|
||||||
|
ngram_range: [ 2, 4 ]
|
||||||
|
max_features: 5000
|
||||||
|
tags: [ "baseline", "svm" ]
|
||||||
|
|
||||||
|
- name: "svm_native"
|
||||||
|
description: "Baseline SVM with native name features"
|
||||||
|
model_type: "svm"
|
||||||
|
features: [ "native_name" ]
|
||||||
|
model_params:
|
||||||
|
C: 1.0
|
||||||
|
kernel: "rbf"
|
||||||
|
ngram_range: [ 2, 4 ]
|
||||||
|
max_features: 5000
|
||||||
|
tags: [ "baseline", "svm", "native" ]
|
||||||
|
|
||||||
|
- name: "svm_surname"
|
||||||
|
description: "Baseline SVM with surname features"
|
||||||
|
model_type: "svm"
|
||||||
|
features: [ "surname" ]
|
||||||
|
model_params:
|
||||||
|
C: 1.0
|
||||||
|
kernel: "rbf"
|
||||||
|
ngram_range: [ 2, 4 ]
|
||||||
|
max_features: 5000
|
||||||
|
tags: [ "baseline", "svm", "surname" ]
|
||||||
|
|
||||||
|
# Transformer Models (GPU-enabled)
|
||||||
|
- name: "transformer"
|
||||||
|
description: "Baseline Transformer with attention mechanism (GPU)"
|
||||||
|
model_type: "transformer"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 128
|
||||||
|
num_heads: 4
|
||||||
|
num_layers: 2
|
||||||
|
epochs: 2
|
||||||
|
batch_size: 64
|
||||||
|
use_gpu: true
|
||||||
|
mixed_precision: true
|
||||||
|
tags: [ "baseline", "neural", "transformer", "gpu" ]
|
||||||
|
|
||||||
|
- name: "transformer_native"
|
||||||
|
description: "Baseline Transformer with native name attention mechanism (GPU)"
|
||||||
|
model_type: "transformer"
|
||||||
|
features: [ "native_name" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 128
|
||||||
|
num_heads: 4
|
||||||
|
num_layers: 2
|
||||||
|
epochs: 2
|
||||||
|
batch_size: 64
|
||||||
|
use_gpu: true
|
||||||
|
mixed_precision: true
|
||||||
|
tags: [ "baseline", "neural", "transformer", "native", "gpu" ]
|
||||||
|
|
||||||
|
- name: "transformer_surname"
|
||||||
|
description: "Baseline Transformer with surname attention mechanism (GPU)"
|
||||||
|
model_type: "transformer"
|
||||||
|
features: [ "surname" ]
|
||||||
|
model_params:
|
||||||
|
embedding_dim: 128
|
||||||
|
num_heads: 4
|
||||||
|
num_layers: 2
|
||||||
|
epochs: 2
|
||||||
|
batch_size: 64
|
||||||
|
use_gpu: true
|
||||||
|
mixed_precision: true
|
||||||
|
tags: [ "baseline", "neural", "transformer", "surname", "gpu" ]
|
||||||
|
|
||||||
|
# XGBoost Models (GPU-enabled)
|
||||||
|
- name: "xgboost"
|
||||||
|
description: "Baseline XGBoost with engineered features (GPU)"
|
||||||
|
model_type: "xgboost"
|
||||||
|
features: [ "full_name" ]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 100
|
||||||
|
max_depth: 6
|
||||||
|
learning_rate: 0.1
|
||||||
|
subsample: 0.8
|
||||||
|
colsample_bytree: 0.8
|
||||||
|
use_gpu: true
|
||||||
|
tags: [ "baseline", "xgboost", "gpu" ]
|
||||||
|
|
||||||
|
- name: "xgboost_native"
|
||||||
|
description: "Baseline XGBoost with native name engineered features (GPU)"
|
||||||
|
model_type: "xgboost"
|
||||||
|
features: [ "native_name" ]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 100
|
||||||
|
max_depth: 6
|
||||||
|
learning_rate: 0.1
|
||||||
|
subsample: 0.8
|
||||||
|
colsample_bytree: 0.8
|
||||||
|
use_gpu: true
|
||||||
|
tags: [ "baseline", "xgboost", "native", "gpu" ]
|
||||||
|
|
||||||
|
- name: "xgboost_surname"
|
||||||
|
description: "Baseline XGBoost with surname engineered features (GPU)"
|
||||||
|
model_type: "xgboost"
|
||||||
|
features: [ "surname" ]
|
||||||
|
model_params:
|
||||||
|
n_estimators: 100
|
||||||
|
max_depth: 6
|
||||||
|
learning_rate: 0.1
|
||||||
|
subsample: 0.8
|
||||||
|
colsample_bytree: 0.8
|
||||||
|
use_gpu: true
|
||||||
|
tags: [ "baseline", "xgboost", "surname", "gpu" ]
|
||||||
|
|
||||||
|
|
||||||
|
# Advanced Experiments Configuration
|
||||||
|
advanced_experiments:
|
||||||
|
|
||||||
|
# Feature Study Configurations
|
||||||
|
feature_studies:
|
||||||
|
|
||||||
|
# Hyperparameter Tuning Configurations
|
||||||
|
hyperparameter_tuning:
|
||||||
|
|
||||||
@@ -19,9 +19,15 @@ class RegionMapper:
|
|||||||
return (
|
return (
|
||||||
series.str.upper()
|
series.str.upper()
|
||||||
.str.strip()
|
.str.strip()
|
||||||
.apply(lambda x: unicodedata.normalize("NFKD", x)
|
.apply(
|
||||||
.encode("ascii", errors="ignore")
|
lambda x: (
|
||||||
.decode("utf-8") if isinstance(x, str) else x)
|
unicodedata.normalize("NFKD", x)
|
||||||
|
.encode("ascii", errors="ignore")
|
||||||
|
.decode("utf-8")
|
||||||
|
if isinstance(x, str)
|
||||||
|
else x
|
||||||
|
)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|||||||
@@ -29,6 +29,15 @@ class NameModel:
|
|||||||
"""Create a blank spaCy model with NER pipeline"""
|
"""Create a blank spaCy model with NER pipeline"""
|
||||||
logging.info(f"Creating blank {language} model for NER training")
|
logging.info(f"Creating blank {language} model for NER training")
|
||||||
|
|
||||||
|
# Prefer GPU for spaCy if available (falls back to CPU automatically)
|
||||||
|
try:
|
||||||
|
if spacy.prefer_gpu():
|
||||||
|
logging.info("spaCy GPU enabled (cupy) for NER training")
|
||||||
|
else:
|
||||||
|
logging.info("spaCy running on CPU")
|
||||||
|
except Exception as e:
|
||||||
|
logging.debug(f"spaCy GPU selection skipped: {e}")
|
||||||
|
|
||||||
# Create blank model - French tokenizer works well for DRC names
|
# Create blank model - French tokenizer works well for DRC names
|
||||||
self.nlp = spacy.blank(language)
|
self.nlp = spacy.blank(language)
|
||||||
|
|
||||||
|
|||||||
@@ -20,11 +20,15 @@ class DataSelectionStep(PipelineStep):
|
|||||||
# Remove rows where region == "global" only for specific years
|
# Remove rows where region == "global" only for specific years
|
||||||
if "region" in batch.columns and "year" in batch.columns:
|
if "region" in batch.columns and "year" in batch.columns:
|
||||||
target_years = {2015, 2021, 2022}
|
target_years = {2015, 2021, 2022}
|
||||||
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(target_years)
|
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(
|
||||||
|
target_years
|
||||||
|
)
|
||||||
removed = int(mask_remove.sum())
|
removed = int(mask_remove.sum())
|
||||||
if removed:
|
if removed:
|
||||||
batch = batch[~mask_remove]
|
batch = batch[~mask_remove]
|
||||||
logging.info(f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}")
|
logging.info(
|
||||||
|
f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}"
|
||||||
|
)
|
||||||
|
|
||||||
# Check which columns exist in the batch
|
# Check which columns exist in the batch
|
||||||
available_columns = [col for col in self.selected_columns if col in batch.columns]
|
available_columns = [col for col in self.selected_columns if col in batch.columns]
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ class BiGRUModel(NeuralNetworkModel):
|
|||||||
Dense(64, activation="relu"),
|
Dense(64, activation="relu"),
|
||||||
Dropout(params.get("dropout", 0.5)),
|
Dropout(params.get("dropout", 0.5)),
|
||||||
# Two-way softmax for binary gender classification.
|
# Two-way softmax for binary gender classification.
|
||||||
Dense(2, activation="softmax"),
|
Dense(2, activation="softmax", dtype="float32"),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ class CNNModel(NeuralNetworkModel):
|
|||||||
Dense(64, activation="relu"),
|
Dense(64, activation="relu"),
|
||||||
Dropout(params.get("dropout", 0.5)),
|
Dropout(params.get("dropout", 0.5)),
|
||||||
# Two-way softmax for binary classification.
|
# Two-way softmax for binary classification.
|
||||||
Dense(2, activation="softmax"),
|
Dense(2, activation="softmax", dtype="float32"),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -20,6 +20,12 @@ class LightGBMModel(TraditionalModel):
|
|||||||
def build_model(self) -> BaseEstimator:
|
def build_model(self) -> BaseEstimator:
|
||||||
params = self.config.model_params
|
params = self.config.model_params
|
||||||
|
|
||||||
|
# Optional GPU acceleration
|
||||||
|
use_gpu = bool(params.get("use_gpu", False))
|
||||||
|
device = params.get("device", "gpu" if use_gpu else "cpu")
|
||||||
|
gpu_platform_id = params.get("gpu_platform_id", None)
|
||||||
|
gpu_device_id = params.get("gpu_device_id", None)
|
||||||
|
|
||||||
# Leaf-wise boosted trees excel on sparse/categorical mixes; binary objective
|
# Leaf-wise boosted trees excel on sparse/categorical mixes; binary objective
|
||||||
# and parallelism improve training speed for this task.
|
# and parallelism improve training speed for this task.
|
||||||
return lgb.LGBMClassifier(
|
return lgb.LGBMClassifier(
|
||||||
@@ -33,6 +39,9 @@ class LightGBMModel(TraditionalModel):
|
|||||||
objective=params.get("objective", "binary"),
|
objective=params.get("objective", "binary"),
|
||||||
n_jobs=params.get("n_jobs", -1),
|
n_jobs=params.get("n_jobs", -1),
|
||||||
verbose=2,
|
verbose=2,
|
||||||
|
device=device,
|
||||||
|
gpu_platform_id=gpu_platform_id,
|
||||||
|
gpu_device_id=gpu_device_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ class LSTMModel(NeuralNetworkModel):
|
|||||||
Dense(64, activation="relu"),
|
Dense(64, activation="relu"),
|
||||||
Dropout(params.get("dropout", 0.5)),
|
Dropout(params.get("dropout", 0.5)),
|
||||||
# Two-way softmax for binary classification.
|
# Two-way softmax for binary classification.
|
||||||
Dense(2, activation="softmax"),
|
Dense(2, activation="softmax", dtype="float32"),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ class TransformerModel(NeuralNetworkModel):
|
|||||||
x = GlobalAveragePooling1D()(x)
|
x = GlobalAveragePooling1D()(x)
|
||||||
x = Dense(32, activation="relu")(x)
|
x = Dense(32, activation="relu")(x)
|
||||||
x = Dropout(params.get("dropout", 0.1))(x)
|
x = Dropout(params.get("dropout", 0.1))(x)
|
||||||
outputs = Dense(2, activation="softmax")(x)
|
outputs = Dense(2, activation="softmax", dtype="float32")(x)
|
||||||
|
|
||||||
model = Model(inputs, outputs)
|
model = Model(inputs, outputs)
|
||||||
model.compile(
|
model.compile(
|
||||||
|
|||||||
@@ -20,6 +20,14 @@ class XGBoostModel(TraditionalModel):
|
|||||||
def build_model(self) -> BaseEstimator:
|
def build_model(self) -> BaseEstimator:
|
||||||
params = self.config.model_params
|
params = self.config.model_params
|
||||||
|
|
||||||
|
# Optional GPU acceleration
|
||||||
|
use_gpu = bool(params.get("use_gpu", False))
|
||||||
|
default_tree_method = "gpu_hist" if use_gpu else "hist"
|
||||||
|
tree_method = params.get("tree_method", default_tree_method)
|
||||||
|
predictor = params.get(
|
||||||
|
"predictor", "gpu_predictor" if tree_method.startswith("gpu") else "auto"
|
||||||
|
)
|
||||||
|
|
||||||
# Histogram-based trees and parallelism provide fast training; default
|
# Histogram-based trees and parallelism provide fast training; default
|
||||||
# logloss metric suits binary classification of gender.
|
# logloss metric suits binary classification of gender.
|
||||||
return xgb.XGBClassifier(
|
return xgb.XGBClassifier(
|
||||||
@@ -31,7 +39,8 @@ class XGBoostModel(TraditionalModel):
|
|||||||
random_state=self.config.random_seed,
|
random_state=self.config.random_seed,
|
||||||
eval_metric="logloss",
|
eval_metric="logloss",
|
||||||
n_jobs=params.get("n_jobs", -1),
|
n_jobs=params.get("n_jobs", -1),
|
||||||
tree_method=params.get("tree_method", "hist"),
|
tree_method=tree_method,
|
||||||
|
predictor=predictor,
|
||||||
verbosity=2,
|
verbosity=2,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -30,6 +30,38 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
"""Fit the neural network model with deferred building"""
|
"""Fit the neural network model with deferred building"""
|
||||||
logging.info(f"Training {self.__class__.__name__}")
|
logging.info(f"Training {self.__class__.__name__}")
|
||||||
|
|
||||||
|
# Best-effort GPU configuration for TensorFlow when available
|
||||||
|
# - Enables memory growth to avoid pre-allocating all VRAM
|
||||||
|
# - Optionally enables mixed precision if requested via model params
|
||||||
|
try:
|
||||||
|
import tensorflow as tf # Imported lazily to avoid dependency for non-NN runs
|
||||||
|
|
||||||
|
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
|
||||||
|
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
|
||||||
|
|
||||||
|
gpus = tf.config.list_physical_devices("GPU")
|
||||||
|
if gpus:
|
||||||
|
for gpu in gpus:
|
||||||
|
try:
|
||||||
|
tf.config.experimental.set_memory_growth(gpu, True)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if enable_mixed:
|
||||||
|
try:
|
||||||
|
from tensorflow.keras import mixed_precision
|
||||||
|
|
||||||
|
mixed_precision.set_global_policy("mixed_float16")
|
||||||
|
logging.info("Enabled TensorFlow mixed precision (float16)")
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f"Could not enable mixed precision: {e}")
|
||||||
|
else:
|
||||||
|
if requested_gpu:
|
||||||
|
logging.warning("Requested GPU but no TensorFlow GPU device is available.")
|
||||||
|
except Exception as e:
|
||||||
|
# Keep silent in non-TF environments / non-NN workflows
|
||||||
|
logging.debug(f"TensorFlow GPU setup skipped: {e}")
|
||||||
|
|
||||||
# Setup feature extraction
|
# Setup feature extraction
|
||||||
if self.feature_extractor is None:
|
if self.feature_extractor is None:
|
||||||
self.feature_extractor = FeatureExtractor(
|
self.feature_extractor = FeatureExtractor(
|
||||||
@@ -105,6 +137,32 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
def cross_validate(
|
def cross_validate(
|
||||||
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
|
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
|
||||||
) -> dict[str, np.floating[Any]]:
|
) -> dict[str, np.floating[Any]]:
|
||||||
|
# Ensure TF GPU/mixed-precision config also applies to CV runs
|
||||||
|
try:
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
|
||||||
|
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
|
||||||
|
|
||||||
|
gpus = tf.config.list_physical_devices("GPU")
|
||||||
|
if gpus:
|
||||||
|
for gpu in gpus:
|
||||||
|
try:
|
||||||
|
tf.config.experimental.set_memory_growth(gpu, True)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
if enable_mixed:
|
||||||
|
try:
|
||||||
|
from tensorflow.keras import mixed_precision
|
||||||
|
|
||||||
|
mixed_precision.set_global_policy("mixed_float16")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
if requested_gpu:
|
||||||
|
logging.warning("Requested GPU for CV but none is available.")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
features_df = self.feature_extractor.extract_features(X)
|
features_df = self.feature_extractor.extract_features(X)
|
||||||
X_prepared = self.prepare_features(features_df)
|
X_prepared = self.prepare_features(features_df)
|
||||||
y_encoded = self.label_encoder.transform(y)
|
y_encoded = self.label_encoder.transform(y)
|
||||||
@@ -165,6 +223,33 @@ class NeuralNetworkModel(BaseModel):
|
|||||||
"""Generate learning curve data for the model"""
|
"""Generate learning curve data for the model"""
|
||||||
logging.info(f"Generating learning curve for {self.__class__.__name__}")
|
logging.info(f"Generating learning curve for {self.__class__.__name__}")
|
||||||
|
|
||||||
|
# Ensure TF GPU/mixed-precision config also applies here
|
||||||
|
try:
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
requested_gpu = bool(self.config.model_params.get("use_gpu", False))
|
||||||
|
enable_mixed = bool(self.config.model_params.get("mixed_precision", False))
|
||||||
|
|
||||||
|
gpus = tf.config.list_physical_devices("GPU")
|
||||||
|
if gpus:
|
||||||
|
for gpu in gpus:
|
||||||
|
try:
|
||||||
|
tf.config.experimental.set_memory_growth(gpu, True)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
if enable_mixed:
|
||||||
|
try:
|
||||||
|
from tensorflow.keras import mixed_precision
|
||||||
|
|
||||||
|
mixed_precision.set_global_policy("mixed_float16")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
if requested_gpu:
|
||||||
|
logging.warning("Requested GPU for learning curve but none is available.")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
if train_sizes is None:
|
if train_sizes is None:
|
||||||
train_sizes = [0.1, 0.3, 0.5, 0.7, 1.0]
|
train_sizes = [0.1, 0.3, 0.5, 0.7, 1.0]
|
||||||
|
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
LETTERS = 'abcdefghijklmnopqrstuvwxyz'
|
LETTERS = "abcdefghijklmnopqrstuvwxyz"
|
||||||
|
|||||||
@@ -8,11 +8,7 @@ from research.statistics.utils import LETTERS, build_letter_frequencies
|
|||||||
|
|
||||||
def plot_transition_matrix(ax, df_probs, title=""):
|
def plot_transition_matrix(ax, df_probs, title=""):
|
||||||
hm = sns.heatmap(
|
hm = sns.heatmap(
|
||||||
df_probs.loc[list(LETTERS), list(LETTERS)],
|
df_probs.loc[list(LETTERS), list(LETTERS)], cmap="Reds", annot=False, cbar=False, ax=ax
|
||||||
cmap="Reds",
|
|
||||||
annot=False,
|
|
||||||
cbar=False,
|
|
||||||
ax=ax
|
|
||||||
)
|
)
|
||||||
ax.set_title(title, fontsize=12)
|
ax.set_title(title, fontsize=12)
|
||||||
return hm
|
return hm
|
||||||
@@ -20,8 +16,8 @@ def plot_transition_matrix(ax, df_probs, title=""):
|
|||||||
|
|
||||||
def plot_letter_frequencies(males, females, sort_values=False, title=None):
|
def plot_letter_frequencies(males, females, sort_values=False, title=None):
|
||||||
# Compute frequencies
|
# Compute frequencies
|
||||||
L_m = build_letter_frequencies(males['name']).set_index("letter")["freq"]
|
L_m = build_letter_frequencies(males["name"]).set_index("letter")["freq"]
|
||||||
L_f = build_letter_frequencies(females['name']).set_index("letter")["freq"]
|
L_f = build_letter_frequencies(females["name"]).set_index("letter")["freq"]
|
||||||
|
|
||||||
# Combine into one DataFrame
|
# Combine into one DataFrame
|
||||||
df_plot = pd.DataFrame({"Male": L_m, "Female": L_f}).fillna(0).reset_index()
|
df_plot = pd.DataFrame({"Male": L_m, "Female": L_f}).fillna(0).reset_index()
|
||||||
@@ -35,8 +31,8 @@ def plot_letter_frequencies(males, females, sort_values=False, title=None):
|
|||||||
x = np.arange(len(df_plot))
|
x = np.arange(len(df_plot))
|
||||||
w = 0.4
|
w = 0.4
|
||||||
fig, ax = plt.subplots(figsize=(16, 6))
|
fig, ax = plt.subplots(figsize=(16, 6))
|
||||||
ax.bar(x - w/2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8)
|
ax.bar(x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8)
|
||||||
ax.bar(x + w/2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8)
|
ax.bar(x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8)
|
||||||
|
|
||||||
ax.set_xticks(x)
|
ax.set_xticks(x)
|
||||||
ax.set_xticklabels(df_plot["letter"])
|
ax.set_xticklabels(df_plot["letter"])
|
||||||
|
|||||||
@@ -9,9 +9,10 @@ from scipy.spatial.distance import euclidean
|
|||||||
from scipy.stats import entropy
|
from scipy.stats import entropy
|
||||||
from typing import Dict, Any
|
from typing import Dict, Any
|
||||||
|
|
||||||
LETTERS = 'abcdefghijklmnopqrstuvwxyz'
|
LETTERS = "abcdefghijklmnopqrstuvwxyz"
|
||||||
START_TOKEN = '^'
|
START_TOKEN = "^"
|
||||||
END_TOKEN = '$'
|
END_TOKEN = "$"
|
||||||
|
|
||||||
|
|
||||||
def normalize_letters(s):
|
def normalize_letters(s):
|
||||||
"""Normalize accents -> ascii, lowercase, keep only a-z."""
|
"""Normalize accents -> ascii, lowercase, keep only a-z."""
|
||||||
@@ -27,41 +28,28 @@ def build_category_distribution(df: pd.DataFrame) -> pd.DataFrame:
|
|||||||
return (
|
return (
|
||||||
df.groupby("province")["identified_category"]
|
df.groupby("province")["identified_category"]
|
||||||
.value_counts(normalize=True) # get proportions
|
.value_counts(normalize=True) # get proportions
|
||||||
.unstack(fill_value=0) # reshape into columns per word count
|
.unstack(fill_value=0) # reshape into columns per word count
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
|
def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame:
|
||||||
# Normalize + split once (vectorized)
|
# Normalize + split once (vectorized)
|
||||||
s = df[source].fillna('').astype(str)
|
s = df[source].fillna("").astype(str)
|
||||||
s = (
|
s = s.str.lower().str.replace(r"[^\w'\-]+", " ", regex=True).str.strip().str.split()
|
||||||
s.str.lower()
|
|
||||||
.str.replace(r"[^\w'\-]+", " ", regex=True)
|
|
||||||
.str.strip()
|
|
||||||
.str.split()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Explode the token list into rows under `target`
|
# Explode the token list into rows under `target`
|
||||||
out = (
|
out = df.assign(**{target: s}).explode(target, ignore_index=True)
|
||||||
df.assign(**{target: s})
|
|
||||||
.explode(target, ignore_index=True)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Drop NA/empty tokens and strip whitespace
|
# Drop NA/empty tokens and strip whitespace
|
||||||
out[target] = out[target].astype(str).str.strip()
|
out[target] = out[target].astype(str).str.strip()
|
||||||
out = out[out[target].ne('')].dropna(subset=[target]).reset_index(drop=True)
|
out = out[out[target].ne("")].dropna(subset=[target]).reset_index(drop=True)
|
||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
|
def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
|
||||||
# Normalize: lowercase, remove non-letters, concatenate all into one string
|
# Normalize: lowercase, remove non-letters, concatenate all into one string
|
||||||
s = (
|
s = series.astype(str).str.lower().str.replace(r"[^a-z]", "", regex=True).str.cat(sep="")
|
||||||
series.astype(str)
|
|
||||||
.str.lower()
|
|
||||||
.str.replace(r'[^a-z]', '', regex=True)
|
|
||||||
.str.cat(sep='')
|
|
||||||
)
|
|
||||||
|
|
||||||
# Convert string into Series of characters
|
# Convert string into Series of characters
|
||||||
chars = pd.Series(list(s))
|
chars = pd.Series(list(s))
|
||||||
@@ -82,11 +70,7 @@ def build_letter_frequencies(series: pd.Series) -> pd.DataFrame:
|
|||||||
|
|
||||||
def build_transition_probabilities(names: pd.Series, alpha: float = 0.0) -> dict:
|
def build_transition_probabilities(names: pd.Series, alpha: float = 0.0) -> dict:
|
||||||
# 1) Normalize
|
# 1) Normalize
|
||||||
names = (
|
names = names.astype(str).str.lower().str.replace(rf"[^{LETTERS}]", "", regex=True)
|
||||||
names.astype(str)
|
|
||||||
.str.lower()
|
|
||||||
.str.replace(fr"[^{LETTERS}]", "", regex=True)
|
|
||||||
)
|
|
||||||
names = names[names.str.len() > 0]
|
names = names[names.str.len() > 0]
|
||||||
|
|
||||||
# 2) Prepare sequences
|
# 2) Prepare sequences
|
||||||
@@ -130,7 +114,7 @@ def build_transition_probabilities(names: pd.Series, alpha: float = 0.0) -> dict
|
|||||||
|
|
||||||
# 11) DataFrames
|
# 11) DataFrames
|
||||||
df_counts = pd.DataFrame(counts, index=tokens, columns=tokens)
|
df_counts = pd.DataFrame(counts, index=tokens, columns=tokens)
|
||||||
df_probs = pd.DataFrame(probs, index=tokens, columns=tokens)
|
df_probs = pd.DataFrame(probs, index=tokens, columns=tokens)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"tokens": tokens,
|
"tokens": tokens,
|
||||||
@@ -142,7 +126,11 @@ def build_transition_probabilities(names: pd.Series, alpha: float = 0.0) -> dict
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_transitions: Dict[str, Any], n_permutations: int = 1000) -> pd.DataFrame:
|
def build_transition_comparisons(
|
||||||
|
names_transitions: Dict[str, Any],
|
||||||
|
surnames_transitions: Dict[str, Any],
|
||||||
|
n_permutations: int = 1000,
|
||||||
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Compares letter transition probability matrices for names and surnames using
|
Compares letter transition probability matrices for names and surnames using
|
||||||
various distance metrics and a permutation test for statistical significance.
|
various distance metrics and a permutation test for statistical significance.
|
||||||
@@ -150,23 +138,20 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra
|
|||||||
|
|
||||||
# Helper function to flatten and smooth matrices
|
# Helper function to flatten and smooth matrices
|
||||||
def prepare_data(data):
|
def prepare_data(data):
|
||||||
return {
|
return {"m": data["m"]["probs"].flatten(), "f": data["f"]["probs"].flatten()}
|
||||||
'm': data['m']['probs'].flatten(),
|
|
||||||
'f': data['f']['probs'].flatten()
|
|
||||||
}
|
|
||||||
|
|
||||||
prepared_names = prepare_data(names_transitions)
|
prepared_names = prepare_data(names_transitions)
|
||||||
prepared_surnames = prepare_data(surnames_transitions)
|
prepared_surnames = prepare_data(surnames_transitions)
|
||||||
|
|
||||||
# Distance Metrics
|
# Distance Metrics
|
||||||
names_l2 = euclidean(prepared_names['m'], prepared_names['f'])
|
names_l2 = euclidean(prepared_names["m"], prepared_names["f"])
|
||||||
surnames_l2 = euclidean(prepared_surnames['m'], prepared_surnames['f'])
|
surnames_l2 = euclidean(prepared_surnames["m"], prepared_surnames["f"])
|
||||||
|
|
||||||
kl_names_mf = entropy(prepared_names['m'] + 1e-12, prepared_names['f'] + 1e-12)
|
kl_names_mf = entropy(prepared_names["m"] + 1e-12, prepared_names["f"] + 1e-12)
|
||||||
kl_names_fm = entropy(prepared_names['f'] + 1e-12, prepared_names['m'] + 1e-12)
|
kl_names_fm = entropy(prepared_names["f"] + 1e-12, prepared_names["m"] + 1e-12)
|
||||||
|
|
||||||
kl_surnames_mf = entropy(prepared_surnames['m'] + 1e-12, prepared_surnames['f'] + 1e-12)
|
kl_surnames_mf = entropy(prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12)
|
||||||
kl_surnames_fm = entropy(prepared_surnames['f'] + 1e-12, prepared_surnames['m'] + 1e-12)
|
kl_surnames_fm = entropy(prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12)
|
||||||
|
|
||||||
jsd_names = 0.5 * (kl_names_mf + kl_names_fm)
|
jsd_names = 0.5 * (kl_names_mf + kl_names_fm)
|
||||||
jsd_surnames = 0.5 * (kl_surnames_mf + kl_surnames_fm)
|
jsd_surnames = 0.5 * (kl_surnames_mf + kl_surnames_fm)
|
||||||
@@ -174,15 +159,15 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra
|
|||||||
# Permutation Test
|
# Permutation Test
|
||||||
def run_permutation_test(transitions):
|
def run_permutation_test(transitions):
|
||||||
# Flattened probabilities for male and female
|
# Flattened probabilities for male and female
|
||||||
P_m = transitions['m']['probs'].flatten()
|
P_m = transitions["m"]["probs"].flatten()
|
||||||
P_f = transitions['f']['probs'].flatten()
|
P_f = transitions["f"]["probs"].flatten()
|
||||||
|
|
||||||
# Calculate the observed JSD (our test statistic)
|
# Calculate the observed JSD (our test statistic)
|
||||||
observed_jsd = 0.5 * (entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12))
|
observed_jsd = 0.5 * (entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12))
|
||||||
|
|
||||||
# Concatenate male and female counts
|
# Concatenate male and female counts
|
||||||
counts_m = transitions['m']['counts']
|
counts_m = transitions["m"]["counts"]
|
||||||
counts_f = transitions['f']['counts']
|
counts_f = transitions["f"]["counts"]
|
||||||
all_counts = np.concatenate((counts_m, counts_f), axis=1)
|
all_counts = np.concatenate((counts_m, counts_f), axis=1)
|
||||||
total_counts = counts_m.shape[1] + counts_f.shape[1]
|
total_counts = counts_m.shape[1] + counts_f.shape[1]
|
||||||
|
|
||||||
@@ -194,17 +179,27 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra
|
|||||||
# Note: This is a simplified approach, assuming counts are
|
# Note: This is a simplified approach, assuming counts are
|
||||||
# structured per name. A more robust implementation would
|
# structured per name. A more robust implementation would
|
||||||
# shuffle the actual names themselves.
|
# shuffle the actual names themselves.
|
||||||
permuted_counts_m = all_counts[:, shuffled_indices[:counts_m.shape[1]]]
|
permuted_counts_m = all_counts[:, shuffled_indices[: counts_m.shape[1]]]
|
||||||
permuted_counts_f = all_counts[:, shuffled_indices[counts_m.shape[1]:]]
|
permuted_counts_f = all_counts[:, shuffled_indices[counts_m.shape[1] :]]
|
||||||
|
|
||||||
# Re-calculate probabilities and JSD for the permuted groups
|
# Re-calculate probabilities and JSD for the permuted groups
|
||||||
# Add a small epsilon to the denominator to prevent division by zero
|
# Add a small epsilon to the denominator to prevent division by zero
|
||||||
epsilon = 1e-12
|
epsilon = 1e-12
|
||||||
permuted_probs_m = permuted_counts_m / (permuted_counts_m.sum(axis=0, keepdims=True) + epsilon)
|
permuted_probs_m = permuted_counts_m / (
|
||||||
permuted_probs_f = permuted_counts_f / (permuted_counts_f.sum(axis=0, keepdims=True) + epsilon)
|
permuted_counts_m.sum(axis=0, keepdims=True) + epsilon
|
||||||
|
)
|
||||||
|
permuted_probs_f = permuted_counts_f / (
|
||||||
|
permuted_counts_f.sum(axis=0, keepdims=True) + epsilon
|
||||||
|
)
|
||||||
|
|
||||||
permuted_jsd = 0.5 * (entropy(permuted_probs_m.mean(axis=1) + 1e-12, permuted_probs_f.mean(axis=1) + 1e-12) +
|
permuted_jsd = 0.5 * (
|
||||||
entropy(permuted_probs_f.mean(axis=1) + 1e-12, permuted_probs_m.mean(axis=1) + 1e-12))
|
entropy(
|
||||||
|
permuted_probs_m.mean(axis=1) + 1e-12, permuted_probs_f.mean(axis=1) + 1e-12
|
||||||
|
)
|
||||||
|
+ entropy(
|
||||||
|
permuted_probs_f.mean(axis=1) + 1e-12, permuted_probs_m.mean(axis=1) + 1e-12
|
||||||
|
)
|
||||||
|
)
|
||||||
permuted_jsds.append(permuted_jsd)
|
permuted_jsds.append(permuted_jsd)
|
||||||
|
|
||||||
# Calculate the p-value
|
# Calculate the p-value
|
||||||
@@ -214,39 +209,39 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra
|
|||||||
names_p_value = run_permutation_test(names_transitions)
|
names_p_value = run_permutation_test(names_transitions)
|
||||||
surnames_p_value = run_permutation_test(surnames_transitions)
|
surnames_p_value = run_permutation_test(surnames_transitions)
|
||||||
|
|
||||||
out = pd.DataFrame({
|
out = pd.DataFrame(
|
||||||
"l2": [names_l2, surnames_l2],
|
{
|
||||||
"kl_mf": [kl_names_mf, kl_surnames_mf],
|
"l2": [names_l2, surnames_l2],
|
||||||
"kl_fm": [kl_names_fm, kl_surnames_fm],
|
"kl_mf": [kl_names_mf, kl_surnames_mf],
|
||||||
"jsd": [jsd_names, jsd_surnames],
|
"kl_fm": [kl_names_fm, kl_surnames_fm],
|
||||||
"permutation_p_value": [names_p_value, surnames_p_value]
|
"jsd": [jsd_names, jsd_surnames],
|
||||||
}, index=["names", "surnames"])
|
"permutation_p_value": [names_p_value, surnames_p_value],
|
||||||
|
},
|
||||||
|
index=["names", "surnames"],
|
||||||
|
)
|
||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
||||||
|
|
||||||
def build_ngrams_count(
|
def build_ngrams_count(
|
||||||
df: pd.DataFrame,
|
df: pd.DataFrame,
|
||||||
n: int,
|
n: int,
|
||||||
where: Literal["any", "prefix", "suffix"] = "any",
|
where: Literal["any", "prefix", "suffix"] = "any",
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
# Normalize and clean to a–z
|
# Normalize and clean to a–z
|
||||||
names = (
|
names = df["name"].astype(str).str.lower().str.replace(r"[^a-z]", "", regex=True)
|
||||||
df["name"].astype(str)
|
|
||||||
.str.lower()
|
|
||||||
.str.replace(r"[^a-z]", "", regex=True)
|
|
||||||
)
|
|
||||||
|
|
||||||
ngrams = []
|
ngrams = []
|
||||||
if where == "any":
|
if where == "any":
|
||||||
for s in names:
|
for s in names:
|
||||||
L = len(s)
|
L = len(s)
|
||||||
if L >= n:
|
if L >= n:
|
||||||
ngrams.extend(s[i:i+n] for i in range(L - n + 1))
|
ngrams.extend(s[i : i + n] for i in range(L - n + 1))
|
||||||
elif where == "prefix":
|
elif where == "prefix":
|
||||||
for s in names:
|
for s in names:
|
||||||
if len(s) >= n:
|
if len(s) >= n:
|
||||||
|
|||||||
Reference in New Issue
Block a user