From a1d500830bf81cdfff47ccf097d2ecd0eeed09a9 Mon Sep 17 00:00:00 2001
From: bernard-ng
Date: Mon, 29 Sep 2025 21:07:23 +0200
Subject: [PATCH] feat: support gpu

---
 README.md                               |  51 +++
 config/gpu_research_templates.yaml      | 412 ++++++++++++++++++++++++
 core/utils/region_mapper.py             |  12 +-
 processing/ner/name_model.py            |   9 +
 processing/steps/data_selection_step.py |   8 +-
 research/models/bigru_model.py          |   2 +-
 research/models/cnn_model.py            |   2 +-
 research/models/lightgbm_model.py       |   9 +
 research/models/lstm_model.py           |   2 +-
 research/models/transformer_model.py    |   2 +-
 research/models/xgboost_model.py        |  11 +-
 research/neural_network_model.py        |  85 +++++
 research/statistics/__init__.py         |   2 +-
 research/statistics/plots.py            |  14 +-
 research/statistics/utils.py            | 125 ++++---
 15 files changed, 661 insertions(+), 85 deletions(-)
 create mode 100644 config/gpu_research_templates.yaml

diff --git a/README.md b/README.md
index 7065314..dc429bb 100644
--- a/README.md
+++ b/README.md
@@ -166,6 +166,57 @@ experiments and make predictions without needing to understand the underlying co
 streamlit run web/app.py
 ```
 
+## GPU Acceleration
+
+This project can leverage GPUs for faster training when supported libraries and hardware are available.
+
+- TensorFlow/Keras models (BiGRU, LSTM, CNN, Transformer)
+  - Uses GPU automatically if a TensorFlow GPU build is installed.
+  - The code enables GPU memory growth by default so TensorFlow does not pre-allocate all VRAM; optionally enable mixed precision for additional speed:
+    - Add `mixed_precision: true` in the experiment `model_params` (e.g., in `config/research_templates.yaml`).
+    - The final layer outputs are set to float32 for numerical stability under mixed precision.
+
+- spaCy NER
+  - Automatically prefers GPU if available; otherwise falls back to CPU.
+  - Ensure a compatible CUDA-enabled spaCy/thinc stack is installed to use GPU.
+
+- XGBoost
+  - Enable GPU by adding to the experiment `model_params`:
+    - `use_gpu: true` (sets `tree_method: gpu_hist` and `predictor: gpu_predictor`).
+
+- LightGBM
+  - Enable GPU by adding to the experiment `model_params`:
+    - `use_gpu: true` (sets `device: gpu`). Optional: `gpu_platform_id`, `gpu_device_id`.
+
+Example template snippet (GPU on):
+
+```yaml
+- name: "lstm_gpu"
+  description: "LSTM with GPU + mixed precision"
+  model_type: "lstm"
+  features: ["full_name"]
+  model_params:
+    embedding_dim: 128
+    lstm_units: 64
+    epochs: 5
+    batch_size: 128
+    use_gpu: true
+    mixed_precision: true
+  tags: ["gpu", "mixed_precision"]
+
+- name: "xgboost_gpu"
+  description: "XGBoost with GPU"
+  model_type: "xgboost"
+  features: ["full_name"]
+  model_params:
+    n_estimators: 200
+    use_gpu: true
+```
+
+Notes:
+- Install CUDA-enabled binaries for TensorFlow/spaCy/LightGBM/XGBoost to actually use GPU.
+- If GPU is requested but not available, training will proceed on CPU with a warning.
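+
+To quickly verify that the GPU stack is actually visible before launching long experiments, a minimal sanity check (assuming CUDA-enabled TensorFlow and spaCy builds are installed) is:
+
+```python
+import tensorflow as tf
+import spacy
+
+# An empty list means this TensorFlow build cannot see a GPU (training runs on CPU).
+print("TensorFlow GPUs:", tf.config.list_physical_devices("GPU"))
+
+# True if spaCy/thinc can use the GPU (requires cupy); False means CPU fallback.
+print("spaCy GPU:", spacy.prefer_gpu())
+```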
+ ## Contributors diff --git a/config/gpu_research_templates.yaml b/config/gpu_research_templates.yaml new file mode 100644 index 0000000..c309415 --- /dev/null +++ b/config/gpu_research_templates.yaml @@ -0,0 +1,412 @@ +baseline_experiments: + # BiGRU Models (GPU-enabled) + - name: "bigru" + description: "Baseline BiGRU with full name features (GPU)" + model_type: "bigru" + features: [ "full_name" ] + model_params: + embedding_dim: 64 + gru_units: 32 + epochs: 2 + batch_size: 32 + use_gpu: true + mixed_precision: true + tags: [ "baseline", "neural", "bigru", "gpu" ] + + - name: "bigru_native" + description: "Baseline BiGRU with native name features (GPU)" + model_type: "bigru" + features: [ "native_name" ] + model_params: + embedding_dim: 64 + gru_units: 32 + epochs: 2 + batch_size: 32 + use_gpu: true + mixed_precision: true + tags: [ "baseline", "neural", "bigru", "native", "gpu" ] + + - name: "bigru_surname" + description: "Baseline BiGRU with surname features (GPU)" + model_type: "bigru" + features: [ "surname" ] + model_params: + embedding_dim: 64 + gru_units: 32 + epochs: 2 + batch_size: 32 + use_gpu: true + mixed_precision: true + tags: [ "baseline", "neural", "bigru", "surname", "gpu" ] + + ## CNN Models (GPU-enabled) + - name: "cnn" + description: "Baseline CNN with character patterns (GPU)" + model_type: "cnn" + features: [ "full_name" ] + model_params: + embedding_dim: 64 + filters: 64 + kernel_size: 3 + dropout: 0.5 + epochs: 2 + batch_size: 32 + use_gpu: true + mixed_precision: true + tags: [ "baseline", "neural", "cnn", "gpu" ] + + - name: "cnn_native" + description: "Baseline CNN with native name character patterns (GPU)" + model_type: "cnn" + features: [ "native_name" ] + model_params: + embedding_dim: 64 + filters: 64 + kernel_size: 3 + dropout: 0.5 + epochs: 2 + batch_size: 32 + use_gpu: true + mixed_precision: true + tags: [ "baseline", "neural", "cnn", "native", "gpu" ] + + - name: "cnn_surname" + description: "Baseline CNN with surname character patterns (GPU)" + model_type: "cnn" + features: [ "surname" ] + model_params: + embedding_dim: 64 + filters: 64 + kernel_size: 3 + dropout: 0.5 + epochs: 2 + batch_size: 32 + use_gpu: true + mixed_precision: true + tags: [ "baseline", "neural", "cnn", "surname", "gpu" ] + + ## Ensemble Models (CPU) + - name: "ensemble" + description: "Baseline Ensemble with multiple models" + model_type: "ensemble" + features: [ "full_name" ] + model_params: + base_models: [ "logistic_regression", "random_forest", "xgboost" ] + voting: "soft" + cv_folds: 5 + tags: [ "baseline", "ensemble" ] + + - name: "ensemble_native" + description: "Baseline Ensemble with native name" + model_type: "ensemble" + features: [ "native_name" ] + model_params: + base_models: [ "logistic_regression", "random_forest", "xgboost" ] + voting: "soft" + cv_folds: 5 + tags: [ "baseline", "ensemble", "native" ] + + - name: "ensemble_surname" + description: "Baseline Ensemble with surname" + model_type: "ensemble" + features: [ "surname" ] + model_params: + base_models: [ "logistic_regression", "random_forest", "xgboost" ] + voting: "soft" + cv_folds: 5 + tags: [ "baseline", "ensemble", "surname" ] + + # LightGBM Models (GPU-enabled) + - name: "lightgbm" + description: "Baseline LightGBM with engineered features (GPU)" + model_type: "lightgbm" + features: [ "full_name" ] + model_params: + n_estimators: 100 + max_depth: -1 + learning_rate: 0.1 + num_leaves: 31 + subsample: 0.8 + colsample_bytree: 0.8 + use_gpu: true + tags: [ "baseline", "lightgbm", "gpu" ] + + - name: 
"lightgbm_native" + description: "Baseline LightGBM with native name features (GPU)" + model_type: "lightgbm" + features: [ "native_name" ] + model_params: + n_estimators: 100 + max_depth: -1 + learning_rate: 0.1 + num_leaves: 31 + subsample: 0.8 + colsample_bytree: 0.8 + use_gpu: true + tags: [ "baseline", "lightgbm", "native", "gpu" ] + + - name: "lightgbm_surname" + description: "Baseline LightGBM with surname features (GPU)" + model_type: "lightgbm" + features: [ "surname" ] + model_params: + n_estimators: 100 + max_depth: -1 + learning_rate: 0.1 + num_leaves: 31 + subsample: 0.8 + colsample_bytree: 0.8 + use_gpu: true + tags: [ "baseline", "lightgbm", "surname", "gpu" ] + + # Logistic Regression Models (CPU) + - name: "logistic_regression" + description: "Baseline logistic regression with full name" + model_type: "logistic_regression" + features: [ "full_name" ] + model_params: + max_features: 10000 + tags: [ "baseline", "logistic_regression", "fullname" ] + + - name: "logistic_regression_native" + description: "Logistic regression with native name only" + model_type: "logistic_regression" + features: [ "native_name" ] + model_params: + max_features: 5000 + tags: [ "baseline", "logistic_regression", "native" ] + + - name: "logistic_regression_surname" + description: "Logistic regression with surname name only" + model_type: "logistic_regression" + features: [ "surname" ] + model_params: + max_features: 5000 + tags: [ "baseline", "logistic_regression", "surname" ] + + # LSTM Models (GPU-enabled) + - name: "lstm" + description: "Baseline LSTM with full name features (GPU)" + model_type: "lstm" + features: [ "full_name" ] + model_params: + embedding_dim: 128 + lstm_units: 64 + epochs: 2 + batch_size: 64 + use_gpu: true + mixed_precision: true + tags: [ "baseline", "neural", "lstm", "gpu" ] + + - name: "lstm_native" + description: "Baseline LSTM with native name features (GPU)" + model_type: "lstm" + features: [ "native_name" ] + model_params: + embedding_dim: 128 + lstm_units: 64 + epochs: 2 + batch_size: 64 + use_gpu: true + mixed_precision: true + tags: [ "baseline", "neural", "lstm", "native", "gpu" ] + + - name: "lstm_surname" + description: "Baseline LSTM with surname features (GPU)" + model_type: "lstm" + features: [ "surname" ] + model_params: + embedding_dim: 128 + lstm_units: 64 + epochs: 2 + batch_size: 64 + use_gpu: true + mixed_precision: true + tags: [ "baseline", "neural", "lstm", "surname", "gpu" ] + + # Naive Bayes Models (CPU) + - name: "naive_bayes" + description: "Baseline Naive Bayes with full name features" + model_type: "naive_bayes" + features: [ "full_name" ] + model_params: + max_features: 5000 + tags: [ "baseline", "naive_bayes" ] + + - name: "naive_bayes_native" + description: "Baseline Naive Bayes with native name features" + model_type: "naive_bayes" + features: [ "native_name" ] + model_params: + max_features: 5000 + tags: [ "baseline", "naive_bayes", "native" ] + + - name: "naive_bayes_surname" + description: "Baseline Naive Bayes with surname features" + model_type: "naive_bayes" + features: [ "surname" ] + model_params: + max_features: 5000 + tags: [ "baseline", "naive_bayes", "surname" ] + + # Random Forest Models (CPU) + - name: "random_forest" + description: "Baseline Random Forest with engineered features" + model_type: "random_forest" + features: [ "full_name" ] + model_params: + n_estimators: 100 + max_depth: 10 + min_samples_split: 2 + min_samples_leaf: 1 + tags: [ "baseline", "random_forest", "engineered" ] + + - name: "random_forest_native" + 
description: "Baseline Random Forest with native name engineered features" + model_type: "random_forest" + features: [ "native_name" ] + model_params: + n_estimators: 100 + max_depth: 10 + min_samples_split: 2 + min_samples_leaf: 1 + tags: [ "baseline", "random_forest", "engineered", "native" ] + + - name: "random_forest_surname" + description: "Baseline Random Forest with surname engineered features" + model_type: "random_forest" + features: [ "surname" ] + model_params: + n_estimators: 100 + max_depth: 10 + min_samples_split: 2 + min_samples_leaf: 1 + tags: [ "baseline", "random_forest", "engineered", "surname" ] + + # SVM Models (CPU) + - name: "svm" + description: "Baseline SVM with full name features" + model_type: "svm" + features: [ "full_name" ] + model_params: + C: 1.0 + kernel: "rbf" + ngram_range: [ 2, 4 ] + max_features: 5000 + tags: [ "baseline", "svm" ] + + - name: "svm_native" + description: "Baseline SVM with native name features" + model_type: "svm" + features: [ "native_name" ] + model_params: + C: 1.0 + kernel: "rbf" + ngram_range: [ 2, 4 ] + max_features: 5000 + tags: [ "baseline", "svm", "native" ] + + - name: "svm_surname" + description: "Baseline SVM with surname features" + model_type: "svm" + features: [ "surname" ] + model_params: + C: 1.0 + kernel: "rbf" + ngram_range: [ 2, 4 ] + max_features: 5000 + tags: [ "baseline", "svm", "surname" ] + + # Transformer Models (GPU-enabled) + - name: "transformer" + description: "Baseline Transformer with attention mechanism (GPU)" + model_type: "transformer" + features: [ "full_name" ] + model_params: + embedding_dim: 128 + num_heads: 4 + num_layers: 2 + epochs: 2 + batch_size: 64 + use_gpu: true + mixed_precision: true + tags: [ "baseline", "neural", "transformer", "gpu" ] + + - name: "transformer_native" + description: "Baseline Transformer with native name attention mechanism (GPU)" + model_type: "transformer" + features: [ "native_name" ] + model_params: + embedding_dim: 128 + num_heads: 4 + num_layers: 2 + epochs: 2 + batch_size: 64 + use_gpu: true + mixed_precision: true + tags: [ "baseline", "neural", "transformer", "native", "gpu" ] + + - name: "transformer_surname" + description: "Baseline Transformer with surname attention mechanism (GPU)" + model_type: "transformer" + features: [ "surname" ] + model_params: + embedding_dim: 128 + num_heads: 4 + num_layers: 2 + epochs: 2 + batch_size: 64 + use_gpu: true + mixed_precision: true + tags: [ "baseline", "neural", "transformer", "surname", "gpu" ] + + # XGBoost Models (GPU-enabled) + - name: "xgboost" + description: "Baseline XGBoost with engineered features (GPU)" + model_type: "xgboost" + features: [ "full_name" ] + model_params: + n_estimators: 100 + max_depth: 6 + learning_rate: 0.1 + subsample: 0.8 + colsample_bytree: 0.8 + use_gpu: true + tags: [ "baseline", "xgboost", "gpu" ] + + - name: "xgboost_native" + description: "Baseline XGBoost with native name engineered features (GPU)" + model_type: "xgboost" + features: [ "native_name" ] + model_params: + n_estimators: 100 + max_depth: 6 + learning_rate: 0.1 + subsample: 0.8 + colsample_bytree: 0.8 + use_gpu: true + tags: [ "baseline", "xgboost", "native", "gpu" ] + + - name: "xgboost_surname" + description: "Baseline XGBoost with surname engineered features (GPU)" + model_type: "xgboost" + features: [ "surname" ] + model_params: + n_estimators: 100 + max_depth: 6 + learning_rate: 0.1 + subsample: 0.8 + colsample_bytree: 0.8 + use_gpu: true + tags: [ "baseline", "xgboost", "surname", "gpu" ] + + +# Advanced Experiments 
Configuration +advanced_experiments: + +# Feature Study Configurations +feature_studies: + +# Hyperparameter Tuning Configurations +hyperparameter_tuning: + diff --git a/core/utils/region_mapper.py b/core/utils/region_mapper.py index ec380da..6e54c6e 100644 --- a/core/utils/region_mapper.py +++ b/core/utils/region_mapper.py @@ -19,9 +19,15 @@ class RegionMapper: return ( series.str.upper() .str.strip() - .apply(lambda x: unicodedata.normalize("NFKD", x) - .encode("ascii", errors="ignore") - .decode("utf-8") if isinstance(x, str) else x) + .apply( + lambda x: ( + unicodedata.normalize("NFKD", x) + .encode("ascii", errors="ignore") + .decode("utf-8") + if isinstance(x, str) + else x + ) + ) ) @staticmethod diff --git a/processing/ner/name_model.py b/processing/ner/name_model.py index 782770c..bc4901e 100644 --- a/processing/ner/name_model.py +++ b/processing/ner/name_model.py @@ -29,6 +29,15 @@ class NameModel: """Create a blank spaCy model with NER pipeline""" logging.info(f"Creating blank {language} model for NER training") + # Prefer GPU for spaCy if available (falls back to CPU automatically) + try: + if spacy.prefer_gpu(): + logging.info("spaCy GPU enabled (cupy) for NER training") + else: + logging.info("spaCy running on CPU") + except Exception as e: + logging.debug(f"spaCy GPU selection skipped: {e}") + # Create blank model - French tokenizer works well for DRC names self.nlp = spacy.blank(language) diff --git a/processing/steps/data_selection_step.py b/processing/steps/data_selection_step.py index ab9409e..5b07104 100644 --- a/processing/steps/data_selection_step.py +++ b/processing/steps/data_selection_step.py @@ -20,11 +20,15 @@ class DataSelectionStep(PipelineStep): # Remove rows where region == "global" only for specific years if "region" in batch.columns and "year" in batch.columns: target_years = {2015, 2021, 2022} - mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(target_years) + mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin( + target_years + ) removed = int(mask_remove.sum()) if removed: batch = batch[~mask_remove] - logging.info(f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}") + logging.info( + f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}" + ) # Check which columns exist in the batch available_columns = [col for col in self.selected_columns if col in batch.columns] diff --git a/research/models/bigru_model.py b/research/models/bigru_model.py index 9954f1c..5c9c347 100644 --- a/research/models/bigru_model.py +++ b/research/models/bigru_model.py @@ -48,7 +48,7 @@ class BiGRUModel(NeuralNetworkModel): Dense(64, activation="relu"), Dropout(params.get("dropout", 0.5)), # Two-way softmax for binary gender classification. - Dense(2, activation="softmax"), + Dense(2, activation="softmax", dtype="float32"), ] ) diff --git a/research/models/cnn_model.py b/research/models/cnn_model.py index 8763d6f..d1b1257 100644 --- a/research/models/cnn_model.py +++ b/research/models/cnn_model.py @@ -54,7 +54,7 @@ class CNNModel(NeuralNetworkModel): Dense(64, activation="relu"), Dropout(params.get("dropout", 0.5)), # Two-way softmax for binary classification. 
- Dense(2, activation="softmax"), + Dense(2, activation="softmax", dtype="float32"), ] ) diff --git a/research/models/lightgbm_model.py b/research/models/lightgbm_model.py index 8b242ad..deb895f 100644 --- a/research/models/lightgbm_model.py +++ b/research/models/lightgbm_model.py @@ -20,6 +20,12 @@ class LightGBMModel(TraditionalModel): def build_model(self) -> BaseEstimator: params = self.config.model_params + # Optional GPU acceleration + use_gpu = bool(params.get("use_gpu", False)) + device = params.get("device", "gpu" if use_gpu else "cpu") + gpu_platform_id = params.get("gpu_platform_id", None) + gpu_device_id = params.get("gpu_device_id", None) + # Leaf-wise boosted trees excel on sparse/categorical mixes; binary objective # and parallelism improve training speed for this task. return lgb.LGBMClassifier( @@ -33,6 +39,9 @@ class LightGBMModel(TraditionalModel): objective=params.get("objective", "binary"), n_jobs=params.get("n_jobs", -1), verbose=2, + device=device, + gpu_platform_id=gpu_platform_id, + gpu_device_id=gpu_device_id, ) def prepare_features(self, X: pd.DataFrame) -> np.ndarray: diff --git a/research/models/lstm_model.py b/research/models/lstm_model.py index 7d2cb3c..1e65aa1 100644 --- a/research/models/lstm_model.py +++ b/research/models/lstm_model.py @@ -45,7 +45,7 @@ class LSTMModel(NeuralNetworkModel): Dense(64, activation="relu"), Dropout(params.get("dropout", 0.5)), # Two-way softmax for binary classification. - Dense(2, activation="softmax"), + Dense(2, activation="softmax", dtype="float32"), ] ) diff --git a/research/models/transformer_model.py b/research/models/transformer_model.py index d587d31..f5bdcbe 100644 --- a/research/models/transformer_model.py +++ b/research/models/transformer_model.py @@ -45,7 +45,7 @@ class TransformerModel(NeuralNetworkModel): x = GlobalAveragePooling1D()(x) x = Dense(32, activation="relu")(x) x = Dropout(params.get("dropout", 0.1))(x) - outputs = Dense(2, activation="softmax")(x) + outputs = Dense(2, activation="softmax", dtype="float32")(x) model = Model(inputs, outputs) model.compile( diff --git a/research/models/xgboost_model.py b/research/models/xgboost_model.py index 28093ee..c642203 100644 --- a/research/models/xgboost_model.py +++ b/research/models/xgboost_model.py @@ -20,6 +20,14 @@ class XGBoostModel(TraditionalModel): def build_model(self) -> BaseEstimator: params = self.config.model_params + # Optional GPU acceleration + use_gpu = bool(params.get("use_gpu", False)) + default_tree_method = "gpu_hist" if use_gpu else "hist" + tree_method = params.get("tree_method", default_tree_method) + predictor = params.get( + "predictor", "gpu_predictor" if tree_method.startswith("gpu") else "auto" + ) + # Histogram-based trees and parallelism provide fast training; default # logloss metric suits binary classification of gender. 
return xgb.XGBClassifier( @@ -31,7 +39,8 @@ class XGBoostModel(TraditionalModel): random_state=self.config.random_seed, eval_metric="logloss", n_jobs=params.get("n_jobs", -1), - tree_method=params.get("tree_method", "hist"), + tree_method=tree_method, + predictor=predictor, verbosity=2, ) diff --git a/research/neural_network_model.py b/research/neural_network_model.py index ac90d57..2f28afd 100644 --- a/research/neural_network_model.py +++ b/research/neural_network_model.py @@ -30,6 +30,38 @@ class NeuralNetworkModel(BaseModel): """Fit the neural network model with deferred building""" logging.info(f"Training {self.__class__.__name__}") + # Best-effort GPU configuration for TensorFlow when available + # - Enables memory growth to avoid pre-allocating all VRAM + # - Optionally enables mixed precision if requested via model params + try: + import tensorflow as tf # Imported lazily to avoid dependency for non-NN runs + + requested_gpu = bool(self.config.model_params.get("use_gpu", False)) + enable_mixed = bool(self.config.model_params.get("mixed_precision", False)) + + gpus = tf.config.list_physical_devices("GPU") + if gpus: + for gpu in gpus: + try: + tf.config.experimental.set_memory_growth(gpu, True) + except Exception: + pass + + if enable_mixed: + try: + from tensorflow.keras import mixed_precision + + mixed_precision.set_global_policy("mixed_float16") + logging.info("Enabled TensorFlow mixed precision (float16)") + except Exception as e: + logging.warning(f"Could not enable mixed precision: {e}") + else: + if requested_gpu: + logging.warning("Requested GPU but no TensorFlow GPU device is available.") + except Exception as e: + # Keep silent in non-TF environments / non-NN workflows + logging.debug(f"TensorFlow GPU setup skipped: {e}") + # Setup feature extraction if self.feature_extractor is None: self.feature_extractor = FeatureExtractor( @@ -105,6 +137,32 @@ class NeuralNetworkModel(BaseModel): def cross_validate( self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5 ) -> dict[str, np.floating[Any]]: + # Ensure TF GPU/mixed-precision config also applies to CV runs + try: + import tensorflow as tf + + requested_gpu = bool(self.config.model_params.get("use_gpu", False)) + enable_mixed = bool(self.config.model_params.get("mixed_precision", False)) + + gpus = tf.config.list_physical_devices("GPU") + if gpus: + for gpu in gpus: + try: + tf.config.experimental.set_memory_growth(gpu, True) + except Exception: + pass + if enable_mixed: + try: + from tensorflow.keras import mixed_precision + + mixed_precision.set_global_policy("mixed_float16") + except Exception: + pass + else: + if requested_gpu: + logging.warning("Requested GPU for CV but none is available.") + except Exception: + pass features_df = self.feature_extractor.extract_features(X) X_prepared = self.prepare_features(features_df) y_encoded = self.label_encoder.transform(y) @@ -165,6 +223,33 @@ class NeuralNetworkModel(BaseModel): """Generate learning curve data for the model""" logging.info(f"Generating learning curve for {self.__class__.__name__}") + # Ensure TF GPU/mixed-precision config also applies here + try: + import tensorflow as tf + + requested_gpu = bool(self.config.model_params.get("use_gpu", False)) + enable_mixed = bool(self.config.model_params.get("mixed_precision", False)) + + gpus = tf.config.list_physical_devices("GPU") + if gpus: + for gpu in gpus: + try: + tf.config.experimental.set_memory_growth(gpu, True) + except Exception: + pass + if enable_mixed: + try: + from tensorflow.keras import mixed_precision + + 
mixed_precision.set_global_policy("mixed_float16") + except Exception: + pass + else: + if requested_gpu: + logging.warning("Requested GPU for learning curve but none is available.") + except Exception: + pass + if train_sizes is None: train_sizes = [0.1, 0.3, 0.5, 0.7, 1.0] diff --git a/research/statistics/__init__.py b/research/statistics/__init__.py index b5a77d9..c24a7ba 100644 --- a/research/statistics/__init__.py +++ b/research/statistics/__init__.py @@ -1 +1 @@ -LETTERS = 'abcdefghijklmnopqrstuvwxyz' +LETTERS = "abcdefghijklmnopqrstuvwxyz" diff --git a/research/statistics/plots.py b/research/statistics/plots.py index ff35298..051feb1 100644 --- a/research/statistics/plots.py +++ b/research/statistics/plots.py @@ -8,11 +8,7 @@ from research.statistics.utils import LETTERS, build_letter_frequencies def plot_transition_matrix(ax, df_probs, title=""): hm = sns.heatmap( - df_probs.loc[list(LETTERS), list(LETTERS)], - cmap="Reds", - annot=False, - cbar=False, - ax=ax + df_probs.loc[list(LETTERS), list(LETTERS)], cmap="Reds", annot=False, cbar=False, ax=ax ) ax.set_title(title, fontsize=12) return hm @@ -20,8 +16,8 @@ def plot_transition_matrix(ax, df_probs, title=""): def plot_letter_frequencies(males, females, sort_values=False, title=None): # Compute frequencies - L_m = build_letter_frequencies(males['name']).set_index("letter")["freq"] - L_f = build_letter_frequencies(females['name']).set_index("letter")["freq"] + L_m = build_letter_frequencies(males["name"]).set_index("letter")["freq"] + L_f = build_letter_frequencies(females["name"]).set_index("letter")["freq"] # Combine into one DataFrame df_plot = pd.DataFrame({"Male": L_m, "Female": L_f}).fillna(0).reset_index() @@ -35,8 +31,8 @@ def plot_letter_frequencies(males, females, sort_values=False, title=None): x = np.arange(len(df_plot)) w = 0.4 fig, ax = plt.subplots(figsize=(16, 6)) - ax.bar(x - w/2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8) - ax.bar(x + w/2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8) + ax.bar(x - w / 2, df_plot["Male"], width=w, label="Male", color="steelblue", alpha=0.8) + ax.bar(x + w / 2, df_plot["Female"], width=w, label="Female", color="salmon", alpha=0.8) ax.set_xticks(x) ax.set_xticklabels(df_plot["letter"]) diff --git a/research/statistics/utils.py b/research/statistics/utils.py index 901fffd..25f2248 100644 --- a/research/statistics/utils.py +++ b/research/statistics/utils.py @@ -9,9 +9,10 @@ from scipy.spatial.distance import euclidean from scipy.stats import entropy from typing import Dict, Any -LETTERS = 'abcdefghijklmnopqrstuvwxyz' -START_TOKEN = '^' -END_TOKEN = '$' +LETTERS = "abcdefghijklmnopqrstuvwxyz" +START_TOKEN = "^" +END_TOKEN = "$" + def normalize_letters(s): """Normalize accents -> ascii, lowercase, keep only a-z.""" @@ -27,41 +28,28 @@ def build_category_distribution(df: pd.DataFrame) -> pd.DataFrame: return ( df.groupby("province")["identified_category"] .value_counts(normalize=True) # get proportions - .unstack(fill_value=0) # reshape into columns per word count + .unstack(fill_value=0) # reshape into columns per word count ) def build_words_token(df: pd.DataFrame, source: str, target: str) -> pd.DataFrame: # Normalize + split once (vectorized) - s = df[source].fillna('').astype(str) - s = ( - s.str.lower() - .str.replace(r"[^\w'\-]+", " ", regex=True) - .str.strip() - .str.split() - ) + s = df[source].fillna("").astype(str) + s = s.str.lower().str.replace(r"[^\w'\-]+", " ", regex=True).str.strip().str.split() # Explode the token list 
into rows under `target` - out = ( - df.assign(**{target: s}) - .explode(target, ignore_index=True) - ) + out = df.assign(**{target: s}).explode(target, ignore_index=True) # Drop NA/empty tokens and strip whitespace out[target] = out[target].astype(str).str.strip() - out = out[out[target].ne('')].dropna(subset=[target]).reset_index(drop=True) + out = out[out[target].ne("")].dropna(subset=[target]).reset_index(drop=True) return out def build_letter_frequencies(series: pd.Series) -> pd.DataFrame: # Normalize: lowercase, remove non-letters, concatenate all into one string - s = ( - series.astype(str) - .str.lower() - .str.replace(r'[^a-z]', '', regex=True) - .str.cat(sep='') - ) + s = series.astype(str).str.lower().str.replace(r"[^a-z]", "", regex=True).str.cat(sep="") # Convert string into Series of characters chars = pd.Series(list(s)) @@ -82,11 +70,7 @@ def build_letter_frequencies(series: pd.Series) -> pd.DataFrame: def build_transition_probabilities(names: pd.Series, alpha: float = 0.0) -> dict: # 1) Normalize - names = ( - names.astype(str) - .str.lower() - .str.replace(fr"[^{LETTERS}]", "", regex=True) - ) + names = names.astype(str).str.lower().str.replace(rf"[^{LETTERS}]", "", regex=True) names = names[names.str.len() > 0] # 2) Prepare sequences @@ -130,7 +114,7 @@ def build_transition_probabilities(names: pd.Series, alpha: float = 0.0) -> dict # 11) DataFrames df_counts = pd.DataFrame(counts, index=tokens, columns=tokens) - df_probs = pd.DataFrame(probs, index=tokens, columns=tokens) + df_probs = pd.DataFrame(probs, index=tokens, columns=tokens) return { "tokens": tokens, @@ -142,7 +126,11 @@ def build_transition_probabilities(names: pd.Series, alpha: float = 0.0) -> dict } -def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_transitions: Dict[str, Any], n_permutations: int = 1000) -> pd.DataFrame: +def build_transition_comparisons( + names_transitions: Dict[str, Any], + surnames_transitions: Dict[str, Any], + n_permutations: int = 1000, +) -> pd.DataFrame: """ Compares letter transition probability matrices for names and surnames using various distance metrics and a permutation test for statistical significance. 
@@ -150,23 +138,20 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra # Helper function to flatten and smooth matrices def prepare_data(data): - return { - 'm': data['m']['probs'].flatten(), - 'f': data['f']['probs'].flatten() - } + return {"m": data["m"]["probs"].flatten(), "f": data["f"]["probs"].flatten()} prepared_names = prepare_data(names_transitions) prepared_surnames = prepare_data(surnames_transitions) # Distance Metrics - names_l2 = euclidean(prepared_names['m'], prepared_names['f']) - surnames_l2 = euclidean(prepared_surnames['m'], prepared_surnames['f']) + names_l2 = euclidean(prepared_names["m"], prepared_names["f"]) + surnames_l2 = euclidean(prepared_surnames["m"], prepared_surnames["f"]) - kl_names_mf = entropy(prepared_names['m'] + 1e-12, prepared_names['f'] + 1e-12) - kl_names_fm = entropy(prepared_names['f'] + 1e-12, prepared_names['m'] + 1e-12) + kl_names_mf = entropy(prepared_names["m"] + 1e-12, prepared_names["f"] + 1e-12) + kl_names_fm = entropy(prepared_names["f"] + 1e-12, prepared_names["m"] + 1e-12) - kl_surnames_mf = entropy(prepared_surnames['m'] + 1e-12, prepared_surnames['f'] + 1e-12) - kl_surnames_fm = entropy(prepared_surnames['f'] + 1e-12, prepared_surnames['m'] + 1e-12) + kl_surnames_mf = entropy(prepared_surnames["m"] + 1e-12, prepared_surnames["f"] + 1e-12) + kl_surnames_fm = entropy(prepared_surnames["f"] + 1e-12, prepared_surnames["m"] + 1e-12) jsd_names = 0.5 * (kl_names_mf + kl_names_fm) jsd_surnames = 0.5 * (kl_surnames_mf + kl_surnames_fm) @@ -174,15 +159,15 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra # Permutation Test def run_permutation_test(transitions): # Flattened probabilities for male and female - P_m = transitions['m']['probs'].flatten() - P_f = transitions['f']['probs'].flatten() + P_m = transitions["m"]["probs"].flatten() + P_f = transitions["f"]["probs"].flatten() # Calculate the observed JSD (our test statistic) observed_jsd = 0.5 * (entropy(P_m + 1e-12, P_f + 1e-12) + entropy(P_f + 1e-12, P_m + 1e-12)) # Concatenate male and female counts - counts_m = transitions['m']['counts'] - counts_f = transitions['f']['counts'] + counts_m = transitions["m"]["counts"] + counts_f = transitions["f"]["counts"] all_counts = np.concatenate((counts_m, counts_f), axis=1) total_counts = counts_m.shape[1] + counts_f.shape[1] @@ -194,17 +179,27 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra # Note: This is a simplified approach, assuming counts are # structured per name. A more robust implementation would # shuffle the actual names themselves. 
- permuted_counts_m = all_counts[:, shuffled_indices[:counts_m.shape[1]]] - permuted_counts_f = all_counts[:, shuffled_indices[counts_m.shape[1]:]] + permuted_counts_m = all_counts[:, shuffled_indices[: counts_m.shape[1]]] + permuted_counts_f = all_counts[:, shuffled_indices[counts_m.shape[1] :]] # Re-calculate probabilities and JSD for the permuted groups # Add a small epsilon to the denominator to prevent division by zero epsilon = 1e-12 - permuted_probs_m = permuted_counts_m / (permuted_counts_m.sum(axis=0, keepdims=True) + epsilon) - permuted_probs_f = permuted_counts_f / (permuted_counts_f.sum(axis=0, keepdims=True) + epsilon) + permuted_probs_m = permuted_counts_m / ( + permuted_counts_m.sum(axis=0, keepdims=True) + epsilon + ) + permuted_probs_f = permuted_counts_f / ( + permuted_counts_f.sum(axis=0, keepdims=True) + epsilon + ) - permuted_jsd = 0.5 * (entropy(permuted_probs_m.mean(axis=1) + 1e-12, permuted_probs_f.mean(axis=1) + 1e-12) + - entropy(permuted_probs_f.mean(axis=1) + 1e-12, permuted_probs_m.mean(axis=1) + 1e-12)) + permuted_jsd = 0.5 * ( + entropy( + permuted_probs_m.mean(axis=1) + 1e-12, permuted_probs_f.mean(axis=1) + 1e-12 + ) + + entropy( + permuted_probs_f.mean(axis=1) + 1e-12, permuted_probs_m.mean(axis=1) + 1e-12 + ) + ) permuted_jsds.append(permuted_jsd) # Calculate the p-value @@ -214,39 +209,39 @@ def build_transition_comparisons(names_transitions: Dict[str, Any], surnames_tra names_p_value = run_permutation_test(names_transitions) surnames_p_value = run_permutation_test(surnames_transitions) - out = pd.DataFrame({ - "l2": [names_l2, surnames_l2], - "kl_mf": [kl_names_mf, kl_surnames_mf], - "kl_fm": [kl_names_fm, kl_surnames_fm], - "jsd": [jsd_names, jsd_surnames], - "permutation_p_value": [names_p_value, surnames_p_value] - }, index=["names", "surnames"]) + out = pd.DataFrame( + { + "l2": [names_l2, surnames_l2], + "kl_mf": [kl_names_mf, kl_surnames_mf], + "kl_fm": [kl_names_fm, kl_surnames_fm], + "jsd": [jsd_names, jsd_surnames], + "permutation_p_value": [names_p_value, surnames_p_value], + }, + index=["names", "surnames"], + ) return out + import pandas as pd from collections import Counter from typing import Literal def build_ngrams_count( - df: pd.DataFrame, - n: int, - where: Literal["any", "prefix", "suffix"] = "any", + df: pd.DataFrame, + n: int, + where: Literal["any", "prefix", "suffix"] = "any", ) -> pd.DataFrame: # Normalize and clean to a–z - names = ( - df["name"].astype(str) - .str.lower() - .str.replace(r"[^a-z]", "", regex=True) - ) + names = df["name"].astype(str).str.lower().str.replace(r"[^a-z]", "", regex=True) ngrams = [] if where == "any": for s in names: L = len(s) if L >= n: - ngrams.extend(s[i:i+n] for i in range(L - n + 1)) + ngrams.extend(s[i : i + n] for i in range(L - n + 1)) elif where == "prefix": for s in names: if len(s) >= n: