# Source: drc-ners-nlp/research/neural_network_model.py
# (file-viewer snapshot 2025-09-29 22:52:08 +02:00; 372 lines, 14 KiB, Python)

import logging
from abc import abstractmethod
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from research.base_model import BaseModel
from research.experiment.feature_extractor import FeatureExtractor
class NeuralNetworkModel(BaseModel):
    """Base class for neural network models (TensorFlow/Keras).

    Subclasses implement :meth:`build_model_with_vocab`; this class drives
    feature extraction, label encoding, training, cross-validation and
    learning-curve generation around the model that method returns.

    The class relies on attributes and methods that are not defined in this
    file (``config``, ``feature_extractor``, ``label_encoder``, ``tokenizer``,
    ``prepare_features``); they are presumably provided by ``BaseModel`` or by
    the concrete subclass -- verify there.
    """

    # Sequence length handed to ``build_model_with_vocab`` by the evaluation
    # helpers when ``model_params`` does not define ``max_len`` itself.
    _DEFAULT_MAX_LEN = 6

    @property
    def architecture(self) -> str:
        """Return the architecture family identifier of this model."""
        return "neural_network"

    @abstractmethod
    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
        """Build neural network model with known vocabulary size"""
        pass

    # ------------------------------------------------------------------
    # Shared private helpers
    # ------------------------------------------------------------------
    def _configure_tf_runtime(self, no_gpu_warning: str) -> None:
        """Best-effort TensorFlow GPU configuration.

        Enables memory growth on every visible GPU and, when the model
        parameter ``mixed_precision`` is truthy, switches the global Keras
        policy to ``mixed_float16``. Nothing raised here propagates: a missing
        TensorFlow install or an already-initialised runtime must not abort
        the calling workflow.

        Args:
            no_gpu_warning: Message logged at WARNING level when the model
                parameter ``use_gpu`` is truthy but no GPU device is found.
        """
        try:
            import tensorflow as tf  # Imported lazily to avoid dependency for non-NN runs

            model_params = self.config.model_params
            requested_gpu = bool(model_params.get("use_gpu", False))
            enable_mixed = bool(model_params.get("mixed_precision", False))

            gpus = tf.config.list_physical_devices("GPU")
            if not gpus:
                if requested_gpu:
                    logging.warning(no_gpu_warning)
                return

            for gpu in gpus:
                try:
                    # Avoid pre-allocating all VRAM.
                    tf.config.experimental.set_memory_growth(gpu, True)
                except Exception as e:
                    # Expected on repeated calls once the devices are initialised.
                    logging.debug(f"Could not enable memory growth on {gpu}: {e}")

            if enable_mixed:
                try:
                    from tensorflow.keras import mixed_precision

                    mixed_precision.set_global_policy("mixed_float16")
                    logging.info("Enabled TensorFlow mixed precision (float16)")
                except Exception as e:
                    logging.warning(f"Could not enable mixed precision: {e}")
        except Exception as e:
            # Keep quiet in non-TF environments / non-NN workflows
            logging.debug(f"TensorFlow GPU setup skipped: {e}")

    def _current_vocab_size(self) -> int:
        """Return ``len(tokenizer.word_index) + 1``, or 1000 when no tokenizer is set."""
        if self.tokenizer:
            return len(self.tokenizer.word_index) + 1
        return 1000

    def _build_fresh_model(self, vocab_size: int) -> Any:
        """Build a new, untrained model for one evaluation run.

        ``max_len`` is merged into the keyword arguments instead of being
        passed next to ``**model_params``; passing both raised ``TypeError``
        (duplicate keyword) whenever ``model_params`` already defined it.
        """
        build_kwargs = dict(self.config.model_params)
        build_kwargs.setdefault("max_len", self._DEFAULT_MAX_LEN)
        return self.build_model_with_vocab(vocab_size=vocab_size, **build_kwargs)

    def _prepare_fitted_inputs(
        self, X: pd.DataFrame, y: pd.Series
    ) -> "tuple[np.ndarray, np.ndarray]":
        """Turn raw inputs into sanitized model inputs and encoded labels.

        Raises:
            RuntimeError: If the feature extractor or label encoder has not
                been set up yet (``fit`` is what initialises them here).
        """
        if self.feature_extractor is None or self.label_encoder is None:
            raise RuntimeError(
                f"{self.__class__.__name__} must be fitted before it can be evaluated "
                "(feature extractor / label encoder not initialised)."
            )
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self._sanitize_sequences(self.prepare_features(features_df))
        y_encoded = self.label_encoder.transform(y)
        return X_prepared, y_encoded

    @staticmethod
    def _predict_class_indices(model: Any, inputs: np.ndarray) -> np.ndarray:
        """Predict with ``model``; reduce 2-D+ outputs to labels via ``argmax(axis=1)``."""
        predictions = model.predict(inputs)
        if len(predictions.shape) > 1:
            predictions = predictions.argmax(axis=1)
        return predictions

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaseModel":
        """Fit the neural network model with deferred building.

        The model is built only after feature preparation, because the
        vocabulary size is read from the tokenizer at that point.

        Args:
            X: Raw input records passed to the feature extractor.
            y: Target labels; encoded with a ``LabelEncoder`` (fitted here if
                none exists yet, otherwise the existing encoder is reused).

        Returns:
            ``self``, with ``model``, ``training_history`` and ``is_fitted`` set.
        """
        logging.info(f"Training {self.__class__.__name__}")
        self._configure_tf_runtime("Requested GPU but no TensorFlow GPU device is available.")

        # Setup feature extraction
        if self.feature_extractor is None:
            self.feature_extractor = FeatureExtractor(
                self.config.features, self.config.feature_params
            )

        # Extract and prepare features (this will also initialize tokenizer)
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
        # Sanitize any out-of-range indices to avoid embedding scatter errors
        X_prepared = self._sanitize_sequences(X_prepared)

        # Encode labels
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)

        # Now we can build the model with known vocab size
        vocab_size = self._current_vocab_size()
        logging.info(f"Vocabulary size: {vocab_size}")
        model_params = self.config.model_params
        self.model = self.build_model_with_vocab(vocab_size=vocab_size, **model_params)

        logging.info(
            f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
        )
        logging.info(X_prepared[0])
        logging.info(f"Model parameters: {self.config.model_params}")

        # NOTE(review): the batch-size default is 64 here but 32 in the
        # evaluation methods -- confirm whether that difference is intended.
        history = self.model.fit(
            X_prepared,
            y_encoded,
            epochs=model_params.get("epochs", 10),
            batch_size=model_params.get("batch_size", 64),
            validation_split=model_params.get("validation_split", 0.1),
            verbose=2,
        )

        # Store training history. ``.get`` keeps a finished training run from
        # failing just because a metric was not recorded by the model.
        recorded = history.history
        self.training_history = {
            "accuracy": recorded.get("accuracy", []),
            "loss": recorded.get("loss", []),
            "val_accuracy": recorded.get("val_accuracy", []),
            "val_loss": recorded.get("val_loss", []),
        }
        self.is_fitted = True
        return self

    def _sanitize_sequences(self, sequences: np.ndarray) -> np.ndarray:
        """Clamp invalid token indices to OOV and ensure int32 dtype.

        This prevents rare cases where malformed inputs or dtype issues
        introduce large or negative indices which can trigger
        TensorScatterUpdate errors during embedding updates on GPU.

        The input is never modified in place: replacements are written to a
        new array. Sanitization is best-effort; if anything fails, the
        original ``sequences`` object is returned unchanged.

        Args:
            sequences: Token-index sequences, or ``None``.

        Returns:
            An ``int32`` array with every index outside ``[0, max_idx]``
            replaced by the OOV index, or ``sequences`` itself when it is
            ``None`` or sanitization failed.
        """
        if sequences is None:
            return sequences
        try:
            arr = np.asarray(sequences)
            # Ensure integer dtype for embedding lookups
            if not np.issubdtype(arr.dtype, np.integer):
                arr = arr.astype(np.int64, copy=False)

            word_index = getattr(self.tokenizer, "word_index", None)
            if word_index is not None:
                # Use the actual max index present in the tokenizer mapping
                max_idx = max(word_index.values()) if word_index else 0
                # OOV token index if available, else fall back to 1
                oov_index = word_index.get(getattr(self.tokenizer, "oov_token", "<OOV>"), 1)
                # With an empty vocabulary the fallback 1 would itself be out
                # of range; never substitute an index above max_idx.
                oov_index = min(oov_index, max_idx)

                # Keep zeros (padding) untouched; clamp negatives and > max_idx to OOV
                invalid_mask = ((arr < 0) | (arr > max_idx)) & (arr != 0)
                if invalid_mask.any():
                    # np.where builds a new array, so the caller's data (and
                    # read-only inputs) are left alone.
                    arr = np.where(invalid_mask, oov_index, arr)

            # Use int32 for TF embedding ops compatibility
            return arr.astype(np.int32, copy=False)
        except Exception as e:
            logging.debug(f"Sequence sanitization skipped due to: {e}")
            return sequences

    def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
        """Combine configured textual features into one string per record.

        Only configured features whose ``.value`` is a column of ``X`` are
        used. Missing values become empty strings; with several columns the
        non-empty values of each row are joined with a single space.

        Raises:
            ValueError: If none of the configured features is a column of ``X``.
        """
        column_names = [
            feature.value for feature in self.config.features if feature.value in X.columns
        ]
        if not column_names:
            raise ValueError("No configured text features found in the provided DataFrame.")

        text_frame = X[column_names].fillna("").astype(str)
        if len(column_names) == 1:
            return text_frame.iloc[:, 0].tolist()

        return [
            " ".join(value for value in row if value)
            for row in text_frame.itertuples(index=False)
        ]

    # ------------------------------------------------------------------
    # Evaluation
    # ------------------------------------------------------------------
    def cross_validate(
        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> dict[str, np.floating[Any]]:
        """Run stratified k-fold cross-validation with a fresh model per fold.

        Args:
            X: Raw input records.
            y: Target labels (must be known to the already-fitted label encoder).
            cv_folds: Number of stratified folds.

        Returns:
            Mean ``accuracy``, ``precision``, ``recall`` and ``f1`` across
            folds (precision/recall/F1 use weighted averaging).

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        # Ensure TF GPU/mixed-precision config also applies to CV runs
        self._configure_tf_runtime("Requested GPU for CV but none is available.")

        X_prepared, y_encoded = self._prepare_fitted_inputs(X, y)
        splitter = StratifiedKFold(
            n_splits=cv_folds, shuffle=True, random_state=self.config.random_seed
        )

        vocab_size = self._current_vocab_size()
        epochs = self.config.model_params.get("epochs", 10)
        batch_size = self.config.model_params.get("batch_size", 32)

        fold_metrics: Dict[str, list] = {"accuracy": [], "precision": [], "recall": [], "f1": []}
        for train_idx, val_idx in splitter.split(X_prepared, y_encoded):
            # Create fresh model for each fold so no weights leak between folds
            fold_model = self._build_fresh_model(vocab_size)
            if hasattr(fold_model, "fit"):
                fold_model.fit(
                    X_prepared[train_idx],
                    y_encoded[train_idx],
                    epochs=epochs,
                    batch_size=batch_size,
                    verbose=0,
                )

            y_true = y_encoded[val_idx]
            y_pred = self._predict_class_indices(fold_model, X_prepared[val_idx])

            prec, rec, f1, _ = precision_recall_fscore_support(
                y_true, y_pred, average="weighted"
            )
            fold_metrics["accuracy"].append(accuracy_score(y_true, y_pred))
            fold_metrics["precision"].append(prec)
            fold_metrics["recall"].append(rec)
            fold_metrics["f1"].append(f1)

        return {name: np.mean(values) for name, values in fold_metrics.items()}

    def generate_learning_curve(
        self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model.

        A stratified 20% validation split is held out once. For every
        requested fraction of the remaining data, three fresh models are
        trained on the same random subset and their train/validation accuracy
        is summarised as mean and standard deviation. Fractions that yield
        fewer than 10 training samples are skipped.

        Args:
            X: Raw input records.
            y: Target labels (must be known to the already-fitted label encoder).
            train_sizes: Fractions of the training split to use; defaults to
                ``[0.1, 0.3, 0.5, 0.7, 1.0]``.

        Returns:
            Dict with ``train_sizes`` (absolute sample counts),
            ``train_scores``, ``val_scores``, ``train_scores_std`` and
            ``val_scores_std``. Also stored on ``self.learning_curve_data``.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        logging.info(f"Generating learning curve for {self.__class__.__name__}")
        # Ensure TF GPU/mixed-precision config also applies here
        self._configure_tf_runtime("Requested GPU for learning curve but none is available.")

        if train_sizes is None:
            train_sizes = [0.1, 0.3, 0.5, 0.7, 1.0]

        learning_curve_data: Dict[str, list] = {
            "train_sizes": [],
            "train_scores": [],
            "val_scores": [],
            "train_scores_std": [],
            "val_scores_std": [],
        }

        # Prepare features and get vocabulary size
        X_prepared, y_encoded = self._prepare_fitted_inputs(X, y)
        vocab_size = self._current_vocab_size()
        epochs = self.config.model_params.get("epochs", 10)
        batch_size = self.config.model_params.get("batch_size", 32)

        # Split data once for validation
        X_train_full, X_val, y_train_full, y_val = train_test_split(
            X_prepared,
            y_encoded,
            test_size=0.2,
            random_state=self.config.random_seed,
            stratify=y_encoded,
        )

        # Seeded generator: subset sampling follows config.random_seed instead
        # of the unseeded global NumPy RNG, so curves are reproducible.
        rng = np.random.default_rng(self.config.random_seed)
        runs_per_size = 3  # repeated runs give the variance estimate

        for size in train_sizes:
            train_size = int(len(X_train_full) * size)
            if train_size < 10:  # Minimum training size
                continue

            # Sample training data
            indices = rng.choice(len(X_train_full), train_size, replace=False)
            X_train_subset = X_train_full[indices]
            y_train_subset = y_train_full[indices]

            train_scores = []
            val_scores = []
            for _ in range(runs_per_size):
                model = self._build_fresh_model(vocab_size)
                if hasattr(model, "fit"):
                    model.fit(
                        X_train_subset,
                        y_train_subset,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(X_val, y_val),
                        verbose=0,
                    )

                train_pred = self._predict_class_indices(model, X_train_subset)
                val_pred = self._predict_class_indices(model, X_val)
                train_scores.append(accuracy_score(y_train_subset, train_pred))
                val_scores.append(accuracy_score(y_val, val_pred))

            learning_curve_data["train_sizes"].append(train_size)
            learning_curve_data["train_scores"].append(np.mean(train_scores))
            learning_curve_data["val_scores"].append(np.mean(val_scores))
            learning_curve_data["train_scores_std"].append(np.std(train_scores))
            learning_curve_data["val_scores_std"].append(np.std(val_scores))

        self.learning_curve_data = learning_curve_data
        return learning_curve_data