fix: add github workflow

This commit is contained in:
2025-10-07 23:21:35 +02:00
parent d3b3840278
commit f2ac0c9769
25 changed files with 307 additions and 89 deletions
+35
View File
@@ -0,0 +1,35 @@
# Security audit workflow: runs the Bandit security linter on every push to
# `main` and on every pull request.
name: audit

on:
  push:
    branches:
      - main
  pull_request:

jobs:
  bandit:
    name: bandit
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install uv
        run: curl -LsSf https://astral.sh/uv/install.sh | sh

      # Cache uv's download cache and the project virtualenv. The key is tied
      # to the lockfile hash; `restore-keys` lets a run fall back to an older
      # cache with the same OS prefix when the lockfile has changed.
      - name: Cache uv dependencies
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/uv
            .venv
          key: ${{ runner.os }}-uv-${{ hashFiles('**/uv.lock') }}
          restore-keys: |
            ${{ runner.os }}-uv-

      - name: Sync dependencies (with dev tools)
        run: uv sync --dev

      # The audit is advisory: a failing Bandit run must not fail the job.
      # `continue-on-error` keeps that behaviour while letting the step report
      # its real outcome. The previous `|| true` forced exit status 0, which
      # also hid the case where Bandit could not be started at all.
      # NOTE(review): confirm that `bandit` is actually installed by
      # `uv sync --dev` and that pyproject.toml carries a Bandit configuration.
      - name: Run Bandit (security linter)
        continue-on-error: true
        run: uv run bandit -r . -c pyproject.toml
+40
View File
@@ -0,0 +1,40 @@
# Code-quality workflow: lint, format check and static type check, run on
# every push to `main` and on every pull request.
name: quality

on:
  push:
    branches: [main]
  pull_request:

jobs:
  lint:
    name: ruff and pyright
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install uv
        run: curl -LsSf https://astral.sh/uv/install.sh | sh

      # Reuse uv's download cache and the virtualenv between runs; the exact
      # key follows the lockfile hash, with an OS-prefix fallback.
      - name: Cache uv dependencies
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/uv
            .venv
          key: ${{ runner.os }}-uv-${{ hashFiles('**/uv.lock') }}
          restore-keys: |
            ${{ runner.os }}-uv-

      - name: Sync dependencies (with dev tools)
        run: uv sync --dev

      # Lint first, then verify formatting without rewriting any file.
      - name: Run Ruff (lint + format checks)
        run: |
          uv run ruff check .
          uv run ruff format --check .

      - name: Run Pyright (type checks)
        run: uv run pyright
+16
View File
@@ -37,5 +37,21 @@ build-backend = "uv_build"
[dependency-groups] [dependency-groups]
dev = [ dev = [
"ipykernel>=6.30.1", "ipykernel>=6.30.1",
"pyright>=1.1.406",
"pytest>=8.4.2",
"ruff>=0.13.3", "ruff>=0.13.3",
] ]
[tool.pyright]
pythonVersion = "3.11"
typeCheckingMode = "basic"
reportMissingImports = "none"
reportMissingModuleSource = "none"
useLibraryCodeForTypes = true
include = ["src"]
[tool.ruff]
# Keep defaults and additionally ignore notebooks
extend-exclude = [
"**/*.ipynb",
]
+23 -4
View File
@@ -118,12 +118,31 @@ def research_train(
exp_cfg = exp_builder.find_template(tmpl, name, type) exp_cfg = exp_builder.find_template(tmpl, name, type)
trainer = ModelTrainer(cfg) trainer = ModelTrainer(cfg)
# Validate and coerce template fields to expected types for type safety
model_name = exp_cfg.get("name")
model_type = exp_cfg.get("model_type")
features = exp_cfg.get("features")
tags = exp_cfg.get("tags", [])
if not isinstance(model_name, str) or not isinstance(model_type, str):
raise typer.BadParameter(
"Template must include 'name' and 'model_type' as strings"
)
if features is None:
features = ["full_name"]
elif not isinstance(features, list):
raise typer.BadParameter("Template 'features' must be a list of strings")
if not isinstance(tags, list):
tags = []
trainer.train_single_model( trainer.train_single_model(
model_name=exp_cfg.get("name"), model_name=model_name,
model_type=exp_cfg.get("model_type"), model_type=model_type,
features=exp_cfg.get("features"), features=features,
model_params=exp_cfg.get("model_params", {}), model_params=exp_cfg.get("model_params", {}),
tags=exp_cfg.get("tags", []), tags=tags,
) )
+5 -3
View File
@@ -16,13 +16,13 @@ def get_config() -> PipelineConfig:
def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig: def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig:
"""Load configuration from specified path""" """Load configuration from specified path"""
if config_path: if config_path is not None:
return config_manager.load_config(Path(config_path)) return config_manager.load_config(config_path)
return config_manager.get_config() return config_manager.get_config()
def setup_config( def setup_config(
config_path: Optional[Path] = None, env: str = "development" config_path: Optional[Union[str, Path]] = None, env: str = "development"
) -> PipelineConfig: ) -> PipelineConfig:
""" """
Unified configuration loading and logging setup for all entrypoint scripts. Unified configuration loading and logging setup for all entrypoint scripts.
@@ -37,6 +37,8 @@ def setup_config(
# Determine config path # Determine config path
if config_path is None: if config_path is None:
config_path = Path("config") / f"pipeline.{env}.yaml" config_path = Path("config") / f"pipeline.{env}.yaml"
else:
config_path = Path(config_path)
# Load configuration # Load configuration
config = ConfigManager(config_path).load_config() config = ConfigManager(config_path).load_config()
+14 -8
View File
@@ -13,7 +13,9 @@ class ConfigManager:
"""Centralized configuration management""" """Centralized configuration management"""
def __init__(self, config_path: Optional[Union[str, Path]] = None): def __init__(self, config_path: Optional[Union[str, Path]] = None):
self.config_path = config_path or self._find_config_file() self.config_path: Path = (
Path(config_path) if config_path is not None else self._find_config_file()
)
self._config: Optional[PipelineConfig] = None self._config: Optional[PipelineConfig] = None
self._setup_default_paths() self._setup_default_paths()
@@ -47,10 +49,12 @@ class ConfigManager:
checkpoints_dir=root_dir / "data" / "checkpoints", checkpoints_dir=root_dir / "data" / "checkpoints",
) )
def load_config(self, config_path: Optional[Path] = None) -> PipelineConfig: def load_config(
self, config_path: Optional[Union[str, Path]] = None
) -> PipelineConfig:
"""Load configuration from file""" """Load configuration from file"""
if config_path: if config_path is not None:
self.config_path = config_path self.config_path = Path(config_path)
if not self.config_path.exists(): if not self.config_path.exists():
logging.warning( logging.warning(
@@ -80,9 +84,11 @@ class ConfigManager:
"""Create default configuration""" """Create default configuration"""
return PipelineConfig(paths=self.default_paths) return PipelineConfig(paths=self.default_paths)
def save_config(self, config: PipelineConfig, path: Optional[Path] = None): def save_config(
self, config: PipelineConfig, path: Optional[Union[str, Path]] = None
):
"""Save configuration to file""" """Save configuration to file"""
save_path = path or self.config_path save_path = Path(path) if path is not None else self.config_path
save_path.parent.mkdir(parents=True, exist_ok=True) save_path.parent.mkdir(parents=True, exist_ok=True)
config_dict = config.model_dump() config_dict = config.model_dump()
@@ -142,8 +148,8 @@ class ConfigManager:
env_config = self.load_config(env_config_path) env_config = self.load_config(env_config_path)
# Merge configurations # Merge configurations
base_dict = base_config.dict() base_dict = base_config.model_dump()
env_dict = env_config.dict() env_dict = env_config.model_dump()
self._deep_update(base_dict, env_dict) self._deep_update(base_dict, env_dict)
return PipelineConfig(**base_dict) return PipelineConfig(**base_dict)
+2 -2
View File
@@ -260,9 +260,9 @@ class NameTagger:
# Remove overlaps # Remove overlaps
filtered, last_end = [], -1 filtered, last_end = [], -1
for s, e, l in valid: for s, e, label in valid:
if s >= last_end: if s >= last_end:
filtered.append((s, e, l)) filtered.append((s, e, label))
last_end = e last_end = e
return filtered return filtered
+1 -1
View File
@@ -19,7 +19,7 @@ class PipelineState:
processed_batches: int = 0 processed_batches: int = 0
total_batches: int = 0 total_batches: int = 0
failed_batches: List[int] = None failed_batches: Optional[List[int]] = None
last_checkpoint: Optional[str] = None last_checkpoint: Optional[str] = None
def __post_init__(self): def __post_init__(self):
@@ -21,7 +21,7 @@ class DataSelectionStep(PipelineStep):
if "region" in batch.columns and "year" in batch.columns: if "region" in batch.columns and "year" in batch.columns:
target_years = {2015, 2021, 2022} target_years = {2015, 2021, 2022}
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin( mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(
target_years list(target_years)
) )
removed = int(mask_remove.sum()) removed = int(mask_remove.sum())
if removed: if removed:
@@ -29,8 +29,8 @@ class FeatureExtractionStep(PipelineStep):
self.region_mapper = RegionMapper() self.region_mapper = RegionMapper()
self.name_tagger = NameTagger() self.name_tagger = NameTagger()
@classmethod @property
def requires_batch_mutation(cls) -> bool: def requires_batch_mutation(self) -> bool:
"""This step creates new columns, so mutation is required""" """This step creates new columns, so mutation is required"""
return True return True
+44 -26
View File
@@ -1,6 +1,6 @@
import logging import logging
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, List from typing import Dict, Any, Optional, List, TYPE_CHECKING, Union
import joblib import joblib
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
@@ -9,19 +9,23 @@ import pandas as pd
from ners.research.experiment import ExperimentConfig from ners.research.experiment import ExperimentConfig
if TYPE_CHECKING:
from ners.research.experiment.feature_extractor import FeatureExtractor
from sklearn.preprocessing import LabelEncoder
class BaseModel(ABC): class BaseModel(ABC):
"""Abstract base class for all models""" """Abstract base class for all models"""
def __init__(self, config: ExperimentConfig): def __init__(self, config: ExperimentConfig):
self.config = config self.config = config
self.model = None self.model: Any | None = None
self.feature_extractor = None self.feature_extractor: "FeatureExtractor | None" = None
self.label_encoder = None self.label_encoder: "LabelEncoder | None" = None
self.tokenizer = None # For neural models self.tokenizer: Any | None = None # For neural models
self.is_fitted = False self.is_fitted: bool = False
self.training_history = {} # Store training history for learning curves self.training_history: Dict[str, Any] = {} # For learning curves
self.learning_curve_data = {} # Store learning curve experiment data self.learning_curve_data: Dict[str, Any] = {}
@property @property
@abstractmethod @abstractmethod
@@ -48,7 +52,7 @@ class BaseModel(ABC):
@abstractmethod @abstractmethod
def generate_learning_curve( def generate_learning_curve(
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = []
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Generate learning curve data for the model""" """Generate learning curve data for the model"""
pass pass
@@ -58,10 +62,17 @@ class BaseModel(ABC):
if not self.is_fitted: if not self.is_fitted:
raise ValueError("Model must be fitted before making predictions") raise ValueError("Model must be fitted before making predictions")
if (
self.feature_extractor is None
or self.model is None
or self.label_encoder is None
):
raise ValueError("Model is not fully initialized for prediction")
features_df = self.feature_extractor.extract_features(X) features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df) X_prepared = self.prepare_features(features_df)
predictions = self.model.predict(X_prepared) predictions: Union[np.ndarray, Any] = self.model.predict(X_prepared)
# Handle different prediction formats # Handle different prediction formats
if hasattr(predictions, "shape") and len(predictions.shape) > 1: if hasattr(predictions, "shape") and len(predictions.shape) > 1:
@@ -75,6 +86,9 @@ class BaseModel(ABC):
if not self.is_fitted: if not self.is_fitted:
raise ValueError("Model must be fitted before making predictions") raise ValueError("Model must be fitted before making predictions")
if self.feature_extractor is None or self.model is None:
raise ValueError("Model is not fully initialized for prediction")
features_df = self.feature_extractor.extract_features(X) features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df) X_prepared = self.prepare_features(features_df)
@@ -83,7 +97,11 @@ class BaseModel(ABC):
elif hasattr(self.model, "predict"): elif hasattr(self.model, "predict"):
# For neural networks that return probabilities directly # For neural networks that return probabilities directly
probabilities = self.model.predict(X_prepared) probabilities = self.model.predict(X_prepared)
if len(probabilities.shape) == 2 and probabilities.shape[1] > 1: if (
hasattr(probabilities, "shape")
and len(probabilities.shape) == 2
and probabilities.shape[1] > 1
):
return probabilities return probabilities
raise NotImplementedError("Model does not support probability predictions") raise NotImplementedError("Model does not support probability predictions")
@@ -91,30 +109,29 @@ class BaseModel(ABC):
def get_feature_importance(self) -> Optional[Dict[str, float]]: def get_feature_importance(self) -> Optional[Dict[str, float]]:
"""Get feature importance if supported by the model""" """Get feature importance if supported by the model"""
if hasattr(self.model, "feature_importances_"): model = self.model
if model is None:
return None
if hasattr(model, "feature_importances_"):
# For tree-based models # For tree-based models
importances = self.model.feature_importances_ importances = model.feature_importances_
feature_names = self._get_feature_names() feature_names = self._get_feature_names()
return dict(zip(feature_names, importances)) return dict(zip(feature_names, importances))
elif hasattr(self.model, "coef_"): elif hasattr(model, "coef_"):
# For linear models # For linear models
coefficients = np.abs(self.model.coef_[0]) coefficients = np.abs(model.coef_[0])
feature_names = self._get_feature_names() feature_names = self._get_feature_names()
return dict(zip(feature_names, coefficients)) return dict(zip(feature_names, coefficients))
elif ( elif hasattr(model, "named_steps") and "classifier" in model.named_steps:
hasattr(self.model, "named_steps")
and "classifier" in self.model.named_steps
):
# For sklearn pipelines (like LogisticRegression with vectorizer) # For sklearn pipelines (like LogisticRegression with vectorizer)
classifier = self.model.named_steps["classifier"] classifier = model.named_steps["classifier"]
if hasattr(classifier, "coef_"): if hasattr(classifier, "coef_"):
coefficients = np.abs(classifier.coef_[0]) coefficients = np.abs(classifier.coef_[0])
if hasattr( if hasattr(model.named_steps["vectorizer"], "get_feature_names_out"):
self.model.named_steps["vectorizer"], "get_feature_names_out" feature_names = model.named_steps[
):
feature_names = self.model.named_steps[
"vectorizer" "vectorizer"
].get_feature_names_out() ].get_feature_names_out()
# Take top features to avoid too many n-grams # Take top features to avoid too many n-grams
@@ -127,8 +144,9 @@ class BaseModel(ABC):
def _get_feature_names(self) -> List[str]: def _get_feature_names(self) -> List[str]:
"""Get feature names (override in subclasses if needed)""" """Get feature names (override in subclasses if needed)"""
if hasattr(self.model, "feature_names_in_"): model = self.model
return list(self.model.feature_names_in_) if model is not None and hasattr(model, "feature_names_in_"):
return list(model.feature_names_in_)
return [f"feature_{i}" for i in range(100)] # Default fallback return [f"feature_{i}" for i in range(100)] # Default fallback
def save(self, path: str): def save(self, path: str):
+1 -1
View File
@@ -70,7 +70,7 @@ class ExperimentStatus(Enum):
def calculate_metrics( def calculate_metrics(
y_true: np.ndarray, y_pred: np.ndarray, metrics: List[str] = None y_true: np.ndarray, y_pred: np.ndarray, metrics: Optional[List[str]] = None
) -> Dict[str, float]: ) -> Dict[str, float]:
"""Calculate specified metrics""" """Calculate specified metrics"""
@@ -99,14 +99,24 @@ class ExperimentBuilder:
logging.warning(f"Unknown feature type: {feature_str}") logging.warning(f"Unknown feature type: {feature_str}")
continue continue
name = (
template_config.get("name")
or template_config.get("model_type")
or "experiment"
)
model_type = template_config.get("model_type") or "logistic_regression"
description = template_config.get("description") or ""
return ExperimentConfig( return ExperimentConfig(
name=template_config.get("name"), name=str(name),
description=template_config.get("description"), description=str(description),
model_type=template_config.get("model_type"), model_type=str(model_type),
features=features, features=features,
model_params=template_config.get("model_params", {}), model_params=template_config.get("model_params", {}),
tags=template_config.get("tags", []), tags=template_config.get("tags", []),
test_size=template_config.get("test_size", 0.2), test_size=float(template_config.get("test_size", 0.2)),
cross_validation_folds=template_config.get("cross_validation_folds", 5), cross_validation_folds=int(
template_config.get("cross_validation_folds", 5)
),
train_data_filter=template_config.get("train_data_filter"), train_data_filter=template_config.get("train_data_filter"),
) )
@@ -1,5 +1,5 @@
from enum import Enum from enum import Enum
from typing import List, Dict, Any, Union from typing import List, Dict, Any, Union, Optional
import pandas as pd import pandas as pd
@@ -25,7 +25,9 @@ class FeatureExtractor:
"""Extract different types of features from name data""" """Extract different types of features from name data"""
def __init__( def __init__(
self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None self,
feature_types: List[FeatureType],
feature_params: Optional[Dict[str, Any]] = None,
): ):
self.feature_types = feature_types self.feature_types = feature_types
self.feature_params = feature_params or {} self.feature_params = feature_params or {}
+5 -5
View File
@@ -1,7 +1,7 @@
import json import json
import logging import logging
from datetime import datetime from datetime import datetime
from typing import List, Dict, Any from typing import List, Dict, Any, Optional
import pandas as pd import pandas as pd
@@ -30,9 +30,9 @@ class ModelTrainer:
self, self,
model_name: str, model_name: str,
model_type: str = "logistic_regression", model_type: str = "logistic_regression",
features: List[str] = None, features: Optional[List[str]] = None,
model_params: Dict[str, Any] = None, model_params: Optional[Dict[str, Any]] = None,
tags: List[str] = None, tags: Optional[List[str]] = None,
save_artifacts: bool = True, save_artifacts: bool = True,
) -> str: ) -> str:
""" """
@@ -106,7 +106,7 @@ class ModelTrainer:
logging.info(f"Completed training {len(experiment_ids)} models successfully") logging.info(f"Completed training {len(experiment_ids)} models successfully")
return experiment_ids return experiment_ids
def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]: def save_model_artifacts(self, experiment_id: str) -> Dict[str, Optional[str]]:
""" """
Save model artifacts in a structured way for easy loading. Save model artifacts in a structured way for easy loading.
Returns paths to saved artifacts. Returns paths to saved artifacts.
+6 -3
View File
@@ -13,7 +13,7 @@ from ners.research.neural_network_model import NeuralNetworkModel
class BiGRUModel(NeuralNetworkModel): class BiGRUModel(NeuralNetworkModel):
"""Bidirectional GRU model for name classification""" """Bidirectional GRU model for name classification"""
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any: def build_model(self, vocab_size: int, **kwargs) -> Any:
params = kwargs params = kwargs
model = Sequential( model = Sequential(
[ [
@@ -33,7 +33,10 @@ class BiGRUModel(NeuralNetworkModel):
params.get("gru_units", 32), params.get("gru_units", 32),
return_sequences=True, return_sequences=True,
dropout=params.get("dropout", 0.2), dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0), # Use a small non-zero recurrent_dropout by default to
# disable cuDNN path, which has strict right-padding mask
# requirements and can assert when using Bidirectional.
recurrent_dropout=params.get("recurrent_dropout", 0.1),
) )
), ),
# Second GRU summarizes to the last hidden state (no return_sequences), # Second GRU summarizes to the last hidden state (no return_sequences),
@@ -42,7 +45,7 @@ class BiGRUModel(NeuralNetworkModel):
GRU( GRU(
params.get("gru_units", 32), params.get("gru_units", 32),
dropout=params.get("dropout", 0.2), dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0), recurrent_dropout=params.get("recurrent_dropout", 0.1),
) )
), ),
# Small dense head; ReLU + dropout for capacity and regularization. # Small dense head; ReLU + dropout for capacity and regularization.
+1 -1
View File
@@ -21,7 +21,7 @@ from ners.research.neural_network_model import NeuralNetworkModel
class CNNModel(NeuralNetworkModel): class CNNModel(NeuralNetworkModel):
"""1D Convolutional Neural Network for character patterns""" """1D Convolutional Neural Network for character patterns"""
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any: def build_model(self, vocab_size: int, **kwargs) -> Any:
"""Build CNN model with known vocabulary size""" """Build CNN model with known vocabulary size"""
params = kwargs params = kwargs
+5 -3
View File
@@ -13,7 +13,7 @@ from ners.research.neural_network_model import NeuralNetworkModel
class LSTMModel(NeuralNetworkModel): class LSTMModel(NeuralNetworkModel):
"""LSTM model for sequence learning""" """LSTM model for sequence learning"""
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any: def build_model(self, vocab_size: int, **kwargs) -> Any:
params = kwargs params = kwargs
model = Sequential( model = Sequential(
[ [
@@ -30,7 +30,9 @@ class LSTMModel(NeuralNetworkModel):
params.get("lstm_units", 32), params.get("lstm_units", 32),
return_sequences=True, return_sequences=True,
dropout=params.get("dropout", 0.2), dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0), # Default to a small non-zero recurrent_dropout to avoid
# cuDNN mask assertions when masking with Bidirectional.
recurrent_dropout=params.get("recurrent_dropout", 0.1),
) )
), ),
# Second LSTM condenses sequence to a fixed vector for classification. # Second LSTM condenses sequence to a fixed vector for classification.
@@ -38,7 +40,7 @@ class LSTMModel(NeuralNetworkModel):
LSTM( LSTM(
params.get("lstm_units", 32), params.get("lstm_units", 32),
dropout=params.get("dropout", 0.2), dropout=params.get("dropout", 0.2),
recurrent_dropout=params.get("recurrent_dropout", 0.0), recurrent_dropout=params.get("recurrent_dropout", 0.1),
) )
), ),
# Compact dense head with dropout; sufficient capacity for name signals. # Compact dense head with dropout; sufficient capacity for name signals.
@@ -22,7 +22,7 @@ from ners.research.neural_network_model import NeuralNetworkModel
class TransformerModel(NeuralNetworkModel): class TransformerModel(NeuralNetworkModel):
"""Transformer-based model""" """Transformer-based model"""
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any: def build_model(self, vocab_size: int, **kwargs) -> Any:
params = kwargs params = kwargs
# Use a single resolved max_len everywhere to avoid shape mismatches # Use a single resolved max_len everywhere to avoid shape mismatches
max_len = int(params.get("max_len", 6)) max_len = int(params.get("max_len", 6))
+6 -8
View File
@@ -24,7 +24,7 @@ class NeuralNetworkModel(BaseModel):
return "neural_network" return "neural_network"
@abstractmethod @abstractmethod
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any: def build_model(self, vocab_size: int, **kwargs) -> Any:
"""Build neural network model with known vocabulary size""" """Build neural network model with known vocabulary size"""
pass pass
@@ -86,9 +86,7 @@ class NeuralNetworkModel(BaseModel):
logging.info(f"Vocabulary size: {vocab_size}") logging.info(f"Vocabulary size: {vocab_size}")
# Get additional model parameters # Get additional model parameters
self.model = self.build_model_with_vocab( self.model = self.build_model(vocab_size=vocab_size, **self.config.model_params)
vocab_size=vocab_size, **self.config.model_params
)
# Train the neural network # Train the neural network
logging.info( logging.info(
@@ -249,8 +247,8 @@ class NeuralNetworkModel(BaseModel):
max_len = self.config.model_params.get("max_len", 6) max_len = self.config.model_params.get("max_len", 6)
for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)): for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)):
# Create fresh model for each fold using build_model_with_vocab # Create fresh model for each fold using build_model
fold_model = self.build_model_with_vocab( fold_model = self.build_model(
vocab_size=vocab_size, max_len=max_len, **self.config.model_params vocab_size=vocab_size, max_len=max_len, **self.config.model_params
) )
@@ -364,8 +362,8 @@ class NeuralNetworkModel(BaseModel):
val_scores = [] val_scores = []
for seed in range(3): # 3 runs for variance for seed in range(3): # 3 runs for variance
# Build fresh model using build_model_with_vocab # Build fresh model using build_model
model = self.build_model_with_vocab( model = self.build_model(
vocab_size=vocab_size, max_len=max_len, **self.config.model_params vocab_size=vocab_size, max_len=max_len, **self.config.model_params
) )
+2 -6
View File
@@ -5,7 +5,8 @@ import numpy as np
import pandas as pd import pandas as pd
from scipy.spatial.distance import euclidean from scipy.spatial.distance import euclidean
from scipy.stats import entropy from scipy.stats import entropy
from typing import Dict, Any from collections import Counter
from typing import Dict, Any, Literal
LETTERS = "abcdefghijklmnopqrstuvwxyz" LETTERS = "abcdefghijklmnopqrstuvwxyz"
START_TOKEN = "^" START_TOKEN = "^"
@@ -234,11 +235,6 @@ def build_transition_comparisons(
return out return out
import pandas as pd
from collections import Counter
from typing import Literal
def build_ngrams_count( def build_ngrams_count(
df: pd.DataFrame, df: pd.DataFrame,
n: int, n: int,
+13 -4
View File
@@ -30,12 +30,21 @@ def train_from_template(
logging.info(f"Features: {experiment_config.get('features')}") logging.info(f"Features: {experiment_config.get('features')}")
trainer = ModelTrainer(cfg) trainer = ModelTrainer(cfg)
name_val = experiment_config.get("name")
type_val = experiment_config.get("model_type")
features_val = experiment_config.get("features") or ["full_name"]
tags_val = experiment_config.get("tags", [])
if not isinstance(name_val, str) or not isinstance(type_val, str):
raise ValueError("Template must include 'name' and 'model_type' as strings")
if not isinstance(features_val, list):
raise ValueError("Template 'features' must be a list of strings")
trainer.train_single_model( trainer.train_single_model(
model_name=experiment_config.get("name"), model_name=name_val,
model_type=experiment_config.get("model_type"), model_type=type_val,
features=experiment_config.get("features"), features=features_val,
model_params=experiment_config.get("model_params", {}), model_params=experiment_config.get("model_params", {}),
tags=experiment_config.get("tags", []), tags=tags_val if isinstance(tags_val, list) else [],
) )
logging.info("Training completed successfully!") logging.info("Training completed successfully!")
+3 -1
View File
@@ -1 +1,3 @@
from .ner_testing import NERTesting from .ner_testing import NERTesting as NERTesting
__all__ = ["NERTesting"]
+2 -2
View File
@@ -116,7 +116,7 @@ class Predictions:
try: try:
probabilities = model.predict_proba(input_df)[0] probabilities = model.predict_proba(input_df)[0]
return max(probabilities) return max(probabilities)
except: except Exception:
return None return None
def _display_single_prediction_results( def _display_single_prediction_results(
@@ -209,7 +209,7 @@ class Predictions:
try: try:
probabilities = model.predict_proba(df) probabilities = model.predict_proba(df)
df["confidence"] = np.max(probabilities, axis=1) df["confidence"] = np.max(probabilities, axis=1)
except: except Exception:
df["confidence"] = None df["confidence"] = None
st.success("Predictions completed!") st.success("Predictions completed!")
Generated
+60
View File
@@ -710,6 +710,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" },
] ]
[[package]]
name = "iniconfig"
version = "2.1.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload-time = "2025-03-19T20:09:59.721Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" },
]
[[package]] [[package]]
name = "ipykernel" name = "ipykernel"
version = "6.30.1" version = "6.30.1"
@@ -1349,6 +1358,8 @@ dependencies = [
[package.dev-dependencies] [package.dev-dependencies]
dev = [ dev = [
{ name = "ipykernel" }, { name = "ipykernel" },
{ name = "pyright" },
{ name = "pytest" },
{ name = "ruff" }, { name = "ruff" },
] ]
@@ -1379,6 +1390,8 @@ requires-dist = [
[package.metadata.requires-dev] [package.metadata.requires-dev]
dev = [ dev = [
{ name = "ipykernel", specifier = ">=6.30.1" }, { name = "ipykernel", specifier = ">=6.30.1" },
{ name = "pyright", specifier = ">=1.1.406" },
{ name = "pytest", specifier = ">=8.4.2" },
{ name = "ruff", specifier = ">=0.13.3" }, { name = "ruff", specifier = ">=0.13.3" },
] ]
@@ -1400,6 +1413,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec", size = 2034406, upload-time = "2025-05-29T11:35:04.961Z" }, { url = "https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec", size = 2034406, upload-time = "2025-05-29T11:35:04.961Z" },
] ]
[[package]]
name = "nodeenv"
version = "1.9.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" },
]
[[package]] [[package]]
name = "numpy" name = "numpy"
version = "2.3.3" version = "2.3.3"
@@ -1734,6 +1756,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3f/93/023955c26b0ce614342d11cc0652f1e45e32393b6ab9d11a664a60e9b7b7/plotly-6.3.1-py3-none-any.whl", hash = "sha256:8b4420d1dcf2b040f5983eed433f95732ed24930e496d36eb70d211923532e64", size = 9833698, upload-time = "2025-10-02T16:10:22.584Z" }, { url = "https://files.pythonhosted.org/packages/3f/93/023955c26b0ce614342d11cc0652f1e45e32393b6ab9d11a664a60e9b7b7/plotly-6.3.1-py3-none-any.whl", hash = "sha256:8b4420d1dcf2b040f5983eed433f95732ed24930e496d36eb70d211923532e64", size = 9833698, upload-time = "2025-10-02T16:10:22.584Z" },
] ]
[[package]]
name = "pluggy"
version = "1.6.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
]
[[package]] [[package]]
name = "preshed" name = "preshed"
version = "3.0.10" version = "3.0.10"
@@ -2079,6 +2110,35 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/15/73/a7141a1a0559bf1a7aa42a11c879ceb19f02f5c6c371c6d57fd86cefd4d1/pyproj-3.7.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d9d25bae416a24397e0d85739f84d323b55f6511e45a522dd7d7eae70d10c7e4", size = 6391844, upload-time = "2025-08-14T12:05:40.745Z" }, { url = "https://files.pythonhosted.org/packages/15/73/a7141a1a0559bf1a7aa42a11c879ceb19f02f5c6c371c6d57fd86cefd4d1/pyproj-3.7.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d9d25bae416a24397e0d85739f84d323b55f6511e45a522dd7d7eae70d10c7e4", size = 6391844, upload-time = "2025-08-14T12:05:40.745Z" },
] ]
[[package]]
name = "pyright"
version = "1.1.406"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "nodeenv" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/f7/16/6b4fbdd1fef59a0292cbb99f790b44983e390321eccbc5921b4d161da5d1/pyright-1.1.406.tar.gz", hash = "sha256:c4872bc58c9643dac09e8a2e74d472c62036910b3bd37a32813989ef7576ea2c", size = 4113151, upload-time = "2025-10-02T01:04:45.488Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f6/a2/e309afbb459f50507103793aaef85ca4348b66814c86bc73908bdeb66d12/pyright-1.1.406-py3-none-any.whl", hash = "sha256:1d81fb43c2407bf566e97e57abb01c811973fdb21b2df8df59f870f688bdca71", size = 5980982, upload-time = "2025-10-02T01:04:43.137Z" },
]
[[package]]
name = "pytest"
version = "8.4.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" },
{ name = "iniconfig" },
{ name = "packaging" },
{ name = "pluggy" },
{ name = "pygments" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" },
]
[[package]] [[package]]
name = "python-dateutil" name = "python-dateutil"
version = "2.9.0.post0" version = "2.9.0.post0"