diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml new file mode 100644 index 0000000..82079f1 --- /dev/null +++ b/.github/workflows/audit.yml @@ -0,0 +1,35 @@ +name: audit + +on: + push: + branches: + - main + pull_request: + +jobs: + bandit: + name: bandit + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install uv + run: curl -LsSf https://astral.sh/uv/install.sh | sh + + - name: Cache uv dependencies + uses: actions/cache@v4 + with: + path: | + ~/.cache/uv + .venv + key: ${{ runner.os }}-uv-${{ hashFiles('**/uv.lock') }} + restore-keys: | + ${{ runner.os }}-uv- + + - name: Sync dependencies (with dev tools) + run: uv sync --dev + + - name: Run Bandit (security linter) + run: uv run bandit -r . -c pyproject.toml || true diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml new file mode 100644 index 0000000..cf34983 --- /dev/null +++ b/.github/workflows/quality.yml @@ -0,0 +1,40 @@ +name: quality + +on: + push: + branches: + - main + pull_request: + +jobs: + lint: + name: ruff and pyright + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install uv + run: curl -LsSf https://astral.sh/uv/install.sh | sh + + - name: Cache uv dependencies + uses: actions/cache@v4 + with: + path: | + ~/.cache/uv + .venv + key: ${{ runner.os }}-uv-${{ hashFiles('**/uv.lock') }} + restore-keys: | + ${{ runner.os }}-uv- + + - name: Sync dependencies (with dev tools) + run: uv sync --dev + + - name: Run Ruff (lint + format checks) + run: | + uv run ruff check . + uv run ruff format --check . 
+ + - name: Run Pyright (type checks) + run: uv run pyright diff --git a/pyproject.toml b/pyproject.toml index 97b315e..42a4346 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,5 +37,21 @@ build-backend = "uv_build" [dependency-groups] dev = [ "ipykernel>=6.30.1", + "pyright>=1.1.406", + "pytest>=8.4.2", "ruff>=0.13.3", ] + +[tool.pyright] +pythonVersion = "3.11" +typeCheckingMode = "basic" +reportMissingImports = "none" +reportMissingModuleSource = "none" +useLibraryCodeForTypes = true +include = ["src"] + +[tool.ruff] +# Keep defaults and additionally ignore notebooks +extend-exclude = [ + "**/*.ipynb", +] diff --git a/src/ners/cli.py b/src/ners/cli.py index 8d717ad..182a993 100644 --- a/src/ners/cli.py +++ b/src/ners/cli.py @@ -118,12 +118,31 @@ def research_train( exp_cfg = exp_builder.find_template(tmpl, name, type) trainer = ModelTrainer(cfg) + # Validate and coerce template fields to expected types for type safety + model_name = exp_cfg.get("name") + model_type = exp_cfg.get("model_type") + features = exp_cfg.get("features") + tags = exp_cfg.get("tags", []) + + if not isinstance(model_name, str) or not isinstance(model_type, str): + raise typer.BadParameter( + "Template must include 'name' and 'model_type' as strings" + ) + + if features is None: + features = ["full_name"] + elif not isinstance(features, list): + raise typer.BadParameter("Template 'features' must be a list of strings") + + if not isinstance(tags, list): + tags = [] + trainer.train_single_model( - model_name=exp_cfg.get("name"), - model_type=exp_cfg.get("model_type"), - features=exp_cfg.get("features"), + model_name=model_name, + model_type=model_type, + features=features, model_params=exp_cfg.get("model_params", {}), - tags=exp_cfg.get("tags", []), + tags=tags, ) diff --git a/src/ners/core/config/__init__.py b/src/ners/core/config/__init__.py index ce852da..9b89cc3 100644 --- a/src/ners/core/config/__init__.py +++ b/src/ners/core/config/__init__.py @@ -16,13 +16,13 @@ def get_config() 
-> PipelineConfig: def load_config(config_path: Optional[Union[str, Path]] = None) -> PipelineConfig: """Load configuration from specified path""" - if config_path: - return config_manager.load_config(Path(config_path)) + if config_path is not None: + return config_manager.load_config(config_path) return config_manager.get_config() def setup_config( - config_path: Optional[Path] = None, env: str = "development" + config_path: Optional[Union[str, Path]] = None, env: str = "development" ) -> PipelineConfig: """ Unified configuration loading and logging setup for all entrypoint scripts. @@ -37,6 +37,8 @@ def setup_config( # Determine config path if config_path is None: config_path = Path("config") / f"pipeline.{env}.yaml" + else: + config_path = Path(config_path) # Load configuration config = ConfigManager(config_path).load_config() diff --git a/src/ners/core/config/config_manager.py b/src/ners/core/config/config_manager.py index 8e6c650..1363b7d 100644 --- a/src/ners/core/config/config_manager.py +++ b/src/ners/core/config/config_manager.py @@ -13,7 +13,9 @@ class ConfigManager: """Centralized configuration management""" def __init__(self, config_path: Optional[Union[str, Path]] = None): - self.config_path = config_path or self._find_config_file() + self.config_path: Path = ( + Path(config_path) if config_path is not None else self._find_config_file() + ) self._config: Optional[PipelineConfig] = None self._setup_default_paths() @@ -47,10 +49,12 @@ class ConfigManager: checkpoints_dir=root_dir / "data" / "checkpoints", ) - def load_config(self, config_path: Optional[Path] = None) -> PipelineConfig: + def load_config( + self, config_path: Optional[Union[str, Path]] = None + ) -> PipelineConfig: """Load configuration from file""" - if config_path: - self.config_path = config_path + if config_path is not None: + self.config_path = Path(config_path) if not self.config_path.exists(): logging.warning( @@ -80,9 +84,11 @@ class ConfigManager: """Create default 
configuration""" return PipelineConfig(paths=self.default_paths) - def save_config(self, config: PipelineConfig, path: Optional[Path] = None): + def save_config( + self, config: PipelineConfig, path: Optional[Union[str, Path]] = None + ): """Save configuration to file""" - save_path = path or self.config_path + save_path = Path(path) if path is not None else self.config_path save_path.parent.mkdir(parents=True, exist_ok=True) config_dict = config.model_dump() @@ -142,8 +148,8 @@ class ConfigManager: env_config = self.load_config(env_config_path) # Merge configurations - base_dict = base_config.dict() - env_dict = env_config.dict() + base_dict = base_config.model_dump() + env_dict = env_config.model_dump() self._deep_update(base_dict, env_dict) return PipelineConfig(**base_dict) diff --git a/src/ners/processing/ner/name_tagger.py b/src/ners/processing/ner/name_tagger.py index 7ab7280..f699a61 100644 --- a/src/ners/processing/ner/name_tagger.py +++ b/src/ners/processing/ner/name_tagger.py @@ -260,9 +260,9 @@ class NameTagger: # Remove overlaps filtered, last_end = [], -1 - for s, e, l in valid: + for s, e, label in valid: if s >= last_end: - filtered.append((s, e, l)) + filtered.append((s, e, label)) last_end = e return filtered diff --git a/src/ners/processing/steps/__init__.py b/src/ners/processing/steps/__init__.py index 8873645..4f2872e 100644 --- a/src/ners/processing/steps/__init__.py +++ b/src/ners/processing/steps/__init__.py @@ -19,7 +19,7 @@ class PipelineState: processed_batches: int = 0 total_batches: int = 0 - failed_batches: List[int] = None + failed_batches: Optional[List[int]] = None last_checkpoint: Optional[str] = None def __post_init__(self): diff --git a/src/ners/processing/steps/data_selection_step.py b/src/ners/processing/steps/data_selection_step.py index 8ae202d..30c3a56 100644 --- a/src/ners/processing/steps/data_selection_step.py +++ b/src/ners/processing/steps/data_selection_step.py @@ -21,7 +21,7 @@ class DataSelectionStep(PipelineStep): 
if "region" in batch.columns and "year" in batch.columns: target_years = {2015, 2021, 2022} mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin( - target_years + list(target_years) ) removed = int(mask_remove.sum()) if removed: diff --git a/src/ners/processing/steps/feature_extraction_step.py b/src/ners/processing/steps/feature_extraction_step.py index 30192ed..41112a3 100644 --- a/src/ners/processing/steps/feature_extraction_step.py +++ b/src/ners/processing/steps/feature_extraction_step.py @@ -29,8 +29,8 @@ class FeatureExtractionStep(PipelineStep): self.region_mapper = RegionMapper() self.name_tagger = NameTagger() - @classmethod - def requires_batch_mutation(cls) -> bool: + @property + def requires_batch_mutation(self) -> bool: """This step creates new columns, so mutation is required""" return True diff --git a/src/ners/research/base_model.py b/src/ners/research/base_model.py index 35cb599..90a0cfa 100644 --- a/src/ners/research/base_model.py +++ b/src/ners/research/base_model.py @@ -1,6 +1,6 @@ import logging from abc import ABC, abstractmethod -from typing import Dict, Any, Optional, List +from typing import Dict, Any, Optional, List, TYPE_CHECKING, Union import joblib import matplotlib.pyplot as plt @@ -9,19 +9,23 @@ import pandas as pd from ners.research.experiment import ExperimentConfig +if TYPE_CHECKING: + from ners.research.experiment.feature_extractor import FeatureExtractor + from sklearn.preprocessing import LabelEncoder + class BaseModel(ABC): """Abstract base class for all models""" def __init__(self, config: ExperimentConfig): self.config = config - self.model = None - self.feature_extractor = None - self.label_encoder = None - self.tokenizer = None # For neural models - self.is_fitted = False - self.training_history = {} # Store training history for learning curves - self.learning_curve_data = {} # Store learning curve experiment data + self.model: Any | None = None + self.feature_extractor: "FeatureExtractor | None" = 
None + self.label_encoder: "LabelEncoder | None" = None + self.tokenizer: Any | None = None # For neural models + self.is_fitted: bool = False + self.training_history: Dict[str, Any] = {} # For learning curves + self.learning_curve_data: Dict[str, Any] = {} @property @abstractmethod @@ -48,7 +52,7 @@ class BaseModel(ABC): @abstractmethod def generate_learning_curve( - self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None + self, X: pd.DataFrame, y: pd.Series, train_sizes: Optional[List[float]] = None ) -> Dict[str, Any]: """Generate learning curve data for the model""" pass @@ -58,10 +62,17 @@ class BaseModel(ABC): if not self.is_fitted: raise ValueError("Model must be fitted before making predictions") + if ( + self.feature_extractor is None + or self.model is None + or self.label_encoder is None + ): + raise ValueError("Model is not fully initialized for prediction") + features_df = self.feature_extractor.extract_features(X) X_prepared = self.prepare_features(features_df) - predictions = self.model.predict(X_prepared) + predictions: Union[np.ndarray, Any] = self.model.predict(X_prepared) # Handle different prediction formats if hasattr(predictions, "shape") and len(predictions.shape) > 1: @@ -75,6 +86,9 @@ class BaseModel(ABC): if not self.is_fitted: raise ValueError("Model must be fitted before making predictions") + if self.feature_extractor is None or self.model is None: + raise ValueError("Model is not fully initialized for prediction") + features_df = self.feature_extractor.extract_features(X) X_prepared = self.prepare_features(features_df) @@ -83,7 +97,11 @@ class BaseModel(ABC): elif hasattr(self.model, "predict"): # For neural networks that return probabilities directly probabilities = self.model.predict(X_prepared) - if len(probabilities.shape) == 2 and probabilities.shape[1] > 1: + if ( + hasattr(probabilities, "shape") + and len(probabilities.shape) == 2 + and probabilities.shape[1] > 1 + ): return probabilities raise NotImplementedError("Model does 
not support probability predictions") @@ -91,30 +109,29 @@ class BaseModel(ABC): def get_feature_importance(self) -> Optional[Dict[str, float]]: """Get feature importance if supported by the model""" - if hasattr(self.model, "feature_importances_"): + model = self.model + if model is None: + return None + + if hasattr(model, "feature_importances_"): # For tree-based models - importances = self.model.feature_importances_ + importances = model.feature_importances_ feature_names = self._get_feature_names() return dict(zip(feature_names, importances)) - elif hasattr(self.model, "coef_"): + elif hasattr(model, "coef_"): # For linear models - coefficients = np.abs(self.model.coef_[0]) + coefficients = np.abs(model.coef_[0]) feature_names = self._get_feature_names() return dict(zip(feature_names, coefficients)) - elif ( - hasattr(self.model, "named_steps") - and "classifier" in self.model.named_steps - ): + elif hasattr(model, "named_steps") and "classifier" in model.named_steps: # For sklearn pipelines (like LogisticRegression with vectorizer) - classifier = self.model.named_steps["classifier"] + classifier = model.named_steps["classifier"] if hasattr(classifier, "coef_"): coefficients = np.abs(classifier.coef_[0]) - if hasattr( - self.model.named_steps["vectorizer"], "get_feature_names_out" - ): - feature_names = self.model.named_steps[ + if hasattr(model.named_steps["vectorizer"], "get_feature_names_out"): + feature_names = model.named_steps[ "vectorizer" ].get_feature_names_out() # Take top features to avoid too many n-grams @@ -127,8 +144,9 @@ class BaseModel(ABC): def _get_feature_names(self) -> List[str]: """Get feature names (override in subclasses if needed)""" - if hasattr(self.model, "feature_names_in_"): - return list(self.model.feature_names_in_) + model = self.model + if model is not None and hasattr(model, "feature_names_in_"): + return list(model.feature_names_in_) return [f"feature_{i}" for i in range(100)] # Default fallback def save(self, path: str): 
diff --git a/src/ners/research/experiment/__init__.py b/src/ners/research/experiment/__init__.py index 2cbd36d..a8491f8 100644 --- a/src/ners/research/experiment/__init__.py +++ b/src/ners/research/experiment/__init__.py @@ -70,7 +70,7 @@ class ExperimentStatus(Enum): def calculate_metrics( - y_true: np.ndarray, y_pred: np.ndarray, metrics: List[str] = None + y_true: np.ndarray, y_pred: np.ndarray, metrics: Optional[List[str]] = None ) -> Dict[str, float]: """Calculate specified metrics""" diff --git a/src/ners/research/experiment/experiment_builder.py b/src/ners/research/experiment/experiment_builder.py index e9efe0c..0dfbc9a 100644 --- a/src/ners/research/experiment/experiment_builder.py +++ b/src/ners/research/experiment/experiment_builder.py @@ -99,14 +99,24 @@ class ExperimentBuilder: logging.warning(f"Unknown feature type: {feature_str}") continue + name = ( + template_config.get("name") + or template_config.get("model_type") + or "experiment" + ) + model_type = template_config.get("model_type") or "logistic_regression" + description = template_config.get("description") or "" + return ExperimentConfig( - name=template_config.get("name"), - description=template_config.get("description"), - model_type=template_config.get("model_type"), + name=str(name), + description=str(description), + model_type=str(model_type), features=features, model_params=template_config.get("model_params", {}), tags=template_config.get("tags", []), - test_size=template_config.get("test_size", 0.2), - cross_validation_folds=template_config.get("cross_validation_folds", 5), + test_size=float(template_config.get("test_size", 0.2)), + cross_validation_folds=int( + template_config.get("cross_validation_folds", 5) + ), train_data_filter=template_config.get("train_data_filter"), ) diff --git a/src/ners/research/experiment/feature_extractor.py b/src/ners/research/experiment/feature_extractor.py index cff71e2..4f21af3 100644 --- a/src/ners/research/experiment/feature_extractor.py +++ 
b/src/ners/research/experiment/feature_extractor.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import List, Dict, Any, Union +from typing import List, Dict, Any, Union, Optional import pandas as pd @@ -25,7 +25,9 @@ class FeatureExtractor: """Extract different types of features from name data""" def __init__( - self, feature_types: List[FeatureType], feature_params: Dict[str, Any] = None + self, + feature_types: List[FeatureType], + feature_params: Optional[Dict[str, Any]] = None, ): self.feature_types = feature_types self.feature_params = feature_params or {} diff --git a/src/ners/research/model_trainer.py b/src/ners/research/model_trainer.py index ad77c61..88517af 100644 --- a/src/ners/research/model_trainer.py +++ b/src/ners/research/model_trainer.py @@ -1,7 +1,7 @@ import json import logging from datetime import datetime -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional import pandas as pd @@ -30,9 +30,9 @@ class ModelTrainer: self, model_name: str, model_type: str = "logistic_regression", - features: List[str] = None, - model_params: Dict[str, Any] = None, - tags: List[str] = None, + features: Optional[List[str]] = None, + model_params: Optional[Dict[str, Any]] = None, + tags: Optional[List[str]] = None, save_artifacts: bool = True, ) -> str: """ @@ -106,7 +106,7 @@ class ModelTrainer: logging.info(f"Completed training {len(experiment_ids)} models successfully") return experiment_ids - def save_model_artifacts(self, experiment_id: str) -> Dict[str, str]: + def save_model_artifacts(self, experiment_id: str) -> Dict[str, Optional[str]]: """ Save model artifacts in a structured way for easy loading. Returns paths to saved artifacts. 
diff --git a/src/ners/research/models/bigru_model.py b/src/ners/research/models/bigru_model.py index b6c1bbf..d85abfc 100644 --- a/src/ners/research/models/bigru_model.py +++ b/src/ners/research/models/bigru_model.py @@ -13,7 +13,7 @@ from ners.research.neural_network_model import NeuralNetworkModel class BiGRUModel(NeuralNetworkModel): """Bidirectional GRU model for name classification""" - def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any: + def build_model(self, vocab_size: int, **kwargs) -> Any: params = kwargs model = Sequential( [ @@ -33,7 +33,10 @@ class BiGRUModel(NeuralNetworkModel): params.get("gru_units", 32), return_sequences=True, dropout=params.get("dropout", 0.2), - recurrent_dropout=params.get("recurrent_dropout", 0.0), + # Use a small non-zero recurrent_dropout by default to + # disable cuDNN path, which has strict right-padding mask + # requirements and can assert when using Bidirectional. + recurrent_dropout=params.get("recurrent_dropout", 0.1), ) ), # Second GRU summarizes to the last hidden state (no return_sequences), @@ -42,7 +45,7 @@ class BiGRUModel(NeuralNetworkModel): GRU( params.get("gru_units", 32), dropout=params.get("dropout", 0.2), - recurrent_dropout=params.get("recurrent_dropout", 0.0), + recurrent_dropout=params.get("recurrent_dropout", 0.1), ) ), # Small dense head; ReLU + dropout for capacity and regularization. 
diff --git a/src/ners/research/models/cnn_model.py b/src/ners/research/models/cnn_model.py index 011011d..a152669 100644 --- a/src/ners/research/models/cnn_model.py +++ b/src/ners/research/models/cnn_model.py @@ -21,7 +21,7 @@ from ners.research.neural_network_model import NeuralNetworkModel class CNNModel(NeuralNetworkModel): """1D Convolutional Neural Network for character patterns""" - def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any: + def build_model(self, vocab_size: int, **kwargs) -> Any: """Build CNN model with known vocabulary size""" params = kwargs diff --git a/src/ners/research/models/lstm_model.py b/src/ners/research/models/lstm_model.py index 78b878f..6b83713 100644 --- a/src/ners/research/models/lstm_model.py +++ b/src/ners/research/models/lstm_model.py @@ -13,7 +13,7 @@ from ners.research.neural_network_model import NeuralNetworkModel class LSTMModel(NeuralNetworkModel): """LSTM model for sequence learning""" - def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any: + def build_model(self, vocab_size: int, **kwargs) -> Any: params = kwargs model = Sequential( [ @@ -30,7 +30,9 @@ class LSTMModel(NeuralNetworkModel): params.get("lstm_units", 32), return_sequences=True, dropout=params.get("dropout", 0.2), - recurrent_dropout=params.get("recurrent_dropout", 0.0), + # Default to a small non-zero recurrent_dropout to avoid + # cuDNN mask assertions when masking with Bidirectional. + recurrent_dropout=params.get("recurrent_dropout", 0.1), ) ), # Second LSTM condenses sequence to a fixed vector for classification. @@ -38,7 +40,7 @@ class LSTMModel(NeuralNetworkModel): LSTM( params.get("lstm_units", 32), dropout=params.get("dropout", 0.2), - recurrent_dropout=params.get("recurrent_dropout", 0.0), + recurrent_dropout=params.get("recurrent_dropout", 0.1), ) ), # Compact dense head with dropout; sufficient capacity for name signals. 
diff --git a/src/ners/research/models/transformer_model.py b/src/ners/research/models/transformer_model.py index 2a581d7..f7ff23b 100644 --- a/src/ners/research/models/transformer_model.py +++ b/src/ners/research/models/transformer_model.py @@ -22,7 +22,7 @@ from ners.research.neural_network_model import NeuralNetworkModel class TransformerModel(NeuralNetworkModel): """Transformer-based model""" - def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any: + def build_model(self, vocab_size: int, **kwargs) -> Any: params = kwargs # Use a single resolved max_len everywhere to avoid shape mismatches max_len = int(params.get("max_len", 6)) diff --git a/src/ners/research/neural_network_model.py b/src/ners/research/neural_network_model.py index db68419..37c276f 100644 --- a/src/ners/research/neural_network_model.py +++ b/src/ners/research/neural_network_model.py @@ -24,7 +24,7 @@ class NeuralNetworkModel(BaseModel): return "neural_network" @abstractmethod - def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any: + def build_model(self, vocab_size: int, **kwargs) -> Any: """Build neural network model with known vocabulary size""" pass @@ -86,9 +86,7 @@ class NeuralNetworkModel(BaseModel): logging.info(f"Vocabulary size: {vocab_size}") # Get additional model parameters - self.model = self.build_model_with_vocab( - vocab_size=vocab_size, **self.config.model_params - ) + self.model = self.build_model(vocab_size=vocab_size, **self.config.model_params) # Train the neural network logging.info( @@ -249,8 +247,8 @@ class NeuralNetworkModel(BaseModel): max_len = self.config.model_params.get("max_len", 6) for fold, (train_idx, val_idx) in enumerate(cv.split(X_prepared, y_encoded)): - # Create fresh model for each fold using build_model_with_vocab - fold_model = self.build_model_with_vocab( + # Create fresh model for each fold using build_model + fold_model = self.build_model( vocab_size=vocab_size, max_len=max_len, **self.config.model_params ) @@ -364,8 +362,8 
@@ class NeuralNetworkModel(BaseModel): val_scores = [] for seed in range(3): # 3 runs for variance - # Build fresh model using build_model_with_vocab - model = self.build_model_with_vocab( + # Build fresh model using build_model + model = self.build_model( vocab_size=vocab_size, max_len=max_len, **self.config.model_params ) diff --git a/src/ners/research/statistics/utils.py b/src/ners/research/statistics/utils.py index b27c135..10c670b 100644 --- a/src/ners/research/statistics/utils.py +++ b/src/ners/research/statistics/utils.py @@ -5,7 +5,8 @@ import numpy as np import pandas as pd from scipy.spatial.distance import euclidean from scipy.stats import entropy -from typing import Dict, Any +from collections import Counter +from typing import Dict, Any, Literal LETTERS = "abcdefghijklmnopqrstuvwxyz" START_TOKEN = "^" @@ -234,11 +235,6 @@ def build_transition_comparisons( return out -import pandas as pd -from collections import Counter -from typing import Literal - - def build_ngrams_count( df: pd.DataFrame, n: int, diff --git a/src/ners/train.py b/src/ners/train.py index eee5c18..b13dce5 100755 --- a/src/ners/train.py +++ b/src/ners/train.py @@ -30,12 +30,21 @@ def train_from_template( logging.info(f"Features: {experiment_config.get('features')}") trainer = ModelTrainer(cfg) + name_val = experiment_config.get("name") + type_val = experiment_config.get("model_type") + features_val = experiment_config.get("features") or ["full_name"] + tags_val = experiment_config.get("tags", []) + if not isinstance(name_val, str) or not isinstance(type_val, str): + raise ValueError("Template must include 'name' and 'model_type' as strings") + if not isinstance(features_val, list): + raise ValueError("Template 'features' must be a list of strings") + trainer.train_single_model( - model_name=experiment_config.get("name"), - model_type=experiment_config.get("model_type"), - features=experiment_config.get("features"), + model_name=name_val, + model_type=type_val, + features=features_val, 
model_params=experiment_config.get("model_params", {}), - tags=experiment_config.get("tags", []), + tags=tags_val if isinstance(tags_val, list) else [], ) logging.info("Training completed successfully!") diff --git a/src/ners/web/interfaces/__init__.py b/src/ners/web/interfaces/__init__.py index 515d2d0..33fbabd 100644 --- a/src/ners/web/interfaces/__init__.py +++ b/src/ners/web/interfaces/__init__.py @@ -1 +1,3 @@ -from .ner_testing import NERTesting +from .ner_testing import NERTesting as NERTesting + +__all__ = ["NERTesting"] diff --git a/src/ners/web/interfaces/predictions.py b/src/ners/web/interfaces/predictions.py index 685f59d..5cded2b 100644 --- a/src/ners/web/interfaces/predictions.py +++ b/src/ners/web/interfaces/predictions.py @@ -116,7 +116,7 @@ class Predictions: try: probabilities = model.predict_proba(input_df)[0] return max(probabilities) - except: + except Exception: return None def _display_single_prediction_results( @@ -209,7 +209,7 @@ class Predictions: try: probabilities = model.predict_proba(df) df["confidence"] = np.max(probabilities, axis=1) - except: + except Exception: df["confidence"] = None st.success("Predictions completed!") diff --git a/uv.lock b/uv.lock index af00728..6ed060e 100644 --- a/uv.lock +++ b/uv.lock @@ -710,6 +710,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, ] +[[package]] +name = "iniconfig" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload-time = "2025-03-19T20:09:59.721Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, +] + [[package]] name = "ipykernel" version = "6.30.1" @@ -1349,6 +1358,8 @@ dependencies = [ [package.dev-dependencies] dev = [ { name = "ipykernel" }, + { name = "pyright" }, + { name = "pytest" }, { name = "ruff" }, ] @@ -1379,6 +1390,8 @@ requires-dist = [ [package.metadata.requires-dev] dev = [ { name = "ipykernel", specifier = ">=6.30.1" }, + { name = "pyright", specifier = ">=1.1.406" }, + { name = "pytest", specifier = ">=8.4.2" }, { name = "ruff", specifier = ">=0.13.3" }, ] @@ -1400,6 +1413,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec", size = 2034406, upload-time = "2025-05-29T11:35:04.961Z" }, ] +[[package]] +name = "nodeenv" +version = "1.9.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, +] + [[package]] name = "numpy" version = "2.3.3" @@ -1734,6 +1756,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/3f/93/023955c26b0ce614342d11cc0652f1e45e32393b6ab9d11a664a60e9b7b7/plotly-6.3.1-py3-none-any.whl", hash = "sha256:8b4420d1dcf2b040f5983eed433f95732ed24930e496d36eb70d211923532e64", size = 9833698, upload-time = "2025-10-02T16:10:22.584Z" }, ] +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + [[package]] name = "preshed" version = "3.0.10" @@ -2079,6 +2110,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/15/73/a7141a1a0559bf1a7aa42a11c879ceb19f02f5c6c371c6d57fd86cefd4d1/pyproj-3.7.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d9d25bae416a24397e0d85739f84d323b55f6511e45a522dd7d7eae70d10c7e4", size = 6391844, upload-time = "2025-08-14T12:05:40.745Z" }, ] +[[package]] +name = "pyright" +version = "1.1.406" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nodeenv" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f7/16/6b4fbdd1fef59a0292cbb99f790b44983e390321eccbc5921b4d161da5d1/pyright-1.1.406.tar.gz", hash = "sha256:c4872bc58c9643dac09e8a2e74d472c62036910b3bd37a32813989ef7576ea2c", size = 4113151, upload-time = "2025-10-02T01:04:45.488Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/a2/e309afbb459f50507103793aaef85ca4348b66814c86bc73908bdeb66d12/pyright-1.1.406-py3-none-any.whl", 
hash = "sha256:1d81fb43c2407bf566e97e57abb01c811973fdb21b2df8df59f870f688bdca71", size = 5980982, upload-time = "2025-10-02T01:04:43.137Z" }, +] + +[[package]] +name = "pytest" +version = "8.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0"