fix: models
@@ -204,7 +204,6 @@ def web_run(
    config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
    env: str = typer.Option("development", help="Environment name"),
) -> None:
    """Launch the Streamlit web app via subprocess."""
    app_path = Path(__file__).parent / "web" / "app.py"
    cmd = [
        sys.executable,

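The hunk only shows the start of the command list. A minimal sketch of how such a launcher is typically completed; the `-m streamlit run` arguments and the `subprocess.run` call are assumptions for illustration, not part of this diff:

import subprocess
import sys
from pathlib import Path

app_path = Path(__file__).parent / "web" / "app.py"
# Hypothetical completion: launch Streamlit with the current interpreter so the
# app runs inside the same environment as the CLI.
cmd = [sys.executable, "-m", "streamlit", "run", str(app_path)]
subprocess.run(cmd, check=True)
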
@@ -4,7 +4,6 @@ from typing import Optional, Union

from ners.core.utils import ensure_directories
from ners.core.config.config_manager import ConfigManager
from ners.core.config.logging_config import LoggingConfig
from ners.core.config.pipeline_config import PipelineConfig

config_manager = ConfigManager()

@@ -16,6 +16,7 @@ class LightGBMModel(TraditionalModel):
        # Store vectorizers and encoders to ensure consistent feature space
        self.vectorizers = {}
        self.label_encoders = {}
        self.feature_columns = []

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params

@@ -38,14 +39,16 @@ class LightGBMModel(TraditionalModel):
            random_state=self.config.random_seed,
            objective=params.get("objective", "binary"),
            n_jobs=params.get("n_jobs", -1),
            verbose=2,
            verbose=params.get("verbose", -1),
            device=device,
            gpu_platform_id=gpu_platform_id,
            gpu_device_id=gpu_device_id,
            force_row_wise=params.get("force_row_wise", True),
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
    def prepare_features(self, X: pd.DataFrame) -> pd.DataFrame | np.ndarray:
        features = []
        columns: list[str] = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:

@@ -53,7 +56,9 @@ class LightGBMModel(TraditionalModel):

                if feature_type.value in ["name_length", "word_count"]:
                    # Numerical features
                    features.append(column.fillna(0).values.reshape(-1, 1))
                    arr = column.fillna(0).values.reshape(-1, 1)
                    features.append(arr)
                    columns.append(feature_type.value)
                elif feature_type.value in ["full_name", "native_name", "surname"]:
                    # Character-level features for names
                    feature_key = f"vectorizer_{feature_type.value}"

@@ -63,20 +68,24 @@ class LightGBMModel(TraditionalModel):
                        self.vectorizers[feature_key] = CountVectorizer(
                            analyzer="char", ngram_range=(2, 3), max_features=50
                        )
                        char_features = (
                            self.vectorizers[feature_key]
                            .fit_transform(column.fillna("").astype(str))
                            .toarray()
                        )
                        vec = self.vectorizers[feature_key]
                        char_features = vec.fit_transform(
                            column.fillna("").astype(str)
                        ).toarray()
                        vocab_names = list(vec.get_feature_names_out())
                    else:
                        # Subsequent times - use existing vectorizer
                        char_features = (
                            self.vectorizers[feature_key]
                            .transform(column.fillna("").astype(str))
                            .toarray()
                        )
                        vec = self.vectorizers[feature_key]
                        char_features = vec.transform(
                            column.fillna("").astype(str)
                        ).toarray()
                        vocab_names = list(vec.get_feature_names_out())

                    features.append(char_features)
                    # Prefix with feature name to avoid collisions
                    columns.extend(
                        [f"char_{feature_type.value}_{n}" for n in vocab_names]
                    )
                else:
                    # Categorical features
                    feature_key = f"encoder_{feature_type.value}"

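The key pattern in this hunk: fit the character vectorizer only on the first pass and reuse it afterwards, so training and prediction share the same columns. A standalone sketch of that pattern with toy data (the key and names are illustrative only):

from sklearn.feature_extraction.text import CountVectorizer

vectorizers = {}

def char_block(key, texts):
    # First call fits the vectorizer; later calls only transform, so the
    # feature space (and column order) stays identical across calls.
    if key not in vectorizers:
        vectorizers[key] = CountVectorizer(analyzer="char", ngram_range=(2, 3), max_features=50)
        matrix = vectorizers[key].fit_transform(texts)
    else:
        matrix = vectorizers[key].transform(texts)
    names = list(vectorizers[key].get_feature_names_out())
    return matrix.toarray(), names

train_block, cols = char_block("vectorizer_full_name", ["anna ivanova", "ivan petrov"])
test_block, _ = char_block("vectorizer_full_name", ["maria sidorova"])
assert test_block.shape[1] == len(cols)
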
@@ -111,5 +120,11 @@ class LightGBMModel(TraditionalModel):
                    )

                    features.append(encoded.reshape(-1, 1))
                    columns.append(f"cat_{feature_type.value}")
        if not features:
            return pd.DataFrame(index=X.index)

        return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
        matrix = np.hstack(features)
        # Persist column order for consistency
        self.feature_columns = columns
        return pd.DataFrame(matrix, index=X.index, columns=columns)

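Why return a named DataFrame rather than a bare ndarray: LightGBM then reports feature importances against readable column names, and downstream code can rely on self.feature_columns for ordering. A toy illustration of the hstack-then-label step (shapes and column names are made up):

import numpy as np
import pandas as pd

char_block = np.array([[1, 0], [0, 2]])  # e.g. char n-gram counts
num_block = np.array([[5], [7]])         # e.g. name_length
columns = ["char_full_name_an", "char_full_name_nn", "name_length"]

features_df = pd.DataFrame(np.hstack([char_block, num_block]), columns=columns)
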
@@ -1,3 +1,4 @@
import logging
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator

@@ -13,22 +14,38 @@ class LogisticRegressionModel(TraditionalModel):

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params
        # Character n-grams are strong signals for names; (2,5) balances
        # Character n-grams are strong signals for names; (2,4) balances
        # capturing prefixes/suffixes with tractable feature size.
        # Ensure tuple for sklearn API (YAML lists -> tuple)
        ngram_range = params.get("ngram_range", (2, 4))
        if isinstance(ngram_range, list):
            ngram_range = tuple(ngram_range)

        vectorizer = CountVectorizer(
            analyzer="char",
            ngram_range=params.get("ngram_range", (2, 5)),
            ngram_range=ngram_range,
            max_features=params.get("max_features", 10000),
        )

        # liblinear handles sparse, small-to-medium problems well; n_jobs parallelizes
        # OvR across classes (no effect for binary). class_weight can mitigate imbalance.
        # Choose solver and threads. liblinear ignores n_jobs>1 in recent sklearn
        # versions, which raises a warning; clamp to 1 to avoid noise.
        solver = params.get("solver", "liblinear")
        n_jobs = params.get("n_jobs", -1)
        if solver == "liblinear" and (n_jobs is None or n_jobs != 1):
            if isinstance(n_jobs, int) and n_jobs != 1:
                logging.info(
                    "LogisticRegression(liblinear): forcing n_jobs=1 to avoid sklearn warning"
                )
            n_jobs = 1

        # liblinear handles sparse, small-to-medium problems well; class_weight can
        # mitigate imbalance. For very large, consider solver='saga'.
        classifier = LogisticRegression(
            max_iter=params.get("max_iter", 1000),
            random_state=self.config.random_seed,
            verbose=2,
            solver=params.get("solver", "liblinear"),
            n_jobs=params.get("n_jobs", -1),
            solver=solver,
            n_jobs=n_jobs,
            class_weight=params.get("class_weight", None),
        )

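A compact restatement of the two normalizations introduced above, runnable on its own (the params dict stands in for values loaded from YAML config):

params = {"ngram_range": [2, 4], "solver": "liblinear", "n_jobs": -1}

# YAML parses ngram_range as a list; sklearn's CountVectorizer expects a tuple.
ngram_range = params.get("ngram_range", (2, 4))
if isinstance(ngram_range, list):
    ngram_range = tuple(ngram_range)

# liblinear is effectively single-threaded, and recent sklearn warns when
# n_jobs != 1 is combined with it, so clamp the value.
solver = params.get("solver", "liblinear")
n_jobs = params.get("n_jobs", -1)
if solver == "liblinear" and n_jobs != 1:
    n_jobs = 1
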
@@ -15,9 +15,14 @@ class NaiveBayesModel(TraditionalModel):
        params = self.config.model_params
        # Bag-of-character-ngrams aligns with Multinomial NB assumptions; (1,4)
        # includes unigrams for coverage and higher n for suffix/prefix cues.
        # Ensure tuple for sklearn API (YAML lists -> tuple)
        ngram_range = params.get("ngram_range", (2, 4))
        if isinstance(ngram_range, list):
            ngram_range = tuple(ngram_range)

        vectorizer = CountVectorizer(
            analyzer="char",
            ngram_range=params.get("ngram_range", (2, 5)),
            ngram_range=ngram_range,
            max_features=params.get("max_features", 8000),
        )

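For context, the vectorizer above feeds a multinomial naive Bayes classifier (per the class name and comment). A tiny end-to-end usage sketch with made-up names and labels:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("vectorizer", CountVectorizer(analyzer="char", ngram_range=(2, 4), max_features=8000)),
    ("classifier", MultinomialNB()),
])
pipe.fit(["anna", "maria", "ivan", "boris"], [0, 0, 1, 1])
print(pipe.predict(["daria"]))
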
@@ -1,52 +0,0 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from ners.research.traditional_model import TraditionalModel


class SVMModel(TraditionalModel):
    """Support Vector Machine with character n-grams and RBF kernel"""

    def build_model(self) -> BaseEstimator:
        params = self.config.model_params
        # TF-IDF downweights very common patterns; char n-grams (2,4) are effective
        # for distinguishing name morphology under RBF kernels.
        vectorizer = TfidfVectorizer(
            analyzer="char",
            ngram_range=params.get("ngram_range", (2, 4)),
            max_features=params.get("max_features", 5000),
        )

        # RBF kernel captures non-linear interactions between n-grams; probability=True
        # adds calibration at some cost. Larger cache helps speed kernel computations.
        classifier = SVC(
            kernel=params.get("kernel", "rbf"),
            C=params.get("C", 1.0),
            gamma=params.get("gamma", "scale"),
            probability=True,  # Enable probability prediction
            class_weight=params.get("class_weight", None),
            cache_size=params.get("cache_size", 1000),
            random_state=self.config.random_seed,
            verbose=2,
        )

        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        text_features = []

        for feature_type in self.config.features:
            if feature_type.value in X.columns:
                text_features.append(X[feature_type.value].astype(str))

        if len(text_features) == 1:
            return text_features[0].values
        else:
            combined = text_features[0].astype(str)
            for feature in text_features[1:]:
                combined = combined + " " + feature.astype(str)
            return combined.values

@@ -24,20 +24,21 @@ class TransformerModel(NeuralNetworkModel):

    def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
        params = kwargs
        # Use a single resolved max_len everywhere to avoid shape mismatches
        max_len = int(params.get("max_len", 6))

        # Build Transformer model
        inputs = Input(shape=(params.get("max_len", 8),))
        inputs = Input(shape=(max_len,))
        x = Embedding(
            input_dim=vocab_size,
            output_dim=params.get("embedding_dim", 64),
            input_length=params.get("max_len", 8),
            mask_zero=True,
        )(inputs)

        # Add positional encoding
        positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
        positions = tf.range(start=0, limit=max_len, delta=1)
        pos_embedding = Embedding(
            input_dim=params.get("max_len", 8),
            input_dim=max_len,
            output_dim=params.get("embedding_dim", 64),
        )(positions)
        x = x + pos_embedding

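For readers unfamiliar with the pattern: token embeddings and a learned positional embedding of the same width are simply added element-wise. A minimal sketch mirroring the lines above (dimensions are illustrative; exact behaviour may vary across TF/Keras versions):

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input

max_len, vocab_size, embedding_dim = 6, 1000, 64

inputs = Input(shape=(max_len,))
tokens = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True)(inputs)

# One embedding vector per position 0..max_len-1, broadcast over the batch.
positions = tf.range(start=0, limit=max_len, delta=1)
pos_embedding = Embedding(input_dim=max_len, output_dim=embedding_dim)(positions)

x = tokens + pos_embedding  # shape: (batch, max_len, embedding_dim)
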
@@ -85,6 +86,6 @@ class TransformerModel(NeuralNetworkModel):

        # Convert to sequences
        sequences = self.tokenizer.texts_to_sequences(text_data)
        max_len = self.config.model_params.get("max_len", 6)
        max_len = int(self.config.model_params.get("max_len", 6))

        return pad_sequences(sequences, maxlen=max_len, padding="post")

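A quick reference for the padding call above, with toy sequences:

from tensorflow.keras.preprocessing.sequence import pad_sequences

# padding="post" appends zeros after each sequence up to maxlen.
print(pad_sequences([[5, 3], [7, 1, 2, 9]], maxlen=6, padding="post"))
# [[5 3 0 0 0 0]
#  [7 1 2 9 0 0]]
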
@@ -20,13 +20,12 @@ class XGBoostModel(TraditionalModel):
    def build_model(self) -> BaseEstimator:
        params = self.config.model_params

        # Optional GPU acceleration
        # Optional GPU acceleration. With modern XGBoost, setting tree_method is
        # sufficient and you typically don't need to pass `predictor`; doing so can
        # trigger "Parameters ... are not used" warnings with the sklearn API.
        use_gpu = bool(params.get("use_gpu", False))
        default_tree_method = "gpu_hist" if use_gpu else "hist"
        tree_method = params.get("tree_method", default_tree_method)
        predictor = params.get(
            "predictor", "gpu_predictor" if tree_method.startswith("gpu") else "auto"
        )

        # Histogram-based trees and parallelism provide fast training; default
        # logloss metric suits binary classification of gender.

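A standalone sketch of the parameter resolution described in the comment above (the params dict is an example; note that XGBoost 2.x prefers tree_method="hist" together with device="cuda" over the older "gpu_hist" spelling):

params = {"use_gpu": True}

use_gpu = bool(params.get("use_gpu", False))
default_tree_method = "gpu_hist" if use_gpu else "hist"
tree_method = params.get("tree_method", default_tree_method)
# No `predictor` argument: modern XGBoost picks an appropriate predictor itself,
# and passing it through the sklearn wrapper can emit "not used" warnings.
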
@@ -40,8 +39,7 @@ class XGBoostModel(TraditionalModel):
            eval_metric="logloss",
            n_jobs=params.get("n_jobs", -1),
            tree_method=tree_method,
            predictor=predictor,
            verbosity=2,
            verbosity=params.get("verbosity", 0),
        )

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:

@@ -61,18 +61,22 @@ class TraditionalModel(BaseModel):
            f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
        )

        logging.info(X_prepared[0])
        try:
            # Log a small sample safely for arrays or DataFrames
            if hasattr(X_prepared, "iloc"):
                logging.info(X_prepared.iloc[0].to_dict())
            else:
                logging.info(X_prepared[0])
        except Exception:
            pass
        logging.info(f"Model parameters: {self.config.model_params}")

        history = self.model.fit(X_prepared, y_encoded)
        # Fit scikit-learn compatible model. Unlike Keras, sklearn's fit returns
        # the estimator itself and does not provide a training history object.
        # We therefore do not populate training_history here.
        self.model.fit(X_prepared, y_encoded)
        self.is_fitted = True

        self.training_history = {
            "accuracy": history.history["accuracy"],
            "loss": history.history["loss"],
            "val_accuracy": history.history.get("val_accuracy", []),
            "val_loss": history.history.get("val_loss", []),
        }
        self.training_history = {}

        return self

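The reason training_history is left empty: scikit-learn estimators return the fitted estimator from fit rather than a Keras-style History object. A small illustration with toy data:

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
result = clf.fit([[0.0], [1.0], [2.0], [3.0]], [0, 0, 1, 1])
print(result is clf)  # True: fit returns the estimator itself, with no .history attribute
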