fix: models

This commit is contained in:
2025-10-05 21:54:25 +02:00
parent 9dd4f759b3
commit 137dea7fe5
15 changed files with 376 additions and 197 deletions
-1
View File
@@ -204,7 +204,6 @@ def web_run(
config: Optional[Path] = typer.Option(None, help="Path to configuration file"),
env: str = typer.Option("development", help="Environment name"),
) -> None:
"""Launch the Streamlit web app via subprocess."""
app_path = Path(__file__).parent / "web" / "app.py"
cmd = [
sys.executable,
-1
View File
@@ -4,7 +4,6 @@ from typing import Optional, Union
from ners.core.utils import ensure_directories
from ners.core.config.config_manager import ConfigManager
from ners.core.config.logging_config import LoggingConfig
from ners.core.config.pipeline_config import PipelineConfig
config_manager = ConfigManager()
+29 -14
View File
@@ -16,6 +16,7 @@ class LightGBMModel(TraditionalModel):
# Store vectorizers and encoders to ensure consistent feature space
self.vectorizers = {}
self.label_encoders = {}
self.feature_columns = []
def build_model(self) -> BaseEstimator:
params = self.config.model_params
@@ -38,14 +39,16 @@ class LightGBMModel(TraditionalModel):
random_state=self.config.random_seed,
objective=params.get("objective", "binary"),
n_jobs=params.get("n_jobs", -1),
verbose=2,
verbose=params.get("verbose", -1),
device=device,
gpu_platform_id=gpu_platform_id,
gpu_device_id=gpu_device_id,
force_row_wise=params.get("force_row_wise", True),
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
def prepare_features(self, X: pd.DataFrame) -> pd.DataFrame | np.ndarray:
features = []
columns: list[str] = []
for feature_type in self.config.features:
if feature_type.value in X.columns:
@@ -53,7 +56,9 @@ class LightGBMModel(TraditionalModel):
if feature_type.value in ["name_length", "word_count"]:
# Numerical features
features.append(column.fillna(0).values.reshape(-1, 1))
arr = column.fillna(0).values.reshape(-1, 1)
features.append(arr)
columns.append(feature_type.value)
elif feature_type.value in ["full_name", "native_name", "surname"]:
# Character-level features for names
feature_key = f"vectorizer_{feature_type.value}"
@@ -63,20 +68,24 @@ class LightGBMModel(TraditionalModel):
self.vectorizers[feature_key] = CountVectorizer(
analyzer="char", ngram_range=(2, 3), max_features=50
)
char_features = (
self.vectorizers[feature_key]
.fit_transform(column.fillna("").astype(str))
.toarray()
)
vec = self.vectorizers[feature_key]
char_features = vec.fit_transform(
column.fillna("").astype(str)
).toarray()
vocab_names = list(vec.get_feature_names_out())
else:
# Subsequent times - use existing vectorizer
char_features = (
self.vectorizers[feature_key]
.transform(column.fillna("").astype(str))
.toarray()
)
vec = self.vectorizers[feature_key]
char_features = vec.transform(
column.fillna("").astype(str)
).toarray()
vocab_names = list(vec.get_feature_names_out())
features.append(char_features)
# Prefix with feature name to avoid collisions
columns.extend(
[f"char_{feature_type.value}_{n}" for n in vocab_names]
)
else:
# Categorical features
feature_key = f"encoder_{feature_type.value}"
@@ -111,5 +120,11 @@ class LightGBMModel(TraditionalModel):
)
features.append(encoded.reshape(-1, 1))
columns.append(f"cat_{feature_type.value}")
if not features:
return pd.DataFrame(index=X.index)
return np.hstack(features) if features else np.array([]).reshape(len(X), 0)
matrix = np.hstack(features)
# Persist column order for consistency
self.feature_columns = columns
return pd.DataFrame(matrix, index=X.index, columns=columns)
@@ -1,3 +1,4 @@
import logging
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
@@ -13,22 +14,38 @@ class LogisticRegressionModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Character n-grams are strong signals for names; (2,5) balances
# Character n-grams are strong signals for names; (2,4) balances
# capturing prefixes/suffixes with tractable feature size.
# Ensure tuple for sklearn API (YAML lists -> tuple)
ngram_range = params.get("ngram_range", (2, 4))
if isinstance(ngram_range, list):
ngram_range = tuple(ngram_range)
vectorizer = CountVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 5)),
ngram_range=ngram_range,
max_features=params.get("max_features", 10000),
)
# liblinear handles sparse, small-to-medium problems well; n_jobs parallelizes
# OvR across classes (no effect for binary). class_weight can mitigate imbalance.
# Choose solver and threads. liblinear ignores n_jobs>1 in recent sklearn
# versions, which raises a warning; clamp to 1 to avoid noise.
solver = params.get("solver", "liblinear")
n_jobs = params.get("n_jobs", -1)
if solver == "liblinear" and (n_jobs is None or n_jobs != 1):
if isinstance(n_jobs, int) and n_jobs != 1:
logging.info(
"LogisticRegression(liblinear): forcing n_jobs=1 to avoid sklearn warning"
)
n_jobs = 1
# liblinear handles sparse, small-to-medium problems well; class_weight can
# mitigate imbalance. For very large, consider solver='saga'.
classifier = LogisticRegression(
max_iter=params.get("max_iter", 1000),
random_state=self.config.random_seed,
verbose=2,
solver=params.get("solver", "liblinear"),
n_jobs=params.get("n_jobs", -1),
solver=solver,
n_jobs=n_jobs,
class_weight=params.get("class_weight", None),
)
@@ -15,9 +15,14 @@ class NaiveBayesModel(TraditionalModel):
params = self.config.model_params
# Bag-of-character-ngrams aligns with Multinomial NB assumptions; the default
# (2,4) range targets suffix/prefix cues (configure (1,4) to also include unigrams).
# Ensure tuple for sklearn API (YAML lists -> tuple)
ngram_range = params.get("ngram_range", (2, 4))
if isinstance(ngram_range, list):
ngram_range = tuple(ngram_range)
vectorizer = CountVectorizer(
analyzer="char",
ngram_range=params.get("ngram_range", (2, 5)),
ngram_range=ngram_range,
max_features=params.get("max_features", 8000),
)
-52
View File
@@ -1,52 +0,0 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from ners.research.traditional_model import TraditionalModel
class SVMModel(TraditionalModel):
    """Support Vector Machine with character n-grams and RBF kernel.

    The estimator is a scikit-learn ``Pipeline`` that TF-IDF-vectorizes raw
    text and feeds it to an ``SVC``. ``prepare_features`` therefore returns
    text, not a numeric matrix.
    """

    def build_model(self) -> BaseEstimator:
        """Build the TF-IDF + SVC pipeline from ``self.config.model_params``.

        Optional keys read from the params mapping: ``ngram_range``,
        ``max_features``, ``kernel``, ``C``, ``gamma``, ``class_weight``,
        ``cache_size`` and ``verbose``.

        Returns:
            An unfitted ``Pipeline`` with steps ``"vectorizer"`` and
            ``"classifier"``.
        """
        params = self.config.model_params

        # Ensure tuple for sklearn API (YAML lists -> tuple); recent sklearn
        # parameter validation rejects a list for ngram_range.
        ngram_range = params.get("ngram_range", (2, 4))
        if isinstance(ngram_range, list):
            ngram_range = tuple(ngram_range)

        # TF-IDF downweights very common patterns; char n-grams (2,4) are effective
        # for distinguishing name morphology under RBF kernels.
        vectorizer = TfidfVectorizer(
            analyzer="char",
            ngram_range=ngram_range,
            max_features=params.get("max_features", 5000),
        )

        # RBF kernel captures non-linear interactions between n-grams; probability=True
        # adds calibration at some cost. Larger cache helps speed kernel computations.
        classifier = SVC(
            kernel=params.get("kernel", "rbf"),
            C=params.get("C", 1.0),
            gamma=params.get("gamma", "scale"),
            probability=True,  # Enable probability prediction
            class_weight=params.get("class_weight", None),
            cache_size=params.get("cache_size", 1000),
            random_state=self.config.random_seed,
            # Configurable so callers can silence the solver; the default keeps
            # the previous hard-coded value.
            verbose=params.get("verbose", 2),
        )

        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Join the configured feature columns into one text value per row.

        Each entry of ``self.config.features`` whose ``.value`` names a column
        of ``X`` is cast to ``str``; the selected columns are concatenated
        row-wise, separated by a single space, in configuration order.

        Args:
            X: Input frame holding the feature columns.

        Returns:
            The ``.values`` array of the combined string Series, one element
            per row of ``X``.

        Raises:
            ValueError: If none of the configured features is a column of
                ``X`` (previously this surfaced as a bare ``IndexError``).
        """
        text_columns = [
            X[feature_type.value].astype(str)
            for feature_type in self.config.features
            if feature_type.value in X.columns
        ]

        if not text_columns:
            configured = [feature_type.value for feature_type in self.config.features]
            raise ValueError(
                "None of the configured features are present in the input "
                f"columns; configured={configured}, available={list(X.columns)}"
            )

        combined = text_columns[0]
        for column in text_columns[1:]:
            combined = combined + " " + column

        return combined.values
@@ -24,20 +24,21 @@ class TransformerModel(NeuralNetworkModel):
def build_model_with_vocab(self, vocab_size: int, **kwargs) -> Any:
params = kwargs
# Use a single resolved max_len everywhere to avoid shape mismatches
max_len = int(params.get("max_len", 6))
# Build Transformer model
inputs = Input(shape=(params.get("max_len", 8),))
inputs = Input(shape=(max_len,))
x = Embedding(
input_dim=vocab_size,
output_dim=params.get("embedding_dim", 64),
input_length=params.get("max_len", 8),
mask_zero=True,
)(inputs)
# Add positional encoding
positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
positions = tf.range(start=0, limit=max_len, delta=1)
pos_embedding = Embedding(
input_dim=params.get("max_len", 8),
input_dim=max_len,
output_dim=params.get("embedding_dim", 64),
)(positions)
x = x + pos_embedding
@@ -85,6 +86,6 @@ class TransformerModel(NeuralNetworkModel):
# Convert to sequences
sequences = self.tokenizer.texts_to_sequences(text_data)
max_len = self.config.model_params.get("max_len", 6)
max_len = int(self.config.model_params.get("max_len", 6))
return pad_sequences(sequences, maxlen=max_len, padding="post")
+4 -6
View File
@@ -20,13 +20,12 @@ class XGBoostModel(TraditionalModel):
def build_model(self) -> BaseEstimator:
params = self.config.model_params
# Optional GPU acceleration
# Optional GPU acceleration. With modern XGBoost, setting tree_method is
# sufficient and you typically don't need to pass `predictor`; doing so can
# trigger "Parameters ... are not used" warnings with the sklearn API.
use_gpu = bool(params.get("use_gpu", False))
default_tree_method = "gpu_hist" if use_gpu else "hist"
tree_method = params.get("tree_method", default_tree_method)
predictor = params.get(
"predictor", "gpu_predictor" if tree_method.startswith("gpu") else "auto"
)
# Histogram-based trees and parallelism provide fast training; default
# logloss metric suits binary classification of gender.
@@ -40,8 +39,7 @@ class XGBoostModel(TraditionalModel):
eval_metric="logloss",
n_jobs=params.get("n_jobs", -1),
tree_method=tree_method,
predictor=predictor,
verbosity=2,
verbosity=params.get("verbosity", 0),
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
+13 -9
View File
@@ -61,18 +61,22 @@ class TraditionalModel(BaseModel):
f"Fitting model with {X_prepared.shape[0]} samples and {X_prepared.shape[1]} features"
)
logging.info(X_prepared[0])
try:
# Log a small sample safely for arrays or DataFrames
if hasattr(X_prepared, "iloc"):
logging.info(X_prepared.iloc[0].to_dict())
else:
logging.info(X_prepared[0])
except Exception:
pass
logging.info(f"Model parameters: {self.config.model_params}")
history = self.model.fit(X_prepared, y_encoded)
# Fit scikit-learn compatible model. Unlike Keras, sklearn's fit returns
# the estimator itself and does not provide a training history object.
# We therefore do not populate training_history here.
self.model.fit(X_prepared, y_encoded)
self.is_fitted = True
self.training_history = {
"accuracy": history.history["accuracy"],
"loss": history.history["loss"],
"val_accuracy": history.history.get("val_accuracy", []),
"val_loss": history.history.get("val_loss", []),
}
self.training_history = {}
return self