refactoring: add initial pipeline configuration and model classes
This commit is contained in:
@@ -0,0 +1,39 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
||||
from research.traditional_model import TraditionalModel
|
||||
|
||||
|
||||
class NaiveBayesModel(TraditionalModel):
    """Multinomial Naive Bayes classifier over character n-gram counts.

    Hyper-parameters are read from ``self.config.model_params`` and the
    input columns to use are taken from ``self.config.features``.
    """

    def build_model(self) -> BaseEstimator:
        """Build the vectorizer + classifier pipeline.

        Optional ``model_params`` keys and their defaults:
        ``ngram_range`` -> ``(1, 4)``, ``max_features`` -> ``8000``,
        ``alpha`` -> ``1.0``.

        Returns:
            A two-step ``Pipeline``: a character-level ``CountVectorizer``
            named ``"vectorizer"`` followed by a ``MultinomialNB`` named
            ``"classifier"``.
        """
        params = self.config.model_params

        vectorizer = CountVectorizer(
            analyzer="char",
            # Coerce to a tuple in case the value arrives as a list (e.g.
            # from a JSON/YAML config); scikit-learn's parameter
            # validation only accepts a tuple here.
            ngram_range=tuple(params.get("ngram_range", (1, 4))),
            max_features=params.get("max_features", 8000),
        )
        classifier = MultinomialNB(alpha=params.get("alpha", 1.0))

        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Merge the configured text columns of ``X`` into one string per row.

        Each entry of ``self.config.features`` whose ``.value`` names a
        column of ``X`` is cast to ``str``; the selected columns are then
        concatenated row-wise, separated by a single space, in the order
        they appear in ``self.config.features``. Configured features that
        are not columns of ``X`` are skipped.

        Args:
            X: Frame holding the raw feature columns.

        Returns:
            The underlying array (``Series.values``) of the combined
            strings, one element per row of ``X``.

        Raises:
            ValueError: If none of the configured features is a column of
                ``X`` (previously this surfaced as an ``IndexError``).
        """
        text_features = [
            X[feature_type.value].astype(str)
            for feature_type in self.config.features
            if feature_type.value in X.columns
        ]

        if not text_features:
            configured = [feature_type.value for feature_type in self.config.features]
            raise ValueError(
                f"None of the configured features {configured} are present "
                f"in the input columns {list(X.columns)}"
            )

        # Every series is already str-typed, so no further casting is needed;
        # with a single feature the loop body never runs.
        combined = text_features[0]
        for feature in text_features[1:]:
            combined = combined + " " + feature
        return combined.values
|
||||
Reference in New Issue
Block a user