Files
drc-ners-nlp/research/models/ensemble_model.py
T

98 lines
3.5 KiB
Python

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from research.experiment import ExperimentConfig
from research.traditional_model import TraditionalModel
class EnsembleModel(TraditionalModel):
    """Voting ensemble of character n-gram text classifiers.

    Every base model is a scikit-learn ``Pipeline`` made of a character-level
    vectorizer followed by a classifier; the pipelines are combined with a
    ``VotingClassifier``.

    Keys read from ``config.model_params``:
        base_models: list of base model types to include, any of
            ``"logistic_regression"``, ``"random_forest"``, ``"naive_bayes"``
            (default: all three, in that order).
        voting: ``"soft"`` (default) or ``"hard"``.
    """

    # Base model types understood by build_model, in default order.
    _SUPPORTED_BASE_MODELS = ("logistic_regression", "random_forest", "naive_bayes")

    @property
    def architecture(self) -> str:
        """Return the architecture type"""
        return "ensemble"

    def __init__(self, config: ExperimentConfig) -> None:
        """Store the experiment configuration via the parent initialiser."""
        super().__init__(config)
        # Initialised empty; nothing in this class fills or reads these two
        # attributes. Kept so external code that inspects them still works.
        self.base_models = []
        self.model_weights = None

    @staticmethod
    def _text_pipeline(vectorizer: BaseEstimator, classifier: BaseEstimator) -> Pipeline:
        """Chain a vectorizer and a classifier under the shared step names."""
        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])

    def build_model(self) -> BaseEstimator:
        """Build the (unfitted) voting ensemble described by ``config.model_params``.

        Returns:
            A ``VotingClassifier`` holding one pipeline per requested base
            model, in the order they were requested.

        Raises:
            ValueError: If ``base_models`` contains an unsupported model type,
                or is empty.
        """
        params = self.config.model_params
        seed = self.config.random_seed
        base_model_types = params.get("base_models", list(self._SUPPORTED_BASE_MODELS))

        # Create base models with simplified configs
        estimators = []
        for model_type in base_model_types:
            if model_type == "logistic_regression":
                model = self._text_pipeline(
                    CountVectorizer(analyzer="char", ngram_range=(2, 4), max_features=5000),
                    LogisticRegression(max_iter=1000, random_state=seed),
                )
                estimators.append(("logistic_regression", model))
            elif model_type == "random_forest":
                model = self._text_pipeline(
                    TfidfVectorizer(analyzer="char", ngram_range=(2, 3), max_features=3000),
                    RandomForestClassifier(n_estimators=50, random_state=seed),
                )
                estimators.append(("rf", model))
            elif model_type == "naive_bayes":
                model = self._text_pipeline(
                    CountVectorizer(analyzer="char", ngram_range=(1, 3), max_features=4000),
                    MultinomialNB(),
                )
                estimators.append(("nb", model))
            else:
                # A misspelled entry used to be skipped silently, which shrank
                # the ensemble without any indication; fail loudly instead.
                raise ValueError(
                    f"Unsupported base model type {model_type!r}; "
                    f"expected one of {list(self._SUPPORTED_BASE_MODELS)}"
                )

        if not estimators:
            # VotingClassifier would only reject this later, at fit time.
            raise ValueError("'base_models' must name at least one base model")

        voting_type = params.get("voting", "soft")  # 'hard' or 'soft'
        return VotingClassifier(estimators=estimators, voting=voting_type)

    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
        """Turn the configured feature columns into one text string per row.

        For every entry of ``config.features`` whose ``.value`` is a column of
        ``X`` (in configuration order), the column is cast to ``str``; the
        selected columns are then joined row-wise with a single space.

        Args:
            X: Frame containing (some of) the configured feature columns.

        Returns:
            A one-dimensional array with one combined string per row of ``X``.

        Raises:
            ValueError: If none of the configured features is a column of ``X``.
        """
        columns = [
            feature_type.value
            for feature_type in self.config.features
            if feature_type.value in X.columns
        ]
        if not columns:
            requested = [feature_type.value for feature_type in self.config.features]
            raise ValueError(
                f"None of the configured features {requested} is a column of X "
                f"(available columns: {list(X.columns)})"
            )

        combined = X[columns[0]].astype(str)
        for column in columns[1:]:
            combined = combined + " " + X[column].astype(str)
        # to_numpy() always yields an ndarray, matching the annotated return type.
        return combined.to_numpy()