refactoring: add initial pipeline configuration and model classes

2025-08-04 16:12:25 +02:00
parent 19c66fd0ee
commit f4689faf80
82 changed files with 7176 additions and 1218 deletions
@@ -0,0 +1,45 @@
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.pipeline import Pipeline
+from sklearn.svm import SVC
+
+from research.traditional_model import TraditionalModel
+
+
+class SVMModel(TraditionalModel):
+    """Support Vector Machine with character n-grams and RBF kernel"""
+
+    def build_model(self) -> BaseEstimator:
+        params = self.config.model_params
+        vectorizer = TfidfVectorizer(
+            analyzer="char",
+            ngram_range=params.get("ngram_range", (2, 4)),
+            max_features=params.get("max_features", 5000),
+        )
+
+        classifier = SVC(
+            kernel=params.get("kernel", "rbf"),
+            C=params.get("C", 1.0),
+            gamma=params.get("gamma", "scale"),
+            probability=True,  # Enable probability prediction
+            random_state=self.config.random_seed,
+        )
+
+        return Pipeline([("vectorizer", vectorizer), ("classifier", classifier)])
+
+    def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
+        text_features = []
+
+        for feature_type in self.config.features:
+            if feature_type.value in X.columns:
+                text_features.append(X[feature_type.value].astype(str))
+
+        if len(text_features) == 1:
+            return text_features[0].values
+        else:
+            combined = text_features[0].astype(str)
+            for feature in text_features[1:]:
+                combined = combined + " " + feature.astype(str)
+            return combined.values