feat: add osm data

This commit is contained in:
2025-09-21 16:23:44 +02:00
parent 63e23d6600
commit c1b502c878
39 changed files with 955 additions and 338 deletions
+2 -2
View File
@@ -41,14 +41,14 @@ class BaseModel(ABC):
@abstractmethod
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> Dict[str, float] | dict[str, np.floating[Any]]:
"""Perform cross-validation and return average scores"""
pass
@abstractmethod
def generate_learning_curve(
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
pass
+10 -8
View File
@@ -55,11 +55,11 @@ class ExperimentBuilder:
# Check if this is the experiment we're looking for
# Look for experiments that match the model type or contain the name
if (
experiment.get("model_type") == name
or name.lower() in experiment.get("name", "").lower()
or experiment.get("name") == name
or f"baseline_{name}" == experiment.get("name")
or f"advanced_{name}" == experiment.get("name")
experiment.get("model_type") == name
or name.lower() in experiment.get("name", "").lower()
or experiment.get("name") == name
or f"baseline_{name}" == experiment.get("name")
or f"advanced_{name}" == experiment.get("name")
):
return experiment
@@ -72,7 +72,9 @@ class ExperimentBuilder:
f"Available experiments: {available_experiments}"
)
def get_templates(self, templates_path: str = "research_templates.yaml") -> Dict[str, List[Dict]]:
def get_templates(
self, templates_path: str = "research_templates.yaml"
) -> Dict[str, List[Dict]]:
"""Get all available experiments from templates organized by type"""
templates = self.load_templates(templates_path)
@@ -80,7 +82,7 @@ class ExperimentBuilder:
"baseline": templates.get("baseline_experiments", []),
"advanced": templates.get("advanced_experiments", []),
"feature_study": templates.get("feature_studies", []),
"tuning": templates.get("hyperparameter_tuning", [])
"tuning": templates.get("hyperparameter_tuning", []),
}
@classmethod
@@ -104,5 +106,5 @@ class ExperimentBuilder:
tags=template_config.get("tags", []),
test_size=template_config.get("test_size", 0.2),
cross_validation_folds=template_config.get("cross_validation_folds", 5),
train_data_filter=template_config.get("train_data_filter")
train_data_filter=template_config.get("train_data_filter"),
)
+7 -7
View File
@@ -158,12 +158,12 @@ class ExperimentRunner:
@classmethod
def _create_prediction_examples(
cls,
X_test: pd.DataFrame,
y_test: pd.Series,
predictions: np.ndarray,
model: BaseModel,
n_examples: int = 10,
cls,
X_test: pd.DataFrame,
y_test: pd.Series,
predictions: np.ndarray,
model: BaseModel,
n_examples: int = 10,
) -> List[Dict]:
"""Create prediction examples for analysis"""
examples = []
@@ -237,7 +237,7 @@ class ExperimentRunner:
return None
def compare_experiments(
self, experiment_ids: List[str], metric: str = "accuracy"
self, experiment_ids: List[str], metric: str = "accuracy"
) -> pd.DataFrame:
"""Compare experiments and return analysis"""
comparison_df = self.tracker.compare_experiments(experiment_ids)
+7 -7
View File
@@ -77,10 +77,10 @@ class ExperimentTracker:
return self._results.get(experiment_id)
def list_experiments(
self,
status: Optional[ExperimentStatus] = None,
tags: Optional[List[str]] = None,
model_type: Optional[str] = None,
self,
status: Optional[ExperimentStatus] = None,
tags: Optional[List[str]] = None,
model_type: Optional[str] = None,
) -> List[ExperimentResult]:
"""List experiments with optional filtering"""
results = list(self._results.values())
@@ -97,7 +97,7 @@ class ExperimentTracker:
return sorted(results, key=lambda x: x.start_time, reverse=True)
def get_best_experiment(
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
) -> Optional[ExperimentResult]:
"""Get the best experiment based on a metric"""
experiments = self.list_experiments()
@@ -159,8 +159,8 @@ class ExperimentTracker:
"""Export all results to CSV"""
if output_path is None:
output_path = (
self.experiments_dir
/ f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
self.experiments_dir
/ f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
)
rows = []
+1 -1
View File
@@ -43,7 +43,7 @@ class FeatureExtractor:
return features_df
def _extract_single_feature(
self, df: pd.DataFrame, feature_type: FeatureType
self, df: pd.DataFrame, feature_type: FeatureType
) -> Union[pd.Series, pd.DataFrame]:
"""Extract a single type of feature"""
if feature_type == FeatureType.FULL_NAME:
+8 -8
View File
@@ -27,13 +27,13 @@ class ModelTrainer:
self.models_dir.mkdir(parents=True, exist_ok=True)
def train_single_model(
self,
model_name: str,
model_type: str = "logistic_regression",
features: List[str] = None,
model_params: Dict[str, Any] = None,
tags: List[str] = None,
save_artifacts: bool = True,
self,
model_name: str,
model_type: str = "logistic_regression",
features: List[str] = None,
model_params: Dict[str, Any] = None,
tags: List[str] = None,
save_artifacts: bool = True,
) -> str:
"""
Train a single model and save its artifacts.
@@ -75,7 +75,7 @@ class ModelTrainer:
return experiment_id
def train_multiple_models(
self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
) -> List[str]:
"""
Train multiple models with different configurations.
+3 -1
View File
@@ -82,7 +82,9 @@ class EnsembleModel(TraditionalModel):
# Soft voting averages probabilities (preferred when members are calibrated);
# hard voting uses majority class. n_jobs parallelizes fitting of the member estimators.
voting_type = params.get("voting", "soft") # 'hard' or 'soft'
return VotingClassifier(estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1))
return VotingClassifier(
estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1)
)
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
text_features = []
+3 -1
View File
@@ -55,7 +55,9 @@ class RandomForestModel(TraditionalModel):
encoder = self.label_encoders[feature_key]
column_clean = column.fillna("unknown").astype(str)
known_classes = set(encoder.classes_)
default_class = "unknown" if "unknown" in known_classes else encoder.classes_[0]
default_class = (
"unknown" if "unknown" in known_classes else encoder.classes_[0]
)
column_mapped = column_clean.apply(
lambda value: value if value in known_classes else default_class
)
+3 -3
View File
@@ -36,9 +36,9 @@ class TransformerModel(NeuralNetworkModel):
# Add positional encoding
positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
pos_embedding = Embedding(input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64))(
positions
)
pos_embedding = Embedding(
input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64)
)(positions)
x = x + pos_embedding
x = self._transformer_encoder(x, params)
+5 -3
View File
@@ -84,7 +84,9 @@ class NeuralNetworkModel(BaseModel):
def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
"""Combine configured textual features into one string per record."""
column_names = [feature.value for feature in self.config.features if feature.value in X.columns]
column_names = [
feature.value for feature in self.config.features if feature.value in X.columns
]
if not column_names:
raise ValueError("No configured text features found in the provided DataFrame.")
@@ -101,7 +103,7 @@ class NeuralNetworkModel(BaseModel):
return combined_rows
def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> dict[str, np.floating[Any]]:
features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df)
@@ -158,7 +160,7 @@ class NeuralNetworkModel(BaseModel):
}
def generate_learning_curve(
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}")
+1 -1
View File
@@ -103,7 +103,7 @@ class TraditionalModel(BaseModel):
return results
def generate_learning_curve(
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
) -> Dict[str, Any]:
"""Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}")