feat: add osm data
This commit is contained in:
@@ -41,14 +41,14 @@ class BaseModel(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def cross_validate(
|
||||
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
|
||||
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
|
||||
) -> Dict[str, float] | dict[str, np.floating[Any]]:
|
||||
"""Perform cross-validation and return average scores"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def generate_learning_curve(
|
||||
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
|
||||
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate learning curve data for the model"""
|
||||
pass
|
||||
|
||||
@@ -55,11 +55,11 @@ class ExperimentBuilder:
|
||||
# Check if this is the experiment we're looking for
|
||||
# Look for experiments that match the model type or contain the name
|
||||
if (
|
||||
experiment.get("model_type") == name
|
||||
or name.lower() in experiment.get("name", "").lower()
|
||||
or experiment.get("name") == name
|
||||
or f"baseline_{name}" == experiment.get("name")
|
||||
or f"advanced_{name}" == experiment.get("name")
|
||||
experiment.get("model_type") == name
|
||||
or name.lower() in experiment.get("name", "").lower()
|
||||
or experiment.get("name") == name
|
||||
or f"baseline_{name}" == experiment.get("name")
|
||||
or f"advanced_{name}" == experiment.get("name")
|
||||
):
|
||||
return experiment
|
||||
|
||||
@@ -72,7 +72,9 @@ class ExperimentBuilder:
|
||||
f"Available experiments: {available_experiments}"
|
||||
)
|
||||
|
||||
def get_templates(self, templates_path: str = "research_templates.yaml") -> Dict[str, List[Dict]]:
|
||||
def get_templates(
|
||||
self, templates_path: str = "research_templates.yaml"
|
||||
) -> Dict[str, List[Dict]]:
|
||||
"""Get all available experiments from templates organized by type"""
|
||||
templates = self.load_templates(templates_path)
|
||||
|
||||
@@ -80,7 +82,7 @@ class ExperimentBuilder:
|
||||
"baseline": templates.get("baseline_experiments", []),
|
||||
"advanced": templates.get("advanced_experiments", []),
|
||||
"feature_study": templates.get("feature_studies", []),
|
||||
"tuning": templates.get("hyperparameter_tuning", [])
|
||||
"tuning": templates.get("hyperparameter_tuning", []),
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -104,5 +106,5 @@ class ExperimentBuilder:
|
||||
tags=template_config.get("tags", []),
|
||||
test_size=template_config.get("test_size", 0.2),
|
||||
cross_validation_folds=template_config.get("cross_validation_folds", 5),
|
||||
train_data_filter=template_config.get("train_data_filter")
|
||||
train_data_filter=template_config.get("train_data_filter"),
|
||||
)
|
||||
|
||||
@@ -158,12 +158,12 @@ class ExperimentRunner:
|
||||
|
||||
@classmethod
|
||||
def _create_prediction_examples(
|
||||
cls,
|
||||
X_test: pd.DataFrame,
|
||||
y_test: pd.Series,
|
||||
predictions: np.ndarray,
|
||||
model: BaseModel,
|
||||
n_examples: int = 10,
|
||||
cls,
|
||||
X_test: pd.DataFrame,
|
||||
y_test: pd.Series,
|
||||
predictions: np.ndarray,
|
||||
model: BaseModel,
|
||||
n_examples: int = 10,
|
||||
) -> List[Dict]:
|
||||
"""Create prediction examples for analysis"""
|
||||
examples = []
|
||||
@@ -237,7 +237,7 @@ class ExperimentRunner:
|
||||
return None
|
||||
|
||||
def compare_experiments(
|
||||
self, experiment_ids: List[str], metric: str = "accuracy"
|
||||
self, experiment_ids: List[str], metric: str = "accuracy"
|
||||
) -> pd.DataFrame:
|
||||
"""Compare experiments and return analysis"""
|
||||
comparison_df = self.tracker.compare_experiments(experiment_ids)
|
||||
|
||||
@@ -77,10 +77,10 @@ class ExperimentTracker:
|
||||
return self._results.get(experiment_id)
|
||||
|
||||
def list_experiments(
|
||||
self,
|
||||
status: Optional[ExperimentStatus] = None,
|
||||
tags: Optional[List[str]] = None,
|
||||
model_type: Optional[str] = None,
|
||||
self,
|
||||
status: Optional[ExperimentStatus] = None,
|
||||
tags: Optional[List[str]] = None,
|
||||
model_type: Optional[str] = None,
|
||||
) -> List[ExperimentResult]:
|
||||
"""List experiments with optional filtering"""
|
||||
results = list(self._results.values())
|
||||
@@ -97,7 +97,7 @@ class ExperimentTracker:
|
||||
return sorted(results, key=lambda x: x.start_time, reverse=True)
|
||||
|
||||
def get_best_experiment(
|
||||
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
|
||||
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
|
||||
) -> Optional[ExperimentResult]:
|
||||
"""Get the best experiment based on a metric"""
|
||||
experiments = self.list_experiments()
|
||||
@@ -159,8 +159,8 @@ class ExperimentTracker:
|
||||
"""Export all results to CSV"""
|
||||
if output_path is None:
|
||||
output_path = (
|
||||
self.experiments_dir
|
||||
/ f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
|
||||
self.experiments_dir
|
||||
/ f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
|
||||
)
|
||||
|
||||
rows = []
|
||||
|
||||
@@ -43,7 +43,7 @@ class FeatureExtractor:
|
||||
return features_df
|
||||
|
||||
def _extract_single_feature(
|
||||
self, df: pd.DataFrame, feature_type: FeatureType
|
||||
self, df: pd.DataFrame, feature_type: FeatureType
|
||||
) -> Union[pd.Series, pd.DataFrame]:
|
||||
"""Extract a single type of feature"""
|
||||
if feature_type == FeatureType.FULL_NAME:
|
||||
|
||||
@@ -27,13 +27,13 @@ class ModelTrainer:
|
||||
self.models_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def train_single_model(
|
||||
self,
|
||||
model_name: str,
|
||||
model_type: str = "logistic_regression",
|
||||
features: List[str] = None,
|
||||
model_params: Dict[str, Any] = None,
|
||||
tags: List[str] = None,
|
||||
save_artifacts: bool = True,
|
||||
self,
|
||||
model_name: str,
|
||||
model_type: str = "logistic_regression",
|
||||
features: List[str] = None,
|
||||
model_params: Dict[str, Any] = None,
|
||||
tags: List[str] = None,
|
||||
save_artifacts: bool = True,
|
||||
) -> str:
|
||||
"""
|
||||
Train a single model and save its artifacts.
|
||||
@@ -75,7 +75,7 @@ class ModelTrainer:
|
||||
return experiment_id
|
||||
|
||||
def train_multiple_models(
|
||||
self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
|
||||
self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
|
||||
) -> List[str]:
|
||||
"""
|
||||
Train multiple models with different configurations.
|
||||
|
||||
@@ -82,7 +82,9 @@ class EnsembleModel(TraditionalModel):
|
||||
# Soft voting averages probabilities (preferred when members are calibrated);
|
||||
# hard voting uses majority class. Parallelize member predictions.
|
||||
voting_type = params.get("voting", "soft") # 'hard' or 'soft'
|
||||
return VotingClassifier(estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1))
|
||||
return VotingClassifier(
|
||||
estimators=estimators, voting=voting_type, n_jobs=params.get("n_jobs", -1)
|
||||
)
|
||||
|
||||
def prepare_features(self, X: pd.DataFrame) -> np.ndarray:
|
||||
text_features = []
|
||||
|
||||
@@ -55,7 +55,9 @@ class RandomForestModel(TraditionalModel):
|
||||
encoder = self.label_encoders[feature_key]
|
||||
column_clean = column.fillna("unknown").astype(str)
|
||||
known_classes = set(encoder.classes_)
|
||||
default_class = "unknown" if "unknown" in known_classes else encoder.classes_[0]
|
||||
default_class = (
|
||||
"unknown" if "unknown" in known_classes else encoder.classes_[0]
|
||||
)
|
||||
column_mapped = column_clean.apply(
|
||||
lambda value: value if value in known_classes else default_class
|
||||
)
|
||||
|
||||
@@ -36,9 +36,9 @@ class TransformerModel(NeuralNetworkModel):
|
||||
|
||||
# Add positional encoding
|
||||
positions = tf.range(start=0, limit=params.get("max_len", 8), delta=1)
|
||||
pos_embedding = Embedding(input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64))(
|
||||
positions
|
||||
)
|
||||
pos_embedding = Embedding(
|
||||
input_dim=params.get("max_len", 8), output_dim=params.get("embedding_dim", 64)
|
||||
)(positions)
|
||||
x = x + pos_embedding
|
||||
|
||||
x = self._transformer_encoder(x, params)
|
||||
|
||||
@@ -84,7 +84,9 @@ class NeuralNetworkModel(BaseModel):
|
||||
def _collect_text_corpus(self, X: pd.DataFrame) -> List[str]:
|
||||
"""Combine configured textual features into one string per record."""
|
||||
|
||||
column_names = [feature.value for feature in self.config.features if feature.value in X.columns]
|
||||
column_names = [
|
||||
feature.value for feature in self.config.features if feature.value in X.columns
|
||||
]
|
||||
if not column_names:
|
||||
raise ValueError("No configured text features found in the provided DataFrame.")
|
||||
|
||||
@@ -101,7 +103,7 @@ class NeuralNetworkModel(BaseModel):
|
||||
return combined_rows
|
||||
|
||||
def cross_validate(
|
||||
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
|
||||
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
|
||||
) -> dict[str, np.floating[Any]]:
|
||||
features_df = self.feature_extractor.extract_features(X)
|
||||
X_prepared = self.prepare_features(features_df)
|
||||
@@ -158,7 +160,7 @@ class NeuralNetworkModel(BaseModel):
|
||||
}
|
||||
|
||||
def generate_learning_curve(
|
||||
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
|
||||
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate learning curve data for the model"""
|
||||
logging.info(f"Generating learning curve for {self.__class__.__name__}")
|
||||
|
||||
@@ -103,7 +103,7 @@ class TraditionalModel(BaseModel):
|
||||
return results
|
||||
|
||||
def generate_learning_curve(
|
||||
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
|
||||
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate learning curve data for the model"""
|
||||
logging.info(f"Generating learning curve for {self.__class__.__name__}")
|
||||
|
||||
Reference in New Issue
Block a user