feat: add osm data

This commit is contained in:
2025-09-21 16:23:44 +02:00
parent 63e23d6600
commit c1b502c878
39 changed files with 955 additions and 338 deletions
-1
View File
@@ -1,2 +1 @@
from .ner_testing import NERTesting
+24 -21
View File
@@ -13,10 +13,10 @@ from research.model_registry import list_available_models
class Experiments:
def __init__(
self,
config: PipelineConfig,
experiment_tracker: ExperimentTracker,
experiment_runner: ExperimentRunner
self,
config: PipelineConfig,
experiment_tracker: ExperimentTracker,
experiment_runner: ExperimentRunner,
):
self.config = config
self.experiment_tracker = experiment_tracker
@@ -26,8 +26,7 @@ class Experiments:
def index(self):
st.title("Experiments")
tab1, tab2, tab3 = st.tabs(
["Templates", "Experiments", "Batch Experiments"])
tab1, tab2, tab3 = st.tabs(["Templates", "Experiments", "Batch Experiments"])
with tab1:
self.show_template_experiments()
@@ -56,14 +55,18 @@ class Experiments:
self._show_experiments_by_type(available_experiments["advanced"], "advanced")
with exp_tabs[2]:
self._show_experiments_by_type(available_experiments["feature_study"], "feature_study")
self._show_experiments_by_type(
available_experiments["feature_study"], "feature_study"
)
with exp_tabs[3]:
self._show_experiments_by_type(available_experiments["tuning"], "tuning")
except Exception as e:
st.error(f"Error loading experiment templates: {e}")
st.info("Make sure the research templates file exists at `config/research_templates.yaml`")
st.info(
"Make sure the research templates file exists at `config/research_templates.yaml`"
)
def _show_experiments_by_type(self, experiments: List[Dict], experiment_type: str):
"""Show experiments for a specific type"""
@@ -142,7 +145,7 @@ class Experiments:
# Display experiments
for i, exp in enumerate(experiments):
with st.expander(
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
):
self._display_experiment_details(exp, i)
@@ -213,7 +216,7 @@ class Experiments:
experiment_types = st.multiselect(
"Select Experiment Types",
["baseline", "advanced", "feature_study", "tuning"],
default=["baseline"]
default=["baseline"],
)
if experiment_types:
@@ -223,11 +226,11 @@ class Experiments:
experiments = available_experiments.get(exp_type, [])
if experiments:
st.write(f"**{exp_type.title()} Experiments:**")
exp_names = [exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)]
exp_names = [
exp.get("name", f"Exp {i}") for i, exp in enumerate(experiments)
]
selected_names = st.multiselect(
f"Select {exp_type} experiments",
exp_names,
key=f"select_{exp_type}"
f"Select {exp_type} experiments", exp_names, key=f"select_{exp_type}"
)
for name in selected_names:
@@ -308,13 +311,13 @@ class Experiments:
)
def run_batch_experiments(
self,
base_name: str,
model_types: List[str],
ngram_ranges: str,
feature_combinations: List[str],
test_sizes: str,
tags: str,
self,
base_name: str,
model_types: List[str],
ngram_ranges: str,
feature_combinations: List[str],
test_sizes: str,
tags: str,
):
"""Run batch experiments with parameter combinations"""
with st.spinner("Running batch experiments..."):
+1 -1
View File
@@ -38,7 +38,7 @@ class LogReader:
# Parse log entries from the end
entries = []
for line in reversed(lines[-count * 2:]): # Read more lines in case some don't match
for line in reversed(lines[-count * 2 :]): # Read more lines in case some don't match
entry = self._parse_log_line(line.strip())
if entry:
entries.append(entry)
+20 -28
View File
@@ -33,7 +33,9 @@ class NERTesting:
# Load model
if not self.load_ner_model():
st.warning("NER model could not be loaded. Please ensure the model is trained and available.")
st.warning(
"NER model could not be loaded. Please ensure the model is trained and available."
)
return
# Display model information
@@ -53,9 +55,11 @@ class NERTesting:
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Training Examples", f"{self.training_stats.get('training_examples', 0):,}")
st.metric(
"Training Examples", f"{self.training_stats.get('training_examples', 0):,}"
)
with col2:
st.metric("Epochs", self.training_stats.get('epochs', 0))
st.metric("Epochs", self.training_stats.get("epochs", 0))
with col3:
st.metric("Final Loss", f"{self.training_stats.get('final_loss', 0):.2f}")
with col4:
@@ -64,7 +68,7 @@ class NERTesting:
def show_model_evaluation_info(self):
if self.evaluation_stats:
col1, col2, col3 = st.columns(4)
overall = self.evaluation_stats.get('overall', {})
overall = self.evaluation_stats.get("overall", {})
with col1:
st.metric("Overall Precision", f"{overall['precision']:.2f}")
@@ -79,7 +83,7 @@ class NERTesting:
name_input = st.text_input(
"Name:",
placeholder="e.g., Jean Baptiste Mukendi, Marie Kabamba Tshiala, Joseph Kasongo",
help="Enter a full name or multiple names separated by spaces"
help="Enter a full name or multiple names separated by spaces",
)
if name_input.strip():
if st.button("Analyze Name", type="primary"):
@@ -90,12 +94,12 @@ class NERTesting:
"Names:",
placeholder="Jean Baptiste Mukendi\nMarie Kabamba Tshiala\nJoseph Kasongo\nGrace Mbuyi Kalala",
height=150,
help="Enter each name on a new line"
help="Enter each name on a new line",
)
if names_input.strip():
if st.button("Analyze All Names", type="primary"):
names = [name.strip() for name in names_input.split('\n') if name.strip()]
names = [name.strip() for name in names_input.split("\n") if name.strip()]
for i, name in enumerate(names):
st.markdown(f"**Name {i+1}: {name}**")
self.analyze_and_display(name)
@@ -106,12 +110,12 @@ class NERTesting:
try:
result = self.ner_model.predict(text)
st.subheader("Analysis Results")
entities = result.get('entities', [])
entities = result.get("entities", [])
if entities:
self.show_visual_entities(text, entities)
native_count = sum(1 for e in entities if e['label'] == 'NATIVE')
surname_count = sum(1 for e in entities if e['label'] == 'SURNAME')
native_count = sum(1 for e in entities if e["label"] == "NATIVE")
surname_count = sum(1 for e in entities if e["label"] == "SURNAME")
col1, col2, col3 = st.columns(3)
with col1:
@@ -134,29 +138,17 @@ class NERTesting:
# Convert our entities format to spaCy format for displacy
ents = []
for entity in entities:
ents.append({
"start": entity['start'],
"end": entity['end'],
"label": entity['label']
})
ents.append(
{"start": entity["start"], "end": entity["end"], "label": entity["label"]}
)
# Create doc-like structure for displacy
doc_data = {
"text": text,
"ents": ents,
"title": None
}
doc_data = {"text": text, "ents": ents, "title": None}
# Custom colors for our labels
colors = {
"NATIVE": "#74C0FC", # Light blue
"SURNAME": "#69DB7C" # Light green
}
colors = {"NATIVE": "#74C0FC", "SURNAME": "#69DB7C"}  # NATIVE: light blue, SURNAME: light green
options = {
"colors": colors,
"distance": 90
}
options = {"colors": colors, "distance": 90}
# Generate HTML visualization
html = displacy.render(doc_data, style="ent", manual=True, options=options)
+3 -3
View File
@@ -13,7 +13,7 @@ from research.experiment.experiment_tracker import ExperimentTracker
class Predictions:
def __init__(
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
):
self.config = config
self.experiment_tracker = experiment_tracker
@@ -111,7 +111,7 @@ class Predictions:
return None
def _display_single_prediction_results(
self, prediction: str, confidence: Optional[float], experiment, name_input: str
self, prediction: str, confidence: Optional[float], experiment, name_input: str
):
"""Display single prediction results"""
col1, col2 = st.columns(2)
@@ -288,7 +288,7 @@ class Predictions:
return pd.DataFrame()
def _run_dataset_prediction(
self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
):
"""Run dataset prediction and display results"""
with st.spinner("Running predictions..."):
+1 -1
View File
@@ -12,7 +12,7 @@ from research.experiment.experiment_tracker import ExperimentTracker
class ResultsAnalysis:
def __init__(
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
):
self.config = config
self.experiment_tracker = experiment_tracker