From 84f7d41a8407b62e564a818f178d4056409bcae5 Mon Sep 17 00:00:00 2001 From: bernard-ng Date: Sat, 16 Aug 2025 19:05:24 +0200 Subject: [PATCH] feat: web application multipage support --- app.py | 101 ------- config/pipeline.yaml | 7 +- config/research_templates.yaml | 8 +- config/spacy_ner.cfg | 145 ++++++++++ core/config/processing_config.py | 1 + ner.py | 25 +- .../__init__.py => pages/1_๐Ÿ“Š_Dashboard.py | 0 pages/2_๐Ÿ“‹_Data_Overview.py | 0 pages/3_โš™๏ธ_Data_Processing.py | 0 pages/4_๐Ÿงช_Experiments.py | 0 pages/5_๐Ÿ“ˆ_Results_Analysis.py | 0 pages/6_๐Ÿ”ฎ_Predictions.py | 0 pages/7_โš™๏ธ_Configuration.py | 0 pages/README.md | 0 processing/ner/name_builder.py | 68 +++++ ...ner_engineering.py => name_engineering.py} | 11 +- .../ner/{ner_name_model.py => name_model.py} | 14 +- processing/ner/name_tagger.py | 273 ++++++++++++++++++ processing/ner/ner_data_builder.py | 149 ---------- processing/ner/ner_name_tagger.py | 212 -------------- web/__init__.py | 0 web/app.py | 89 ++++++ web/interfaces/__init__.py | 0 .../interfaces}/configuration.py | 4 +- {interface => web/interfaces}/dashboard.py | 2 +- .../interfaces}/data_overview.py | 2 +- .../interfaces}/data_processing.py | 4 +- {interface => web/interfaces}/experiments.py | 5 +- {interface => web/interfaces}/log_reader.py | 0 {interface => web/interfaces}/predictions.py | 5 +- .../interfaces}/results_analysis.py | 5 +- web/pages/1_๐Ÿ“Š_Dashboard.py | 22 ++ web/pages/2_๐Ÿ“‹_Data_Overview.py | 18 ++ web/pages/3_โš™๏ธ_Data_Processing.py | 18 ++ web/pages/4_๐Ÿงช_Experiments.py | 22 ++ web/pages/5_๐Ÿ“ˆ_Results_Analysis.py | 22 ++ web/pages/6_๐Ÿ”ฎ_Predictions.py | 22 ++ web/pages/7_โš™๏ธ_Configuration.py | 18 ++ 38 files changed, 765 insertions(+), 507 deletions(-) delete mode 100644 app.py create mode 100644 config/spacy_ner.cfg rename interface/__init__.py => pages/1_๐Ÿ“Š_Dashboard.py (100%) create mode 100644 pages/2_๐Ÿ“‹_Data_Overview.py create mode 100644 pages/3_โš™๏ธ_Data_Processing.py create mode 100644 pages/4_๐Ÿงช_Experiments.py create mode 100644 pages/5_๐Ÿ“ˆ_Results_Analysis.py create mode 100644 pages/6_๐Ÿ”ฎ_Predictions.py create mode 100644 pages/7_โš™๏ธ_Configuration.py create mode 100644 pages/README.md create mode 100644 processing/ner/name_builder.py rename processing/ner/{ner_engineering.py => name_engineering.py} (95%) rename processing/ner/{ner_name_model.py => name_model.py} (98%) create mode 100644 processing/ner/name_tagger.py create mode 100644 web/__init__.py create mode 100644 web/app.py create mode 100644 web/interfaces/__init__.py rename {interface => web/interfaces}/configuration.py (63%) rename {interface => web/interfaces}/dashboard.py (98%) rename {interface => web/interfaces}/data_overview.py (99%) rename {interface => web/interfaces}/data_processing.py (98%) rename {interface => web/interfaces}/experiments.py (99%) rename {interface => web/interfaces}/log_reader.py (100%) rename {interface => web/interfaces}/predictions.py (99%) rename {interface => web/interfaces}/results_analysis.py (98%) create mode 100644 web/pages/1_๐Ÿ“Š_Dashboard.py create mode 100644 web/pages/2_๐Ÿ“‹_Data_Overview.py create mode 100644 web/pages/3_โš™๏ธ_Data_Processing.py create mode 100644 web/pages/4_๐Ÿงช_Experiments.py create mode 100644 web/pages/5_๐Ÿ“ˆ_Results_Analysis.py create mode 100644 web/pages/6_๐Ÿ”ฎ_Predictions.py create mode 100644 web/pages/7_โš™๏ธ_Configuration.py diff --git a/app.py b/app.py deleted file mode 100644 index 81a5b08..0000000 --- a/app.py +++ /dev/null @@ -1,101 +0,0 @@ -#!.venv/bin/python3 -import 
argparse - -import streamlit as st - -from core.config import setup_config, PipelineConfig -from core.utils.data_loader import DataLoader -from interface.configuration import Configuration -from interface.dashboard import Dashboard -from interface.data_overview import DataOverview -from interface.data_processing import DataProcessing -from interface.experiments import Experiments -from interface.predictions import Predictions -from interface.results_analysis import ResultsAnalysis -from processing.monitoring.pipeline_monitor import PipelineMonitor -from research.experiment.experiment_runner import ExperimentRunner -from research.experiment.experiment_tracker import ExperimentTracker - -# Page configuration -st.set_page_config( - page_title="DRC Names NLP Pipeline", - page_icon="๐Ÿ‡จ๐Ÿ‡ฉ", - layout="wide", - initial_sidebar_state="expanded", -) - - -class StreamlitApp: - """Main Streamlit application class""" - - def __init__(self, config: PipelineConfig): - self.config = config - self.data_loader = DataLoader(self.config) - self.experiment_tracker = ExperimentTracker(self.config) - self.experiment_runner = ExperimentRunner(self.config) - self.pipeline_monitor = PipelineMonitor() - - # Initialize interface components - self.dashboard = Dashboard(self.config, self.experiment_tracker, self.experiment_runner) - self.data_overview = DataOverview(self.config) - self.data_processing = DataProcessing(self.config, self.pipeline_monitor) - self.experiments = Experiments(self.config, self.experiment_tracker, self.experiment_runner) - self.results_analysis = ResultsAnalysis( - self.config, self.experiment_tracker, self.experiment_runner - ) - self.predictions = Predictions(self.config, self.experiment_tracker, self.experiment_runner) - self.configuration = Configuration(self.config) - - # Initialize session state - if "current_experiment" not in st.session_state: - st.session_state.current_experiment = None - if "experiment_results" not in st.session_state: - st.session_state.experiment_results = {} - - def run(self): - st.title("๐Ÿ‡จ๐Ÿ‡ฉ DRC NERS Pipeline") - st.markdown("A comprehensive tool for Congolese name analysis and gender prediction") - - # Sidebar navigation - page = st.sidebar.selectbox( - "Navigation", - [ - "Dashboard", - "Dataset Overview", - "Data Processing", - "Experiments", - "Results & Analysis", - "Predictions", - "Configuration", - ], - ) - - # Route to appropriate page - page_map = { - "Dashboard": self.dashboard.index, - "Dataset Overview": self.data_overview.index, - "Data Processing": self.data_processing.index, - "Experiments": self.experiments.index, - "Results & Analysis": self.results_analysis.index, - "Predictions": self.predictions.index, - "Configuration": self.configuration.index, - } - page_map.get(page, lambda: None)() - - -def main(): - parser = argparse.ArgumentParser( - description="DRC NERS Platform", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument("--config", type=str, help="Path to configuration file") - parser.add_argument("--env", type=str, default="development", help="Environment name") - args = parser.parse_args() - - config = setup_config(args.config, env=args.env) - app = StreamlitApp(config) - app.run() - - -if __name__ == "__main__": - main() diff --git a/config/pipeline.yaml b/config/pipeline.yaml index babc9be..36bb561 100644 --- a/config/pipeline.yaml +++ b/config/pipeline.yaml @@ -18,7 +18,8 @@ paths: checkpoints_dir: "./data/checkpoints" # Directory for model checkpoints # Pipeline stages -stages: # List of stages 
in the processing pipeline +# List of stages in the processing pipeline +stages: - "data_cleaning" # Data cleaning stage - "feature_extraction" # Feature extraction stage - "ner_annotation" # NER-based annotation stage @@ -36,6 +37,7 @@ processing: - "utf-16" - "latin1" chunk_size: 100_000 # Size of data chunks to process in parallel + epochs: 2 # Number of Epochs for training # Annotation settings annotation: @@ -72,8 +74,9 @@ data: balance_by_sex: false # Should the dataset be balanced by sex when limiting the dataset size? # Logging configuration +# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) logging: - level: "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + level: "INFO" format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" file_logging: true # Enable logging to file console_logging: true # Enable logging to console diff --git a/config/research_templates.yaml b/config/research_templates.yaml index a968aec..f8a122c 100644 --- a/config/research_templates.yaml +++ b/config/research_templates.yaml @@ -7,7 +7,7 @@ baseline_experiments: max_len: 20 embedding_dim: 64 gru_units: 32 - epochs: 10 + epochs: 2 batch_size: 32 tags: [ "baseline", "neural", "bigru" ] @@ -21,7 +21,7 @@ baseline_experiments: filters: 64 kernel_size: 3 dropout: 0.5 - epochs: 10 + epochs: 2 batch_size: 32 tags: [ "baseline", "neural", "cnn" ] @@ -79,7 +79,7 @@ baseline_experiments: model_params: embedding_dim: 128 lstm_units: 64 - epochs: 10 + epochs: 2 batch_size: 64 tags: [ "baseline", "neural", "lstm" ] @@ -121,7 +121,7 @@ baseline_experiments: embedding_dim: 128 num_heads: 4 num_layers: 2 - epochs: 10 + epochs: 2 batch_size: 64 tags: [ "baseline", "neural", "transformer" ] diff --git a/config/spacy_ner.cfg b/config/spacy_ner.cfg new file mode 100644 index 0000000..911a774 --- /dev/null +++ b/config/spacy_ner.cfg @@ -0,0 +1,145 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null + +[system] +gpu_allocator = null +seed = 42 + +[nlp] +lang = "fr" +pipeline = ["tok2vec","ner"] +batch_size = 100000 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} +vectors = {"@vectors":"spacy.Vectors.v1"} + +[components] + +[components.ner] +factory = "ner" +incorrect_spans_key = null +moves = null +scorer = {"@scorers":"spacy.ner_scorer.v1"} +update_with_oracle_cut_size = 100 + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "ner" +extra_state_tokens = false +hidden_width = 64 +maxout_pieces = 2 +use_upper = true +nO = null + +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v2" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v2" +width = ${components.tok2vec.model.encode.width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,1000,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[training] 
+dev_corpus = "corpora.dev" +train_corpus = "corpora.train" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +accumulate_gradient = 1 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +frozen_components = [] +annotating_components = [] +before_to_disk = null +before_update = null + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] +ents_f = 1.0 +ents_p = 0.0 +ents_r = 0.0 +ents_per_type = null + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] diff --git a/core/config/processing_config.py b/core/config/processing_config.py index 0037a4e..5d1d705 100644 --- a/core/config/processing_config.py +++ b/core/config/processing_config.py @@ -12,3 +12,4 @@ class ProcessingConfig(BaseModel): use_multiprocessing: bool = False encoding_options: list = field(default_factory=lambda: ["utf-8", "utf-16", "latin1"]) chunk_size: int = 100_000 + epochs: int = 2 diff --git a/ner.py b/ner.py index 1e4ed8f..3b0ac49 100755 --- a/ner.py +++ b/ner.py @@ -7,24 +7,24 @@ import traceback from pathlib import Path from core.config import setup_config, PipelineConfig -from processing.ner.ner_data_builder import NERDataBuilder -from processing.ner.ner_engineering import NEREngineering -from processing.ner.ner_name_model import NERNameModel +from processing.ner.name_builder import NameBuilder +from processing.ner.name_engineering import NameEngineering +from processing.ner.name_model import NameModel def feature(config: PipelineConfig): """Apply feature engineering to create position-independent NER dataset.""" - NEREngineering(config).compute() + NameEngineering(config).compute() def build(config: PipelineConfig): """Build NER dataset using NERDataBuilder.""" - NERDataBuilder(config).build() + NameBuilder(config).build() def train(config: PipelineConfig): """Train the NER model.""" - trainer = NERNameModel(config) + trainer = NameModel(config) data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"] if not data_path.exists(): @@ -39,7 +39,10 @@ def train(config: PipelineConfig): logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}") trainer.train( - data=train_data, epochs=1, batch_size=config.processing.batch_size, dropout_rate=0.3 + data=train_data, + epochs=config.processing.epochs, + batch_size=config.processing.batch_size, + dropout_rate=0.3, ) trainer.evaluate(eval_data) @@ -48,13 +51,17 @@ def train(config: PipelineConfig): def run_pipeline(config: PipelineConfig, reset: bool = False): - if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["engineered"])): + if not reset and os.path.exists( + config.paths.get_data_path(config.data.output_files["engineered"]) + ): logging.info("Step 1: Feature engineering already done.") else: logging.info("Step 1: Running feature engineering") feature(config) - if not reset and 
os.path.exists(config.paths.get_data_path(config.data.output_files["ner_data"])): + if not reset and os.path.exists( + config.paths.get_data_path(config.data.output_files["ner_data"]) + ): logging.info("Step 2: NER dataset already built.") else: logging.info("Step 2: Building NER dataset") diff --git a/interface/__init__.py b/pages/1_๐Ÿ“Š_Dashboard.py similarity index 100% rename from interface/__init__.py rename to pages/1_๐Ÿ“Š_Dashboard.py diff --git a/pages/2_๐Ÿ“‹_Data_Overview.py b/pages/2_๐Ÿ“‹_Data_Overview.py new file mode 100644 index 0000000..e69de29 diff --git a/pages/3_โš™๏ธ_Data_Processing.py b/pages/3_โš™๏ธ_Data_Processing.py new file mode 100644 index 0000000..e69de29 diff --git a/pages/4_๐Ÿงช_Experiments.py b/pages/4_๐Ÿงช_Experiments.py new file mode 100644 index 0000000..e69de29 diff --git a/pages/5_๐Ÿ“ˆ_Results_Analysis.py b/pages/5_๐Ÿ“ˆ_Results_Analysis.py new file mode 100644 index 0000000..e69de29 diff --git a/pages/6_๐Ÿ”ฎ_Predictions.py b/pages/6_๐Ÿ”ฎ_Predictions.py new file mode 100644 index 0000000..e69de29 diff --git a/pages/7_โš™๏ธ_Configuration.py b/pages/7_โš™๏ธ_Configuration.py new file mode 100644 index 0000000..e69de29 diff --git a/pages/README.md b/pages/README.md new file mode 100644 index 0000000..e69de29 diff --git a/processing/ner/name_builder.py b/processing/ner/name_builder.py new file mode 100644 index 0000000..b569f7f --- /dev/null +++ b/processing/ner/name_builder.py @@ -0,0 +1,68 @@ +import json +import logging + +import spacy +from spacy.tokens import DocBin + +from core.config import PipelineConfig +from core.utils.data_loader import DataLoader +from .name_tagger import NameTagger + + +class NameBuilder: + def __init__(self, config: PipelineConfig): + self.config = config + self.data_loader = DataLoader(config) + self.tagger = NameTagger() + + def build(self) -> int: + filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"]) + df = self.data_loader.load_csv_complete(filepath) + df = df[["name", "ner_tagged", "ner_entities"]] + + # Filter early + ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]] + if ner_df.empty: + logging.error("No NER tagged data found") + return 1 + + total_rows = len(df) + del df # No need to keep in memory + + logging.info(f"Found {len(ner_df)} NER tagged entries") + nlp = spacy.blank("fr") + + # Use NameTagger for parsing and validation (instance methods, so call them on self.tagger) + parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"]) + validated_entities = self.tagger.validate_entities(ner_df["name"], parsed_entities) + + # Drop rows with no valid entities + mask = validated_entities.map(bool) + ner_df = ner_df.loc[mask] + validated_entities = validated_entities.loc[mask] + + if ner_df.empty: + logging.error("No valid training examples after validation") + return 1 + + # Prepare training data + training_data = list( + zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities]) + ) + + # Use NameTagger to create the spaCy DocBin + docs = NameTagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist()) + doc_bin = DocBin(docs=docs) + + # Save + json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"]) + spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"]) + + with open(json_path, "w", encoding="utf-8") as f: + json.dump(training_data, f, ensure_ascii=False, separators=(",", ":")) + doc_bin.to_disk(spacy_path) + + logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}") 
+ logging.info(f"Saved NER JSON to {json_path}") + logging.info(f"Saved NER spacy to {spacy_path}") + return 0 diff --git a/processing/ner/ner_engineering.py b/processing/ner/name_engineering.py similarity index 95% rename from processing/ner/ner_engineering.py rename to processing/ner/name_engineering.py index 713654b..0b300f6 100644 --- a/processing/ner/ner_engineering.py +++ b/processing/ner/name_engineering.py @@ -1,5 +1,5 @@ +import gc import random -from typing import List import logging import numpy as np @@ -7,7 +7,7 @@ import pandas as pd from tqdm import tqdm from core.config import PipelineConfig -from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader +from core.utils.data_loader import DataLoader from processing.ner.formats.connectors_format import ConnectorFormatter from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter from processing.ner.formats.native_only_format import NativeOnlyFormatter @@ -16,7 +16,7 @@ from processing.ner.formats.position_flipped_format import PositionFlippedFormat from processing.ner.formats.reduced_native_format import ReducedNativeFormatter -class NEREngineering: +class NameEngineering: """ Feature engineering for NER dataset to prevent position-based learning and encourage sequence characteristic learning. @@ -66,13 +66,16 @@ class NEREngineering: def compute(self) -> None: logging.info("Applying feature engineering transformations...") input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"]) - output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"]) + output_filepath = self.config.paths.get_data_path( + self.config.data.output_files["engineered"] + ) df = self.data_loader.load_csv_complete(input_filepath) ner_df = df[df["ner_tagged"] == 1].copy() logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records") del df # No need to keep in memory + gc.collect() ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index( drop=True diff --git a/processing/ner/ner_name_model.py b/processing/ner/name_model.py similarity index 98% rename from processing/ner/ner_name_model.py rename to processing/ner/name_model.py index ab2aa01..8488424 100644 --- a/processing/ner/ner_name_model.py +++ b/processing/ner/name_model.py @@ -1,3 +1,4 @@ +import ast import json import logging import os @@ -11,7 +12,7 @@ from spacy.util import minibatch from core.config.pipeline_config import PipelineConfig -class NERNameModel: +class NameModel: """NER model trainer using spaCy for DRC names entity recognition""" def __init__(self, config: PipelineConfig): @@ -84,8 +85,6 @@ class NERNameModel: if isinstance(entities_raw, str): # String format from tagger: "[(0, 6, 'NATIVE'), ...]" try: - import ast - entities = ast.literal_eval(entities_raw) if not isinstance(entities, list): logging.warning( @@ -175,9 +174,9 @@ class NERNameModel: def train( self, data: List[Tuple[str, Dict]], - epochs: int = 5, - batch_size: int = 16, - dropout_rate: float = 0.2, + epochs: int = 1, + batch_size: int = 10_000, + dropout_rate: float = 0.3, ) -> None: """Train the NER model""" logging.info(f"Starting NER training with {len(data)} examples") @@ -204,7 +203,7 @@ class NERNameModel: example = Example.from_dict(doc, annotations) examples.append(example) logging.info( - f"Training example: {text[:30]}... 
with entities {annotations.get('entities', [])}" + f"Training example: {text[:30]} with entities {annotations.get('entities', [])}" ) # Train in batches @@ -215,6 +214,7 @@ class NERNameModel: ) logging.info(f"Training batch with {len(batch)} examples, current losses: {losses}") + del batches # free memory epoch_loss = losses.get("ner", 0) losses_history.append(epoch_loss) logging.info(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}") diff --git a/processing/ner/name_tagger.py b/processing/ner/name_tagger.py new file mode 100644 index 0000000..6251ac0 --- /dev/null +++ b/processing/ner/name_tagger.py @@ -0,0 +1,273 @@ +from typing import Union, Dict, Any, List +import ast +import json +import logging +import pandas as pd +from spacy.util import filter_spans + + +class NameTagger: + def tag_name( + self, name: str, probable_native: str, probable_surname: str + ) -> Union[Dict[str, Any], None]: + """Create a single NER training example using probable_native and probable_surname""" + if not name or not probable_native or not probable_surname: + return None + + name = name.strip() + probable_native = probable_native.strip() + probable_surname = probable_surname.strip() + + entities = [] + used_spans = [] # Track used character spans to prevent overlaps + + # Helper function to check if a span overlaps with any existing span + def has_overlap(start, end): + for used_start, used_end in used_spans: + if not (end <= used_start or start >= used_end): + return True + return False + + # Find positions of native names in the full name + native_words = probable_native.split() + name_lower = name.lower() # Use lowercase for consistent searching + processed_native_words = set() + + for native_word in native_words: + native_word = native_word.strip() + if len(native_word) < 2: # Skip very short words + continue + + native_word_lower = native_word.lower() + + # Skip if we've already processed this exact word + if native_word_lower in processed_native_words: + continue + processed_native_words.add(native_word_lower) + + # Find the first occurrence of this native word that doesn't overlap + start_pos = 0 + while True: + pos = name_lower.find(native_word_lower, start_pos) # Case-insensitive search + if pos == -1: + break + + # Calculate end position - make sure we only include the word itself + end_pos = pos + len(native_word_lower) + + # Double-check that the extracted span matches exactly what we expect + extracted_text = name[pos:end_pos] # Get original case text + if extracted_text.lower() != native_word_lower: + start_pos = pos + 1 + continue + + # Check if this is a word boundary match and doesn't overlap + if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap( + pos, end_pos + ): + entities.append((pos, end_pos, "NATIVE")) + used_spans.append((pos, end_pos)) + break # Only take the first non-overlapping occurrence + + start_pos = pos + 1 + + # Find position of surname in the full name + if probable_surname and len(probable_surname.strip()) >= 2: + surname_lower = probable_surname.lower() + + # Find the first occurrence that doesn't overlap + start_pos = 0 + while True: + pos = name_lower.find(surname_lower, start_pos) # Case-insensitive search + if pos == -1: + break + + # Calculate end position correctly - exact match only + end_pos = pos + len(surname_lower) + + # Double-check that the extracted span matches exactly what we expect + extracted_text = name[pos:end_pos] # Get original case text + if extracted_text.lower() != surname_lower: + start_pos = pos + 1 + continue + + if 
self._is_word_boundary_match(name, pos, end_pos) and not has_overlap( + pos, end_pos + ): + entities.append((pos, end_pos, "SURNAME")) + used_spans.append((pos, end_pos)) + break + + start_pos = pos + 1 + + if not entities: + logging.warning( + f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'" + ) + return None + + # Sort entities by position and validate + entities.sort(key=lambda x: x[0]) + + # Final validation - ensure no overlaps and valid spans + validated_entities = [] + for start, end, label in entities: + # Check bounds + if not (0 <= start < end <= len(name)): + logging.warning( + f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'" + ) + continue + + # Check for overlaps with already validated entities + if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities): + logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'") + continue + + # CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces) + span_text = name[start:end] + if not span_text or span_text != span_text.strip() or " " in span_text: + logging.warning( + f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'" + ) + continue + + validated_entities.append((start, end, label)) + + if not validated_entities: + logging.warning(f"No valid entities after validation for: '{name}'") + return None + + # Convert to string format that matches the dataset + entities_str = str(validated_entities) + + return { + "entities": entities_str, + "spans": validated_entities, # Keep the original tuples for internal use + } + + @classmethod + def _is_word_boundary_match(cls, text: str, start: int, end: int) -> bool: + """Check if the match is at word boundaries""" + # Check character before start position + if start > 0: + prev_char = text[start - 1] + if prev_char.isalnum(): + return False + + # Check character after end position + if end < len(text): + next_char = text[end] + if next_char.isalnum(): + return False + + return True + + @classmethod + def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]: + """Extract the actual text for each entity type""" + result = {"NATIVE": [], "SURNAME": []} + + try: + entities = ast.literal_eval(entities_str) + + for start, end, label in entities: + if 0 <= start < end <= len(name): + span_text = name[start:end] + if label in result: + result[label].append(span_text) + + except (ValueError, SyntaxError, TypeError): + pass + + return result + + @classmethod + def parse(cls, entities_str: str) -> List[tuple]: + """Parse entity strings from various formats. + + Supports formats: + - [(start, end, label), ...] + - [[start, end, label], ...] + - [{"start": start, "end": end, "label": label}, ...] 
+ """ + if not entities_str or entities_str in ["[]", "", "nan"]: + return [] + entities_str = str(entities_str).strip() + try: + if entities_str.startswith("[(") and entities_str.endswith(")]"): + return ast.literal_eval(entities_str) + elif entities_str.startswith("[[") and entities_str.endswith("]]"): + return [tuple(e) for e in ast.literal_eval(entities_str)] + elif entities_str.startswith("[{") and entities_str.endswith("}]"): + return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)] + else: + parsed = ast.literal_eval(entities_str) + return [tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3] + except (ValueError, SyntaxError, json.JSONDecodeError): + return [] + + def parse_entities(self, series: pd.Series) -> pd.Series: + """Vectorized parse of entity strings.""" + return series.map(self.parse) + + @classmethod + def validate(cls, text: str, entities: List[tuple]) -> List[tuple]: + """Advanced entity validation with overlap removal. + + This is more comprehensive than the basic validate_entities method. + """ + if not entities or not text: + return [] + text = str(text).strip() + valid = [] + + for ent in entities: + if not isinstance(ent, (list, tuple)) or len(ent) != 3: + continue + start, end, label = ent + try: + start, end = int(start), int(end) + except (ValueError, TypeError): + continue + if not isinstance(label, str): + continue + if not (0 <= start < end <= len(text)): + continue + if not text[start:end].strip(): + continue + valid.append((start, end, label)) + + if not valid: + return [] + + valid.sort(key=lambda x: (x[0], x[1])) + + # Remove overlaps + filtered, last_end = [], -1 + for s, e, l in valid: + if s >= last_end: + filtered.append((s, e, l)) + last_end = e + return filtered + + def validate_entities(self, texts: pd.Series, entities_series: pd.Series) -> pd.Series: + """Vectorized entity validation.""" + return pd.Series(map(self.validate, texts, entities_series), index=texts.index) + + @classmethod + def create_docs(cls, nlp, texts: List[str], entities: List[List[tuple]]) -> List: + """Batch create spaCy Docs from texts and entities.""" + docs = [] + for text, ents in zip(texts, entities): + doc = nlp(text) + spans = [] + for start, end, label in ents: + span = doc.char_span( + start, end, label=label, alignment_mode="contract" + ) or doc.char_span(start, end, label=label, alignment_mode="strict") + if span: + spans.append(span) + doc.ents = filter_spans(spans) + docs.append(doc) + return docs diff --git a/processing/ner/ner_data_builder.py b/processing/ner/ner_data_builder.py index 4d1d5cc..e69de29 100644 --- a/processing/ner/ner_data_builder.py +++ b/processing/ner/ner_data_builder.py @@ -1,149 +0,0 @@ -import ast -import json -import logging -from pathlib import Path - -import pandas as pd -import spacy -from spacy.tokens import DocBin -from spacy.util import filter_spans - -from core.config import PipelineConfig -from core.utils.data_loader import DataLoader - - -class NERDataBuilder: - def __init__(self, config: PipelineConfig): - self.config = config - self.data_loader = DataLoader(config) - - @staticmethod - def _parse_entities(series: pd.Series) -> pd.Series: - """Vectorized parse of entity strings.""" - - def _parse(entities_str): - if not entities_str or entities_str in ["[]", "", "nan"]: - return [] - entities_str = str(entities_str).strip() - try: - if entities_str.startswith("[(") and entities_str.endswith(")]"): - return ast.literal_eval(entities_str) - elif entities_str.startswith("[[") and 
entities_str.endswith("]]"): - return [tuple(e) for e in ast.literal_eval(entities_str)] - elif entities_str.startswith("[{") and entities_str.endswith("}]"): - return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)] - else: - parsed = ast.literal_eval(entities_str) - return [ - tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3 - ] - except (ValueError, SyntaxError, json.JSONDecodeError): - return [] - - return series.map(_parse) - - @staticmethod - def _validate_entities(texts: pd.Series, entities_series: pd.Series) -> pd.Series: - """Vectorized entity validation.""" - - def _validate(text, entities): - if not entities or not text: - return [] - text = str(text).strip() - valid = [] - for ent in entities: - if not isinstance(ent, (list, tuple)) or len(ent) != 3: - continue - start, end, label = ent - try: - start, end = int(start), int(end) - except (ValueError, TypeError): - continue - if not isinstance(label, str): - continue - if not (0 <= start < end <= len(text)): - continue - if not text[start:end].strip(): - continue - valid.append((start, end, label)) - if not valid: - return [] - valid.sort(key=lambda x: (x[0], x[1])) - # remove overlaps - filtered, last_end = [], -1 - for s, e, l in valid: - if s >= last_end: - filtered.append((s, e, l)) - last_end = e - return filtered - - return pd.Series(map(_validate, texts, entities_series), index=texts.index) - - @staticmethod - def _create_docs(nlp, texts, entities): - """Batch create spaCy Docs.""" - docs = [] - for text, ents in zip(texts, entities): - doc = nlp(text) - spans = [] - for start, end, label in ents: - span = doc.char_span( - start, end, label=label, alignment_mode="contract" - ) or doc.char_span(start, end, label=label, alignment_mode="strict") - if span: - spans.append(span) - doc.ents = filter_spans(spans) - docs.append(doc) - return docs - - def build(self) -> int: - filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"]) - df = self.data_loader.load_csv_complete(filepath) - df = df[["name", "ner_tagged", "ner_entities"]] - - # Filter early - ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]] - if ner_df.empty: - logging.error("No NER tagged data found") - return 1 - - total_rows = len(df) - del df # No need to keep in memory - - logging.info(f"Found {len(ner_df)} NER tagged entries") - nlp = spacy.blank("fr") - - # Vectorized parsing + validation - parsed_entities = self._parse_entities(ner_df["ner_entities"]) - validated_entities = self._validate_entities(ner_df["name"], parsed_entities) - - # Drop rows with no valid entities - mask = validated_entities.map(bool) - ner_df = ner_df.loc[mask] - validated_entities = validated_entities.loc[mask] - - if ner_df.empty: - logging.error("No valid training examples after validation") - return 1 - - # Prepare training data - training_data = list( - zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities]) - ) - - # Create spaCy DocBin in batch - docs = self._create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist()) - doc_bin = DocBin(docs=docs) - - # Save - json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"]) - spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"]) - - with open(json_path, "w", encoding="utf-8") as f: - json.dump(training_data, f, ensure_ascii=False, separators=(",", ":")) - doc_bin.to_disk(spacy_path) - - logging.info(f"Processed: {len(training_data)}, Skipped: 
{total_rows - len(training_data)}") - logging.info(f"Saved NER JSON to {json_path}") - logging.info(f"Saved NER spacy to {spacy_path}") - return 0 diff --git a/processing/ner/ner_name_tagger.py b/processing/ner/ner_name_tagger.py index 3eeee9c..e69de29 100644 --- a/processing/ner/ner_name_tagger.py +++ b/processing/ner/ner_name_tagger.py @@ -1,212 +0,0 @@ -from typing import Union, Dict, Any, List -import logging - - -class NERNameTagger: - def tag_name( - self, name: str, probable_native: str, probable_surname: str - ) -> Union[Dict[str, Any], None]: - """Create a single NER training example using probable_native and probable_surname""" - if not name or not probable_native or not probable_surname: - return None - - name = name.strip() - probable_native = probable_native.strip() - probable_surname = probable_surname.strip() - - entities = [] - used_spans = [] # Track used character spans to prevent overlaps - - # Helper function to check if a span overlaps with any existing span - def has_overlap(start, end): - for used_start, used_end in used_spans: - if not (end <= used_start or start >= used_end): - return True - return False - - # Find positions of native names in the full name - native_words = probable_native.split() - name_lower = name.lower() # Use lowercase for consistent searching - processed_native_words = set() - - for native_word in native_words: - native_word = native_word.strip() - if len(native_word) < 2: # Skip very short words - continue - - native_word_lower = native_word.lower() - - # Skip if we've already processed this exact word - if native_word_lower in processed_native_words: - continue - processed_native_words.add(native_word_lower) - - # Find the first occurrence of this native word that doesn't overlap - start_pos = 0 - while True: - pos = name_lower.find(native_word_lower, start_pos) # Case-insensitive search - if pos == -1: - break - - # Calculate end position - make sure we only include the word itself - end_pos = pos + len(native_word_lower) - - # Double-check that the extracted span matches exactly what we expect - extracted_text = name[pos:end_pos] # Get original case text - if extracted_text.lower() != native_word_lower: - start_pos = pos + 1 - continue - - # Check if this is a word boundary match and doesn't overlap - if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap( - pos, end_pos - ): - entities.append((pos, end_pos, "NATIVE")) - used_spans.append((pos, end_pos)) - break # Only take the first non-overlapping occurrence - - start_pos = pos + 1 - - # Find position of surname in the full name - if probable_surname and len(probable_surname.strip()) >= 2: - surname_lower = probable_surname.lower() - - # Find the first occurrence that doesn't overlap - start_pos = 0 - while True: - pos = name_lower.find(surname_lower, start_pos) # Case-insensitive search - if pos == -1: - break - - # Calculate end position correctly - exact match only - end_pos = pos + len(surname_lower) - - # Double-check that the extracted span matches exactly what we expect - extracted_text = name[pos:end_pos] # Get original case text - if extracted_text.lower() != surname_lower: - start_pos = pos + 1 - continue - - if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap( - pos, end_pos - ): - entities.append((pos, end_pos, "SURNAME")) - used_spans.append((pos, end_pos)) - break - - start_pos = pos + 1 - - if not entities: - logging.warning( - f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: 
'{probable_surname}'" - ) - return None - - # Sort entities by position and validate - entities.sort(key=lambda x: x[0]) - - # Final validation - ensure no overlaps and valid spans - validated_entities = [] - for start, end, label in entities: - # Check bounds - if not (0 <= start < end <= len(name)): - logging.warning( - f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'" - ) - continue - - # Check for overlaps with already validated entities - if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities): - logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'") - continue - - # CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces) - span_text = name[start:end] - if not span_text or span_text != span_text.strip() or " " in span_text: - logging.warning( - f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'" - ) - continue - - validated_entities.append((start, end, label)) - - if not validated_entities: - logging.warning(f"No valid entities after validation for: '{name}'") - return None - - # Convert to string format that matches the dataset - entities_str = str(validated_entities) - - return { - "entities": entities_str, - "spans": validated_entities, # Keep the original tuples for internal use - } - - @classmethod - def _is_word_boundary_match(cls, text: str, start: int, end: int) -> bool: - """Check if the match is at word boundaries""" - # Check character before start position - if start > 0: - prev_char = text[start - 1] - if prev_char.isalnum(): - return False - - # Check character after end position - if end < len(text): - next_char = text[end] - if next_char.isalnum(): - return False - - return True - - @classmethod - def validate_entities(cls, name: str, entities_str: str) -> bool: - """Validate that entity annotations are correct for a given name""" - try: - import ast - - entities = ast.literal_eval(entities_str) - - # Check for overlaps and valid bounds - sorted_entities = sorted(entities, key=lambda x: x[0]) - - for i, (start, end, label) in enumerate(sorted_entities): - # Check bounds - if not (0 <= start < end <= len(name)): - return False - - # Check for overlaps with next entity - if i < len(sorted_entities) - 1: - next_start = sorted_entities[i + 1][0] - if end > next_start: - return False - - # Extract the text span and validate it's not empty - span_text = name[start:end] - if not span_text.strip(): - return False - - return True - except (ValueError, SyntaxError, TypeError): - return False - - @classmethod - def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]: - """Extract the actual text for each entity type""" - result = {"NATIVE": [], "SURNAME": []} - - try: - import ast - - entities = ast.literal_eval(entities_str) - - for start, end, label in entities: - if 0 <= start < end <= len(name): - span_text = name[start:end] - if label in result: - result[label].append(span_text) - - except (ValueError, SyntaxError, TypeError): - pass - - return result diff --git a/web/__init__.py b/web/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/app.py b/web/app.py new file mode 100644 index 0000000..9b5b487 --- /dev/null +++ b/web/app.py @@ -0,0 +1,89 @@ +#!.venv/bin/python3 +import argparse +import sys +from pathlib import Path +import streamlit as st + +# Add parent directory to Python path to access core modules +parent_dir = Path(__file__).parent.parent +sys.path.insert(0, str(parent_dir)) + +from 
core.config import setup_config, PipelineConfig +from core.utils.data_loader import DataLoader +from processing.monitoring.pipeline_monitor import PipelineMonitor +from research.experiment.experiment_runner import ExperimentRunner +from research.experiment.experiment_tracker import ExperimentTracker +from web.interfaces.configuration import Configuration +from web.interfaces.dashboard import Dashboard +from web.interfaces.data_overview import DataOverview +from web.interfaces.data_processing import DataProcessing +from web.interfaces.experiments import Experiments +from web.interfaces.predictions import Predictions +from web.interfaces.results_analysis import ResultsAnalysis + +# Page configuration +st.set_page_config( + page_title="DRC NERS Platform", + page_icon="๐Ÿ‡จ๐Ÿ‡ฉ", + layout="wide", + initial_sidebar_state="expanded", +) + + +def initialize_session_state(config: PipelineConfig): + """Initialize session state variables""" + if "config" not in st.session_state: + st.session_state.config = config + if "data_loader" not in st.session_state: + st.session_state.data_loader = DataLoader(config) + if "experiment_tracker" not in st.session_state: + st.session_state.experiment_tracker = ExperimentTracker(config) + if "experiment_runner" not in st.session_state: + st.session_state.experiment_runner = ExperimentRunner(config) + if "pipeline_monitor" not in st.session_state: + st.session_state.pipeline_monitor = PipelineMonitor() + if "current_experiment" not in st.session_state: + st.session_state.current_experiment = None + if "experiment_results" not in st.session_state: + st.session_state.experiment_results = {} + + +class StreamlitApp: + def __init__(self, config: PipelineConfig): + self.config = config + initialize_session_state(config) + + def run(self): + st.title("๐Ÿ‡จ๐Ÿ‡ฉ DRC NERS Pipeline") + st.markdown( + "A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference" + ) + + st.markdown( + """ + ## Overview + Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often + underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training + data. + This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5 + million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata. 
+ """ + ) + + +def main(): + parser = argparse.ArgumentParser( + description="DRC NERS Platform", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("--config", type=str, help="Path to configuration file") + parser.add_argument("--env", type=str, default="development", help="Environment name") + args = parser.parse_args() + + config = setup_config(args.config, env=args.env) + app = StreamlitApp(config) + app.run() + + +if __name__ == "__main__": + main() diff --git a/web/interfaces/__init__.py b/web/interfaces/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/interface/configuration.py b/web/interfaces/configuration.py similarity index 63% rename from interface/configuration.py rename to web/interfaces/configuration.py index bc843e9..712f613 100644 --- a/interface/configuration.py +++ b/web/interfaces/configuration.py @@ -2,11 +2,9 @@ import streamlit as st class Configuration: - """Handles configuration display and management""" - def __init__(self, config): self.config = config def index(self): - st.header("Current Configuration") + st.title("Configuration") st.json(self.config.model_dump()) diff --git a/interface/dashboard.py b/web/interfaces/dashboard.py similarity index 98% rename from interface/dashboard.py rename to web/interfaces/dashboard.py index 5287322..804c66b 100644 --- a/interface/dashboard.py +++ b/web/interfaces/dashboard.py @@ -20,7 +20,7 @@ class Dashboard: self.experiment_runner = experiment_runner def index(self): - st.header("Dashboard") + st.title("Dashboard") col1, col2, col3, col4 = st.columns(4) # Load basic statistics diff --git a/interface/data_overview.py b/web/interfaces/data_overview.py similarity index 99% rename from interface/data_overview.py rename to web/interfaces/data_overview.py index fc34190..74a3acc 100644 --- a/interface/data_overview.py +++ b/web/interfaces/data_overview.py @@ -21,7 +21,7 @@ class DataOverview: self.config = config def index(self): - st.header("Data Overview") + st.title("Data Overview") data_files = { "Names": self.config.data.input_file, "Featured Dataset": self.config.data.output_files["featured"], diff --git a/interface/data_processing.py b/web/interfaces/data_processing.py similarity index 98% rename from interface/data_processing.py rename to web/interfaces/data_processing.py index 4f6093e..1f82d53 100644 --- a/interface/data_processing.py +++ b/web/interfaces/data_processing.py @@ -3,7 +3,7 @@ import plotly.express as px import streamlit as st from core.utils.data_loader import OPTIMIZED_DTYPES -from interface.log_reader import LogReader +from web.interfaces.log_reader import LogReader @st.cache_data @@ -21,7 +21,7 @@ class DataProcessing: self.pipeline_monitor = pipeline_monitor def index(self): - st.header("Data Processing Pipeline") + st.title("Data Processing") status = self.pipeline_monitor.get_pipeline_status() # Overall progress diff --git a/interface/experiments.py b/web/interfaces/experiments.py similarity index 99% rename from interface/experiments.py rename to web/interfaces/experiments.py index aa519ec..dd258c2 100644 --- a/interface/experiments.py +++ b/web/interfaces/experiments.py @@ -12,8 +12,6 @@ from research.model_registry import list_available_models class Experiments: - """Handles experiment management interface""" - def __init__( self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner ): @@ -22,8 +20,7 @@ class Experiments: self.experiment_runner = experiment_runner def index(self): - """Main experiments page""" - 
st.header("Experiment Management") + st.title("Experiments") tab1, tab2, tab3 = st.tabs(["New Experiment", "Experiment List", "Batch Experiments"]) with tab1: diff --git a/interface/log_reader.py b/web/interfaces/log_reader.py similarity index 100% rename from interface/log_reader.py rename to web/interfaces/log_reader.py diff --git a/interface/predictions.py b/web/interfaces/predictions.py similarity index 99% rename from interface/predictions.py rename to web/interfaces/predictions.py index 56bbf99..b3804d1 100644 --- a/interface/predictions.py +++ b/web/interfaces/predictions.py @@ -12,8 +12,6 @@ from research.experiment.experiment_tracker import ExperimentTracker class Predictions: - """Handles prediction interface""" - def __init__( self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner ): @@ -22,8 +20,7 @@ class Predictions: self.experiment_runner = experiment_runner def index(self): - """Main predictions page""" - st.header("Make Predictions") + st.title("Predictions") # Load available models experiments = self.experiment_tracker.list_experiments() diff --git a/interface/results_analysis.py b/web/interfaces/results_analysis.py similarity index 98% rename from interface/results_analysis.py rename to web/interfaces/results_analysis.py index 22123f9..aa3d52c 100644 --- a/interface/results_analysis.py +++ b/web/interfaces/results_analysis.py @@ -11,8 +11,6 @@ from research.experiment.experiment_tracker import ExperimentTracker class ResultsAnalysis: - """Handles experiment results and analysis interface""" - def __init__( self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner ): @@ -21,8 +19,7 @@ class ResultsAnalysis: self.experiment_runner = experiment_runner def index(self): - """Main results analysis page""" - st.header("Results & Analysis") + st.title("Results & Analysis") tab1, tab2, tab3 = st.tabs( ["Experiment Comparison", "Performance Analysis", "Model Analysis"] ) diff --git a/web/pages/1_๐Ÿ“Š_Dashboard.py b/web/pages/1_๐Ÿ“Š_Dashboard.py new file mode 100644 index 0000000..3cb186c --- /dev/null +++ b/web/pages/1_๐Ÿ“Š_Dashboard.py @@ -0,0 +1,22 @@ +import sys +from pathlib import Path +import streamlit as st + +# Add parent directory to Python path to access core modules +parent_dir = Path(__file__).parent.parent.parent +sys.path.insert(0, str(parent_dir)) + +from web.interfaces.dashboard import Dashboard + +st.set_page_config(page_title="Dashboard", page_icon="๐Ÿ“Š", layout="wide") + +if "config" in st.session_state: + dashboard = Dashboard( + st.session_state.config, + st.session_state.experiment_tracker, + st.session_state.experiment_runner, + ) + dashboard.index() +else: + st.error("Please run the main app first to initialize the configuration.") + st.markdown("Go back to the [main page](/) to start the application.") diff --git a/web/pages/2_๐Ÿ“‹_Data_Overview.py b/web/pages/2_๐Ÿ“‹_Data_Overview.py new file mode 100644 index 0000000..8a520e1 --- /dev/null +++ b/web/pages/2_๐Ÿ“‹_Data_Overview.py @@ -0,0 +1,18 @@ +import sys +from pathlib import Path +import streamlit as st + +# Add parent directory to Python path to access core modules +parent_dir = Path(__file__).parent.parent.parent +sys.path.insert(0, str(parent_dir)) + +from web.interfaces.data_overview import DataOverview + +st.set_page_config(page_title="Data Overview", page_icon="๐Ÿ“‹", layout="wide") + +if "config" in st.session_state: + data_overview = DataOverview(st.session_state.config) + data_overview.index() +else: + st.error("Please run the 
main app first to initialize the configuration.") + st.markdown("Go back to the [main page](/) to start the application.") diff --git a/web/pages/3_โš™๏ธ_Data_Processing.py b/web/pages/3_โš™๏ธ_Data_Processing.py new file mode 100644 index 0000000..d028daf --- /dev/null +++ b/web/pages/3_โš™๏ธ_Data_Processing.py @@ -0,0 +1,18 @@ +import sys +from pathlib import Path +import streamlit as st + +# Add parent directory to Python path to access core modules +parent_dir = Path(__file__).parent.parent.parent +sys.path.insert(0, str(parent_dir)) + +from web.interfaces.data_processing import DataProcessing + +st.set_page_config(page_title="Data Processing", page_icon="โš™๏ธ", layout="wide") + +if "config" in st.session_state: + data_processing = DataProcessing(st.session_state.config, st.session_state.pipeline_monitor) + data_processing.index() +else: + st.error("Please run the main app first to initialize the configuration.") + st.markdown("Go back to the [main page](/) to start the application.") diff --git a/web/pages/4_๐Ÿงช_Experiments.py b/web/pages/4_๐Ÿงช_Experiments.py new file mode 100644 index 0000000..880b5d9 --- /dev/null +++ b/web/pages/4_๐Ÿงช_Experiments.py @@ -0,0 +1,22 @@ +import sys +from pathlib import Path +import streamlit as st + +# Add parent directory to Python path to access core modules +parent_dir = Path(__file__).parent.parent.parent +sys.path.insert(0, str(parent_dir)) + +from web.interfaces.experiments import Experiments + +st.set_page_config(page_title="Experiments", page_icon="๐Ÿงช", layout="wide") + +if "config" in st.session_state: + experiments = Experiments( + st.session_state.config, + st.session_state.experiment_tracker, + st.session_state.experiment_runner, + ) + experiments.index() +else: + st.error("Please run the main app first to initialize the configuration.") + st.markdown("Go back to the [main page](/) to start the application.") diff --git a/web/pages/5_๐Ÿ“ˆ_Results_Analysis.py b/web/pages/5_๐Ÿ“ˆ_Results_Analysis.py new file mode 100644 index 0000000..593dc8a --- /dev/null +++ b/web/pages/5_๐Ÿ“ˆ_Results_Analysis.py @@ -0,0 +1,22 @@ +import sys +from pathlib import Path +import streamlit as st + +# Add parent directory to Python path to access core modules +parent_dir = Path(__file__).parent.parent.parent +sys.path.insert(0, str(parent_dir)) + +from web.interfaces.results_analysis import ResultsAnalysis + +st.set_page_config(page_title="Results & Analysis", page_icon="๐Ÿ“ˆ", layout="wide") + +if "config" in st.session_state: + results_analysis = ResultsAnalysis( + st.session_state.config, + st.session_state.experiment_tracker, + st.session_state.experiment_runner, + ) + results_analysis.index() +else: + st.error("Please run the main app first to initialize the configuration.") + st.markdown("Go back to the [main page](/) to start the application.") diff --git a/web/pages/6_๐Ÿ”ฎ_Predictions.py b/web/pages/6_๐Ÿ”ฎ_Predictions.py new file mode 100644 index 0000000..1fa3a2a --- /dev/null +++ b/web/pages/6_๐Ÿ”ฎ_Predictions.py @@ -0,0 +1,22 @@ +import sys +from pathlib import Path +import streamlit as st + +# Add parent directory to Python path to access core modules +parent_dir = Path(__file__).parent.parent.parent +sys.path.insert(0, str(parent_dir)) + +from web.interfaces.predictions import Predictions + +st.set_page_config(page_title="Predictions", page_icon="๐Ÿ”ฎ", layout="wide") + +if "config" in st.session_state: + predictions = Predictions( + st.session_state.config, + st.session_state.experiment_tracker, + st.session_state.experiment_runner, + ) + 
predictions.index() +else: + st.error("Please run the main app first to initialize the configuration.") + st.markdown("Go back to the [main page](/) to start the application.") diff --git a/web/pages/7_โš™๏ธ_Configuration.py b/web/pages/7_โš™๏ธ_Configuration.py new file mode 100644 index 0000000..abd2f8e --- /dev/null +++ b/web/pages/7_โš™๏ธ_Configuration.py @@ -0,0 +1,18 @@ +import sys +from pathlib import Path +import streamlit as st + +# Add parent directory to Python path to access core modules +parent_dir = Path(__file__).parent.parent.parent +sys.path.insert(0, str(parent_dir)) + +from web.interfaces.configuration import Configuration + +st.set_page_config(page_title="Configuration", page_icon="โš™๏ธ", layout="wide") + +if "config" in st.session_state: + configuration = Configuration(st.session_state.config) + configuration.index() +else: + st.error("Please run the main app first to initialize the configuration.") + st.markdown("Go back to the [main page](/) to start the application.")