feat: web application multipage support
@@ -1,101 +0,0 @@
#!.venv/bin/python3
import argparse

import streamlit as st

from core.config import setup_config, PipelineConfig
from core.utils.data_loader import DataLoader
from interface.configuration import Configuration
from interface.dashboard import Dashboard
from interface.data_overview import DataOverview
from interface.data_processing import DataProcessing
from interface.experiments import Experiments
from interface.predictions import Predictions
from interface.results_analysis import ResultsAnalysis
from processing.monitoring.pipeline_monitor import PipelineMonitor
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker

# Page configuration
st.set_page_config(
    page_title="DRC Names NLP Pipeline",
    page_icon="🇨🇩",
    layout="wide",
    initial_sidebar_state="expanded",
)


class StreamlitApp:
    """Main Streamlit application class"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.data_loader = DataLoader(self.config)
        self.experiment_tracker = ExperimentTracker(self.config)
        self.experiment_runner = ExperimentRunner(self.config)
        self.pipeline_monitor = PipelineMonitor()

        # Initialize interface components
        self.dashboard = Dashboard(self.config, self.experiment_tracker, self.experiment_runner)
        self.data_overview = DataOverview(self.config)
        self.data_processing = DataProcessing(self.config, self.pipeline_monitor)
        self.experiments = Experiments(self.config, self.experiment_tracker, self.experiment_runner)
        self.results_analysis = ResultsAnalysis(
            self.config, self.experiment_tracker, self.experiment_runner
        )
        self.predictions = Predictions(self.config, self.experiment_tracker, self.experiment_runner)
        self.configuration = Configuration(self.config)

        # Initialize session state
        if "current_experiment" not in st.session_state:
            st.session_state.current_experiment = None
        if "experiment_results" not in st.session_state:
            st.session_state.experiment_results = {}

    def run(self):
        st.title("🇨🇩 DRC NERS Pipeline")
        st.markdown("A comprehensive tool for Congolese name analysis and gender prediction")

        # Sidebar navigation
        page = st.sidebar.selectbox(
            "Navigation",
            [
                "Dashboard",
                "Dataset Overview",
                "Data Processing",
                "Experiments",
                "Results & Analysis",
                "Predictions",
                "Configuration",
            ],
        )

        # Route to appropriate page
        page_map = {
            "Dashboard": self.dashboard.index,
            "Dataset Overview": self.data_overview.index,
            "Data Processing": self.data_processing.index,
            "Experiments": self.experiments.index,
            "Results & Analysis": self.results_analysis.index,
            "Predictions": self.predictions.index,
            "Configuration": self.configuration.index,
        }
        page_map.get(page, lambda: None)()


def main():
    parser = argparse.ArgumentParser(
        description="DRC NERS Platform",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--config", type=str, help="Path to configuration file")
    parser.add_argument("--env", type=str, default="development", help="Environment name")
    args = parser.parse_args()

    config = setup_config(args.config, env=args.env)
    app = StreamlitApp(config)
    app.run()


if __name__ == "__main__":
    main()
@@ -18,7 +18,8 @@ paths:
   checkpoints_dir: "./data/checkpoints" # Directory for model checkpoints

   # Pipeline stages
-  stages: # List of stages in the processing pipeline
+  # List of stages in the processing pipeline
+  stages:
     - "data_cleaning" # Data cleaning stage
     - "feature_extraction" # Feature extraction stage
     - "ner_annotation" # NER-based annotation stage
@@ -36,6 +37,7 @@ processing:
     - "utf-16"
     - "latin1"
   chunk_size: 100_000 # Size of data chunks to process in parallel
+  epochs: 2 # Number of Epochs for training

   # Annotation settings
   annotation:
@@ -72,8 +74,9 @@ data:
   balance_by_sex: false # Should the dataset be balanced by sex when limiting the dataset size?

 # Logging configuration
+# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
 logging:
-  level: "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+  level: "INFO"
   format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
   file_logging: true # Enable logging to file
   console_logging: true # Enable logging to console
@@ -7,7 +7,7 @@ baseline_experiments:
     max_len: 20
     embedding_dim: 64
     gru_units: 32
-    epochs: 10
+    epochs: 2
     batch_size: 32
     tags: [ "baseline", "neural", "bigru" ]

@@ -21,7 +21,7 @@ baseline_experiments:
     filters: 64
     kernel_size: 3
     dropout: 0.5
-    epochs: 10
+    epochs: 2
     batch_size: 32
     tags: [ "baseline", "neural", "cnn" ]

@@ -79,7 +79,7 @@ baseline_experiments:
   model_params:
     embedding_dim: 128
     lstm_units: 64
-    epochs: 10
+    epochs: 2
     batch_size: 64
     tags: [ "baseline", "neural", "lstm" ]

@@ -121,7 +121,7 @@ baseline_experiments:
     embedding_dim: 128
     num_heads: 4
     num_layers: 2
-    epochs: 10
+    epochs: 2
     batch_size: 64
     tags: [ "baseline", "neural", "transformer" ]
@@ -0,0 +1,145 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 42

[nlp]
lang = "fr"
pipeline = ["tok2vec","ner"]
batch_size = 100000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,1000,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null
before_update = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
@@ -12,3 +12,4 @@ class ProcessingConfig(BaseModel):
     use_multiprocessing: bool = False
     encoding_options: list = field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
     chunk_size: int = 100_000
+    epochs: int = 2
@@ -7,24 +7,24 @@ import traceback
 from pathlib import Path

 from core.config import setup_config, PipelineConfig
-from processing.ner.ner_data_builder import NERDataBuilder
-from processing.ner.ner_engineering import NEREngineering
-from processing.ner.ner_name_model import NERNameModel
+from processing.ner.name_builder import NameBuilder
+from processing.ner.name_engineering import NameEngineering
+from processing.ner.name_model import NameModel


 def feature(config: PipelineConfig):
     """Apply feature engineering to create position-independent NER dataset."""
-    NEREngineering(config).compute()
+    NameEngineering(config).compute()


 def build(config: PipelineConfig):
     """Build NER dataset using NERDataBuilder."""
-    NERDataBuilder(config).build()
+    NameBuilder(config).build()


 def train(config: PipelineConfig):
     """Train the NER model."""
-    trainer = NERNameModel(config)
+    trainer = NameModel(config)

     data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
     if not data_path.exists():
@@ -39,7 +39,10 @@ def train(config: PipelineConfig):

     logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
     trainer.train(
-        data=train_data, epochs=1, batch_size=config.processing.batch_size, dropout_rate=0.3
+        data=train_data,
+        epochs=config.processing.epochs,
+        batch_size=config.processing.batch_size,
+        dropout_rate=0.3,
     )
     trainer.evaluate(eval_data)

@@ -48,13 +51,17 @@ def train(config: PipelineConfig):


 def run_pipeline(config: PipelineConfig, reset: bool = False):
-    if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["engineered"])):
+    if not reset and os.path.exists(
+        config.paths.get_data_path(config.data.output_files["engineered"])
+    ):
         logging.info("Step 1: Feature engineering already done.")
     else:
         logging.info("Step 1: Running feature engineering")
         feature(config)

-    if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["ner_data"])):
+    if not reset and os.path.exists(
+        config.paths.get_data_path(config.data.output_files["ner_data"])
+    ):
         logging.info("Step 2: NER dataset already built.")
     else:
         logging.info("Step 2: Building NER dataset")
@@ -0,0 +1,68 @@
import json
import logging

import spacy
from spacy.tokens import DocBin

from core.config import PipelineConfig
from core.utils.data_loader import DataLoader
from .name_tagger import NameTagger


class NameBuilder:
    def __init__(self, config: PipelineConfig):
        self.config = config
        self.data_loader = DataLoader(config)
        self.tagger = NameTagger()

    def build(self) -> int:
        filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
        df = self.data_loader.load_csv_complete(filepath)
        df = df[["name", "ner_tagged", "ner_entities"]]

        # Filter early
        ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
        if ner_df.empty:
            logging.error("No NER tagged data found")
            return 1

        total_rows = len(df)
        del df  # No need to keep in memory

        logging.info(f"Found {len(ner_df)} NER tagged entries")
        nlp = spacy.blank("fr")

        # Use NERNameTagger for parsing and validation
        parsed_entities = NameTagger.parse_entities(ner_df["ner_entities"])
        validated_entities = NameTagger.validate_entities(ner_df["name"], parsed_entities)

        # Drop rows with no valid entities
        mask = validated_entities.map(bool)
        ner_df = ner_df.loc[mask]
        validated_entities = validated_entities.loc[mask]

        if ner_df.empty:
            logging.error("No valid training examples after validation")
            return 1

        # Prepare training data
        training_data = list(
            zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
        )

        # Use NERNameTagger to create spaCy DocBin
        docs = NameTagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
        doc_bin = DocBin(docs=docs)

        # Save
        json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
        spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
        doc_bin.to_disk(spacy_path)

        logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
        logging.info(f"Saved NER JSON to {json_path}")
        logging.info(f"Saved NER spacy to {spacy_path}")
        return 0
@@ -1,5 +1,5 @@
+import gc
 import random
-from typing import List
 import logging

 import numpy as np
@@ -7,7 +7,7 @@ import pandas as pd
 from tqdm import tqdm

 from core.config import PipelineConfig
-from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
+from core.utils.data_loader import DataLoader
 from processing.ner.formats.connectors_format import ConnectorFormatter
 from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
 from processing.ner.formats.native_only_format import NativeOnlyFormatter
@@ -16,7 +16,7 @@ from processing.ner.formats.position_flipped_format import PositionFlippedFormat
 from processing.ner.formats.reduced_native_format import ReducedNativeFormatter


-class NEREngineering:
+class NameEngineering:
     """
     Feature engineering for NER dataset to prevent position-based learning
     and encourage sequence characteristic learning.
@@ -66,13 +66,16 @@ class NEREngineering:
     def compute(self) -> None:
         logging.info("Applying feature engineering transformations...")
         input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
-        output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
+        output_filepath = self.config.paths.get_data_path(
+            self.config.data.output_files["engineered"]
+        )

         df = self.data_loader.load_csv_complete(input_filepath)
         ner_df = df[df["ner_tagged"] == 1].copy()
         logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records")

         del df  # No need to keep in memory
+        gc.collect()

         ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
             drop=True
@@ -1,3 +1,4 @@
+import ast
 import json
 import logging
 import os
@@ -11,7 +12,7 @@ from spacy.util import minibatch
 from core.config.pipeline_config import PipelineConfig


-class NERNameModel:
+class NameModel:
     """NER model trainer using spaCy for DRC names entity recognition"""

     def __init__(self, config: PipelineConfig):
@@ -84,8 +85,6 @@ class NERNameModel:
         if isinstance(entities_raw, str):
             # String format from tagger: "[(0, 6, 'NATIVE'), ...]"
             try:
-                import ast
-
                 entities = ast.literal_eval(entities_raw)
                 if not isinstance(entities, list):
                     logging.warning(
@@ -175,9 +174,9 @@ class NERNameModel:
     def train(
         self,
         data: List[Tuple[str, Dict]],
-        epochs: int = 5,
-        batch_size: int = 16,
-        dropout_rate: float = 0.2,
+        epochs: int = 1,
+        batch_size: int = 10_000,
+        dropout_rate: float = 0.3,
     ) -> None:
         """Train the NER model"""
         logging.info(f"Starting NER training with {len(data)} examples")
@@ -204,7 +203,7 @@ class NERNameModel:
                 example = Example.from_dict(doc, annotations)
                 examples.append(example)
                 logging.info(
-                    f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}"
+                    f"Training example: {text[:30]} with entities {annotations.get('entities', [])}"
                 )

             # Train in batches
@@ -215,6 +214,7 @@ class NERNameModel:
                 )
                 logging.info(f"Training batch with {len(batch)} examples, current losses: {losses}")

+            del batches  # free memory
             epoch_loss = losses.get("ner", 0)
             losses_history.append(epoch_loss)
             logging.info(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}")
@@ -0,0 +1,273 @@
from typing import Union, Dict, Any, List
import ast
import json
import logging

import pandas as pd
from spacy.util import filter_spans


class NameTagger:
    def tag_name(
        self, name: str, probable_native: str, probable_surname: str
    ) -> Union[Dict[str, Any], None]:
        """Create a single NER training example using probable_native and probable_surname"""
        if not name or not probable_native or not probable_surname:
            return None

        name = name.strip()
        probable_native = probable_native.strip()
        probable_surname = probable_surname.strip()

        entities = []
        used_spans = []  # Track used character spans to prevent overlaps

        # Helper function to check if a span overlaps with any existing span
        def has_overlap(start, end):
            for used_start, used_end in used_spans:
                if not (end <= used_start or start >= used_end):
                    return True
            return False

        # Find positions of native names in the full name
        native_words = probable_native.split()
        name_lower = name.lower()  # Use lowercase for consistent searching
        processed_native_words = set()

        for native_word in native_words:
            native_word = native_word.strip()
            if len(native_word) < 2:  # Skip very short words
                continue

            native_word_lower = native_word.lower()

            # Skip if we've already processed this exact word
            if native_word_lower in processed_native_words:
                continue
            processed_native_words.add(native_word_lower)

            # Find the first occurrence of this native word that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(native_word_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position - make sure we only include the word itself
                end_pos = pos + len(native_word_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != native_word_lower:
                    start_pos = pos + 1
                    continue

                # Check if this is a word boundary match and doesn't overlap
                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "NATIVE"))
                    used_spans.append((pos, end_pos))
                    break  # Only take the first non-overlapping occurrence

                start_pos = pos + 1

        # Find position of surname in the full name
        if probable_surname and len(probable_surname.strip()) >= 2:
            surname_lower = probable_surname.lower()

            # Find the first occurrence that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(surname_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position correctly - exact match only
                end_pos = pos + len(surname_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != surname_lower:
                    start_pos = pos + 1
                    continue

                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "SURNAME"))
                    used_spans.append((pos, end_pos))
                    break

                start_pos = pos + 1

        if not entities:
            logging.warning(
                f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
            )
            return None

        # Sort entities by position and validate
        entities.sort(key=lambda x: x[0])

        # Final validation - ensure no overlaps and valid spans
        validated_entities = []
        for start, end, label in entities:
            # Check bounds
            if not (0 <= start < end <= len(name)):
                logging.warning(
                    f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
                )
                continue

            # Check for overlaps with already validated entities
            if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities):
                logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'")
                continue

            # CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
            span_text = name[start:end]
            if not span_text or span_text != span_text.strip() or " " in span_text:
                logging.warning(
                    f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
                )
                continue

            validated_entities.append((start, end, label))

        if not validated_entities:
            logging.warning(f"No valid entities after validation for: '{name}'")
            return None

        # Convert to string format that matches the dataset
        entities_str = str(validated_entities)

        return {
            "entities": entities_str,
            "spans": validated_entities,  # Keep the original tuples for internal use
        }

    @classmethod
    def _is_word_boundary_match(cls, text: str, start: int, end: int) -> bool:
        """Check if the match is at word boundaries"""
        # Check character before start position
        if start > 0:
            prev_char = text[start - 1]
            if prev_char.isalnum():
                return False

        # Check character after end position
        if end < len(text):
            next_char = text[end]
            if next_char.isalnum():
                return False

        return True

    @classmethod
    def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
        """Extract the actual text for each entity type"""
        result = {"NATIVE": [], "SURNAME": []}

        try:
            entities = ast.literal_eval(entities_str)

            for start, end, label in entities:
                if 0 <= start < end <= len(name):
                    span_text = name[start:end]
                    if label in result:
                        result[label].append(span_text)

        except (ValueError, SyntaxError, TypeError):
            pass

        return result

    @classmethod
    def parse(cls, entities_str: str) -> List[tuple]:
        """Parse entity strings from various formats.

        Supports formats:
        - [(start, end, label), ...]
        - [[start, end, label], ...]
        - [{"start": start, "end": end, "label": label}, ...]
        """
        if not entities_str or entities_str in ["[]", "", "nan"]:
            return []
        entities_str = str(entities_str).strip()
        try:
            if entities_str.startswith("[(") and entities_str.endswith(")]"):
                return ast.literal_eval(entities_str)
            elif entities_str.startswith("[[") and entities_str.endswith("]]"):
                return [tuple(e) for e in ast.literal_eval(entities_str)]
            elif entities_str.startswith("[{") and entities_str.endswith("}]"):
                return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
            else:
                parsed = ast.literal_eval(entities_str)
                return [tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3]
        except (ValueError, SyntaxError, json.JSONDecodeError):
            return []

    def parse_entities(self, series: pd.Series) -> pd.Series:
        """Vectorized parse of entity strings."""
        return series.map(self.parse)

    @classmethod
    def validate(cls, text: str, entities: List[tuple]) -> List[tuple]:
        """Advanced entity validation with overlap removal.

        This is more comprehensive than the basic validate_entities method.
        """
        if not entities or not text:
            return []
        text = str(text).strip()
        valid = []

        for ent in entities:
            if not isinstance(ent, (list, tuple)) or len(ent) != 3:
                continue
            start, end, label = ent
            try:
                start, end = int(start), int(end)
            except (ValueError, TypeError):
                continue
            if not isinstance(label, str):
                continue
            if not (0 <= start < end <= len(text)):
                continue
            if not text[start:end].strip():
                continue
            valid.append((start, end, label))

        if not valid:
            return []

        valid.sort(key=lambda x: (x[0], x[1]))

        # Remove overlaps
        filtered, last_end = [], -1
        for s, e, l in valid:
            if s >= last_end:
                filtered.append((s, e, l))
                last_end = e
        return filtered

    def validate_entities(self, texts: pd.Series, entities_series: pd.Series) -> pd.Series:
        """Vectorized entity validation."""
        return pd.Series(map(self.validate, texts, entities_series), index=texts.index)

    @classmethod
    def create_docs(cls, nlp, texts: List[str], entities: List[List[tuple]]) -> List:
        """Batch create spaCy Docs from texts and entities."""
        docs = []
        for text, ents in zip(texts, entities):
            doc = nlp(text)
            spans = []
            for start, end, label in ents:
                span = doc.char_span(
                    start, end, label=label, alignment_mode="contract"
                ) or doc.char_span(start, end, label=label, alignment_mode="strict")
                if span:
                    spans.append(span)
            doc.ents = filter_spans(spans)
            docs.append(doc)
        return docs
@@ -1,149 +0,0 @@
|
|||||||
import ast
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import spacy
|
|
||||||
from spacy.tokens import DocBin
|
|
||||||
from spacy.util import filter_spans
|
|
||||||
|
|
||||||
from core.config import PipelineConfig
|
|
||||||
from core.utils.data_loader import DataLoader
|
|
||||||
|
|
||||||
|
|
||||||
class NERDataBuilder:
|
|
||||||
def __init__(self, config: PipelineConfig):
|
|
||||||
self.config = config
|
|
||||||
self.data_loader = DataLoader(config)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _parse_entities(series: pd.Series) -> pd.Series:
|
|
||||||
"""Vectorized parse of entity strings."""
|
|
||||||
|
|
||||||
def _parse(entities_str):
|
|
||||||
if not entities_str or entities_str in ["[]", "", "nan"]:
|
|
||||||
return []
|
|
||||||
entities_str = str(entities_str).strip()
|
|
||||||
try:
|
|
||||||
if entities_str.startswith("[(") and entities_str.endswith(")]"):
|
|
||||||
return ast.literal_eval(entities_str)
|
|
||||||
elif entities_str.startswith("[[") and entities_str.endswith("]]"):
|
|
||||||
return [tuple(e) for e in ast.literal_eval(entities_str)]
|
|
||||||
elif entities_str.startswith("[{") and entities_str.endswith("}]"):
|
|
||||||
return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
|
|
||||||
else:
|
|
||||||
parsed = ast.literal_eval(entities_str)
|
|
||||||
return [
|
|
||||||
tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3
|
|
||||||
]
|
|
||||||
except (ValueError, SyntaxError, json.JSONDecodeError):
|
|
||||||
return []
|
|
||||||
|
|
||||||
return series.map(_parse)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _validate_entities(texts: pd.Series, entities_series: pd.Series) -> pd.Series:
|
|
||||||
"""Vectorized entity validation."""
|
|
||||||
|
|
||||||
def _validate(text, entities):
|
|
||||||
if not entities or not text:
|
|
||||||
return []
|
|
||||||
text = str(text).strip()
|
|
||||||
valid = []
|
|
||||||
for ent in entities:
|
|
||||||
if not isinstance(ent, (list, tuple)) or len(ent) != 3:
|
|
||||||
continue
|
|
||||||
start, end, label = ent
|
|
||||||
try:
|
|
||||||
start, end = int(start), int(end)
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
continue
|
|
||||||
if not isinstance(label, str):
|
|
||||||
continue
|
|
||||||
if not (0 <= start < end <= len(text)):
|
|
||||||
continue
|
|
||||||
if not text[start:end].strip():
|
|
||||||
continue
|
|
||||||
valid.append((start, end, label))
|
|
||||||
if not valid:
|
|
||||||
return []
|
|
||||||
valid.sort(key=lambda x: (x[0], x[1]))
|
|
||||||
# remove overlaps
|
|
||||||
filtered, last_end = [], -1
|
|
||||||
for s, e, l in valid:
|
|
||||||
if s >= last_end:
|
|
||||||
filtered.append((s, e, l))
|
|
||||||
last_end = e
|
|
||||||
return filtered
|
|
||||||
|
|
||||||
return pd.Series(map(_validate, texts, entities_series), index=texts.index)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _create_docs(nlp, texts, entities):
|
|
||||||
"""Batch create spaCy Docs."""
|
|
||||||
docs = []
|
|
||||||
for text, ents in zip(texts, entities):
|
|
||||||
doc = nlp(text)
|
|
||||||
spans = []
|
|
||||||
for start, end, label in ents:
|
|
||||||
span = doc.char_span(
|
|
||||||
start, end, label=label, alignment_mode="contract"
|
|
||||||
) or doc.char_span(start, end, label=label, alignment_mode="strict")
|
|
||||||
if span:
|
|
||||||
spans.append(span)
|
|
||||||
doc.ents = filter_spans(spans)
|
|
||||||
docs.append(doc)
|
|
||||||
return docs
|
|
||||||
|
|
||||||
def build(self) -> int:
|
|
||||||
filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
|
|
||||||
df = self.data_loader.load_csv_complete(filepath)
|
|
||||||
df = df[["name", "ner_tagged", "ner_entities"]]
|
|
||||||
|
|
||||||
# Filter early
|
|
||||||
ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
|
|
||||||
if ner_df.empty:
|
|
||||||
logging.error("No NER tagged data found")
|
|
||||||
return 1
|
|
||||||
|
|
||||||
total_rows = len(df)
|
|
||||||
del df # No need to keep in memory
|
|
||||||
|
|
||||||
logging.info(f"Found {len(ner_df)} NER tagged entries")
|
|
||||||
nlp = spacy.blank("fr")
|
|
||||||
|
|
||||||
# Vectorized parsing + validation
|
|
||||||
parsed_entities = self._parse_entities(ner_df["ner_entities"])
|
|
||||||
validated_entities = self._validate_entities(ner_df["name"], parsed_entities)
|
|
||||||
|
|
||||||
# Drop rows with no valid entities
|
|
||||||
mask = validated_entities.map(bool)
|
|
||||||
ner_df = ner_df.loc[mask]
|
|
||||||
validated_entities = validated_entities.loc[mask]
|
|
||||||
|
|
||||||
if ner_df.empty:
|
|
||||||
logging.error("No valid training examples after validation")
|
|
||||||
return 1
|
|
||||||
|
|
||||||
# Prepare training data
|
|
||||||
training_data = list(
|
|
||||||
zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create spaCy DocBin in batch
|
|
||||||
docs = self._create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
|
|
||||||
doc_bin = DocBin(docs=docs)
|
|
||||||
|
|
||||||
# Save
|
|
||||||
json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
|
|
||||||
spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
|
|
||||||
|
|
||||||
with open(json_path, "w", encoding="utf-8") as f:
|
|
||||||
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
|
|
||||||
doc_bin.to_disk(spacy_path)
|
|
||||||
|
|
||||||
logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
|
|
||||||
logging.info(f"Saved NER JSON to {json_path}")
|
|
||||||
logging.info(f"Saved NER spacy to {spacy_path}")
|
|
||||||
return 0
|
|
||||||
|
|||||||
@@ -1,212 +0,0 @@
from typing import Union, Dict, Any, List
import logging


class NERNameTagger:
    def tag_name(
        self, name: str, probable_native: str, probable_surname: str
    ) -> Union[Dict[str, Any], None]:
        """Create a single NER training example using probable_native and probable_surname"""
        if not name or not probable_native or not probable_surname:
            return None

        name = name.strip()
        probable_native = probable_native.strip()
        probable_surname = probable_surname.strip()

        entities = []
        used_spans = []  # Track used character spans to prevent overlaps

        # Helper function to check if a span overlaps with any existing span
        def has_overlap(start, end):
            for used_start, used_end in used_spans:
                if not (end <= used_start or start >= used_end):
                    return True
            return False

        # Find positions of native names in the full name
        native_words = probable_native.split()
        name_lower = name.lower()  # Use lowercase for consistent searching
        processed_native_words = set()

        for native_word in native_words:
            native_word = native_word.strip()
            if len(native_word) < 2:  # Skip very short words
                continue

            native_word_lower = native_word.lower()

            # Skip if we've already processed this exact word
            if native_word_lower in processed_native_words:
                continue
            processed_native_words.add(native_word_lower)

            # Find the first occurrence of this native word that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(native_word_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position - make sure we only include the word itself
                end_pos = pos + len(native_word_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != native_word_lower:
                    start_pos = pos + 1
                    continue

                # Check if this is a word boundary match and doesn't overlap
                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "NATIVE"))
                    used_spans.append((pos, end_pos))
                    break  # Only take the first non-overlapping occurrence

                start_pos = pos + 1

        # Find position of surname in the full name
        if probable_surname and len(probable_surname.strip()) >= 2:
            surname_lower = probable_surname.lower()

            # Find the first occurrence that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(surname_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position correctly - exact match only
                end_pos = pos + len(surname_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != surname_lower:
                    start_pos = pos + 1
                    continue

                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "SURNAME"))
                    used_spans.append((pos, end_pos))
                    break

                start_pos = pos + 1

        if not entities:
            logging.warning(
                f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
            )
            return None

        # Sort entities by position and validate
        entities.sort(key=lambda x: x[0])

        # Final validation - ensure no overlaps and valid spans
        validated_entities = []
        for start, end, label in entities:
            # Check bounds
            if not (0 <= start < end <= len(name)):
                logging.warning(
                    f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
                )
                continue

            # Check for overlaps with already validated entities
            if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities):
                logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'")
                continue

            # CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
            span_text = name[start:end]
            if not span_text or span_text != span_text.strip() or " " in span_text:
                logging.warning(
                    f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
                )
                continue

            validated_entities.append((start, end, label))

        if not validated_entities:
            logging.warning(f"No valid entities after validation for: '{name}'")
            return None

        # Convert to string format that matches the dataset
        entities_str = str(validated_entities)

        return {
            "entities": entities_str,
            "spans": validated_entities,  # Keep the original tuples for internal use
        }

    @classmethod
    def _is_word_boundary_match(cls, text: str, start: int, end: int) -> bool:
        """Check if the match is at word boundaries"""
        # Check character before start position
        if start > 0:
            prev_char = text[start - 1]
            if prev_char.isalnum():
                return False

        # Check character after end position
        if end < len(text):
            next_char = text[end]
            if next_char.isalnum():
                return False

        return True

    @classmethod
    def validate_entities(cls, name: str, entities_str: str) -> bool:
        """Validate that entity annotations are correct for a given name"""
        try:
            import ast

            entities = ast.literal_eval(entities_str)

            # Check for overlaps and valid bounds
            sorted_entities = sorted(entities, key=lambda x: x[0])

            for i, (start, end, label) in enumerate(sorted_entities):
                # Check bounds
                if not (0 <= start < end <= len(name)):
                    return False

                # Check for overlaps with next entity
                if i < len(sorted_entities) - 1:
                    next_start = sorted_entities[i + 1][0]
                    if end > next_start:
                        return False

                # Extract the text span and validate it's not empty
                span_text = name[start:end]
                if not span_text.strip():
                    return False

            return True
        except (ValueError, SyntaxError, TypeError):
            return False

    @classmethod
    def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
        """Extract the actual text for each entity type"""
        result = {"NATIVE": [], "SURNAME": []}

        try:
            import ast

            entities = ast.literal_eval(entities_str)

            for start, end, label in entities:
                if 0 <= start < end <= len(name):
                    span_text = name[start:end]
                    if label in result:
                        result[label].append(span_text)

        except (ValueError, SyntaxError, TypeError):
            pass

        return result
@@ -0,0 +1,89 @@
#!.venv/bin/python3
import argparse
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent
sys.path.insert(0, str(parent_dir))

from core.config import setup_config, PipelineConfig
from core.utils.data_loader import DataLoader
from processing.monitoring.pipeline_monitor import PipelineMonitor
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
from web.interfaces.configuration import Configuration
from web.interfaces.dashboard import Dashboard
from web.interfaces.data_overview import DataOverview
from web.interfaces.data_processing import DataProcessing
from web.interfaces.experiments import Experiments
from web.interfaces.predictions import Predictions
from web.interfaces.results_analysis import ResultsAnalysis

# Page configuration
st.set_page_config(
    page_title="DRC NERS Platform",
    page_icon="🇨🇩",
    layout="wide",
    initial_sidebar_state="expanded",
)


def initialize_session_state(config: PipelineConfig):
    """Initialize session state variables"""
    if "config" not in st.session_state:
        st.session_state.config = config
    if "data_loader" not in st.session_state:
        st.session_state.data_loader = DataLoader(config)
    if "experiment_tracker" not in st.session_state:
        st.session_state.experiment_tracker = ExperimentTracker(config)
    if "experiment_runner" not in st.session_state:
        st.session_state.experiment_runner = ExperimentRunner(config)
    if "pipeline_monitor" not in st.session_state:
        st.session_state.pipeline_monitor = PipelineMonitor()
    if "current_experiment" not in st.session_state:
        st.session_state.current_experiment = None
    if "experiment_results" not in st.session_state:
        st.session_state.experiment_results = {}


class StreamlitApp:
    def __init__(self, config: PipelineConfig):
        self.config = config
        initialize_session_state(config)

    def run(self):
        st.title("🇨🇩 DRC NERS Pipeline")
        st.markdown(
            "A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
        )

        st.markdown(
            """
            ## Overview
            Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often
            underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training
            data.
            This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5
            million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
            """
        )


def main():
    parser = argparse.ArgumentParser(
        description="DRC NERS Platform",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--config", type=str, help="Path to configuration file")
    parser.add_argument("--env", type=str, default="development", help="Environment name")
    args = parser.parse_args()

    config = setup_config(args.config, env=args.env)
    app = StreamlitApp(config)
    app.run()


if __name__ == "__main__":
    main()
@@ -2,11 +2,9 @@ import streamlit as st
|
|||||||
|
|
||||||
|
|
||||||
class Configuration:
|
class Configuration:
|
||||||
"""Handles configuration display and management"""
|
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
def index(self):
|
def index(self):
|
||||||
st.header("Current Configuration")
|
st.title("Configuration")
|
||||||
st.json(self.config.model_dump())
|
st.json(self.config.model_dump())
|
||||||
@@ -20,7 +20,7 @@ class Dashboard:
|
|||||||
self.experiment_runner = experiment_runner
|
self.experiment_runner = experiment_runner
|
||||||
|
|
||||||
def index(self):
|
def index(self):
|
||||||
st.header("Dashboard")
|
st.title("Dashboard")
|
||||||
col1, col2, col3, col4 = st.columns(4)
|
col1, col2, col3, col4 = st.columns(4)
|
||||||
|
|
||||||
# Load basic statistics
|
# Load basic statistics
|
||||||
@@ -21,7 +21,7 @@ class DataOverview:
|
|||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
def index(self):
|
def index(self):
|
||||||
st.header("Data Overview")
|
st.title("Data Overview")
|
||||||
data_files = {
|
data_files = {
|
||||||
"Names": self.config.data.input_file,
|
"Names": self.config.data.input_file,
|
||||||
"Featured Dataset": self.config.data.output_files["featured"],
|
"Featured Dataset": self.config.data.output_files["featured"],
|
||||||
@@ -3,7 +3,7 @@ import plotly.express as px
|
|||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
from core.utils.data_loader import OPTIMIZED_DTYPES
|
from core.utils.data_loader import OPTIMIZED_DTYPES
|
||||||
from interface.log_reader import LogReader
|
from web.interfaces.log_reader import LogReader
|
||||||
|
|
||||||
|
|
||||||
@st.cache_data
|
@st.cache_data
|
||||||
@@ -21,7 +21,7 @@ class DataProcessing:
|
|||||||
self.pipeline_monitor = pipeline_monitor
|
self.pipeline_monitor = pipeline_monitor
|
||||||
|
|
||||||
def index(self):
|
def index(self):
|
||||||
st.header("Data Processing Pipeline")
|
st.title("Data Processing")
|
||||||
status = self.pipeline_monitor.get_pipeline_status()
|
status = self.pipeline_monitor.get_pipeline_status()
|
||||||
|
|
||||||
# Overall progress
|
# Overall progress
|
||||||
@@ -12,8 +12,6 @@ from research.model_registry import list_available_models
|
|||||||
|
|
||||||
|
|
||||||
class Experiments:
|
class Experiments:
|
||||||
"""Handles experiment management interface"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
||||||
):
|
):
|
||||||
@@ -22,8 +20,7 @@ class Experiments:
        self.experiment_runner = experiment_runner

    def index(self):
-        """Main experiments page"""
-        st.header("Experiment Management")
+        st.title("Experiments")
        tab1, tab2, tab3 = st.tabs(["New Experiment", "Experiment List", "Batch Experiments"])

        with tab1:
@@ -12,8 +12,6 @@ from research.experiment.experiment_tracker import ExperimentTracker


class Predictions:
-    """Handles prediction interface"""
-
    def __init__(
        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
@@ -22,8 +20,7 @@ class Predictions:
        self.experiment_runner = experiment_runner

    def index(self):
-        """Main predictions page"""
-        st.header("Make Predictions")
+        st.title("Predictions")

        # Load available models
        experiments = self.experiment_tracker.list_experiments()
@@ -11,8 +11,6 @@ from research.experiment.experiment_tracker import ExperimentTracker


class ResultsAnalysis:
-    """Handles experiment results and analysis interface"""
-
    def __init__(
        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
@@ -21,8 +19,7 @@ class ResultsAnalysis:
        self.experiment_runner = experiment_runner

    def index(self):
-        """Main results analysis page"""
-        st.header("Results & Analysis")
+        st.title("Results & Analysis")
        tab1, tab2, tab3 = st.tabs(
            ["Experiment Comparison", "Performance Analysis", "Model Analysis"]
        )
@@ -0,0 +1,22 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.dashboard import Dashboard

st.set_page_config(page_title="Dashboard", page_icon="📊", layout="wide")

if "config" in st.session_state:
    dashboard = Dashboard(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    dashboard.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,18 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.data_overview import DataOverview

st.set_page_config(page_title="Data Overview", page_icon="📋", layout="wide")

if "config" in st.session_state:
    data_overview = DataOverview(st.session_state.config)
    data_overview.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,18 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.data_processing import DataProcessing

st.set_page_config(page_title="Data Processing", page_icon="⚙️", layout="wide")

if "config" in st.session_state:
    data_processing = DataProcessing(st.session_state.config, st.session_state.pipeline_monitor)
    data_processing.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,22 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.experiments import Experiments

st.set_page_config(page_title="Experiments", page_icon="🧪", layout="wide")

if "config" in st.session_state:
    experiments = Experiments(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    experiments.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,22 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.results_analysis import ResultsAnalysis

st.set_page_config(page_title="Results & Analysis", page_icon="📈", layout="wide")

if "config" in st.session_state:
    results_analysis = ResultsAnalysis(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    results_analysis.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,22 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.predictions import Predictions

st.set_page_config(page_title="Predictions", page_icon="🔮", layout="wide")

if "config" in st.session_state:
    predictions = Predictions(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    predictions.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,18 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.configuration import Configuration

st.set_page_config(page_title="Configuration", page_icon="⚙️", layout="wide")

if "config" in st.session_state:
    configuration = Configuration(st.session_state.config)
    configuration.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
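Note: every page script above reads its dependencies from st.session_state, so the main entry script must seed that state before any page is opened. The diff does not show that entry script; the following is only a minimal sketch of how it could look, with the file name, config path, and constructor wiring assumed rather than taken from this commit.

# Hypothetical entry-point sketch (not part of this diff): seed st.session_state
# so the pages above find their shared objects. Import paths and constructor
# arguments are assumptions based on the names visible in the hunks.
import streamlit as st

from core.config import setup_config  # import path assumed
from research.experiment.experiment_runner import ExperimentRunner  # path assumed
from research.experiment.experiment_tracker import ExperimentTracker  # path assumed

st.set_page_config(page_title="Home", layout="wide")

if "config" not in st.session_state:
    # setup_config(path, env=...) matches the call in the removed main() above;
    # the concrete path and environment here are placeholders.
    config = setup_config("config/pipeline.yaml", env="development")
    st.session_state.config = config
    st.session_state.experiment_tracker = ExperimentTracker(config)
    st.session_state.experiment_runner = ExperimentRunner(config)
    # st.session_state.pipeline_monitor would be seeded the same way for the
    # Data Processing page.

st.title("DRC NERS Pipeline")
st.markdown("Use the sidebar to open a page.")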