feat: web application multipage support
@@ -1,101 +0,0 @@
#!.venv/bin/python3
import argparse

import streamlit as st

from core.config import setup_config, PipelineConfig
from core.utils.data_loader import DataLoader
from interface.configuration import Configuration
from interface.dashboard import Dashboard
from interface.data_overview import DataOverview
from interface.data_processing import DataProcessing
from interface.experiments import Experiments
from interface.predictions import Predictions
from interface.results_analysis import ResultsAnalysis
from processing.monitoring.pipeline_monitor import PipelineMonitor
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker

# Page configuration
st.set_page_config(
    page_title="DRC Names NLP Pipeline",
    page_icon="🇨🇩",
    layout="wide",
    initial_sidebar_state="expanded",
)


class StreamlitApp:
    """Main Streamlit application class"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.data_loader = DataLoader(self.config)
        self.experiment_tracker = ExperimentTracker(self.config)
        self.experiment_runner = ExperimentRunner(self.config)
        self.pipeline_monitor = PipelineMonitor()

        # Initialize interface components
        self.dashboard = Dashboard(self.config, self.experiment_tracker, self.experiment_runner)
        self.data_overview = DataOverview(self.config)
        self.data_processing = DataProcessing(self.config, self.pipeline_monitor)
        self.experiments = Experiments(self.config, self.experiment_tracker, self.experiment_runner)
        self.results_analysis = ResultsAnalysis(
            self.config, self.experiment_tracker, self.experiment_runner
        )
        self.predictions = Predictions(self.config, self.experiment_tracker, self.experiment_runner)
        self.configuration = Configuration(self.config)

        # Initialize session state
        if "current_experiment" not in st.session_state:
            st.session_state.current_experiment = None
        if "experiment_results" not in st.session_state:
            st.session_state.experiment_results = {}

    def run(self):
        st.title("🇨🇩 DRC NERS Pipeline")
        st.markdown("A comprehensive tool for Congolese name analysis and gender prediction")

        # Sidebar navigation
        page = st.sidebar.selectbox(
            "Navigation",
            [
                "Dashboard",
                "Dataset Overview",
                "Data Processing",
                "Experiments",
                "Results & Analysis",
                "Predictions",
                "Configuration",
            ],
        )

        # Route to appropriate page
        page_map = {
            "Dashboard": self.dashboard.index,
            "Dataset Overview": self.data_overview.index,
            "Data Processing": self.data_processing.index,
            "Experiments": self.experiments.index,
            "Results & Analysis": self.results_analysis.index,
            "Predictions": self.predictions.index,
            "Configuration": self.configuration.index,
        }
        page_map.get(page, lambda: None)()


def main():
    parser = argparse.ArgumentParser(
        description="DRC NERS Platform",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--config", type=str, help="Path to configuration file")
    parser.add_argument("--env", type=str, default="development", help="Environment name")
    args = parser.parse_args()

    config = setup_config(args.config, env=args.env)
    app = StreamlitApp(config)
    app.run()


if __name__ == "__main__":
    main()
@@ -18,7 +18,8 @@ paths:
checkpoints_dir: "./data/checkpoints" # Directory for model checkpoints

# Pipeline stages
stages: # List of stages in the processing pipeline
# List of stages in the processing pipeline
stages:
- "data_cleaning" # Data cleaning stage
- "feature_extraction" # Feature extraction stage
- "ner_annotation" # NER-based annotation stage
@@ -36,6 +37,7 @@ processing:
- "utf-16"
- "latin1"
chunk_size: 100_000 # Size of data chunks to process in parallel
epochs: 2 # Number of Epochs for training

# Annotation settings
annotation:
@@ -72,8 +74,9 @@ data:
balance_by_sex: false # Should the dataset be balanced by sex when limiting the dataset size?

# Logging configuration
# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
logging:
level: "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
level: "INFO"
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
file_logging: true # Enable logging to file
console_logging: true # Enable logging to console

@@ -7,7 +7,7 @@ baseline_experiments:
max_len: 20
embedding_dim: 64
gru_units: 32
epochs: 10
epochs: 2
batch_size: 32
tags: [ "baseline", "neural", "bigru" ]

@@ -21,7 +21,7 @@ baseline_experiments:
filters: 64
kernel_size: 3
dropout: 0.5
epochs: 10
epochs: 2
batch_size: 32
tags: [ "baseline", "neural", "cnn" ]

@@ -79,7 +79,7 @@ baseline_experiments:
model_params:
embedding_dim: 128
lstm_units: 64
epochs: 10
epochs: 2
batch_size: 64
tags: [ "baseline", "neural", "lstm" ]

@@ -121,7 +121,7 @@ baseline_experiments:
embedding_dim: 128
num_heads: 4
num_layers: 2
epochs: 10
epochs: 2
batch_size: 64
tags: [ "baseline", "neural", "transformer" ]

@@ -0,0 +1,145 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 42

[nlp]
lang = "fr"
pipeline = ["tok2vec","ner"]
batch_size = 100000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,1000,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null
before_update = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
@@ -12,3 +12,4 @@ class ProcessingConfig(BaseModel):
    use_multiprocessing: bool = False
    encoding_options: list = field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
    chunk_size: int = 100_000
    epochs: int = 2

@@ -7,24 +7,24 @@ import traceback
from pathlib import Path

from core.config import setup_config, PipelineConfig
from processing.ner.ner_data_builder import NERDataBuilder
from processing.ner.ner_engineering import NEREngineering
from processing.ner.ner_name_model import NERNameModel
from processing.ner.name_builder import NameBuilder
from processing.ner.name_engineering import NameEngineering
from processing.ner.name_model import NameModel


def feature(config: PipelineConfig):
    """Apply feature engineering to create position-independent NER dataset."""
    NEREngineering(config).compute()
    NameEngineering(config).compute()


def build(config: PipelineConfig):
    """Build NER dataset using NERDataBuilder."""
    NERDataBuilder(config).build()
    NameBuilder(config).build()


def train(config: PipelineConfig):
    """Train the NER model."""
    trainer = NERNameModel(config)
    trainer = NameModel(config)

    data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
    if not data_path.exists():
@@ -39,7 +39,10 @@ def train(config: PipelineConfig):

    logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
    trainer.train(
        data=train_data, epochs=1, batch_size=config.processing.batch_size, dropout_rate=0.3
        data=train_data,
        epochs=config.processing.epochs,
        batch_size=config.processing.batch_size,
        dropout_rate=0.3,
    )
    trainer.evaluate(eval_data)

@@ -48,13 +51,17 @@ def train(config: PipelineConfig):


def run_pipeline(config: PipelineConfig, reset: bool = False):
    if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["engineered"])):
    if not reset and os.path.exists(
        config.paths.get_data_path(config.data.output_files["engineered"])
    ):
        logging.info("Step 1: Feature engineering already done.")
    else:
        logging.info("Step 1: Running feature engineering")
        feature(config)

    if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["ner_data"])):
    if not reset and os.path.exists(
        config.paths.get_data_path(config.data.output_files["ner_data"])
    ):
        logging.info("Step 2: NER dataset already built.")
    else:
        logging.info("Step 2: Building NER dataset")
@@ -0,0 +1,68 @@
import json
import logging

import spacy
from spacy.tokens import DocBin

from core.config import PipelineConfig
from core.utils.data_loader import DataLoader
from .name_tagger import NameTagger


class NameBuilder:
    def __init__(self, config: PipelineConfig):
        self.config = config
        self.data_loader = DataLoader(config)
        self.tagger = NameTagger()

    def build(self) -> int:
        filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
        df = self.data_loader.load_csv_complete(filepath)
        df = df[["name", "ner_tagged", "ner_entities"]]

        # Filter early
        ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
        if ner_df.empty:
            logging.error("No NER tagged data found")
            return 1

        total_rows = len(df)
        del df  # No need to keep in memory

        logging.info(f"Found {len(ner_df)} NER tagged entries")
        nlp = spacy.blank("fr")

        # Use NERNameTagger for parsing and validation
        parsed_entities = NameTagger.parse_entities(ner_df["ner_entities"])
        validated_entities = NameTagger.validate_entities(ner_df["name"], parsed_entities)

        # Drop rows with no valid entities
        mask = validated_entities.map(bool)
        ner_df = ner_df.loc[mask]
        validated_entities = validated_entities.loc[mask]

        if ner_df.empty:
            logging.error("No valid training examples after validation")
            return 1

        # Prepare training data
        training_data = list(
            zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
        )

        # Use NERNameTagger to create spaCy DocBin
        docs = NameTagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
        doc_bin = DocBin(docs=docs)

        # Save
        json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
        spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
        doc_bin.to_disk(spacy_path)

        logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
        logging.info(f"Saved NER JSON to {json_path}")
        logging.info(f"Saved NER spacy to {spacy_path}")
        return 0
@@ -1,5 +1,5 @@
import gc
import random
from typing import List
import logging

import numpy as np
@@ -7,7 +7,7 @@ import pandas as pd
from tqdm import tqdm

from core.config import PipelineConfig
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
from core.utils.data_loader import DataLoader
from processing.ner.formats.connectors_format import ConnectorFormatter
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
from processing.ner.formats.native_only_format import NativeOnlyFormatter
@@ -16,7 +16,7 @@ from processing.ner.formats.position_flipped_format import PositionFlippedFormat
from processing.ner.formats.reduced_native_format import ReducedNativeFormatter


class NEREngineering:
class NameEngineering:
    """
    Feature engineering for NER dataset to prevent position-based learning
    and encourage sequence characteristic learning.
@@ -66,13 +66,16 @@ class NEREngineering:
    def compute(self) -> None:
        logging.info("Applying feature engineering transformations...")
        input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
        output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
        output_filepath = self.config.paths.get_data_path(
            self.config.data.output_files["engineered"]
        )

        df = self.data_loader.load_csv_complete(input_filepath)
        ner_df = df[df["ner_tagged"] == 1].copy()
        logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records")

        del df  # No need to keep in memory
        gc.collect()

        ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
            drop=True
@@ -1,3 +1,4 @@
import ast
import json
import logging
import os
@@ -11,7 +12,7 @@ from spacy.util import minibatch
from core.config.pipeline_config import PipelineConfig


class NERNameModel:
class NameModel:
    """NER model trainer using spaCy for DRC names entity recognition"""

    def __init__(self, config: PipelineConfig):
@@ -84,8 +85,6 @@ class NERNameModel:
        if isinstance(entities_raw, str):
            # String format from tagger: "[(0, 6, 'NATIVE'), ...]"
            try:
                import ast

                entities = ast.literal_eval(entities_raw)
                if not isinstance(entities, list):
                    logging.warning(
@@ -175,9 +174,9 @@ class NERNameModel:
    def train(
        self,
        data: List[Tuple[str, Dict]],
        epochs: int = 5,
        batch_size: int = 16,
        dropout_rate: float = 0.2,
        epochs: int = 1,
        batch_size: int = 10_000,
        dropout_rate: float = 0.3,
    ) -> None:
        """Train the NER model"""
        logging.info(f"Starting NER training with {len(data)} examples")
@@ -204,7 +203,7 @@ class NERNameModel:
                example = Example.from_dict(doc, annotations)
                examples.append(example)
                logging.info(
                    f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}"
                    f"Training example: {text[:30]} with entities {annotations.get('entities', [])}"
                )

        # Train in batches
@@ -215,6 +214,7 @@ class NERNameModel:
            )
            logging.info(f"Training batch with {len(batch)} examples, current losses: {losses}")

        del batches  # free memory
        epoch_loss = losses.get("ner", 0)
        losses_history.append(epoch_loss)
        logging.info(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}")
@@ -0,0 +1,273 @@
from typing import Union, Dict, Any, List
import ast
import json
import logging
import pandas as pd
from spacy.util import filter_spans


class NameTagger:
    def tag_name(
        self, name: str, probable_native: str, probable_surname: str
    ) -> Union[Dict[str, Any], None]:
        """Create a single NER training example using probable_native and probable_surname"""
        if not name or not probable_native or not probable_surname:
            return None

        name = name.strip()
        probable_native = probable_native.strip()
        probable_surname = probable_surname.strip()

        entities = []
        used_spans = []  # Track used character spans to prevent overlaps

        # Helper function to check if a span overlaps with any existing span
        def has_overlap(start, end):
            for used_start, used_end in used_spans:
                if not (end <= used_start or start >= used_end):
                    return True
            return False

        # Find positions of native names in the full name
        native_words = probable_native.split()
        name_lower = name.lower()  # Use lowercase for consistent searching
        processed_native_words = set()

        for native_word in native_words:
            native_word = native_word.strip()
            if len(native_word) < 2:  # Skip very short words
                continue

            native_word_lower = native_word.lower()

            # Skip if we've already processed this exact word
            if native_word_lower in processed_native_words:
                continue
            processed_native_words.add(native_word_lower)

            # Find the first occurrence of this native word that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(native_word_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position - make sure we only include the word itself
                end_pos = pos + len(native_word_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != native_word_lower:
                    start_pos = pos + 1
                    continue

                # Check if this is a word boundary match and doesn't overlap
                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "NATIVE"))
                    used_spans.append((pos, end_pos))
                    break  # Only take the first non-overlapping occurrence

                start_pos = pos + 1

        # Find position of surname in the full name
        if probable_surname and len(probable_surname.strip()) >= 2:
            surname_lower = probable_surname.lower()

            # Find the first occurrence that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(surname_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position correctly - exact match only
                end_pos = pos + len(surname_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != surname_lower:
                    start_pos = pos + 1
                    continue

                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "SURNAME"))
                    used_spans.append((pos, end_pos))
                    break

                start_pos = pos + 1

        if not entities:
            logging.warning(
                f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
            )
            return None

        # Sort entities by position and validate
        entities.sort(key=lambda x: x[0])

        # Final validation - ensure no overlaps and valid spans
        validated_entities = []
        for start, end, label in entities:
            # Check bounds
            if not (0 <= start < end <= len(name)):
                logging.warning(
                    f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
                )
                continue

            # Check for overlaps with already validated entities
            if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities):
                logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'")
                continue

            # CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
            span_text = name[start:end]
            if not span_text or span_text != span_text.strip() or " " in span_text:
                logging.warning(
                    f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
                )
                continue

            validated_entities.append((start, end, label))

        if not validated_entities:
            logging.warning(f"No valid entities after validation for: '{name}'")
            return None

        # Convert to string format that matches the dataset
        entities_str = str(validated_entities)

        return {
            "entities": entities_str,
            "spans": validated_entities,  # Keep the original tuples for internal use
        }

    @classmethod
    def _is_word_boundary_match(cls, text: str, start: int, end: int) -> bool:
        """Check if the match is at word boundaries"""
        # Check character before start position
        if start > 0:
            prev_char = text[start - 1]
            if prev_char.isalnum():
                return False

        # Check character after end position
        if end < len(text):
            next_char = text[end]
            if next_char.isalnum():
                return False

        return True

    @classmethod
    def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
        """Extract the actual text for each entity type"""
        result = {"NATIVE": [], "SURNAME": []}

        try:
            entities = ast.literal_eval(entities_str)

            for start, end, label in entities:
                if 0 <= start < end <= len(name):
                    span_text = name[start:end]
                    if label in result:
                        result[label].append(span_text)

        except (ValueError, SyntaxError, TypeError):
            pass

        return result

    @classmethod
    def parse(cls, entities_str: str) -> List[tuple]:
        """Parse entity strings from various formats.

        Supports formats:
        - [(start, end, label), ...]
        - [[start, end, label], ...]
        - [{"start": start, "end": end, "label": label}, ...]
        """
        if not entities_str or entities_str in ["[]", "", "nan"]:
            return []
        entities_str = str(entities_str).strip()
        try:
            if entities_str.startswith("[(") and entities_str.endswith(")]"):
                return ast.literal_eval(entities_str)
            elif entities_str.startswith("[[") and entities_str.endswith("]]"):
                return [tuple(e) for e in ast.literal_eval(entities_str)]
            elif entities_str.startswith("[{") and entities_str.endswith("}]"):
                return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
            else:
                parsed = ast.literal_eval(entities_str)
                return [tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3]
        except (ValueError, SyntaxError, json.JSONDecodeError):
            return []

    def parse_entities(self, series: pd.Series) -> pd.Series:
        """Vectorized parse of entity strings."""
        return series.map(self.parse)

    @classmethod
    def validate(cls, text: str, entities: List[tuple]) -> List[tuple]:
        """Advanced entity validation with overlap removal.

        This is more comprehensive than the basic validate_entities method.
        """
        if not entities or not text:
            return []
        text = str(text).strip()
        valid = []

        for ent in entities:
            if not isinstance(ent, (list, tuple)) or len(ent) != 3:
                continue
            start, end, label = ent
            try:
                start, end = int(start), int(end)
            except (ValueError, TypeError):
                continue
            if not isinstance(label, str):
                continue
            if not (0 <= start < end <= len(text)):
                continue
            if not text[start:end].strip():
                continue
            valid.append((start, end, label))

        if not valid:
            return []

        valid.sort(key=lambda x: (x[0], x[1]))

        # Remove overlaps
        filtered, last_end = [], -1
        for s, e, l in valid:
            if s >= last_end:
                filtered.append((s, e, l))
                last_end = e
        return filtered

    def validate_entities(self, texts: pd.Series, entities_series: pd.Series) -> pd.Series:
        """Vectorized entity validation."""
        return pd.Series(map(self.validate, texts, entities_series), index=texts.index)

    @classmethod
    def create_docs(cls, nlp, texts: List[str], entities: List[List[tuple]]) -> List:
        """Batch create spaCy Docs from texts and entities."""
        docs = []
        for text, ents in zip(texts, entities):
            doc = nlp(text)
            spans = []
            for start, end, label in ents:
                span = doc.char_span(
                    start, end, label=label, alignment_mode="contract"
                ) or doc.char_span(start, end, label=label, alignment_mode="strict")
                if span:
                    spans.append(span)
            doc.ents = filter_spans(spans)
            docs.append(doc)
        return docs
@@ -1,149 +0,0 @@
import ast
import json
import logging
from pathlib import Path

import pandas as pd
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans

from core.config import PipelineConfig
from core.utils.data_loader import DataLoader


class NERDataBuilder:
    def __init__(self, config: PipelineConfig):
        self.config = config
        self.data_loader = DataLoader(config)

    @staticmethod
    def _parse_entities(series: pd.Series) -> pd.Series:
        """Vectorized parse of entity strings."""

        def _parse(entities_str):
            if not entities_str or entities_str in ["[]", "", "nan"]:
                return []
            entities_str = str(entities_str).strip()
            try:
                if entities_str.startswith("[(") and entities_str.endswith(")]"):
                    return ast.literal_eval(entities_str)
                elif entities_str.startswith("[[") and entities_str.endswith("]]"):
                    return [tuple(e) for e in ast.literal_eval(entities_str)]
                elif entities_str.startswith("[{") and entities_str.endswith("}]"):
                    return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
                else:
                    parsed = ast.literal_eval(entities_str)
                    return [
                        tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3
                    ]
            except (ValueError, SyntaxError, json.JSONDecodeError):
                return []

        return series.map(_parse)

    @staticmethod
    def _validate_entities(texts: pd.Series, entities_series: pd.Series) -> pd.Series:
        """Vectorized entity validation."""

        def _validate(text, entities):
            if not entities or not text:
                return []
            text = str(text).strip()
            valid = []
            for ent in entities:
                if not isinstance(ent, (list, tuple)) or len(ent) != 3:
                    continue
                start, end, label = ent
                try:
                    start, end = int(start), int(end)
                except (ValueError, TypeError):
                    continue
                if not isinstance(label, str):
                    continue
                if not (0 <= start < end <= len(text)):
                    continue
                if not text[start:end].strip():
                    continue
                valid.append((start, end, label))
            if not valid:
                return []
            valid.sort(key=lambda x: (x[0], x[1]))
            # remove overlaps
            filtered, last_end = [], -1
            for s, e, l in valid:
                if s >= last_end:
                    filtered.append((s, e, l))
                    last_end = e
            return filtered

        return pd.Series(map(_validate, texts, entities_series), index=texts.index)

    @staticmethod
    def _create_docs(nlp, texts, entities):
        """Batch create spaCy Docs."""
        docs = []
        for text, ents in zip(texts, entities):
            doc = nlp(text)
            spans = []
            for start, end, label in ents:
                span = doc.char_span(
                    start, end, label=label, alignment_mode="contract"
                ) or doc.char_span(start, end, label=label, alignment_mode="strict")
                if span:
                    spans.append(span)
            doc.ents = filter_spans(spans)
            docs.append(doc)
        return docs

    def build(self) -> int:
        filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
        df = self.data_loader.load_csv_complete(filepath)
        df = df[["name", "ner_tagged", "ner_entities"]]

        # Filter early
        ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
        if ner_df.empty:
            logging.error("No NER tagged data found")
            return 1

        total_rows = len(df)
        del df  # No need to keep in memory

        logging.info(f"Found {len(ner_df)} NER tagged entries")
        nlp = spacy.blank("fr")

        # Vectorized parsing + validation
        parsed_entities = self._parse_entities(ner_df["ner_entities"])
        validated_entities = self._validate_entities(ner_df["name"], parsed_entities)

        # Drop rows with no valid entities
        mask = validated_entities.map(bool)
        ner_df = ner_df.loc[mask]
        validated_entities = validated_entities.loc[mask]

        if ner_df.empty:
            logging.error("No valid training examples after validation")
            return 1

        # Prepare training data
        training_data = list(
            zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
        )

        # Create spaCy DocBin in batch
        docs = self._create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
        doc_bin = DocBin(docs=docs)

        # Save
        json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
        spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
        doc_bin.to_disk(spacy_path)

        logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
        logging.info(f"Saved NER JSON to {json_path}")
        logging.info(f"Saved NER spacy to {spacy_path}")
        return 0

@@ -1,212 +0,0 @@
from typing import Union, Dict, Any, List
import logging


class NERNameTagger:
    def tag_name(
        self, name: str, probable_native: str, probable_surname: str
    ) -> Union[Dict[str, Any], None]:
        """Create a single NER training example using probable_native and probable_surname"""
        if not name or not probable_native or not probable_surname:
            return None

        name = name.strip()
        probable_native = probable_native.strip()
        probable_surname = probable_surname.strip()

        entities = []
        used_spans = []  # Track used character spans to prevent overlaps

        # Helper function to check if a span overlaps with any existing span
        def has_overlap(start, end):
            for used_start, used_end in used_spans:
                if not (end <= used_start or start >= used_end):
                    return True
            return False

        # Find positions of native names in the full name
        native_words = probable_native.split()
        name_lower = name.lower()  # Use lowercase for consistent searching
        processed_native_words = set()

        for native_word in native_words:
            native_word = native_word.strip()
            if len(native_word) < 2:  # Skip very short words
                continue

            native_word_lower = native_word.lower()

            # Skip if we've already processed this exact word
            if native_word_lower in processed_native_words:
                continue
            processed_native_words.add(native_word_lower)

            # Find the first occurrence of this native word that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(native_word_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position - make sure we only include the word itself
                end_pos = pos + len(native_word_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != native_word_lower:
                    start_pos = pos + 1
                    continue

                # Check if this is a word boundary match and doesn't overlap
                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "NATIVE"))
                    used_spans.append((pos, end_pos))
                    break  # Only take the first non-overlapping occurrence

                start_pos = pos + 1

        # Find position of surname in the full name
        if probable_surname and len(probable_surname.strip()) >= 2:
            surname_lower = probable_surname.lower()

            # Find the first occurrence that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(surname_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position correctly - exact match only
                end_pos = pos + len(surname_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != surname_lower:
                    start_pos = pos + 1
                    continue

                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "SURNAME"))
                    used_spans.append((pos, end_pos))
                    break

                start_pos = pos + 1

        if not entities:
            logging.warning(
                f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
            )
            return None

        # Sort entities by position and validate
        entities.sort(key=lambda x: x[0])

        # Final validation - ensure no overlaps and valid spans
        validated_entities = []
        for start, end, label in entities:
            # Check bounds
            if not (0 <= start < end <= len(name)):
                logging.warning(
                    f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
                )
                continue

            # Check for overlaps with already validated entities
            if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities):
                logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'")
                continue

            # CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
            span_text = name[start:end]
            if not span_text or span_text != span_text.strip() or " " in span_text:
                logging.warning(
                    f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
                )
                continue

            validated_entities.append((start, end, label))

        if not validated_entities:
            logging.warning(f"No valid entities after validation for: '{name}'")
            return None

        # Convert to string format that matches the dataset
        entities_str = str(validated_entities)

        return {
            "entities": entities_str,
            "spans": validated_entities,  # Keep the original tuples for internal use
        }

    @classmethod
    def _is_word_boundary_match(cls, text: str, start: int, end: int) -> bool:
        """Check if the match is at word boundaries"""
        # Check character before start position
        if start > 0:
            prev_char = text[start - 1]
            if prev_char.isalnum():
                return False

        # Check character after end position
        if end < len(text):
            next_char = text[end]
            if next_char.isalnum():
                return False

        return True

    @classmethod
    def validate_entities(cls, name: str, entities_str: str) -> bool:
        """Validate that entity annotations are correct for a given name"""
        try:
            import ast

            entities = ast.literal_eval(entities_str)

            # Check for overlaps and valid bounds
            sorted_entities = sorted(entities, key=lambda x: x[0])

            for i, (start, end, label) in enumerate(sorted_entities):
                # Check bounds
                if not (0 <= start < end <= len(name)):
                    return False

                # Check for overlaps with next entity
                if i < len(sorted_entities) - 1:
                    next_start = sorted_entities[i + 1][0]
                    if end > next_start:
                        return False

                # Extract the text span and validate it's not empty
                span_text = name[start:end]
                if not span_text.strip():
                    return False

            return True
        except (ValueError, SyntaxError, TypeError):
            return False

    @classmethod
    def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
        """Extract the actual text for each entity type"""
        result = {"NATIVE": [], "SURNAME": []}

        try:
            import ast

            entities = ast.literal_eval(entities_str)

            for start, end, label in entities:
                if 0 <= start < end <= len(name):
                    span_text = name[start:end]
                    if label in result:
                        result[label].append(span_text)

        except (ValueError, SyntaxError, TypeError):
            pass

        return result

@@ -0,0 +1,89 @@
#!.venv/bin/python3
import argparse
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent
sys.path.insert(0, str(parent_dir))

from core.config import setup_config, PipelineConfig
from core.utils.data_loader import DataLoader
from processing.monitoring.pipeline_monitor import PipelineMonitor
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
from web.interfaces.configuration import Configuration
from web.interfaces.dashboard import Dashboard
from web.interfaces.data_overview import DataOverview
from web.interfaces.data_processing import DataProcessing
from web.interfaces.experiments import Experiments
from web.interfaces.predictions import Predictions
from web.interfaces.results_analysis import ResultsAnalysis

# Page configuration
st.set_page_config(
    page_title="DRC NERS Platform",
    page_icon="🇨🇩",
    layout="wide",
    initial_sidebar_state="expanded",
)


def initialize_session_state(config: PipelineConfig):
    """Initialize session state variables"""
    if "config" not in st.session_state:
        st.session_state.config = config
    if "data_loader" not in st.session_state:
        st.session_state.data_loader = DataLoader(config)
    if "experiment_tracker" not in st.session_state:
        st.session_state.experiment_tracker = ExperimentTracker(config)
    if "experiment_runner" not in st.session_state:
        st.session_state.experiment_runner = ExperimentRunner(config)
    if "pipeline_monitor" not in st.session_state:
        st.session_state.pipeline_monitor = PipelineMonitor()
    if "current_experiment" not in st.session_state:
        st.session_state.current_experiment = None
    if "experiment_results" not in st.session_state:
        st.session_state.experiment_results = {}


class StreamlitApp:
    def __init__(self, config: PipelineConfig):
        self.config = config
        initialize_session_state(config)

    def run(self):
        st.title("🇨🇩 DRC NERS Pipeline")
        st.markdown(
            "A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
        )

        st.markdown(
            """
            ## Overview
            Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often
            underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training
            data.
            This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5
            million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
            """
        )


def main():
    parser = argparse.ArgumentParser(
        description="DRC NERS Platform",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--config", type=str, help="Path to configuration file")
    parser.add_argument("--env", type=str, default="development", help="Environment name")
    args = parser.parse_args()

    config = setup_config(args.config, env=args.env)
    app = StreamlitApp(config)
    app.run()


if __name__ == "__main__":
    main()
@@ -2,11 +2,9 @@ import streamlit as st


class Configuration:
    """Handles configuration display and management"""

    def __init__(self, config):
        self.config = config

    def index(self):
        st.header("Current Configuration")
        st.title("Configuration")
        st.json(self.config.model_dump())
@@ -20,7 +20,7 @@ class Dashboard:
        self.experiment_runner = experiment_runner

    def index(self):
        st.header("Dashboard")
        st.title("Dashboard")
        col1, col2, col3, col4 = st.columns(4)

        # Load basic statistics
@@ -21,7 +21,7 @@ class DataOverview:
        self.config = config

    def index(self):
        st.header("Data Overview")
        st.title("Data Overview")
        data_files = {
            "Names": self.config.data.input_file,
            "Featured Dataset": self.config.data.output_files["featured"],
@@ -3,7 +3,7 @@ import plotly.express as px
import streamlit as st

from core.utils.data_loader import OPTIMIZED_DTYPES
from interface.log_reader import LogReader
from web.interfaces.log_reader import LogReader


@st.cache_data
@@ -21,7 +21,7 @@ class DataProcessing:
        self.pipeline_monitor = pipeline_monitor

    def index(self):
        st.header("Data Processing Pipeline")
        st.title("Data Processing")
        status = self.pipeline_monitor.get_pipeline_status()

        # Overall progress
@@ -12,8 +12,6 @@ from research.model_registry import list_available_models


class Experiments:
    """Handles experiment management interface"""

    def __init__(
        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
@@ -22,8 +20,7 @@ class Experiments:
        self.experiment_runner = experiment_runner

    def index(self):
        """Main experiments page"""
        st.header("Experiment Management")
        st.title("Experiments")
        tab1, tab2, tab3 = st.tabs(["New Experiment", "Experiment List", "Batch Experiments"])

        with tab1:
@@ -12,8 +12,6 @@ from research.experiment.experiment_tracker import ExperimentTracker


class Predictions:
    """Handles prediction interface"""

    def __init__(
        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
@@ -22,8 +20,7 @@ class Predictions:
        self.experiment_runner = experiment_runner

    def index(self):
        """Main predictions page"""
        st.header("Make Predictions")
        st.title("Predictions")

        # Load available models
        experiments = self.experiment_tracker.list_experiments()
@@ -11,8 +11,6 @@ from research.experiment.experiment_tracker import ExperimentTracker


class ResultsAnalysis:
    """Handles experiment results and analysis interface"""

    def __init__(
        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
@@ -21,8 +19,7 @@ class ResultsAnalysis:
        self.experiment_runner = experiment_runner

    def index(self):
        """Main results analysis page"""
        st.header("Results & Analysis")
        st.title("Results & Analysis")
        tab1, tab2, tab3 = st.tabs(
            ["Experiment Comparison", "Performance Analysis", "Model Analysis"]
        )
@@ -0,0 +1,22 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.dashboard import Dashboard

st.set_page_config(page_title="Dashboard", page_icon="📊", layout="wide")

if "config" in st.session_state:
    dashboard = Dashboard(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    dashboard.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,18 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.data_overview import DataOverview

st.set_page_config(page_title="Data Overview", page_icon="📋", layout="wide")

if "config" in st.session_state:
    data_overview = DataOverview(st.session_state.config)
    data_overview.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,18 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.data_processing import DataProcessing

st.set_page_config(page_title="Data Processing", page_icon="⚙️", layout="wide")

if "config" in st.session_state:
    data_processing = DataProcessing(st.session_state.config, st.session_state.pipeline_monitor)
    data_processing.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,22 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.experiments import Experiments

st.set_page_config(page_title="Experiments", page_icon="🧪", layout="wide")

if "config" in st.session_state:
    experiments = Experiments(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    experiments.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,22 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.results_analysis import ResultsAnalysis

st.set_page_config(page_title="Results & Analysis", page_icon="📈", layout="wide")

if "config" in st.session_state:
    results_analysis = ResultsAnalysis(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    results_analysis.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,22 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.predictions import Predictions

st.set_page_config(page_title="Predictions", page_icon="🔮", layout="wide")

if "config" in st.session_state:
    predictions = Predictions(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    )
    predictions.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
@@ -0,0 +1,18 @@
import sys
from pathlib import Path
import streamlit as st

# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent.parent
sys.path.insert(0, str(parent_dir))

from web.interfaces.configuration import Configuration

st.set_page_config(page_title="Configuration", page_icon="⚙️", layout="wide")

if "config" in st.session_state:
    configuration = Configuration(st.session_state.config)
    configuration.index()
else:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")