feat: web application multipage support

This commit is contained in:
2025-08-16 19:05:24 +02:00
parent 7b652d6999
commit 84f7d41a84
38 changed files with 765 additions and 507 deletions
-101
View File
@@ -1,101 +0,0 @@
#!.venv/bin/python3
import argparse
import streamlit as st
from core.config import setup_config, PipelineConfig
from core.utils.data_loader import DataLoader
from interface.configuration import Configuration
from interface.dashboard import Dashboard
from interface.data_overview import DataOverview
from interface.data_processing import DataProcessing
from interface.experiments import Experiments
from interface.predictions import Predictions
from interface.results_analysis import ResultsAnalysis
from processing.monitoring.pipeline_monitor import PipelineMonitor
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
# Page configuration
st.set_page_config(
page_title="DRC Names NLP Pipeline",
page_icon="🇨🇩",
layout="wide",
initial_sidebar_state="expanded",
)
class StreamlitApp:
    """Top-level Streamlit application.

    Wires up the shared pipeline services, the per-page interface
    components, session-state defaults, and the sidebar-driven router.
    """

    def __init__(self, config: PipelineConfig):
        self.config = config

        # Shared services used across pages.
        self.data_loader = DataLoader(self.config)
        self.experiment_tracker = ExperimentTracker(self.config)
        self.experiment_runner = ExperimentRunner(self.config)
        self.pipeline_monitor = PipelineMonitor()

        # Interface components; several share the same experiment services.
        experiment_deps = (self.config, self.experiment_tracker, self.experiment_runner)
        self.dashboard = Dashboard(*experiment_deps)
        self.data_overview = DataOverview(self.config)
        self.data_processing = DataProcessing(self.config, self.pipeline_monitor)
        self.experiments = Experiments(*experiment_deps)
        self.results_analysis = ResultsAnalysis(*experiment_deps)
        self.predictions = Predictions(*experiment_deps)
        self.configuration = Configuration(self.config)

        # Session-state defaults — only set when the key is absent so a
        # rerun does not clobber in-progress state.
        for key, default in (("current_experiment", None), ("experiment_results", {})):
            if key not in st.session_state:
                st.session_state[key] = default

    def run(self):
        """Render the page header and dispatch to the sidebar selection."""
        st.title("🇨🇩 DRC NERS Pipeline")
        st.markdown("A comprehensive tool for Congolese name analysis and gender prediction")

        # Route table: insertion order defines the sidebar option order.
        routes = {
            "Dashboard": self.dashboard.index,
            "Dataset Overview": self.data_overview.index,
            "Data Processing": self.data_processing.index,
            "Experiments": self.experiments.index,
            "Results & Analysis": self.results_analysis.index,
            "Predictions": self.predictions.index,
            "Configuration": self.configuration.index,
        }
        choice = st.sidebar.selectbox("Navigation", list(routes))
        # Unknown selections fall through to a no-op.
        routes.get(choice, lambda: None)()
def main():
    """CLI entry point: parse arguments, load config, launch the app."""
    arg_parser = argparse.ArgumentParser(
        description="DRC NERS Platform",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    arg_parser.add_argument("--config", type=str, help="Path to configuration file")
    arg_parser.add_argument("--env", type=str, default="development", help="Environment name")
    cli = arg_parser.parse_args()

    # Build the pipeline configuration for the requested environment,
    # then hand control to the Streamlit application.
    StreamlitApp(setup_config(cli.config, env=cli.env)).run()


if __name__ == "__main__":
    main()
+5 -2
View File
@@ -18,7 +18,8 @@ paths:
checkpoints_dir: "./data/checkpoints" # Directory for model checkpoints
# Pipeline stages
stages: # List of stages in the processing pipeline
# List of stages in the processing pipeline
stages:
- "data_cleaning" # Data cleaning stage
- "feature_extraction" # Feature extraction stage
- "ner_annotation" # NER-based annotation stage
@@ -36,6 +37,7 @@ processing:
- "utf-16"
- "latin1"
chunk_size: 100_000 # Size of data chunks to process in parallel
epochs: 2 # Number of Epochs for training
# Annotation settings
annotation:
@@ -72,8 +74,9 @@ data:
balance_by_sex: false # Should the dataset be balanced by sex when limiting the dataset size?
# Logging configuration
# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
logging:
level: "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
level: "INFO"
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
file_logging: true # Enable logging to file
console_logging: true # Enable logging to console
+4 -4
View File
@@ -7,7 +7,7 @@ baseline_experiments:
max_len: 20
embedding_dim: 64
gru_units: 32
epochs: 10
epochs: 2
batch_size: 32
tags: [ "baseline", "neural", "bigru" ]
@@ -21,7 +21,7 @@ baseline_experiments:
filters: 64
kernel_size: 3
dropout: 0.5
epochs: 10
epochs: 2
batch_size: 32
tags: [ "baseline", "neural", "cnn" ]
@@ -79,7 +79,7 @@ baseline_experiments:
model_params:
embedding_dim: 128
lstm_units: 64
epochs: 10
epochs: 2
batch_size: 64
tags: [ "baseline", "neural", "lstm" ]
@@ -121,7 +121,7 @@ baseline_experiments:
embedding_dim: 128
num_heads: 4
num_layers: 2
epochs: 10
epochs: 2
batch_size: 64
tags: [ "baseline", "neural", "transformer" ]
+145
View File
@@ -0,0 +1,145 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null
[system]
gpu_allocator = null
seed = 42
[nlp]
lang = "fr"
pipeline = ["tok2vec","ner"]
batch_size = 100000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}
[components]
[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,1000,2500,2500]
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null
before_update = null
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001
[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.tokenizer]
+1
View File
@@ -12,3 +12,4 @@ class ProcessingConfig(BaseModel):
use_multiprocessing: bool = False
encoding_options: list = field(default_factory=lambda: ["utf-8", "utf-16", "latin1"])
chunk_size: int = 100_000
epochs: int = 2
+16 -9
View File
@@ -7,24 +7,24 @@ import traceback
from pathlib import Path
from core.config import setup_config, PipelineConfig
from processing.ner.ner_data_builder import NERDataBuilder
from processing.ner.ner_engineering import NEREngineering
from processing.ner.ner_name_model import NERNameModel
from processing.ner.name_builder import NameBuilder
from processing.ner.name_engineering import NameEngineering
from processing.ner.name_model import NameModel
def feature(config: PipelineConfig):
"""Apply feature engineering to create position-independent NER dataset."""
NEREngineering(config).compute()
NameEngineering(config).compute()
def build(config: PipelineConfig):
"""Build NER dataset using NERDataBuilder."""
NERDataBuilder(config).build()
NameBuilder(config).build()
def train(config: PipelineConfig):
"""Train the NER model."""
trainer = NERNameModel(config)
trainer = NameModel(config)
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
if not data_path.exists():
@@ -39,7 +39,10 @@ def train(config: PipelineConfig):
logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
trainer.train(
data=train_data, epochs=1, batch_size=config.processing.batch_size, dropout_rate=0.3
data=train_data,
epochs=config.processing.epochs,
batch_size=config.processing.batch_size,
dropout_rate=0.3,
)
trainer.evaluate(eval_data)
@@ -48,13 +51,17 @@ def train(config: PipelineConfig):
def run_pipeline(config: PipelineConfig, reset: bool = False):
if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["engineered"])):
if not reset and os.path.exists(
config.paths.get_data_path(config.data.output_files["engineered"])
):
logging.info("Step 1: Feature engineering already done.")
else:
logging.info("Step 1: Running feature engineering")
feature(config)
if not reset and os.path.exists(config.paths.get_data_path(config.data.output_files["ner_data"])):
if not reset and os.path.exists(
config.paths.get_data_path(config.data.output_files["ner_data"])
):
logging.info("Step 2: NER dataset already built.")
else:
logging.info("Step 2: Building NER dataset")
View File
View File
View File
View File
View File
View File
View File
+68
View File
@@ -0,0 +1,68 @@
import json
import logging
import spacy
from spacy.tokens import DocBin
from core.config import PipelineConfig
from core.utils.data_loader import DataLoader
from .name_tagger import NameTagger
class NameBuilder:
    """Builds the NER training dataset (JSON pairs + spaCy DocBin) from
    the feature-engineered CSV produced earlier in the pipeline."""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.data_loader = DataLoader(config)
        # Tagger instance used to parse and validate entity annotations.
        self.tagger = NameTagger()

    def build(self) -> int:
        """Create the NER training artifacts.

        Returns:
            0 on success, 1 when no usable training data is found
            (shell-style exit code for the calling CLI).
        """
        filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
        df = self.data_loader.load_csv_complete(filepath)
        df = df[["name", "ner_tagged", "ner_entities"]]

        # Filter early to keep memory usage down.
        ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
        if ner_df.empty:
            logging.error("No NER tagged data found")
            return 1
        total_rows = len(df)
        del df  # No need to keep in memory

        logging.info(f"Found {len(ner_df)} NER tagged entries")
        nlp = spacy.blank("fr")

        # BUG FIX: parse_entities/validate_entities are instance methods on
        # NameTagger; calling them on the class bound the Series to `self`
        # and raised a TypeError. Use the tagger instance from __init__.
        parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
        validated_entities = self.tagger.validate_entities(ner_df["name"], parsed_entities)

        # Drop rows with no valid entities.
        mask = validated_entities.map(bool)
        ner_df = ner_df.loc[mask]
        validated_entities = validated_entities.loc[mask]
        if ner_df.empty:
            logging.error("No valid training examples after validation")
            return 1

        # Prepare training data as (text, {"entities": [...]}) pairs.
        training_data = list(
            zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
        )

        # Batch-create spaCy Docs and bundle them into a DocBin.
        docs = NameTagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
        doc_bin = DocBin(docs=docs)

        # Persist both representations.
        json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
        spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
        doc_bin.to_disk(spacy_path)

        logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
        logging.info(f"Saved NER JSON to {json_path}")
        logging.info(f"Saved NER spacy to {spacy_path}")
        return 0
@@ -1,5 +1,5 @@
import gc
import random
from typing import List
import logging
import numpy as np
@@ -7,7 +7,7 @@ import pandas as pd
from tqdm import tqdm
from core.config import PipelineConfig
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
from core.utils.data_loader import DataLoader
from processing.ner.formats.connectors_format import ConnectorFormatter
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
from processing.ner.formats.native_only_format import NativeOnlyFormatter
@@ -16,7 +16,7 @@ from processing.ner.formats.position_flipped_format import PositionFlippedFormat
from processing.ner.formats.reduced_native_format import ReducedNativeFormatter
class NEREngineering:
class NameEngineering:
"""
Feature engineering for NER dataset to prevent position-based learning
and encourage sequence characteristic learning.
@@ -66,13 +66,16 @@ class NEREngineering:
def compute(self) -> None:
logging.info("Applying feature engineering transformations...")
input_filepath = self.config.paths.get_data_path(self.config.data.output_files["featured"])
output_filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
output_filepath = self.config.paths.get_data_path(
self.config.data.output_files["engineered"]
)
df = self.data_loader.load_csv_complete(input_filepath)
ner_df = df[df["ner_tagged"] == 1].copy()
logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records")
del df # No need to keep in memory
gc.collect()
ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
drop=True
@@ -1,3 +1,4 @@
import ast
import json
import logging
import os
@@ -11,7 +12,7 @@ from spacy.util import minibatch
from core.config.pipeline_config import PipelineConfig
class NERNameModel:
class NameModel:
"""NER model trainer using spaCy for DRC names entity recognition"""
def __init__(self, config: PipelineConfig):
@@ -84,8 +85,6 @@ class NERNameModel:
if isinstance(entities_raw, str):
# String format from tagger: "[(0, 6, 'NATIVE'), ...]"
try:
import ast
entities = ast.literal_eval(entities_raw)
if not isinstance(entities, list):
logging.warning(
@@ -175,9 +174,9 @@ class NERNameModel:
def train(
self,
data: List[Tuple[str, Dict]],
epochs: int = 5,
batch_size: int = 16,
dropout_rate: float = 0.2,
epochs: int = 1,
batch_size: int = 10_000,
dropout_rate: float = 0.3,
) -> None:
"""Train the NER model"""
logging.info(f"Starting NER training with {len(data)} examples")
@@ -204,7 +203,7 @@ class NERNameModel:
example = Example.from_dict(doc, annotations)
examples.append(example)
logging.info(
f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}"
f"Training example: {text[:30]} with entities {annotations.get('entities', [])}"
)
# Train in batches
@@ -215,6 +214,7 @@ class NERNameModel:
)
logging.info(f"Training batch with {len(batch)} examples, current losses: {losses}")
del batches # free memory
epoch_loss = losses.get("ner", 0)
losses_history.append(epoch_loss)
logging.info(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}")
+273
View File
@@ -0,0 +1,273 @@
from typing import Union, Dict, Any, List
import ast
import json
import logging
import pandas as pd
from spacy.util import filter_spans
class NameTagger:
    """Creates, parses, and validates NER annotations for DRC names.

    Annotations are character spans ``(start, end, label)`` with labels
    ``NATIVE`` and ``SURNAME``. The class is stateless; ``parse_entities``
    and ``validate_entities`` are now classmethods so callers (e.g.
    NameBuilder) may invoke them on either the class or an instance,
    matching the other classmethod helpers here.
    """

    def tag_name(
        self, name: str, probable_native: str, probable_surname: str
    ) -> Union[Dict[str, Any], None]:
        """Create a single NER training example using probable_native and probable_surname"""
        if not name or not probable_native or not probable_surname:
            return None

        name = name.strip()
        probable_native = probable_native.strip()
        probable_surname = probable_surname.strip()

        entities = []
        used_spans = []  # Track used character spans to prevent overlaps

        # Helper function to check if a span overlaps with any existing span
        def has_overlap(start, end):
            for used_start, used_end in used_spans:
                if not (end <= used_start or start >= used_end):
                    return True
            return False

        # Find positions of native names in the full name
        native_words = probable_native.split()
        name_lower = name.lower()  # Use lowercase for consistent searching
        processed_native_words = set()

        for native_word in native_words:
            native_word = native_word.strip()
            if len(native_word) < 2:  # Skip very short words
                continue
            native_word_lower = native_word.lower()

            # Skip if we've already processed this exact word
            if native_word_lower in processed_native_words:
                continue
            processed_native_words.add(native_word_lower)

            # Find the first occurrence of this native word that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(native_word_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position - make sure we only include the word itself
                end_pos = pos + len(native_word_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != native_word_lower:
                    start_pos = pos + 1
                    continue

                # Check if this is a word boundary match and doesn't overlap
                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "NATIVE"))
                    used_spans.append((pos, end_pos))
                    break  # Only take the first non-overlapping occurrence
                start_pos = pos + 1

        # Find position of surname in the full name
        if probable_surname and len(probable_surname.strip()) >= 2:
            surname_lower = probable_surname.lower()

            # Find the first occurrence that doesn't overlap
            start_pos = 0
            while True:
                pos = name_lower.find(surname_lower, start_pos)  # Case-insensitive search
                if pos == -1:
                    break

                # Calculate end position correctly - exact match only
                end_pos = pos + len(surname_lower)

                # Double-check that the extracted span matches exactly what we expect
                extracted_text = name[pos:end_pos]  # Get original case text
                if extracted_text.lower() != surname_lower:
                    start_pos = pos + 1
                    continue

                if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
                    pos, end_pos
                ):
                    entities.append((pos, end_pos, "SURNAME"))
                    used_spans.append((pos, end_pos))
                    break
                start_pos = pos + 1

        if not entities:
            logging.warning(
                f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
            )
            return None

        # Sort entities by position and validate
        entities.sort(key=lambda x: x[0])

        # Final validation - ensure no overlaps and valid spans
        validated_entities = []
        for start, end, label in entities:
            # Check bounds
            if not (0 <= start < end <= len(name)):
                logging.warning(
                    f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
                )
                continue

            # Check for overlaps with already validated entities
            if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities):
                logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'")
                continue

            # CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
            span_text = name[start:end]
            if not span_text or span_text != span_text.strip() or " " in span_text:
                logging.warning(
                    f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
                )
                continue

            validated_entities.append((start, end, label))

        if not validated_entities:
            logging.warning(f"No valid entities after validation for: '{name}'")
            return None

        # Convert to string format that matches the dataset
        entities_str = str(validated_entities)
        return {
            "entities": entities_str,
            "spans": validated_entities,  # Keep the original tuples for internal use
        }

    @classmethod
    def _is_word_boundary_match(cls, text: str, start: int, end: int) -> bool:
        """Check if the match is at word boundaries"""
        # Check character before start position
        if start > 0:
            prev_char = text[start - 1]
            if prev_char.isalnum():
                return False
        # Check character after end position
        if end < len(text):
            next_char = text[end]
            if next_char.isalnum():
                return False
        return True

    @classmethod
    def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
        """Extract the actual text for each entity type"""
        result = {"NATIVE": [], "SURNAME": []}
        try:
            entities = ast.literal_eval(entities_str)
            for start, end, label in entities:
                if 0 <= start < end <= len(name):
                    span_text = name[start:end]
                    if label in result:
                        result[label].append(span_text)
        except (ValueError, SyntaxError, TypeError):
            pass
        return result

    @classmethod
    def parse(cls, entities_str: str) -> List[tuple]:
        """Parse entity strings from various formats.

        Supports formats:
        - [(start, end, label), ...]
        - [[start, end, label], ...]
        - [{"start": start, "end": end, "label": label}, ...]
        """
        if not entities_str or entities_str in ["[]", "", "nan"]:
            return []
        entities_str = str(entities_str).strip()
        try:
            if entities_str.startswith("[(") and entities_str.endswith(")]"):
                return ast.literal_eval(entities_str)
            elif entities_str.startswith("[[") and entities_str.endswith("]]"):
                return [tuple(e) for e in ast.literal_eval(entities_str)]
            elif entities_str.startswith("[{") and entities_str.endswith("}]"):
                return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
            else:
                parsed = ast.literal_eval(entities_str)
                return [tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3]
        except (ValueError, SyntaxError, json.JSONDecodeError):
            return []

    # CONSISTENCY FIX: classmethod (was an instance method) so it can be
    # called as NameTagger.parse_entities(...) like the other helpers;
    # instance calls still work unchanged.
    @classmethod
    def parse_entities(cls, series: pd.Series) -> pd.Series:
        """Vectorized parse of entity strings."""
        return series.map(cls.parse)

    @classmethod
    def validate(cls, text: str, entities: List[tuple]) -> List[tuple]:
        """Advanced entity validation with overlap removal.

        This is more comprehensive than the basic validate_entities method.
        """
        if not entities or not text:
            return []
        text = str(text).strip()
        valid = []
        for ent in entities:
            if not isinstance(ent, (list, tuple)) or len(ent) != 3:
                continue
            start, end, label = ent
            try:
                start, end = int(start), int(end)
            except (ValueError, TypeError):
                continue
            if not isinstance(label, str):
                continue
            if not (0 <= start < end <= len(text)):
                continue
            if not text[start:end].strip():
                continue
            valid.append((start, end, label))
        if not valid:
            return []
        valid.sort(key=lambda x: (x[0], x[1]))
        # Remove overlaps: keep the earliest-starting span, drop any that
        # begins before the previous kept span ends.
        filtered, last_end = [], -1
        for s, e, l in valid:
            if s >= last_end:
                filtered.append((s, e, l))
                last_end = e
        return filtered

    # CONSISTENCY FIX: classmethod (was an instance method), same reason
    # as parse_entities above.
    @classmethod
    def validate_entities(cls, texts: pd.Series, entities_series: pd.Series) -> pd.Series:
        """Vectorized entity validation."""
        return pd.Series(map(cls.validate, texts, entities_series), index=texts.index)

    @classmethod
    def create_docs(cls, nlp, texts: List[str], entities: List[List[tuple]]) -> List:
        """Batch create spaCy Docs from texts and entities."""
        docs = []
        for text, ents in zip(texts, entities):
            doc = nlp(text)
            spans = []
            for start, end, label in ents:
                # Prefer "contract" alignment (shrink to token boundaries),
                # fall back to strict exact alignment.
                span = doc.char_span(
                    start, end, label=label, alignment_mode="contract"
                ) or doc.char_span(start, end, label=label, alignment_mode="strict")
                if span:
                    spans.append(span)
            doc.ents = filter_spans(spans)
            docs.append(doc)
        return docs
-149
View File
@@ -1,149 +0,0 @@
import ast
import json
import logging
from pathlib import Path
import pandas as pd
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from core.config import PipelineConfig
from core.utils.data_loader import DataLoader
class NERDataBuilder:
    """Builds the NER training dataset from the engineered CSV.

    Produces two artifacts: a JSON file of (text, {"entities": [...]})
    training pairs and a spaCy DocBin of pre-annotated Docs.
    """

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.data_loader = DataLoader(config)

    @staticmethod
    def _parse_entities(series: pd.Series) -> pd.Series:
        """Vectorized parse of entity strings."""

        def _parse(entities_str):
            # Treat empty / placeholder values as "no entities".
            if not entities_str or entities_str in ["[]", "", "nan"]:
                return []
            entities_str = str(entities_str).strip()
            try:
                # Three accepted serializations: list of tuples, list of
                # lists, or list of {"start","end","label"} dicts.
                if entities_str.startswith("[(") and entities_str.endswith(")]"):
                    return ast.literal_eval(entities_str)
                elif entities_str.startswith("[[") and entities_str.endswith("]]"):
                    return [tuple(e) for e in ast.literal_eval(entities_str)]
                elif entities_str.startswith("[{") and entities_str.endswith("}]"):
                    return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
                else:
                    # Fallback: accept any literal list of 3-element items.
                    parsed = ast.literal_eval(entities_str)
                    return [
                        tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3
                    ]
            except (ValueError, SyntaxError, json.JSONDecodeError):
                # Malformed annotations degrade to "no entities" rather
                # than aborting the whole build.
                return []

        return series.map(_parse)

    @staticmethod
    def _validate_entities(texts: pd.Series, entities_series: pd.Series) -> pd.Series:
        """Vectorized entity validation.

        Keeps only well-formed (start, end, label) spans with valid
        bounds and non-empty text, then removes overlapping spans.
        """

        def _validate(text, entities):
            if not entities or not text:
                return []
            text = str(text).strip()
            valid = []
            for ent in entities:
                if not isinstance(ent, (list, tuple)) or len(ent) != 3:
                    continue
                start, end, label = ent
                try:
                    start, end = int(start), int(end)
                except (ValueError, TypeError):
                    continue
                if not isinstance(label, str):
                    continue
                if not (0 <= start < end <= len(text)):
                    continue
                # Reject spans covering only whitespace.
                if not text[start:end].strip():
                    continue
                valid.append((start, end, label))
            if not valid:
                return []
            valid.sort(key=lambda x: (x[0], x[1]))
            # remove overlaps
            filtered, last_end = [], -1
            for s, e, l in valid:
                if s >= last_end:
                    filtered.append((s, e, l))
                    last_end = e
            return filtered

        return pd.Series(map(_validate, texts, entities_series), index=texts.index)

    @staticmethod
    def _create_docs(nlp, texts, entities):
        """Batch create spaCy Docs."""
        docs = []
        for text, ents in zip(texts, entities):
            doc = nlp(text)
            spans = []
            for start, end, label in ents:
                # Prefer "contract" alignment (shrink to token boundaries),
                # fall back to strict exact alignment.
                span = doc.char_span(
                    start, end, label=label, alignment_mode="contract"
                ) or doc.char_span(start, end, label=label, alignment_mode="strict")
                if span:
                    spans.append(span)
            doc.ents = filter_spans(spans)
            docs.append(doc)
        return docs

    def build(self) -> int:
        """Build the NER dataset; returns 0 on success, 1 on no data."""
        filepath = self.config.paths.get_data_path(self.config.data.output_files["engineered"])
        df = self.data_loader.load_csv_complete(filepath)
        df = df[["name", "ner_tagged", "ner_entities"]]

        # Filter early
        ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
        if ner_df.empty:
            logging.error("No NER tagged data found")
            return 1
        total_rows = len(df)
        del df  # No need to keep in memory

        logging.info(f"Found {len(ner_df)} NER tagged entries")
        nlp = spacy.blank("fr")

        # Vectorized parsing + validation
        parsed_entities = self._parse_entities(ner_df["ner_entities"])
        validated_entities = self._validate_entities(ner_df["name"], parsed_entities)

        # Drop rows with no valid entities
        mask = validated_entities.map(bool)
        ner_df = ner_df.loc[mask]
        validated_entities = validated_entities.loc[mask]
        if ner_df.empty:
            logging.error("No valid training examples after validation")
            return 1

        # Prepare training data
        training_data = list(
            zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
        )

        # Create spaCy DocBin in batch
        docs = self._create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
        doc_bin = DocBin(docs=docs)

        # Save
        json_path = self.config.paths.get_data_path(self.config.data.output_files["ner_data"])
        spacy_path = self.config.paths.get_data_path(self.config.data.output_files["ner_spacy"])
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
        doc_bin.to_disk(spacy_path)

        logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
        logging.info(f"Saved NER JSON to {json_path}")
        logging.info(f"Saved NER spacy to {spacy_path}")
        return 0
-212
View File
@@ -1,212 +0,0 @@
from typing import Union, Dict, Any, List
import logging
class NERNameTagger:
def tag_name(
self, name: str, probable_native: str, probable_surname: str
) -> Union[Dict[str, Any], None]:
"""Create a single NER training example using probable_native and probable_surname"""
if not name or not probable_native or not probable_surname:
return None
name = name.strip()
probable_native = probable_native.strip()
probable_surname = probable_surname.strip()
entities = []
used_spans = [] # Track used character spans to prevent overlaps
# Helper function to check if a span overlaps with any existing span
def has_overlap(start, end):
for used_start, used_end in used_spans:
if not (end <= used_start or start >= used_end):
return True
return False
# Find positions of native names in the full name
native_words = probable_native.split()
name_lower = name.lower() # Use lowercase for consistent searching
processed_native_words = set()
for native_word in native_words:
native_word = native_word.strip()
if len(native_word) < 2: # Skip very short words
continue
native_word_lower = native_word.lower()
# Skip if we've already processed this exact word
if native_word_lower in processed_native_words:
continue
processed_native_words.add(native_word_lower)
# Find the first occurrence of this native word that doesn't overlap
start_pos = 0
while True:
pos = name_lower.find(native_word_lower, start_pos) # Case-insensitive search
if pos == -1:
break
# Calculate end position - make sure we only include the word itself
end_pos = pos + len(native_word_lower)
# Double-check that the extracted span matches exactly what we expect
extracted_text = name[pos:end_pos] # Get original case text
if extracted_text.lower() != native_word_lower:
start_pos = pos + 1
continue
# Check if this is a word boundary match and doesn't overlap
if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
pos, end_pos
):
entities.append((pos, end_pos, "NATIVE"))
used_spans.append((pos, end_pos))
break # Only take the first non-overlapping occurrence
start_pos = pos + 1
# Find position of surname in the full name
if probable_surname and len(probable_surname.strip()) >= 2:
surname_lower = probable_surname.lower()
# Find the first occurrence that doesn't overlap
start_pos = 0
while True:
pos = name_lower.find(surname_lower, start_pos) # Case-insensitive search
if pos == -1:
break
# Calculate end position correctly - exact match only
end_pos = pos + len(surname_lower)
# Double-check that the extracted span matches exactly what we expect
extracted_text = name[pos:end_pos] # Get original case text
if extracted_text.lower() != surname_lower:
start_pos = pos + 1
continue
if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
pos, end_pos
):
entities.append((pos, end_pos, "SURNAME"))
used_spans.append((pos, end_pos))
break
start_pos = pos + 1
if not entities:
logging.warning(
f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
)
return None
# Sort entities by position and validate
entities.sort(key=lambda x: x[0])
# Final validation - ensure no overlaps and valid spans
validated_entities = []
for start, end, label in entities:
# Check bounds
if not (0 <= start < end <= len(name)):
logging.warning(
f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
)
continue
# Check for overlaps with already validated entities
if any(start < v_end and end > v_start for v_start, v_end, _ in validated_entities):
logging.warning(f"Overlapping span ({start}, {end}, '{label}') in '{name}'")
continue
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
span_text = name[start:end]
if not span_text or span_text != span_text.strip() or " " in span_text:
logging.warning(
f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
)
continue
validated_entities.append((start, end, label))
if not validated_entities:
logging.warning(f"No valid entities after validation for: '{name}'")
return None
# Convert to string format that matches the dataset
entities_str = str(validated_entities)
return {
"entities": entities_str,
"spans": validated_entities, # Keep the original tuples for internal use
}
@classmethod
def _is_word_boundary_match(cls, text: str, start: int, end: int) -> bool:
"""Check if the match is at word boundaries"""
# Check character before start position
if start > 0:
prev_char = text[start - 1]
if prev_char.isalnum():
return False
# Check character after end position
if end < len(text):
next_char = text[end]
if next_char.isalnum():
return False
return True
@classmethod
def validate_entities(cls, name: str, entities_str: str) -> bool:
    """Validate that entity annotations are correct for a given name.

    An annotation string is valid when it parses (via ``ast.literal_eval``)
    as a list of ``(start, end, label)`` tuples whose spans are in-bounds,
    non-overlapping, and each cover a non-empty single word.

    Args:
        name: The full name the spans index into.
        entities_str: String representation of a list of span tuples.

    Returns:
        True when every span is well-formed, False otherwise (including
        when the string does not parse).
    """
    try:
        import ast

        entities = ast.literal_eval(entities_str)
        sorted_entities = sorted(entities, key=lambda x: x[0])
        for i, (start, end, label) in enumerate(sorted_entities):
            # Span must lie inside the name and be non-degenerate.
            if not (0 <= start < end <= len(name)):
                return False
            # Adjacent (sorted) spans must not overlap.
            if i < len(sorted_entities) - 1 and end > sorted_entities[i + 1][0]:
                return False
            span_text = name[start:end]
            # Mirror the creation-time "CRITICAL VALIDATION": a span must be
            # a non-empty single word with no surrounding or internal spaces.
            # (Previously only emptiness was checked, so annotations the
            # builder would reject could still pass validation here.)
            if not span_text or span_text != span_text.strip() or " " in span_text:
                return False
        return True
    except (ValueError, SyntaxError, TypeError):
        return False
@classmethod
def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
    """Collect the substrings of ``name`` covered by each entity label.

    Spans that are out of bounds, and labels other than NATIVE/SURNAME,
    are silently ignored; unparsable input yields empty lists.
    """
    collected: Dict[str, List[str]] = {"NATIVE": [], "SURNAME": []}
    try:
        import ast

        for begin, stop, tag in ast.literal_eval(entities_str):
            within_bounds = 0 <= begin < stop <= len(name)
            if within_bounds and tag in collected:
                collected[tag].append(name[begin:stop])
    except (ValueError, SyntaxError, TypeError):
        pass
    return collected
View File
+89
View File
@@ -0,0 +1,89 @@
#!.venv/bin/python3
import argparse
import sys
from pathlib import Path
import streamlit as st
# Add parent directory to Python path to access core modules
parent_dir = Path(__file__).parent.parent
sys.path.insert(0, str(parent_dir))
from core.config import setup_config, PipelineConfig
from core.utils.data_loader import DataLoader
from processing.monitoring.pipeline_monitor import PipelineMonitor
from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker
from web.interfaces.configuration import Configuration
from web.interfaces.dashboard import Dashboard
from web.interfaces.data_overview import DataOverview
from web.interfaces.data_processing import DataProcessing
from web.interfaces.experiments import Experiments
from web.interfaces.predictions import Predictions
from web.interfaces.results_analysis import ResultsAnalysis
# Page configuration
st.set_page_config(
page_title="DRC NERS Platform",
page_icon="🇨🇩",
layout="wide",
initial_sidebar_state="expanded",
)
def initialize_session_state(config: PipelineConfig):
    """Seed st.session_state with the shared pipeline services, once each.

    Existing keys are left untouched so reruns keep their state.
    """
    # Lazy factories: heavy objects are only constructed when missing.
    defaults = {
        "config": lambda: config,
        "data_loader": lambda: DataLoader(config),
        "experiment_tracker": lambda: ExperimentTracker(config),
        "experiment_runner": lambda: ExperimentRunner(config),
        "pipeline_monitor": lambda: PipelineMonitor(),
        "current_experiment": lambda: None,
        "experiment_results": lambda: {},
    }
    for key, build in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = build()
# Thin application wrapper: stores the pipeline configuration, seeds the
# shared session-state services, and renders the landing page.
class StreamlitApp:
def __init__(self, config: PipelineConfig):
self.config = config
# Ensure loaders/trackers/monitor exist in st.session_state before any page runs.
initialize_session_state(config)
# Render the landing page: title, tagline, and a project overview in Markdown.
def run(self):
st.title("🇨🇩 DRC NERS Pipeline")
st.markdown(
"A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
)
st.markdown(
"""
## Overview
Despite the growing success of gender inference models in Natural Language Processing (NLP), these tools often
underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training
data.
This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5
million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
"""
)
def main():
    """CLI entry point: parse arguments, load the config, launch the app."""
    cli = argparse.ArgumentParser(
        description="DRC NERS Platform",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument("--config", type=str, help="Path to configuration file")
    cli.add_argument("--env", type=str, default="development", help="Environment name")
    options = cli.parse_args()
    # Build the environment-specific configuration, then hand off to Streamlit.
    StreamlitApp(setup_config(options.config, env=options.env)).run()


if __name__ == "__main__":
    main()
View File
@@ -2,11 +2,9 @@ import streamlit as st
class Configuration:
"""Handles configuration display and management"""
def __init__(self, config):
self.config = config
def index(self):
st.header("Current Configuration")
st.title("Configuration")
st.json(self.config.model_dump())
@@ -20,7 +20,7 @@ class Dashboard:
self.experiment_runner = experiment_runner
def index(self):
st.header("Dashboard")
st.title("Dashboard")
col1, col2, col3, col4 = st.columns(4)
# Load basic statistics
@@ -21,7 +21,7 @@ class DataOverview:
self.config = config
def index(self):
st.header("Data Overview")
st.title("Data Overview")
data_files = {
"Names": self.config.data.input_file,
"Featured Dataset": self.config.data.output_files["featured"],
@@ -3,7 +3,7 @@ import plotly.express as px
import streamlit as st
from core.utils.data_loader import OPTIMIZED_DTYPES
from interface.log_reader import LogReader
from web.interfaces.log_reader import LogReader
@st.cache_data
@@ -21,7 +21,7 @@ class DataProcessing:
self.pipeline_monitor = pipeline_monitor
def index(self):
st.header("Data Processing Pipeline")
st.title("Data Processing")
status = self.pipeline_monitor.get_pipeline_status()
# Overall progress
@@ -12,8 +12,6 @@ from research.model_registry import list_available_models
class Experiments:
"""Handles experiment management interface"""
def __init__(
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
):
@@ -22,8 +20,7 @@ class Experiments:
self.experiment_runner = experiment_runner
def index(self):
"""Main experiments page"""
st.header("Experiment Management")
st.title("Experiments")
tab1, tab2, tab3 = st.tabs(["New Experiment", "Experiment List", "Batch Experiments"])
with tab1:
@@ -12,8 +12,6 @@ from research.experiment.experiment_tracker import ExperimentTracker
class Predictions:
"""Handles prediction interface"""
def __init__(
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
):
@@ -22,8 +20,7 @@ class Predictions:
self.experiment_runner = experiment_runner
def index(self):
"""Main predictions page"""
st.header("Make Predictions")
st.title("Predictions")
# Load available models
experiments = self.experiment_tracker.list_experiments()
@@ -11,8 +11,6 @@ from research.experiment.experiment_tracker import ExperimentTracker
class ResultsAnalysis:
"""Handles experiment results and analysis interface"""
def __init__(
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
):
@@ -21,8 +19,7 @@ class ResultsAnalysis:
self.experiment_runner = experiment_runner
def index(self):
"""Main results analysis page"""
st.header("Results & Analysis")
st.title("Results & Analysis")
tab1, tab2, tab3 = st.tabs(
["Experiment Comparison", "Performance Analysis", "Model Analysis"]
)
+22
View File
@@ -0,0 +1,22 @@
import sys
from pathlib import Path

import streamlit as st

# Make the project root importable when Streamlit runs this page directly.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from web.interfaces.dashboard import Dashboard

st.set_page_config(page_title="Dashboard", page_icon="📊", layout="wide")

# The main app seeds session state; without it this page cannot render.
if "config" not in st.session_state:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
else:
    Dashboard(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    ).index()
+18
View File
@@ -0,0 +1,18 @@
import sys
from pathlib import Path

import streamlit as st

# Make the project root importable when Streamlit runs this page directly.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from web.interfaces.data_overview import DataOverview

st.set_page_config(page_title="Data Overview", page_icon="📋", layout="wide")

# The main app seeds session state; without it this page cannot render.
if "config" not in st.session_state:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
else:
    DataOverview(st.session_state.config).index()
+18
View File
@@ -0,0 +1,18 @@
import sys
from pathlib import Path

import streamlit as st

# Make the project root importable when Streamlit runs this page directly.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from web.interfaces.data_processing import DataProcessing

st.set_page_config(page_title="Data Processing", page_icon="⚙️", layout="wide")

# The main app seeds session state; without it this page cannot render.
if "config" not in st.session_state:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
else:
    DataProcessing(st.session_state.config, st.session_state.pipeline_monitor).index()
+22
View File
@@ -0,0 +1,22 @@
import sys
from pathlib import Path

import streamlit as st

# Make the project root importable when Streamlit runs this page directly.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from web.interfaces.experiments import Experiments

st.set_page_config(page_title="Experiments", page_icon="🧪", layout="wide")

# The main app seeds session state; without it this page cannot render.
if "config" not in st.session_state:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
else:
    Experiments(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    ).index()
+22
View File
@@ -0,0 +1,22 @@
import sys
from pathlib import Path

import streamlit as st

# Make the project root importable when Streamlit runs this page directly.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from web.interfaces.results_analysis import ResultsAnalysis

st.set_page_config(page_title="Results & Analysis", page_icon="📈", layout="wide")

# The main app seeds session state; without it this page cannot render.
if "config" not in st.session_state:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
else:
    ResultsAnalysis(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    ).index()
+22
View File
@@ -0,0 +1,22 @@
import sys
from pathlib import Path

import streamlit as st

# Make the project root importable when Streamlit runs this page directly.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from web.interfaces.predictions import Predictions

st.set_page_config(page_title="Predictions", page_icon="🔮", layout="wide")

# The main app seeds session state; without it this page cannot render.
if "config" not in st.session_state:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
else:
    Predictions(
        st.session_state.config,
        st.session_state.experiment_tracker,
        st.session_state.experiment_runner,
    ).index()
+18
View File
@@ -0,0 +1,18 @@
import sys
from pathlib import Path

import streamlit as st

# Make the project root importable when Streamlit runs this page directly.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from web.interfaces.configuration import Configuration

st.set_page_config(page_title="Configuration", page_icon="⚙️", layout="wide")

# The main app seeds session state; without it this page cannot render.
if "config" not in st.session_state:
    st.error("Please run the main app first to initialize the configuration.")
    st.markdown("Go back to the [main page](/) to start the application.")
else:
    Configuration(st.session_state.config).index()