hotfixes
This commit is contained in:
@@ -0,0 +1,8 @@
|
||||
notebooks/* linguist-vendored
|
||||
|
||||
linguist-detectable=false
|
||||
|
||||
*.ipynb linguist-detectable=false
|
||||
|
||||
# Enforce Unix newlines
|
||||
*.py text eol=lf
|
||||
@@ -24,29 +24,29 @@ def build(config: PipelineConfig):
|
||||
|
||||
def train(config: PipelineConfig):
|
||||
"""Train the NER model."""
|
||||
trainer = NameModel(config)
|
||||
name_model = NameModel(config)
|
||||
|
||||
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
|
||||
if not data_path.exists():
|
||||
logging.info("NER data not found. Building dataset first...")
|
||||
build(config)
|
||||
|
||||
trainer.create_blank_model("fr")
|
||||
data = trainer.load_data(str(data_path))
|
||||
name_model.create_blank_model("fr")
|
||||
data = name_model.load_data(str(data_path))
|
||||
|
||||
split_idx = int(len(data) * 0.9)
|
||||
train_data, eval_data = data[:split_idx], data[split_idx:]
|
||||
|
||||
logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
|
||||
trainer.train(
|
||||
name_model.train(
|
||||
data=train_data,
|
||||
epochs=config.processing.epochs,
|
||||
batch_size=config.processing.batch_size,
|
||||
dropout_rate=0.3,
|
||||
)
|
||||
trainer.evaluate(eval_data)
|
||||
name_model.evaluate(eval_data)
|
||||
|
||||
model_path = trainer.save()
|
||||
model_path = name_model.save()
|
||||
logging.info(f"Model saved to: {model_path}")
|
||||
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import logging
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
import pandas as pd
|
||||
from typing import Dict, Any
|
||||
import time
|
||||
|
||||
from processing.batch.batch_config import BatchConfig
|
||||
from processing.batch.batch_processor import BatchProcessor
|
||||
@@ -49,9 +49,6 @@ class Pipeline:
|
||||
"processed_batches": step.state.processed_batches,
|
||||
"total_batches": step.state.total_batches,
|
||||
"failed_batches": len(step.state.failed_batches),
|
||||
"completion_percentage": (
|
||||
step.state.processed_batches / max(1, step.state.total_batches)
|
||||
)
|
||||
* 100,
|
||||
"completion_percentage": (step.state.processed_batches / max(1, step.state.total_batches)) * 100,
|
||||
}
|
||||
return progress
|
||||
|
||||
@@ -7,7 +7,7 @@ import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from core.utils.region_mapper import RegionMapper
|
||||
from processing.ner.ner_name_tagger import NERNameTagger
|
||||
from processing.ner.name_tagger import NameTagger
|
||||
from processing.steps import PipelineStep
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ class FeatureExtractionStep(PipelineStep):
|
||||
def __init__(self, pipeline_config: PipelineConfig):
|
||||
super().__init__("feature_extraction", pipeline_config)
|
||||
self.region_mapper = RegionMapper()
|
||||
self.name_tagger = NERNameTagger()
|
||||
self.name_tagger = NameTagger()
|
||||
|
||||
@classmethod
|
||||
def requires_batch_mutation(cls) -> bool:
|
||||
|
||||
@@ -6,7 +6,7 @@ from typing import Dict
|
||||
import pandas as pd
|
||||
|
||||
from core.config.pipeline_config import PipelineConfig
|
||||
from processing.ner.ner_name_model import NERNameModel
|
||||
from processing.ner.name_model import NameModel
|
||||
from processing.steps import PipelineStep, NameAnnotation
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ class NERAnnotationStep(PipelineStep):
|
||||
|
||||
self.model_name = "drc_ner_model"
|
||||
self.model_path = pipeline_config.paths.models_dir / "drc_ner_model"
|
||||
self.ner_trainer = NERNameModel(pipeline_config)
|
||||
self.name_model = NameModel(pipeline_config)
|
||||
self.ner_config = pipeline_config.annotation.ner
|
||||
|
||||
# Statistics
|
||||
@@ -35,19 +35,19 @@ class NERAnnotationStep(PipelineStep):
|
||||
try:
|
||||
if self.model_path.exists():
|
||||
logging.info(f"Loading NER model from {self.model_path}")
|
||||
self.ner_trainer.load(str(self.model_path))
|
||||
self.name_model.load(str(self.model_path))
|
||||
logging.info("NER model loaded successfully")
|
||||
else:
|
||||
logging.warning(f"NER model not found at {self.model_path}")
|
||||
logging.warning("NER annotation will be skipped. Train the model first.")
|
||||
self.ner_trainer.nlp = None
|
||||
self.name_model.nlp = None
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to load NER model: {e}")
|
||||
self.ner_trainer.nlp = None
|
||||
self.name_model.nlp = None
|
||||
|
||||
def analyze_name(self, name: str) -> Dict:
|
||||
"""Analyze a name with retry logic"""
|
||||
if self.ner_trainer.nlp is None:
|
||||
if self.name_model.nlp is None:
|
||||
return {
|
||||
"identified_name": None,
|
||||
"identified_surname": None,
|
||||
@@ -62,7 +62,7 @@ class NERAnnotationStep(PipelineStep):
|
||||
start_time = time.time()
|
||||
|
||||
# Get NER predictions
|
||||
prediction = self.ner_trainer.predict(name.lower())
|
||||
prediction = self.name_model.predict(name.lower())
|
||||
entities = prediction.get("entities", [])
|
||||
|
||||
elapsed_time = time.time() - start_time
|
||||
|
||||
@@ -7,7 +7,6 @@ from typing import Optional, Dict, List
|
||||
import pandas as pd
|
||||
|
||||
from core.config import PipelineConfig, get_config
|
||||
|
||||
from research.experiment import ExperimentConfig, ExperimentStatus
|
||||
from research.experiment.experiement_result import ExperimentResult
|
||||
|
||||
|
||||
+5
-13
@@ -2,6 +2,7 @@
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
|
||||
# Add parent directory to Python path to access core modules
|
||||
@@ -13,13 +14,6 @@ from core.utils.data_loader import DataLoader
|
||||
from processing.monitoring.pipeline_monitor import PipelineMonitor
|
||||
from research.experiment.experiment_runner import ExperimentRunner
|
||||
from research.experiment.experiment_tracker import ExperimentTracker
|
||||
from web.interfaces.configuration import Configuration
|
||||
from web.interfaces.dashboard import Dashboard
|
||||
from web.interfaces.data_overview import DataOverview
|
||||
from web.interfaces.data_processing import DataProcessing
|
||||
from web.interfaces.experiments import Experiments
|
||||
from web.interfaces.predictions import Predictions
|
||||
from web.interfaces.results_analysis import ResultsAnalysis
|
||||
|
||||
# Page configuration
|
||||
st.set_page_config(
|
||||
@@ -53,12 +47,10 @@ class StreamlitApp:
|
||||
self.config = config
|
||||
initialize_session_state(config)
|
||||
|
||||
def run(self):
|
||||
st.title("🇨🇩 DRC NERS Pipeline")
|
||||
st.markdown(
|
||||
"A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def run(cls):
|
||||
st.title("🇨🇩 DRC NERS Platform")
|
||||
st.markdown("A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference")
|
||||
st.markdown(
|
||||
"""
|
||||
## Overview
|
||||
|
||||
@@ -230,7 +230,8 @@ class Experiments:
|
||||
|
||||
return experiments
|
||||
|
||||
def _display_experiment_details(self, exp, index: int):
|
||||
@classmethod
|
||||
def _display_experiment_details(cls, exp, index: int):
|
||||
"""Display details for a single experiment"""
|
||||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
@@ -368,64 +369,3 @@ class Experiments:
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"Error running batch experiments: {e}")
|
||||
|
||||
def run_baseline_experiments(self):
|
||||
"""Run baseline experiments"""
|
||||
with st.spinner("Running baseline experiments..."):
|
||||
try:
|
||||
builder = ExperimentBuilder()
|
||||
experiments = builder.create_baseline_experiments()
|
||||
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
|
||||
|
||||
st.success(f"Completed {len(experiment_ids)} baseline experiments")
|
||||
|
||||
# Show quick comparison
|
||||
if experiment_ids:
|
||||
comparison = self.experiment_runner.compare_experiments(experiment_ids)
|
||||
st.write("**Results Summary:**")
|
||||
st.dataframe(
|
||||
comparison[["name", "model_type", "test_accuracy"]],
|
||||
use_container_width=True,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"Error running baseline experiments: {e}")
|
||||
|
||||
def run_ablation_study(self):
|
||||
"""Run feature ablation study"""
|
||||
with st.spinner("Running ablation study..."):
|
||||
try:
|
||||
builder = ExperimentBuilder()
|
||||
experiments = builder.create_feature_ablation_study()
|
||||
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
|
||||
|
||||
st.success(f"Completed {len(experiment_ids)} ablation experiments")
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"Error running ablation study: {e}")
|
||||
|
||||
def run_component_study(self):
|
||||
"""Run name component study"""
|
||||
with st.spinner("Running component study..."):
|
||||
try:
|
||||
builder = ExperimentBuilder()
|
||||
experiments = builder.create_name_component_study()
|
||||
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
|
||||
|
||||
st.success(f"Completed {len(experiment_ids)} component experiments")
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"Error running component study: {e}")
|
||||
|
||||
def run_province_study(self):
|
||||
"""Run province-specific study"""
|
||||
with st.spinner("Running province study..."):
|
||||
try:
|
||||
builder = ExperimentBuilder()
|
||||
experiments = builder.create_province_specific_study()
|
||||
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
|
||||
|
||||
st.success(f"Completed {len(experiment_ids)} province experiments")
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"Error running province study: {e}")
|
||||
|
||||
@@ -38,7 +38,7 @@ class LogReader:
|
||||
|
||||
# Parse log entries from the end
|
||||
entries = []
|
||||
for line in reversed(lines[-count * 2 :]): # Read more lines in case some don't match
|
||||
for line in reversed(lines[-count * 2:]): # Read more lines in case some don't match
|
||||
entry = self._parse_log_line(line.strip())
|
||||
if entry:
|
||||
entries.append(entry)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
|
||||
# Add parent directory to Python path to access core modules
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
|
||||
# Add parent directory to Python path to access core modules
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
|
||||
# Add parent directory to Python path to access core modules
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
|
||||
# Add parent directory to Python path to access core modules
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
|
||||
# Add parent directory to Python path to access core modules
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
|
||||
# Add parent directory to Python path to access core modules
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
|
||||
# Add parent directory to Python path to access core modules
|
||||
|
||||
Reference in New Issue
Block a user