This commit is contained in:
2025-08-16 20:34:45 +02:00
parent 84f7d41a84
commit cf1cbac1a8
36 changed files with 95 additions and 152 deletions
+8
View File
@@ -0,0 +1,8 @@
notebooks/* linguist-vendored
linguist-detectable=false
*.ipynb linguist-detectable=false
# Enforce Unix newlines
*.py text eol=lf
+6 -6
View File
@@ -24,29 +24,29 @@ def build(config: PipelineConfig):
def train(config: PipelineConfig): def train(config: PipelineConfig):
"""Train the NER model.""" """Train the NER model."""
trainer = NameModel(config) name_model = NameModel(config)
data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"] data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
if not data_path.exists(): if not data_path.exists():
logging.info("NER data not found. Building dataset first...") logging.info("NER data not found. Building dataset first...")
build(config) build(config)
trainer.create_blank_model("fr") name_model.create_blank_model("fr")
data = trainer.load_data(str(data_path)) data = name_model.load_data(str(data_path))
split_idx = int(len(data) * 0.9) split_idx = int(len(data) * 0.9)
train_data, eval_data = data[:split_idx], data[split_idx:] train_data, eval_data = data[:split_idx], data[split_idx:]
logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}") logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
trainer.train( name_model.train(
data=train_data, data=train_data,
epochs=config.processing.epochs, epochs=config.processing.epochs,
batch_size=config.processing.batch_size, batch_size=config.processing.batch_size,
dropout_rate=0.3, dropout_rate=0.3,
) )
trainer.evaluate(eval_data) name_model.evaluate(eval_data)
model_path = trainer.save() model_path = name_model.save()
logging.info(f"Model saved to: {model_path}") logging.info(f"Model saved to: {model_path}")
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
+3 -6
View File
@@ -1,8 +1,8 @@
import logging import logging
import time
from typing import Dict, Any
import pandas as pd import pandas as pd
from typing import Dict, Any
import time
from processing.batch.batch_config import BatchConfig from processing.batch.batch_config import BatchConfig
from processing.batch.batch_processor import BatchProcessor from processing.batch.batch_processor import BatchProcessor
@@ -49,9 +49,6 @@ class Pipeline:
"processed_batches": step.state.processed_batches, "processed_batches": step.state.processed_batches,
"total_batches": step.state.total_batches, "total_batches": step.state.total_batches,
"failed_batches": len(step.state.failed_batches), "failed_batches": len(step.state.failed_batches),
"completion_percentage": ( "completion_percentage": (step.state.processed_batches / max(1, step.state.total_batches)) * 100,
step.state.processed_batches / max(1, step.state.total_batches)
)
* 100,
} }
return progress return progress
View File
+2 -2
View File
@@ -7,7 +7,7 @@ import pandas as pd
from core.config.pipeline_config import PipelineConfig from core.config.pipeline_config import PipelineConfig
from core.utils.region_mapper import RegionMapper from core.utils.region_mapper import RegionMapper
from processing.ner.ner_name_tagger import NERNameTagger from processing.ner.name_tagger import NameTagger
from processing.steps import PipelineStep from processing.steps import PipelineStep
@@ -27,7 +27,7 @@ class FeatureExtractionStep(PipelineStep):
def __init__(self, pipeline_config: PipelineConfig): def __init__(self, pipeline_config: PipelineConfig):
super().__init__("feature_extraction", pipeline_config) super().__init__("feature_extraction", pipeline_config)
self.region_mapper = RegionMapper() self.region_mapper = RegionMapper()
self.name_tagger = NERNameTagger() self.name_tagger = NameTagger()
@classmethod @classmethod
def requires_batch_mutation(cls) -> bool: def requires_batch_mutation(cls) -> bool:
+7 -7
View File
@@ -6,7 +6,7 @@ from typing import Dict
import pandas as pd import pandas as pd
from core.config.pipeline_config import PipelineConfig from core.config.pipeline_config import PipelineConfig
from processing.ner.ner_name_model import NERNameModel from processing.ner.name_model import NameModel
from processing.steps import PipelineStep, NameAnnotation from processing.steps import PipelineStep, NameAnnotation
@@ -19,7 +19,7 @@ class NERAnnotationStep(PipelineStep):
self.model_name = "drc_ner_model" self.model_name = "drc_ner_model"
self.model_path = pipeline_config.paths.models_dir / "drc_ner_model" self.model_path = pipeline_config.paths.models_dir / "drc_ner_model"
self.ner_trainer = NERNameModel(pipeline_config) self.name_model = NameModel(pipeline_config)
self.ner_config = pipeline_config.annotation.ner self.ner_config = pipeline_config.annotation.ner
# Statistics # Statistics
@@ -35,19 +35,19 @@ class NERAnnotationStep(PipelineStep):
try: try:
if self.model_path.exists(): if self.model_path.exists():
logging.info(f"Loading NER model from {self.model_path}") logging.info(f"Loading NER model from {self.model_path}")
self.ner_trainer.load(str(self.model_path)) self.name_model.load(str(self.model_path))
logging.info("NER model loaded successfully") logging.info("NER model loaded successfully")
else: else:
logging.warning(f"NER model not found at {self.model_path}") logging.warning(f"NER model not found at {self.model_path}")
logging.warning("NER annotation will be skipped. Train the model first.") logging.warning("NER annotation will be skipped. Train the model first.")
self.ner_trainer.nlp = None self.name_model.nlp = None
except Exception as e: except Exception as e:
logging.error(f"Failed to load NER model: {e}") logging.error(f"Failed to load NER model: {e}")
self.ner_trainer.nlp = None self.name_model.nlp = None
def analyze_name(self, name: str) -> Dict: def analyze_name(self, name: str) -> Dict:
"""Analyze a name with retry logic""" """Analyze a name with retry logic"""
if self.ner_trainer.nlp is None: if self.name_model.nlp is None:
return { return {
"identified_name": None, "identified_name": None,
"identified_surname": None, "identified_surname": None,
@@ -62,7 +62,7 @@ class NERAnnotationStep(PipelineStep):
start_time = time.time() start_time = time.time()
# Get NER predictions # Get NER predictions
prediction = self.ner_trainer.predict(name.lower()) prediction = self.name_model.predict(name.lower())
entities = prediction.get("entities", []) entities = prediction.get("entities", [])
elapsed_time = time.time() - start_time elapsed_time = time.time() - start_time
+2 -2
View File
@@ -41,14 +41,14 @@ class BaseModel(ABC):
@abstractmethod @abstractmethod
def cross_validate( def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5 self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> Dict[str, float] | dict[str, np.floating[Any]]: ) -> Dict[str, float] | dict[str, np.floating[Any]]:
"""Perform cross-validation and return average scores""" """Perform cross-validation and return average scores"""
pass pass
@abstractmethod @abstractmethod
def generate_learning_curve( def generate_learning_curve(
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Generate learning curve data for the model""" """Generate learning curve data for the model"""
pass pass
+7 -7
View File
@@ -158,12 +158,12 @@ class ExperimentRunner:
@classmethod @classmethod
def _create_prediction_examples( def _create_prediction_examples(
cls, cls,
X_test: pd.DataFrame, X_test: pd.DataFrame,
y_test: pd.Series, y_test: pd.Series,
predictions: np.ndarray, predictions: np.ndarray,
model: BaseModel, model: BaseModel,
n_examples: int = 10, n_examples: int = 10,
) -> List[Dict]: ) -> List[Dict]:
"""Create prediction examples for analysis""" """Create prediction examples for analysis"""
examples = [] examples = []
@@ -237,7 +237,7 @@ class ExperimentRunner:
return None return None
def compare_experiments( def compare_experiments(
self, experiment_ids: List[str], metric: str = "accuracy" self, experiment_ids: List[str], metric: str = "accuracy"
) -> pd.DataFrame: ) -> pd.DataFrame:
"""Compare experiments and return analysis""" """Compare experiments and return analysis"""
comparison_df = self.tracker.compare_experiments(experiment_ids) comparison_df = self.tracker.compare_experiments(experiment_ids)
+7 -8
View File
@@ -7,7 +7,6 @@ from typing import Optional, Dict, List
import pandas as pd import pandas as pd
from core.config import PipelineConfig, get_config from core.config import PipelineConfig, get_config
from research.experiment import ExperimentConfig, ExperimentStatus from research.experiment import ExperimentConfig, ExperimentStatus
from research.experiment.experiement_result import ExperimentResult from research.experiment.experiement_result import ExperimentResult
@@ -78,10 +77,10 @@ class ExperimentTracker:
return self._results.get(experiment_id) return self._results.get(experiment_id)
def list_experiments( def list_experiments(
self, self,
status: Optional[ExperimentStatus] = None, status: Optional[ExperimentStatus] = None,
tags: Optional[List[str]] = None, tags: Optional[List[str]] = None,
model_type: Optional[str] = None, model_type: Optional[str] = None,
) -> List[ExperimentResult]: ) -> List[ExperimentResult]:
"""List experiments with optional filtering""" """List experiments with optional filtering"""
results = list(self._results.values()) results = list(self._results.values())
@@ -98,7 +97,7 @@ class ExperimentTracker:
return sorted(results, key=lambda x: x.start_time, reverse=True) return sorted(results, key=lambda x: x.start_time, reverse=True)
def get_best_experiment( def get_best_experiment(
self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
) -> Optional[ExperimentResult]: ) -> Optional[ExperimentResult]:
"""Get the best experiment based on a metric""" """Get the best experiment based on a metric"""
experiments = self.list_experiments() experiments = self.list_experiments()
@@ -160,8 +159,8 @@ class ExperimentTracker:
"""Export all results to CSV""" """Export all results to CSV"""
if output_path is None: if output_path is None:
output_path = ( output_path = (
self.experiments_dir self.experiments_dir
/ f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" / f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
) )
rows = [] rows = []
+1 -1
View File
@@ -43,7 +43,7 @@ class FeatureExtractor:
return features_df return features_df
def _extract_single_feature( def _extract_single_feature(
self, df: pd.DataFrame, feature_type: FeatureType self, df: pd.DataFrame, feature_type: FeatureType
) -> Union[pd.Series, pd.DataFrame]: ) -> Union[pd.Series, pd.DataFrame]:
"""Extract a single type of feature""" """Extract a single type of feature"""
if feature_type == FeatureType.FULL_NAME: if feature_type == FeatureType.FULL_NAME:
+8 -8
View File
@@ -27,13 +27,13 @@ class ModelTrainer:
self.models_dir.mkdir(parents=True, exist_ok=True) self.models_dir.mkdir(parents=True, exist_ok=True)
def train_single_model( def train_single_model(
self, self,
model_name: str, model_name: str,
model_type: str = "logistic_regression", model_type: str = "logistic_regression",
features: List[str] = None, features: List[str] = None,
model_params: Dict[str, Any] = None, model_params: Dict[str, Any] = None,
tags: List[str] = None, tags: List[str] = None,
save_artifacts: bool = True, save_artifacts: bool = True,
) -> str: ) -> str:
""" """
Train a single model and save its artifacts. Train a single model and save its artifacts.
@@ -75,7 +75,7 @@ class ModelTrainer:
return experiment_id return experiment_id
def train_multiple_models( def train_multiple_models(
self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
) -> List[str]: ) -> List[str]:
""" """
Train multiple models with different configurations. Train multiple models with different configurations.
+2 -2
View File
@@ -83,7 +83,7 @@ class NeuralNetworkModel(BaseModel):
return self return self
def cross_validate( def cross_validate(
self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5 self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
) -> dict[str, np.floating[Any]]: ) -> dict[str, np.floating[Any]]:
features_df = self.feature_extractor.extract_features(X) features_df = self.feature_extractor.extract_features(X)
X_prepared = self.prepare_features(features_df) X_prepared = self.prepare_features(features_df)
@@ -140,7 +140,7 @@ class NeuralNetworkModel(BaseModel):
} }
def generate_learning_curve( def generate_learning_curve(
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Generate learning curve data for the model""" """Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}") logging.info(f"Generating learning curve for {self.__class__.__name__}")
+1 -1
View File
@@ -93,7 +93,7 @@ class TraditionalModel(BaseModel):
return results return results
def generate_learning_curve( def generate_learning_curve(
self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Generate learning curve data for the model""" """Generate learning curve data for the model"""
logging.info(f"Generating learning curve for {self.__class__.__name__}") logging.info(f"Generating learning curve for {self.__class__.__name__}")
+6 -14
View File
@@ -2,6 +2,7 @@
import argparse import argparse
import sys import sys
from pathlib import Path from pathlib import Path
import streamlit as st import streamlit as st
# Add parent directory to Python path to access core modules # Add parent directory to Python path to access core modules
@@ -13,13 +14,6 @@ from core.utils.data_loader import DataLoader
from processing.monitoring.pipeline_monitor import PipelineMonitor from processing.monitoring.pipeline_monitor import PipelineMonitor
from research.experiment.experiment_runner import ExperimentRunner from research.experiment.experiment_runner import ExperimentRunner
from research.experiment.experiment_tracker import ExperimentTracker from research.experiment.experiment_tracker import ExperimentTracker
from web.interfaces.configuration import Configuration
from web.interfaces.dashboard import Dashboard
from web.interfaces.data_overview import DataOverview
from web.interfaces.data_processing import DataProcessing
from web.interfaces.experiments import Experiments
from web.interfaces.predictions import Predictions
from web.interfaces.results_analysis import ResultsAnalysis
# Page configuration # Page configuration
st.set_page_config( st.set_page_config(
@@ -53,12 +47,10 @@ class StreamlitApp:
self.config = config self.config = config
initialize_session_state(config) initialize_session_state(config)
def run(self): @classmethod
st.title("🇨🇩 DRC NERS Pipeline") def run(cls):
st.markdown( st.title("🇨🇩 DRC NERS Platform")
"A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference" st.markdown("A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference")
)
st.markdown( st.markdown(
""" """
## Overview ## Overview
@@ -67,7 +59,7 @@ class StreamlitApp:
data. data.
This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5 This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5
million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata. million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
""" """
) )
+23 -83
View File
@@ -13,7 +13,7 @@ from research.model_registry import list_available_models
class Experiments: class Experiments:
def __init__( def __init__(
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
): ):
self.config = config self.config = config
self.experiment_tracker = experiment_tracker self.experiment_tracker = experiment_tracker
@@ -113,18 +113,18 @@ class Experiments:
) )
def _handle_experiment_submission( def _handle_experiment_submission(
self, self,
exp_name: str, exp_name: str,
description: str, description: str,
model_type: str, model_type: str,
selected_features: List[str], selected_features: List[str],
model_params: Dict[str, Any], model_params: Dict[str, Any],
test_size: float, test_size: float,
cv_folds: int, cv_folds: int,
tags: str, tags: str,
filter_province: str, filter_province: str,
min_words: int, min_words: int,
max_words: int, max_words: int,
): ):
"""Handle experiment form submission""" """Handle experiment form submission"""
if not exp_name: if not exp_name:
@@ -209,7 +209,7 @@ class Experiments:
# Display experiments # Display experiments
for i, exp in enumerate(experiments): for i, exp in enumerate(experiments):
with st.expander( with st.expander(
f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}" f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
): ):
self._display_experiment_details(exp, i) self._display_experiment_details(exp, i)
@@ -230,7 +230,8 @@ class Experiments:
return experiments return experiments
def _display_experiment_details(self, exp, index: int): @classmethod
def _display_experiment_details(cls, exp, index: int):
"""Display details for a single experiment""" """Display details for a single experiment"""
col1, col2, col3 = st.columns(3) col1, col2, col3 = st.columns(3)
@@ -295,13 +296,13 @@ class Experiments:
) )
def run_batch_experiments( def run_batch_experiments(
self, self,
base_name: str, base_name: str,
model_types: List[str], model_types: List[str],
ngram_ranges: str, ngram_ranges: str,
feature_combinations: List[str], feature_combinations: List[str],
test_sizes: str, test_sizes: str,
tags: str, tags: str,
): ):
"""Run batch experiments with parameter combinations""" """Run batch experiments with parameter combinations"""
with st.spinner("Running batch experiments..."): with st.spinner("Running batch experiments..."):
@@ -368,64 +369,3 @@ class Experiments:
except Exception as e: except Exception as e:
st.error(f"Error running batch experiments: {e}") st.error(f"Error running batch experiments: {e}")
def run_baseline_experiments(self):
"""Run baseline experiments"""
with st.spinner("Running baseline experiments..."):
try:
builder = ExperimentBuilder()
experiments = builder.create_baseline_experiments()
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
st.success(f"Completed {len(experiment_ids)} baseline experiments")
# Show quick comparison
if experiment_ids:
comparison = self.experiment_runner.compare_experiments(experiment_ids)
st.write("**Results Summary:**")
st.dataframe(
comparison[["name", "model_type", "test_accuracy"]],
use_container_width=True,
)
except Exception as e:
st.error(f"Error running baseline experiments: {e}")
def run_ablation_study(self):
"""Run feature ablation study"""
with st.spinner("Running ablation study..."):
try:
builder = ExperimentBuilder()
experiments = builder.create_feature_ablation_study()
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
st.success(f"Completed {len(experiment_ids)} ablation experiments")
except Exception as e:
st.error(f"Error running ablation study: {e}")
def run_component_study(self):
"""Run name component study"""
with st.spinner("Running component study..."):
try:
builder = ExperimentBuilder()
experiments = builder.create_name_component_study()
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
st.success(f"Completed {len(experiment_ids)} component experiments")
except Exception as e:
st.error(f"Error running component study: {e}")
def run_province_study(self):
"""Run province-specific study"""
with st.spinner("Running province study..."):
try:
builder = ExperimentBuilder()
experiments = builder.create_province_specific_study()
experiment_ids = self.experiment_runner.run_experiment_batch(experiments)
st.success(f"Completed {len(experiment_ids)} province experiments")
except Exception as e:
st.error(f"Error running province study: {e}")
+1 -1
View File
@@ -38,7 +38,7 @@ class LogReader:
# Parse log entries from the end # Parse log entries from the end
entries = [] entries = []
for line in reversed(lines[-count * 2 :]): # Read more lines in case some don't match for line in reversed(lines[-count * 2:]): # Read more lines in case some don't match
entry = self._parse_log_line(line.strip()) entry = self._parse_log_line(line.strip())
if entry: if entry:
entries.append(entry) entries.append(entry)
+3 -3
View File
@@ -13,7 +13,7 @@ from research.experiment.experiment_tracker import ExperimentTracker
class Predictions: class Predictions:
def __init__( def __init__(
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
): ):
self.config = config self.config = config
self.experiment_tracker = experiment_tracker self.experiment_tracker = experiment_tracker
@@ -114,7 +114,7 @@ class Predictions:
return None return None
def _display_single_prediction_results( def _display_single_prediction_results(
self, prediction: str, confidence: Optional[float], experiment, name_input: str self, prediction: str, confidence: Optional[float], experiment, name_input: str
): ):
"""Display single prediction results""" """Display single prediction results"""
col1, col2 = st.columns(2) col1, col2 = st.columns(2)
@@ -300,7 +300,7 @@ class Predictions:
return pd.DataFrame() return pd.DataFrame()
def _run_dataset_prediction( def _run_dataset_prediction(
self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
): ):
"""Run dataset prediction and display results""" """Run dataset prediction and display results"""
with st.spinner("Running predictions..."): with st.spinner("Running predictions..."):
+1 -1
View File
@@ -12,7 +12,7 @@ from research.experiment.experiment_tracker import ExperimentTracker
class ResultsAnalysis: class ResultsAnalysis:
def __init__( def __init__(
self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
): ):
self.config = config self.config = config
self.experiment_tracker = experiment_tracker self.experiment_tracker = experiment_tracker
+1
View File
@@ -1,5 +1,6 @@
import sys import sys
from pathlib import Path from pathlib import Path
import streamlit as st import streamlit as st
# Add parent directory to Python path to access core modules # Add parent directory to Python path to access core modules
+1
View File
@@ -1,5 +1,6 @@
import sys import sys
from pathlib import Path from pathlib import Path
import streamlit as st import streamlit as st
# Add parent directory to Python path to access core modules # Add parent directory to Python path to access core modules
+1
View File
@@ -1,5 +1,6 @@
import sys import sys
from pathlib import Path from pathlib import Path
import streamlit as st import streamlit as st
# Add parent directory to Python path to access core modules # Add parent directory to Python path to access core modules
+1
View File
@@ -1,5 +1,6 @@
import sys import sys
from pathlib import Path from pathlib import Path
import streamlit as st import streamlit as st
# Add parent directory to Python path to access core modules # Add parent directory to Python path to access core modules
+1
View File
@@ -1,5 +1,6 @@
import sys import sys
from pathlib import Path from pathlib import Path
import streamlit as st import streamlit as st
# Add parent directory to Python path to access core modules # Add parent directory to Python path to access core modules
+1
View File
@@ -1,5 +1,6 @@
import sys import sys
from pathlib import Path from pathlib import Path
import streamlit as st import streamlit as st
# Add parent directory to Python path to access core modules # Add parent directory to Python path to access core modules
+1
View File
@@ -1,5 +1,6 @@
import sys import sys
from pathlib import Path from pathlib import Path
import streamlit as st import streamlit as st
# Add parent directory to Python path to access core modules # Add parent directory to Python path to access core modules