hotfixes

2025-08-16 20:34:45 +02:00
parent 84f7d41a84
commit cf1cbac1a8
36 changed files with 95 additions and 152 deletions
@@ -0,0 +1,8 @@
 notebooks/* linguist-vendored
 linguist-detectable=false
 *.ipynb linguist-detectable=false
 # Enforce Unix newlines
 *.py   text eol=lf
@@ -24,29 +24,29 @@ def build(config: PipelineConfig):
 def train(config: PipelineConfig):
    """Train the NER model."""
    # Steps: ensure the NER data file exists, create a blank "fr" model, load the
    # data, split it into train/eval parts, train, evaluate, then save the model.
-    trainer = NameModel(config)
+    name_model = NameModel(config)
    data_path = Path(config.paths.data_dir) / config.data.output_files["ner_data"]
    # build(config) runs first when the data file is missing
    # (presumably it writes data_path -- TODO confirm against build()).
    if not data_path.exists():
        logging.info("NER data not found. Building dataset first...")
        build(config)
-    trainer.create_blank_model("fr")
+    name_model.create_blank_model("fr")
-    data = trainer.load_data(str(data_path))
+    data = name_model.load_data(str(data_path))
    # The first 90% of the loaded examples go to training, the remainder to evaluation.
    split_idx = int(len(data) * 0.9)
    train_data, eval_data = data[:split_idx], data[split_idx:]
    logging.info(f"Training with {len(train_data)} examples, evaluating on {len(eval_data)}")
-    trainer.train(
+    name_model.train(
        data=train_data,
        epochs=config.processing.epochs,
        batch_size=config.processing.batch_size,
        # Dropout is fixed here rather than read from config.
        dropout_rate=0.3,
    )
-    trainer.evaluate(eval_data)
+    name_model.evaluate(eval_data)
-    model_path = trainer.save()
+    model_path = name_model.save()
    logging.info(f"Model saved to: {model_path}")
@@ -1,8 +1,8 @@
 import logging
 import time
 from typing import Dict, Any
 import pandas as pd
 from typing import Dict, Any
 import time
 from processing.batch.batch_config import BatchConfig
 from processing.batch.batch_processor import BatchProcessor
@@ -49,9 +49,6 @@ class Pipeline:
                "processed_batches": step.state.processed_batches,
                "total_batches": step.state.total_batches,
                "failed_batches": len(step.state.failed_batches),
-                "completion_percentage": (
+                "completion_percentage": (step.state.processed_batches / max(1, step.state.total_batches)) * 100,
                    step.state.processed_batches / max(1, step.state.total_batches)
                )
                * 100,
            }
        return progress
@@ -7,7 +7,7 @@ import pandas as pd
 from core.config.pipeline_config import PipelineConfig
 from core.utils.region_mapper import RegionMapper
-from processing.ner.ner_name_tagger import NERNameTagger
+from processing.ner.name_tagger import NameTagger
 from processing.steps import PipelineStep
@@ -27,7 +27,7 @@ class FeatureExtractionStep(PipelineStep):
    def __init__(self, pipeline_config: PipelineConfig):
        super().__init__("feature_extraction", pipeline_config)
        self.region_mapper = RegionMapper()
-        self.name_tagger = NERNameTagger()
+        self.name_tagger = NameTagger()
    @classmethod
    def requires_batch_mutation(cls) -> bool:
@@ -6,7 +6,7 @@ from typing import Dict
 import pandas as pd
 from core.config.pipeline_config import PipelineConfig
-from processing.ner.ner_name_model import NERNameModel
+from processing.ner.name_model import NameModel
 from processing.steps import PipelineStep, NameAnnotation
@@ -19,7 +19,7 @@ class NERAnnotationStep(PipelineStep):
        self.model_name = "drc_ner_model"
        self.model_path = pipeline_config.paths.models_dir / "drc_ner_model"
-        self.ner_trainer = NERNameModel(pipeline_config)
+        self.name_model = NameModel(pipeline_config)
        self.ner_config = pipeline_config.annotation.ner
        # Statistics
@@ -35,19 +35,19 @@ class NERAnnotationStep(PipelineStep):
        try:
            if self.model_path.exists():
                logging.info(f"Loading NER model from {self.model_path}")
-                self.ner_trainer.load(str(self.model_path))
+                self.name_model.load(str(self.model_path))
                logging.info("NER model loaded successfully")
            else:
                logging.warning(f"NER model not found at {self.model_path}")
                logging.warning("NER annotation will be skipped. Train the model first.")
-                self.ner_trainer.nlp = None
+                self.name_model.nlp = None
        except Exception as e:
            logging.error(f"Failed to load NER model: {e}")
-            self.ner_trainer.nlp = None
+            self.name_model.nlp = None
    def analyze_name(self, name: str) -> Dict:
        """Analyze a name with retry logic"""
-        if self.ner_trainer.nlp is None:
+        if self.name_model.nlp is None:
            return {
                "identified_name": None,
                "identified_surname": None,
@@ -62,7 +62,7 @@ class NERAnnotationStep(PipelineStep):
                start_time = time.time()
                # Get NER predictions
-                prediction = self.ner_trainer.predict(name.lower())
+                prediction = self.name_model.predict(name.lower())
                entities = prediction.get("entities", [])
                elapsed_time = time.time() - start_time
@@ -41,14 +41,14 @@ class BaseModel(ABC):
    @abstractmethod
    def cross_validate(
-        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
+            self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> Dict[str, float] | dict[str, np.floating[Any]]:
        """Perform cross-validation and return average scores"""
        pass
    @abstractmethod
    def generate_learning_curve(
-        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
+            self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        pass
@@ -158,12 +158,12 @@ class ExperimentRunner:
    @classmethod
    def _create_prediction_examples(
-        cls,
+            cls,
-        X_test: pd.DataFrame,
+            X_test: pd.DataFrame,
-        y_test: pd.Series,
+            y_test: pd.Series,
-        predictions: np.ndarray,
+            predictions: np.ndarray,
-        model: BaseModel,
+            model: BaseModel,
-        n_examples: int = 10,
+            n_examples: int = 10,
    ) -> List[Dict]:
        """Create prediction examples for analysis"""
        examples = []
@@ -237,7 +237,7 @@ class ExperimentRunner:
        return None
    def compare_experiments(
-        self, experiment_ids: List[str], metric: str = "accuracy"
+            self, experiment_ids: List[str], metric: str = "accuracy"
    ) -> pd.DataFrame:
        """Compare experiments and return analysis"""
        comparison_df = self.tracker.compare_experiments(experiment_ids)
@@ -7,7 +7,6 @@ from typing import Optional, Dict, List
 import pandas as pd
 from core.config import PipelineConfig, get_config
 from research.experiment import ExperimentConfig, ExperimentStatus
 from research.experiment.experiement_result import ExperimentResult
@@ -78,10 +77,10 @@ class ExperimentTracker:
        return self._results.get(experiment_id)
    def list_experiments(
-        self,
+            self,
-        status: Optional[ExperimentStatus] = None,
+            status: Optional[ExperimentStatus] = None,
-        tags: Optional[List[str]] = None,
+            tags: Optional[List[str]] = None,
-        model_type: Optional[str] = None,
+            model_type: Optional[str] = None,
    ) -> List[ExperimentResult]:
        """List experiments with optional filtering"""
        results = list(self._results.values())
@@ -98,7 +97,7 @@ class ExperimentTracker:
        return sorted(results, key=lambda x: x.start_time, reverse=True)
    def get_best_experiment(
-        self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
+            self, metric: str = "accuracy", dataset: str = "test", filters: Optional[Dict] = None
    ) -> Optional[ExperimentResult]:
        """Get the best experiment based on a metric"""
        experiments = self.list_experiments()
@@ -160,8 +159,8 @@ class ExperimentTracker:
        """Export all results to CSV"""
        if output_path is None:
            output_path = (
-                self.experiments_dir
+                    self.experiments_dir
-                / f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+                    / f"experiments_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            )
        rows = []
@@ -43,7 +43,7 @@ class FeatureExtractor:
        return features_df
    def _extract_single_feature(
-        self, df: pd.DataFrame, feature_type: FeatureType
+            self, df: pd.DataFrame, feature_type: FeatureType
    ) -> Union[pd.Series, pd.DataFrame]:
        """Extract a single type of feature"""
        if feature_type == FeatureType.FULL_NAME:
@@ -27,13 +27,13 @@ class ModelTrainer:
        self.models_dir.mkdir(parents=True, exist_ok=True)
    def train_single_model(
-        self,
+            self,
-        model_name: str,
+            model_name: str,
-        model_type: str = "logistic_regression",
+            model_type: str = "logistic_regression",
-        features: List[str] = None,
+            features: List[str] = None,
-        model_params: Dict[str, Any] = None,
+            model_params: Dict[str, Any] = None,
-        tags: List[str] = None,
+            tags: List[str] = None,
-        save_artifacts: bool = True,
+            save_artifacts: bool = True,
    ) -> str:
        """
        Train a single model and save its artifacts.
@@ -75,7 +75,7 @@ class ModelTrainer:
        return experiment_id
    def train_multiple_models(
-        self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
+            self, base_name: str, model_configs: List[Dict[str, Any]], save_all: bool = True
    ) -> List[str]:
        """
        Train multiple models with different configurations.
@@ -83,7 +83,7 @@ class NeuralNetworkModel(BaseModel):
        return self
    def cross_validate(
-        self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
+            self, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5
    ) -> dict[str, np.floating[Any]]:
        features_df = self.feature_extractor.extract_features(X)
        X_prepared = self.prepare_features(features_df)
@@ -140,7 +140,7 @@ class NeuralNetworkModel(BaseModel):
        }
    def generate_learning_curve(
-        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
+            self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        logging.info(f"Generating learning curve for {self.__class__.__name__}")
@@ -93,7 +93,7 @@ class TraditionalModel(BaseModel):
        return results
    def generate_learning_curve(
-        self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
+            self, X: pd.DataFrame, y: pd.Series, train_sizes: List[float] = None
    ) -> Dict[str, Any]:
        """Generate learning curve data for the model"""
        logging.info(f"Generating learning curve for {self.__class__.__name__}")
@@ -2,6 +2,7 @@
 import argparse
 import sys
 from pathlib import Path
 import streamlit as st
 # Add parent directory to Python path to access core modules
@@ -13,13 +14,6 @@ from core.utils.data_loader import DataLoader
 from processing.monitoring.pipeline_monitor import PipelineMonitor
 from research.experiment.experiment_runner import ExperimentRunner
 from research.experiment.experiment_tracker import ExperimentTracker
 from web.interfaces.configuration import Configuration
 from web.interfaces.dashboard import Dashboard
 from web.interfaces.data_overview import DataOverview
 from web.interfaces.data_processing import DataProcessing
 from web.interfaces.experiments import Experiments
 from web.interfaces.predictions import Predictions
 from web.interfaces.results_analysis import ResultsAnalysis
 # Page configuration
 st.set_page_config(
@@ -53,12 +47,10 @@ class StreamlitApp:
        self.config = config
        initialize_session_state(config)
-    def run(self):
+    @classmethod
-        st.title("🇨🇩 DRC NERS Pipeline")
+    def run(cls):
-        st.markdown(
+        st.title("🇨🇩 DRC NERS Platform")
-            "A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference"
+        st.markdown("A Culturally-Aware NLP System for Congolese Name Analysis and Gender Inference")
        )
        st.markdown(
            """
            ## Overview
@@ -67,7 +59,7 @@ class StreamlitApp:
            data.
            This project introduces a comprehensive pipeline for Congolese name analysis with a large-scale dataset of over 5
            million names from the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata.
-        """
+            """
        )
@@ -13,7 +13,7 @@ from research.model_registry import list_available_models
 class Experiments:
    def __init__(
-        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
+            self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
        self.config = config
        self.experiment_tracker = experiment_tracker
@@ -113,18 +113,18 @@ class Experiments:
                )
    def _handle_experiment_submission(
-        self,
+            self,
-        exp_name: str,
+            exp_name: str,
-        description: str,
+            description: str,
-        model_type: str,
+            model_type: str,
-        selected_features: List[str],
+            selected_features: List[str],
-        model_params: Dict[str, Any],
+            model_params: Dict[str, Any],
-        test_size: float,
+            test_size: float,
-        cv_folds: int,
+            cv_folds: int,
-        tags: str,
+            tags: str,
-        filter_province: str,
+            filter_province: str,
-        min_words: int,
+            min_words: int,
-        max_words: int,
+            max_words: int,
    ):
        """Handle experiment form submission"""
        if not exp_name:
@@ -209,7 +209,7 @@ class Experiments:
        # Display experiments
        for i, exp in enumerate(experiments):
            with st.expander(
-                f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
+                    f"{exp.config.name} - {exp.status.value} - {exp.start_time.strftime('%Y-%m-%d %H:%M')}"
            ):
                self._display_experiment_details(exp, i)
@@ -230,7 +230,8 @@ class Experiments:
        return experiments
-    def _display_experiment_details(self, exp, index: int):
+    @classmethod
    def _display_experiment_details(cls, exp, index: int):
        """Display details for a single experiment"""
        col1, col2, col3 = st.columns(3)
@@ -295,13 +296,13 @@ class Experiments:
                )
    def run_batch_experiments(
-        self,
+            self,
-        base_name: str,
+            base_name: str,
-        model_types: List[str],
+            model_types: List[str],
-        ngram_ranges: str,
+            ngram_ranges: str,
-        feature_combinations: List[str],
+            feature_combinations: List[str],
-        test_sizes: str,
+            test_sizes: str,
-        tags: str,
+            tags: str,
    ):
        """Run batch experiments with parameter combinations"""
        with st.spinner("Running batch experiments..."):
@@ -368,64 +369,3 @@ class Experiments:
            except Exception as e:
                st.error(f"Error running batch experiments: {e}")
    def run_baseline_experiments(self):
        """Execute the baseline experiment batch and show a short results table.

        The experiments come from ``ExperimentBuilder.create_baseline_experiments()``
        and are run through ``self.experiment_runner``. Any exception raised along
        the way is reported with ``st.error`` instead of propagating.
        """
        with st.spinner("Running baseline experiments..."):
            try:
                planned = ExperimentBuilder().create_baseline_experiments()
                experiment_ids = self.experiment_runner.run_experiment_batch(planned)
                st.success(f"Completed {len(experiment_ids)} baseline experiments")

                # Nothing to compare when the batch produced no experiment ids.
                if not experiment_ids:
                    return

                comparison = self.experiment_runner.compare_experiments(experiment_ids)
                st.write("**Results Summary:**")
                summary_columns = ["name", "model_type", "test_accuracy"]
                st.dataframe(
                    comparison[summary_columns],
                    use_container_width=True,
                )
            except Exception as e:
                st.error(f"Error running baseline experiments: {e}")
    def run_ablation_study(self):
        """Execute the feature ablation experiment batch.

        The experiments come from ``ExperimentBuilder.create_feature_ablation_study()``
        and are run through ``self.experiment_runner``. The number of returned
        experiment ids is shown on success; any exception is reported with
        ``st.error`` instead of propagating.
        """
        with st.spinner("Running ablation study..."):
            try:
                planned = ExperimentBuilder().create_feature_ablation_study()
                experiment_ids = self.experiment_runner.run_experiment_batch(planned)
            except Exception as e:
                st.error(f"Error running ablation study: {e}")
                return
            try:
                st.success(f"Completed {len(experiment_ids)} ablation experiments")
            except Exception as e:
                st.error(f"Error running ablation study: {e}")
    def run_component_study(self):
        """Execute the name component experiment batch.

        The experiments come from ``ExperimentBuilder.create_name_component_study()``
        and are run through ``self.experiment_runner``. The number of returned
        experiment ids is shown on success; any exception is reported with
        ``st.error`` instead of propagating.
        """
        with st.spinner("Running component study..."):
            try:
                experiment_ids = self.experiment_runner.run_experiment_batch(
                    ExperimentBuilder().create_name_component_study()
                )
                st.success(f"Completed {len(experiment_ids)} component experiments")
            except Exception as e:
                # UI boundary: surface the failure to the user rather than crash the page.
                st.error(f"Error running component study: {e}")
    def run_province_study(self):
        """Execute the province-specific experiment batch.

        The experiments come from ``ExperimentBuilder.create_province_specific_study()``
        and are run through ``self.experiment_runner``. The number of returned
        experiment ids is shown on success; any exception is reported with
        ``st.error`` instead of propagating.
        """
        with st.spinner("Running province study..."):
            try:
                study_builder = ExperimentBuilder()
                province_experiments = study_builder.create_province_specific_study()
                runner = self.experiment_runner
                experiment_ids = runner.run_experiment_batch(province_experiments)
                st.success(f"Completed {len(experiment_ids)} province experiments")
            except Exception as e:
                # UI boundary: surface the failure to the user rather than crash the page.
                st.error(f"Error running province study: {e}")
@@ -38,7 +38,7 @@ class LogReader:
            # Parse log entries from the end
            entries = []
-            for line in reversed(lines[-count * 2 :]):  # Read more lines in case some don't match
+            for line in reversed(lines[-count * 2:]):  # Read more lines in case some don't match
                entry = self._parse_log_line(line.strip())
                if entry:
                    entries.append(entry)
@@ -13,7 +13,7 @@ from research.experiment.experiment_tracker import ExperimentTracker
 class Predictions:
    def __init__(
-        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
+            self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
        self.config = config
        self.experiment_tracker = experiment_tracker
@@ -114,7 +114,7 @@ class Predictions:
            return None
    def _display_single_prediction_results(
-        self, prediction: str, confidence: Optional[float], experiment, name_input: str
+            self, prediction: str, confidence: Optional[float], experiment, name_input: str
    ):
        """Display single prediction results"""
        col1, col2 = st.columns(2)
@@ -300,7 +300,7 @@ class Predictions:
            return pd.DataFrame()
    def _run_dataset_prediction(
-        self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
+            self, df: pd.DataFrame, experiment, sample_size: int, compare_with_actual: bool
    ):
        """Run dataset prediction and display results"""
        with st.spinner("Running predictions..."):
@@ -12,7 +12,7 @@ from research.experiment.experiment_tracker import ExperimentTracker
 class ResultsAnalysis:
    def __init__(
-        self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
+            self, config, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
    ):
        self.config = config
        self.experiment_tracker = experiment_tracker
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
 import streamlit as st
 # Add parent directory to Python path to access core modules
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
 import streamlit as st
 # Add parent directory to Python path to access core modules
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
 import streamlit as st
 # Add parent directory to Python path to access core modules
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
 import streamlit as st
 # Add parent directory to Python path to access core modules
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
 import streamlit as st
 # Add parent directory to Python path to access core modules
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
 import streamlit as st
 # Add parent directory to Python path to access core modules
@@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
 import streamlit as st
 # Add parent directory to Python path to access core modules