feat: enhance logging and memory management across modules

2025-08-13 23:09:05 +02:00
parent 47e52d130c
commit 9601c5e44d
48 changed files with 1004 additions and 773 deletions
Shown file: +69 -45
@@ -48,7 +48,7 @@ class NERNameModel:
         logging.info(f"Loading training data from {data_path}")
-        with open(data_path, 'r', encoding='utf-8') as f:
+        with open(data_path, "r", encoding="utf-8") as f:
             raw_data = json.load(f)
 
         # Validate and clean training data
@@ -58,7 +58,9 @@ class NERNameModel:
         for i, item in enumerate(raw_data):
             try:
                 if not isinstance(item, (list, tuple)) or len(item) != 2:
-                    logging.warning(f"Skipping invalid training example format at index {i}: {item}")
+                    logging.warning(
+                        f"Skipping invalid training example format at index {i}: {item}"
+                    )
                     skipped_count += 1
                     continue
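
Note on the format being validated here: each training example must be a two-item pair of text and entities. A minimal sketch of input that passes this check, with invented sample names and a hypothetical file path:

    import json

    # Each example is [text, entities]; entities may be a list of
    # (start, end, label) triples or the tagger's string form, which
    # is parsed further down in this diff.
    sample = [
        ["Aiyana Whitehorse", [[0, 6, "NATIVE"], [7, 17, "SURNAME"]]],
        ["Tala Redcloud", "[(0, 4, 'NATIVE'), (5, 13, 'SURNAME')]"],
    ]
    with open("training_data.json", "w", encoding="utf-8") as f:
        json.dump(sample, f, ensure_ascii=False, indent=2)
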
@@ -83,20 +85,27 @@ class NERNameModel:
                     # String format from tagger: "[(0, 6, 'NATIVE'), ...]"
                     try:
                         import ast
+
                         entities = ast.literal_eval(entities_raw)
                         if not isinstance(entities, list):
-                            logging.warning(f"Parsed entities is not a list at index {i}: {entities}")
+                            logging.warning(
+                                f"Parsed entities is not a list at index {i}: {entities}"
+                            )
                             skipped_count += 1
                             continue
                     except (ValueError, SyntaxError) as e:
-                        logging.warning(f"Failed to parse entity string at index {i}: {entities_raw} ({e})")
+                        logging.warning(
+                            f"Failed to parse entity string at index {i}: {entities_raw} ({e})"
+                        )
                         skipped_count += 1
                         continue
                 elif isinstance(entities_raw, list):
                     # Already in list format
                     entities = entities_raw
                 else:
-                    logging.warning(f"Skipping invalid entities format at index {i}: {entities_raw}")
+                    logging.warning(
+                        f"Skipping invalid entities format at index {i}: {entities_raw}"
+                    )
                     skipped_count += 1
                     continue
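
Note: ast.literal_eval (used above) evaluates only Python literals, so a malformed or hostile entity string raises ValueError/SyntaxError instead of executing code. A small self-contained demonstration:

    import ast

    entities_raw = "[(0, 6, 'NATIVE'), (7, 17, 'SURNAME')]"
    entities = ast.literal_eval(entities_raw)
    print(entities[0])  # (0, 6, 'NATIVE')

    try:
        ast.literal_eval("__import__('os')")  # not a literal
    except ValueError:
        print("non-literal input rejected without executing anything")
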
@@ -110,16 +119,20 @@ class NERNameModel:
                     start, end, label = entity
 
                     # Validate entity components
-                    if (not isinstance(start, int) or not isinstance(end, int) or
-                        not isinstance(label, str) or start >= end or
-                        start < 0 or end > len(text)):
+                    if (
+                        not isinstance(start, int)
+                        or not isinstance(end, int)
+                        or not isinstance(label, str)
+                        or start >= end
+                        or start < 0
+                        or end > len(text)
+                    ):
                         logging.warning(f"Skipping invalid entity bounds in '{text}': {entity}")
                         continue
 
                     # Check for overlaps with already validated entities
                     has_overlap = any(
-                        start < v_end and end > v_start
-                        for v_start, v_end, _ in valid_entities
+                        start < v_end and end > v_start for v_start, v_end, _ in valid_entities
                     )
 
                     if has_overlap:
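
Note: the overlap test above is the standard half-open interval check. Two spans [a, b) and [c, d) intersect exactly when a < d and b > c, so spans that merely touch (b == c) are allowed to coexist:

    def overlaps(a_start: int, a_end: int, b_start: int, b_end: int) -> bool:
        # Half-open spans intersect iff each one starts before the other ends.
        return a_start < b_end and a_end > b_start

    assert overlaps(0, 6, 4, 10)        # partial overlap
    assert overlaps(0, 10, 2, 5)        # containment counts as overlap
    assert not overlaps(0, 6, 6, 12)    # adjacent spans share no character
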
@@ -128,8 +141,10 @@ class NERNameModel:
                     # Validate that the span doesn't contain spaces (matching tagger validation)
                     span_text = text[start:end]
-                    if not span_text or span_text != span_text.strip() or ' ' in span_text:
-                        logging.warning(f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'")
+                    if not span_text or span_text != span_text.strip() or " " in span_text:
+                        logging.warning(
+                            f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
+                        )
                         continue
 
                     valid_entities.append((start, end, label))
@@ -148,7 +163,9 @@ class NERNameModel:
                 skipped_count += 1
                 continue
 
-        logging.info(f"Loaded {len(valid_data)} valid training examples, skipped {skipped_count} invalid ones")
+        logging.info(
+            f"Loaded {len(valid_data)} valid training examples, skipped {skipped_count} invalid ones"
+        )
 
         if not valid_data:
             raise ValueError("No valid training examples found in the data")
@@ -156,15 +173,17 @@ class NERNameModel:
         return valid_data
 
     def train(
-            self,
-            data: List[Tuple[str, Dict]],
-            epochs: int = 5,
-            batch_size: int = 16,
-            dropout_rate: float = 0.2,
+        self,
+        data: List[Tuple[str, Dict]],
+        epochs: int = 5,
+        batch_size: int = 16,
+        dropout_rate: float = 0.2,
     ) -> None:
         """Train the NER model"""
         logging.info(f"Starting NER training with {len(data)} examples")
-        logging.info(f"Training parameters: epochs={epochs}, batch_size={batch_size}, dropout={dropout_rate}")
+        logging.info(
+            f"Training parameters: epochs={epochs}, batch_size={batch_size}, dropout={dropout_rate}"
+        )
 
         if self.nlp is None:
             raise ValueError("Model not initialized. Call create_blank_model() first.")
@@ -184,16 +203,15 @@ class NERNameModel:
                 doc = self.nlp.make_doc(text)
                 example = Example.from_dict(doc, annotations)
                 examples.append(example)
-                logging.info(f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}")
+                logging.info(
+                    f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}"
+                )
 
             # Train in batches
             batches = minibatch(examples, size=batch_size)
             for batch in batches:
                 self.nlp.update(
-                    batch,
-                    losses=losses,
-                    drop=dropout_rate,
-                    sgd=self.nlp.create_optimizer()
+                    batch, losses=losses, drop=dropout_rate, sgd=self.nlp.create_optimizer()
                 )
                 logging.info(f"Training batch with {len(batch)} examples, current losses: {losses}")
@@ -208,7 +226,7 @@ class NERNameModel:
             "training_examples": len(data),
             "loss_history": losses_history,
             "batch_size": batch_size,
-            "dropout_rate": dropout_rate
+            "dropout_rate": dropout_rate,
         }
 
         logging.info(f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}")
@@ -225,7 +243,10 @@ class NERNameModel:
         predicted_entities = 0
         actual_entities = 0
-        entity_stats = {"NATIVE": {"tp": 0, "fp": 0, "fn": 0}, "SURNAME": {"tp": 0, "fp": 0, "fn": 0}}
+        entity_stats = {
+            "NATIVE": {"tp": 0, "fp": 0, "fn": 0},
+            "SURNAME": {"tp": 0, "fp": 0, "fn": 0},
+        }
 
         for text, annotations in test_data:
             # Get actual entities
@@ -259,7 +280,9 @@ class NERNameModel:
         # Calculate overall metrics
         precision = correct_entities / predicted_entities if predicted_entities > 0 else 0
         recall = correct_entities / actual_entities if actual_entities > 0 else 0
-        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+        f1_score = (
+            2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+        )
 
         # Calculate per-label metrics
         label_metrics = {}
@@ -268,14 +291,16 @@ class NERNameModel:
             label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
             label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
             label_f1 = (
-                2 * (label_precision * label_recall) / (label_precision + label_recall)) \
-                if (label_precision + label_recall) > 0 else 0
+                (2 * (label_precision * label_recall) / (label_precision + label_recall))
+                if (label_precision + label_recall) > 0
+                else 0
+            )
 
             label_metrics[label] = {
                 "precision": label_precision,
                 "recall": label_recall,
                 "f1_score": label_f1,
-                "support": tp + fn
+                "support": tp + fn,
             }
 
         evaluation_results = {
@@ -286,9 +311,9 @@ class NERNameModel:
                 "total_examples": total_examples,
                 "correct_entities": correct_entities,
                 "predicted_entities": predicted_entities,
-                "actual_entities": actual_entities
+                "actual_entities": actual_entities,
             },
-            "by_label": label_metrics
+            "by_label": label_metrics,
         }
 
         logging.info(f"NER Evaluation completed. Overall F1: {f1_score:.4f}")
@@ -309,7 +334,7 @@ class NERNameModel:
         # Save training statistics
         stats_path = model_dir / "training_stats.json"
-        with open(stats_path, 'w', encoding='utf-8') as f:
+        with open(stats_path, "w", encoding="utf-8") as f:
             json.dump(self.training_stats, f, indent=2)
 
         logging.info(f"NER Model saved to {model_dir}")
@@ -328,7 +353,7 @@ class NERNameModel:
         # Load training statistics if available
         stats_path = Path(model_path) / "training_stats.json"
         if stats_path.exists():
-            with open(stats_path, 'r', encoding='utf-8') as f:
+            with open(stats_path, "r", encoding="utf-8") as f:
                 self.training_stats = json.load(f)
 
         logging.info("NER Model loaded successfully")
@@ -342,15 +367,14 @@ class NERNameModel:
         entities = []
         for ent in doc.ents:
-            entities.append({
-                "text": ent.text,
-                "label": ent.label_,
-                "start": ent.start_char,
-                "end": ent.end_char,
-                "confidence": getattr(ent, 'score', None)  # If confidence scores are available
-            })
+            entities.append(
+                {
+                    "text": ent.text,
+                    "label": ent.label_,
+                    "start": ent.start_char,
+                    "end": ent.end_char,
+                    "confidence": getattr(ent, "score", None),  # If confidence scores are available
+                }
+            )
 
-        return {
-            "text": text,
-            "entities": entities
-        }
+        return {"text": text, "entities": entities}