# drc-ners-nlp/processing/ner/ner_data_builder.py

import ast
import json
import logging
from pathlib import Path

import pandas as pd
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans

from core.config import PipelineConfig
from core.utils import get_data_file_path


class NERDataBuilder:
    def __init__(self, config: PipelineConfig):
        """Store the pipeline configuration on the instance.

        Args:
            config: Pipeline configuration; ``build`` reads the data directory
                and the output file names from it.
        """
        self.config = config

    @classmethod
    def parse_entities(cls, entities_str):
        """Parse entity string (tuple format or JSON) into spaCy-style tuples."""
        if not entities_str or entities_str in ["[]", "", "nan"]:
            return []

        entities_str = str(entities_str).strip()

        # Handle different formats
        try:
            # Try to parse as Python literal (tuples or lists)
            if entities_str.startswith("[(") and entities_str.endswith(")]"):
                # Standard tuple format: [(0, 6, 'NATIVE'), ...]
                return ast.literal_eval(entities_str)
            elif entities_str.startswith("[[") and entities_str.endswith("]]"):
                # Nested list format: [[0, 6, 'NATIVE'], ...]
                nested_list = ast.literal_eval(entities_str)
                return [(start, end, label) for start, end, label in nested_list]
            elif entities_str.startswith("[{") and entities_str.endswith("}]"):
                # JSON format: [{"start": 0, "end": 6, "label": "NATIVE"}, ...]
                json_entities = json.loads(entities_str)
                return [(e["start"], e["end"], e["label"]) for e in json_entities]
            else:
                # Try general ast.literal_eval for other formats
                parsed = ast.literal_eval(entities_str)
                if isinstance(parsed, list):
                    # Convert any list format to tuples
                    result = []
                    for item in parsed:
                        if isinstance(item, (list, tuple)) and len(item) == 3:
                            result.append((item[0], item[1], item[2]))
                    return result

        except (ValueError, SyntaxError, json.JSONDecodeError) as e:
            logging.warning(f"Failed to parse entities: {entities_str} ({e})")
            return []

        logging.warning(f"Unknown entity format: {entities_str}")
        return []

    @classmethod
    def validate_entities(cls, entities, text):
        """Validate and sort entity tuples, removing overlaps and invalid spans."""
        if not entities or not text:
            return []

        text = str(text).strip()
        if not text:
            return []

        # Filter out invalid entities
        valid_entities = []
        for entity in entities:
            if not isinstance(entity, (list, tuple)) or len(entity) != 3:
                logging.warning(f"Invalid entity format: {entity}")
                continue

            start, end, label = entity

            # Ensure start/end are integers
            try:
                start = int(start)
                end = int(end)
            except (ValueError, TypeError):
                logging.warning(f"Invalid start/end positions: {entity}")
                continue

            # Ensure label is string
            if not isinstance(label, str):
                logging.warning(f"Invalid label type: {entity}")
                continue

            # Check bounds
            if not (0 <= start < end <= len(text)):
                logging.warning(f"Entity span out of bounds: {entity} for text '{text}' (length {len(text)})")
                continue

            # Check that span contains actual text
            span_text = text[start:end].strip()
            if not span_text:
                logging.warning(f"Empty span: {entity} in text '{text}'")
                continue

            valid_entities.append((start, end, label))

        if not valid_entities:
            return []

        # Sort by start position
        valid_entities.sort(key=lambda x: (x[0], x[1]))

        # Remove overlapping entities (keep the first one)
        filtered = []
        for start, end, label in valid_entities:
            # Check for overlap with already added entities
            has_overlap = False
            for e_start, e_end, _ in filtered:
                if not (end <= e_start or start >= e_end):
                    has_overlap = True
                    logging.warning(
                        f"Removing overlapping entity ({start}, {end}, '{label}') "
                        f"conflicts with ({e_start}, {e_end}) in '{text}'"
                    )
                    break

            if not has_overlap:
                filtered.append((start, end, label))

        return filtered

    @classmethod
    def create_doc(cls, text, entities, nlp):
        """Create a spaCy Doc object with entities added."""
        doc = nlp(text)
        ents = []

        for start, end, label in entities:
            span = doc.char_span(start, end, label=label, alignment_mode="contract") \
                   or doc.char_span(start, end, label=label, alignment_mode="strict")
            if span:
                ents.append(span)
            else:
                logging.warning(f"Could not create span ({start}, {end}, '{label}') in '{text}'")

        doc.ents = filter_spans(ents) if ents else []
        return doc

    def build(self, data: pd.DataFrame = None) -> int:
        """Build the dataset for NER training."""
        logging.info("Building dataset for NER training")
        try:
            df = pd.read_csv(get_data_file_path("names_featured.csv", self.config)) \
                if data is None \
                else data

            ner_df = df[df["ner_tagged"] == 1].copy()
            if ner_df.empty:
                logging.error("No NER tagged data found in the CSV")
                return 1

            logging.info(f"Found {len(ner_df)} NER tagged entries")
            nlp = spacy.blank("fr")
            doc_bin, training_data = DocBin(), []
            processed_count, skipped_count = 0, 0

            for _, row in ner_df.iterrows():
                text = str(row.get("name", "")).strip()
                if not text:
                    continue

                entities = self.parse_entities(row.get("ner_entities", "[]"))
                entities = self.validate_entities(entities, text)

                training_data.append((text, {"entities": entities}))
                try:
                    doc_bin.add(self.create_doc(text, entities, nlp))
                    processed_count += 1
                except Exception as e:
                    logging.error(f"Error processing '{text}': {e}")
                    skipped_count += 1

            if not training_data:
                logging.error("No valid training examples generated")
                return 1

            json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
            spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]

            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(training_data, f, ensure_ascii=False, indent=None)
            doc_bin.to_disk(spacy_path)

            logging.info(f"Processed: {processed_count}, Skipped: {skipped_count}")
            logging.info(f"Saved NER data in json format to {json_path}")
            logging.info(f"Saved NER data in spaCy format to {spacy_path}")
            return 0

        except Exception as e:
            logging.error(f"Failed to build NER dataset: {e}", exc_info=True)
            return 1