feat: enhance logging and memory management across modules
This commit is contained in:
@@ -13,10 +13,17 @@ class BaseNameFormatter(ABC):
|
||||
"""
|
||||
|
||||
def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None):
    """Store the connector and surname vocabularies used by the formatter.

    Args:
        connectors: Connector words. Any falsy value (``None`` or an
            empty list) selects the built-in defaults.
        additional_surnames: Extra surnames. Any falsy value selects the
            built-in defaults.
    """
    default_connectors = ["wa", "ya", "ka", "ba"]
    default_surnames = [
        "jean",
        "paul",
        "marie",
        "joseph",
        "pierre",
        "claude",
        "andre",
        "michel",
        "robert",
    ]

    # Truthiness (not an ``is None`` test) decides, so an explicitly empty
    # list also falls back to the defaults.
    self.connectors = connectors if connectors else default_connectors
    self.additional_surnames = additional_surnames if additional_surnames else default_surnames
|
||||
|
||||
@classmethod
|
||||
@@ -26,7 +33,9 @@ class BaseNameFormatter(ABC):
|
||||
return []
|
||||
return native_str.strip().split()
|
||||
|
||||
def create_ner_tags(self, text: str, native_parts: List[str], surname: str) -> List[Tuple[int, int, str]]:
|
||||
def create_ner_tags(
|
||||
self, text: str, native_parts: List[str], surname: str
|
||||
) -> List[Tuple[int, int, str]]:
|
||||
"""Create NER entity tags for transformed text"""
|
||||
entities = []
|
||||
current_pos = 0
|
||||
@@ -38,15 +47,15 @@ class BaseNameFormatter(ABC):
|
||||
|
||||
# Determine tag based on word content
|
||||
if word in native_parts or any(connector in word for connector in self.connectors):
|
||||
tag = 'NATIVE'
|
||||
tag = "NATIVE"
|
||||
elif word == surname or word in self.additional_surnames:
|
||||
tag = 'SURNAME'
|
||||
tag = "SURNAME"
|
||||
else:
|
||||
# Check if it's a compound native word or new surname
|
||||
if any(part in word for part in native_parts):
|
||||
tag = 'NATIVE'
|
||||
tag = "NATIVE"
|
||||
else:
|
||||
tag = 'SURNAME'
|
||||
tag = "SURNAME"
|
||||
|
||||
entities.append((start_pos, end_pos, tag))
|
||||
current_pos = end_pos + 1 # +1 for space
|
||||
@@ -54,15 +63,17 @@ class BaseNameFormatter(ABC):
|
||||
return entities
|
||||
|
||||
@classmethod
def compute_numeric_features(cls, name: str) -> Dict:
    """Compute the derived attributes of a transformed name.

    Args:
        name: The transformed full name. Anything that is not a string
            (for example ``None`` or a NaN read from an empty CSV cell) is
            treated as an empty name instead of raising.

    Returns:
        A dict with ``words`` (whitespace-separated word count), ``length``
        (character count) and ``identified_category``, which is
        ``NameCategory.SIMPLE.value`` when the name has exactly three words
        and ``NameCategory.COMPOSE.value`` otherwise.
    """
    # NaN is a truthy float, so a plain ``if name`` guard would still reach
    # ``name.split()`` and fail; normalise every non-string to "".
    text = name if isinstance(name, str) else ""

    words_count = len(text.split())
    length = len(text)
    category = NameCategory.SIMPLE if words_count == 3 else NameCategory.COMPOSE

    return {
        "words": words_count,
        "length": length,
        "identified_category": category.value,
    }
|
||||
|
||||
@abstractmethod
|
||||
|
||||
@@ -8,8 +8,8 @@ from processing.ner.formats import BaseNameFormatter
|
||||
|
||||
class ConnectorFormatter(BaseNameFormatter):
|
||||
def transform(self, row: pd.Series) -> Dict:
|
||||
native_parts = self.parse_native_components(row['probable_native'])
|
||||
surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
|
||||
native_parts = self.parse_native_components(row["probable_native"])
|
||||
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
||||
connector = random.choice(self.connectors)
|
||||
|
||||
# Connect native parts with a random connector
|
||||
@@ -17,20 +17,22 @@ class ConnectorFormatter(BaseNameFormatter):
|
||||
connected_native = f" {connector} ".join(native_parts)
|
||||
full_name = f"{connected_native} {surname}".strip()
|
||||
else:
|
||||
connected_native = f"{row['probable_native']} {connector} {row['probable_native']}".strip()
|
||||
connected_native = (
|
||||
f"{row['probable_native']} {connector} {row['probable_native']}".strip()
|
||||
)
|
||||
full_name = f"{connected_native} {surname}".strip()
|
||||
|
||||
return {
|
||||
'name': full_name,
|
||||
'probable_native': connected_native,
|
||||
'identify_name': connected_native,
|
||||
'probable_surname': surname,
|
||||
'identify_surname': surname,
|
||||
'ner_entities': str(self.create_ner_tags(full_name, native_parts, surname)),
|
||||
'transformation_type': self.transformation_type,
|
||||
**self.compute_derived_attributes(full_name)
|
||||
"name": full_name,
|
||||
"probable_native": connected_native,
|
||||
"identified_name": connected_native,
|
||||
"probable_surname": surname,
|
||||
"identified_surname": surname,
|
||||
"ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
|
||||
"transformation_type": self.transformation_type,
|
||||
**self.compute_numeric_features(full_name),
|
||||
}
|
||||
|
||||
@property
def transformation_type(self) -> str:
    """Return the label this formatter writes into ``transformation_type``."""
    label = "connector_added"
    return label
|
||||
|
||||
@@ -8,8 +8,8 @@ from processing.ner.formats import BaseNameFormatter
|
||||
|
||||
class ExtendedSurnameFormatter(BaseNameFormatter):
|
||||
def transform(self, row: pd.Series) -> Dict:
|
||||
native_parts = self.parse_native_components(row['probable_native'])
|
||||
original_surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
|
||||
native_parts = self.parse_native_components(row["probable_native"])
|
||||
original_surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
|
||||
|
||||
# Add random additional surname
|
||||
additional_surname = random.choice(self.additional_surnames)
|
||||
@@ -17,16 +17,16 @@ class ExtendedSurnameFormatter(BaseNameFormatter):
|
||||
full_name = f"{row['probable_native']} {combined_surname}".strip()
|
||||
|
||||
return {
|
||||
'name': full_name,
|
||||
'probable_native': row['probable_native'],
|
||||
'identify_name': row['probable_native'],
|
||||
'probable_surname': combined_surname,
|
||||
'identity_surname': combined_surname,
|
||||
'ner_entities': str(self.create_ner_tags(full_name, native_parts, combined_surname)),
|
||||
'transformation_type': self.transformation_type,
|
||||
**self.compute_derived_attributes(full_name)
|
||||
"name": full_name,
|
||||
"probable_native": row["probable_native"],
|
||||
"identified_name": row["probable_native"],
|
||||
"probable_surname": combined_surname,
|
||||
"identified_surname": combined_surname,
|
||||
"ner_entities": str(self.create_ner_tags(full_name, native_parts, combined_surname)),
|
||||
"transformation_type": self.transformation_type,
|
||||
**self.compute_numeric_features(full_name),
|
||||
}
|
||||
|
||||
@property
def transformation_type(self) -> str:
    """Return the label this formatter writes into ``transformation_type``."""
    label = "extended_surname"
    return label
|
||||
|
||||
@@ -7,22 +7,22 @@ from processing.ner.formats import BaseNameFormatter
|
||||
|
||||
class NativeOnlyFormatter(BaseNameFormatter):
    """Formatter that keeps only the native part of a name and drops the surname."""

    @property
    def transformation_type(self) -> str:
        """Return the label this formatter writes into ``transformation_type``."""
        return "native_only"

    def transform(self, row: pd.Series) -> Dict:
        """Build the surname-less record for one input row.

        Args:
            row: Source row; only ``probable_native`` is read.

        Returns:
            A dict holding the name (the native value unchanged), empty
            surname fields, the stringified NER tags, the transformation
            label and the numeric features of the name.
        """
        native = row["probable_native"]
        components = self.parse_native_components(native)

        # The full name is exactly the native value; no surname is attached.
        tags = self.create_ner_tags(native, components, "")

        record = {
            "name": native,
            "probable_native": native,
            "identified_name": native,
            "probable_surname": "",
            "identified_surname": "",
            "ner_entities": str(tags),
            "transformation_type": self.transformation_type,
        }
        record.update(self.compute_numeric_features(native))
        return record
|
||||
|
||||
@@ -7,23 +7,23 @@ from processing.ner.formats import BaseNameFormatter
|
||||
|
||||
class OriginalFormatter(BaseNameFormatter):
    """Formatter that keeps the original order: native components, then surname."""

    @property
    def transformation_type(self) -> str:
        """Return the label this formatter writes into ``transformation_type``."""
        return "original"

    def transform(self, row: pd.Series) -> Dict:
        """Build the record for one row without reordering its parts.

        Args:
            row: Source row; ``probable_native`` and ``probable_surname``
                are read. A missing surname (per ``pd.notna``) becomes "".

        Returns:
            A dict holding the joined name, the native and surname fields,
            the stringified NER tags, the transformation label and the
            numeric features of the name.
        """
        native = row["probable_native"]
        components = self.parse_native_components(native)

        if pd.notna(row["probable_surname"]):
            surname = row["probable_surname"]
        else:
            surname = ""

        # Native part first, surname second; strip() removes the trailing
        # space left behind when the surname is empty.
        full_name = f"{native} {surname}".strip()
        tags = self.create_ner_tags(full_name, components, surname)

        record = {
            "name": full_name,
            "probable_native": native,
            "identified_name": native,
            "probable_surname": surname,
            "identified_surname": surname,
            "ner_entities": str(tags),
            "transformation_type": self.transformation_type,
        }
        record.update(self.compute_numeric_features(full_name))
        return record
|
||||
|
||||
@@ -7,23 +7,23 @@ from processing.ner.formats import BaseNameFormatter
|
||||
|
||||
class PositionFlippedFormatter(BaseNameFormatter):
    """Formatter that puts the surname in front of the native components."""

    @property
    def transformation_type(self) -> str:
        """Return the label this formatter writes into ``transformation_type``."""
        return "position_flipped"

    def transform(self, row: pd.Series) -> Dict:
        """Build the record for one row with surname and native part swapped.

        Args:
            row: Source row; ``probable_native`` and ``probable_surname``
                are read. A missing surname (per ``pd.notna``) becomes "".

        Returns:
            A dict holding the flipped name, the native and surname fields,
            the stringified NER tags, the transformation label and the
            numeric features of the name.
        """
        native = row["probable_native"]
        components = self.parse_native_components(native)

        if pd.notna(row["probable_surname"]):
            surname = row["probable_surname"]
        else:
            surname = ""

        # Surname first, native part second; strip() removes the leading
        # space left behind when the surname is empty.
        full_name = f"{surname} {native}".strip()
        tags = self.create_ner_tags(full_name, components, surname)

        record = {
            "name": full_name,
            "probable_native": native,
            "identified_name": native,
            "probable_surname": surname,
            "identified_surname": surname,
            "ner_entities": str(tags),
            "transformation_type": self.transformation_type,
        }
        record.update(self.compute_numeric_features(full_name))
        return record
|
||||
|
||||
@@ -7,24 +7,24 @@ from processing.ner.formats import BaseNameFormatter
|
||||
|
||||
class ReducedNativeFormatter(BaseNameFormatter):
    """Formatter that keeps only the first native component plus the surname."""

    @property
    def transformation_type(self) -> str:
        """Return the label this formatter writes into ``transformation_type``."""
        return "reduced_native"

    def transform(self, row: pd.Series) -> Dict:
        """Build the record for one row with the native part reduced.

        Args:
            row: Source row; ``probable_native`` and ``probable_surname``
                are read. A missing surname (per ``pd.notna``) becomes "".

        Returns:
            A dict holding the reduced name, the native and surname fields,
            the stringified NER tags, the transformation label and the
            numeric features of the name.
        """
        components = self.parse_native_components(row["probable_native"])

        if pd.notna(row["probable_surname"]):
            surname = row["probable_surname"]
        else:
            surname = ""

        # Only names with more than one native component are shortened;
        # otherwise the raw column value is passed through unchanged.
        if len(components) > 1:
            reduced_native = components[0]
        else:
            reduced_native = row["probable_native"]

        full_name = f"{reduced_native} {surname}".strip()
        # Tagging sees only the component that was kept.
        tags = self.create_ner_tags(full_name, [reduced_native], surname)

        record = {
            "name": full_name,
            "probable_native": reduced_native,
            "identified_name": reduced_native,
            "probable_surname": surname,
            "identified_surname": surname,
            "ner_entities": str(tags),
            "transformation_type": self.transformation_type,
        }
        record.update(self.compute_numeric_features(full_name))
        return record
|
||||
|
||||
+122
-168
@@ -10,189 +10,143 @@ from spacy.util import filter_spans
|
||||
|
||||
from core.config import PipelineConfig
|
||||
from core.utils import get_data_file_path
|
||||
from core.utils.data_loader import DataLoader
|
||||
|
||||
|
||||
class NERDataBuilder:
    """Build the NER training files (JSON and spaCy ``DocBin``) from the engineered CSV."""

    def __init__(self, config: "PipelineConfig"):
        """Keep the pipeline configuration and create the CSV loader bound to it."""
        self.config = config
        self.data_loader = DataLoader(config)

    @staticmethod
    def _parse_entities(series: pd.Series) -> pd.Series:
        """Parse every serialized entity list in *series*.

        Accepted encodings are a Python literal list of tuples or lists
        (``[(0, 6, 'NATIVE')]``, ``[[0, 6, 'NATIVE']]``) and a JSON list of
        ``{"start", "end", "label"}`` objects.

        Args:
            series: Raw ``ner_entities`` values, usually strings.

        Returns:
            A Series on the same index whose values are lists of 3-tuples.
            Empty, missing or unparseable values yield an empty list; the
            function never raises for a bad cell.
        """

        def _parse(entities_str):
            if entities_str is None:
                return []
            text = str(entities_str).strip()
            # "nan" is what str() produces for a missing float cell.
            if text in ("", "[]", "nan"):
                return []

            try:
                if text.startswith("[{") and text.endswith("}]"):
                    return [(e["start"], e["end"], e["label"]) for e in json.loads(text)]
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError, KeyError, TypeError) as exc:
                # ValueError also covers json.JSONDecodeError; KeyError and
                # TypeError cover JSON items that lack the expected keys or
                # are not objects at all.
                logging.debug("Unparseable ner_entities value %r: %s", text, exc)
                return []

            # A literal such as "5" or "None" parses fine but is not a list;
            # iterating it would raise, so reject it explicitly.
            if not isinstance(parsed, (list, tuple)):
                return []
            return [
                tuple(item)
                for item in parsed
                if isinstance(item, (list, tuple)) and len(item) == 3
            ]

        return series.map(_parse)

    @staticmethod
    def _validate_entities(texts: pd.Series, entities_series: pd.Series) -> pd.Series:
        """Validate parsed entities against their texts, row by row.

        An entity is kept only if it is a 3-item list/tuple whose start and
        end convert to ``int``, whose label is a ``str``, whose span lies
        inside the stripped text and covers non-whitespace characters.
        Survivors are sorted by ``(start, end)`` and any span starting
        before the end of the previously kept one is dropped.

        Args:
            texts: The names the spans refer to.
            entities_series: Parsed entity lists, aligned with *texts*.

        Returns:
            A Series indexed like *texts* holding lists of
            ``(start, end, label)`` tuples. Rows whose text is not a string
            (for example NaN) get an empty list.
        """

        def _validate(text, entities):
            # A non-string name cannot be fed to spaCy later, and str(NaN)
            # would otherwise be validated as the literal text "nan".
            if not isinstance(text, str) or not entities:
                return []
            text = text.strip()
            if not text:
                return []

            valid = []
            for ent in entities:
                if not isinstance(ent, (list, tuple)) or len(ent) != 3:
                    continue
                start, end, label = ent
                try:
                    start, end = int(start), int(end)
                except (ValueError, TypeError):
                    continue
                if not isinstance(label, str):
                    continue
                if not (0 <= start < end <= len(text)):
                    continue
                if not text[start:end].strip():
                    continue
                valid.append((start, end, label))

            valid.sort(key=lambda span: (span[0], span[1]))

            # Spans are sorted, so comparing with the end of the last kept
            # span is enough to detect an overlap.
            filtered, last_end = [], -1
            for start, end, label in valid:
                if start >= last_end:
                    filtered.append((start, end, label))
                    last_end = end
            return filtered

        validated = [_validate(text, ents) for text, ents in zip(texts, entities_series)]
        return pd.Series(validated, index=texts.index, dtype=object)

    @staticmethod
    def _create_docs(nlp, texts, entities):
        """Create one spaCy ``Doc`` per text with its entity spans attached.

        Args:
            nlp: The spaCy pipeline used to tokenize each text.
            texts: Iterable of strings.
            entities: Iterable of ``(start, end, label)`` lists, aligned
                with *texts*.

        Returns:
            The list of created docs. Spans that cannot be aligned to
            tokens are dropped without raising.
        """
        docs = []
        for text, ents in zip(texts, entities):
            doc = nlp(text)
            spans = []
            for start, end, label in ents:
                # NOTE(review): the "strict" retry looks redundant after a
                # failed "contract" alignment - confirm before removing.
                span = doc.char_span(
                    start, end, label=label, alignment_mode="contract"
                ) or doc.char_span(start, end, label=label, alignment_mode="strict")
                if span:
                    spans.append(span)
            doc.ents = filter_spans(spans)
            docs.append(doc)
        return docs

    def build(self) -> int:
        """Build and save the NER training data.

        Loads the configured ``engineered`` CSV, keeps rows with
        ``ner_tagged == 1``, parses and validates their entities, and writes
        the result as JSON (``ner_data``) and as a spaCy ``DocBin``
        (``ner_spacy``) under the configured data directory.

        Returns:
            ``0`` on success; ``1`` when there are no tagged rows or no row
            survives validation.
        """
        input_filepath = get_data_file_path(
            self.config.data.output_files["engineered"], self.config
        )
        df = self.data_loader.load_csv_complete(input_filepath)

        # Filter early and keep only the two columns that are used.
        ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
        del df  # No need to keep the full frame in memory

        if ner_df.empty:
            logging.error("No NER tagged data found")
            return 1

        # "Skipped" is reported relative to the tagged rows, not the whole file.
        tagged_rows = len(ner_df)
        logging.info(f"Found {tagged_rows} NER tagged entries")
        nlp = spacy.blank("fr")

        parsed_entities = self._parse_entities(ner_df["ner_entities"])
        validated_entities = self._validate_entities(ner_df["name"], parsed_entities)

        # Drop rows with no valid entities
        mask = validated_entities.map(bool)
        ner_df = ner_df.loc[mask]
        validated_entities = validated_entities.loc[mask]

        if ner_df.empty:
            logging.error("No valid training examples after validation")
            return 1

        # Spans were validated against the stripped name, so the same
        # stripped text must be written out and handed to spaCy.
        names = [name.strip() for name in ner_df["name"]]
        entity_lists = validated_entities.tolist()

        training_data = [
            (name, {"entities": ents}) for name, ents in zip(names, entity_lists)
        ]
        doc_bin = DocBin(docs=self._create_docs(nlp, names, entity_lists))

        # Save
        json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
        spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
        doc_bin.to_disk(spacy_path)

        logging.info(f"Processed: {len(training_data)}, Skipped: {tagged_rows - len(training_data)}")
        logging.info(f"Saved NER JSON to {json_path}")
        logging.info(f"Saved NER spacy to {spacy_path}")
        return 0
|
||||
|
||||
@@ -1,9 +1,14 @@
|
||||
import random
|
||||
from typing import List
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
from core.config import PipelineConfig
|
||||
from core.utils import get_data_file_path
|
||||
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
|
||||
from processing.ner.formats.connectors_format import ConnectorFormatter
|
||||
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
|
||||
from processing.ner.formats.native_only_format import NativeOnlyFormatter
|
||||
@@ -18,50 +23,64 @@ class NEREngineering:
|
||||
and encourage sequence characteristic learning.
|
||||
"""
|
||||
|
||||
def __init__(self, config: PipelineConfig):
    """Set up the loader, vocabularies, random seeds and formatter instances.

    Args:
        config: Pipeline configuration; ``config.data.random_seed`` seeds
            both ``random`` and ``numpy.random``.
    """
    self.config = config
    self.data_loader = DataLoader(config)

    self.connectors = ["wa", "ya", "ka", "ba", "la"]
    self.additional_surnames = [
        "jean",
        "paul",
        "marie",
        "joseph",
        "pierre",
        "claude",
        "andre",
        "michel",
        "robert",
    ]

    # Seeding happens at construction time, on the global generators.
    seed = self.config.data.random_seed
    random.seed(seed)
    np.random.seed(seed)

    # One formatter per transformation key; all of them share the same
    # connector and surname lists.
    formatter_classes = {
        "original": OriginalFormatter,
        "native_only": NativeOnlyFormatter,
        "position_flipped": PositionFlippedFormatter,
        "reduced_native": ReducedNativeFormatter,
        "connector_added": ConnectorFormatter,
        "extended_surname": ExtendedSurnameFormatter,
    }
    self.formatters = {
        key: formatter_cls(self.connectors, self.additional_surnames)
        for key, formatter_cls in formatter_classes.items()
    }
|
||||
|
||||
@classmethod
|
||||
def load_data(self) -> pd.DataFrame:
    """Load the configured ``featured`` CSV and return its NER-tagged rows.

    Returns:
        A copy of the rows whose ``ner_tagged`` column equals 1.
    """
    # NOTE(review): this reads self.config and self.data_loader, so it has
    # to be bound as an instance method - confirm that no @classmethod
    # decorator is left above this definition.
    featured_name = self.config.data.output_files["featured"]
    source_path = get_data_file_path(featured_name, self.config)
    frame = self.data_loader.load_csv_complete(source_path)

    # Filter only NER-tagged rows
    tagged_mask = frame["ner_tagged"] == 1
    tagged = frame[tagged_mask].copy()
    logging.info(f"Loaded {len(tagged)} NER-tagged records from {len(frame)} total records")

    return tagged
|
||||
|
||||
def engineer_dataset(self, df: pd.DataFrame, random_seed: int = 42) -> pd.DataFrame:
|
||||
"""
|
||||
Apply feature engineering transformations according to the specified rules:
|
||||
- First 25%: original format
|
||||
- Second 25%: remove surname
|
||||
- Third 25%: flip positions
|
||||
- Fourth 10%: reduce native components
|
||||
- Fifth 10%: add connectors
|
||||
- Last 5%: extend surnames
|
||||
"""
|
||||
random.seed(random_seed)
|
||||
np.random.seed(random_seed)
|
||||
def compute(self) -> None:
|
||||
logging.info("Applying feature engineering transformations...")
|
||||
input_filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
|
||||
output_filepath = get_data_file_path(
|
||||
self.config.data.output_files["engineered"], self.config
|
||||
)
|
||||
|
||||
# Shuffle the dataset
|
||||
df_shuffled = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
|
||||
total_rows = len(df_shuffled)
|
||||
df = self.data_loader.load_csv_complete(input_filepath)
|
||||
ner_df = df[df["ner_tagged"] == 1].copy()
|
||||
logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records")
|
||||
|
||||
del df # No need to keep in memory
|
||||
|
||||
ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
|
||||
drop=True
|
||||
)
|
||||
total_rows = len(ner_df)
|
||||
|
||||
# Calculate split points
|
||||
split_25_1 = int(total_rows * 0.25)
|
||||
@@ -71,37 +90,31 @@ class NEREngineering:
|
||||
split_10_2 = int(total_rows * 0.95)
|
||||
|
||||
# Define transformation groups
|
||||
transformation_groups = [
|
||||
(0, split_25_1, 'original'),
|
||||
(split_25_1, split_25_2, 'native_only'),
|
||||
(split_25_2, split_25_3, 'position_flipped'),
|
||||
(split_25_3, split_10_1, 'reduced_native'),
|
||||
(split_10_1, split_10_2, 'connector_added'),
|
||||
(split_10_2, total_rows, 'extended_surname')
|
||||
groups = [
|
||||
(0, split_25_1, "original"), # First 25%: original format
|
||||
(split_25_1, split_25_2, "native_only"), # Second 25%: remove surname
|
||||
(split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions
|
||||
(split_25_3, split_10_1, "reduced_native"), # Fourth 10%: reduce native components
|
||||
(split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors
|
||||
(split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames
|
||||
]
|
||||
|
||||
print("Dataset splits:")
|
||||
for start, end, trans_type in transformation_groups:
|
||||
print(f"Group {trans_type}: {start} to {end} ({end - start} rows)")
|
||||
for start, end, trans_type in groups:
|
||||
logging.info(f"Group {trans_type}: {start} to {end} ({end - start} rows)")
|
||||
|
||||
# Process each group
|
||||
engineered_rows = []
|
||||
for start, end, formatter_key in transformation_groups:
|
||||
rows = []
|
||||
for start, end, formatter_key in groups:
|
||||
formatter = self.formatters[formatter_key]
|
||||
|
||||
for idx in range(start, end):
|
||||
row = df_shuffled.iloc[idx]
|
||||
for idx in tqdm(range(start, end), desc=f"Processing {formatter_key}"):
|
||||
row = ner_df.iloc[idx]
|
||||
transformed = formatter.transform(row)
|
||||
|
||||
# Keep original columns and add transformed ones
|
||||
new_row = row.to_dict()
|
||||
new_row.update(transformed)
|
||||
engineered_rows.append(new_row)
|
||||
rows.append(new_row)
|
||||
|
||||
return pd.DataFrame(engineered_rows)
|
||||
|
||||
@classmethod
|
||||
def save_engineered_dataset(cls, df: pd.DataFrame, output_path: str):
|
||||
"""Save the engineered dataset to CSV file"""
|
||||
df.to_csv(output_path, index=False)
|
||||
print(f"Engineered dataset saved to {output_path}")
|
||||
self.data_loader.save_csv(pd.DataFrame(rows), output_filepath)
|
||||
logging.info(f"Engineered dataset saved to {output_filepath}")
|
||||
|
||||
@@ -48,7 +48,7 @@ class NERNameModel:
|
||||
|
||||
logging.info(f"Loading training data from {data_path}")
|
||||
|
||||
with open(data_path, 'r', encoding='utf-8') as f:
|
||||
with open(data_path, "r", encoding="utf-8") as f:
|
||||
raw_data = json.load(f)
|
||||
|
||||
# Validate and clean training data
|
||||
@@ -58,7 +58,9 @@ class NERNameModel:
|
||||
for i, item in enumerate(raw_data):
|
||||
try:
|
||||
if not isinstance(item, (list, tuple)) or len(item) != 2:
|
||||
logging.warning(f"Skipping invalid training example format at index {i}: {item}")
|
||||
logging.warning(
|
||||
f"Skipping invalid training example format at index {i}: {item}"
|
||||
)
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
@@ -83,20 +85,27 @@ class NERNameModel:
|
||||
# String format from tagger: "[(0, 6, 'NATIVE'), ...]"
|
||||
try:
|
||||
import ast
|
||||
|
||||
entities = ast.literal_eval(entities_raw)
|
||||
if not isinstance(entities, list):
|
||||
logging.warning(f"Parsed entities is not a list at index {i}: {entities}")
|
||||
logging.warning(
|
||||
f"Parsed entities is not a list at index {i}: {entities}"
|
||||
)
|
||||
skipped_count += 1
|
||||
continue
|
||||
except (ValueError, SyntaxError) as e:
|
||||
logging.warning(f"Failed to parse entity string at index {i}: {entities_raw} ({e})")
|
||||
logging.warning(
|
||||
f"Failed to parse entity string at index {i}: {entities_raw} ({e})"
|
||||
)
|
||||
skipped_count += 1
|
||||
continue
|
||||
elif isinstance(entities_raw, list):
|
||||
# Already in list format
|
||||
entities = entities_raw
|
||||
else:
|
||||
logging.warning(f"Skipping invalid entities format at index {i}: {entities_raw}")
|
||||
logging.warning(
|
||||
f"Skipping invalid entities format at index {i}: {entities_raw}"
|
||||
)
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
@@ -110,16 +119,20 @@ class NERNameModel:
|
||||
start, end, label = entity
|
||||
|
||||
# Validate entity components
|
||||
if (not isinstance(start, int) or not isinstance(end, int) or
|
||||
not isinstance(label, str) or start >= end or
|
||||
start < 0 or end > len(text)):
|
||||
if (
|
||||
not isinstance(start, int)
|
||||
or not isinstance(end, int)
|
||||
or not isinstance(label, str)
|
||||
or start >= end
|
||||
or start < 0
|
||||
or end > len(text)
|
||||
):
|
||||
logging.warning(f"Skipping invalid entity bounds in '{text}': {entity}")
|
||||
continue
|
||||
|
||||
# Check for overlaps with already validated entities
|
||||
has_overlap = any(
|
||||
start < v_end and end > v_start
|
||||
for v_start, v_end, _ in valid_entities
|
||||
start < v_end and end > v_start for v_start, v_end, _ in valid_entities
|
||||
)
|
||||
|
||||
if has_overlap:
|
||||
@@ -128,8 +141,10 @@ class NERNameModel:
|
||||
|
||||
# Validate that the span doesn't contain spaces (matching tagger validation)
|
||||
span_text = text[start:end]
|
||||
if not span_text or span_text != span_text.strip() or ' ' in span_text:
|
||||
logging.warning(f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'")
|
||||
if not span_text or span_text != span_text.strip() or " " in span_text:
|
||||
logging.warning(
|
||||
f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
|
||||
)
|
||||
continue
|
||||
|
||||
valid_entities.append((start, end, label))
|
||||
@@ -148,7 +163,9 @@ class NERNameModel:
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
logging.info(f"Loaded {len(valid_data)} valid training examples, skipped {skipped_count} invalid ones")
|
||||
logging.info(
|
||||
f"Loaded {len(valid_data)} valid training examples, skipped {skipped_count} invalid ones"
|
||||
)
|
||||
|
||||
if not valid_data:
|
||||
raise ValueError("No valid training examples found in the data")
|
||||
@@ -156,15 +173,17 @@ class NERNameModel:
|
||||
return valid_data
|
||||
|
||||
def train(
|
||||
self,
|
||||
data: List[Tuple[str, Dict]],
|
||||
epochs: int = 5,
|
||||
batch_size: int = 16,
|
||||
dropout_rate: float = 0.2,
|
||||
self,
|
||||
data: List[Tuple[str, Dict]],
|
||||
epochs: int = 5,
|
||||
batch_size: int = 16,
|
||||
dropout_rate: float = 0.2,
|
||||
) -> None:
|
||||
"""Train the NER model"""
|
||||
logging.info(f"Starting NER training with {len(data)} examples")
|
||||
logging.info(f"Training parameters: epochs={epochs}, batch_size={batch_size}, dropout={dropout_rate}")
|
||||
logging.info(
|
||||
f"Training parameters: epochs={epochs}, batch_size={batch_size}, dropout={dropout_rate}"
|
||||
)
|
||||
|
||||
if self.nlp is None:
|
||||
raise ValueError("Model not initialized. Call create_blank_model() first.")
|
||||
@@ -184,16 +203,15 @@ class NERNameModel:
|
||||
doc = self.nlp.make_doc(text)
|
||||
example = Example.from_dict(doc, annotations)
|
||||
examples.append(example)
|
||||
logging.info(f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}")
|
||||
logging.info(
|
||||
f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}"
|
||||
)
|
||||
|
||||
# Train in batches
|
||||
batches = minibatch(examples, size=batch_size)
|
||||
for batch in batches:
|
||||
self.nlp.update(
|
||||
batch,
|
||||
losses=losses,
|
||||
drop=dropout_rate,
|
||||
sgd=self.nlp.create_optimizer()
|
||||
batch, losses=losses, drop=dropout_rate, sgd=self.nlp.create_optimizer()
|
||||
)
|
||||
logging.info(f"Training batch with {len(batch)} examples, current losses: {losses}")
|
||||
|
||||
@@ -208,7 +226,7 @@ class NERNameModel:
|
||||
"training_examples": len(data),
|
||||
"loss_history": losses_history,
|
||||
"batch_size": batch_size,
|
||||
"dropout_rate": dropout_rate
|
||||
"dropout_rate": dropout_rate,
|
||||
}
|
||||
|
||||
logging.info(f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}")
|
||||
@@ -225,7 +243,10 @@ class NERNameModel:
|
||||
predicted_entities = 0
|
||||
actual_entities = 0
|
||||
|
||||
entity_stats = {"NATIVE": {"tp": 0, "fp": 0, "fn": 0}, "SURNAME": {"tp": 0, "fp": 0, "fn": 0}}
|
||||
entity_stats = {
|
||||
"NATIVE": {"tp": 0, "fp": 0, "fn": 0},
|
||||
"SURNAME": {"tp": 0, "fp": 0, "fn": 0},
|
||||
}
|
||||
|
||||
for text, annotations in test_data:
|
||||
# Get actual entities
|
||||
@@ -259,7 +280,9 @@ class NERNameModel:
|
||||
# Calculate overall metrics
|
||||
precision = correct_entities / predicted_entities if predicted_entities > 0 else 0
|
||||
recall = correct_entities / actual_entities if actual_entities > 0 else 0
|
||||
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
|
||||
f1_score = (
|
||||
2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
|
||||
)
|
||||
|
||||
# Calculate per-label metrics
|
||||
label_metrics = {}
|
||||
@@ -268,14 +291,16 @@ class NERNameModel:
|
||||
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
|
||||
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
|
||||
label_f1 = (
|
||||
2 * (label_precision * label_recall) / (label_precision + label_recall)) \
|
||||
if (label_precision + label_recall) > 0 else 0
|
||||
(2 * (label_precision * label_recall) / (label_precision + label_recall))
|
||||
if (label_precision + label_recall) > 0
|
||||
else 0
|
||||
)
|
||||
|
||||
label_metrics[label] = {
|
||||
"precision": label_precision,
|
||||
"recall": label_recall,
|
||||
"f1_score": label_f1,
|
||||
"support": tp + fn
|
||||
"support": tp + fn,
|
||||
}
|
||||
|
||||
evaluation_results = {
|
||||
@@ -286,9 +311,9 @@ class NERNameModel:
|
||||
"total_examples": total_examples,
|
||||
"correct_entities": correct_entities,
|
||||
"predicted_entities": predicted_entities,
|
||||
"actual_entities": actual_entities
|
||||
"actual_entities": actual_entities,
|
||||
},
|
||||
"by_label": label_metrics
|
||||
"by_label": label_metrics,
|
||||
}
|
||||
|
||||
logging.info(f"NER Evaluation completed. Overall F1: {f1_score:.4f}")
|
||||
@@ -309,7 +334,7 @@ class NERNameModel:
|
||||
|
||||
# Save training statistics
|
||||
stats_path = model_dir / "training_stats.json"
|
||||
with open(stats_path, 'w', encoding='utf-8') as f:
|
||||
with open(stats_path, "w", encoding="utf-8") as f:
|
||||
json.dump(self.training_stats, f, indent=2)
|
||||
|
||||
logging.info(f"NER Model saved to {model_dir}")
|
||||
@@ -328,7 +353,7 @@ class NERNameModel:
|
||||
# Load training statistics if available
|
||||
stats_path = Path(model_path) / "training_stats.json"
|
||||
if stats_path.exists():
|
||||
with open(stats_path, 'r', encoding='utf-8') as f:
|
||||
with open(stats_path, "r", encoding="utf-8") as f:
|
||||
self.training_stats = json.load(f)
|
||||
|
||||
logging.info("NER Model loaded successfully")
|
||||
@@ -342,15 +367,14 @@ class NERNameModel:
|
||||
entities = []
|
||||
|
||||
for ent in doc.ents:
|
||||
entities.append({
|
||||
"text": ent.text,
|
||||
"label": ent.label_,
|
||||
"start": ent.start_char,
|
||||
"end": ent.end_char,
|
||||
"confidence": getattr(ent, 'score', None) # If confidence scores are available
|
||||
})
|
||||
entities.append(
|
||||
{
|
||||
"text": ent.text,
|
||||
"label": ent.label_,
|
||||
"start": ent.start_char,
|
||||
"end": ent.end_char,
|
||||
"confidence": getattr(ent, "score", None), # If confidence scores are available
|
||||
}
|
||||
)
|
||||
|
||||
return {
|
||||
"text": text,
|
||||
"entities": entities
|
||||
}
|
||||
return {"text": text, "entities": entities}
|
||||
|
||||
@@ -3,7 +3,9 @@ import logging
|
||||
|
||||
|
||||
class NERNameTagger:
|
||||
def tag_name(self, name: str, probable_native: str, probable_surname: str) -> Union[Dict[str, Any], None]:
|
||||
def tag_name(
|
||||
self, name: str, probable_native: str, probable_surname: str
|
||||
) -> Union[Dict[str, Any], None]:
|
||||
"""Create a single NER training example using probable_native and probable_surname"""
|
||||
if not name or not probable_native or not probable_surname:
|
||||
return None
|
||||
@@ -56,9 +58,10 @@ class NERNameTagger:
|
||||
continue
|
||||
|
||||
# Check if this is a word boundary match and doesn't overlap
|
||||
if (self._is_word_boundary_match(name, pos, end_pos) and
|
||||
not has_overlap(pos, end_pos)):
|
||||
entities.append((pos, end_pos, 'NATIVE'))
|
||||
if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
|
||||
pos, end_pos
|
||||
):
|
||||
entities.append((pos, end_pos, "NATIVE"))
|
||||
used_spans.append((pos, end_pos))
|
||||
break # Only take the first non-overlapping occurrence
|
||||
|
||||
@@ -84,16 +87,19 @@ class NERNameTagger:
|
||||
start_pos = pos + 1
|
||||
continue
|
||||
|
||||
if (self._is_word_boundary_match(name, pos, end_pos) and
|
||||
not has_overlap(pos, end_pos)):
|
||||
entities.append((pos, end_pos, 'SURNAME'))
|
||||
if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
|
||||
pos, end_pos
|
||||
):
|
||||
entities.append((pos, end_pos, "SURNAME"))
|
||||
used_spans.append((pos, end_pos))
|
||||
break
|
||||
|
||||
start_pos = pos + 1
|
||||
|
||||
if not entities:
|
||||
logging.warning(f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'")
|
||||
logging.warning(
|
||||
f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
|
||||
)
|
||||
return None
|
||||
|
||||
# Sort entities by position and validate
|
||||
@@ -104,7 +110,9 @@ class NERNameTagger:
|
||||
for start, end, label in entities:
|
||||
# Check bounds
|
||||
if not (0 <= start < end <= len(name)):
|
||||
logging.warning(f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'")
|
||||
logging.warning(
|
||||
f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
|
||||
)
|
||||
continue
|
||||
|
||||
# Check for overlaps with already validated entities
|
||||
@@ -114,8 +122,10 @@ class NERNameTagger:
|
||||
|
||||
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
|
||||
span_text = name[start:end]
|
||||
if not span_text or span_text != span_text.strip() or ' ' in span_text:
|
||||
logging.warning(f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'")
|
||||
if not span_text or span_text != span_text.strip() or " " in span_text:
|
||||
logging.warning(
|
||||
f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
|
||||
)
|
||||
continue
|
||||
|
||||
validated_entities.append((start, end, label))
|
||||
@@ -129,7 +139,7 @@ class NERNameTagger:
|
||||
|
||||
return {
|
||||
"entities": entities_str,
|
||||
"spans": validated_entities # Keep the original tuples for internal use
|
||||
"spans": validated_entities, # Keep the original tuples for internal use
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -154,6 +164,7 @@ class NERNameTagger:
|
||||
"""Validate that entity annotations are correct for a given name"""
|
||||
try:
|
||||
import ast
|
||||
|
||||
entities = ast.literal_eval(entities_str)
|
||||
|
||||
# Check for overlaps and valid bounds
|
||||
@@ -182,10 +193,11 @@ class NERNameTagger:
|
||||
@classmethod
|
||||
def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
|
||||
"""Extract the actual text for each entity type"""
|
||||
result = {'NATIVE': [], 'SURNAME': []}
|
||||
result = {"NATIVE": [], "SURNAME": []}
|
||||
|
||||
try:
|
||||
import ast
|
||||
|
||||
entities = ast.literal_eval(entities_str)
|
||||
|
||||
for start, end, label in entities:
|
||||
|
||||
Reference in New Issue
Block a user