feat: enhance logging and memory management across modules

This commit is contained in:
2025-08-13 23:09:05 +02:00
parent 47e52d130c
commit 9601c5e44d
48 changed files with 1004 additions and 773 deletions
+73 -12
View File
@@ -5,6 +5,7 @@ from typing import Iterator
import pandas as pd
from processing.batch.batch_config import BatchConfig
from processing.batch.memory_monitor import MemoryMonitor
from processing.steps import PipelineStep
@@ -13,28 +14,36 @@ class BatchProcessor:
def __init__(self, config: BatchConfig):
self.config = config
self.memory_monitor = MemoryMonitor()
def create_batches(self, df: pd.DataFrame) -> Iterator[tuple[pd.DataFrame, int]]:
"""Create batches from DataFrame"""
"""Create batches from DataFrame without unnecessary copies"""
total_rows = len(df)
batch_size = self.config.batch_size
for i in range(0, total_rows, batch_size):
batch = df.iloc[i : i + batch_size].copy()
batch = df.iloc[i : i + batch_size]
batch_id = i // batch_size
yield batch, batch_id
def process_sequential(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Process batches sequentially"""
"""Memory-optimized sequential processing"""
results = []
memory_threshold_mb = 1000 # Clean memory when usage exceeds 1 GB
for batch, batch_id in self.create_batches(df):
for batch_num, (batch, batch_id) in enumerate(self.create_batches(df)):
if step.batch_exists(batch_id):
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
processed_batch = step.load_batch(batch_id)
else:
try:
processed_batch = step.process_batch(batch, batch_id)
# Only copy if the processing step requires mutation
if step.requires_batch_mutation:
batch_copy = batch.copy()
processed_batch = step.process_batch(batch_copy, batch_id)
else:
processed_batch = step.process_batch(batch, batch_id)
step.save_batch(processed_batch, batch_id)
step.state.processed_batches += 1
except Exception as e:
@@ -44,14 +53,32 @@ class BatchProcessor:
results.append(processed_batch)
# Memory management
if batch_num % self.config.checkpoint_interval == 0:
current_memory = self.memory_monitor.get_memory_usage_mb()
if current_memory > memory_threshold_mb:
logging.info(f"Memory cleanup triggered at {current_memory:.1f} MB")
self.memory_monitor.cleanup_memory()
# Save state periodically
if batch_id % self.config.checkpoint_interval == 0:
step.save_state()
return pd.concat(results, ignore_index=True) if results else pd.DataFrame()
# Final memory cleanup before concatenation
self.memory_monitor.cleanup_memory()
self.memory_monitor.log_memory_usage("before_concat")
result = self._safe_concat(results) if results else pd.DataFrame()
# Final cleanup
del results
self.memory_monitor.cleanup_memory()
self.memory_monitor.log_memory_usage("sequential_complete")
return result
def process_concurrent(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Process batches concurrently"""
"""Memory-optimized concurrent processing"""
executor_class = (
ProcessPoolExecutor if self.config.use_multiprocessing else ThreadPoolExecutor
)
@@ -65,7 +92,9 @@ class BatchProcessor:
logging.info(f"Batch {batch_id} already processed, loading from checkpoint")
results[batch_id] = step.load_batch(batch_id)
else:
future = executor.submit(step.process_batch, batch, batch_id)
# Only copy if necessary for concurrent processing
batch_copy = batch.copy() if step.requires_batch_mutation else batch
future = executor.submit(step.process_batch, batch_copy, batch_id)
future_to_batch[future] = (batch_id, batch)
# Collect results as they complete
@@ -81,13 +110,24 @@ class BatchProcessor:
logging.error(f"Failed to process batch {batch_id}: {e}")
step.state.failed_batches.append(batch_id)
# Reassemble results in order
# Memory-efficient reassembly
ordered_results = []
for batch_id in sorted(results.keys()):
ordered_results.append(results[batch_id])
step.save_state()
return pd.concat(ordered_results, ignore_index=True) if ordered_results else pd.DataFrame()
# Cleanup before concat
del results
self.memory_monitor.cleanup_memory()
result = self._safe_concat(ordered_results) if ordered_results else pd.DataFrame()
# Final cleanup
del ordered_results
self.memory_monitor.cleanup_memory()
return result
def process(self, step: PipelineStep, df: pd.DataFrame) -> pd.DataFrame:
"""Process data using the configured strategy"""
@@ -95,8 +135,29 @@ class BatchProcessor:
step.load_state()
logging.info(f"Starting {step.name} with {step.state.total_batches} batches")
self.memory_monitor.log_memory_usage("process_start")
if self.config.max_workers == 1:
return self.process_sequential(step, df)
result = self.process_sequential(step, df)
else:
return self.process_concurrent(step, df)
result = self.process_concurrent(step, df)
self.memory_monitor.log_memory_usage("process_complete")
return result
def _safe_concat(self, dfs: list) -> pd.DataFrame:
"""Memory-safe concatenation with monitoring"""
if not dfs:
return pd.DataFrame()
memory = self.memory_monitor.get_memory_usage_mb()
logging.info(f"Starting concat of {len(dfs)} DataFrames at {memory:.1f} MB")
# Use copy=False to avoid unnecessary copying during concat
result = pd.concat(dfs, ignore_index=True, copy=False)
# Monitor memory after concat
memory = self.memory_monitor.get_memory_usage_mb()
logging.info(f"Concat complete. Memory: {memory:.1f} MB")
return result
+25
View File
@@ -0,0 +1,25 @@
import gc
import logging
import psutil
class MemoryMonitor:
    """Monitor and manage memory usage during batch processing."""

    @staticmethod
    def get_memory_usage_mb() -> float:
        """Return the resident set size (RSS) of the current process, in MB."""
        rss_bytes = psutil.Process().memory_info().rss
        return rss_bytes / (1024 * 1024)

    @staticmethod
    def cleanup_memory():
        """Force a full garbage-collection pass to release unreferenced objects."""
        gc.collect()

    @staticmethod
    def log_memory_usage(step_name: str):
        """Log the current memory usage, tagged with the given step name."""
        memory_mb = MemoryMonitor.get_memory_usage_mb()
        logging.info(f"Memory usage after {step_name}: {memory_mb:.1f} MB")
-52
View File
@@ -1,52 +0,0 @@
import logging
from typing import Dict
import pandas as pd
class DatasetAnalyzer:
    """Analyze dataset statistics and quality."""

    def __init__(self, filepath: str):
        self.filepath = filepath
        # Loaded lazily by load_data(); None until a successful load.
        self.df = None

    def load_data(self) -> bool:
        """Load the dataset for analysis; return True on success, False on any error."""
        try:
            self.df = pd.read_csv(self.filepath)
        except Exception as e:
            # Best-effort load: log the failure and report it via the return value.
            logging.error(f"Failed to load {self.filepath}: {e}")
            return False
        return True

    def analyze_completion(self) -> Dict:
        """Analyze annotation completion status.

        Returns an empty dict when no data has been loaded; otherwise a dict
        of row counts and completion percentages.
        """
        if self.df is None:
            return {}

        total_rows = len(self.df)
        columns = self.df.columns

        # Annotation status: rows explicitly flagged 1 (done) vs 0 (pending).
        if "annotated" in columns:
            annotated_count = (self.df["annotated"] == 1).sum()
            unannotated_count = (self.df["annotated"] == 0).sum()
        else:
            # Without the flag column, treat every row as unannotated.
            annotated_count, unannotated_count = 0, total_rows

        # Name completeness: both identified name and surname present.
        complete_names = 0
        if "identified_name" in columns and "identified_surname" in columns:
            has_both = self.df["identified_name"].notna() & self.df["identified_surname"].notna()
            complete_names = has_both.sum()

        def as_pct(count):
            # Guard against division by zero on an empty dataset.
            return (count / total_rows * 100) if total_rows > 0 else 0

        return {
            "total_rows": total_rows,
            "annotated_rows": annotated_count,
            "unannotated_rows": unannotated_count,
            "annotation_percentage": as_pct(annotated_count),
            "complete_names": complete_names,
            "completeness_percentage": as_pct(complete_names),
        }
+7 -1
View File
@@ -19,7 +19,13 @@ class PipelineMonitor:
self.paths = paths
self.checkpoint_dir = paths.checkpoints_dir
self.steps = ["data_cleaning", "feature_extraction", "ner_annotation", "llm_annotation", "data_splitting"]
self.steps = [
"data_cleaning",
"feature_extraction",
"ner_annotation",
"llm_annotation",
"data_splitting",
]
def get_step_status(self, step_name: str) -> Dict:
"""Get status of a specific pipeline step"""
+23 -12
View File
@@ -13,10 +13,17 @@ class BaseNameFormatter(ABC):
"""
def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None):
self.connectors = connectors or ['wa', 'ya', 'ka', 'ba']
self.connectors = connectors or ["wa", "ya", "ka", "ba"]
self.additional_surnames = additional_surnames or [
'jean', 'paul', 'marie', 'joseph', 'pierre', 'claude',
'andre', 'michel', 'robert'
"jean",
"paul",
"marie",
"joseph",
"pierre",
"claude",
"andre",
"michel",
"robert",
]
@classmethod
@@ -26,7 +33,9 @@ class BaseNameFormatter(ABC):
return []
return native_str.strip().split()
def create_ner_tags(self, text: str, native_parts: List[str], surname: str) -> List[Tuple[int, int, str]]:
def create_ner_tags(
self, text: str, native_parts: List[str], surname: str
) -> List[Tuple[int, int, str]]:
"""Create NER entity tags for transformed text"""
entities = []
current_pos = 0
@@ -38,15 +47,15 @@ class BaseNameFormatter(ABC):
# Determine tag based on word content
if word in native_parts or any(connector in word for connector in self.connectors):
tag = 'NATIVE'
tag = "NATIVE"
elif word == surname or word in self.additional_surnames:
tag = 'SURNAME'
tag = "SURNAME"
else:
# Check if it's a compound native word or new surname
if any(part in word for part in native_parts):
tag = 'NATIVE'
tag = "NATIVE"
else:
tag = 'SURNAME'
tag = "SURNAME"
entities.append((start_pos, end_pos, tag))
current_pos = end_pos + 1 # +1 for space
@@ -54,15 +63,17 @@ class BaseNameFormatter(ABC):
return entities
@classmethod
def compute_derived_attributes(cls, name: str) -> Dict:
def compute_numeric_features(cls, name: str) -> Dict:
"""Compute all derived attributes for the transformed name"""
words_count = len(name.split()) if name else 0
length = len(name) if name else 0
return {
'words': words_count,
'length': length,
'identified_category': NameCategory.SIMPLE if words_count == 3 else NameCategory.COMPOSE,
"words": words_count,
"length": length,
"identified_category": (
NameCategory.SIMPLE.value if words_count == 3 else NameCategory.COMPOSE.value
),
}
@abstractmethod
+14 -12
View File
@@ -8,8 +8,8 @@ from processing.ner.formats import BaseNameFormatter
class ConnectorFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row['probable_native'])
surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
native_parts = self.parse_native_components(row["probable_native"])
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
connector = random.choice(self.connectors)
# Connect native parts with a random connector
@@ -17,20 +17,22 @@ class ConnectorFormatter(BaseNameFormatter):
connected_native = f" {connector} ".join(native_parts)
full_name = f"{connected_native} {surname}".strip()
else:
connected_native = f"{row['probable_native']} {connector} {row['probable_native']}".strip()
connected_native = (
f"{row['probable_native']} {connector} {row['probable_native']}".strip()
)
full_name = f"{connected_native} {surname}".strip()
return {
'name': full_name,
'probable_native': connected_native,
'identify_name': connected_native,
'probable_surname': surname,
'identify_surname': surname,
'ner_entities': str(self.create_ner_tags(full_name, native_parts, surname)),
'transformation_type': self.transformation_type,
**self.compute_derived_attributes(full_name)
"name": full_name,
"probable_native": connected_native,
"identified_name": connected_native,
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return 'connector_added'
return "connector_added"
@@ -8,8 +8,8 @@ from processing.ner.formats import BaseNameFormatter
class ExtendedSurnameFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row['probable_native'])
original_surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
native_parts = self.parse_native_components(row["probable_native"])
original_surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
# Add random additional surname
additional_surname = random.choice(self.additional_surnames)
@@ -17,16 +17,16 @@ class ExtendedSurnameFormatter(BaseNameFormatter):
full_name = f"{row['probable_native']} {combined_surname}".strip()
return {
'name': full_name,
'probable_native': row['probable_native'],
'identify_name': row['probable_native'],
'probable_surname': combined_surname,
'identity_surname': combined_surname,
'ner_entities': str(self.create_ner_tags(full_name, native_parts, combined_surname)),
'transformation_type': self.transformation_type,
**self.compute_derived_attributes(full_name)
"name": full_name,
"probable_native": row["probable_native"],
"identified_name": row["probable_native"],
"probable_surname": combined_surname,
"identified_surname": combined_surname,
"ner_entities": str(self.create_ner_tags(full_name, native_parts, combined_surname)),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return 'extended_surname'
return "extended_surname"
+11 -11
View File
@@ -7,22 +7,22 @@ from processing.ner.formats import BaseNameFormatter
class NativeOnlyFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row['probable_native'])
native_parts = self.parse_native_components(row["probable_native"])
# Only native components
full_name = row['probable_native']
full_name = row["probable_native"]
return {
'name': full_name,
'probable_native': row['probable_native'],
'identify_name': row['probable_native'],
'probable_surname': '',
'identify_surname': '',
'ner_entities': str(self.create_ner_tags(full_name, native_parts, '')),
'transformation_type': self.transformation_type,
**self.compute_derived_attributes(full_name)
"name": full_name,
"probable_native": row["probable_native"],
"identified_name": row["probable_native"],
"probable_surname": "",
"identified_surname": "",
"ner_entities": str(self.create_ner_tags(full_name, native_parts, "")),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return 'native_only'
return "native_only"
+11 -11
View File
@@ -7,23 +7,23 @@ from processing.ner.formats import BaseNameFormatter
class OriginalFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row['probable_native'])
surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
native_parts = self.parse_native_components(row["probable_native"])
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
# Keep original order: native components + surname
full_name = f"{row['probable_native']} {surname}".strip()
return {
'name': full_name,
'probable_native': row['probable_native'],
'identify_name': row['probable_native'],
'probable_surname': surname,
'identify_surname': surname,
'ner_entities': str(self.create_ner_tags(full_name, native_parts, surname)),
'transformation_type': self.transformation_type,
**self.compute_derived_attributes(full_name)
"name": full_name,
"probable_native": row["probable_native"],
"identified_name": row["probable_native"],
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return 'original'
return "original"
@@ -7,23 +7,23 @@ from processing.ner.formats import BaseNameFormatter
class PositionFlippedFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row['probable_native'])
surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
native_parts = self.parse_native_components(row["probable_native"])
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
# Flip order: surname + native components
full_name = f"{surname} {row['probable_native']}".strip()
return {
'name': full_name,
'probable_native': row['probable_native'],
'identify_name': row['probable_native'],
'probable_surname': surname,
'identify_surname': surname,
'ner_entities': str(self.create_ner_tags(full_name, native_parts, surname)),
'transformation_type': self.transformation_type,
**self.compute_derived_attributes(full_name)
"name": full_name,
"probable_native": row["probable_native"],
"identified_name": row["probable_native"],
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(self.create_ner_tags(full_name, native_parts, surname)),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return 'position_flipped'
return "position_flipped"
+12 -12
View File
@@ -7,24 +7,24 @@ from processing.ner.formats import BaseNameFormatter
class ReducedNativeFormatter(BaseNameFormatter):
def transform(self, row: pd.Series) -> Dict:
native_parts = self.parse_native_components(row['probable_native'])
surname = row['probable_surname'] if pd.notna(row['probable_surname']) else ''
native_parts = self.parse_native_components(row["probable_native"])
surname = row["probable_surname"] if pd.notna(row["probable_surname"]) else ""
# Keep only first native component + surname
reduced_native = native_parts[0] if len(native_parts) > 1 else row['probable_native']
reduced_native = native_parts[0] if len(native_parts) > 1 else row["probable_native"]
full_name = f"{reduced_native} {surname}".strip()
return {
'name': full_name,
'probable_native': reduced_native,
'identify_name': reduced_native,
'probable_surname': surname,
'identify_surname': surname,
'ner_entities': str(self.create_ner_tags(full_name, [reduced_native], surname)),
'transformation_type': self.transformation_type,
**self.compute_derived_attributes(full_name)
"name": full_name,
"probable_native": reduced_native,
"identified_name": reduced_native,
"probable_surname": surname,
"identified_surname": surname,
"ner_entities": str(self.create_ner_tags(full_name, [reduced_native], surname)),
"transformation_type": self.transformation_type,
**self.compute_numeric_features(full_name),
}
@property
def transformation_type(self) -> str:
return 'reduced_native'
return "reduced_native"
+122 -168
View File
@@ -10,189 +10,143 @@ from spacy.util import filter_spans
from core.config import PipelineConfig
from core.utils import get_data_file_path
from core.utils.data_loader import DataLoader
class NERDataBuilder:
def __init__(self, config: PipelineConfig):
self.config = config
self.data_loader = DataLoader(config)
@classmethod
def parse_entities(cls, entities_str):
"""Parse entity string (tuple format or JSON) into spaCy-style tuples."""
if not entities_str or entities_str in ["[]", "", "nan"]:
return []
@staticmethod
def _parse_entities(series: pd.Series) -> pd.Series:
"""Vectorized parse of entity strings."""
entities_str = str(entities_str).strip()
# Handle different formats
try:
# Try to parse as Python literal (tuples or lists)
if entities_str.startswith("[(") and entities_str.endswith(")]"):
# Standard tuple format: [(0, 6, 'NATIVE'), ...]
return ast.literal_eval(entities_str)
elif entities_str.startswith("[[") and entities_str.endswith("]]"):
# Nested list format: [[0, 6, 'NATIVE'], ...]
nested_list = ast.literal_eval(entities_str)
return [(start, end, label) for start, end, label in nested_list]
elif entities_str.startswith("[{") and entities_str.endswith("}]"):
# JSON format: [{"start": 0, "end": 6, "label": "NATIVE"}, ...]
json_entities = json.loads(entities_str)
return [(e["start"], e["end"], e["label"]) for e in json_entities]
else:
# Try general ast.literal_eval for other formats
parsed = ast.literal_eval(entities_str)
if isinstance(parsed, list):
# Convert any list format to tuples
result = []
for item in parsed:
if isinstance(item, (list, tuple)) and len(item) == 3:
result.append((item[0], item[1], item[2]))
return result
except (ValueError, SyntaxError, json.JSONDecodeError) as e:
logging.warning(f"Failed to parse entities: {entities_str} ({e})")
return []
logging.warning(f"Unknown entity format: {entities_str}")
return []
@classmethod
def validate_entities(cls, entities, text):
"""Validate and sort entity tuples, removing overlaps and invalid spans."""
if not entities or not text:
return []
text = str(text).strip()
if not text:
return []
# Filter out invalid entities
valid_entities = []
for entity in entities:
if not isinstance(entity, (list, tuple)) or len(entity) != 3:
logging.warning(f"Invalid entity format: {entity}")
continue
start, end, label = entity
# Ensure start/end are integers
def _parse(entities_str):
if not entities_str or entities_str in ["[]", "", "nan"]:
return []
entities_str = str(entities_str).strip()
try:
start = int(start)
end = int(end)
except (ValueError, TypeError):
logging.warning(f"Invalid start/end positions: {entity}")
continue
if entities_str.startswith("[(") and entities_str.endswith(")]"):
return ast.literal_eval(entities_str)
elif entities_str.startswith("[[") and entities_str.endswith("]]"):
return [tuple(e) for e in ast.literal_eval(entities_str)]
elif entities_str.startswith("[{") and entities_str.endswith("}]"):
return [(e["start"], e["end"], e["label"]) for e in json.loads(entities_str)]
else:
parsed = ast.literal_eval(entities_str)
return [
tuple(e) for e in parsed if isinstance(e, (list, tuple)) and len(e) == 3
]
except (ValueError, SyntaxError, json.JSONDecodeError):
return []
# Ensure label is string
if not isinstance(label, str):
logging.warning(f"Invalid label type: {entity}")
continue
return series.map(_parse)
# Check bounds
if not (0 <= start < end <= len(text)):
logging.warning(f"Entity span out of bounds: {entity} for text '{text}' (length {len(text)})")
continue
@staticmethod
def _validate_entities(texts: pd.Series, entities_series: pd.Series) -> pd.Series:
"""Vectorized entity validation."""
# Check that span contains actual text
span_text = text[start:end].strip()
if not span_text:
logging.warning(f"Empty span: {entity} in text '{text}'")
continue
valid_entities.append((start, end, label))
if not valid_entities:
return []
# Sort by start position
valid_entities.sort(key=lambda x: (x[0], x[1]))
# Remove overlapping entities (keep the first one)
filtered = []
for start, end, label in valid_entities:
# Check for overlap with already added entities
has_overlap = False
for e_start, e_end, _ in filtered:
if not (end <= e_start or start >= e_end):
has_overlap = True
logging.warning(
f"Removing overlapping entity ({start}, {end}, '{label}') "
f"conflicts with ({e_start}, {e_end}) in '{text}'"
)
break
if not has_overlap:
filtered.append((start, end, label))
return filtered
@classmethod
def create_doc(cls, text, entities, nlp):
"""Create a spaCy Doc object with entities added."""
doc = nlp(text)
ents = []
for start, end, label in entities:
span = doc.char_span(start, end, label=label, alignment_mode="contract") \
or doc.char_span(start, end, label=label, alignment_mode="strict")
if span:
ents.append(span)
else:
logging.warning(f"Could not create span ({start}, {end}, '{label}') in '{text}'")
doc.ents = filter_spans(ents) if ents else []
return doc
def build(self, data: pd.DataFrame = None) -> int:
"""Build the dataset for NER training."""
logging.info("Building dataset for NER training")
try:
df = pd.read_csv(get_data_file_path("names_featured.csv", self.config)) \
if data is None \
else data
ner_df = df[df["ner_tagged"] == 1].copy()
if ner_df.empty:
logging.error("No NER tagged data found in the CSV")
return 1
logging.info(f"Found {len(ner_df)} NER tagged entries")
nlp = spacy.blank("fr")
doc_bin, training_data = DocBin(), []
processed_count, skipped_count = 0, 0
for _, row in ner_df.iterrows():
text = str(row.get("name", "")).strip()
if not text:
def _validate(text, entities):
if not entities or not text:
return []
text = str(text).strip()
valid = []
for ent in entities:
if not isinstance(ent, (list, tuple)) or len(ent) != 3:
continue
entities = self.parse_entities(row.get("ner_entities", "[]"))
entities = self.validate_entities(entities, text)
training_data.append((text, {"entities": entities}))
start, end, label = ent
try:
doc_bin.add(self.create_doc(text, entities, nlp))
processed_count += 1
except Exception as e:
logging.error(f"Error processing '{text}': {e}")
skipped_count += 1
start, end = int(start), int(end)
except (ValueError, TypeError):
continue
if not isinstance(label, str):
continue
if not (0 <= start < end <= len(text)):
continue
if not text[start:end].strip():
continue
valid.append((start, end, label))
if not valid:
return []
valid.sort(key=lambda x: (x[0], x[1]))
# remove overlaps
filtered, last_end = [], -1
for s, e, l in valid:
if s >= last_end:
filtered.append((s, e, l))
last_end = e
return filtered
if not training_data:
logging.error("No valid training examples generated")
return 1
return pd.Series(map(_validate, texts, entities_series), index=texts.index)
json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]
@staticmethod
def _create_docs(nlp, texts, entities):
"""Batch create spaCy Docs."""
docs = []
for text, ents in zip(texts, entities):
doc = nlp(text)
spans = []
for start, end, label in ents:
span = doc.char_span(
start, end, label=label, alignment_mode="contract"
) or doc.char_span(start, end, label=label, alignment_mode="strict")
if span:
spans.append(span)
doc.ents = filter_spans(spans)
docs.append(doc)
return docs
with open(json_path, "w", encoding="utf-8") as f:
json.dump(training_data, f, ensure_ascii=False, indent=None)
doc_bin.to_disk(spacy_path)
def build(self) -> int:
input_filepath = get_data_file_path(
self.config.data.output_files["engineered"], self.config
)
df = self.data_loader.load_csv_complete(input_filepath)
df = df[["name", "ner_tagged", "ner_entities"]]
logging.info(f"Processed: {processed_count}, Skipped: {skipped_count}")
logging.info(f"Saved NER data in json format to {json_path}")
logging.info(f"Saved NER data in spaCy format to {spacy_path}")
return 0
except Exception as e:
logging.error(f"Failed to build NER dataset: {e}", exc_info=True)
# Filter early
ner_df = df.loc[df["ner_tagged"] == 1, ["name", "ner_entities"]]
if ner_df.empty:
logging.error("No NER tagged data found")
return 1
total_rows = len(df)
del df # No need to keep in memory
logging.info(f"Found {len(ner_df)} NER tagged entries")
nlp = spacy.blank("fr")
# Vectorized parsing + validation
parsed_entities = self._parse_entities(ner_df["ner_entities"])
validated_entities = self._validate_entities(ner_df["name"], parsed_entities)
# Drop rows with no valid entities
mask = validated_entities.map(bool)
ner_df = ner_df.loc[mask]
validated_entities = validated_entities.loc[mask]
if ner_df.empty:
logging.error("No valid training examples after validation")
return 1
# Prepare training data
training_data = list(
zip(ner_df["name"].tolist(), [{"entities": ents} for ents in validated_entities])
)
# Create spaCy DocBin in batch
docs = self._create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
doc_bin = DocBin(docs=docs)
# Save
json_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_data"]
spacy_path = Path(self.config.paths.data_dir) / self.config.data.output_files["ner_spacy"]
with open(json_path, "w", encoding="utf-8") as f:
json.dump(training_data, f, ensure_ascii=False, separators=(",", ":"))
doc_bin.to_disk(spacy_path)
logging.info(f"Processed: {len(training_data)}, Skipped: {total_rows - len(training_data)}")
logging.info(f"Saved NER JSON to {json_path}")
logging.info(f"Saved NER spacy to {spacy_path}")
return 0
+66 -53
View File
@@ -1,9 +1,14 @@
import random
from typing import List
import logging
import numpy as np
import pandas as pd
from tqdm import tqdm
from core.config import PipelineConfig
from core.utils import get_data_file_path
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
from processing.ner.formats.connectors_format import ConnectorFormatter
from processing.ner.formats.extended_surname_format import ExtendedSurnameFormatter
from processing.ner.formats.native_only_format import NativeOnlyFormatter
@@ -18,50 +23,64 @@ class NEREngineering:
and encourage sequence characteristic learning.
"""
def __init__(self, connectors: List[str] = None, additional_surnames: List[str] = None):
self.connectors = connectors or ['wa', 'ya', 'ka', 'ba', 'la']
self.additional_surnames = additional_surnames or [
'jean', 'paul', 'marie', 'joseph', 'pierre', 'claude',
'andre', 'michel', 'robert'
def __init__(self, config: PipelineConfig):
self.config = config
self.data_loader = DataLoader(config)
self.connectors = ["wa", "ya", "ka", "ba", "la"]
self.additional_surnames = [
"jean",
"paul",
"marie",
"joseph",
"pierre",
"claude",
"andre",
"michel",
"robert",
]
random.seed(self.config.data.random_seed)
np.random.seed(self.config.data.random_seed)
# Initialize format classes
self.formatters = {
'original': OriginalFormatter(self.connectors, self.additional_surnames),
'native_only': NativeOnlyFormatter(self.connectors, self.additional_surnames),
'position_flipped': PositionFlippedFormatter(self.connectors, self.additional_surnames),
'reduced_native': ReducedNativeFormatter(self.connectors, self.additional_surnames),
'connector_added': ConnectorFormatter(self.connectors, self.additional_surnames),
'extended_surname': ExtendedSurnameFormatter(self.connectors, self.additional_surnames)
"original": OriginalFormatter(self.connectors, self.additional_surnames),
"native_only": NativeOnlyFormatter(self.connectors, self.additional_surnames),
"position_flipped": PositionFlippedFormatter(self.connectors, self.additional_surnames),
"reduced_native": ReducedNativeFormatter(self.connectors, self.additional_surnames),
"connector_added": ConnectorFormatter(self.connectors, self.additional_surnames),
"extended_surname": ExtendedSurnameFormatter(self.connectors, self.additional_surnames),
}
@classmethod
def load_ner_data(cls, filepath: str) -> pd.DataFrame:
def load_data(self) -> pd.DataFrame:
"""Load and filter NER-tagged data from CSV file"""
df = pd.read_csv(filepath)
filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
df = self.data_loader.load_csv_complete(filepath)
# Filter only NER-tagged rows
ner_data = df[df['ner_tagged'] == 1].copy()
print(f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records")
ner_data = df[df["ner_tagged"] == 1].copy()
logging.info(f"Loaded {len(ner_data)} NER-tagged records from {len(df)} total records")
return ner_data
def engineer_dataset(self, df: pd.DataFrame, random_seed: int = 42) -> pd.DataFrame:
"""
Apply feature engineering transformations according to the specified rules:
- First 25%: original format
- Second 25%: remove surname
- Third 25%: flip positions
- Fourth 10%: reduce native components
- Fifth 10%: add connectors
- Last 5%: extend surnames
"""
random.seed(random_seed)
np.random.seed(random_seed)
def compute(self) -> None:
logging.info("Applying feature engineering transformations...")
input_filepath = get_data_file_path(self.config.data.output_files["featured"], self.config)
output_filepath = get_data_file_path(
self.config.data.output_files["engineered"], self.config
)
# Shuffle the dataset
df_shuffled = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
total_rows = len(df_shuffled)
df = self.data_loader.load_csv_complete(input_filepath)
ner_df = df[df["ner_tagged"] == 1].copy()
logging.info(f"Loaded {len(ner_df)} NER-tagged records from {len(df)} total records")
del df # No need to keep in memory
ner_df = ner_df.sample(frac=1, random_state=self.config.data.random_seed).reset_index(
drop=True
)
total_rows = len(ner_df)
# Calculate split points
split_25_1 = int(total_rows * 0.25)
@@ -71,37 +90,31 @@ class NEREngineering:
split_10_2 = int(total_rows * 0.95)
# Define transformation groups
transformation_groups = [
(0, split_25_1, 'original'),
(split_25_1, split_25_2, 'native_only'),
(split_25_2, split_25_3, 'position_flipped'),
(split_25_3, split_10_1, 'reduced_native'),
(split_10_1, split_10_2, 'connector_added'),
(split_10_2, total_rows, 'extended_surname')
groups = [
(0, split_25_1, "original"), # First 25%: original format
(split_25_1, split_25_2, "native_only"), # Second 25%: remove surname
(split_25_2, split_25_3, "position_flipped"), # Third 25%: flip positions
(split_25_3, split_10_1, "reduced_native"), # Fourth 10%: reduce native components
(split_10_1, split_10_2, "connector_added"), # Fifth 10%: add connectors
(split_10_2, total_rows, "extended_surname"), # Last 5%: extend surnames
]
print("Dataset splits:")
for start, end, trans_type in transformation_groups:
print(f"Group {trans_type}: {start} to {end} ({end - start} rows)")
for start, end, trans_type in groups:
logging.info(f"Group {trans_type}: {start} to {end} ({end - start} rows)")
# Process each group
engineered_rows = []
for start, end, formatter_key in transformation_groups:
rows = []
for start, end, formatter_key in groups:
formatter = self.formatters[formatter_key]
for idx in range(start, end):
row = df_shuffled.iloc[idx]
for idx in tqdm(range(start, end), desc=f"Processing {formatter_key}"):
row = ner_df.iloc[idx]
transformed = formatter.transform(row)
# Keep original columns and add transformed ones
new_row = row.to_dict()
new_row.update(transformed)
engineered_rows.append(new_row)
rows.append(new_row)
return pd.DataFrame(engineered_rows)
@classmethod
def save_engineered_dataset(cls, df: pd.DataFrame, output_path: str):
"""Save the engineered dataset to CSV file"""
df.to_csv(output_path, index=False)
print(f"Engineered dataset saved to {output_path}")
self.data_loader.save_csv(pd.DataFrame(rows), output_filepath)
logging.info(f"Engineered dataset saved to {output_filepath}")
+69 -45
View File
@@ -48,7 +48,7 @@ class NERNameModel:
logging.info(f"Loading training data from {data_path}")
with open(data_path, 'r', encoding='utf-8') as f:
with open(data_path, "r", encoding="utf-8") as f:
raw_data = json.load(f)
# Validate and clean training data
@@ -58,7 +58,9 @@ class NERNameModel:
for i, item in enumerate(raw_data):
try:
if not isinstance(item, (list, tuple)) or len(item) != 2:
logging.warning(f"Skipping invalid training example format at index {i}: {item}")
logging.warning(
f"Skipping invalid training example format at index {i}: {item}"
)
skipped_count += 1
continue
@@ -83,20 +85,27 @@ class NERNameModel:
# String format from tagger: "[(0, 6, 'NATIVE'), ...]"
try:
import ast
entities = ast.literal_eval(entities_raw)
if not isinstance(entities, list):
logging.warning(f"Parsed entities is not a list at index {i}: {entities}")
logging.warning(
f"Parsed entities is not a list at index {i}: {entities}"
)
skipped_count += 1
continue
except (ValueError, SyntaxError) as e:
logging.warning(f"Failed to parse entity string at index {i}: {entities_raw} ({e})")
logging.warning(
f"Failed to parse entity string at index {i}: {entities_raw} ({e})"
)
skipped_count += 1
continue
elif isinstance(entities_raw, list):
# Already in list format
entities = entities_raw
else:
logging.warning(f"Skipping invalid entities format at index {i}: {entities_raw}")
logging.warning(
f"Skipping invalid entities format at index {i}: {entities_raw}"
)
skipped_count += 1
continue
@@ -110,16 +119,20 @@ class NERNameModel:
start, end, label = entity
# Validate entity components
if (not isinstance(start, int) or not isinstance(end, int) or
not isinstance(label, str) or start >= end or
start < 0 or end > len(text)):
if (
not isinstance(start, int)
or not isinstance(end, int)
or not isinstance(label, str)
or start >= end
or start < 0
or end > len(text)
):
logging.warning(f"Skipping invalid entity bounds in '{text}': {entity}")
continue
# Check for overlaps with already validated entities
has_overlap = any(
start < v_end and end > v_start
for v_start, v_end, _ in valid_entities
start < v_end and end > v_start for v_start, v_end, _ in valid_entities
)
if has_overlap:
@@ -128,8 +141,10 @@ class NERNameModel:
# Validate that the span doesn't contain spaces (matching tagger validation)
span_text = text[start:end]
if not span_text or span_text != span_text.strip() or ' ' in span_text:
logging.warning(f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'")
if not span_text or span_text != span_text.strip() or " " in span_text:
logging.warning(
f"Skipping entity with spaces in '{text}': {entity} -> '{span_text}'"
)
continue
valid_entities.append((start, end, label))
@@ -148,7 +163,9 @@ class NERNameModel:
skipped_count += 1
continue
logging.info(f"Loaded {len(valid_data)} valid training examples, skipped {skipped_count} invalid ones")
logging.info(
f"Loaded {len(valid_data)} valid training examples, skipped {skipped_count} invalid ones"
)
if not valid_data:
raise ValueError("No valid training examples found in the data")
@@ -156,15 +173,17 @@ class NERNameModel:
return valid_data
def train(
self,
data: List[Tuple[str, Dict]],
epochs: int = 5,
batch_size: int = 16,
dropout_rate: float = 0.2,
self,
data: List[Tuple[str, Dict]],
epochs: int = 5,
batch_size: int = 16,
dropout_rate: float = 0.2,
) -> None:
"""Train the NER model"""
logging.info(f"Starting NER training with {len(data)} examples")
logging.info(f"Training parameters: epochs={epochs}, batch_size={batch_size}, dropout={dropout_rate}")
logging.info(
f"Training parameters: epochs={epochs}, batch_size={batch_size}, dropout={dropout_rate}"
)
if self.nlp is None:
raise ValueError("Model not initialized. Call create_blank_model() first.")
@@ -184,16 +203,15 @@ class NERNameModel:
doc = self.nlp.make_doc(text)
example = Example.from_dict(doc, annotations)
examples.append(example)
logging.info(f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}")
logging.info(
f"Training example: {text[:30]}... with entities {annotations.get('entities', [])}"
)
# Train in batches
batches = minibatch(examples, size=batch_size)
for batch in batches:
self.nlp.update(
batch,
losses=losses,
drop=dropout_rate,
sgd=self.nlp.create_optimizer()
batch, losses=losses, drop=dropout_rate, sgd=self.nlp.create_optimizer()
)
logging.info(f"Training batch with {len(batch)} examples, current losses: {losses}")
@@ -208,7 +226,7 @@ class NERNameModel:
"training_examples": len(data),
"loss_history": losses_history,
"batch_size": batch_size,
"dropout_rate": dropout_rate
"dropout_rate": dropout_rate,
}
logging.info(f"Training completed. Final loss: {self.training_stats['final_loss']:.4f}")
@@ -225,7 +243,10 @@ class NERNameModel:
predicted_entities = 0
actual_entities = 0
entity_stats = {"NATIVE": {"tp": 0, "fp": 0, "fn": 0}, "SURNAME": {"tp": 0, "fp": 0, "fn": 0}}
entity_stats = {
"NATIVE": {"tp": 0, "fp": 0, "fn": 0},
"SURNAME": {"tp": 0, "fp": 0, "fn": 0},
}
for text, annotations in test_data:
# Get actual entities
@@ -259,7 +280,9 @@ class NERNameModel:
# Calculate overall metrics
precision = correct_entities / predicted_entities if predicted_entities > 0 else 0
recall = correct_entities / actual_entities if actual_entities > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
f1_score = (
2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
)
# Calculate per-label metrics
label_metrics = {}
@@ -268,14 +291,16 @@ class NERNameModel:
label_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
label_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
label_f1 = (
2 * (label_precision * label_recall) / (label_precision + label_recall)) \
if (label_precision + label_recall) > 0 else 0
(2 * (label_precision * label_recall) / (label_precision + label_recall))
if (label_precision + label_recall) > 0
else 0
)
label_metrics[label] = {
"precision": label_precision,
"recall": label_recall,
"f1_score": label_f1,
"support": tp + fn
"support": tp + fn,
}
evaluation_results = {
@@ -286,9 +311,9 @@ class NERNameModel:
"total_examples": total_examples,
"correct_entities": correct_entities,
"predicted_entities": predicted_entities,
"actual_entities": actual_entities
"actual_entities": actual_entities,
},
"by_label": label_metrics
"by_label": label_metrics,
}
logging.info(f"NER Evaluation completed. Overall F1: {f1_score:.4f}")
@@ -309,7 +334,7 @@ class NERNameModel:
# Save training statistics
stats_path = model_dir / "training_stats.json"
with open(stats_path, 'w', encoding='utf-8') as f:
with open(stats_path, "w", encoding="utf-8") as f:
json.dump(self.training_stats, f, indent=2)
logging.info(f"NER Model saved to {model_dir}")
@@ -328,7 +353,7 @@ class NERNameModel:
# Load training statistics if available
stats_path = Path(model_path) / "training_stats.json"
if stats_path.exists():
with open(stats_path, 'r', encoding='utf-8') as f:
with open(stats_path, "r", encoding="utf-8") as f:
self.training_stats = json.load(f)
logging.info("NER Model loaded successfully")
@@ -342,15 +367,14 @@ class NERNameModel:
entities = []
for ent in doc.ents:
entities.append({
"text": ent.text,
"label": ent.label_,
"start": ent.start_char,
"end": ent.end_char,
"confidence": getattr(ent, 'score', None) # If confidence scores are available
})
entities.append(
{
"text": ent.text,
"label": ent.label_,
"start": ent.start_char,
"end": ent.end_char,
"confidence": getattr(ent, "score", None), # If confidence scores are available
}
)
return {
"text": text,
"entities": entities
}
return {"text": text, "entities": entities}
+25 -13
View File
@@ -3,7 +3,9 @@ import logging
class NERNameTagger:
def tag_name(self, name: str, probable_native: str, probable_surname: str) -> Union[Dict[str, Any], None]:
def tag_name(
self, name: str, probable_native: str, probable_surname: str
) -> Union[Dict[str, Any], None]:
"""Create a single NER training example using probable_native and probable_surname"""
if not name or not probable_native or not probable_surname:
return None
@@ -56,9 +58,10 @@ class NERNameTagger:
continue
# Check if this is a word boundary match and doesn't overlap
if (self._is_word_boundary_match(name, pos, end_pos) and
not has_overlap(pos, end_pos)):
entities.append((pos, end_pos, 'NATIVE'))
if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
pos, end_pos
):
entities.append((pos, end_pos, "NATIVE"))
used_spans.append((pos, end_pos))
break # Only take the first non-overlapping occurrence
@@ -84,16 +87,19 @@ class NERNameTagger:
start_pos = pos + 1
continue
if (self._is_word_boundary_match(name, pos, end_pos) and
not has_overlap(pos, end_pos)):
entities.append((pos, end_pos, 'SURNAME'))
if self._is_word_boundary_match(name, pos, end_pos) and not has_overlap(
pos, end_pos
):
entities.append((pos, end_pos, "SURNAME"))
used_spans.append((pos, end_pos))
break
start_pos = pos + 1
if not entities:
logging.warning(f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'")
logging.warning(
f"No valid entities found for name: '{name}' with native: '{probable_native}' and surname: '{probable_surname}'"
)
return None
# Sort entities by position and validate
@@ -104,7 +110,9 @@ class NERNameTagger:
for start, end, label in entities:
# Check bounds
if not (0 <= start < end <= len(name)):
logging.warning(f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'")
logging.warning(
f"Invalid span bounds ({start}, {end}) for text length {len(name)}: '{name}'"
)
continue
# Check for overlaps with already validated entities
@@ -114,8 +122,10 @@ class NERNameTagger:
# CRITICAL VALIDATION: Check that the span contains only the expected word (no spaces)
span_text = name[start:end]
if not span_text or span_text != span_text.strip() or ' ' in span_text:
logging.warning(f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'")
if not span_text or span_text != span_text.strip() or " " in span_text:
logging.warning(
f"Span contains spaces or is empty ({start}, {end}) in '{name}': '{span_text}'"
)
continue
validated_entities.append((start, end, label))
@@ -129,7 +139,7 @@ class NERNameTagger:
return {
"entities": entities_str,
"spans": validated_entities # Keep the original tuples for internal use
"spans": validated_entities, # Keep the original tuples for internal use
}
@classmethod
@@ -154,6 +164,7 @@ class NERNameTagger:
"""Validate that entity annotations are correct for a given name"""
try:
import ast
entities = ast.literal_eval(entities_str)
# Check for overlaps and valid bounds
@@ -182,10 +193,11 @@ class NERNameTagger:
@classmethod
def extract_entity_text(cls, name: str, entities_str: str) -> Dict[str, List[str]]:
"""Extract the actual text for each entity type"""
result = {'NATIVE': [], 'SURNAME': []}
result = {"NATIVE": [], "SURNAME": []}
try:
import ast
entities = ast.literal_eval(entities_str)
for start, end, label in entities:
+10 -3
View File
@@ -9,6 +9,7 @@ import pandas as pd
from pydantic import BaseModel
from core.config.pipeline_config import PipelineConfig
from core.utils.data_loader import OPTIMIZED_DTYPES, DataLoader
from processing.batch.batch_config import BatchConfig
@@ -37,10 +38,11 @@ class PipelineStep(ABC):
"""Abstract base class for pipeline steps"""
def __init__(
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
self, name: str, pipeline_config: PipelineConfig, batch_config: Optional[BatchConfig] = None
):
self.name = name
self.pipeline_config = pipeline_config
self.data_loader = DataLoader(pipeline_config)
# Use provided batch_config or create default from pipeline config
if batch_config is None:
@@ -53,6 +55,11 @@ class PipelineStep(ABC):
self.batch_config = batch_config
self.state = PipelineState()
@property
def requires_batch_mutation(self) -> bool:
"""Indicates if this step modifies the batch data"""
return False
@abstractmethod
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process a single batch of data"""
@@ -108,12 +115,12 @@ class PipelineStep(ABC):
def save_batch(self, batch: pd.DataFrame, batch_id: int):
"""Save processed batch to checkpoint"""
checkpoint_path = self.get_checkpoint_path(batch_id)
batch.to_csv(checkpoint_path, index=False)
self.data_loader.save_csv(batch, checkpoint_path)
logging.info(f"Saved batch {batch_id} to {checkpoint_path}")
def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
"""Load processed batch from checkpoint"""
checkpoint_path = self.get_checkpoint_path(batch_id)
if os.path.exists(checkpoint_path):
return pd.read_csv(checkpoint_path)
return self.data_loader.load_csv_complete(checkpoint_path)
return None
+11 -8
View File
@@ -2,11 +2,10 @@ import numpy as np
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from processing.steps.feature_extraction_step import Gender
from core.utils.data_loader import DataLoader
from core.utils.region_mapper import RegionMapper
from processing.batch.batch_config import BatchConfig
from processing.steps import PipelineStep
from processing.steps.feature_extraction_step import Gender
class DataSplittingStep(PipelineStep):
@@ -20,7 +19,6 @@ class DataSplittingStep(PipelineStep):
use_multiprocessing=False,
)
super().__init__("data_splitting", pipeline_config, batch_config)
self.data_loader = DataLoader(pipeline_config)
self.eval_indices = None
def determine_eval_indices(self, total_size: int) -> set:
@@ -33,9 +31,9 @@ class DataSplittingStep(PipelineStep):
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Process batch for data splitting - no modification needed"""
return batch.copy()
return batch
def save_splits(self, df: pd.DataFrame) -> None:
def split(self, df: pd.DataFrame) -> None:
"""Save the split datasets based on configuration"""
output_files = self.pipeline_config.data.output_files
data_dir = self.pipeline_config.paths.data_dir
@@ -52,9 +50,14 @@ class DataSplittingStep(PipelineStep):
else:
self.data_loader.save_csv(df, data_dir / output_files["featured"])
if self.pipeline_config.data.split_by_province:
for province in RegionMapper.get_provinces():
df_region = df[df.province == province]
self.data_loader.save_csv(df_region, data_dir / "provinces" / f"{province}.csv")
if self.pipeline_config.data.split_by_gender:
df_males = df[df["sex"] == Gender.MALE.value]
df_females = df[df["sex"] == Gender.FEMALE.value]
df_males = df[df.sex == Gender.MALE.value]
df_females = df[df.sex == Gender.FEMALE.value]
self.data_loader.save_csv(df_males, data_dir / output_files["males"])
self.data_loader.save_csv(df_females, data_dir / output_files["females"])
+131 -48
View File
@@ -1,5 +1,7 @@
import gc
import logging
from enum import Enum
from typing import Dict, Any
import pandas as pd
@@ -27,10 +29,15 @@ class FeatureExtractionStep(PipelineStep):
self.region_mapper = RegionMapper()
self.name_tagger = NERNameTagger()
@classmethod
def requires_batch_mutation(cls) -> bool:
"""This step creates new columns, so mutation is required"""
return True
@classmethod
def validate_gender(cls, gender: str) -> Gender:
"""Validate and normalize gender value"""
gender_lower = gender.lower().strip()
gender_lower = str(gender).lower().strip()
if gender_lower in ["m", "male", "homme", "masculin"]:
return Gender.MALE
elif gender_lower in ["f", "female", "femme", "féminin"]:
@@ -41,68 +48,144 @@ class FeatureExtractionStep(PipelineStep):
@classmethod
def get_name_category(cls, word_count: int) -> NameCategory:
"""Determine name category based on word count"""
if word_count == 3:
return NameCategory.SIMPLE
else:
return NameCategory.COMPOSE
return NameCategory.SIMPLE if word_count == 3 else NameCategory.COMPOSE
def process_batch(self, batch: pd.DataFrame, batch_id: int) -> pd.DataFrame:
"""Extract features from names in batch"""
logging.info(f"Extracting features for batch {batch_id} with {len(batch)} rows")
batch = batch.copy()
result = batch.copy()
numeric_features = self._compute_numeric_features(result["name"])
result = result.assign(**numeric_features)
# Basic features
batch["words"] = batch["name"].str.count(" ") + 1
batch["length"] = batch["name"].str.len()
# Initialize features columns with optimal dtypes
features_columns = self._initialize_features_columns(len(result))
result = result.assign(**features_columns)
# Handle year column
if "year" in batch.columns:
batch["year"] = pd.to_numeric(batch["year"], errors="coerce").astype("Int64")
self._assign_probable_names(result)
self._process_simple_names(result)
result["identified_category"] = self._assign_identified_category(result["words"])
# Initialize new columns
batch["probable_native"] = None
batch["probable_surname"] = None
batch["identified_name"] = None
batch["identified_surname"] = None
batch["ner_entities"] = None
batch["ner_tagged"] = 0
batch["annotated"] = 0
if "year" in result.columns:
result["year"] = pd.to_numeric(result["year"], errors="coerce").astype("Int16")
# Vectorized category assignment
batch["identified_category"] = batch["words"].apply(
lambda x: self.get_name_category(x).value
if "region" in result.columns:
result["province"] = self.region_mapper.map(result["region"])
result["province"] = result["province"].astype("category")
if "sex" in result.columns:
result["sex"] = self._normalize_gender(result["sex"])
# Apply final dtype optimizations
result = self._optimize_dtypes(result)
# Cleanup
del numeric_features, features_columns
if batch_id % 10 == 0: # Periodic cleanup
gc.collect()
return result
@classmethod
def _compute_numeric_features(cls, series: pd.Series) -> Dict[str, pd.Series]:
"""Calculate basic features in vectorized manner"""
return {
"words": (series.str.count(" ") + 1).astype("Int8"),
"length": series.str.len().astype("Int16"),
}
@classmethod
def _initialize_features_columns(cls, size: int) -> Dict[str, Any]:
"""Initialize new columns with optimal dtypes"""
return {
"probable_native": pd.Series([None] * size, dtype="string"),
"probable_surname": pd.Series([None] * size, dtype="string"),
"identified_name": pd.Series([None] * size, dtype="string"),
"identified_surname": pd.Series([None] * size, dtype="string"),
"ner_entities": pd.Series([None] * size, dtype="string"),
"ner_tagged": pd.Series([0] * size, dtype="Int8"),
"annotated": pd.Series([0] * size, dtype="Int8"),
}
@classmethod
def _assign_probable_names(cls, df: pd.DataFrame) -> None:
"""Assign probable native and surname names efficiently"""
name_splits = df["name"].str.split()
mask = name_splits.str.len() >= 2
df.loc[mask, "probable_native"] = name_splits[mask].apply(
lambda x: " ".join(x[:-1]) if isinstance(x, list) else None
)
df.loc[mask, "probable_surname"] = name_splits[mask].apply(
lambda x: x[-1] if isinstance(x, list) else None
)
# Assign probable_native and probable_surname for all names
name_splits = batch["name"].str.split()
batch["probable_native"] = name_splits.apply(
lambda x: " ".join(x[:-1]) if isinstance(x, list) and len(x) >= 2 else None
)
batch["probable_surname"] = name_splits.apply(
lambda x: x[-1] if isinstance(x, list) and len(x) >= 2 else None
)
def _assign_identified_category(self, series: pd.Series) -> pd.Series:
"""Assign identified category based on word count"""
return series.map(lambda x: self.get_name_category(x).value).astype("category")
# Auto-assign for 3-word names
three_word_mask = batch["words"] == 3
batch.loc[three_word_mask, "identified_name"] = batch.loc[three_word_mask, "probable_native"]
batch.loc[three_word_mask, "identified_surname"] = batch.loc[three_word_mask, "probable_surname"]
batch.loc[three_word_mask, "annotated"] = 1
def _process_simple_names(self, df: pd.DataFrame) -> None:
"""Process 3-word names efficiently with vectorized operations"""
mask = df["words"] == 3
# Tag names with NER entities
three_word_rows = batch[three_word_mask]
if not mask.any():
return
df.loc[mask, "identified_name"] = df.loc[mask, "probable_native"]
df.loc[mask, "identified_surname"] = df.loc[mask, "probable_surname"]
df.loc[mask, "annotated"] = 1
# NER tagging for 3-word names
three_word_rows = df[mask]
for idx, row in three_word_rows.iterrows():
entity = self.name_tagger.tag_name(row['name'], row['identified_name'], row['identified_surname'])
try:
entity = self.name_tagger.tag_name(
row["name"], row["identified_name"], row["identified_surname"]
)
if entity:
batch.at[idx, "ner_entities"] = entity["entities"]
batch.at[idx, "ner_tagged"] = 1
if entity:
df.at[idx, "ner_entities"] = str(entity["entities"])
df.at[idx, "ner_tagged"] = 1
except Exception as e:
logging.warning(f"NER tagging failed for row {idx}: {e}")
# Map regions to provinces
batch["province"] = self.region_mapper.map_regions_vectorized(batch["region"])
def _normalize_gender(self, series: pd.Series) -> pd.Series:
gender_mapping = {
"m": "m",
"male": "m",
"homme": "m",
"masculin": "m",
"f": "f",
"female": "f",
"femme": "f",
"féminin": "f",
}
# Normalize gender
if "sex" in batch.columns:
batch["sex"] = batch["sex"].apply(lambda x: self.validate_gender(str(x)).value)
# Apply mapping with error handling
normalized = series.astype(str).str.lower().str.strip().map(gender_mapping)
return normalized.astype("category")
return batch
@classmethod
def _optimize_dtypes(cls, df: pd.DataFrame) -> pd.DataFrame:
categories = ["province", "identified_category", "sex"]
for col in categories:
if col in df.columns and df[col].dtype != "category":
df[col] = df[col].astype("category")
# Ensure string columns are proper string dtype
string_cols = [
"name",
"probable_native",
"probable_surname",
"identified_name",
"identified_surname",
"ner_entities",
]
for col in string_cols:
if col in df.columns and df[col].dtype == "object":
df[col] = df[col].astype("string")
return df
+5 -4
View File
@@ -24,8 +24,7 @@ class LLMAnnotationStep(PipelineStep):
batch_config = BatchConfig(
batch_size=pipeline_config.processing.batch_size,
max_workers=min(
self.llm_config.max_concurrent_requests,
pipeline_config.processing.max_workers
self.llm_config.max_concurrent_requests, pipeline_config.processing.max_workers
),
checkpoint_interval=pipeline_config.processing.checkpoint_interval,
use_multiprocessing=pipeline_config.processing.use_multiprocessing,
@@ -98,7 +97,7 @@ class LLMAnnotationStep(PipelineStep):
# Exponential backoff with jitter
if attempt < self.llm_config.retry_attempts - 1:
wait_time = (2 ** attempt) + (time.time() % 1)
wait_time = (2**attempt) + (time.time() % 1)
time.sleep(min(wait_time, 10))
self.failed_requests += 1
@@ -156,6 +155,8 @@ class LLMAnnotationStep(PipelineStep):
batch.loc[idx, "annotated"] = 0
# Ensure proper data types
batch["annotated"] = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
batch["annotated"] = (
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
)
return batch
+10 -8
View File
@@ -6,8 +6,8 @@ from typing import Dict
import pandas as pd
from core.config.pipeline_config import PipelineConfig
from processing.steps import PipelineStep, NameAnnotation
from processing.ner.ner_name_model import NERNameModel
from processing.steps import PipelineStep, NameAnnotation
class NERAnnotationStep(PipelineStep):
@@ -63,7 +63,7 @@ class NERAnnotationStep(PipelineStep):
# Get NER predictions
prediction = self.ner_trainer.predict(name.lower())
entities = prediction.get('entities', [])
entities = prediction.get("entities", [])
elapsed_time = time.time() - start_time
@@ -72,15 +72,15 @@ class NERAnnotationStep(PipelineStep):
surname_parts = []
for entity in entities:
if entity['label'] == 'NATIVE':
native_parts.append(entity['text'])
elif entity['label'] == 'SURNAME':
surname_parts.append(entity['text'])
if entity["label"] == "NATIVE":
native_parts.append(entity["text"])
elif entity["label"] == "SURNAME":
surname_parts.append(entity["text"])
# Create annotation result in same format as LLM step
annotation = NameAnnotation(
identified_name=" ".join(native_parts) if native_parts else None,
identified_surname=" ".join(surname_parts) if surname_parts else None
identified_surname=" ".join(surname_parts) if surname_parts else None,
)
result = {
@@ -159,6 +159,8 @@ class NERAnnotationStep(PipelineStep):
batch.loc[idx, "annotated"] = 0
# Ensure proper data types
batch["annotated"] = pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
batch["annotated"] = (
pd.to_numeric(batch["annotated"], errors="coerce").fillna(0).astype("Int8")
)
return batch