feat: support GPU

This commit is contained in:
2025-09-29 21:07:23 +02:00
parent 9e35f95107
commit a1d500830b
15 changed files with 661 additions and 85 deletions
+9
View File
@@ -29,6 +29,15 @@ class NameModel:
"""Create a blank spaCy model with NER pipeline"""
logging.info(f"Creating blank {language} model for NER training")
# Prefer GPU for spaCy if available (falls back to CPU automatically)
try:
if spacy.prefer_gpu():
logging.info("spaCy GPU enabled (cupy) for NER training")
else:
logging.info("spaCy running on CPU")
except Exception as e:
logging.debug(f"spaCy GPU selection skipped: {e}")
# Create blank model - French tokenizer works well for DRC names
self.nlp = spacy.blank(language)
+6 -2
View File
@@ -20,11 +20,15 @@ class DataSelectionStep(PipelineStep):
# Remove rows where region == "global" only for specific years
if "region" in batch.columns and "year" in batch.columns:
target_years = {2015, 2021, 2022}
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(target_years)
mask_remove = batch["region"].str.lower().eq("global") & batch["year"].isin(
target_years
)
removed = int(mask_remove.sum())
if removed:
batch = batch[~mask_remove]
logging.info(f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}")
logging.info(
f"Removed {removed} rows with region == 'global' for years {sorted(target_years)} in batch {batch_id}"
)
# Check which columns exist in the batch
available_columns = [col for col in self.selected_columns if col in batch.columns]