feat: add osm data

This commit is contained in:
2025-09-21 16:23:44 +02:00
parent 63e23d6600
commit c1b502c878
39 changed files with 955 additions and 338 deletions
+6
View File
@@ -21,6 +21,7 @@ paths:
# List of stages in the processing pipeline
stages:
- "data_cleaning" # Data cleaning stage
- "data_selection" # Data selection stage - keep only required columns
- "feature_extraction" # Feature extraction stage
- "ner_annotation" # NER-based annotation stage
- "llm_annotation" # LLM annotation stage (computationally intensive)
@@ -64,6 +65,11 @@ data:
females: "names_females.csv" # Output files for female names
ner_data: "names_ner.json" # Output file for NER annotated data
ner_spacy: "names_ner.spacy" # Output file for NER annotated data using spaCy format
selected_columns: # Required columns for processing
- name
- sex
- region
- year
split_evaluation: false # Should the dataset be split into training and evaluation sets?
split_by_gender: true # Should the dataset be split by gender?
split_by_province: true # Should the dataset be split by province?