feat: add osm data

This commit is contained in:
2025-09-21 16:23:44 +02:00
parent 63e23d6600
commit c1b502c878
39 changed files with 955 additions and 338 deletions
+1
View File
@@ -11,6 +11,7 @@ processing:
# Pipeline stages
stages:
- "data_cleaning"
- "data_selection"
- "feature_extraction"
#- "ner_annotation"
#- "llm_annotation"
+6 -5
View File
@@ -3,17 +3,18 @@ debug: false
# Processing settings
processing:
batch_size: 10_000
max_workers: 8
batch_size: 100_000
max_workers: 4
checkpoint_interval: 10
use_multiprocessing: true
# Pipeline stages
stages:
- "data_cleaning"
- "data_selection"
- "feature_extraction"
- "ner_annotation"
- "llm_annotation"
# - "ner_annotation"
# - "llm_annotation"
- "data_splitting"
# Production LLM settings
@@ -34,7 +35,7 @@ data:
# Production logging (less verbose)
logging:
level: "INFO"
console_logging: false
console_logging: true
file_logging: true
log_file: "pipeline.production.log"
max_log_size: 52428800 # 50MB
+6
View File
@@ -21,6 +21,7 @@ paths:
# List of stages in the processing pipeline
stages:
- "data_cleaning" # Data cleaning stage
- "data_selection" # Data selection stage - keep only required columns
- "feature_extraction" # Feature extraction stage
- "ner_annotation" # NER-based annotation stage
- "llm_annotation" # LLM annotation stage (computationally intensive)
@@ -64,6 +65,11 @@ data:
females: "names_females.csv" # Output files for female names
ner_data: "names_ner.json" # Output file for NER annotated data
ner_spacy: "names_ner.spacy" # Output file for NER annotated data using spaCy format
selected_columns: # Required columns for processing
- name
- sex
- region
- year
split_evaluation: false # Should the dataset be split into training and evaluation sets?
split_by_gender: true # Should the dataset be split by gender?
split_by_province: true # Should the dataset be split by province?