feat: add osm data

2025-09-21 16:23:44 +02:00
parent 63e23d6600
commit c1b502c878
39 changed files with 955 additions and 338 deletions
@@ -11,6 +11,7 @@ processing:
 # Pipeline stages
 stages:
  - "data_cleaning"
+  - "data_selection"
  - "feature_extraction"
  #- "ner_annotation"
  #- "llm_annotation"
@@ -3,17 +3,18 @@ debug: false

 # Processing settings
 processing:
-  batch_size: 10_000
-  max_workers: 8
+  batch_size: 100_000
+  max_workers: 4
  checkpoint_interval: 10
  use_multiprocessing: true

 # Pipeline stages
 stages:
  - "data_cleaning"
+  - "data_selection"
  - "feature_extraction"
-  - "ner_annotation"
-  - "llm_annotation"
+  # - "ner_annotation"
+  # - "llm_annotation"
  - "data_splitting"

 # Production LLM settings
@@ -34,7 +35,7 @@ data:
 # Production logging (less verbose)
 logging:
  level: "INFO"
-  console_logging: false
+  console_logging: true
  file_logging: true
  log_file: "pipeline.production.log"
  max_log_size: 52428800  # 50MB
@@ -21,6 +21,7 @@ paths:
 # List of stages in the processing pipeline
 stages:
  - "data_cleaning"                        # Data cleaning stage
+  - "data_selection"                       # Data selection stage - keep only required columns
  - "feature_extraction"                   # Feature extraction stage
  - "ner_annotation"                       # NER-based annotation stage
  - "llm_annotation"                       # LLM annotation stage (computationally intensive)
@@ -64,6 +65,11 @@ data:
    females: "names_females.csv"            # Output files for female names
    ner_data: "names_ner.json"              # Output file for NER annotated data
    ner_spacy: "names_ner.spacy"            # Output file for NER annotated data using spaCy format
+  selected_columns:                         # Required columns for processing
+    - name
+    - sex
+    - region
+    - year
  split_evaluation: false                   # Should the dataset be split into training and evaluation sets?
  split_by_gender: true                     # Should the dataset be split by gender?
  split_by_province: true                   # Should the dataset be split by province?