feat: add osm data
This commit is contained in:
@@ -11,6 +11,7 @@ processing:
|
||||
# Pipeline stages
|
||||
stages:
|
||||
- "data_cleaning"
|
||||
- "data_selection"
|
||||
- "feature_extraction"
|
||||
#- "ner_annotation"
|
||||
#- "llm_annotation"
|
||||
|
||||
@@ -3,17 +3,18 @@ debug: false
|
||||
|
||||
# Processing settings
|
||||
processing:
|
||||
batch_size: 10_000
|
||||
max_workers: 8
|
||||
batch_size: 100_000
|
||||
max_workers: 4
|
||||
checkpoint_interval: 10
|
||||
use_multiprocessing: true
|
||||
|
||||
# Pipeline stages
|
||||
stages:
|
||||
- "data_cleaning"
|
||||
- "data_selection"
|
||||
- "feature_extraction"
|
||||
- "ner_annotation"
|
||||
- "llm_annotation"
|
||||
# - "ner_annotation"
|
||||
# - "llm_annotation"
|
||||
- "data_splitting"
|
||||
|
||||
# Production LLM settings
|
||||
@@ -34,7 +35,7 @@ data:
|
||||
# Production logging (less verbose)
|
||||
logging:
|
||||
level: "INFO"
|
||||
console_logging: false
|
||||
console_logging: true
|
||||
file_logging: true
|
||||
log_file: "pipeline.production.log"
|
||||
max_log_size: 52428800 # 50MB
|
||||
|
||||
@@ -21,6 +21,7 @@ paths:
|
||||
# List of stages in the processing pipeline
|
||||
stages:
|
||||
- "data_cleaning" # Data cleaning stage
|
||||
- "data_selection" # Data selection stage - keep only required columns
|
||||
- "feature_extraction" # Feature extraction stage
|
||||
- "ner_annotation" # NER-based annotation stage
|
||||
- "llm_annotation" # LLM annotation stage (computational intensive)
|
||||
@@ -64,6 +65,11 @@ data:
|
||||
females: "names_females.csv" # Output files for female names
|
||||
ner_data: "names_ner.json" # Output file for NER annotated data
|
||||
ner_spacy: "names_ner.spacy" # Output file for NER annotated data using spaCy format
|
||||
selected_columns: # Required columns for processing
|
||||
- name
|
||||
- sex
|
||||
- region
|
||||
- year
|
||||
split_evaluation: false # Should the dataset be split into training and evaluation sets ?
|
||||
split_by_gender: true # Should the dataset be split by gender ?
|
||||
split_by_province: true # Should the dataset be split by province ?
|
||||
|
||||
Reference in New Issue
Block a user