feat: web application multipage support

2025-08-16 19:05:24 +02:00
parent 7b652d6999
commit 84f7d41a84
38 changed files with 765 additions and 507 deletions
+5 -2
@@ -18,7 +18,8 @@ paths:
checkpoints_dir: "./data/checkpoints" # Directory for model checkpoints
# Pipeline stages
stages: # List of stages in the processing pipeline
# List of stages in the processing pipeline
stages:
- "data_cleaning" # Data cleaning stage
- "feature_extraction" # Feature extraction stage
- "ner_annotation" # NER-based annotation stage
@@ -36,6 +37,7 @@ processing:
- "utf-16"
- "latin1"
chunk_size: 100_000 # Size of data chunks to process in parallel
epochs: 2 # Number of epochs for training
# Annotation settings
annotation:
@@ -72,8 +74,9 @@ data:
balance_by_sex: false # Should the dataset be balanced by sex when limiting the dataset size?
# Logging configuration
# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
logging:
level: "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
level: "INFO"
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
file_logging: true # Enable logging to file
console_logging: true # Enable logging to console
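A minimal sketch of wiring these logging keys into Python's stdlib logging module; the values mirror the YAML block above, and the log file name is an assumption:

import logging

# Mirrors the logging section of the config; "pipeline.log" is a hypothetical file name.
log_cfg = {
    "level": "INFO",
    "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    "file_logging": True,
    "console_logging": True,
}

handlers = []
if log_cfg["console_logging"]:
    handlers.append(logging.StreamHandler())
if log_cfg["file_logging"]:
    handlers.append(logging.FileHandler("pipeline.log"))

logging.basicConfig(
    level=getattr(logging, log_cfg["level"]),
    format=log_cfg["format"],
    handlers=handlers,
)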
+4 -4
@@ -7,7 +7,7 @@ baseline_experiments:
max_len: 20
embedding_dim: 64
gru_units: 32
epochs: 10
epochs: 2
batch_size: 32
tags: [ "baseline", "neural", "bigru" ]
@@ -21,7 +21,7 @@ baseline_experiments:
filters: 64
kernel_size: 3
dropout: 0.5
epochs: 10
epochs: 2
batch_size: 32
tags: [ "baseline", "neural", "cnn" ]
@@ -79,7 +79,7 @@ baseline_experiments:
model_params:
embedding_dim: 128
lstm_units: 64
epochs: 10
epochs: 2
batch_size: 64
tags: [ "baseline", "neural", "lstm" ]
@@ -121,7 +121,7 @@ baseline_experiments:
embedding_dim: 128
num_heads: 4
num_layers: 2
epochs: 10
epochs: 2
batch_size: 64
tags: [ "baseline", "neural", "transformer" ]
+145
@@ -0,0 +1,145 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 42

[nlp]
lang = "fr"
pipeline = ["tok2vec","ner"]
batch_size = 100000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,1000,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null
before_update = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
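The new file is a complete spaCy v3 training config for a French tok2vec + NER pipeline, with paths.train and paths.dev left as null placeholders to be filled at train time. A sketch of launching training from it through spaCy's Python API; the config file name, corpus paths, and output directory are assumptions:

from spacy.cli.train import train

train(
    "configs/ner.cfg",                        # the config added in this commit; file name assumed
    output_path="models/ner_fr",              # hypothetical output directory
    overrides={
        "paths.train": "corpus/train.spacy",  # fills the null placeholder in [paths]
        "paths.dev": "corpus/dev.spacy",
    },
)

The equivalent command line would be `python -m spacy train configs/ner.cfg --output models/ner_fr --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy`.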