diff --git a/core/utils/data_loader.py b/core/utils/data_loader.py index 390ba60..d429e88 100644 --- a/core/utils/data_loader.py +++ b/core/utils/data_loader.py @@ -164,5 +164,5 @@ class DataLoader: if create_dirs: filepath.parent.mkdir(parents=True, exist_ok=True) - df.to_csv(filepath, index=False, encoding="utf-8") + df.to_csv(filepath, index=False, encoding="utf-8", sep=",", quoting=1) logging.info(f"Saved {len(df)} rows to {filepath}") diff --git a/processing/ner/name_builder.py b/processing/ner/name_builder.py index b569f7f..f724b3f 100644 --- a/processing/ner/name_builder.py +++ b/processing/ner/name_builder.py @@ -11,6 +11,10 @@ from .name_tagger import NameTagger class NameBuilder: def __init__(self, config: PipelineConfig): + config = config.model_copy(deep=True) + config.data.max_dataset_size = 1_000_000 + config.data.balance_by_sex = True + self.config = config self.data_loader = DataLoader(config) self.tagger = NameTagger() @@ -33,8 +37,8 @@ class NameBuilder: nlp = spacy.blank("fr") # Use NERNameTagger for parsing and validation - parsed_entities = NameTagger.parse_entities(ner_df["ner_entities"]) - validated_entities = NameTagger.validate_entities(ner_df["name"], parsed_entities) + parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"]) + validated_entities = self.tagger.validate_entities(ner_df["name"], parsed_entities) # Drop rows with no valid entities mask = validated_entities.map(bool) @@ -51,7 +55,7 @@ class NameBuilder: ) # Use NERNameTagger to create spaCy DocBin - docs = NameTagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist()) + docs = self.tagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist()) doc_bin = DocBin(docs=docs) # Save diff --git a/processing/ner/name_model.py b/processing/ner/name_model.py index 8488424..c8d1141 100644 --- a/processing/ner/name_model.py +++ b/processing/ner/name_model.py @@ -8,6 +8,7 @@ from typing import Dict, Any, List, Tuple import spacy from 
spacy.training import Example from spacy.util import minibatch +from tqdm import tqdm from core.config.pipeline_config import PipelineConfig @@ -198,13 +199,10 @@ class NameModel: # Create training examples examples = [] - for text, annotations in data: + for text, annotations in tqdm(data, desc="Create training examples"): doc = self.nlp.make_doc(text) example = Example.from_dict(doc, annotations) examples.append(example) - logging.info( - f"Training example: {text[:30]} with entities {annotations.get('entities', [])}" - ) # Train in batches batches = minibatch(examples, size=batch_size) diff --git a/web/interfaces/experiments.py b/web/interfaces/experiments.py index 3a4afa5..6624e13 100644 --- a/web/interfaces/experiments.py +++ b/web/interfaces/experiments.py @@ -13,7 +13,10 @@ from research.model_registry import list_available_models class Experiments: def __init__( - self, config: PipelineConfig, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner + self, + config: PipelineConfig, + experiment_tracker: ExperimentTracker, + experiment_runner: ExperimentRunner ): self.config = config self.experiment_tracker = experiment_tracker @@ -22,6 +25,7 @@ class Experiments: def index(self): st.title("Experiments") + tab1, tab2, tab3 = st.tabs( ["Templates", "Experiments", "Batch Experiments"])