fix: escape CSV fields to avoid errors on empty fields

This commit is contained in:
2025-08-17 13:39:19 +02:00
parent ed60f9deff
commit 3122c92f5e
4 changed files with 15 additions and 9 deletions
+1 -1
View File
@@ -164,5 +164,5 @@ class DataLoader:
if create_dirs: if create_dirs:
filepath.parent.mkdir(parents=True, exist_ok=True) filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath, index=False, encoding="utf-8") df.to_csv(filepath, index=False, encoding="utf-8", sep=",", quoting=1)
logging.info(f"Saved {len(df)} rows to {filepath}") logging.info(f"Saved {len(df)} rows to {filepath}")
+7 -3
View File
@@ -11,6 +11,10 @@ from .name_tagger import NameTagger
class NameBuilder: class NameBuilder:
def __init__(self, config: PipelineConfig): def __init__(self, config: PipelineConfig):
config = config.model_copy(deep=True)
config.data.max_dataset_size = 1_000_000
config.data.balance_by_sex = True
self.config = config self.config = config
self.data_loader = DataLoader(config) self.data_loader = DataLoader(config)
self.tagger = NameTagger() self.tagger = NameTagger()
@@ -33,8 +37,8 @@ class NameBuilder:
nlp = spacy.blank("fr") nlp = spacy.blank("fr")
# Use NERNameTagger for parsing and validation # Use NERNameTagger for parsing and validation
parsed_entities = NameTagger.parse_entities(ner_df["ner_entities"]) parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
validated_entities = NameTagger.validate_entities(ner_df["name"], parsed_entities) validated_entities = self.tagger.validate_entities(ner_df["name"], parsed_entities)
# Drop rows with no valid entities # Drop rows with no valid entities
mask = validated_entities.map(bool) mask = validated_entities.map(bool)
@@ -51,7 +55,7 @@ class NameBuilder:
) )
# Use NERNameTagger to create spaCy DocBin # Use NERNameTagger to create spaCy DocBin
docs = NameTagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist()) docs = self.tagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
doc_bin = DocBin(docs=docs) doc_bin = DocBin(docs=docs)
# Save # Save
+2 -4
View File
@@ -8,6 +8,7 @@ from typing import Dict, Any, List, Tuple
import spacy import spacy
from spacy.training import Example from spacy.training import Example
from spacy.util import minibatch from spacy.util import minibatch
from tqdm import tqdm
from core.config.pipeline_config import PipelineConfig from core.config.pipeline_config import PipelineConfig
@@ -198,13 +199,10 @@ class NameModel:
# Create training examples # Create training examples
examples = [] examples = []
for text, annotations in data: for text, annotations in tqdm(data, desc="Create training examples"):
doc = self.nlp.make_doc(text) doc = self.nlp.make_doc(text)
example = Example.from_dict(doc, annotations) example = Example.from_dict(doc, annotations)
examples.append(example) examples.append(example)
logging.info(
f"Training example: {text[:30]} with entities {annotations.get('entities', [])}"
)
# Train in batches # Train in batches
batches = minibatch(examples, size=batch_size) batches = minibatch(examples, size=batch_size)
+5 -1
View File
@@ -13,7 +13,10 @@ from research.model_registry import list_available_models
class Experiments: class Experiments:
def __init__( def __init__(
self, config: PipelineConfig, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner self,
config: PipelineConfig,
experiment_tracker: ExperimentTracker,
experiment_runner: ExperimentRunner
): ):
self.config = config self.config = config
self.experiment_tracker = experiment_tracker self.experiment_tracker = experiment_tracker
@@ -22,6 +25,7 @@ class Experiments:
def index(self): def index(self):
st.title("Experiments") st.title("Experiments")
tab1, tab2, tab3 = st.tabs( tab1, tab2, tab3 = st.tabs(
["Templates", "Experiments", "Batch Experiments"]) ["Templates", "Experiments", "Batch Experiments"])