fix: quote CSV fields to avoid errors on empty fields
This commit is contained in:
@@ -164,5 +164,5 @@ class DataLoader:
|
|||||||
if create_dirs:
|
if create_dirs:
|
||||||
filepath.parent.mkdir(parents=True, exist_ok=True)
|
filepath.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
df.to_csv(filepath, index=False, encoding="utf-8")
|
df.to_csv(filepath, index=False, encoding="utf-8", sep=",", quoting=1)
|
||||||
logging.info(f"Saved {len(df)} rows to {filepath}")
|
logging.info(f"Saved {len(df)} rows to {filepath}")
|
||||||
|
|||||||
@@ -11,6 +11,10 @@ from .name_tagger import NameTagger
|
|||||||
|
|
||||||
class NameBuilder:
|
class NameBuilder:
|
||||||
def __init__(self, config: PipelineConfig):
|
def __init__(self, config: PipelineConfig):
|
||||||
|
config = config.model_copy(deep=True)
|
||||||
|
config.data.max_dataset_size = 1_000_000
|
||||||
|
config.data.balance_by_sex = True
|
||||||
|
|
||||||
self.config = config
|
self.config = config
|
||||||
self.data_loader = DataLoader(config)
|
self.data_loader = DataLoader(config)
|
||||||
self.tagger = NameTagger()
|
self.tagger = NameTagger()
|
||||||
@@ -33,8 +37,8 @@ class NameBuilder:
|
|||||||
nlp = spacy.blank("fr")
|
nlp = spacy.blank("fr")
|
||||||
|
|
||||||
# Use NameTagger for parsing and validation
|
# Use NameTagger for parsing and validation
|
||||||
parsed_entities = NameTagger.parse_entities(ner_df["ner_entities"])
|
parsed_entities = self.tagger.parse_entities(ner_df["ner_entities"])
|
||||||
validated_entities = NameTagger.validate_entities(ner_df["name"], parsed_entities)
|
validated_entities = self.tagger.validate_entities(ner_df["name"], parsed_entities)
|
||||||
|
|
||||||
# Drop rows with no valid entities
|
# Drop rows with no valid entities
|
||||||
mask = validated_entities.map(bool)
|
mask = validated_entities.map(bool)
|
||||||
@@ -51,7 +55,7 @@ class NameBuilder:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Use NameTagger to create spaCy DocBin
|
# Use NameTagger to create spaCy DocBin
|
||||||
docs = NameTagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
|
docs = self.tagger.create_docs(nlp, ner_df["name"].tolist(), validated_entities.tolist())
|
||||||
doc_bin = DocBin(docs=docs)
|
doc_bin = DocBin(docs=docs)
|
||||||
|
|
||||||
# Save
|
# Save
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from typing import Dict, Any, List, Tuple
|
|||||||
import spacy
|
import spacy
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
from spacy.util import minibatch
|
from spacy.util import minibatch
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
from core.config.pipeline_config import PipelineConfig
|
from core.config.pipeline_config import PipelineConfig
|
||||||
|
|
||||||
@@ -198,13 +199,10 @@ class NameModel:
|
|||||||
|
|
||||||
# Create training examples
|
# Create training examples
|
||||||
examples = []
|
examples = []
|
||||||
for text, annotations in data:
|
for text, annotations in tqdm(data, desc="Create training examples"):
|
||||||
doc = self.nlp.make_doc(text)
|
doc = self.nlp.make_doc(text)
|
||||||
example = Example.from_dict(doc, annotations)
|
example = Example.from_dict(doc, annotations)
|
||||||
examples.append(example)
|
examples.append(example)
|
||||||
logging.info(
|
|
||||||
f"Training example: {text[:30]} with entities {annotations.get('entities', [])}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Train in batches
|
# Train in batches
|
||||||
batches = minibatch(examples, size=batch_size)
|
batches = minibatch(examples, size=batch_size)
|
||||||
|
|||||||
@@ -13,7 +13,10 @@ from research.model_registry import list_available_models
|
|||||||
|
|
||||||
class Experiments:
|
class Experiments:
|
||||||
def __init__(
|
def __init__(
|
||||||
self, config: PipelineConfig, experiment_tracker: ExperimentTracker, experiment_runner: ExperimentRunner
|
self,
|
||||||
|
config: PipelineConfig,
|
||||||
|
experiment_tracker: ExperimentTracker,
|
||||||
|
experiment_runner: ExperimentRunner
|
||||||
):
|
):
|
||||||
self.config = config
|
self.config = config
|
||||||
self.experiment_tracker = experiment_tracker
|
self.experiment_tracker = experiment_tracker
|
||||||
@@ -22,6 +25,7 @@ class Experiments:
|
|||||||
|
|
||||||
def index(self):
|
def index(self):
|
||||||
st.title("Experiments")
|
st.title("Experiments")
|
||||||
|
|
||||||
tab1, tab2, tab3 = st.tabs(
|
tab1, tab2, tab3 = st.tabs(
|
||||||
["Templates", "Experiments", "Batch Experiments"])
|
["Templates", "Experiments", "Batch Experiments"])
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user