Files
drc-ners-nlp/misc/__init__.py
T

60 lines
1.8 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import csv
import json
import os
from datetime import datetime
from typing import Optional
# Filesystem layout: every directory below is derived from the working
# directory the process was started in.
ROOT_DIR = os.getcwd()
DATA_DIR = os.path.join(ROOT_DIR, "dataset")
MODELS_DIR = os.path.join(ROOT_DIR, "models")
GENDER_MODELS_DIR = os.path.join(MODELS_DIR, "gender")
NER_MODELS_DIR = os.path.join(MODELS_DIR, "ner")

# Training settings
TRAINING_EPOCHS = 5
# The timestamp suffix is computed once, when this module is first imported.
MODEL_NAME = f"./models/ners-{datetime.now():%Y%m%d%H%M%S}"
def clean_spacing(filename: str) -> Optional[str]:
    """Read a dataset file and replace unwanted characters with plain spaces.

    Args:
        filename: Name of the file, relative to ``DATA_DIR``.

    Returns:
        The file content with every character listed in the translation
        table replaced by a regular space, or ``None`` when the file cannot
        be opened, read, or decoded as UTF-8.
    """
    # NOTE(review): the second key is presumably an invisible, space-like
    # Unicode character (for example a non-breaking space) -- confirm which
    # one and consider spelling it as an explicit ``\u`` escape so the intent
    # survives copy/paste.
    replacements = str.maketrans({'\00': ' ', ' ': ' '})
    try:
        with open(os.path.join(DATA_DIR, filename), 'r', encoding='utf8') as f:
            content = f.read()
    except (OSError, ValueError):
        # Best-effort read: a missing/unreadable file (OSError) or content
        # that is not valid UTF-8 (UnicodeDecodeError, a ValueError subclass)
        # yields None. Programming errors are no longer silently swallowed.
        return None
    return content.translate(replacements)
def load_json_dataset(path: str) -> list:
    """Parse a UTF-8 JSON file stored under ``DATA_DIR`` and return its content.

    Args:
        path: Location of the JSON file, relative to ``DATA_DIR``.

    Returns:
        Whatever ``json.load`` produces for the file.
    """
    print(f">> Loading JSON dataset from {path}")
    source = os.path.join(DATA_DIR, path)
    with open(source, mode="r", encoding="utf-8") as handle:
        dataset = json.load(handle)
    return dataset
def save_csv_dataset(data: list, path: str) -> None:
    """Write a list of row dictionaries to a CSV file under ``DATA_DIR``.

    The header is taken from the keys of the first row, so every row is
    expected to share those keys.

    Args:
        data: Rows to write, one dictionary per row. An empty list produces
            an empty file instead of raising ``IndexError``.
        path: Destination file, relative to ``DATA_DIR``.
    """
    print(f">> Saving CSV dataset to {path}")
    # newline="" lets the csv module control line endings itself; without it
    # every row ends in "\r\r\n" on platforms that translate "\n" to "\r\n".
    with open(os.path.join(DATA_DIR, path), "w", encoding="utf-8", newline="") as f:
        if not data:
            # No first row to derive the header from: leave the file empty.
            return
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)
def load_csv_dataset(path: str, limit: Optional[int] = None) -> list:
    """Read a CSV file under ``DATA_DIR`` into a list of row dictionaries.

    Args:
        path: Location of the CSV file, relative to ``DATA_DIR``.
        limit: Stop reading once this many rows have been collected. A falsy
            value (``None`` or ``0``) loads every row.

    Returns:
        One dictionary per data row, keyed by the CSV header.
    """
    print(f">> Loading CSV dataset from {path}")
    data = []
    # newline="" hands raw line endings to the csv module so that newlines
    # embedded in quoted fields are preserved exactly as stored.
    with open(os.path.join(DATA_DIR, path), "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            data.append(row)
            if limit and len(data) >= limit:
                break
    return data
def save_json_dataset(data: list, path: str) -> None:
    """Serialize ``data`` as compact UTF-8 JSON into a file under ``DATA_DIR``.

    Non-ASCII characters are written as-is (not escaped) and no whitespace is
    emitted between items or after key separators.

    Args:
        data: Object to serialize.
        path: Destination file, relative to ``DATA_DIR``.
    """
    print(f">> Saving JSON dataset to {path}")
    destination = os.path.join(DATA_DIR, path)
    compact = (',', ':')
    with open(destination, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, separators=compact, ensure_ascii=False)