From f454ba79388b8358682a79a41b95c5a7bb0e14ec Mon Sep 17 00:00:00 2001
From: bernard-ng
Date: Thu, 19 Jun 2025 18:45:11 +0200
Subject: [PATCH] Initial commit

---
 .env             |  1 +
 .gitignore       | 10 +++++++++
 Makefile         | 25 ++++++++++++++++++++++
 README.md        | 19 +++++++++++++++++
 misc/__init__.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 110 insertions(+)
 create mode 100644 .env
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100644 misc/__init__.py

diff --git a/.env b/.env
new file mode 100644
index 0000000..75f1af2
--- /dev/null
+++ b/.env
@@ -0,0 +1 @@
+DATASET_URL=https://example.com/dataset.zip
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c526c05
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+.idea/
+.vscode/
+.venv/
+__pycache__/
+.ipynb_checkpoints/
+*.pyc
+models/
+.env.local
+var/
+/dataset/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..2e887a2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,25 @@
+.PHONY: default
+default: help
+
+.PHONY: help
+help:
+	@echo Tasks:
+	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
+
+.PHONY: download
+download:
+	@if [ ! -f dataset/names.csv ]; then \
+		set -a; [ -f .env.local ] && . .env.local; set +a; \
+		[ -z "$$DATASET_URL" ] && . .env; \
+		mkdir -p dataset; \
+		curl -fL "$${DATASET_URL}" -o dataset/names.csv; \
+	else \
+		echo "dataset/names.csv already exists. Skipping download."; \
+	fi
+
+.PHONY: clean
+clean:
+	rm -rf ./models
+	rm -rf ./results
+	rm -f ./dataset/spacy/train.spacy
+	rm -f ./dataset/spacy/dev.spacy
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e2c7dd2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,19 @@
+# NERS-NLP: A Culturally-Aware Natural Language Processing System with Named Entity Recognition and Gender Inference Models
+
+Despite the growing success of Named Entity Recognition (NER) systems and gender inference models in Natural Language Processing (NLP), these tools often underperform when applied to culturally diverse African contexts due to the lack of culturally-representative training data. In this paper, we propose NERS-NLP, a culturally-aware NLP system with Named Entity Recognition and Gender Inference Models. This study introduces a large-scale dataset of over 7 million names of the population of the Democratic Republic of Congo (DRC) annotated with gender and demographic metadata, including geographical distribution. We explore the linguistic and sociocultural features embedded in these names and examine their impact on two key NLP tasks, namely, entity recognition and gender classification.
+Our approach involves (1) a statistical and feature analysis of Congolese name structures, (2) the development of supervised gender prediction models leveraging name components and demographic patterns, and (3) the integration of the curated name lexicon into NER pipelines to improve recognition accuracy for Congolese entities. Experiments conducted on custom evaluation sets, including multilingual and code-switched Congolese texts, show that our culturally-aware methods significantly outperform state-of-the-art multilingual baselines.
+This work demonstrates the importance of culturally grounded resources in reducing bias and improving performance in NLP systems applied to underrepresented regions. 
"""Shared dataset I/O helpers and configuration for the NERS-NLP pipeline."""
import csv
import itertools
import json
import os
from datetime import datetime
from typing import Optional

# Paths: all dataset files are resolved relative to <cwd>/dataset.
ROOT_DIR = os.getcwd()
DATA_DIR = os.path.join(ROOT_DIR, 'dataset')

# Training
TRAINING_EPOCHS = 5
# Timestamped so successive training runs never overwrite each other's model.
MODEL_NAME = f"./models/ners-{datetime.now().strftime('%Y%m%d%H%M%S')}"


def clean_spacing(filename: str) -> Optional[str]:
    """Return the text of *filename* (under DATA_DIR) with odd spacing fixed.

    NUL characters and non-breaking spaces are replaced by plain spaces in a
    single C-level pass (``str.translate``). Returns ``None`` when the file
    cannot be read or decoded, preserving the original best-effort contract.
    """
    try:
        with open(os.path.join(DATA_DIR, filename), 'r', encoding='utf8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Narrowed from a blanket `except Exception as e` that silently
        # swallowed every error (and never used `e`): only I/O and decoding
        # failures are expected here; anything else should surface.
        return None
    # NOTE(review): the second key was a literal U+00A0 (non-breaking space)
    # in the original source; written escaped here so it is visible.
    return content.translate(str.maketrans({'\x00': ' ', '\u00a0': ' '}))


def load_json_dataset(path: str) -> list:
    """Load and return the JSON dataset stored at DATA_DIR/*path*."""
    print(f">> Loading JSON dataset from {path}")
    with open(os.path.join(DATA_DIR, path), "r", encoding="utf-8") as f:
        return json.load(f)


def save_csv_dataset(data: list, path: str) -> None:
    """Write a list of row-dicts to DATA_DIR/*path* as CSV.

    Column order is taken from the keys of the first row.

    Raises:
        ValueError: if *data* is empty — there is no row to infer the header
            from (clearer than the bare IndexError the previous code raised).
    """
    print(f">> Saving CSV dataset to {path}")
    if not data:
        raise ValueError("save_csv_dataset() needs at least one row to infer the CSV header")
    # newline="" is required by the csv module; without it every row is
    # followed by a spurious blank line on Windows.
    with open(os.path.join(DATA_DIR, path), "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)


def load_csv_dataset(path: str, limit: Optional[int] = None) -> list:
    """Load DATA_DIR/*path* as a list of row-dicts.

    Args:
        path: CSV file name relative to DATA_DIR.
        limit: maximum number of rows to read; ``None`` (default) reads all.
            ``limit=0`` now correctly yields an empty list — the previous
            truthiness test (`if limit and ...`) treated 0 as "no limit".
    """
    print(f">> Loading CSV dataset from {path}")
    # newline="" is the csv-module-documented way to open CSV files.
    with open(os.path.join(DATA_DIR, path), "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        if limit is None:
            return list(reader)
        # islice stops consuming the file as soon as `limit` rows are read.
        return list(itertools.islice(reader, limit))


def save_json_dataset(data: list, path: str) -> None:
    """Write *data* to DATA_DIR/*path* as compact UTF-8 JSON.

    ensure_ascii=False keeps accented Congolese names human-readable;
    compact separators keep the multi-million-entry file small.
    """
    print(f">> Saving JSON dataset to {path}")
    with open(os.path.join(DATA_DIR, path), "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, separators=(',', ':'))