feat(monorepo): migrate to typescript monorepo

2025-11-07 17:09:29 +02:00
committed by BernardNganduDev
parent 3e09956f05
commit 075a388ccb
745 changed files with 2341 additions and 5082 deletions
+20
@@ -0,0 +1,20 @@
# paths
BASANGO_CRAWLER_ROOT_PATH=
BASANGO_CRAWLER_DATA_PATH=
BASANGO_CRAWLER_LOGS_PATH=
BASANGO_CRAWLER_CONFIG_PATH=
# crawler settings
BASANGO_CRAWLER_UPDATE_DIRECTION=forward
BASANGO_CRAWLER_FETCH_USER_AGENT="Basango/0.1 (+https://github.com/bernard-ng/basango)"
BASANGO_CRAWLER_FETCH_MAX_RETRIES=3
BASANGO_CRAWLER_FETCH_RESPECT_RETRY_AFTER=true
BASANGO_CRAWLER_ASYNC_REDIS_URL="redis://localhost:6379/0"
BASANGO_CRAWLER_ASYNC_TTL_RESULT=3600
BASANGO_CRAWLER_ASYNC_TTL_FAILURE=3600
BASANGO_CRAWLER_ASYNC_QUEUE_LISTING="listing"
BASANGO_CRAWLER_ASYNC_QUEUE_DETAILS="details"
BASANGO_CRAWLER_ASYNC_QUEUE_PROCESSING="processing"
BASANGO_CRAWLER_BACKEND_API_ENDPOINT="http://localhost:3000/api/aggregator/articles?token=dev"
+179
@@ -0,0 +1,179 @@
# @basango/crawler
A scalable web crawler built in TypeScript (running on Bun or Node.js) for extracting and processing articles from news sources and websites.
The Basango Crawler is designed to systematically crawl news websites and extract article content. It supports both synchronous and asynchronous crawling modes, with configurable sources, queue-based processing, and robust error handling.
## Features
- **Multi-mode Operation**: Synchronous and asynchronous crawling capabilities
- **Queue-based Processing**: Uses BullMQ with Redis for scalable job processing
- **Configurable Sources**: JSON-based configuration for different website sources
- **HTML & WordPress Support**: Built-in parsers for HTML websites and WordPress APIs
- **Rate Limiting**: Respects website rate limits and implements backoff strategies
- **Data Persistence**: JSONL output format for processed articles
- **Worker Management**: Distributed worker system for parallel processing
- **Type Safety**: Full TypeScript implementation with Zod schema validation
## Prerequisites
- [Bun](https://bun.sh/) runtime (recommended) or Node.js (v22+)
- Redis server (for async operations)
- Basic TypeScript knowledge for extending the crawler (day-to-day configuration is JSON and environment variables)
## Installation
```bash
# Navigate to the crawler directory
cd basango/apps/crawler
# Install dependencies
bun install
```
## Configuration
### 1. Environment Variables
Create a `.env.local` file with the following variables:
```bash
# Redis configuration for async operations
BASANGO_CRAWLER_ASYNC_REDIS_URL=redis://localhost:6379/0
BASANGO_CRAWLER_ASYNC_QUEUE_LISTING=listing
BASANGO_CRAWLER_ASYNC_QUEUE_DETAILS=details
BASANGO_CRAWLER_ASYNC_QUEUE_PROCESSING=processing
# Fetch configuration
BASANGO_CRAWLER_FETCH_MAX_RETRIES=3
BASANGO_CRAWLER_FETCH_RESPECT_RETRY_AFTER=true
BASANGO_CRAWLER_FETCH_USER_AGENT=Basango/0.1 (+https://github.com/bernard-ng/basango)
# Crawler behavior
BASANGO_CRAWLER_UPDATE_DIRECTION=forward
# TTL settings (in seconds)
BASANGO_CRAWLER_ASYNC_TTL_FAILURE=3600
BASANGO_CRAWLER_ASYNC_TTL_RESULT=3600
```
### 2. Source Configuration
Sources are configured in `config/sources.json`. Example source configuration:
```json
{
"sources": {
"html": [
{
"sourceId": "example.com",
"sourceKind": "html",
"sourceUrl": "https://example.com",
"sourceSelectors": {
"articles": ".article-list .article",
"articleTitle": "h2.title",
"articleLink": "a.permalink",
"articleDate": ".publish-date",
"articleBody": ".content",
"pagination": ".pagination .next"
},
"requiresDetails": true,
"supportsCategories": false
}
]
}
}
```
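WordPress sources are much lighter to configure, since articles come from the site's REST API rather than scraped selectors. A minimal entry (mirroring the ones shipped in `config/sources.json`, with `example-wp.com` as a placeholder) looks like this:
```json
{
  "sources": {
    "wordpress": [
      {
        "sourceId": "example-wp.com",
        "sourceKind": "wordpress",
        "sourceUrl": "https://example-wp.com",
        "requiresRateLimit": true
      }
    ]
  }
}
```
`requiresRateLimit` is only set on sources that need throttling (see `beto.cd` in the shipped config).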
## Usage
### Synchronous Crawling
Perfect for immediate, one-time crawling tasks:
```bash
# Crawl a specific source
bun run crawler:sync -- --sourceId radiookapi.net
# Crawl with page range filter
bun run crawler:sync -- --sourceId radiookapi.net --pageRange 1:5
# Crawl with date range filter
bun run crawler:sync -- --sourceId radiookapi.net --dateRange 2024-01-01:2024-01-31
# Crawl specific category (if supported)
bun run crawler:sync -- --sourceId example.com --category politics
```
### Asynchronous Crawling
Best for large-scale operations and when you need job queuing:
```bash
# Schedule an async crawl job
bun run crawler:async -- --sourceId radiookapi.net
# Schedule with filters
bun run crawler:async -- --sourceId radiookapi.net --pageRange 1:10 --category economics
```
### Worker Management
Start workers to process async jobs:
```bash
# Start workers for all queues
bun run crawler:worker
# Start workers for specific queues
bun run crawler:worker -- --queue listing --queue details
# Start workers with short option
bun run crawler:worker -- -q listing -q processing
```
## CLI Options
### Crawling Commands
| Option | Description | Example |
|--------|-------------|---------|
| `--sourceId` | **Required.** Source identifier from `sources.json` | `--sourceId radiookapi.net` |
| `--pageRange` | Page range to crawl (format: start:end) | `--pageRange 1:5` |
| `--dateRange` | Date range filter (format: YYYY-MM-DD:YYYY-MM-DD) | `--dateRange 2024-01-01:2024-01-31` |
| `--category` | Category slug to crawl | `--category politics` |
### Worker Commands
| Option | Description | Example |
|--------|-------------|---------|
| `--queue`, `-q` | Specify queue(s) to process (can be used multiple times) | `--queue listing --queue details` |
## Project Structure
```
basango/apps/crawler/
├── src/
│ ├── config.ts # Configuration schema and loading
│ ├── constants.ts # Application constants
│ ├── schema.ts # Zod validation schemas
│ ├── utils.ts # Utility functions
│ ├── http/ # HTTP client and utilities
│ ├── process/ # Core crawling logic
│ │ ├── async/ # Async processing (queues, workers)
│ │ ├── sync/ # Synchronous processing
│ │ ├── parsers/ # Content parsers (HTML, WordPress)
│ │ ├── crawler.ts # Main crawler interface
│ │ └── persistence.ts # Data persistence layer
│ ├── scripts/ # CLI entry points
│ │ ├── crawl.ts # Sync crawling script
│ │ ├── queue.ts # Async job scheduling
│ │ ├── worker.ts # Worker process
│ │ └── utils.ts # CLI utilities
│ └── __tests__/ # Test files
├── config/
│ ├── sources.json # Source configurations
│ └── pipeline.json # Pipeline settings
├── data/ # Output directory for crawled data
└── package.json
```
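The CLI scripts under `src/scripts/` are thin wrappers over the exported modules, so crawls can also be scheduled programmatically. A minimal sketch, assuming the package's `@/` path alias and that the worker module lives at `@/process/async/worker` (an assumed path):
```typescript
import { createQueueManager } from "@/process/async/queue";
import { scheduleAsyncCrawl } from "@/process/async/tasks";
import { startWorker } from "@/process/async/worker"; // assumed module path

// Enqueue a listing-collection job, equivalent to `bun run crawler:async`.
const jobId = await scheduleAsyncCrawl({ sourceId: "radiookapi.net", pageRange: "1:5" });
console.log(`scheduled job ${jobId}`);

// Start workers on all configured queues, equivalent to `bun run crawler:worker`.
const handle = startWorker({ queueManager: createQueueManager() });

// ...later, shut everything down gracefully.
await handle.close();
```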
+41
@@ -0,0 +1,41 @@
{
"fetch": {
"async": {
"prefix": "basango:crawler",
"queues": {
"details": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_DETAILS)%",
"listing": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_LISTING)%",
"processing": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_PROCESSING)%"
},
"redisUrl": "%env(BASANGO_CRAWLER_ASYNC_REDIS_URL)%",
"ttl": {
"default": 600,
"failure": "%env(number:BASANGO_CRAWLER_ASYNC_TTL_FAILURE)%",
"result": "%env(number:BASANGO_CRAWLER_ASYNC_TTL_RESULT)%"
}
},
"client": {
"backoffInitial": 1,
"backoffMax": 30,
"backoffMultiplier": 2,
"followRedirects": true,
"maxRetries": "%env(number:BASANGO_CRAWLER_FETCH_MAX_RETRIES)%",
"respectRetryAfter": "%env(boolean:BASANGO_CRAWLER_FETCH_RESPECT_RETRY_AFTER)%",
"rotate": true,
"timeout": 20,
"userAgent": "%env(BASANGO_CRAWLER_FETCH_USER_AGENT)%",
"verifySsl": true
},
"crawler": {
"direction": "%env(BASANGO_CRAWLER_UPDATE_DIRECTION)%",
"maxWorkers": 5,
"notify": false,
"useMultiThreading": false
}
},
"paths": {
"config": "%env(BASANGO_CRAWLER_CONFIG_PATH)%",
"data": "%env(BASANGO_CRAWLER_DATA_PATH)%",
"root": "%env(BASANGO_CRAWLER_ROOT_PATH)%"
}
}
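The `%env(NAME)%` placeholders above are presumably resolved against the environment when `@devscast/config` loads this file, with the `number:`/`boolean:` prefixes casting the raw string before Zod validation applies the defaults from `config.ts`. A hypothetical resolver sketch (the real logic lives inside `@devscast/config`):
```typescript
// Hypothetical: resolve a "%env(cast:NAME)%" placeholder from process.env.
const resolvePlaceholder = (value: string, env = process.env): unknown => {
  const match = /^%env\((?:(number|boolean):)?([A-Z0-9_]+)\)%$/.exec(value);
  if (!match) return value; // plain literal, used as-is
  const [, cast, name = ""] = match;
  const raw = env[name];
  if (raw === undefined || raw === "") return undefined; // fall back to schema defaults
  if (cast === "number") return Number(raw);
  if (cast === "boolean") return raw === "true" || raw === "1";
  return raw;
};

// e.g. with BASANGO_CRAWLER_FETCH_MAX_RETRIES=3 set,
// resolvePlaceholder("%env(number:BASANGO_CRAWLER_FETCH_MAX_RETRIES)%") === 3
```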
+219
@@ -0,0 +1,219 @@
{
"sources": {
"html": [
{
"paginationTemplate": "actualite",
"requiresDetails": true,
"requiresRateLimit": false,
"sourceDate": {
"pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$3-$2-$1 $4"
},
"sourceId": "radiookapi.net",
"sourceKind": "html",
"sourceSelectors": {
"articleBody": ".field-name-body",
"articleCategories": ".views-field-field-cat-gorie a",
"articleDate": ".views-field-created",
"articleLink": ".views-field-title a",
"articles": ".view-content > .views-row.content-row",
"articleTitle": "h1.page-header",
"pagination": "ul.pagination > li.pager-last > a"
},
"sourceUrl": "https://www.radiookapi.net",
"supportsCategories": false
},
{
"categories": ["politique", "economie", "culture", "sport", "societe"],
"paginationTemplate": "index.php/category/{category}",
"requiresDetails": false,
"requiresRateLimit": false,
"sourceDate": {
"pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$3-$2-$1 $4"
},
"sourceId": "7sur7.cd",
"sourceKind": "html",
"sourceSelectors": {
"articleBody": ".field.field--name-body",
"articleDate": ".views-field-created",
"articleLink": ".views-field-title a",
"articles": ".view-content > .row.views-row",
"articleTitle": ".views-field-title a",
"pagination": "ul.pagination > li.pager__item.pager__item--last > a"
},
"sourceUrl": "https://7sur7.cd",
"supportsCategories": true
},
{
"paginationTemplate": "articles.html",
"requiresDetails": true,
"requiresRateLimit": false,
"sourceDate": {
"format": "%d.%m.%Y %H:%M"
},
"sourceId": "mediacongo.net",
"sourceKind": "html",
"sourceSelectors": {
"articleBody": ".article_ttext",
"articleCategories": "a.color_link",
"articleDate": ".article_other_about",
"articleLink": "a:first-child",
"articles": ".for_aitems > .article_other_item",
"articleTitle": "h1",
"pagination": "div.pagination > div > a:last-child"
},
"sourceUrl": "https://www.mediacongo.net",
"supportsCategories": false
},
{
"paginationTemplate": "actualite",
"requiresDetails": true,
"requiresRateLimit": false,
"sourceDate": {
"pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$4-$3-$2 $5"
},
"sourceId": "actualite.cd",
"sourceKind": "html",
"sourceSelectors": {
"articleBody": ".views-field.views-field-body",
"articleCategories": "#actu-cat",
"articleDate": "#p-date",
"articleLink": "#actu-titre a",
"articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
"articleTitle": "h1.page-title"
},
"sourceUrl": "https://actualite.cd",
"supportsCategories": false
}
],
"wordpress": [
{
"requiresRateLimit": true,
"sourceId": "beto.cd",
"sourceKind": "wordpress",
"sourceUrl": "https://beto.cd"
},
{ "sourceId": "newscd.net", "sourceKind": "wordpress", "sourceUrl": "https://newscd.net" },
{
"sourceId": "africanewsrdc.net",
"sourceKind": "wordpress",
"sourceUrl": "https://www.africanewsrdc.net"
},
{
"sourceId": "angazainstitute.ac.cd",
"sourceKind": "wordpress",
"sourceUrl": "https://angazainstitute.ac.cd"
},
{ "sourceId": "b-onetv.cd", "sourceKind": "wordpress", "sourceUrl": "https://b-onetv.cd" },
{
"sourceId": "bukavufm.com",
"sourceKind": "wordpress",
"sourceUrl": "https://bukavufm.com"
},
{
"sourceId": "changement7.net",
"sourceKind": "wordpress",
"sourceUrl": "https://changement7.net"
},
{
"sourceId": "congoactu.net",
"sourceKind": "wordpress",
"sourceUrl": "https://congoactu.net"
},
{
"sourceId": "congoindependant.com",
"sourceKind": "wordpress",
"sourceUrl": "https://www.congoindependant.com"
},
{
"sourceId": "congoquotidien.com",
"sourceKind": "wordpress",
"sourceUrl": "https://www.congoquotidien.com"
},
{
"sourceId": "cumulard.cd",
"sourceKind": "wordpress",
"sourceUrl": "https://www.cumulard.cd"
},
{
"sourceId": "environews-rdc.net",
"sourceKind": "wordpress",
"sourceUrl": "https://environews-rdc.net"
},
{
"sourceId": "freemediardc.info",
"sourceKind": "wordpress",
"sourceUrl": "https://www.freemediardc.info"
},
{
"sourceId": "geopolismagazine.org",
"sourceKind": "wordpress",
"sourceUrl": "https://geopolismagazine.org"
},
{
"sourceId": "habarirdc.net",
"sourceKind": "wordpress",
"sourceUrl": "https://habarirdc.net"
},
{ "sourceId": "infordc.com", "sourceKind": "wordpress", "sourceUrl": "https://infordc.com" },
{
"sourceId": "kilalopress.net",
"sourceKind": "wordpress",
"sourceUrl": "https://kilalopress.net"
},
{
"sourceId": "laprosperiteonline.net",
"sourceKind": "wordpress",
"sourceUrl": "https://laprosperiteonline.net"
},
{
"sourceId": "laprunellerdc.cd",
"sourceKind": "wordpress",
"sourceUrl": "https://laprunellerdc.cd"
},
{
"sourceId": "lesmedias.net",
"sourceKind": "wordpress",
"sourceUrl": "https://lesmedias.net"
},
{
"sourceId": "lesvolcansnews.net",
"sourceKind": "wordpress",
"sourceUrl": "https://lesvolcansnews.net"
},
{
"sourceId": "netic-news.net",
"sourceKind": "wordpress",
"sourceUrl": "https://www.netic-news.net"
},
{
"sourceId": "objectif-infos.cd",
"sourceKind": "wordpress",
"sourceUrl": "https://objectif-infos.cd"
},
{
"sourceId": "scooprdc.net",
"sourceKind": "wordpress",
"sourceUrl": "https://scooprdc.net"
},
{
"sourceId": "journaldekinshasa.com",
"sourceKind": "wordpress",
"sourceUrl": "https://www.journaldekinshasa.com"
},
{
"sourceId": "lepotentiel.cd",
"sourceKind": "wordpress",
"sourceUrl": "https://lepotentiel.cd"
},
{ "sourceId": "acturdc.com", "sourceKind": "wordpress", "sourceUrl": "https://acturdc.com" },
{
"sourceId": "matininfos.net",
"sourceKind": "wordpress",
"sourceUrl": "https://matininfos.net"
}
]
}
}
+10
@@ -0,0 +1,10 @@
{"title":"Sommet de Doha : Tshisekedi met en exergue la Couverture Santé Universelle et la gratuité de l'enseignement primaire","link":"https://7sur7.cd/2025/11/04/sommet-de-doha-tshisekedi-met-en-exergue-la-couverture-sante-universelle-et-la-gratuite","body":"## [Sommet de Doha : Tshisekedi met en exergue la Couverture Santé Universelle et la gratuité de l'enseignement primaire](/2025/11/04/sommet-de-doha-tshisekedi-met-en-exergue-la-couverture-sante-universelle-et-la-gratuite)\n\n[![Image ](/sites/default/files/styles/800x600/public/2025-11/IMG-20251104-WA0007.jpg?itok=NoApR-84)](/2025/11/04/sommet-de-doha-tshisekedi-met-en-exergue-la-couverture-sante-universelle-et-la-gratuite)\n\nmar 04/11/2025 - 13:13\n\nLe chef de l'État Félix Antoine Tshisekedi Tshilombo a pris la parole ce mardi 4 novembre 2025, du haut de la tribune du Centre national des congrès du Qatar, à loccasion de louverture du deuxième sommet mondial pour le développement social.Ces assises sont organisées par les autorités qataries, sous l’égide de lAssemblée générale des Nations Unies.","categories":[],"source":"7sur7.cd","timestamp":1762510176,"metadata":{"title":"Sommet de Doha : Tshisekedi met en exergue la Couverture Santé Universelle et la gratuité de l'enseignement primaire","description":"Le chef de l'État Félix Antoine Tshisekedi Tshilombo a pris la parole ce mardi 4 novembre 2025, du haut de la tribune du Centre national des congrès du Qatar, à loccasion de louverture du deuxième sommet mondial pour le développement social.Ces assises sont organisées par les autorités qataries, sous l’égide de lAssemblée générale des Nations Unies.","image":"/sites/default/files/cropped-logo-7sur7-fond-blanc-75px.jpg","url":"https://7sur7.cd/2025/11/04/sommet-de-doha-tshisekedi-met-en-exergue-la-couverture-sante-universelle-et-la-gratuite"},"tokenStatistics":{"title":37,"body":271,"excerpt":70,"categories":0}}
{"title":"Grands Lacs : Guy Kabombo et Peter Pham planchent sur la situation sécuritaire dans la région","link":"https://7sur7.cd/2025/11/04/grands-lacs-guy-kabombo-et-peter-pham-planchent-sur-la-situation-securitaire-dans-la","body":"## [Grands Lacs : Guy Kabombo et Peter Pham planchent sur la situation sécuritaire dans la région](/2025/11/04/grands-lacs-guy-kabombo-et-peter-pham-planchent-sur-la-situation-securitaire-dans-la)\n\n[![Image ](/sites/default/files/styles/800x600/public/2025-11/IMG-20251104-WA0003.jpg?itok=0Xsms65Z)](/2025/11/04/grands-lacs-guy-kabombo-et-peter-pham-planchent-sur-la-situation-securitaire-dans-la)\n\nmar 04/11/2025 - 10:45\n\nLe vice-premier ministre de la Défense nationale et anciens combattants, Me Guy Kabombo Muadiamvita, a reçu, ce lundi 03 novembre 2025, à Kinshasa, Peter Pham, ancien envoyé spécial du président des USA, Donald Trump, pour la région des Grands Lacs. Selon une dépêche consultée par 7SUR7.CD, les deux personnalités ont échangé sur la situation sécuritaire en RDC et sur des enjeux politiques de l'heure relatifs à la diplomatie sécuritaire.","categories":[],"source":"7sur7.cd","timestamp":1762510177,"metadata":{"title":"Grands Lacs : Guy Kabombo et Peter Pham planchent sur la situation sécuritaire dans la région","description":"Le vice-premier ministre de la Défense nationale et anciens combattants, Me Guy Kabombo Muadiamvita, a reçu, ce lundi 03 novembre 2025, à Kinshasa, Peter Pham, ancien envoyé spécial du président des USA, Donald Trump, pour la région des Grands Lacs. Selon une dépêche consultée par 7SUR7.CD, les deux personnalités ont échangé sur la situation sécuritaire en RDC et sur des enjeux politiques de l'heure relatifs à la diplomatie sécuritaire.","image":"/sites/default/files/cropped-logo-7sur7-fond-blanc-75px.jpg","url":"https://7sur7.cd/2025/11/04/grands-lacs-guy-kabombo-et-peter-pham-planchent-sur-la-situation-securitaire-dans-la"},"tokenStatistics":{"title":23,"body":284,"excerpt":64,"categories":0}}
{"title":"Le Gouvernement annonce lasphaltage des chefs-lieux de territoires du Maniema","link":"https://7sur7.cd/2025/11/03/le-gouvernement-annonce-lasphaltage-des-chefs-lieux-de-territoires-du-maniema","body":"## [Le Gouvernement annonce lasphaltage des chefs-lieux de territoires du Maniema](/2025/11/03/le-gouvernement-annonce-lasphaltage-des-chefs-lieux-de-territoires-du-maniema)\n\n[![Image d'illustration ](/sites/default/files/styles/800x600/public/2025-11/IMG-20251103-WA0080.jpg?itok=ZRrqaHGK)](/2025/11/03/le-gouvernement-annonce-lasphaltage-des-chefs-lieux-de-territoires-du-maniema)\n\nlun 03/11/2025 - 17:38\n\nLe ministre des Infrastructures et Travaux publics, en séjour à Kindu, a annoncé le lancement prochain des travaux dasphaltage de plusieurs chefs-lieux de territoires dans la province du Maniema, dans le cadre du programme national visant à améliorer les infrastructures rurales.Cette initiative permettra de désenclaver les zones éloignées, de faciliter la circulation des biens et des personnes, et de soutenir les activités économiques locales, notamment le commerce et les échanges entre territoires.","categories":[],"source":"7sur7.cd","timestamp":1762510178,"metadata":{"title":"Le Gouvernement annonce lasphaltage des chefs-lieux de territoires du Maniema","description":"Le ministre des Infrastructures et Travaux publics, en séjour à Kindu, a annoncé le lancement prochain des travaux dasphaltage de plusieurs chefs-lieux de territoires dans la province du Maniema, dans le cadre du programme national visant à améliorer les infrastructures rurales.Cette initiative permettra de désenclaver les zones éloignées, de faciliter la circulation des biens et des personnes, et de soutenir les activités économiques locales, notamment le commerce et les échanges entre territoires.","image":"/sites/default/files/cropped-logo-7sur7-fond-blanc-75px.jpg","url":"https://7sur7.cd/2025/11/03/le-gouvernement-annonce-lasphaltage-des-chefs-lieux-de-territoires-du-maniema"},"tokenStatistics":{"title":22,"body":275,"excerpt":66,"categories":0}}
{"title":"Est de la RDC : Tshisekedi annonce la reprise des discussions avec le M23 à Doha","link":"https://7sur7.cd/2025/11/03/est-de-la-rdc-tshisekedi-annonce-la-reprise-des-discussions-avec-le-m23-doha","body":"## [Est de la RDC : Tshisekedi annonce la reprise des discussions avec le M23 à Doha](/2025/11/03/est-de-la-rdc-tshisekedi-annonce-la-reprise-des-discussions-avec-le-m23-doha)\n\n[![Image ](/sites/default/files/styles/800x600/public/2025-11/IMG-20250822-WA0004%281%29.jpg?itok=-jDRzq0g)](/2025/11/03/est-de-la-rdc-tshisekedi-annonce-la-reprise-des-discussions-avec-le-m23-doha)\n\nlun 03/11/2025 - 10:32\n\nLes pourparlers entre le gouvernement congolais et la rébellion de l'AFC/M23 vont reprendre cette semaine à Doha, au Qatar. C'est le président Félix Tshisekedi qui l'a annoncé, le week-end dernier, lors de son échange avec la communauté congolaise vivant au Caire, en Égypte. À l'en croire, c'est à l'issue de ces discussions, qui devraient déboucher sur un accord de paix, que lui et son homologue rwandais seront convoqués par Washington.","categories":[],"source":"7sur7.cd","timestamp":1762510179,"metadata":{"title":"Est de la RDC : Tshisekedi annonce la reprise des discussions avec le M23 à Doha","description":"Les pourparlers entre le gouvernement congolais et la rébellion de l'AFC/M23 vont reprendre cette semaine à Doha, au Qatar. C'est le président Félix Tshisekedi qui l'a annoncé, le week-end dernier, lors de son échange avec la communauté congolaise vivant au Caire, en Égypte. À l'en croire, c'est à l'issue de ces discussions, qui devraient déboucher sur un accord de paix, que lui et son homologue rwandais seront convoqués par Washington.","image":"/sites/default/files/cropped-logo-7sur7-fond-blanc-75px.jpg","url":"https://7sur7.cd/2025/11/03/est-de-la-rdc-tshisekedi-annonce-la-reprise-des-discussions-avec-le-m23-doha"},"tokenStatistics":{"title":24,"body":281,"excerpt":70,"categories":0}}
{"title":"Diplomatie : Kinshasa va abriter des commissions mixtes RDC-Maroc et RDC-Côte d'Ivoire en décembre","link":"https://7sur7.cd/2025/11/03/diplomatie-kinshasa-va-abriter-des-commissions-mixtes-rdc-maroc-et-rdc-cote-divoire-en","body":"## [Diplomatie : Kinshasa va abriter des commissions mixtes RDC-Maroc et RDC-Côte d'Ivoire en décembre](/2025/11/03/diplomatie-kinshasa-va-abriter-des-commissions-mixtes-rdc-maroc-et-rdc-cote-divoire-en)\n\n[![Image ](/sites/default/files/styles/800x600/public/2025-11/IMG-20251103-WA0006.jpg?itok=d58Tht_m)](/2025/11/03/diplomatie-kinshasa-va-abriter-des-commissions-mixtes-rdc-maroc-et-rdc-cote-divoire-en)\n\nlun 03/11/2025 - 09:16\n\nLa vice-ministre des Affaires étrangères, Noella Ayeganagato, a informé le Conseil des ministres, vendredi dernier, de la tenue en décembre 2025 de la 6ᵉ session de la commission mixte de concertation République Démocratique du Congo-Royaume du Maroc, et de la 4ᵉ session de la grande commission mixte République Démocratique du Congo - République de Côte d'Ivoire.C'est ce que rapporte le compte-rendu de la soixante cinquième réunion ordinaire du Conseil des ministres lu à la télévision nationale par le ministre de la Communication et médias, Patrick Muyaya Katembwe.","categories":[],"source":"7sur7.cd","timestamp":1762510180,"metadata":{"title":"Diplomatie : Kinshasa va abriter des commissions mixtes RDC-Maroc et RDC-Côte d'Ivoire en décembre","description":"La vice-ministre des Affaires étrangères, Noella Ayeganagato, a informé le Conseil des ministres, vendredi dernier, de la tenue en décembre 2025 de la 6ᵉ session de la commission mixte de concertation République Démocratique du Congo-Royaume du Maroc, et de la 4ᵉ session de la grande commission mixte République Démocratique du Congo - République de Côte d'Ivoire.C'est ce que rapporte le compte-rendu de la soixante cinquième réunion ordinaire du Conseil des ministres lu à la télévision nationale par le ministre de la Communication et médias, Patrick Muyaya Katembwe.","image":"/sites/default/files/cropped-logo-7sur7-fond-blanc-75px.jpg","url":"https://7sur7.cd/2025/11/03/diplomatie-kinshasa-va-abriter-des-commissions-mixtes-rdc-maroc-et-rdc-cote-divoire-en"},"tokenStatistics":{"title":35,"body":331,"excerpt":72,"categories":0}}
{"title":"Présidentielle en Tanzanie : Félix Tshisekedi salue la victoire de Samia Suluhu et la « maturité du peuple frère tanzanien »","link":"https://7sur7.cd/2025/11/03/presidentielle-en-tanzanie-felix-tshisekedi-salue-la-victoire-de-samia-suluhu-et-la","body":"## [Présidentielle en Tanzanie : Félix Tshisekedi salue la victoire de Samia Suluhu et la « maturité du peuple frère tanzanien »](/2025/11/03/presidentielle-en-tanzanie-felix-tshisekedi-salue-la-victoire-de-samia-suluhu-et-la)\n\n[![Image ](/sites/default/files/styles/800x600/public/2025-11/IMG-20251103-WA0001.jpg?itok=G448Tx_K)](/2025/11/03/presidentielle-en-tanzanie-felix-tshisekedi-salue-la-victoire-de-samia-suluhu-et-la)\n\nlun 03/11/2025 - 08:56\n\nDepuis Doha, au Qatar, où il séjourne, le président de la République démocratique du Congo, Félix-Antoine Tshisekedi Tshilombo, a adressé un message de félicitations à son homologue tanzanienne, Samia Suluhu Hassan, à la suite de sa « large » victoire à l’élection présidentielle en République Unie de Tanzanie.Ce dimanche, le chef de l’État congolais sest entretenu par téléphone avec la présidente réélue, lui exprimant ses « chaleureuses félicitations » pour sa reconduction à la tête du pays.","categories":[],"source":"7sur7.cd","timestamp":1762510181,"metadata":{"title":"Présidentielle en Tanzanie : Félix Tshisekedi salue la victoire de Samia Suluhu et la « maturité du peuple frère tanzanien »","description":"Depuis Doha, au Qatar, où il séjourne, le président de la République démocratique du Congo, Félix-Antoine Tshisekedi Tshilombo, a adressé un message de félicitations à son homologue tanzanienne, Samia Suluhu Hassan, à la suite de sa « large » victoire à l’élection présidentielle en République Unie de Tanzanie.Ce dimanche, le chef de l’État congolais sest entretenu par téléphone avec la présidente réélue, lui exprimant ses « chaleureuses félicitations » pour sa reconduction à la tête du pays.","image":"/sites/default/files/cropped-logo-7sur7-fond-blanc-75px.jpg","url":"https://7sur7.cd/2025/11/03/presidentielle-en-tanzanie-felix-tshisekedi-salue-la-victoire-de-samia-suluhu-et-la"},"tokenStatistics":{"title":45,"body":328,"excerpt":76,"categories":0}}
{"title":"Est de la RDC : Le Qatar réitère son engagement à soutenir les efforts « pacifiques » visant à mettre fin au conflit","link":"https://7sur7.cd/2025/11/03/est-de-la-rdc-le-qatar-reitere-son-engagement-soutenir-les-efforts-pacifiques-visant","body":"## [Est de la RDC : Le Qatar réitère son engagement à soutenir les efforts « pacifiques » visant à mettre fin au conflit](/2025/11/03/est-de-la-rdc-le-qatar-reitere-son-engagement-soutenir-les-efforts-pacifiques-visant)\n\n[![Image ](/sites/default/files/styles/800x600/public/2025-11/IMG-20251103-WA0000.jpg?itok=uo9qnN_J)](/2025/11/03/est-de-la-rdc-le-qatar-reitere-son-engagement-soutenir-les-efforts-pacifiques-visant)\n\nlun 03/11/2025 - 08:53\n\nLa ministre des Affaires étrangères de la République démocratique du Congo, Thérèse Kayigwamba Wagner, a été reçue, ce dimanche 2 novembre 2025, à Doha, par son homologue Mohammed bin Abdulaziz bin Saleh Al Khulaifi, ministre d’État aux Affaires étrangères du Qatar, dans le cadre dune visite officielle.Selon le communiqué du Gouvernement qatari annonçant cette visite, les échanges ont porté sur le renforcement de la coopération bilatérale entre les deux pays et les moyens de la développer davantage.","categories":[],"source":"7sur7.cd","timestamp":1762510181,"metadata":{"title":"Est de la RDC : Le Qatar réitère son engagement à soutenir les efforts « pacifiques » visant à mettre fin au conflit","description":"La ministre des Affaires étrangères de la République démocratique du Congo, Thérèse Kayigwamba Wagner, a été reçue, ce dimanche 2 novembre 2025, à Doha, par son homologue Mohammed bin Abdulaziz bin Saleh Al Khulaifi, ministre d’État aux Affaires étrangères du Qatar, dans le cadre dune visite officielle.Selon le communiqué du Gouvernement qatari annonçant cette visite, les échanges ont porté sur le renforcement de la coopération bilatérale entre les deux pays et les moyens de la développer davantage.","image":"/sites/default/files/cropped-logo-7sur7-fond-blanc-75px.jpg","url":"https://7sur7.cd/2025/11/03/est-de-la-rdc-le-qatar-reitere-son-engagement-soutenir-les-efforts-pacifiques-visant"},"tokenStatistics":{"title":34,"body":310,"excerpt":67,"categories":0}}
{"title":"Tshisekedi sur Kagame : « Son objectif est de scinder notre pays et doccuper, voire dannexer la partie Est »","link":"https://7sur7.cd/2025/11/02/tshisekedi-sur-kagame-son-objectif-est-de-scinder-notre-pays-et-doccuper-voire-dannexer","body":"## [Tshisekedi sur Kagame : « Son objectif est de scinder notre pays et doccuper, voire dannexer la partie Est »](/2025/11/02/tshisekedi-sur-kagame-son-objectif-est-de-scinder-notre-pays-et-doccuper-voire-dannexer)\n\n[![Droits tiers](/sites/default/files/styles/800x600/public/2025-11/6889f856-21cd-4ea3-835c-bb6b6d960344.jpeg?itok=RS7dfULI)](/2025/11/02/tshisekedi-sur-kagame-son-objectif-est-de-scinder-notre-pays-et-doccuper-voire-dannexer)\n\ndim 02/11/2025 - 21:08\n\nDevant la communauté congolaise vivant au Caire, en Égypte, ce dimanche, le président Félix Tshisekedi a accusé son homologue rwandais de vouloir scinder la République démocratique du Congo.À en croire le chef de l'État, l'objectif de Kigali est aussi de vouloir annexer la partie de la RDC qui est riche en ressources minérales et agricoles.« Ses intentions sont belliqueuses et hégémoniques. Son objectif est de scinder notre pays et doccuper, voire dannexer la partie Est qui est une terre très riche en ressources minérales et agricoles », a-t-il déclaré.","categories":[],"source":"7sur7.cd","timestamp":1762510182,"metadata":{"title":"Tshisekedi sur Kagame : « Son objectif est de scinder notre pays et doccuper, voire dannexer la partie Est »","description":"Devant la communauté congolaise vivant au Caire, en Égypte, ce dimanche, le président Félix Tshisekedi a accusé son homologue rwandais de vouloir scinder la République démocratique du Congo.À en croire le chef de l'État, l'objectif de Kigali est aussi de vouloir annexer la partie de la RDC qui est riche en ressources minérales et agricoles.« Ses intentions sont belliqueuses et hégémoniques. Son objectif est de scinder notre pays et doccuper, voire dannexer la partie Est qui est une terre très riche en ressources minérales et agricoles », a-t-il déclaré.","image":"/sites/default/files/cropped-logo-7sur7-fond-blanc-75px.jpg","url":"https://7sur7.cd/2025/11/02/tshisekedi-sur-kagame-son-objectif-est-de-scinder-notre-pays-et-doccuper-voire-dannexer"},"tokenStatistics":{"title":38,"body":362,"excerpt":74,"categories":0}}
{"title":"RDC : la suspension de certains partis politiques de l'opposition viole la constitution dans son article 62 (Emery Okundji)","link":"https://7sur7.cd/2025/11/02/rdc-la-suspension-de-certains-partis-politiques-de-lopposition-viole-la-constitution","body":"## [RDC : la suspension de certains partis politiques de l'opposition viole la constitution dans son article 62 (Emery Okundji)](/2025/11/02/rdc-la-suspension-de-certains-partis-politiques-de-lopposition-viole-la-constitution)\n\n[![Image ](/sites/default/files/styles/800x600/public/2025-11/IMG-20251102-WA0022.jpg?itok=b1FMyWhI)](/2025/11/02/rdc-la-suspension-de-certains-partis-politiques-de-lopposition-viole-la-constitution)\n\ndim 02/11/2025 - 17:07\n\nLa récente suspension des activités de plusieurs partis politiques de lopposition, notamment le PPRD, lATD, LGD, PISTE pour l'émergence et lAAP, continue de susciter de vives réactions dans la sphère politique nationale. Dans une interview accordée à 7SUR7.CD le samedi 02 octobre 2025, lancien ministre des postes et télécommunications, Emery Okundji estime que cette décision du vice-Premier ministre et ministre de lIntérieur constitue une « violation intentionnelle et manifeste » de la Constitution.","categories":[],"source":"7sur7.cd","timestamp":1762510183,"metadata":{"title":"RDC : la suspension de certains partis politiques de l'opposition viole la constitution dans son article 62 (Emery Okundji)","description":"La récente suspension des activités de plusieurs partis politiques de lopposition, notamment le PPRD, lATD, LGD, PISTE pour l'émergence et lAAP, continue de susciter de vives réactions dans la sphère politique nationale. Dans une interview accordée à 7SUR7.CD le samedi 02 octobre 2025, lancien ministre des postes et télécommunications, Emery Okundji estime que cette décision du vice-Premier ministre et ministre de lIntérieur constitue une « violation intentionnelle et manifeste » de la Constitution.","image":"/sites/default/files/cropped-logo-7sur7-fond-blanc-75px.jpg","url":"https://7sur7.cd/2025/11/02/rdc-la-suspension-de-certains-partis-politiques-de-lopposition-viole-la-constitution"},"tokenStatistics":{"title":35,"body":297,"excerpt":61,"categories":0}}
{"title":"Sankuru : Visé par une pétition, le président de lassemblée provinciale démissionne","link":"https://7sur7.cd/2025/11/02/sankuru-vise-par-une-petition-le-president-de-lassemblee-provinciale-demissionne","body":"## [Sankuru : Visé par une pétition, le président de lassemblée provinciale démissionne](/2025/11/02/sankuru-vise-par-une-petition-le-president-de-lassemblee-provinciale-demissionne)\n\n[![Image ](/sites/default/files/styles/800x600/public/2025-11/IMG-20251102-WA0000%281%29.jpg?itok=66qms_P-)](/2025/11/02/sankuru-vise-par-une-petition-le-president-de-lassemblee-provinciale-demissionne)\n\ndim 02/11/2025 - 10:25\n\nLambert Makondjo, président de l'Assemblée Provinciale du Sankuru, a finalement démissionné, sous la pression dune fronde interne menée par plusieurs députés provinciaux, il a préféré se retirer. À lorigine de cette contestation, une pétition signée par plus dune dizaine d’élus exigeant sa déchéance pour « gestion opaque » et soupçons de détournement de fonds publics. Convoqué en plénière pour sexpliquer, Lambert Makondjo a défendu son bilan, rejetant les accusations portées contre lui et dénonçant une procédure quil jugeait entachée dirrégularités.","categories":[],"source":"7sur7.cd","timestamp":1762510184,"metadata":{"title":"Sankuru : Visé par une pétition, le président de lassemblée provinciale démissionne","description":"Lambert Makondjo, président de l'Assemblée Provinciale du Sankuru, a finalement démissionné, sous la pression dune fronde interne menée par plusieurs députés provinciaux, il a préféré se retirer. À lorigine de cette contestation, une pétition signée par plus dune dizaine d’élus exigeant sa déchéance pour « gestion opaque » et soupçons de détournement de fonds publics. Convoqué en plénière pour sexpliquer, Lambert Makondjo a défendu son bilan, rejetant les accusations portées contre lui et dénonçant une procédure quil jugeait entachée dirrégularités.","image":"/sites/default/files/cropped-logo-7sur7-fond-blanc-75px.jpg","url":"https://7sur7.cd/2025/11/02/sankuru-vise-par-une-petition-le-president-de-lassemblee-provinciale-demissionne"},"tokenStatistics":{"title":25,"body":319,"excerpt":65,"categories":0}}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+31
@@ -0,0 +1,31 @@
{
"name": "@basango/crawler",
"private": true,
"dependencies": {
"@basango/logger": "workspace:*",
"@devscast/config": "^1.0.3",
"bullmq": "^4.18.3",
"date-fns": "^3.6.0",
"ioredis": "^5.8.2",
"node-html-parser": "^7.0.1",
"tiktoken": "^1.0.22",
"turndown": "^7.2.2",
"zod": "^4.1.12"
},
"devDependencies": {
"@types/turndown": "^5.0.6",
"vitest": "^4.0.7"
},
"scripts": {
"crawler:async": "bun run src/scripts/queue.ts",
"crawler:sync": "bun run src/scripts/crawl.ts",
"crawler:worker": "bun run src/scripts/worker.ts",
"clean": "rm -rf .turbo node_modules",
"format": "biome format --write .",
"lint": "biome check .",
"lint:fix": "biome check --write .",
"typecheck": "tsc --noEmit",
"test": "vitest --run"
},
"type": "module"
}
+81
@@ -0,0 +1,81 @@
import path from "node:path";
import { loadConfig as defineConfig } from "@devscast/config";
import { z } from "zod";
import {
DateRangeSchema,
HtmlSourceConfigSchema,
PageRangeSchema,
UpdateDirectionSchema,
WordPressSourceConfigSchema,
} from "@/schema";
export const PROJECT_DIR = path.resolve(__dirname, "../");
export const PipelineConfigSchema = z.object({
fetch: z.object({
async: z.object({
prefix: z.string().default("basango:crawler:queue"),
queues: z.object({
details: z.string().default("details"),
listing: z.string().default("listing"),
processing: z.string().default("processing"),
}),
redisUrl: z.string().default("redis://localhost:6379/0"),
ttl: z.object({
default: z.number().int().positive().default(600),
failure: z.number().int().nonnegative().default(3600),
result: z.number().int().nonnegative().default(3600),
}),
}),
client: z.object({
backoffInitial: z.number().nonnegative().default(1),
backoffMax: z.number().nonnegative().default(30),
backoffMultiplier: z.number().positive().default(2),
followRedirects: z.boolean().default(true),
maxRetries: z.number().int().nonnegative().default(3),
respectRetryAfter: z.boolean().default(true),
rotate: z.boolean().default(true),
timeout: z.number().positive().default(20),
userAgent: z.string().default("Basango/0.1 (+https://github.com/bernard-ng/basango)"),
verifySsl: z.boolean().default(true),
}),
crawler: z.object({
category: z.string().optional(),
dateRange: DateRangeSchema.optional(),
direction: UpdateDirectionSchema.default("forward"),
isUpdate: z.boolean().default(false),
maxWorkers: z.number().int().positive().default(5),
notify: z.boolean().default(false),
pageRange: PageRangeSchema.optional(),
source: z.union([HtmlSourceConfigSchema, WordPressSourceConfigSchema]).optional(),
useMultiThreading: z.boolean().default(false),
}),
}),
paths: z.object({
config: z.string().default(path.join(PROJECT_DIR, "config")),
data: z.string().default(path.join(PROJECT_DIR, "data", "datasets")),
root: z.string().default(PROJECT_DIR),
}),
sources: z.object({
html: z.array(HtmlSourceConfigSchema).default([]),
wordpress: z.array(WordPressSourceConfigSchema).default([]),
}),
});
export const { config, env } = defineConfig({
cwd: process.cwd(),
env: {
path: path.join(PROJECT_DIR, ".env"),
},
schema: PipelineConfigSchema,
sources: [
path.join(PROJECT_DIR, "config", "pipeline.json"),
path.join(PROJECT_DIR, "config", "sources.json"),
],
});
export type PipelineConfig = z.infer<typeof PipelineConfigSchema>;
export type FetchClientConfig = PipelineConfig["fetch"]["client"];
export type FetchCrawlerConfig = PipelineConfig["fetch"]["crawler"];
export type FetchAsyncConfig = PipelineConfig["fetch"]["async"];
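Because `defineConfig` runs at module load, importing `config` anywhere yields a fully parsed, typed object with the Zod defaults filled in:
```typescript
import { config } from "@/config";

// Values come from config/*.json and the environment, validated by PipelineConfigSchema.
console.log(config.fetch.client.userAgent); // "Basango/0.1 (+https://github.com/bernard-ng/basango)" by default
console.log(config.paths.data); // defaults to <PROJECT_DIR>/data/datasets
```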
+6
@@ -0,0 +1,6 @@
export const DEFAULT_DATE_FORMAT = "yyyy-LL-dd";
export const DEFAULT_USER_AGENT = "Basango/0.1 (+https://github.com/bernard-ng/basango)";
export const OPEN_GRAPH_USER_AGENT = "facebookexternalhit/1.1";
export const TRANSIENT_HTTP_STATUSES = [429, 500, 502, 503, 504];
export const DEFAULT_RETRY_AFTER_HEADER = "retry-after";
+241
@@ -0,0 +1,241 @@
import { setTimeout as delay } from "node:timers/promises";
import { FetchClientConfig } from "@/config";
import {
DEFAULT_RETRY_AFTER_HEADER,
DEFAULT_USER_AGENT,
TRANSIENT_HTTP_STATUSES,
} from "@/constants";
import { UserAgents } from "@/http/user-agent";
export type HttpHeaders = Record<string, string>;
export type HttpParams = Record<string, string | number | boolean | null | undefined>;
export type HttpData = unknown;
export interface HttpClientOptions {
userAgentProvider?: UserAgents;
defaultHeaders?: HttpHeaders;
fetchImpl?: typeof fetch;
sleep?: (ms: number) => Promise<void>;
}
export interface HttpRequestOptions {
headers?: HttpHeaders;
params?: HttpParams;
data?: HttpData;
json?: HttpData;
retryAfterHeader?: string;
}
export class HttpError extends Error {
readonly status: number;
readonly response: Response;
constructor(message: string, response: Response) {
super(message);
this.status = response.status;
this.response = response;
}
}
/**
* Default sleep function using setTimeout.
* @param ms - Milliseconds to sleep
*/
const defaultSleep = (ms: number): Promise<void> => {
return delay(ms).then(() => undefined);
};
/**
* Builds a URL with query parameters.
* @param url - The base URL
* @param params - The query parameters to append
*/
const buildUrl = (url: string, params?: HttpParams): string => {
if (!params || Object.keys(params).length === 0) {
return url;
}
const target = new URL(url);
for (const [key, value] of Object.entries(params)) {
if (value === undefined || value === null) continue;
target.searchParams.set(key, String(value));
}
return target.toString();
};
/**
* Computes the backoff time in milliseconds based on the configuration and attempt number.
* @param config - Fetch client configuration
* @param attempt - Current attempt number
*/
const computeBackoff = (config: FetchClientConfig, attempt: number): number => {
const base = Math.min(
config.backoffInitial * config.backoffMultiplier ** attempt,
config.backoffMax,
);
const jitter = Math.random() * base * 0.25;
return (base + jitter) * 1000;
};
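// computeBackoff example with the default client config (backoffInitial=1, backoffMultiplier=2, backoffMax=30):
// attempt 0 → ~1s, attempt 1 → ~2s, attempt 2 → ~4s, ... capped at 30s, plus up to 25% jitter.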
/**
 * Parses a Retry-After header value (seconds or an HTTP date) into milliseconds.
 * @param header - Raw Retry-After header value
 */
const parseRetryAfter = (header: string): number => {
const numeric = Number.parseInt(header, 10);
if (!Number.isNaN(numeric)) {
return Math.max(0, numeric * 1000);
}
const parsed = Date.parse(header);
if (Number.isNaN(parsed)) {
return 0;
}
const delta = parsed - Date.now();
return delta > 0 ? delta : 0;
};
/**
* Base HTTP client providing common functionality.
*
* @author Bernard Ngandu <bernard@devscast.tech>
*/
export class BaseHttpClient {
protected readonly config: FetchClientConfig;
protected readonly fetchImpl: typeof fetch;
protected readonly sleep: (ms: number) => Promise<void>;
protected readonly headers: HttpHeaders;
constructor(config: FetchClientConfig, options: HttpClientOptions = {}) {
this.config = config;
const provider =
options.userAgentProvider ??
new UserAgents(config.rotate, config.userAgent ?? DEFAULT_USER_AGENT);
const userAgent = provider.get() ?? config.userAgent ?? DEFAULT_USER_AGENT;
const baseHeaders: HttpHeaders = { "User-Agent": userAgent };
if (options.defaultHeaders) {
Object.assign(baseHeaders, options.defaultHeaders);
}
this.headers = baseHeaders;
this.fetchImpl = options.fetchImpl ?? fetch;
this.sleep = options.sleep ?? defaultSleep;
}
protected buildHeaders(headers?: HttpHeaders): HeadersInit {
return { ...this.headers, ...(headers ?? {}) };
}
protected async maybeDelay(
attempt: number,
response?: Response,
retryAfterHeader: string = DEFAULT_RETRY_AFTER_HEADER,
): Promise<void> {
let waitMs = 0;
if (response) {
const retryAfter = response.headers.get(retryAfterHeader);
if (retryAfter && this.config.respectRetryAfter) {
waitMs = parseRetryAfter(retryAfter);
}
}
if (waitMs === 0) {
waitMs = computeBackoff(this.config, attempt);
}
if (waitMs > 0) {
await this.sleep(waitMs);
}
}
}
/**
* Synchronous HTTP client with retry and timeout capabilities.
*
* @author Bernard Ngandu <bernard@devscast.tech>
*/
export class SyncHttpClient extends BaseHttpClient {
async request(method: string, url: string, options: HttpRequestOptions = {}): Promise<Response> {
const retryAfterHeader = options.retryAfterHeader ?? DEFAULT_RETRY_AFTER_HEADER;
const target = buildUrl(url, options.params);
const maxAttempts = this.config.maxRetries + 1;
let attempt = 0;
let lastError: unknown;
while (attempt < maxAttempts) {
const controller = new AbortController();
let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
try {
timeoutHandle = setTimeout(() => controller.abort(), this.config.timeout * 1000);
const headers = this.buildHeaders(options.headers);
const init: RequestInit = {
body: options.data as BodyInit | undefined,
headers,
method,
redirect: this.config.followRedirects ? "follow" : "manual",
signal: controller.signal,
};
if (options.json !== undefined) {
init.body = JSON.stringify(options.json);
(init.headers as Record<string, string>)["Content-Type"] ??= "application/json";
}
const response = await this.fetchImpl(target, init);
if (
TRANSIENT_HTTP_STATUSES.includes(response.status) &&
attempt < this.config.maxRetries
) {
await this.maybeDelay(attempt, response, retryAfterHeader);
attempt += 1;
continue;
}
if (!response.ok) {
throw new HttpError(`HTTP ${response.status} ${response.statusText}`, response);
}
return response;
} catch (error) {
if (error instanceof HttpError) {
lastError = error;
throw error;
}
// Timeouts (AbortError) and other network failures are retried alike until maxRetries.
lastError = error;
if (attempt >= this.config.maxRetries) {
throw error;
}
await this.maybeDelay(attempt);
attempt += 1;
} finally {
if (timeoutHandle) {
clearTimeout(timeoutHandle);
}
}
}
throw lastError instanceof Error ? lastError : new Error("HTTP request failed after retries");
}
get(url: string, options?: Omit<HttpRequestOptions, "data" | "json">): Promise<Response> {
return this.request("GET", url, options);
}
post(url: string, options: HttpRequestOptions = {}): Promise<Response> {
return this.request("POST", url, options);
}
}
export type HttpClient = SyncHttpClient;
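The injectable `fetchImpl` and `sleep` hooks make the retry behaviour easy to exercise without touching the network. A minimal sketch using the shipped client config:
```typescript
import { config } from "@/config";
import { SyncHttpClient } from "@/http/http-client";

// Stub fetch: respond once with a transient 503, then succeed.
let calls = 0;
const fakeFetch: typeof fetch = async () =>
  ++calls === 1 ? new Response("retry", { status: 503 }) : new Response("ok");

const client = new SyncHttpClient(config.fetch.client, {
  fetchImpl: fakeFetch,
  sleep: async () => {}, // skip real backoff delays in tests
});

const response = await client.get("https://example.com/articles", { params: { page: 1 } });
console.log(await response.text()); // "ok", after one retried attempt
```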
+102
@@ -0,0 +1,102 @@
import { parse } from "node-html-parser";
import { config } from "@/config";
import { OPEN_GRAPH_USER_AGENT } from "@/constants";
import { SyncHttpClient } from "@/http/http-client";
import { UserAgents } from "@/http/user-agent";
import { ArticleMetadata } from "@/schema";
/**
* Picks the first non-empty value from the provided array.
* @param values - An array of string values
*/
const pick = (values: Array<string | null | undefined>): string | undefined => {
for (const value of values) {
if (value && value.trim().length > 0) {
return value.trim();
}
}
return undefined;
};
/**
* Extracts the content of a meta tag given its property or name.
* @param root - The root HTML element
* @param property - The property or name of the meta tag to extract
*/
const extract = (root: ReturnType<typeof parse>, property: string): string | null => {
const selector = `meta[property='${property}'], meta[name='${property}']`;
const node = root.querySelector(selector);
if (!node) {
return null;
}
return node.getAttribute("content") ?? null;
};
/**
* OpenGraph consumer for extracting Open Graph metadata from HTML pages.
* Uses a synchronous HTTP client to fetch the HTML content.
*
* @author Bernard Ngandu <bernard@devscast.tech>
*/
export class OpenGraph {
private readonly client: Pick<SyncHttpClient, "get">;
constructor() {
const settings = config.fetch.client;
const provider = new UserAgents(true, OPEN_GRAPH_USER_AGENT);
this.client = new SyncHttpClient(settings, {
defaultHeaders: { "User-Agent": provider.og() },
userAgentProvider: provider,
});
}
/**
* Consume a URL and extract Open Graph metadata.
* @param url - The URL to fetch and parse
*/
async consumeUrl(url: string): Promise<ArticleMetadata | undefined> {
try {
const response = await this.client.get(url);
const html = await response.text();
return OpenGraph.consumeHtml(html, url);
} catch {
return undefined;
}
}
/**
* Consume HTML content and extract Open Graph metadata.
* @param html - HTML content as a string
* @param url - Optional URL of the page
*/
static consumeHtml(html: string, url?: string): ArticleMetadata | undefined {
if (!html) {
return undefined;
}
const root = parse(html);
const title = pick([extract(root, "og:title"), root.querySelector("title")?.text]);
const description = pick([extract(root, "og:description"), extract(root, "description")]);
const image = pick([
extract(root, "og:image"),
root.querySelector("img")?.getAttribute("src") ?? null,
]);
const canonical = pick([
extract(root, "og:url"),
root.querySelector("link[rel='canonical']")?.getAttribute("href") ?? null,
url ?? null,
]);
if (!title && !description && !image && !canonical) {
return undefined;
}
return {
description,
image,
title,
url: canonical,
};
}
}
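`consumeHtml` is a pure static helper, so it can be reused when the crawler has already fetched the page:
```typescript
import { OpenGraph } from "@/http/open-graph";

const html = `<html><head>
  <title>Fallback title</title>
  <meta property="og:title" content="Example article" />
  <meta property="og:description" content="A short summary." />
</head><body></body></html>`;

const metadata = OpenGraph.consumeHtml(html, "https://example.com/article");
// => { description: "A short summary.", image: undefined, title: "Example article", url: "https://example.com/article" }
console.log(metadata);
```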
+41
@@ -0,0 +1,41 @@
import { DEFAULT_USER_AGENT, OPEN_GRAPH_USER_AGENT } from "@/constants";
/**
* User agent provider with optional rotation.
* Allows fetching a random user agent from a predefined list
* or using a fallback user agent.
*
* @author Bernard Ngandu <bernard@devscast.tech>
*/
export class UserAgents {
private static readonly USER_AGENTS: string[] = [
"Mozilla/5.0 (iPhone; CPU iPhone OS 10_4_8; like Mac OS X) AppleWebKit/603.39 (KHTML, like Gecko) Chrome/52.0.3638.271 Mobile Safari/537.5",
"Mozilla/50.0 (Linux; U; Linux x86_64; en-US) Gecko/20130401 Firefox/52.7",
"Mozilla/5.0 (Linux; U; Android 5.0; SM-P815 Build/LRX22G) AppleWebKit/600.4 (KHTML, like Gecko) Chrome/48.0.1562.260 Mobile Safari/600.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.3;) AppleWebKit/533.34 (KHTML, like Gecko) Chrome/51.0.1883.215 Safari/533",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.3; x64; en-US Trident/4.0)",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_10_3) Gecko/20100101 Firefox/63.4",
"Mozilla/5.0 (Linux; Linux x86_64; en-US) AppleWebKit/603.50 (KHTML, like Gecko) Chrome/55.0.2226.116 Safari/601",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 7_8_3; en-US) Gecko/20100101 Firefox/68.9",
"Mozilla/5.0 (iPhone; CPU iPhone OS 8_9_8; like Mac OS X) AppleWebKit/603.34 (KHTML, like Gecko) Chrome/47.0.1126.107 Mobile Safari/602.7",
"Mozilla/5.0 (iPod; CPU iPod OS 8_2_0; like Mac OS X) AppleWebKit/601.40 (KHTML, like Gecko) Chrome/47.0.1590.178 Mobile Safari/535.2",
];
private readonly rotate: boolean;
private readonly fallback: string;
constructor(rotate: boolean = true, fallback: string = DEFAULT_USER_AGENT) {
this.rotate = rotate;
this.fallback = fallback;
}
og(): string {
return OPEN_GRAPH_USER_AGENT;
}
get(): string {
if (!this.rotate) return this.fallback;
const idx = Math.floor(Math.random() * UserAgents.USER_AGENTS.length);
return UserAgents.USER_AGENTS[idx]!;
}
}
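Rotation is on by default; turning it off pins every request to the fallback agent (the `MyBot` string below is a hypothetical example):
```typescript
import { UserAgents } from "@/http/user-agent";

const rotating = new UserAgents(); // rotate: true by default
console.log(rotating.get()); // random entry from the built-in list

const fixed = new UserAgents(false, "MyBot/1.0 (+https://example.com)"); // hypothetical fallback
console.log(fixed.get()); // always "MyBot/1.0 (+https://example.com)"
console.log(fixed.og()); // "facebookexternalhit/1.1"
```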
+137
@@ -0,0 +1,137 @@
import { logger } from "@basango/logger";
import { config, env } from "@/config";
import { SyncHttpClient } from "@/http/http-client";
import { createQueueManager, QueueManager } from "@/process/async/queue";
import {
DetailsTaskPayload,
ListingTaskPayload,
ProcessingTaskPayload,
} from "@/process/async/schemas";
import { resolveCrawlerConfig } from "@/process/crawler";
import { HtmlCrawler } from "@/process/parsers/html";
import { WordPressCrawler } from "@/process/parsers/wordpress";
import { JsonlPersistor } from "@/process/persistence";
import { Article, HtmlSourceConfig, SourceKindSchema, WordPressSourceConfig } from "@/schema";
import { createDateRange, formatDateRange, formatPageRange, resolveSourceConfig } from "@/utils";
export const collectHtmlListing = async (
payload: ListingTaskPayload,
manager: QueueManager = createQueueManager(),
): Promise<number> => {
const source = resolveSourceConfig(payload.sourceId) as HtmlSourceConfig;
if (source.sourceKind !== "html") {
return await collectWordPressListing(payload, manager);
}
const settings = resolveCrawlerConfig(source, payload);
const crawler = new HtmlCrawler(settings);
const pageRange = settings.pageRange ?? (await crawler.getPagination());
let queued = 0;
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
const target = crawler.buildPageUrl(page) ?? source.sourceUrl;
try {
const items = await crawler.fetchLinks(target, source.sourceSelectors.articles);
for (const node of items) {
const url = crawler.extractLink(node);
if (!url) continue;
await manager.enqueueArticle({
category: payload.category,
dateRange: createDateRange(payload.dateRange),
sourceId: payload.sourceId,
url,
} as DetailsTaskPayload);
queued += 1;
}
} catch (error) {
logger.error({ error, target }, "Failed to crawl page");
}
}
return queued;
};
export const collectWordPressListing = async (
payload: ListingTaskPayload,
manager: QueueManager = createQueueManager(),
): Promise<number> => {
const source = resolveSourceConfig(payload.sourceId) as WordPressSourceConfig;
if (source.sourceKind !== "wordpress") {
return await collectHtmlListing(payload, manager);
}
const settings = resolveCrawlerConfig(source, payload);
const crawler = new WordPressCrawler(settings);
const pageRange = settings.pageRange ?? (await crawler.getPagination());
let queued = 0;
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
const url = crawler.postsEndpoint(page);
try {
const entries = await crawler.fetchLinks(url);
for (const data of entries) {
const link = data.link;
if (!link) continue;
await manager.enqueueArticle({
category: payload.category,
data,
dateRange: createDateRange(payload.dateRange),
sourceId: payload.sourceId,
url: link,
} as DetailsTaskPayload);
queued += 1;
}
} catch (error) {
logger.error({ error, page }, "Failed to fetch WordPress page");
}
}
return queued;
};
export const collectArticle = async (payload: DetailsTaskPayload): Promise<unknown> => {
const source = resolveSourceConfig(payload.sourceId);
const settings = resolveCrawlerConfig(source, {
category: payload.category,
dateRange: payload.dateRange ? formatDateRange(payload.dateRange) : undefined,
pageRange: payload.pageRange ? formatPageRange(payload.pageRange) : undefined,
sourceId: payload.sourceId,
});
const persistors = [
new JsonlPersistor({
directory: config.paths.data,
sourceId: String(source.sourceId),
}),
];
if (source.sourceKind === SourceKindSchema.enum.html) {
if (!payload.url) throw new Error("Missing article url");
const crawler = new HtmlCrawler(settings, { persistors });
const html = await crawler.crawl(payload.url);
return await crawler.fetchOne(html, settings.dateRange);
}
if (source.sourceKind === SourceKindSchema.enum.wordpress) {
const crawler = new WordPressCrawler(settings, { persistors });
return await crawler.fetchOne(payload.data ?? {}, settings.dateRange);
}
throw new Error(`Unsupported source kind: ${source.sourceKind}`);
};
export const forwardForProcessing = async (payload: ProcessingTaskPayload): Promise<Article> => {
logger.info({ article: payload.article.title }, "Ready for downstream processing");
const client = new SyncHttpClient(config.fetch.client);
const endpoint = env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT");
await client.post(endpoint, { json: payload.article });
logger.info({ article: payload.article.title }, "Forwarded article to API");
return payload.article;
};
+107
@@ -0,0 +1,107 @@
import { randomUUID } from "node:crypto";
import { JobsOptions, Queue, QueueOptions } from "bullmq";
import IORedis from "ioredis";
import { config, FetchAsyncConfig } from "@/config";
import {
DetailsTaskPayload,
DetailsTaskPayloadSchema,
ListingTaskPayload,
ListingTaskPayloadSchema,
ProcessingTaskPayload,
ProcessingTaskPayloadSchema,
} from "@/process/async/schemas";
import { parseRedisUrl } from "@/utils";
export interface QueueBackend<T = unknown> {
add: (name: string, data: T, opts?: JobsOptions) => Promise<{ id: string }>;
}
export type QueueFactory = (
queueName: string,
settings: FetchAsyncConfig,
connection?: IORedis,
) => QueueBackend;
const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => {
const redisConnection =
connection ??
new IORedis(settings.redisUrl, {
...parseRedisUrl(settings.redisUrl),
maxRetriesPerRequest: null,
});
const options: QueueOptions = {
connection: redisConnection,
prefix: settings.prefix,
};
const queue = new Queue(queueName, options);
return {
add: async (name, data, opts) => {
const job = await queue.add(name, data, {
removeOnComplete: settings.ttl.result === 0 ? true : undefined,
removeOnFail: settings.ttl.failure === 0 ? true : undefined,
...opts,
});
return { id: job.id ?? randomUUID() };
},
};
};
export interface CreateQueueManagerOptions {
queueFactory?: QueueFactory;
connection?: IORedis;
}
export interface QueueManager {
readonly settings: FetchAsyncConfig;
readonly connection: IORedis;
enqueueListing: (payload: ListingTaskPayload) => Promise<{ id: string }>;
enqueueArticle: (payload: DetailsTaskPayload) => Promise<{ id: string }>;
enqueueProcessed: (payload: ProcessingTaskPayload) => Promise<{ id: string }>;
iterQueueNames: () => string[];
queueName: (suffix: string) => string;
close: () => Promise<void>;
}
export const createQueueManager = (options: CreateQueueManagerOptions = {}): QueueManager => {
const settings = config.fetch.async;
const connection =
options.connection ??
new IORedis(settings.redisUrl, {
...parseRedisUrl(settings.redisUrl),
maxRetriesPerRequest: null,
});
const factory = options.queueFactory ?? defaultQueueFactory;
const ensureQueue = (queueName: string) => factory(queueName, settings, connection);
return {
close: async () => {
await connection.quit();
},
connection,
enqueueArticle: (payload) => {
const data = DetailsTaskPayloadSchema.parse(payload);
const queue = ensureQueue(settings.queues.details);
return queue.add("collect_article", data);
},
enqueueListing: (payload) => {
const data = ListingTaskPayloadSchema.parse(payload);
const queue = ensureQueue(settings.queues.listing);
return queue.add("collect_listing", data);
},
enqueueProcessed: (payload) => {
const data = ProcessingTaskPayloadSchema.parse(payload);
const queue = ensureQueue(settings.queues.processing);
return queue.add("forward_for_processing", data);
},
iterQueueNames: () => [
`${settings.prefix}:${settings.queues.listing}`,
`${settings.prefix}:${settings.queues.details}`,
`${settings.prefix}:${settings.queues.processing}`,
],
queueName: (suffix: string) => `${settings.prefix}:${suffix}`,
settings,
};
};
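Assuming a Redis instance is reachable at the configured URL, the manager can be used directly:
```typescript
import { createQueueManager } from "@/process/async/queue";

const manager = createQueueManager();
console.log(manager.iterQueueNames());
// ["basango:crawler:listing", "basango:crawler:details", "basango:crawler:processing"] with the shipped config

await manager.enqueueListing({ sourceId: "radiookapi.net", pageRange: "1:10" });
await manager.close();
```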
+28
@@ -0,0 +1,28 @@
import { z } from "zod";
import { ArticleSchema, DateRangeSchema, PageRangeSchema } from "@/schema";
export const ListingTaskPayloadSchema = z.object({
category: z.string().optional(),
dateRange: z.string().optional(),
pageRange: z.string().optional(),
sourceId: z.string(),
});
export const DetailsTaskPayloadSchema = z.object({
category: z.string().optional(),
data: z.any().optional(),
dateRange: DateRangeSchema.optional(),
page: z.number().int().nonnegative().optional(),
pageRange: PageRangeSchema.optional(),
sourceId: z.string(),
url: z.url(),
});
export const ProcessingTaskPayloadSchema = z.object({
article: ArticleSchema,
sourceId: z.string(),
});
export type ListingTaskPayload = z.infer<typeof ListingTaskPayloadSchema>;
export type DetailsTaskPayload = z.infer<typeof DetailsTaskPayloadSchema>;
export type ProcessingTaskPayload = z.infer<typeof ProcessingTaskPayloadSchema>;
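For reference, a quick sketch of how the payload schemas behave at the queue boundary (values are illustrative):

```typescript
import { DetailsTaskPayloadSchema, ListingTaskPayloadSchema } from "@/process/async/schemas";

// Listing payloads only require sourceId; ranges stay raw spec strings here.
ListingTaskPayloadSchema.parse({ pageRange: "1:5", sourceId: "example.com" });

// Details payloads validate the URL and the structured ranges.
const details = DetailsTaskPayloadSchema.safeParse({ sourceId: "example.com", url: "not-a-url" });
console.log(details.success); // false: z.url() rejects malformed links
```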
+60
View File
@@ -0,0 +1,60 @@
import { logger } from "@basango/logger";
import * as handlers from "@/process/async/handlers";
import { createQueueManager } from "@/process/async/queue";
import {
DetailsTaskPayloadSchema,
ListingTaskPayloadSchema,
ProcessingTaskPayloadSchema,
} from "@/process/async/schemas";
import { CrawlingOptions } from "@/process/crawler";
export const collectListing = async (payload: unknown): Promise<number> => {
const data = ListingTaskPayloadSchema.parse(payload);
logger.debug({ data }, "Collecting listing");
const count = await handlers.collectHtmlListing(data);
logger.info({ count }, "Listing collection completed");
return count;
};
export const collectArticle = async (payload: unknown): Promise<unknown> => {
const data = DetailsTaskPayloadSchema.parse(payload);
logger.info({ data }, "Collecting article");
const result = await handlers.collectArticle(data);
logger.info({ url: data.url }, "Article collection completed");
return result;
};
export const forwardForProcessing = async (payload: unknown): Promise<unknown> => {
const data = ProcessingTaskPayloadSchema.parse(payload);
logger.debug({ sourceId: data.sourceId }, "Forwarding article for processing");
const result = await handlers.forwardForProcessing(data);
logger.info({ result }, "Article forwarded for processing");
return result;
};
export const scheduleAsyncCrawl = async (options: CrawlingOptions): Promise<string> => {
const payload = ListingTaskPayloadSchema.parse({
category: options.category,
dateRange: options.dateRange,
pageRange: options.pageRange,
sourceId: options.sourceId,
});
const manager = createQueueManager();
logger.info({ payload }, "Scheduling listing collection job");
try {
const job = await manager.enqueueListing(payload);
logger.info({ job }, "Scheduled listing collection job");
return job.id;
} finally {
await manager.close();
}
};
+74
View File
@@ -0,0 +1,74 @@
import { QueueEvents, Worker } from "bullmq";
import IORedis from "ioredis";
import { QueueFactory, QueueManager } from "@/process/async/queue";
import { collectArticle, collectListing, forwardForProcessing } from "@/process/async/tasks";
export interface WorkerOptions {
queueNames?: string[];
connection?: IORedis;
queueFactory?: QueueFactory;
concurrency?: number;
onError?: (error: Error) => void;
queueManager: QueueManager;
}
export interface WorkerHandle {
readonly workers: Worker[];
readonly events: QueueEvents[];
close: () => Promise<void>;
}
export const startWorker = (options: WorkerOptions): WorkerHandle => {
const manager = options.queueManager;
const queueNames = options.queueNames ?? manager.iterQueueNames();
const workers: Worker[] = [];
const events: QueueEvents[] = [];
const connection = manager.connection;
for (const queueName of queueNames) {
const worker = new Worker(
queueName,
async (job) => {
switch (job.name) {
case "collect_listing":
return collectListing(job.data);
case "collect_article":
return collectArticle(job.data);
case "forward_for_processing":
return forwardForProcessing(job.data);
default:
throw new Error(`Unknown job name: ${job.name}`);
}
},
{
concurrency: options.concurrency ?? 5,
connection,
},
);
if (options.onError) {
worker.on("failed", (_, err) => options.onError?.(err as Error));
worker.on("error", (err) => options.onError?.(err as Error));
}
const queueEvents = new QueueEvents(queueName, { connection });
workers.push(worker);
events.push(queueEvents);
}
return {
close: async () => {
await Promise.all(workers.map((worker) => worker.close()));
await Promise.all(events.map((event) => event.close()));
// The queue manager is injected by the caller, so the caller remains responsible for closing it.
},
events,
workers,
};
};
+44
View File
@@ -0,0 +1,44 @@
import logger from "@basango/logger";
import { config, FetchCrawlerConfig } from "@/config";
import { JsonlPersistor, Persistor } from "@/process/persistence";
import { AnySourceConfig } from "@/schema";
import { createDateRange, createPageRange } from "@/utils";
export interface CrawlingOptions {
sourceId: string;
pageRange?: string | undefined;
dateRange?: string | undefined;
category?: string | undefined;
}
export const resolveCrawlerConfig = (
source: AnySourceConfig,
options: CrawlingOptions,
): FetchCrawlerConfig => {
return {
...config.fetch.crawler,
category: options.category,
dateRange: createDateRange(options.dateRange),
pageRange: createPageRange(options.pageRange),
source,
};
};
export const createPersistors = (source: AnySourceConfig): Persistor[] => {
return [
new JsonlPersistor({
directory: config.paths.data,
sourceId: source.sourceId,
}),
];
};
export const closePersistors = async (persistors: Persistor[]): Promise<void> => {
for (const persistor of persistors) {
try {
await persistor.close();
} catch (error) {
logger.warn({ error }, "Failed to close persistor");
}
}
};
+107
View File
@@ -0,0 +1,107 @@
import { HTMLElement, parse as parseHtml } from "node-html-parser";
import { config, FetchCrawlerConfig } from "@/config";
import { SyncHttpClient } from "@/http/http-client";
import { OpenGraph } from "@/http/open-graph";
import type { Persistor } from "@/process/persistence";
import { AnySourceConfig, Article } from "@/schema";
export interface CrawlerOptions {
persistors?: Persistor[];
}
export abstract class BaseCrawler {
protected readonly settings: FetchCrawlerConfig;
protected readonly source: AnySourceConfig;
protected readonly http: SyncHttpClient;
protected readonly persistors: Persistor[];
protected readonly openGraph: OpenGraph;
protected constructor(settings: FetchCrawlerConfig, options: CrawlerOptions = {}) {
if (!settings.source) {
throw new Error("Crawler requires a bound source");
}
this.http = new SyncHttpClient(config.fetch.client);
this.persistors = options.persistors ?? [];
this.openGraph = new OpenGraph();
this.settings = settings;
this.source = settings.source as AnySourceConfig;
}
/**
* Fetch and process articles from the source.
*/
abstract fetch(): Promise<void> | void;
/**
* Crawl the given URL and return the HTML content as a string.
* @param url - The URL to crawl
*/
async crawl(url: string): Promise<string> {
const response = await this.http.get(url);
return await response.text();
}
/**
* Extract text content from an HTML node.
* @param node - The HTML node
*/
protected textContent(node: HTMLElement | null | undefined): string | null {
if (!node) return null;
// innerText keeps spacing similar to browser rendering
const value = node.innerText ?? node.text;
const text = value.trim();
return text.length ? text : null;
}
/**
* Extract the first matching element from the root using the selector.
* @param root - The root HTML element
* @param selector - The CSS selector
*/
protected extractFirst(root: HTMLElement, selector?: string | null): HTMLElement | null {
if (!selector) return null;
try {
return root.querySelector(selector) ?? null;
} catch {
return null;
}
}
/**
* Extract all matching elements from the root using the selector.
* @param root - The root HTML element
* @param selector - The CSS selector
*/
protected extractAll(root: HTMLElement, selector?: string | null): HTMLElement[] {
if (!selector) return [];
try {
return root.querySelectorAll(selector);
} catch {
return [];
}
}
/**
* Parse HTML string into an HTMLElement.
* @param html - The HTML string
*/
protected parseHtml(html: string): HTMLElement {
return parseHtml(html) as unknown as HTMLElement;
}
/**
* Enrich the record with Open Graph metadata from the given URL.
* @param record - The article record
* @param url - The URL to fetch Open Graph data from
*/
protected async enrichWithOpenGraph(record: Article, url?: string): Promise<Article> {
try {
const metadata = url ? await this.openGraph.consumeUrl(url) : undefined;
return { ...record, metadata };
} catch {
return { ...record, metadata: undefined };
}
}
}
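As a sketch of the extension point, a hypothetical subclass that only logs titles from the source's landing page (the selector is illustrative):

```typescript
import { FetchCrawlerConfig } from "@/config";
import { BaseCrawler } from "@/process/parsers/base";

// Hypothetical crawler exercising the BaseCrawler helpers.
class TitleCrawler extends BaseCrawler {
  constructor(settings: FetchCrawlerConfig) {
    super(settings); // throws unless settings.source is bound
  }

  async fetch(): Promise<void> {
    const root = this.parseHtml(await this.crawl(this.source.sourceUrl));
    for (const node of this.extractAll(root, "h2.title")) {
      const title = this.textContent(node);
      if (title) console.log(title);
    }
  }
}
```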
+335
View File
@@ -0,0 +1,335 @@
import { logger } from "@basango/logger";
import { getUnixTime, isMatch as isDateMatch, parse as parseDateFns } from "date-fns";
import { HTMLElement } from "node-html-parser";
import TurndownService from "turndown";
import { FetchCrawlerConfig } from "@/config";
import { BaseCrawler } from "@/process/parsers/base";
import { Persistor, persist } from "@/process/persistence";
import { DateRange, HtmlSourceConfig } from "@/schema";
import { createAbsoluteUrl, isTimestampInRange } from "@/utils";
const md = new TurndownService({
bulletListMarker: "-",
headingStyle: "atx",
hr: "---",
});
/**
* Create a safe RegExp from the given pattern.
* @param pattern
*/
const safeRegExp = (pattern?: string | null): RegExp | null => {
if (!pattern) return null;
try {
return new RegExp(pattern, "g");
} catch {
return null;
}
};
/**
* Crawler for generic HTML pages.
*/
export class HtmlCrawler extends BaseCrawler {
readonly source: HtmlSourceConfig;
private currentArticleUrl: string | null = null;
constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
super(settings, options);
if (!settings.source || settings.source.sourceKind !== "html") {
throw new Error("HtmlCrawler requires a source of kind 'html'");
}
this.source = this.settings.source as HtmlSourceConfig;
}
async fetch(): Promise<void> {
const pageRange = this.settings.pageRange ?? (await this.getPagination());
const dateRange = this.settings.dateRange;
const articleSelector = this.source.sourceSelectors.articles;
if (!articleSelector) {
logger.error(
{ source: this.source.sourceId },
"No article selector configured for HTML source",
);
return;
}
let stop = false;
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
const pageUrl = this.buildPageUrl(page);
let html: string;
try {
html = await this.crawl(pageUrl);
} catch (error) {
logger.error({ error, page, pageUrl }, "> page %s => [failed]", page);
continue;
}
const root = this.parseHtml(html);
const articles = this.extractAll(root, articleSelector);
if (!articles.length) {
logger.info({ page }, "No articles found on page");
continue;
}
for (const node of articles) {
try {
this.currentArticleUrl = this.extractLink(node);
let targetHtml = node.toString();
if (this.source.requiresDetails) {
if (!this.currentArticleUrl) {
logger.debug({ page }, "Skipping article without link for details");
continue;
}
try {
targetHtml = await this.crawl(this.currentArticleUrl);
} catch (err) {
logger.error(
{ error: err, url: this.currentArticleUrl },
"Failed to fetch detail page",
);
continue;
}
}
const saved = await this.fetchOne(targetHtml, dateRange);
// stop early on first out-of-range if pages are sorted by date desc
if (saved === null) {
stop = true;
break;
}
} catch (error) {
logger.error({ error, pageUrl }, "Failed to process article on page");
} finally {
this.currentArticleUrl = null;
}
}
if (stop) break;
}
}
/**
* Fetch and process a single HTML article.
* @param html - The HTML content of the article
* @param dateRange - Optional date range for filtering
*/
async fetchOne(html: string, dateRange?: DateRange | null) {
const root = this.parseHtml(html);
const sel = this.source.sourceSelectors;
const titleText = this.extractText(root, sel.articleTitle) ?? "Untitled";
const link = this.currentArticleUrl ?? this.extractLink(root);
if (!link) {
logger.warn({ title: titleText }, "Skipping article without link");
return null;
}
const body = this.extractBody(root, sel.articleBody);
const categories = this.extractCategories(root, sel.articleCategories);
const rawDate = this.extractText(root, sel.articleDate);
const timestamp = this.computeTimestamp(rawDate);
if (dateRange && !isTimestampInRange(dateRange, timestamp)) {
logger.info(
{ date: rawDate, link, timestamp, title: titleText },
"Skipping article outside date range",
);
return null;
}
const enriched = await this.enrichWithOpenGraph(
{
body,
categories,
link,
source: this.source.sourceId,
timestamp,
title: titleText,
},
link,
);
return await persist(enriched, this.persistors);
}
/**
* Fetch links from the target URL using the given selector.
* @param target - The target URL to crawl
* @param selector - The CSS selector to extract links
*/
async fetchLinks(target: string, selector: string) {
const html = await this.crawl(target);
const root = this.parseHtml(html);
return this.extractAll(root, selector);
}
/**
* Get the pagination range (start and end page numbers).
*/
async getPagination(): Promise<{ start: number; end: number }> {
return { end: await this.getLastPage(), start: 0 };
}
/**
* Determine the last page number from pagination links.
*/
private async getLastPage(): Promise<number> {
const template = this.applyCategory(this.source.paginationTemplate);
const url = `${this.source.sourceUrl}${template}`;
try {
const html = await this.crawl(url);
const root = this.parseHtml(html);
const links = this.extractAll(root, this.source.sourceSelectors.pagination);
if (!links.length) return 1;
const last = links[links.length - 1]!;
const href = last.getAttribute("href") as string | null;
if (!href) return 1;
// Heuristic: prefer a number in the href, else "page" query param
const numberMatch = href.match(/(\d+)/);
if (numberMatch) {
const page = Number.parseInt(numberMatch[1]!, 10);
return Number.isFinite(page) && page > 0 ? page : 1;
}
const urlObj = new URL(createAbsoluteUrl(this.source.sourceUrl, href));
const pageParam = urlObj.searchParams.get("page");
if (pageParam) {
const page = Number.parseInt(pageParam, 10);
return Number.isFinite(page) && page > 0 ? page : 1;
}
return 1;
} catch {
return 1;
}
}
/**
* Build the URL for a given page number.
* @param page - The page number
*/
buildPageUrl(page: number): string {
let template = this.applyCategory(this.source.paginationTemplate);
if (template.includes("{page}")) {
template = template.replace("{page}", String(page));
} else if (page > 0) {
const sep = template.includes("?") ? "&" : "?";
template = `${template}${sep}page=${page}`;
}
return createAbsoluteUrl(this.source.sourceUrl, template);
}
/**
* Apply category replacement in the template if needed.
* @param template - The URL template
*/
private applyCategory(template: string): string {
if (template.includes("{category}")) {
const replacement = this.settings.category ?? "";
return template.replace("{category}", replacement);
}
return template;
}
/**
* Extract link URL from the given node using the selector.
* @param node - The HTML element
*/
extractLink(node: HTMLElement): string | null {
const selector = this.source.sourceSelectors.articleLink;
if (!selector) return null;
const target = this.extractFirst(node, selector);
if (!target) return null;
const href =
target.getAttribute("href") ?? target.getAttribute("data-href") ?? target.getAttribute("src");
if (!href) return null;
return createAbsoluteUrl(this.source.sourceUrl, href);
}
/**
* Extract text content from the root using the selector.
* @param root - The root HTML element
* @param selector - The CSS selector
*/
private extractText(root: HTMLElement, selector?: string | null): string | null {
if (!selector) return null;
const target = this.extractFirst(root, selector);
if (!target) return null;
// If it's an image, prefer alt/title
const tag = target.tagName.toLowerCase();
if (tag === "img") {
const alt = target.getAttribute("alt");
const title = target.getAttribute("title");
const pick = (alt ?? title ?? "").trim();
if (pick.length > 0) return pick;
}
return this.textContent(target);
}
/**
* Extract body content from the root using the selector.
* @param root - The root HTML element
* @param selector - The CSS selector
*/
private extractBody(root: HTMLElement, selector?: string | null): string {
if (selector) {
const nodes = this.extractAll(root, selector);
if (nodes.length) {
const parts = nodes.map((n) => md.turndown(n.toString())).filter(Boolean);
if (parts.length) return parts.join("\n");
}
}
return md.turndown(root.toString());
}
/**
* Extract categories from the root using the selector.
* @param root - The root HTML element
* @param selector - The CSS selector
*/
private extractCategories(root: HTMLElement, selector?: string | null): string[] {
if (!selector) return [];
const values: string[] = [];
for (const node of this.extractAll(root, selector)) {
const text = this.textContent(node);
if (!text) continue;
const lower = text.toLowerCase();
if (!values.includes(lower)) values.push(lower);
}
return values;
}
/**
* Compute Unix timestamp from raw date string.
* @param raw - Raw date string
* @private
*/
private computeTimestamp(raw?: string | null): number {
if (!raw) return Math.floor(Date.now() / 1000);
let value = raw.trim();
const pattern = safeRegExp(this.source.sourceDate?.pattern);
const replacement = this.source.sourceDate?.replacement ?? "";
if (pattern) {
try {
value = value.replace(pattern, replacement);
} catch {
// ignore pattern failures
}
}
const format = this.source.sourceDate?.format ?? "yyyy-LL-dd HH:mm";
if (!isDateMatch(value, format)) {
// fallback: try native Date.parse as last resort
const parsed = Date.parse(value);
return Number.isNaN(parsed) ? Math.floor(Date.now() / 1000) : Math.floor(parsed / 1000);
}
const date = parseDateFns(value, format, new Date());
const ts = getUnixTime(date);
return Number.isFinite(ts) ? ts : Math.floor(Date.now() / 1000);
}
}
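Assuming an HTML source with id "example.com" is declared in `config/sources.json`, the crawler can be driven directly; a sketch:

```typescript
import { resolveCrawlerConfig } from "@/process/crawler";
import { HtmlCrawler } from "@/process/parsers/html";
import { resolveSourceConfig } from "@/utils";

// "example.com" is assumed to be a sourceKind "html" entry in sources.json.
const source = resolveSourceConfig("example.com");
const settings = resolveCrawlerConfig(source, {
  pageRange: "0:2", // only the first three listing pages
  sourceId: "example.com",
});

const crawler = new HtmlCrawler(settings);
console.log(crawler.buildPageUrl(1)); // pagination template resolved against sourceUrl
await crawler.fetch(); // without persistors, articles are enriched but not written to disk
```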
@@ -0,0 +1,239 @@
import { logger } from "@basango/logger";
import TurndownService from "turndown";
import { FetchCrawlerConfig } from "@/config";
import { BaseCrawler } from "@/process/parsers/base";
import { Persistor, persist } from "@/process/persistence";
import { DateRange, PageRange, WordPressSourceConfig } from "@/schema";
const md = new TurndownService({
bulletListMarker: "-",
headingStyle: "atx",
hr: "---",
});
interface WordPressPost {
link?: string;
slug?: string;
title?: { rendered?: string };
content?: { rendered?: string };
date?: string;
categories?: number[];
}
/**
* Crawler for WordPress sites using the REST API.
*/
export class WordPressCrawler extends BaseCrawler {
readonly source: WordPressSourceConfig;
private categoryMap: Map<number, string> = new Map();
private static readonly POST_QUERY =
"_fields=date,slug,link,title.rendered,content.rendered,categories&orderby=date&order=desc";
private static readonly CATEGORY_QUERY =
"_fields=id,slug,count&orderby=count&order=desc&per_page=100";
private static readonly TOTAL_PAGES_HEADER = "x-wp-totalpages";
private static readonly TOTAL_POSTS_HEADER = "x-wp-total";
constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
super(settings, options);
if (!settings.source || settings.source.sourceKind !== "wordpress") {
throw new Error("HtmlCrawler requires a source of kind 'wordpress'");
}
this.source = this.settings.source as WordPressSourceConfig;
}
/**
* Fetch and process WordPress posts.
*/
async fetch(): Promise<void> {
const pageRange = this.settings.pageRange ?? (await this.getPagination());
const dateRange = this.settings.dateRange;
let stop = false;
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
const endpoint = this.postsEndpoint(page);
try {
const response = await this.http.get(endpoint);
const data = (await response.json()) as unknown;
const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
if (!Array.isArray(data)) {
logger.warn({ page, type: typeof data }, "Unexpected WordPress payload type");
}
for (const entry of articles) {
const saved = await this.fetchOne(entry, dateRange);
if (saved === null) {
stop = true;
break;
}
}
} catch (error) {
logger.error({ error, page }, "> page %s => [failed]", page);
continue;
}
if (stop) break;
}
}
/**
* Fetch links from a WordPress posts endpoint.
* @param url - The posts endpoint URL
*/
async fetchLinks(url: string) {
const response = await this.http.get(url);
const data = (await response.json()) as unknown;
const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
if (!Array.isArray(data)) {
logger.warn({ type: typeof data }, "Unexpected WordPress payload type");
}
return articles;
}
/**
* Fetch and process a single WordPress post.
* @param input - Decoded JSON object or raw JSON string
* @param dateRange - Optional date range for filtering
*/
async fetchOne(input: unknown, dateRange?: DateRange | null) {
// input can be the decoded JSON object or a raw JSON string
let data: WordPressPost | null = null;
try {
if (typeof input === "string") {
data = JSON.parse(input) as WordPressPost;
} else if (input && typeof input === "object") {
data = input as WordPressPost;
}
} catch (error) {
logger.error({ error }, "Failed to decode WordPress payload");
throw error;
}
if (!data || typeof data !== "object") {
throw new Error("Unexpected WordPress payload type");
}
const link = data.link;
if (!link) {
logger.error("Skipping WordPress article without link");
return null;
}
const titleHtml = data.title?.rendered ?? "";
const bodyHtml = data.content?.rendered ?? "";
const title = this.textContent(this.parseHtml(titleHtml)) ?? data.slug ?? "Untitled";
const body = md.turndown(bodyHtml);
const timestamp = this.computeTimestamp(data.date);
const categories = await this.mapCategories(data.categories ?? []);
// date range skip as in the HTML crawler
if (dateRange && !isTimestampInRange(dateRange, timestamp)) {
logger.info(
{ date: data.date, link, timestamp, title },
"Skipping article outside date range",
);
return null;
}
const enriched = await this.enrichWithOpenGraph(
{
body,
categories,
link,
source: this.source.sourceId,
timestamp,
title,
},
link,
);
return await persist(enriched, this.persistors);
}
/**
* Get pagination info from WordPress API.
*/
async getPagination(): Promise<PageRange> {
try {
const url = `${this.baseUrl()}wp-json/wp/v2/posts?_fields=id&per_page=100`;
const response = await this.http.get(url);
const pages = Number.parseInt(
response.headers.get(WordPressCrawler.TOTAL_PAGES_HEADER) ?? "1",
10,
);
const posts = Number.parseInt(
response.headers.get(WordPressCrawler.TOTAL_POSTS_HEADER) ?? "0",
10,
);
logger.info({ pages, posts }, "WordPress pagination");
const end = Number.isFinite(pages) && pages > 0 ? pages : 1;
return { end, start: 1 };
} catch {
return { end: 1, start: 1 };
}
}
/**
* Get base URL for WordPress REST API.
*/
private baseUrl(): string {
const base = String(this.source.sourceUrl);
return base.endsWith("/") ? base : `${base}/`;
}
/**
* Construct posts endpoint URL for a given page.
* @param page - Page number
*/
postsEndpoint(page: number): string {
return `${this.baseUrl()}wp-json/wp/v2/posts?${WordPressCrawler.POST_QUERY}&page=${page}&per_page=100`;
}
/**
* Fetch and cache WordPress categories.
*/
private async fetchCategories(): Promise<void> {
const url = `${this.baseUrl()}wp-json/wp/v2/categories?${WordPressCrawler.CATEGORY_QUERY}`;
const response = await this.http.get(url);
const list = (await response.json()) as Array<{ id: number; slug: string }>;
for (const c of list) {
this.categoryMap.set(c.id, c.slug);
}
}
/**
* Map category IDs to slugs.
* @param ids - Category IDs
*/
private async mapCategories(ids: number[]): Promise<string[]> {
if (this.categoryMap.size === 0) {
try {
await this.fetchCategories();
} catch (error) {
logger.warn({ error }, "Failed to fetch WordPress categories");
}
}
const values: string[] = [];
for (const id of [...ids].sort((a, b) => a - b)) {
const slug = this.categoryMap.get(id);
if (slug && !values.includes(slug)) values.push(slug);
}
return values;
}
/**
* Compute UNIX timestamp from WordPress date string.
* @param raw - Raw date string
*/
private computeTimestamp(raw?: string | null): number {
if (!raw) return Math.floor(Date.now() / 1000);
// Normalize WordPress Z into +00:00 for Date parsing robustness
const cleaned = raw.replace("Z", "+00:00");
const parsed = Date.parse(cleaned);
if (!Number.isNaN(parsed)) return Math.floor(parsed / 1000);
return Math.floor(Date.now() / 1000);
}
}
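The crawler only touches standard WordPress REST routes; for a hypothetical source, the generated URLs and pagination look like this:

```typescript
import { resolveCrawlerConfig } from "@/process/crawler";
import { WordPressCrawler } from "@/process/parsers/wordpress";
import { resolveSourceConfig } from "@/utils";

// "wp-example.com" is assumed to be a sourceKind "wordpress" entry in sources.json.
const source = resolveSourceConfig("wp-example.com");
const settings = resolveCrawlerConfig(source, { sourceId: "wp-example.com" });
const crawler = new WordPressCrawler(settings);

// e.g. <sourceUrl>/wp-json/wp/v2/posts?_fields=...&orderby=date&order=desc&page=2&per_page=100
console.log(crawler.postsEndpoint(2));

// Start/end pages derived from the X-WP-TotalPages response header.
console.log(await crawler.getPagination());
```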
+102
View File
@@ -0,0 +1,102 @@
import fs from "node:fs";
import path from "node:path";
import logger from "@basango/logger";
import { Article } from "@/schema";
import { countTokens } from "@/utils";
export interface Persistor {
persist(record: Article): Promise<void> | void;
close: () => Promise<void> | void;
}
export interface PersistorOptions {
directory: string;
sourceId: string;
suffix?: string;
encoding?: BufferEncoding;
}
const sanitize = (text: string): string => {
if (!text) return text;
let s = text.replace(/\u00A0/g, " "); // replace NO-BREAK SPACE with a regular space
s = s.replace(/[\u202F\u2007]/g, " "); // replace NARROW NO-BREAK SPACE and other fixed-width NBSP variants
s = s.replace(/\u200B/g, ""); // remove ZERO WIDTH SPACE
s = s.replace(/\u200C/g, ""); // remove ZERO WIDTH NON-JOINER
s = s.replace(/\u200D/g, ""); // remove ZERO WIDTH JOINER
s = s.replace(/\uFEFF/g, ""); // remove ZERO WIDTH NO-BREAK SPACE
s = s.replace(/\r\n/g, "\n"); // normalize CRLF to LF
s = s.replace(/\n{2,}/g, "\n"); // collapse multiple newlines to one
// s = s.replace(/[ \t]{2,}/g, " "); // collapse multiple spaces/tabs
return s.trim();
};
export const persist = async (payload: Article, persistors: Persistor[]): Promise<Article> => {
const data = {
...payload,
body: sanitize(payload.body),
categories: payload.categories.map(sanitize),
title: sanitize(payload.title),
};
const article = {
...data,
tokenStatistics: {
body: countTokens(data.body),
categories: countTokens(data.categories.join(",")),
excerpt: countTokens(data.body.substring(0, 200)),
title: countTokens(data.title),
},
} as Article;
for (const persistor of persistors) {
try {
await persistor.persist(article);
} catch (error) {
logger.error({ error }, "Failed to persist article record");
}
}
logger.info({ url: article.link }, "article successfully persisted");
return article;
};
export class JsonlPersistor implements Persistor {
private readonly filePath: string;
private readonly encoding: BufferEncoding;
private pending: Promise<void> = Promise.resolve();
private closed = false;
constructor(options: PersistorOptions) {
const suffix = options.suffix ?? ".jsonl";
this.encoding = options.encoding ?? "utf-8";
fs.mkdirSync(options.directory, { recursive: true });
this.filePath = path.join(options.directory, `${options.sourceId}${suffix}`);
if (!fs.existsSync(this.filePath)) {
fs.writeFileSync(this.filePath, "", { encoding: this.encoding });
}
}
persist(record: Article): Promise<void> {
if (this.closed) {
return Promise.reject(new Error("Persistor has been closed"));
}
const payload = `${JSON.stringify(record)}\n`;
this.pending = this.pending.then(async () => {
fs.appendFileSync(this.filePath, payload, { encoding: this.encoding });
});
return this.pending;
}
async close(): Promise<void> {
this.closed = true;
await this.pending;
}
}
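A short sketch of the persistence pipeline in isolation; the article below is fabricated for illustration:

```typescript
import { JsonlPersistor, persist } from "@/process/persistence";

const persistor = new JsonlPersistor({
  directory: "/tmp/basango-data", // illustrative path
  sourceId: "example.com",
});

// Sanitizes text, attaches token statistics, then appends one JSONL line per persistor.
await persist(
  {
    body: "Hello\u00A0world",
    categories: ["news"],
    link: "https://example.com/articles/1",
    source: "example.com",
    timestamp: Math.floor(Date.now() / 1000),
    title: "Hello",
  },
  [persistor],
);

await persistor.close(); // flushes pending writes; further persist() calls reject
```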
+29
View File
@@ -0,0 +1,29 @@
import logger from "@basango/logger";
import {
CrawlingOptions,
closePersistors,
createPersistors,
resolveCrawlerConfig,
} from "@/process/crawler";
import { HtmlCrawler } from "@/process/parsers/html";
import { WordPressCrawler } from "@/process/parsers/wordpress";
import { resolveSourceConfig } from "@/utils";
export const runSyncCrawl = async (options: CrawlingOptions): Promise<void> => {
const source = resolveSourceConfig(options.sourceId);
const settings = resolveCrawlerConfig(source, options);
const persistors = createPersistors(source);
const crawler =
source.sourceKind === "wordpress"
? new WordPressCrawler(settings, { persistors })
: new HtmlCrawler(settings, { persistors });
try {
await crawler.fetch();
} finally {
await closePersistors(persistors);
}
logger.info({ ...options }, "Synchronous crawl completed");
};
+131
View File
@@ -0,0 +1,131 @@
import { z } from "zod";
export const UpdateDirectionSchema = z.enum(["forward", "backward"]);
export const SourceKindSchema = z.enum(["wordpress", "html"]);
export const DateRangeSchema = z
.object({
end: z.number().int(),
start: z.number().int(),
})
.superRefine((value, ctx) => {
if (value.start === 0 || value.end === 0) {
ctx.addIssue({
code: "custom",
message: "Timestamp cannot be zero",
});
}
if (value.end < value.start) {
ctx.addIssue({
code: "custom",
message: "End timestamp must be greater than or equal to start",
});
}
});
export const PageRangeSchema = z
.object({
end: z.number().int().min(0),
start: z.number().int().min(0),
})
.superRefine((value, ctx) => {
if (value.end < value.start) {
ctx.addIssue({
code: "custom",
message: "End page must be greater than or equal to start page",
});
}
});
export const PageRangeSpecSchema = z
.string()
.regex(/^[0-9]+:[0-9]+$/, "Invalid page range format. Use start:end")
.transform((spec) => {
const [startText, endText] = spec.split(":");
return {
end: Number.parseInt(String(endText), 10),
start: Number.parseInt(String(startText), 10),
};
});
export const DateRangeSpecSchema = z
.string()
.regex(/.+:.+/, "Expected start:end format")
.transform((spec) => {
const [startRaw, endRaw] = spec.split(":");
return { endRaw: String(endRaw), startRaw: String(startRaw) };
});
export const SourceDateSchema = z.object({
format: z.string().default("yyyy-LL-dd HH:mm"),
pattern: z.string().nullable().optional(),
replacement: z.string().nullable().optional(),
});
const BaseSourceSchema = z.object({
categories: z.array(z.string()).default([]),
requiresDetails: z.boolean().default(false),
requiresRateLimit: z.boolean().default(false),
sourceDate: SourceDateSchema,
sourceId: z.string(),
sourceKind: SourceKindSchema,
sourceUrl: z.url(),
supportsCategories: z.boolean().default(false),
});
export const HtmlSourceConfigSchema = BaseSourceSchema.extend({
paginationTemplate: z.string(),
sourceKind: z.literal("html"),
sourceSelectors: z.object({
articleBody: z.string(),
articleCategories: z.string().optional(),
articleDate: z.string(),
articleLink: z.string(),
articles: z.string(),
articleTitle: z.string(),
pagination: z.string().default("ul.pagination > li a"),
}),
});
export const WordPressSourceConfigSchema = BaseSourceSchema.extend({
sourceDate: SourceDateSchema.default(SourceDateSchema.parse({ format: "yyyy-LL-dd'T'HH:mm:ss" })),
sourceKind: z.literal("wordpress"),
});
export const ArticleMetadataSchema = z.object({
description: z.string().optional(),
image: z.string().optional(),
title: z.string().optional(),
url: z.url().optional(),
});
export const ArticleTokenStatisticsSchema = z.object({
body: z.number().int().nonnegative().default(0),
categories: z.number().int().nonnegative().default(0),
excerpt: z.number().int().nonnegative().default(0),
title: z.number().int().nonnegative().default(0),
});
export const ArticleSchema = z.object({
body: z.string(),
categories: z.array(z.string()).default([]),
link: z.url(),
metadata: ArticleMetadataSchema.optional(),
source: z.string(),
timestamp: z.number().int(),
title: z.string(),
tokenStatistics: ArticleTokenStatisticsSchema.optional(),
});
export type ArticleMetadata = z.infer<typeof ArticleMetadataSchema>;
export type Article = z.infer<typeof ArticleSchema>;
export type DateRange = z.infer<typeof DateRangeSchema>;
export type PageRange = z.infer<typeof PageRangeSchema>;
export type HtmlSourceConfig = z.infer<typeof HtmlSourceConfigSchema>;
export type WordPressSourceConfig = z.infer<typeof WordPressSourceConfigSchema>;
export type AnySourceConfig = HtmlSourceConfig | WordPressSourceConfig;
export interface CreateDateRangeOptions {
format?: string;
separator?: string;
}
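A sketch of validating a single sources.json entry against these schemas (selector values are illustrative):

```typescript
import { HtmlSourceConfigSchema } from "@/schema";

const source = HtmlSourceConfigSchema.parse({
  paginationTemplate: "/news?page={page}",
  sourceDate: {}, // format defaults to "yyyy-LL-dd HH:mm"
  sourceId: "example.com",
  sourceKind: "html",
  sourceSelectors: {
    articleBody: ".content",
    articleDate: ".publish-date",
    articleLink: "a.permalink",
    articles: ".article-list .article",
    articleTitle: "h2.title",
  },
  sourceUrl: "https://example.com",
});

console.log(source.requiresDetails); // false, filled in by the schema default
console.log(source.sourceSelectors.pagination); // "ul.pagination > li a"
```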
+22
View File
@@ -0,0 +1,22 @@
import { logger } from "@basango/logger";
import { runSyncCrawl } from "@/process/sync/tasks";
import { CRAWLING_USAGE, parseCrawlingCliArgs } from "@/scripts/utils";
const main = async (): Promise<void> => {
const options = parseCrawlingCliArgs();
if (options.sourceId === undefined) {
console.log(CRAWLING_USAGE);
process.exitCode = 1;
return;
}
try {
await runSyncCrawl({ ...options });
} catch (error) {
logger.error({ error }, "Synchronous crawl failed");
process.exitCode = 1;
}
};
void main();
+24
View File
@@ -0,0 +1,24 @@
import { logger } from "@basango/logger";
import { scheduleAsyncCrawl } from "@/process/async/tasks";
import { CRAWLING_USAGE, parseCrawlingCliArgs } from "@/scripts/utils";
const main = async (): Promise<void> => {
const options = parseCrawlingCliArgs();
if (options.sourceId === undefined) {
console.log(CRAWLING_USAGE);
process.exitCode = 1;
return;
}
try {
const id = await scheduleAsyncCrawl({ ...options });
logger.info({ id, options }, "Scheduled asynchronous crawl job");
} catch (error) {
logger.error({ error }, "Failed to schedule crawl job");
process.exitCode = 1;
}
};
void main();
+39
View File
@@ -0,0 +1,39 @@
import { parseArgs } from "node:util";
import { CrawlingOptions } from "@/process/crawler";
interface WorkerCliOptions {
queue?: string[];
}
export const CRAWLING_USAGE = `
Usage: bun run crawl:[async|sync] -- --sourceId <id> [options]
Options:
--pageRange <range> Optional page range filter (e.g. 1:5)
--dateRange <range> Optional date range filter (e.g. 2024-01-01:2024-01-31)
--category <slug> Optional category to crawl
-h, --help Show this message
`;
export const parseWorkerCliArgs = (): WorkerCliOptions => {
const { values } = parseArgs({
options: {
queue: { multiple: true, short: "q", type: "string" },
},
});
return values as WorkerCliOptions;
};
export const parseCrawlingCliArgs = (): CrawlingOptions => {
const { values } = parseArgs({
options: {
category: { type: "string" },
dateRange: { type: "string" },
pageRange: { type: "string" },
sourceId: { type: "string" },
},
});
return values as CrawlingOptions;
};
+35
View File
@@ -0,0 +1,35 @@
import { logger } from "@basango/logger";
import { createQueueManager } from "@/process/async/queue";
import { startWorker } from "@/process/async/worker";
import { parseWorkerCliArgs } from "@/scripts/utils";
const main = async (): Promise<void> => {
const options = parseWorkerCliArgs();
const manager = createQueueManager();
const queues = options.queue?.length
? options.queue.map((name) => manager.queueName(name))
: undefined;
const handle = startWorker({
queueManager: manager,
queueNames: queues,
});
const shutdown = async (signal: NodeJS.Signals) => {
logger.info({ signal }, "Received shutdown signal, draining workers");
try {
await handle.close();
} finally {
await manager.close();
process.exit(0);
}
};
process.once("SIGINT", (signal) => void shutdown(signal));
process.once("SIGTERM", (signal) => void shutdown(signal));
logger.info({ queueNames: queues }, "Crawler workers started");
};
void main();
+163
View File
@@ -0,0 +1,163 @@
import { format, getUnixTime, isMatch, parse } from "date-fns";
import type { RedisOptions } from "ioredis";
import { get_encoding, TiktokenEncoding } from "tiktoken";
import { config } from "@/config";
import { DEFAULT_DATE_FORMAT } from "@/constants";
import {
AnySourceConfig,
CreateDateRangeOptions,
DateRange,
DateRangeSchema,
DateRangeSpecSchema,
HtmlSourceConfig,
PageRange,
PageRangeSchema,
PageRangeSpecSchema,
WordPressSourceConfig,
} from "@/schema";
/**
* Resolve a source configuration by its ID.
* @param id - The source ID
*/
export const resolveSourceConfig = (id: string): AnySourceConfig => {
const source =
config.sources.html.find((s: HtmlSourceConfig) => s.sourceId === id) ||
config.sources.wordpress.find((s: WordPressSourceConfig) => s.sourceId === id);
if (source === undefined) {
throw new Error(`Source '${id}' not found in configuration`);
}
return source;
};
/**
* Parse a Redis URL into RedisOptions.
* @param url - The Redis URL (e.g., "redis://:password@localhost:6379/0")
*/
export const parseRedisUrl = (url: string): RedisOptions => {
if (!url.startsWith("redis://")) {
return {};
}
const parsed = new URL(url);
return {
db: Number(parsed.pathname?.replace("/", "") || 0),
host: parsed.hostname,
password: parsed.password || undefined,
port: Number(parsed.port || 6379),
};
};
/**
* Parse a date string using the specified format.
* @param value - The date string to parse
* @param format - The date format
*/
const parseDate = (value: string, format: string): Date => {
if (!isMatch(value, format)) {
throw new Error(`Invalid date '${value}' for format '${format}'`);
}
const parsed = parse(value, format, new Date());
if (Number.isNaN(parsed.getTime())) {
throw new Error(`Invalid date '${value}' for format '${format}'`);
}
return parsed;
};
/**
* Count the number of tokens in the given text using the specified encoding.
* @param text - The input text
* @param encoding - The token encoding (default: "cl100k_base")
*/
export const countTokens = (text: string, encoding: TiktokenEncoding = "cl100k_base"): number => {
try {
const encoder = get_encoding(encoding);
const tokens = encoder.encode(text);
encoder.free();
return tokens.length;
} catch {
return text.length;
}
};
/**
* Create a page range from a string specification.
* @param spec - The page range specification (e.g., "1:10")
*/
export const createPageRange = (spec: string | undefined): PageRange | undefined => {
if (!spec) return undefined;
const parsed = PageRangeSpecSchema.parse(spec);
return PageRangeSchema.parse(parsed);
};
/**
* Create a date range from a string specification.
* @param spec - The date range specification (e.g., "2023-01-01:2023-12-31")
* @param options - Options for date range creation
*/
export const createDateRange = (
spec: string | undefined,
options: CreateDateRangeOptions = {},
): DateRange | undefined => {
if (!spec) return undefined;
const { format = DEFAULT_DATE_FORMAT, separator = ":" } = options;
if (!separator) {
throw new Error("Separator cannot be empty");
}
const normalized = spec.replace(separator, ":");
const parsedSpec = DateRangeSpecSchema.parse(normalized);
const startDate = parseDate(parsedSpec.startRaw, format);
const endDate = parseDate(parsedSpec.endRaw, format);
const range = {
end: getUnixTime(endDate),
start: getUnixTime(startDate),
};
return DateRangeSchema.parse(range);
};
/**
* Format a date range into a string representation.
* @param range - The date range
* @param fmt - The date format (default: DEFAULT_DATE_FORMAT)
*/
export const formatDateRange = (range: DateRange, fmt = DEFAULT_DATE_FORMAT): string => {
const start = format(new Date(range.start * 1000), fmt);
const end = format(new Date(range.end * 1000), fmt);
return `${start}:${end}`;
};
/**
* Format a page range into a string representation.
* @param range - The page range
*/
export const formatPageRange = (range: PageRange): string => {
return `${range.start}:${range.end}`;
};
/**
* Check if a timestamp is within a given date range.
* @param range - The date range
* @param timestamp - The timestamp to check
*/
export const isTimestampInRange = (range: DateRange, timestamp: number): boolean => {
return range.start <= timestamp && timestamp <= range.end;
};
/**
* Convert a relative URL to an absolute URL based on the base URL.
* @param base - The base URL
* @param href - The relative or absolute URL
*/
export const createAbsoluteUrl = (base: string, href: string): string => {
try {
// new URL handles relative paths with base
return new URL(href, base.endsWith("/") ? base : `${base}/`).toString();
} catch {
return href;
}
};
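The range helpers round-trip with their formatters; a few illustrative calls:

```typescript
import { createDateRange, createPageRange, formatPageRange, parseRedisUrl } from "@/utils";

console.log(createPageRange("1:10")); // { end: 10, start: 1 }
console.log(formatPageRange({ end: 10, start: 1 })); // "1:10"

// Timestamps are Unix seconds; the format falls back to DEFAULT_DATE_FORMAT when omitted.
console.log(createDateRange("2024-01-01:2024-01-31", { format: "yyyy-MM-dd" }));

console.log(parseRedisUrl("redis://:secret@localhost:6379/2"));
// { db: 2, host: "localhost", password: "secret", port: 6379 }
```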
+11
View File
@@ -0,0 +1,11 @@
{
"compilerOptions": {
"baseUrl": ".",
"paths": {
"@/*": ["./src/*"]
}
},
"extends": "@basango/typescript-config/base.json",
"include": ["src"],
"references": []
}
+17
View File
@@ -0,0 +1,17 @@
import path from "node:path";
import { defineConfig } from "vitest/config";
export default defineConfig({
resolve: {
alias: {
"@": path.resolve(__dirname, "src"),
},
},
test: {
environment: "node",
globals: true,
include: ["src/**/*.test.ts"],
setupFiles: ["./vitest.setup.ts"],
},
});
+1
View File
@@ -0,0 +1 @@
process.env.NODE_ENV = process.env.NODE_ENV ?? "test";