[crawler] configuration based api
This commit is contained in:
+3
-2
@@ -31,8 +31,9 @@ yarn-error.log*
|
|||||||
.pnpm-debug.log*
|
.pnpm-debug.log*
|
||||||
|
|
||||||
# local env files
|
# local env files
|
||||||
.env
|
.env.local
|
||||||
.env*.local
|
.env.*.local
|
||||||
|
.env.*.local.*
|
||||||
|
|
||||||
# vercel
|
# vercel
|
||||||
.vercel
|
.vercel
|
||||||
|
|||||||
@@ -0,0 +1,20 @@
|
|||||||
|
# paths
|
||||||
|
BASANGO_CRAWLER_ROOT_PATH=
|
||||||
|
BASANGO_CRAWLER_DATA_PATH=
|
||||||
|
BASANGO_CRAWLER_LOGS_PATH=
|
||||||
|
BASANGO_CRAWLER_CONFIG_PATH=
|
||||||
|
|
||||||
|
# crawler settings
|
||||||
|
BASANGO_CRAWLER_DEFAULT_DIRECTION=forward
|
||||||
|
BASANGO_CRAWLER_FETCH_USER_AGENT="Basango/0.1 (+https://github.com/bernard-ng/basango)"
|
||||||
|
BASANGO_CRAWLER_FETCH_MAX_RETRIES=3
|
||||||
|
BASANGO_CRAWLER_FETCH_RESPECT_RETRY_AFTER=true
|
||||||
|
|
||||||
|
BASANGO_CRAWLER_ASYNC_REDIS_URL="redis://localhost:6379/0"
|
||||||
|
BASANGO_CRAWLER_ASYNC_TTL_RESULT=3600
|
||||||
|
BASANGO_CRAWLER_ASYNC_TTL_FAILURE=3600
|
||||||
|
BASANGO_CRAWLER_ASYNC_QUEUE_LISTING="listing"
|
||||||
|
BASANGO_CRAWLER_ASYNC_QUEUE_DETAILS="details"
|
||||||
|
BASANGO_CRAWLER_ASYNC_QUEUE_PROCESSING="processing"
|
||||||
|
|
||||||
|
BASANGO_CRAWLER_BACKEND_API_ENDPOINT="http://localhost:3000/api/aggregator/articles?token=dev"
|
||||||
@@ -1,195 +1,42 @@
|
|||||||
{
|
{
|
||||||
|
"paths": {
|
||||||
|
"root": "%env(BASANGO_CRAWLER_ROOT_PATH)%",
|
||||||
|
"data": "%env(BASANGO_CRAWLER_DATA_PATH)%",
|
||||||
|
"logs": "%env(BASANGO_CRAWLER_LOGS_PATH)%",
|
||||||
|
"config": "%env(BASANGO_CRAWLER_CONFIG_PATH)%"
|
||||||
|
},
|
||||||
"fetch": {
|
"fetch": {
|
||||||
"client": {
|
"client": {
|
||||||
"timeout": 20,
|
"timeout": 20,
|
||||||
"user_agent": "Basango/0.1 (+https://github.com/bernard-ng/basango)",
|
"userAgent": "%env(BASANGO_CRAWLER_FETCH_USER_AGENT)%",
|
||||||
"follow_redirects": true,
|
"followRedirects": true,
|
||||||
"verify_ssl": true,
|
"verifySsl": true,
|
||||||
"rotate": true,
|
"rotate": true,
|
||||||
"max_retries": 3,
|
"maxRetries": "%env(BASANGO_CRAWLER_FETCH_MAX_RETRIES)%",
|
||||||
"backoff_initial": 1,
|
"backoffInitial": 1,
|
||||||
"backoff_multiplier": 2,
|
"backoffMultiplier": 2,
|
||||||
"backoff_max": 30,
|
"backoffMax": 30,
|
||||||
"respect_retry_after": true
|
"respectRetryAfter": "%env(BASANGO_CRAWLER_FETCH_RESPECT_RETRY_AFTER)%"
|
||||||
},
|
},
|
||||||
"crawler": {
|
"crawler": {
|
||||||
"notify": false,
|
"notify": false,
|
||||||
"use_multi_threading": false,
|
"use_multi_threading": false,
|
||||||
"max_workers": 5
|
"maxWorkers": 5,
|
||||||
}
|
"direction": "%env(BASANGO_CRAWLER_DEFAULT_DIRECTION)%"
|
||||||
},
|
},
|
||||||
"logging": {
|
"async": {
|
||||||
"level": "INFO",
|
"redisUrl": "%env(BASANGO_CRAWLER_ASYNC_REDIS_URL)%",
|
||||||
"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
"prefix": "basango:crawler",
|
||||||
"file_logging": true,
|
"ttl": {
|
||||||
"console_logging": true,
|
"default": 600,
|
||||||
"log_file": "pipeline.log",
|
"result": "%env(BASANGO_CRAWLER_ASYNC_TTL_RESULT)%",
|
||||||
"max_log_size": 10485760,
|
"failure": "%env(BASANGO_CRAWLER_ASYNC_TTL_FAILURE)%"
|
||||||
"backup_count": 5
|
|
||||||
},
|
|
||||||
"sources": {
|
|
||||||
"html": [
|
|
||||||
{
|
|
||||||
"source_id": "radiookapi.net",
|
|
||||||
"source_url": "https://www.radiookapi.net",
|
|
||||||
"source_date": {
|
|
||||||
"pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
|
|
||||||
"replacement": "$3-$2-$1 $4"
|
|
||||||
},
|
|
||||||
"source_selectors": {
|
|
||||||
"articles": ".view-content > .views-row.content-row",
|
|
||||||
"article_title": "h1.page-header",
|
|
||||||
"article_link": ".views-field-title a",
|
|
||||||
"article_body": ".field-name-body",
|
|
||||||
"article_date": ".views-field-created",
|
|
||||||
"article_categories": ".views-field-field-cat-gorie a",
|
|
||||||
"pagination": "ul.pagination > li.pager-last > a"
|
|
||||||
},
|
|
||||||
"pagination_template": "actualite",
|
|
||||||
"supports_categories": false,
|
|
||||||
"requires_details": true,
|
|
||||||
"requires_rate_limit": false
|
|
||||||
},
|
},
|
||||||
{
|
"queues": {
|
||||||
"source_id": "7sur7.cd",
|
"listing": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_LISTING)%",
|
||||||
"source_url": "https://7sur7.cd",
|
"details": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_DETAILS)%",
|
||||||
"source_date": {
|
"processing": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_PROCESSING)%"
|
||||||
"pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
|
|
||||||
"replacement": "$3-$2-$1 $4"
|
|
||||||
},
|
|
||||||
"categories": ["politique", "economie", "culture", "sport", "societe"],
|
|
||||||
"source_selectors": {
|
|
||||||
"articles": ".view-content > .row.views-row",
|
|
||||||
"article_title": ".views-field-title a",
|
|
||||||
"article_link": ".views-field-title a",
|
|
||||||
"article_body": ".field.field--name-body",
|
|
||||||
"article_date": ".views-field-created",
|
|
||||||
"pagination": "ul.pagination > li.pager__item.pager__item--last > a"
|
|
||||||
},
|
|
||||||
"pagination_template": "index.php/category/{category}",
|
|
||||||
"supports_categories": true,
|
|
||||||
"requires_details": false,
|
|
||||||
"requires_rate_limit": false
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source_id": "mediacongo.net",
|
|
||||||
"source_url": "https://www.mediacongo.net",
|
|
||||||
"source_date": {
|
|
||||||
"format": "%d.%m.%Y %H:%M"
|
|
||||||
},
|
|
||||||
"source_selectors": {
|
|
||||||
"articles": ".for_aitems > .article_other_item",
|
|
||||||
"article_title": "img",
|
|
||||||
"article_link": "a:first-child",
|
|
||||||
"article_categories": "a.color_link",
|
|
||||||
"article_body": ".article_ttext",
|
|
||||||
"article_date": ".article_other_about",
|
|
||||||
"pagination": "div.pagination > div > a:last-child"
|
|
||||||
},
|
|
||||||
"pagination_template": "articles.html",
|
|
||||||
"supports_categories": false,
|
|
||||||
"requires_details": true,
|
|
||||||
"requires_rate_limit": false
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source_id": "actualite.cd",
|
|
||||||
"source_url": "https://actualite.cd",
|
|
||||||
"source_date": {
|
|
||||||
"pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/",
|
|
||||||
"replacement": "$4-$3-$2 $5"
|
|
||||||
},
|
|
||||||
"source_selectors": {
|
|
||||||
"articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
|
|
||||||
"article_title": "#actu-titre a",
|
|
||||||
"article_link": "#actu-titre a",
|
|
||||||
"article_categories": "#actu-cat a",
|
|
||||||
"article_body": ".views-field.views-field-body",
|
|
||||||
"article_date": "#p-date"
|
|
||||||
},
|
|
||||||
"pagination_template": "actualite",
|
|
||||||
"supports_categories": false,
|
|
||||||
"requires_details": true,
|
|
||||||
"requires_rate_limit": false
|
|
||||||
}
|
}
|
||||||
],
|
}
|
||||||
"wordpress": [
|
|
||||||
{
|
|
||||||
"source_id": "beto.cd",
|
|
||||||
"source_url": "https://beto.cd",
|
|
||||||
"requires_rate_limit": true
|
|
||||||
},
|
|
||||||
{ "source_id": "newscd.net", "source_url": "https://newscd.net" },
|
|
||||||
{
|
|
||||||
"source_id": "africanewsrdc.net",
|
|
||||||
"source_url": "https://www.africanewsrdc.net"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source_id": "angazainstitute.ac.cd",
|
|
||||||
"source_url": "https://angazainstitute.ac.cd"
|
|
||||||
},
|
|
||||||
{ "source_id": "b-onetv.cd", "source_url": "https://b-onetv.cd" },
|
|
||||||
{ "source_id": "bukavufm.com", "source_url": "https://bukavufm.com" },
|
|
||||||
{
|
|
||||||
"source_id": "changement7.net",
|
|
||||||
"source_url": "https://changement7.net"
|
|
||||||
},
|
|
||||||
{ "source_id": "congoactu.net", "source_url": "https://congoactu.net" },
|
|
||||||
{
|
|
||||||
"source_id": "congoindependant.com",
|
|
||||||
"source_url": "https://www.congoindependant.com"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source_id": "congoquotidien.com",
|
|
||||||
"source_url": "https://www.congoquotidien.com"
|
|
||||||
},
|
|
||||||
{ "source_id": "cumulard.cd", "source_url": "https://www.cumulard.cd" },
|
|
||||||
{
|
|
||||||
"source_id": "environews-rdc.net",
|
|
||||||
"source_url": "https://environews-rdc.net"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source_id": "freemediardc.info",
|
|
||||||
"source_url": "https://www.freemediardc.info"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source_id": "geopolismagazine.org",
|
|
||||||
"source_url": "https://geopolismagazine.org"
|
|
||||||
},
|
|
||||||
{ "source_id": "habarirdc.net", "source_url": "https://habarirdc.net" },
|
|
||||||
{ "source_id": "infordc.com", "source_url": "https://infordc.com" },
|
|
||||||
{
|
|
||||||
"source_id": "kilalopress.net",
|
|
||||||
"source_url": "https://kilalopress.net"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source_id": "laprosperiteonline.net",
|
|
||||||
"source_url": "https://laprosperiteonline.net"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source_id": "laprunellerdc.cd",
|
|
||||||
"source_url": "https://laprunellerdc.cd"
|
|
||||||
},
|
|
||||||
{ "source_id": "lesmedias.net", "source_url": "https://lesmedias.net" },
|
|
||||||
{
|
|
||||||
"source_id": "lesvolcansnews.net",
|
|
||||||
"source_url": "https://lesvolcansnews.net"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source_id": "netic-news.net",
|
|
||||||
"source_url": "https://www.netic-news.net"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source_id": "objectif-infos.cd",
|
|
||||||
"source_url": "https://objectif-infos.cd"
|
|
||||||
},
|
|
||||||
{ "source_id": "scooprdc.net", "source_url": "https://scooprdc.net" },
|
|
||||||
{
|
|
||||||
"source_id": "journaldekinshasa.com",
|
|
||||||
"source_url": "https://www.journaldekinshasa.com"
|
|
||||||
},
|
|
||||||
{ "source_id": "lepotentiel.cd", "source_url": "https://lepotentiel.cd" },
|
|
||||||
{ "source_id": "acturdc.com", "source_url": "https://acturdc.com" },
|
|
||||||
{ "source_id": "matininfos.net", "source_url": "https://matininfos.net" }
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
+43
-47
@@ -1,32 +1,4 @@
|
|||||||
{
|
{
|
||||||
"fetch": {
|
|
||||||
"client": {
|
|
||||||
"timeout": 20,
|
|
||||||
"user_agent": "Basango/0.1 (+https://github.com/bernard-ng/basango)",
|
|
||||||
"follow_redirects": true,
|
|
||||||
"verify_ssl": true,
|
|
||||||
"rotate": true,
|
|
||||||
"max_retries": 3,
|
|
||||||
"backoff_initial": 1,
|
|
||||||
"backoff_multiplier": 2,
|
|
||||||
"backoff_max": 30,
|
|
||||||
"respect_retry_after": true
|
|
||||||
},
|
|
||||||
"crawler": {
|
|
||||||
"notify": false,
|
|
||||||
"use_multi_threading": false,
|
|
||||||
"max_workers": 5
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"logging": {
|
|
||||||
"level": "ERROR",
|
|
||||||
"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
|
||||||
"file_logging": true,
|
|
||||||
"console_logging": true,
|
|
||||||
"log_file": "pipeline.log",
|
|
||||||
"max_log_size": 10485760,
|
|
||||||
"backup_count": 5
|
|
||||||
},
|
|
||||||
"sources": {
|
"sources": {
|
||||||
"html": [
|
"html": [
|
||||||
{
|
{
|
||||||
@@ -38,16 +10,16 @@
|
|||||||
},
|
},
|
||||||
"source_selectors": {
|
"source_selectors": {
|
||||||
"articles": ".view-content > .views-row.content-row",
|
"articles": ".view-content > .views-row.content-row",
|
||||||
"article_title": ".views-field-title a",
|
"article_title": "h1.page-header",
|
||||||
"article_link": ".views-field-title a",
|
"article_link": ".views-field-title a",
|
||||||
"article_body": ".field-name-body",
|
"article_body": ".field-name-body",
|
||||||
"article_date": ".views-field-created",
|
"article_date": ".views-field-created",
|
||||||
"article_categories": ".views-field-field-cat-gorie a",
|
"article_categories": ".views-field-field-cat-gorie a",
|
||||||
"pagination": "ul.pagination > li a(:last-child)"
|
"pagination": "ul.pagination > li.pager-last > a"
|
||||||
},
|
},
|
||||||
"pagination_template": "/actualite?page={page}",
|
"pagination_template": "actualite",
|
||||||
"supports_categories": false,
|
"supports_categories": false,
|
||||||
"requires_details": false,
|
"requires_details": true,
|
||||||
"requires_rate_limit": false
|
"requires_rate_limit": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -64,29 +36,29 @@
|
|||||||
"article_link": ".views-field-title a",
|
"article_link": ".views-field-title a",
|
||||||
"article_body": ".field.field--name-body",
|
"article_body": ".field.field--name-body",
|
||||||
"article_date": ".views-field-created",
|
"article_date": ".views-field-created",
|
||||||
"pagination": "ul.pagination > li a(:last-child)"
|
"pagination": "ul.pagination > li.pager__item.pager__item--last > a"
|
||||||
},
|
},
|
||||||
"pagination_template": "/index.php/category/{category}?page={page}",
|
"pagination_template": "index.php/category/{category}",
|
||||||
"supports_categories": true,
|
"supports_categories": true,
|
||||||
"requires_details": false,
|
"requires_details": false,
|
||||||
"requires_rate_limit": false
|
"requires_rate_limit": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "mediacongo.net",
|
"source_id": "mediacongo.net",
|
||||||
"source_url": "https://mediacongo.net",
|
"source_url": "https://www.mediacongo.net",
|
||||||
"source_date": {
|
"source_date": {
|
||||||
"format": "%d.%m.%Y %H:%M"
|
"format": "%d.%m.%Y %H:%M"
|
||||||
},
|
},
|
||||||
"source_selectors": {
|
"source_selectors": {
|
||||||
"articles": ".for_aitems > .article_other_item",
|
"articles": ".for_aitems > .article_other_item",
|
||||||
"article_title": "img",
|
"article_title": "img",
|
||||||
"article_link": "a(:first-child)",
|
"article_link": "a:first-child",
|
||||||
"article_categories": "a.color_link",
|
"article_categories": "a.color_link",
|
||||||
"article_body": ".article_ttext",
|
"article_body": ".article_ttext",
|
||||||
"article_date": ".article_other_about",
|
"article_date": ".article_other_about",
|
||||||
"pagination": ".nav > a(:last-child)"
|
"pagination": "div.pagination > div > a:last-child"
|
||||||
},
|
},
|
||||||
"pagination_template": "/articles.html?page={page}",
|
"pagination_template": "articles.html",
|
||||||
"supports_categories": false,
|
"supports_categories": false,
|
||||||
"requires_details": true,
|
"requires_details": true,
|
||||||
"requires_rate_limit": false
|
"requires_rate_limit": false
|
||||||
@@ -106,7 +78,7 @@
|
|||||||
"article_body": ".views-field.views-field-body",
|
"article_body": ".views-field.views-field-body",
|
||||||
"article_date": "#p-date"
|
"article_date": "#p-date"
|
||||||
},
|
},
|
||||||
"pagination_template": "/actualite?page={page}",
|
"pagination_template": "actualite",
|
||||||
"supports_categories": false,
|
"supports_categories": false,
|
||||||
"requires_details": true,
|
"requires_details": true,
|
||||||
"requires_rate_limit": false
|
"requires_rate_limit": false
|
||||||
@@ -128,12 +100,18 @@
|
|||||||
"source_url": "https://angazainstitute.ac.cd"
|
"source_url": "https://angazainstitute.ac.cd"
|
||||||
},
|
},
|
||||||
{ "source_id": "b-onetv.cd", "source_url": "https://b-onetv.cd" },
|
{ "source_id": "b-onetv.cd", "source_url": "https://b-onetv.cd" },
|
||||||
{ "source_id": "bukavufm.com", "source_url": "https://bukavufm.com" },
|
{
|
||||||
|
"source_id": "bukavufm.com",
|
||||||
|
"source_url": "https://bukavufm.com"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"source_id": "changement7.net",
|
"source_id": "changement7.net",
|
||||||
"source_url": "https://changement7.net"
|
"source_url": "https://changement7.net"
|
||||||
},
|
},
|
||||||
{ "source_id": "congoactu.net", "source_url": "https://congoactu.net" },
|
{
|
||||||
|
"source_id": "congoactu.net",
|
||||||
|
"source_url": "https://congoactu.net"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"source_id": "congoindependant.com",
|
"source_id": "congoindependant.com",
|
||||||
"source_url": "https://www.congoindependant.com"
|
"source_url": "https://www.congoindependant.com"
|
||||||
@@ -142,7 +120,10 @@
|
|||||||
"source_id": "congoquotidien.com",
|
"source_id": "congoquotidien.com",
|
||||||
"source_url": "https://www.congoquotidien.com"
|
"source_url": "https://www.congoquotidien.com"
|
||||||
},
|
},
|
||||||
{ "source_id": "cumulard.cd", "source_url": "https://www.cumulard.cd" },
|
{
|
||||||
|
"source_id": "cumulard.cd",
|
||||||
|
"source_url": "https://www.cumulard.cd"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"source_id": "environews-rdc.net",
|
"source_id": "environews-rdc.net",
|
||||||
"source_url": "https://environews-rdc.net"
|
"source_url": "https://environews-rdc.net"
|
||||||
@@ -155,7 +136,10 @@
|
|||||||
"source_id": "geopolismagazine.org",
|
"source_id": "geopolismagazine.org",
|
||||||
"source_url": "https://geopolismagazine.org"
|
"source_url": "https://geopolismagazine.org"
|
||||||
},
|
},
|
||||||
{ "source_id": "habarirdc.net", "source_url": "https://habarirdc.net" },
|
{
|
||||||
|
"source_id": "habarirdc.net",
|
||||||
|
"source_url": "https://habarirdc.net"
|
||||||
|
},
|
||||||
{ "source_id": "infordc.com", "source_url": "https://infordc.com" },
|
{ "source_id": "infordc.com", "source_url": "https://infordc.com" },
|
||||||
{
|
{
|
||||||
"source_id": "kilalopress.net",
|
"source_id": "kilalopress.net",
|
||||||
@@ -169,7 +153,10 @@
|
|||||||
"source_id": "laprunellerdc.cd",
|
"source_id": "laprunellerdc.cd",
|
||||||
"source_url": "https://laprunellerdc.cd"
|
"source_url": "https://laprunellerdc.cd"
|
||||||
},
|
},
|
||||||
{ "source_id": "lesmedias.net", "source_url": "https://lesmedias.net" },
|
{
|
||||||
|
"source_id": "lesmedias.net",
|
||||||
|
"source_url": "https://lesmedias.net"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"source_id": "lesvolcansnews.net",
|
"source_id": "lesvolcansnews.net",
|
||||||
"source_url": "https://lesvolcansnews.net"
|
"source_url": "https://lesvolcansnews.net"
|
||||||
@@ -182,14 +169,23 @@
|
|||||||
"source_id": "objectif-infos.cd",
|
"source_id": "objectif-infos.cd",
|
||||||
"source_url": "https://objectif-infos.cd"
|
"source_url": "https://objectif-infos.cd"
|
||||||
},
|
},
|
||||||
{ "source_id": "scooprdc.net", "source_url": "https://scooprdc.net" },
|
{
|
||||||
|
"source_id": "scooprdc.net",
|
||||||
|
"source_url": "https://scooprdc.net"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"source_id": "journaldekinshasa.com",
|
"source_id": "journaldekinshasa.com",
|
||||||
"source_url": "https://www.journaldekinshasa.com"
|
"source_url": "https://www.journaldekinshasa.com"
|
||||||
},
|
},
|
||||||
{ "source_id": "lepotentiel.cd", "source_url": "https://lepotentiel.cd" },
|
{
|
||||||
|
"source_id": "lepotentiel.cd",
|
||||||
|
"source_url": "https://lepotentiel.cd"
|
||||||
|
},
|
||||||
{ "source_id": "acturdc.com", "source_url": "https://acturdc.com" },
|
{ "source_id": "acturdc.com", "source_url": "https://acturdc.com" },
|
||||||
{ "source_id": "matininfos.net", "source_url": "https://matininfos.net" }
|
{
|
||||||
|
"source_id": "matininfos.net",
|
||||||
|
"source_url": "https://matininfos.net"
|
||||||
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -3,20 +3,25 @@
|
|||||||
"private": true,
|
"private": true,
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
"=========== CODE STYLE ============": "",
|
||||||
"test": "vitest --run",
|
"test": "vitest --run",
|
||||||
"lint": "biome check .",
|
"lint": "biome check .",
|
||||||
"lint:fix": "biome check --write .",
|
"lint:fix": "biome check --write .",
|
||||||
"format": "biome format --write .",
|
"format": "biome format --write .",
|
||||||
"queue": "bun run src/scripts/queue.ts",
|
"============= CLI =============": "",
|
||||||
"worker": "bun run src/scripts/worker.ts"
|
"crawl:sync": "bun run src/scripts/crawl.ts",
|
||||||
|
"crawl:async": "bun run src/scripts/queue.ts",
|
||||||
|
"crawl:worker": "bun run src/scripts/worker.ts"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@basango/logger": "workspace:*",
|
"@basango/logger": "workspace:*",
|
||||||
|
"@devscast/config": "^1.0.2",
|
||||||
"bullmq": "^4.17.0",
|
"bullmq": "^4.17.0",
|
||||||
"date-fns": "catalog:",
|
"date-fns": "catalog:",
|
||||||
"ioredis": "^5.3.2",
|
"ioredis": "^5.3.2",
|
||||||
"node-html-parser": "^6.1.10",
|
"node-html-parser": "^7.0.1",
|
||||||
"tiktoken": "^1.0.14",
|
"tiktoken": "^1.0.14",
|
||||||
|
"turndown": "^7.2.2",
|
||||||
"zod": "catalog:"
|
"zod": "catalog:"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,81 +0,0 @@
|
|||||||
import fs from "node:fs";
|
|
||||||
import os from "node:os";
|
|
||||||
import path from "node:path";
|
|
||||||
|
|
||||||
import { describe, expect, it } from "vitest";
|
|
||||||
import { loadConfig } from "@/config";
|
|
||||||
|
|
||||||
import { resolveConfigPath } from "@/utils";
|
|
||||||
|
|
||||||
describe("loadConfig", () => {
|
|
||||||
it("parses json configuration and ensures directories", () => {
|
|
||||||
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "crawler-config-"));
|
|
||||||
const paths = {
|
|
||||||
root: tempDir,
|
|
||||||
data: path.join(tempDir, "data"),
|
|
||||||
logs: path.join(tempDir, "logs"),
|
|
||||||
configs: path.join(tempDir, "configs"),
|
|
||||||
};
|
|
||||||
|
|
||||||
const configPath = path.join(tempDir, "pipeline.json");
|
|
||||||
fs.writeFileSync(
|
|
||||||
configPath,
|
|
||||||
JSON.stringify(
|
|
||||||
{
|
|
||||||
paths,
|
|
||||||
fetch: {
|
|
||||||
client: { timeout: 10 },
|
|
||||||
},
|
|
||||||
},
|
|
||||||
null,
|
|
||||||
2,
|
|
||||||
),
|
|
||||||
);
|
|
||||||
|
|
||||||
const config = loadConfig({ path: configPath });
|
|
||||||
|
|
||||||
expect(config.fetch.client.timeout).toBe(10);
|
|
||||||
expect(fs.existsSync(paths.data)).toBe(true);
|
|
||||||
expect(fs.existsSync(paths.logs)).toBe(true);
|
|
||||||
expect(fs.existsSync(paths.configs)).toBe(true);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("merges environment override if available", () => {
|
|
||||||
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "crawler-config-"));
|
|
||||||
const paths = {
|
|
||||||
root: tempDir,
|
|
||||||
data: path.join(tempDir, "data"),
|
|
||||||
logs: path.join(tempDir, "logs"),
|
|
||||||
configs: path.join(tempDir, "configs"),
|
|
||||||
};
|
|
||||||
|
|
||||||
const basePath = path.join(tempDir, "pipeline.json");
|
|
||||||
fs.writeFileSync(
|
|
||||||
basePath,
|
|
||||||
JSON.stringify(
|
|
||||||
{
|
|
||||||
paths,
|
|
||||||
logging: { level: "INFO" },
|
|
||||||
},
|
|
||||||
null,
|
|
||||||
2,
|
|
||||||
),
|
|
||||||
);
|
|
||||||
|
|
||||||
const overridePath = resolveConfigPath(basePath, "production");
|
|
||||||
fs.writeFileSync(
|
|
||||||
overridePath,
|
|
||||||
JSON.stringify(
|
|
||||||
{
|
|
||||||
logging: { level: "DEBUG" },
|
|
||||||
},
|
|
||||||
null,
|
|
||||||
2,
|
|
||||||
),
|
|
||||||
);
|
|
||||||
|
|
||||||
const config = loadConfig({ path: basePath, env: "production" });
|
|
||||||
|
|
||||||
expect(config.logging.level).toBe("DEBUG");
|
|
||||||
});
|
|
||||||
});
|
|
||||||
@@ -1,91 +0,0 @@
|
|||||||
import { describe, expect, it, beforeEach, vi } from "vitest";
|
|
||||||
|
|
||||||
import { PipelineConfigManager } from "@/config";
|
|
||||||
import { registerCrawler, clearCrawlerRegistry, runSyncCrawl } from "@/process/crawler";
|
|
||||||
import { PipelineConfigSchema, SourceKindSchema } from "@/schema";
|
|
||||||
|
|
||||||
const createPipeline = () =>
|
|
||||||
PipelineConfigSchema.parse({
|
|
||||||
paths: {
|
|
||||||
root: ".",
|
|
||||||
data: ".",
|
|
||||||
logs: ".",
|
|
||||||
configs: ".",
|
|
||||||
},
|
|
||||||
sources: {
|
|
||||||
html: [
|
|
||||||
{
|
|
||||||
source_id: "demo",
|
|
||||||
source_url: "https://example.com",
|
|
||||||
source_kind: SourceKindSchema.enum.html,
|
|
||||||
pagination_template: "/page/{page}",
|
|
||||||
},
|
|
||||||
],
|
|
||||||
wordpress: [],
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
describe("runSyncCrawl", () => {
|
|
||||||
beforeEach(() => {
|
|
||||||
clearCrawlerRegistry();
|
|
||||||
});
|
|
||||||
|
|
||||||
it("invokes registered crawler factory", async () => {
|
|
||||||
const pipeline = createPipeline();
|
|
||||||
const fetch = vi.fn().mockResolvedValue(undefined);
|
|
||||||
const close = vi.fn();
|
|
||||||
|
|
||||||
registerCrawler(SourceKindSchema.enum.html, () => ({ fetch, close }));
|
|
||||||
|
|
||||||
const manager = {
|
|
||||||
get: vi.fn().mockReturnValue(pipeline),
|
|
||||||
setupLogging: vi.fn(),
|
|
||||||
} as unknown as PipelineConfigManager;
|
|
||||||
|
|
||||||
const persistClose = vi.fn();
|
|
||||||
const persistFactory = vi.fn().mockReturnValue([
|
|
||||||
{ persist: vi.fn(), close: persistClose },
|
|
||||||
]);
|
|
||||||
|
|
||||||
await runSyncCrawl({
|
|
||||||
sourceId: "demo",
|
|
||||||
env: "test",
|
|
||||||
manager,
|
|
||||||
persistFactory,
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(fetch).toHaveBeenCalledTimes(1);
|
|
||||||
expect(close).toHaveBeenCalledTimes(1);
|
|
||||||
expect(persistFactory).toHaveBeenCalledWith({
|
|
||||||
pipeline,
|
|
||||||
source: pipeline.sources.html[0],
|
|
||||||
resolvedSourceId: "demo",
|
|
||||||
});
|
|
||||||
expect(persistClose).toHaveBeenCalledTimes(1);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("throws when source is missing", async () => {
|
|
||||||
const pipeline = createPipeline();
|
|
||||||
registerCrawler(SourceKindSchema.enum.html, () => ({ fetch: vi.fn() }));
|
|
||||||
const manager = {
|
|
||||||
get: vi.fn().mockReturnValue(pipeline),
|
|
||||||
setupLogging: vi.fn(),
|
|
||||||
} as unknown as PipelineConfigManager;
|
|
||||||
|
|
||||||
await expect(
|
|
||||||
runSyncCrawl({ sourceId: "unknown", manager }),
|
|
||||||
).rejects.toThrow("Source 'unknown' not found");
|
|
||||||
});
|
|
||||||
|
|
||||||
it("throws when no crawler registered", async () => {
|
|
||||||
const pipeline = createPipeline();
|
|
||||||
const manager = {
|
|
||||||
get: vi.fn().mockReturnValue(pipeline),
|
|
||||||
setupLogging: vi.fn(),
|
|
||||||
} as unknown as PipelineConfigManager;
|
|
||||||
|
|
||||||
await expect(
|
|
||||||
runSyncCrawl({ sourceId: "demo", manager }),
|
|
||||||
).rejects.toThrow("No crawler registered");
|
|
||||||
});
|
|
||||||
});
|
|
||||||
@@ -1,83 +0,0 @@
|
|||||||
import { describe, expect, it, vi } from "vitest";
|
|
||||||
|
|
||||||
import { ClientConfigSchema } from "@/schema";
|
|
||||||
import { HttpError, SyncHttpClient } from "@/http/http-client";
|
|
||||||
|
|
||||||
const createConfig = () =>
|
|
||||||
ClientConfigSchema.parse({
|
|
||||||
timeout: 1,
|
|
||||||
max_retries: 2,
|
|
||||||
backoff_initial: 0.001,
|
|
||||||
backoff_multiplier: 2,
|
|
||||||
backoff_max: 0.01,
|
|
||||||
});
|
|
||||||
|
|
||||||
describe("SyncHttpClient", () => {
|
|
||||||
it("retries transient statuses", async () => {
|
|
||||||
const config = createConfig();
|
|
||||||
const sleep = vi.fn().mockResolvedValue(undefined);
|
|
||||||
const fetchMock = vi
|
|
||||||
.fn()
|
|
||||||
.mockResolvedValueOnce(new Response("retry", { status: 503 }))
|
|
||||||
.mockResolvedValueOnce(new Response("ok", { status: 200, body: "done" }));
|
|
||||||
|
|
||||||
const client = new SyncHttpClient(config, { fetchImpl: fetchMock, sleep });
|
|
||||||
const response = await client.get("https://example.com");
|
|
||||||
|
|
||||||
expect(await response.text()).toBe("done");
|
|
||||||
expect(fetchMock).toHaveBeenCalledTimes(2);
|
|
||||||
expect(sleep).toHaveBeenCalled();
|
|
||||||
});
|
|
||||||
|
|
||||||
it("respects retry-after header", async () => {
|
|
||||||
const config = createConfig();
|
|
||||||
const sleep = vi.fn().mockResolvedValue(undefined);
|
|
||||||
const fetchMock = vi
|
|
||||||
.fn()
|
|
||||||
.mockResolvedValueOnce(
|
|
||||||
new Response("retry", { status: 503, headers: { "Retry-After": "3" } }),
|
|
||||||
)
|
|
||||||
.mockResolvedValueOnce(new Response("ok", { status: 200 }));
|
|
||||||
|
|
||||||
const client = new SyncHttpClient(config, { fetchImpl: fetchMock, sleep });
|
|
||||||
await client.get("https://example.com");
|
|
||||||
|
|
||||||
expect(sleep).toHaveBeenCalledWith(3000);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("throws http error on non transient failure", async () => {
|
|
||||||
const config = createConfig();
|
|
||||||
const fetchMock = vi
|
|
||||||
.fn()
|
|
||||||
.mockResolvedValueOnce(new Response("bad", { status: 404, statusText: "Not Found" }));
|
|
||||||
|
|
||||||
const client = new SyncHttpClient(config, { fetchImpl: fetchMock });
|
|
||||||
|
|
||||||
await expect(client.get("https://example.com"))
|
|
||||||
.rejects.toBeInstanceOf(HttpError);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("sends json payload and query params", async () => {
|
|
||||||
const config = createConfig();
|
|
||||||
const fetchMock = vi
|
|
||||||
.fn()
|
|
||||||
.mockResolvedValue(new Response("ok", { status: 200 }));
|
|
||||||
|
|
||||||
const client = new SyncHttpClient(config, { fetchImpl: fetchMock });
|
|
||||||
await client.post("https://example.com/api", {
|
|
||||||
params: { page: 1, q: "news" },
|
|
||||||
json: { hello: "world" },
|
|
||||||
headers: { Authorization: "token" },
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(fetchMock).toHaveBeenCalledTimes(1);
|
|
||||||
const [url, init] = fetchMock.mock.calls[0]!;
|
|
||||||
expect(url).toBe("https://example.com/api?page=1&q=news");
|
|
||||||
expect(init?.method).toBe("POST");
|
|
||||||
expect(init?.body).toBe(JSON.stringify({ hello: "world" }));
|
|
||||||
expect((init?.headers as Record<string, string>)["Authorization"]).toBe("token");
|
|
||||||
expect((init?.headers as Record<string, string>)["Content-Type"]).toBe(
|
|
||||||
"application/json",
|
|
||||||
);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
@@ -1,49 +0,0 @@
|
|||||||
import { describe, expect, it, vi } from "vitest";
|
|
||||||
|
|
||||||
import { OpenGraphProvider } from "@/http/open-graph";
|
|
||||||
|
|
||||||
const sampleHtml = `
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Example Article</title>
|
|
||||||
<meta property="og:title" content="Open Graph Title" />
|
|
||||||
<meta property="og:description" content="Summary" />
|
|
||||||
<meta property="og:image" content="https://cdn.example.com/image.jpg" />
|
|
||||||
<meta property="og:url" content="https://example.com/article" />
|
|
||||||
<link rel="canonical" href="https://example.com/canonical" />
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<img src="https://cdn.example.com/fallback.jpg" />
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
`;
|
|
||||||
|
|
||||||
describe("OpenGraphProvider", () => {
|
|
||||||
it("extracts metadata from html", () => {
|
|
||||||
const metadata = OpenGraphProvider.consumeHtml(sampleHtml, "https://example.com");
|
|
||||||
|
|
||||||
expect(metadata).toEqual({
|
|
||||||
title: "Open Graph Title",
|
|
||||||
description: "Summary",
|
|
||||||
image: "https://cdn.example.com/image.jpg",
|
|
||||||
url: "https://example.com/article",
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
it("falls back to null when no metadata present", () => {
|
|
||||||
const empty = OpenGraphProvider.consumeHtml("<html><body></body></html>");
|
|
||||||
expect(empty).toBeNull();
|
|
||||||
});
|
|
||||||
|
|
||||||
it("fetches metadata from url", async () => {
|
|
||||||
const response = new Response(sampleHtml, { status: 200 });
|
|
||||||
const get = vi.fn().mockResolvedValue(response);
|
|
||||||
|
|
||||||
const provider = new OpenGraphProvider({ client: { get } });
|
|
||||||
const metadata = await provider.consumeUrl("https://example.com/article");
|
|
||||||
|
|
||||||
expect(get).toHaveBeenCalledWith("https://example.com/article");
|
|
||||||
expect(metadata?.title).toBe("Open Graph Title");
|
|
||||||
});
|
|
||||||
});
|
|
||||||
@@ -1,27 +0,0 @@
|
|||||||
import fs from "node:fs";
|
|
||||||
import os from "node:os";
|
|
||||||
import path from "node:path";
|
|
||||||
|
|
||||||
import { describe, expect, it } from "vitest";
|
|
||||||
|
|
||||||
import { JsonlPersistor } from "@/persistence";
|
|
||||||
|
|
||||||
describe("JsonlPersistor", () => {
|
|
||||||
it("writes json lines sequentially", async () => {
|
|
||||||
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "jsonl-test-"));
|
|
||||||
const persistor = new JsonlPersistor({ directory: tempDir, sourceId: "demo" });
|
|
||||||
|
|
||||||
await Promise.all([
|
|
||||||
persistor.persist({ id: 1, title: "first" }),
|
|
||||||
persistor.persist({ id: 2, title: "second" }),
|
|
||||||
]);
|
|
||||||
|
|
||||||
await persistor.close();
|
|
||||||
|
|
||||||
const contents = fs.readFileSync(path.join(tempDir, "demo.jsonl"), "utf-8");
|
|
||||||
const lines = contents.trim().split("\n").map((line) => JSON.parse(line));
|
|
||||||
|
|
||||||
expect(lines).toContainEqual({ id: 1, title: "first" });
|
|
||||||
expect(lines).toContainEqual({ id: 2, title: "second" });
|
|
||||||
});
|
|
||||||
});
|
|
||||||
@@ -1,57 +0,0 @@
|
|||||||
import { describe, expect, it } from "vitest";
|
|
||||||
import { createQueueManager, createQueueSettings } from "@/process/async/queue";
|
|
||||||
|
|
||||||
class InMemoryQueue {
|
|
||||||
public jobs: Array<{ name: string; data: unknown }> = [];
|
|
||||||
|
|
||||||
async add(name: string, data: unknown) {
|
|
||||||
this.jobs.push({ name, data });
|
|
||||||
return { id: `${name}-${this.jobs.length}` };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
describe("createQueueManager", () => {
|
|
||||||
it("prefixes queue names", () => {
|
|
||||||
const manager = createQueueManager({
|
|
||||||
settings: createQueueSettings({ prefix: "test" }),
|
|
||||||
queueFactory: (queueName) => {
|
|
||||||
expect(queueName).toBe("listing");
|
|
||||||
return new InMemoryQueue();
|
|
||||||
},
|
|
||||||
connection: {
|
|
||||||
quit: async () => undefined,
|
|
||||||
} as any,
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(manager.iterQueueNames()).toEqual([
|
|
||||||
"test:listing",
|
|
||||||
"test:articles",
|
|
||||||
"test:processed",
|
|
||||||
]);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("enqueues listing job with validated payload", async () => {
|
|
||||||
const queue = new InMemoryQueue();
|
|
||||||
const manager = createQueueManager({
|
|
||||||
queueFactory: () => queue,
|
|
||||||
connection: { quit: async () => undefined } as any,
|
|
||||||
});
|
|
||||||
|
|
||||||
const job = await manager.enqueueListing({
|
|
||||||
source_id: "radiookapi",
|
|
||||||
env: "test",
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(job.id).toBe("collect_listing-1");
|
|
||||||
expect(queue.jobs[0]).toEqual({
|
|
||||||
name: "collect_listing",
|
|
||||||
data: {
|
|
||||||
source_id: "radiookapi",
|
|
||||||
env: "test",
|
|
||||||
page_range: undefined,
|
|
||||||
date_range: undefined,
|
|
||||||
category: undefined,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
});
|
|
||||||
});
|
|
||||||
@@ -1,37 +0,0 @@
|
|||||||
import { describe, expect, it } from "vitest";
|
|
||||||
import {
|
|
||||||
PageRangeSchema,
|
|
||||||
PageRangeSpecSchema,
|
|
||||||
PipelineConfigSchema,
|
|
||||||
} from "@/schema";
|
|
||||||
import {
|
|
||||||
createDateRange,
|
|
||||||
formatDateRange,
|
|
||||||
isTimestampInRange,
|
|
||||||
schemaToJSON,
|
|
||||||
} from "@/utils";
|
|
||||||
|
|
||||||
describe("schema helpers", () => {
|
|
||||||
it("creates date range from spec", () => {
|
|
||||||
const range = createDateRange("2024-01-01:2024-01-31");
|
|
||||||
expect(range.start).toBeLessThan(range.end);
|
|
||||||
expect(formatDateRange(range)).toBe("2024-01-01:2024-01-31");
|
|
||||||
});
|
|
||||||
|
|
||||||
it("checks membership", () => {
|
|
||||||
const range = createDateRange("2024-01-01:2024-01-02");
|
|
||||||
expect(isTimestampInRange(range, range.start)).toBe(true);
|
|
||||||
expect(isTimestampInRange(range, range.start - 1)).toBe(false);
|
|
||||||
});
|
|
||||||
|
|
||||||
it("parses page range spec", () => {
|
|
||||||
const range = PageRangeSchema.parse(PageRangeSpecSchema.parse("1:10"));
|
|
||||||
expect(range).toEqual({ start: 1, end: 10 });
|
|
||||||
});
|
|
||||||
|
|
||||||
it("produces json schema", () => {
|
|
||||||
const json = schemaToJSON(PipelineConfigSchema);
|
|
||||||
// @ts-ignore
|
|
||||||
expect(json.type).toBe("object");
|
|
||||||
});
|
|
||||||
});
|
|
||||||
@@ -1,49 +0,0 @@
|
|||||||
import { describe, expect, it, vi } from "vitest";
|
|
||||||
import { QueueManager } from "../process/async/queue";
|
|
||||||
import {
|
|
||||||
collectListing,
|
|
||||||
registerCrawlerTaskHandlers,
|
|
||||||
scheduleAsyncCrawl,
|
|
||||||
} from "@/process/async/tasks";
|
|
||||||
|
|
||||||
describe("Async tasks", () => {
|
|
||||||
it("schedules crawl with provided manager", async () => {
|
|
||||||
const enqueueListing = vi.fn().mockResolvedValue({ id: "job-1" });
|
|
||||||
const manager = {
|
|
||||||
enqueueListing,
|
|
||||||
} as unknown as QueueManager;
|
|
||||||
|
|
||||||
const jobId = await scheduleAsyncCrawl({
|
|
||||||
sourceId: "radiookapi",
|
|
||||||
queueManager: manager,
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(jobId).toBe("job-1");
|
|
||||||
expect(enqueueListing).toHaveBeenCalledWith({
|
|
||||||
source_id: "radiookapi",
|
|
||||||
env: "development",
|
|
||||||
page_range: undefined,
|
|
||||||
date_range: undefined,
|
|
||||||
category: undefined,
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
it("delegates listing collection to registered handler", async () => {
|
|
||||||
const handler = vi.fn().mockResolvedValue(5);
|
|
||||||
registerCrawlerTaskHandlers({ collectListing: handler });
|
|
||||||
|
|
||||||
const count = await collectListing({
|
|
||||||
source_id: "radiookapi",
|
|
||||||
env: "development",
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(count).toBe(5);
|
|
||||||
expect(handler).toHaveBeenCalledWith({
|
|
||||||
source_id: "radiookapi",
|
|
||||||
env: "development",
|
|
||||||
page_range: undefined,
|
|
||||||
date_range: undefined,
|
|
||||||
category: undefined,
|
|
||||||
});
|
|
||||||
});
|
|
||||||
});
|
|
||||||
@@ -1,152 +1,81 @@
|
|||||||
import fs from "node:fs";
|
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
|
|
||||||
import { logger } from "@basango/logger";
|
import { loadConfig } from "@devscast/config";
|
||||||
|
import { z } from "zod";
|
||||||
import { PipelineConfig, PipelineConfigSchema } from "@/schema";
|
|
||||||
import {
|
import {
|
||||||
ensureDirectories,
|
DateRangeSchema,
|
||||||
mergePipelineConfig,
|
HtmlSourceConfigSchema,
|
||||||
resolveConfigPath,
|
PageRangeSchema,
|
||||||
resolveProjectPaths,
|
UpdateDirectionSchema,
|
||||||
} from "@/utils";
|
WordPressSourceConfigSchema,
|
||||||
import { DEFAULT_CONFIG_FILES } from "@/constants";
|
} from "@/schema";
|
||||||
|
|
||||||
export interface LoadConfigOptions {
|
export const PROJECT_DIR = path.resolve(process.cwd(), "basango", "apps", "crawler");
|
||||||
path?: string;
|
|
||||||
env?: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
const readJsonFile = (filePath: string): unknown => {
|
export const PipelineConfigSchema = z.object({
|
||||||
const contents = fs.readFileSync(filePath, "utf-8");
|
paths: z.object({
|
||||||
return contents.trim() === "" ? {} : JSON.parse(contents);
|
root: z.string().default(PROJECT_DIR),
|
||||||
};
|
data: z.string().default(path.join(PROJECT_DIR, "data", "dataset")),
|
||||||
|
config: z.string().default(path.join(PROJECT_DIR, "config")),
|
||||||
|
}),
|
||||||
|
fetch: z.object({
|
||||||
|
client: z.object({
|
||||||
|
timeout: z.number().positive().default(20),
|
||||||
|
userAgent: z.string().default("Basango/0.1 (+https://github.com/bernard-ng/basango)"),
|
||||||
|
followRedirects: z.boolean().default(true),
|
||||||
|
verifySsl: z.boolean().default(true),
|
||||||
|
rotate: z.boolean().default(true),
|
||||||
|
maxRetries: z.number().int().nonnegative().default(3),
|
||||||
|
backoffInitial: z.number().nonnegative().default(1),
|
||||||
|
backoffMultiplier: z.number().positive().default(2),
|
||||||
|
backoffMax: z.number().nonnegative().default(30),
|
||||||
|
respectRetryAfter: z.boolean().default(true),
|
||||||
|
}),
|
||||||
|
crawler: z.object({
|
||||||
|
source: z.union([HtmlSourceConfigSchema, WordPressSourceConfigSchema]).optional(),
|
||||||
|
pageRange: PageRangeSchema.optional(),
|
||||||
|
dateRange: DateRangeSchema.optional(),
|
||||||
|
category: z.string().optional(),
|
||||||
|
notify: z.boolean().default(false),
|
||||||
|
isUpdate: z.boolean().default(false),
|
||||||
|
useMultiThreading: z.boolean().default(false),
|
||||||
|
maxWorkers: z.number().int().positive().default(5),
|
||||||
|
direction: UpdateDirectionSchema.default("forward"),
|
||||||
|
}),
|
||||||
|
async: z.object({
|
||||||
|
redisUrl: z.string().default("redis://localhost:6379/0"),
|
||||||
|
prefix: z.string().default("basango:crawler:queue"),
|
||||||
|
ttl: z.object({
|
||||||
|
default: z.number().int().positive().default(600),
|
||||||
|
result: z.number().int().nonnegative().default(3600),
|
||||||
|
failure: z.number().int().nonnegative().default(3600),
|
||||||
|
}),
|
||||||
|
queues: z.object({
|
||||||
|
listing: z.string().default("listing"),
|
||||||
|
details: z.string().default("details"),
|
||||||
|
processing: z.string().default("processing"),
|
||||||
|
}),
|
||||||
|
}),
|
||||||
|
}),
|
||||||
|
sources: z.object({
|
||||||
|
html: z.array(HtmlSourceConfigSchema).default([]),
|
||||||
|
wordpress: z.array(WordPressSourceConfigSchema).default([]),
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
|
||||||
const locateConfigFile = (explicit?: string): string => {
|
export const { config, env } = loadConfig({
|
||||||
if (explicit && fs.existsSync(explicit)) {
|
schema: PipelineConfigSchema,
|
||||||
return explicit;
|
cwd: process.cwd(),
|
||||||
}
|
env: {
|
||||||
|
path: path.join(PROJECT_DIR, ".env"),
|
||||||
|
},
|
||||||
|
sources: [
|
||||||
|
path.join(PROJECT_DIR, "config", "pipeline.json"),
|
||||||
|
path.join(PROJECT_DIR, "config", "sources.json"),
|
||||||
|
],
|
||||||
|
});
|
||||||
|
|
||||||
for (const candidate of DEFAULT_CONFIG_FILES) {
|
export type PipelineConfig = z.infer<typeof PipelineConfigSchema>;
|
||||||
if (fs.existsSync(candidate)) {
|
export type FetchClientConfig = PipelineConfig["fetch"]["client"];
|
||||||
return candidate;
|
export type FetchCrawlerConfig = PipelineConfig["fetch"]["crawler"];
|
||||||
}
|
export type FetchAsyncConfig = PipelineConfig["fetch"]["async"];
|
||||||
}
|
|
||||||
|
|
||||||
return DEFAULT_CONFIG_FILES[0]!;
|
|
||||||
};
|
|
||||||
|
|
||||||
const readPipelineConfig = (configPath: string): PipelineConfig => {
|
|
||||||
if (!fs.existsSync(configPath)) {
|
|
||||||
return PipelineConfigSchema.parse({
|
|
||||||
paths: resolveProjectPaths(path.resolve(".")),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
const raw = readJsonFile(configPath);
|
|
||||||
return PipelineConfigSchema.parse(raw);
|
|
||||||
};
|
|
||||||
|
|
||||||
const applyEnvironmentOverride = (
|
|
||||||
baseConfig: PipelineConfig,
|
|
||||||
basePath: string,
|
|
||||||
env?: string,
|
|
||||||
): PipelineConfig => {
|
|
||||||
if (!env || env === "development") {
|
|
||||||
return baseConfig;
|
|
||||||
}
|
|
||||||
|
|
||||||
const overridePath = resolveConfigPath(basePath, env);
|
|
||||||
if (!fs.existsSync(overridePath)) {
|
|
||||||
return baseConfig;
|
|
||||||
}
|
|
||||||
|
|
||||||
const overrides = PipelineConfigSchema.parse(readJsonFile(overridePath));
|
|
||||||
return mergePipelineConfig(baseConfig, overrides);
|
|
||||||
};
|
|
||||||
|
|
||||||
export const loadConfig = (options: LoadConfigOptions = {}): PipelineConfig => {
|
|
||||||
const basePath = locateConfigFile(options.path);
|
|
||||||
const config = applyEnvironmentOverride(
|
|
||||||
readPipelineConfig(basePath),
|
|
||||||
basePath,
|
|
||||||
options.env,
|
|
||||||
);
|
|
||||||
|
|
||||||
ensureDirectories(config.paths);
|
|
||||||
return config;
|
|
||||||
};
|
|
||||||
|
|
||||||
export const dumpConfig = (
|
|
||||||
config: PipelineConfig,
|
|
||||||
targetPath?: string,
|
|
||||||
): void => {
|
|
||||||
const destination = targetPath ?? locateConfigFile();
|
|
||||||
const normalized = PipelineConfigSchema.parse(config);
|
|
||||||
fs.mkdirSync(path.dirname(destination), { recursive: true });
|
|
||||||
fs.writeFileSync(destination, JSON.stringify(normalized, null, 2));
|
|
||||||
};
|
|
||||||
|
|
||||||
export interface PipelineConfigManagerOptions {
|
|
||||||
path?: string;
|
|
||||||
env?: string;
|
|
||||||
autoLoad?: boolean;
|
|
||||||
}
|
|
||||||
|
|
||||||
export class PipelineConfigManager {
|
|
||||||
private readonly explicitPath?: string;
|
|
||||||
|
|
||||||
private readonly defaultEnv: string;
|
|
||||||
|
|
||||||
private cache?: PipelineConfig;
|
|
||||||
|
|
||||||
constructor(options: PipelineConfigManagerOptions = {}) {
|
|
||||||
this.explicitPath = options.path;
|
|
||||||
this.defaultEnv = options.env ?? "development";
|
|
||||||
|
|
||||||
if (options.autoLoad !== false) {
|
|
||||||
this.cache = loadConfig({
|
|
||||||
path: this.explicitPath,
|
|
||||||
env: this.defaultEnv,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
get(env?: string): PipelineConfig {
|
|
||||||
const resolvedEnv = env ?? this.defaultEnv;
|
|
||||||
|
|
||||||
if (resolvedEnv !== this.defaultEnv) {
|
|
||||||
return loadConfig({
|
|
||||||
path: this.explicitPath,
|
|
||||||
env: resolvedEnv,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!this.cache) {
|
|
||||||
this.cache = loadConfig({
|
|
||||||
path: this.explicitPath,
|
|
||||||
env: resolvedEnv,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
return this.cache;
|
|
||||||
}
|
|
||||||
|
|
||||||
setupLogging(config?: PipelineConfig): void {
|
|
||||||
const pipeline = config ?? this.get();
|
|
||||||
ensureDirectories(pipeline.paths);
|
|
||||||
|
|
||||||
const level = pipeline.logging.level.toLowerCase();
|
|
||||||
process.env.LOG_LEVEL = level;
|
|
||||||
logger.level = level as typeof logger.level;
|
|
||||||
|
|
||||||
if (pipeline.logging.file_logging) {
|
|
||||||
const logDir = pipeline.paths.logs;
|
|
||||||
const destination = path.join(logDir, pipeline.logging.log_file);
|
|
||||||
fs.mkdirSync(path.dirname(destination), { recursive: true });
|
|
||||||
if (!fs.existsSync(destination)) {
|
|
||||||
fs.writeFileSync(destination, "");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,15 +1,6 @@
|
|||||||
import path from "node:path";
|
|
||||||
|
|
||||||
export const DEFAULT_DATE_FORMAT = "yyyy-LL-dd";
|
export const DEFAULT_DATE_FORMAT = "yyyy-LL-dd";
|
||||||
export const DEFAULT_CONFIG_FILES = [
|
export const DEFAULT_USER_AGENT = "Basango/0.1 (+https://github.com/bernard-ng/basango)";
|
||||||
path.join(process.cwd(), "config", "pipeline.json"),
|
|
||||||
path.join(process.cwd(), "pipeline.json"),
|
|
||||||
];
|
|
||||||
|
|
||||||
export const DEFAULT_USER_AGENT =
|
|
||||||
"Basango/0.1 (+https://github.com/bernard-ng/basango)";
|
|
||||||
export const OPEN_GRAPH_USER_AGENT = "facebookexternalhit/1.1";
|
export const OPEN_GRAPH_USER_AGENT = "facebookexternalhit/1.1";
|
||||||
|
|
||||||
export const TRANSIENT_HTTP_STATUSES = [429, 500, 502, 503, 504] as const;
|
export const TRANSIENT_HTTP_STATUSES = [429, 500, 502, 503, 504];
|
||||||
|
|
||||||
export const DEFAULT_RETRY_AFTER_HEADER = "retry-after";
|
export const DEFAULT_RETRY_AFTER_HEADER = "retry-after";
|
||||||
|
|||||||
@@ -1,8 +1,12 @@
|
|||||||
import { setTimeout as delay } from "node:timers/promises";
|
import { setTimeout as delay } from "node:timers/promises";
|
||||||
|
|
||||||
import type { ClientConfig } from "@/schema";
|
import {
|
||||||
import { DEFAULT_RETRY_AFTER_HEADER, DEFAULT_USER_AGENT, TRANSIENT_HTTP_STATUSES } from "@/constants";
|
DEFAULT_RETRY_AFTER_HEADER,
|
||||||
|
DEFAULT_USER_AGENT,
|
||||||
|
TRANSIENT_HTTP_STATUSES,
|
||||||
|
} from "@/constants";
|
||||||
import { UserAgents } from "@/http/user-agent";
|
import { UserAgents } from "@/http/user-agent";
|
||||||
|
import { FetchClientConfig } from "@/config";
|
||||||
|
|
||||||
export type HttpHeaders = Record<string, string>;
|
export type HttpHeaders = Record<string, string>;
|
||||||
export type HttpParams = Record<string, string | number | boolean | null | undefined>;
|
export type HttpParams = Record<string, string | number | boolean | null | undefined>;
|
||||||
@@ -34,13 +38,19 @@ export class HttpError extends Error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default sleep function using setTimeout.
|
||||||
|
* @param ms - Milliseconds to sleep
|
||||||
|
*/
|
||||||
const defaultSleep = (ms: number): Promise<void> => {
|
const defaultSleep = (ms: number): Promise<void> => {
|
||||||
if (typeof Bun !== "undefined" && typeof Bun.sleep === "function") {
|
|
||||||
return Bun.sleep(ms);
|
|
||||||
}
|
|
||||||
return delay(ms).then(() => undefined);
|
return delay(ms).then(() => undefined);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds a URL with query parameters.
|
||||||
|
* @param url - The base URL
|
||||||
|
* @param params - The query parameters to append
|
||||||
|
*/
|
||||||
const buildUrl = (url: string, params?: HttpParams): string => {
|
const buildUrl = (url: string, params?: HttpParams): string => {
|
||||||
if (!params || Object.keys(params).length === 0) {
|
if (!params || Object.keys(params).length === 0) {
|
||||||
return url;
|
return url;
|
||||||
@@ -55,10 +65,15 @@ const buildUrl = (url: string, params?: HttpParams): string => {
|
|||||||
return target.toString();
|
return target.toString();
|
||||||
};
|
};
|
||||||
|
|
||||||
const computeBackoff = (config: ClientConfig, attempt: number): number => {
|
/**
|
||||||
|
* Computes the backoff time in milliseconds based on the configuration and attempt number.
|
||||||
|
* @param config - Fetch client configuration
|
||||||
|
* @param attempt - Current attempt number
|
||||||
|
*/
|
||||||
|
const computeBackoff = (config: FetchClientConfig, attempt: number): number => {
|
||||||
const base = Math.min(
|
const base = Math.min(
|
||||||
config.backoff_initial * Math.pow(config.backoff_multiplier, attempt),
|
config.backoffInitial * Math.pow(config.backoffMultiplier, attempt),
|
||||||
config.backoff_max,
|
config.backoffMax,
|
||||||
);
|
);
|
||||||
const jitter = Math.random() * base * 0.25;
|
const jitter = Math.random() * base * 0.25;
|
||||||
return (base + jitter) * 1000;
|
return (base + jitter) * 1000;
|
||||||
@@ -79,18 +94,23 @@ const parseRetryAfter = (header: string): number => {
|
|||||||
return delta > 0 ? delta : 0;
|
return delta > 0 ? delta : 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Base HTTP client providing common functionality.
|
||||||
|
*
|
||||||
|
* @author Bernard Ngandu <bernard@devscast.tech>
|
||||||
|
*/
|
||||||
export class BaseHttpClient {
|
export class BaseHttpClient {
|
||||||
protected readonly config: ClientConfig;
|
protected readonly config: FetchClientConfig;
|
||||||
protected readonly fetchImpl: typeof fetch;
|
protected readonly fetchImpl: typeof fetch;
|
||||||
protected readonly sleep: (ms: number) => Promise<void>;
|
protected readonly sleep: (ms: number) => Promise<void>;
|
||||||
protected readonly headers: HttpHeaders;
|
protected readonly headers: HttpHeaders;
|
||||||
|
|
||||||
constructor(config: ClientConfig, options: HttpClientOptions = {}) {
|
constructor(config: FetchClientConfig, options: HttpClientOptions = {}) {
|
||||||
this.config = config;
|
this.config = config;
|
||||||
const provider =
|
const provider =
|
||||||
options.userAgentProvider ??
|
options.userAgentProvider ??
|
||||||
new UserAgents(config.rotate, config.user_agent ?? DEFAULT_USER_AGENT);
|
new UserAgents(config.rotate, config.userAgent ?? DEFAULT_USER_AGENT);
|
||||||
const userAgent = provider.get() ?? config.user_agent ?? DEFAULT_USER_AGENT;
|
const userAgent = provider.get() ?? config.userAgent ?? DEFAULT_USER_AGENT;
|
||||||
|
|
||||||
const baseHeaders: HttpHeaders = { "User-Agent": userAgent };
|
const baseHeaders: HttpHeaders = { "User-Agent": userAgent };
|
||||||
if (options.defaultHeaders) {
|
if (options.defaultHeaders) {
|
||||||
@@ -115,7 +135,7 @@ export class BaseHttpClient {
|
|||||||
|
|
||||||
if (response) {
|
if (response) {
|
||||||
const retryAfter = response.headers.get(retryAfterHeader);
|
const retryAfter = response.headers.get(retryAfterHeader);
|
||||||
if (retryAfter && this.config.respect_retry_after) {
|
if (retryAfter && this.config.respectRetryAfter) {
|
||||||
waitMs = parseRetryAfter(retryAfter);
|
waitMs = parseRetryAfter(retryAfter);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -130,16 +150,17 @@ export class BaseHttpClient {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Synchronous HTTP client with retry and timeout capabilities.
|
||||||
|
*
|
||||||
|
* @author Bernard Ngandu <bernard@devscast.tech>
|
||||||
|
*/
|
||||||
export class SyncHttpClient extends BaseHttpClient {
|
export class SyncHttpClient extends BaseHttpClient {
|
||||||
async request(
|
async request(method: string, url: string, options: HttpRequestOptions = {}): Promise<Response> {
|
||||||
method: string,
|
|
||||||
url: string,
|
|
||||||
options: HttpRequestOptions = {},
|
|
||||||
): Promise<Response> {
|
|
||||||
const retryAfterHeader = options.retryAfterHeader ?? DEFAULT_RETRY_AFTER_HEADER;
|
const retryAfterHeader = options.retryAfterHeader ?? DEFAULT_RETRY_AFTER_HEADER;
|
||||||
const target = buildUrl(url, options.params);
|
const target = buildUrl(url, options.params);
|
||||||
|
|
||||||
const maxAttempts = this.config.max_retries + 1;
|
const maxAttempts = this.config.maxRetries + 1;
|
||||||
let attempt = 0;
|
let attempt = 0;
|
||||||
let lastError: unknown;
|
let lastError: unknown;
|
||||||
|
|
||||||
@@ -155,20 +176,19 @@ export class SyncHttpClient extends BaseHttpClient {
|
|||||||
headers,
|
headers,
|
||||||
body: options.data as BodyInit | undefined,
|
body: options.data as BodyInit | undefined,
|
||||||
signal: controller.signal,
|
signal: controller.signal,
|
||||||
redirect: this.config.follow_redirects ? "follow" : "manual",
|
redirect: this.config.followRedirects ? "follow" : "manual",
|
||||||
};
|
};
|
||||||
|
|
||||||
if (options.json !== undefined) {
|
if (options.json !== undefined) {
|
||||||
init.body = JSON.stringify(options.json);
|
init.body = JSON.stringify(options.json);
|
||||||
(init.headers as Record<string, string>)["Content-Type"] ??=
|
(init.headers as Record<string, string>)["Content-Type"] ??= "application/json";
|
||||||
"application/json";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const response = await this.fetchImpl(target, init);
|
const response = await this.fetchImpl(target, init);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
TRANSIENT_HTTP_STATUSES.includes(response.status as number) &&
|
TRANSIENT_HTTP_STATUSES.includes(response.status as number) &&
|
||||||
attempt < this.config.max_retries
|
attempt < this.config.maxRetries
|
||||||
) {
|
) {
|
||||||
await this.maybeDelay(attempt, response, retryAfterHeader);
|
await this.maybeDelay(attempt, response, retryAfterHeader);
|
||||||
attempt += 1;
|
attempt += 1;
|
||||||
@@ -188,12 +208,12 @@ export class SyncHttpClient extends BaseHttpClient {
|
|||||||
|
|
||||||
if (error instanceof DOMException && error.name === "AbortError") {
|
if (error instanceof DOMException && error.name === "AbortError") {
|
||||||
lastError = error;
|
lastError = error;
|
||||||
if (attempt >= this.config.max_retries) {
|
if (attempt >= this.config.maxRetries) {
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
lastError = error;
|
lastError = error;
|
||||||
if (attempt >= this.config.max_retries) {
|
if (attempt >= this.config.maxRetries) {
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -207,9 +227,7 @@ export class SyncHttpClient extends BaseHttpClient {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
throw lastError instanceof Error
|
throw lastError instanceof Error ? lastError : new Error("HTTP request failed after retries");
|
||||||
? lastError
|
|
||||||
: new Error("HTTP request failed after retries");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
get(url: string, options?: Omit<HttpRequestOptions, "data" | "json">): Promise<Response> {
|
get(url: string, options?: Omit<HttpRequestOptions, "data" | "json">): Promise<Response> {
|
||||||
|
|||||||
@@ -1,33 +1,30 @@
|
|||||||
import { parse } from "node-html-parser";
|
import { parse } from "node-html-parser";
|
||||||
|
|
||||||
import { OPEN_GRAPH_USER_AGENT } from "@/constants";
|
import { OPEN_GRAPH_USER_AGENT } from "@/constants";
|
||||||
import type { ClientConfig } from "@/schema";
|
|
||||||
import { SyncHttpClient } from "@/http/http-client";
|
import { SyncHttpClient } from "@/http/http-client";
|
||||||
import { UserAgents } from "@/http/user-agent";
|
import { UserAgents } from "@/http/user-agent";
|
||||||
|
import { config } from "@/config";
|
||||||
|
import { ArticleMetadata } from "@/schema";
|
||||||
|
|
||||||
export interface OpenGraphMetadata {
|
/**
|
||||||
title?: string | null;
|
* Picks the first non-empty value from the provided array.
|
||||||
description?: string | null;
|
* @param values - An array of string values
|
||||||
image?: string | null;
|
*/
|
||||||
url?: string | null;
|
const pick = (values: Array<string | null | undefined>): string | undefined => {
|
||||||
}
|
|
||||||
|
|
||||||
export interface OpenGraphProviderOptions {
|
|
||||||
client?: Pick<SyncHttpClient, "get">;
|
|
||||||
clientConfig?: ClientConfig;
|
|
||||||
userAgentProvider?: UserAgents;
|
|
||||||
}
|
|
||||||
|
|
||||||
const pick = (values: Array<string | null | undefined>): string | null => {
|
|
||||||
for (const value of values) {
|
for (const value of values) {
|
||||||
if (value && value.trim().length > 0) {
|
if (value && value.trim().length > 0) {
|
||||||
return value.trim();
|
return value.trim();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return null;
|
return undefined;
|
||||||
};
|
};
|
||||||
|
|
||||||
const extractMeta = (root: ReturnType<typeof parse>, property: string): string | null => {
|
/**
|
||||||
|
* Extracts the content of a meta tag given its property or name.
|
||||||
|
* @param root - The root HTML element
|
||||||
|
* @param property - The property or name of the meta tag to extract
|
||||||
|
*/
|
||||||
|
const extract = (root: ReturnType<typeof parse>, property: string): string | null => {
|
||||||
const selector = `meta[property='${property}'], meta[name='${property}']`;
|
const selector = `meta[property='${property}'], meta[name='${property}']`;
|
||||||
const node = root.querySelector(selector);
|
const node = root.querySelector(selector);
|
||||||
if (!node) {
|
if (!node) {
|
||||||
@@ -36,70 +33,64 @@ const extractMeta = (root: ReturnType<typeof parse>, property: string): string |
|
|||||||
return node.getAttribute("content") ?? null;
|
return node.getAttribute("content") ?? null;
|
||||||
};
|
};
|
||||||
|
|
||||||
export class OpenGraphProvider {
|
/**
|
||||||
|
* OpenGraph consumer for extracting Open Graph metadata from HTML pages.
|
||||||
|
* Uses a synchronous HTTP client to fetch the HTML content.
|
||||||
|
*
|
||||||
|
* @author Bernard Ngandu <bernard@devscast.tech>
|
||||||
|
*/
|
||||||
|
export class OpenGraph {
|
||||||
private readonly client: Pick<SyncHttpClient, "get">;
|
private readonly client: Pick<SyncHttpClient, "get">;
|
||||||
|
|
||||||
constructor(options: OpenGraphProviderOptions = {}) {
|
constructor() {
|
||||||
const provider =
|
const settings = config.fetch.client;
|
||||||
options.userAgentProvider ?? new UserAgents(false, OPEN_GRAPH_USER_AGENT);
|
const provider = new UserAgents(true, OPEN_GRAPH_USER_AGENT);
|
||||||
const clientConfig: ClientConfig =
|
|
||||||
options.clientConfig ?? ({
|
|
||||||
timeout: 20,
|
|
||||||
user_agent: OPEN_GRAPH_USER_AGENT,
|
|
||||||
follow_redirects: true,
|
|
||||||
verify_ssl: true,
|
|
||||||
rotate: false,
|
|
||||||
max_retries: 2,
|
|
||||||
backoff_initial: 1,
|
|
||||||
backoff_multiplier: 2,
|
|
||||||
backoff_max: 5,
|
|
||||||
respect_retry_after: true,
|
|
||||||
} satisfies ClientConfig);
|
|
||||||
|
|
||||||
this.client =
|
this.client = new SyncHttpClient(settings, {
|
||||||
options.client ??
|
userAgentProvider: provider,
|
||||||
new SyncHttpClient(clientConfig, {
|
defaultHeaders: { "User-Agent": provider.og() },
|
||||||
userAgentProvider: provider,
|
});
|
||||||
defaultHeaders: { "User-Agent": provider.og() },
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async consumeUrl(url: string): Promise<OpenGraphMetadata | null> {
|
/**
|
||||||
|
* Consume a URL and extract Open Graph metadata.
|
||||||
|
* @param url - The URL to fetch and parse
|
||||||
|
*/
|
||||||
|
async consumeUrl(url: string): Promise<ArticleMetadata | undefined> {
|
||||||
try {
|
try {
|
||||||
const response = await this.client.get(url);
|
const response = await this.client.get(url);
|
||||||
const html = await response.text();
|
const html = await response.text();
|
||||||
return OpenGraphProvider.consumeHtml(html, url);
|
return OpenGraph.consumeHtml(html, url);
|
||||||
} catch {
|
} catch {
|
||||||
return null;
|
return undefined;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static consumeHtml(html: string, url?: string): OpenGraphMetadata | null {
|
/**
|
||||||
|
* Consume HTML content and extract Open Graph metadata.
|
||||||
|
* @param html - HTML content as a string
|
||||||
|
* @param url - Optional URL of the page
|
||||||
|
*/
|
||||||
|
static consumeHtml(html: string, url?: string): ArticleMetadata | undefined {
|
||||||
if (!html) {
|
if (!html) {
|
||||||
return null;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
const root = parse(html);
|
const root = parse(html);
|
||||||
const title = pick([
|
const title = pick([extract(root, "og:title"), root.querySelector("title")?.text]);
|
||||||
extractMeta(root, "og:title"),
|
const description = pick([extract(root, "og:description"), extract(root, "description")]);
|
||||||
root.querySelector("title")?.text,
|
|
||||||
]);
|
|
||||||
const description = pick([
|
|
||||||
extractMeta(root, "og:description"),
|
|
||||||
extractMeta(root, "description"),
|
|
||||||
]);
|
|
||||||
const image = pick([
|
const image = pick([
|
||||||
extractMeta(root, "og:image"),
|
extract(root, "og:image"),
|
||||||
root.querySelector("img")?.getAttribute("src") ?? null,
|
root.querySelector("img")?.getAttribute("src") ?? null,
|
||||||
]);
|
]);
|
||||||
const canonical = pick([
|
const canonical = pick([
|
||||||
extractMeta(root, "og:url"),
|
extract(root, "og:url"),
|
||||||
root.querySelector("link[rel='canonical']")?.getAttribute("href") ?? null,
|
root.querySelector("link[rel='canonical']")?.getAttribute("href") ?? null,
|
||||||
url ?? null,
|
url ?? null,
|
||||||
]);
|
]);
|
||||||
|
|
||||||
if (!title && !description && !image && !canonical) {
|
if (!title && !description && !image && !canonical) {
|
||||||
return null;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
@@ -1,5 +1,12 @@
|
|||||||
import { DEFAULT_USER_AGENT, OPEN_GRAPH_USER_AGENT } from "@/constants";
|
import { DEFAULT_USER_AGENT, OPEN_GRAPH_USER_AGENT } from "@/constants";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* User agent provider with optional rotation.
|
||||||
|
* Allows fetching a random user agent from a predefined list
|
||||||
|
* or using a fallback user agent.
|
||||||
|
*
|
||||||
|
* @author Bernard Ngandu <bernard@devscast.tech>
|
||||||
|
*/
|
||||||
export class UserAgents {
|
export class UserAgents {
|
||||||
private static readonly USER_AGENTS: string[] = [
|
private static readonly USER_AGENTS: string[] = [
|
||||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 10_4_8; like Mac OS X) AppleWebKit/603.39 (KHTML, like Gecko) Chrome/52.0.3638.271 Mobile Safari/537.5",
|
"Mozilla/5.0 (iPhone; CPU iPhone OS 10_4_8; like Mac OS X) AppleWebKit/603.39 (KHTML, like Gecko) Chrome/52.0.3638.271 Mobile Safari/537.5",
|
||||||
@@ -22,7 +29,7 @@ export class UserAgents {
|
|||||||
this.fallback = fallback;
|
this.fallback = fallback;
|
||||||
}
|
}
|
||||||
|
|
||||||
static og(): string {
|
og(): string {
|
||||||
return OPEN_GRAPH_USER_AGENT;
|
return OPEN_GRAPH_USER_AGENT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,59 +0,0 @@
|
|||||||
import fs from "node:fs";
|
|
||||||
import path from "node:path";
|
|
||||||
|
|
||||||
export interface PersistedRecord {
|
|
||||||
[key: string]: unknown;
|
|
||||||
}
|
|
||||||
|
|
||||||
export interface Persistor {
|
|
||||||
persist(record: PersistedRecord): Promise<void> | void;
|
|
||||||
close?: () => Promise<void> | void;
|
|
||||||
}
|
|
||||||
|
|
||||||
export interface JsonlPersistorOptions {
|
|
||||||
directory: string;
|
|
||||||
sourceId: string;
|
|
||||||
suffix?: string;
|
|
||||||
encoding?: BufferEncoding;
|
|
||||||
}
|
|
||||||
|
|
||||||
export class JsonlPersistor implements Persistor {
|
|
||||||
private readonly filePath: string;
|
|
||||||
private readonly encoding: BufferEncoding;
|
|
||||||
private pending: Promise<void> = Promise.resolve();
|
|
||||||
private closed = false;
|
|
||||||
|
|
||||||
constructor(options: JsonlPersistorOptions) {
|
|
||||||
const suffix = options.suffix ?? ".jsonl";
|
|
||||||
this.encoding = options.encoding ?? "utf-8";
|
|
||||||
|
|
||||||
fs.mkdirSync(options.directory, { recursive: true });
|
|
||||||
this.filePath = path.join(options.directory, `${options.sourceId}${suffix}`);
|
|
||||||
|
|
||||||
if (!fs.existsSync(this.filePath)) {
|
|
||||||
fs.writeFileSync(this.filePath, "", { encoding: this.encoding });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
persist(record: PersistedRecord): Promise<void> {
|
|
||||||
if (this.closed) {
|
|
||||||
return Promise.reject(new Error("Persistor has been closed"));
|
|
||||||
}
|
|
||||||
|
|
||||||
const payload = `${JSON.stringify(record)}\n`;
|
|
||||||
|
|
||||||
this.pending = this.pending.then(async () => {
|
|
||||||
const file = Bun.file(this.filePath);
|
|
||||||
await Bun.write(file, payload, { append: true });
|
|
||||||
});
|
|
||||||
|
|
||||||
return this.pending;
|
|
||||||
}
|
|
||||||
|
|
||||||
async close(): Promise<void> {
|
|
||||||
this.closed = true;
|
|
||||||
await this.pending;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
export type { JsonlPersistorOptions as JsonlOptions };
|
|
||||||
@@ -0,0 +1,138 @@
|
|||||||
|
import { logger } from "@basango/logger";
|
||||||
|
|
||||||
|
import { config, env } from "@/config";
|
||||||
|
import { Article, HtmlSourceConfig, SourceKindSchema, WordPressSourceConfig } from "@/schema";
|
||||||
|
import { createDateRange, formatDateRange, formatPageRange, resolveSourceConfig } from "@/utils";
|
||||||
|
import {
|
||||||
|
DetailsTaskPayload,
|
||||||
|
ListingTaskPayload,
|
||||||
|
ProcessingTaskPayload,
|
||||||
|
} from "@/process/async/schemas";
|
||||||
|
import { createQueueManager, QueueManager } from "@/process/async/queue";
|
||||||
|
import { HtmlCrawler } from "@/process/parsers/html";
|
||||||
|
import { WordPressCrawler } from "@/process/parsers/wordpress";
|
||||||
|
import { JsonlPersistor } from "@/process/persistence";
|
||||||
|
import { SyncHttpClient } from "@/http/http-client";
|
||||||
|
|
||||||
|
import { resolveCrawlerConfig } from "@/process/crawler";
|
||||||
|
|
||||||
|
export const collectHtmlListing = async (
|
||||||
|
payload: ListingTaskPayload,
|
||||||
|
manager: QueueManager = createQueueManager(),
|
||||||
|
): Promise<number> => {
|
||||||
|
const source = resolveSourceConfig(payload.sourceId) as HtmlSourceConfig;
|
||||||
|
if (source.sourceKind !== "html") {
|
||||||
|
return await collectWordPressListing(payload, manager);
|
||||||
|
}
|
||||||
|
|
||||||
|
const settings = resolveCrawlerConfig(source, payload);
|
||||||
|
const crawler = new HtmlCrawler(settings);
|
||||||
|
const pageRange = settings.pageRange ?? (await crawler.getPagination());
|
||||||
|
|
||||||
|
let queued = 0;
|
||||||
|
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
|
||||||
|
const target = crawler.buildPageUrl(page) ?? `${source.sourceUrl}`;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const items = await crawler.fetchLinks(target, source.sourceSelectors.articles);
|
||||||
|
for (const node of items) {
|
||||||
|
const url = crawler.extractLink(node);
|
||||||
|
if (!url) continue;
|
||||||
|
|
||||||
|
await manager.enqueueArticle({
|
||||||
|
url,
|
||||||
|
sourceId: payload.sourceId,
|
||||||
|
category: payload.category,
|
||||||
|
dateRange: createDateRange(payload.dateRange),
|
||||||
|
} as DetailsTaskPayload);
|
||||||
|
queued += 1;
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.error({ error, target }, "Failed to crawl page");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return queued;
|
||||||
|
};
|
||||||
|
|
||||||
|
export const collectWordPressListing = async (
|
||||||
|
payload: ListingTaskPayload,
|
||||||
|
manager: QueueManager = createQueueManager(),
|
||||||
|
): Promise<number> => {
|
||||||
|
const source = resolveSourceConfig(payload.sourceId) as WordPressSourceConfig;
|
||||||
|
if (source.sourceKind !== "wordpress") {
|
||||||
|
return await collectHtmlListing(payload, manager);
|
||||||
|
}
|
||||||
|
|
||||||
|
const settings = resolveCrawlerConfig(source, payload);
|
||||||
|
const crawler = new WordPressCrawler(settings);
|
||||||
|
const pageRange = settings.pageRange ?? (await crawler.getPagination());
|
||||||
|
|
||||||
|
let queued = 0;
|
||||||
|
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
|
||||||
|
const url = crawler.postsEndpoint(page);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const entries = await crawler.fetchLinks(url);
|
||||||
|
for (const data of entries) {
|
||||||
|
const url = data.link;
|
||||||
|
if (!url) continue;
|
||||||
|
|
||||||
|
await manager.enqueueArticle({
|
||||||
|
url,
|
||||||
|
data,
|
||||||
|
sourceId: payload.sourceId,
|
||||||
|
category: payload.category,
|
||||||
|
dateRange: createDateRange(payload.dateRange),
|
||||||
|
} as DetailsTaskPayload);
|
||||||
|
queued += 1;
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.error({ error, page }, "Failed to fetch WordPress page");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return queued;
|
||||||
|
};
|
||||||
|
|
||||||
|
export const collectArticle = async (payload: DetailsTaskPayload): Promise<unknown> => {
|
||||||
|
const source = resolveSourceConfig(payload.sourceId);
|
||||||
|
const settings = resolveCrawlerConfig(source, {
|
||||||
|
pageRange: payload.pageRange ? formatPageRange(payload.pageRange) : undefined,
|
||||||
|
dateRange: payload.dateRange ? formatDateRange(payload.dateRange) : undefined,
|
||||||
|
sourceId: payload.sourceId,
|
||||||
|
category: payload.category,
|
||||||
|
});
|
||||||
|
const persistors = [
|
||||||
|
new JsonlPersistor({
|
||||||
|
directory: config.paths.data,
|
||||||
|
sourceId: String(source.sourceId),
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
if (source.sourceKind === SourceKindSchema.enum.html) {
|
||||||
|
if (!payload.url) throw new Error("Missing article url");
|
||||||
|
const crawler = new HtmlCrawler(settings, { persistors });
|
||||||
|
const html = await crawler.crawl(payload.url);
|
||||||
|
return await crawler.fetchOne(html, settings.dateRange);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (source.sourceKind === SourceKindSchema.enum.wordpress) {
|
||||||
|
const crawler = new WordPressCrawler(settings, { persistors });
|
||||||
|
return await crawler.fetchOne(payload.data ?? {}, settings.dateRange);
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new Error(`Unsupported source kind`);
|
||||||
|
};
|
||||||
|
|
||||||
|
export const forwardForProcessing = async (payload: ProcessingTaskPayload): Promise<Article> => {
|
||||||
|
logger.info({ article: payload.article.title }, "Ready for downstream processing");
|
||||||
|
|
||||||
|
const client = new SyncHttpClient(config.fetch.client);
|
||||||
|
const endpoint = env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT");
|
||||||
|
|
||||||
|
await client.post(endpoint, { json: payload.article });
|
||||||
|
logger.info({ article: payload.article.title }, "Forwarded article to API");
|
||||||
|
|
||||||
|
return payload.article;
|
||||||
|
};
|
||||||
@@ -2,49 +2,17 @@ import { randomUUID } from "node:crypto";
|
|||||||
|
|
||||||
import IORedis from "ioredis";
|
import IORedis from "ioredis";
|
||||||
import { JobsOptions, Queue, QueueOptions } from "bullmq";
|
import { JobsOptions, Queue, QueueOptions } from "bullmq";
|
||||||
import { z } from "zod";
|
|
||||||
|
|
||||||
import {
|
import {
|
||||||
ArticleTaskPayload,
|
DetailsTaskPayload,
|
||||||
ArticleTaskPayloadSchema,
|
DetailsTaskPayloadSchema,
|
||||||
ListingTaskPayload,
|
ListingTaskPayload,
|
||||||
ListingTaskPayloadSchema,
|
ListingTaskPayloadSchema,
|
||||||
ProcessedTaskPayload,
|
ProcessingTaskPayload,
|
||||||
ProcessedTaskPayloadSchema,
|
ProcessingTaskPayloadSchema,
|
||||||
} from "@/process/async/schemas";
|
} from "@/process/async/schemas";
|
||||||
import { parseRedisUrl } from "@/utils";
|
import { parseRedisUrl } from "@/utils";
|
||||||
|
import { config, FetchAsyncConfig } from "@/config";
|
||||||
const QueueSettingsSchema = z.object({
|
|
||||||
redis_url: z
|
|
||||||
.string()
|
|
||||||
.default(process.env.BASANGO_REDIS_URL ?? "redis://localhost:6379/0"),
|
|
||||||
prefix: z.string().default(process.env.BASANGO_QUEUE_PREFIX ?? "crawler"),
|
|
||||||
default_timeout: z
|
|
||||||
.number()
|
|
||||||
.int()
|
|
||||||
.positive()
|
|
||||||
.default(Number(process.env.BASANGO_QUEUE_TIMEOUT ?? 600)),
|
|
||||||
result_ttl: z
|
|
||||||
.number()
|
|
||||||
.int()
|
|
||||||
.nonnegative()
|
|
||||||
.default(Number(process.env.BASANGO_QUEUE_RESULT_TTL ?? 3600)),
|
|
||||||
failure_ttl: z
|
|
||||||
.number()
|
|
||||||
.int()
|
|
||||||
.nonnegative()
|
|
||||||
.default(Number(process.env.BASANGO_QUEUE_FAILURE_TTL ?? 3600)),
|
|
||||||
listing_queue: z.string().default("listing"),
|
|
||||||
article_queue: z.string().default("articles"),
|
|
||||||
processed_queue: z.string().default("processed"),
|
|
||||||
});
|
|
||||||
|
|
||||||
export type QueueSettingsInput = z.input<typeof QueueSettingsSchema>;
|
|
||||||
export type QueueSettings = z.output<typeof QueueSettingsSchema>;
|
|
||||||
|
|
||||||
export const createQueueSettings = (
|
|
||||||
input?: QueueSettingsInput,
|
|
||||||
): QueueSettings => QueueSettingsSchema.parse(input ?? {});
|
|
||||||
|
|
||||||
export interface QueueBackend<T = unknown> {
|
export interface QueueBackend<T = unknown> {
|
||||||
add: (name: string, data: T, opts?: JobsOptions) => Promise<{ id: string }>;
|
add: (name: string, data: T, opts?: JobsOptions) => Promise<{ id: string }>;
|
||||||
@@ -52,14 +20,13 @@ export interface QueueBackend<T = unknown> {
|
|||||||
|
|
||||||
export type QueueFactory = (
|
export type QueueFactory = (
|
||||||
queueName: string,
|
queueName: string,
|
||||||
settings: QueueSettings,
|
settings: FetchAsyncConfig,
|
||||||
connection?: IORedis,
|
connection?: IORedis,
|
||||||
) => QueueBackend;
|
) => QueueBackend;
|
||||||
|
|
||||||
const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => {
|
const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => {
|
||||||
const redisConnection =
|
const redisConnection =
|
||||||
connection ??
|
connection ?? new IORedis(settings.redisUrl, parseRedisUrl(settings.redisUrl));
|
||||||
new IORedis(settings.redis_url, parseRedisUrl(settings.redis_url));
|
|
||||||
const options: QueueOptions = {
|
const options: QueueOptions = {
|
||||||
connection: redisConnection,
|
connection: redisConnection,
|
||||||
prefix: settings.prefix,
|
prefix: settings.prefix,
|
||||||
@@ -69,9 +36,8 @@ const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => {
|
|||||||
return {
|
return {
|
||||||
add: async (name, data, opts) => {
|
add: async (name, data, opts) => {
|
||||||
const job = await queue.add(name, data, {
|
const job = await queue.add(name, data, {
|
||||||
removeOnComplete: settings.result_ttl === 0 ? true : undefined,
|
removeOnComplete: settings.ttl.result === 0 ? true : undefined,
|
||||||
removeOnFail: settings.failure_ttl === 0 ? true : undefined,
|
removeOnFail: settings.ttl.failure === 0 ? true : undefined,
|
||||||
//timeout: settings.default_timeout * 1000,
|
|
||||||
...opts,
|
...opts,
|
||||||
});
|
});
|
||||||
return { id: job.id ?? randomUUID() };
|
return { id: job.id ?? randomUUID() };
|
||||||
@@ -80,59 +46,52 @@ const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => {
|
|||||||
};
|
};
|
||||||
|
|
||||||
export interface CreateQueueManagerOptions {
|
export interface CreateQueueManagerOptions {
|
||||||
settings?: QueueSettings | QueueSettingsInput;
|
|
||||||
queueFactory?: QueueFactory;
|
queueFactory?: QueueFactory;
|
||||||
connection?: IORedis;
|
connection?: IORedis;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface QueueManager {
|
export interface QueueManager {
|
||||||
readonly settings: QueueSettings;
|
readonly settings: FetchAsyncConfig;
|
||||||
readonly connection: IORedis;
|
readonly connection: IORedis;
|
||||||
enqueueListing: (payload: ListingTaskPayload) => Promise<{ id: string }>;
|
enqueueListing: (payload: ListingTaskPayload) => Promise<{ id: string }>;
|
||||||
enqueueArticle: (payload: ArticleTaskPayload) => Promise<{ id: string }>;
|
enqueueArticle: (payload: DetailsTaskPayload) => Promise<{ id: string }>;
|
||||||
enqueueProcessed: (payload: ProcessedTaskPayload) => Promise<{ id: string }>;
|
enqueueProcessed: (payload: ProcessingTaskPayload) => Promise<{ id: string }>;
|
||||||
iterQueueNames: () => string[];
|
iterQueueNames: () => string[];
|
||||||
queueName: (suffix: string) => string;
|
queueName: (suffix: string) => string;
|
||||||
close: () => Promise<void>;
|
close: () => Promise<void>;
|
||||||
}
|
}
|
||||||
|
|
||||||
export const createQueueManager = (
|
export const createQueueManager = (options: CreateQueueManagerOptions = {}): QueueManager => {
|
||||||
options: CreateQueueManagerOptions = {},
|
const settings = config.fetch.async;
|
||||||
): QueueManager => {
|
|
||||||
const settings = createQueueSettings(
|
|
||||||
options.settings as QueueSettingsInput | undefined,
|
|
||||||
);
|
|
||||||
|
|
||||||
const connection =
|
const connection =
|
||||||
options.connection ??
|
options.connection ?? new IORedis(settings.redisUrl, parseRedisUrl(settings.redisUrl));
|
||||||
new IORedis(settings.redis_url, parseRedisUrl(settings.redis_url));
|
|
||||||
const factory = options.queueFactory ?? defaultQueueFactory;
|
const factory = options.queueFactory ?? defaultQueueFactory;
|
||||||
|
|
||||||
const ensureQueue = (queueName: string) =>
|
const ensureQueue = (queueName: string) => factory(queueName, settings, connection);
|
||||||
factory(queueName, settings, connection);
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
settings,
|
settings,
|
||||||
connection,
|
connection,
|
||||||
enqueueListing: (payload) => {
|
enqueueListing: (payload) => {
|
||||||
const data = ListingTaskPayloadSchema.parse(payload);
|
const data = ListingTaskPayloadSchema.parse(payload);
|
||||||
const queue = ensureQueue(settings.listing_queue);
|
const queue = ensureQueue(settings.queues.listing);
|
||||||
return queue.add("collect_listing", data);
|
return queue.add("collect_listing", data);
|
||||||
},
|
},
|
||||||
enqueueArticle: (payload) => {
|
enqueueArticle: (payload) => {
|
||||||
const data = ArticleTaskPayloadSchema.parse(payload);
|
const data = DetailsTaskPayloadSchema.parse(payload);
|
||||||
const queue = ensureQueue(settings.article_queue);
|
const queue = ensureQueue(settings.queues.details);
|
||||||
return queue.add("collect_article", data);
|
return queue.add("collect_article", data);
|
||||||
},
|
},
|
||||||
enqueueProcessed: (payload) => {
|
enqueueProcessed: (payload) => {
|
||||||
const data = ProcessedTaskPayloadSchema.parse(payload);
|
const data = ProcessingTaskPayloadSchema.parse(payload);
|
||||||
const queue = ensureQueue(settings.processed_queue);
|
const queue = ensureQueue(settings.queues.processing);
|
||||||
return queue.add("forward_for_processing", data);
|
return queue.add("forward_for_processing", data);
|
||||||
},
|
},
|
||||||
iterQueueNames: () => [
|
iterQueueNames: () => [
|
||||||
`${settings.prefix}:${settings.listing_queue}`,
|
`${settings.prefix}:${settings.queues.listing}`,
|
||||||
`${settings.prefix}:${settings.article_queue}`,
|
`${settings.prefix}:${settings.queues.details}`,
|
||||||
`${settings.prefix}:${settings.processed_queue}`,
|
`${settings.prefix}:${settings.queues.processing}`,
|
||||||
],
|
],
|
||||||
queueName: (suffix: string) => `${settings.prefix}:${suffix}`,
|
queueName: (suffix: string) => `${settings.prefix}:${suffix}`,
|
||||||
close: async () => {
|
close: async () => {
|
||||||
|
|||||||
@@ -1,36 +1,28 @@
|
|||||||
import { z } from "zod";
|
import { z } from "zod";
|
||||||
import { AnySourceConfig, DateRangeSchema, PageRangeSchema } from "@/schema";
|
import { ArticleSchema, DateRangeSchema, PageRangeSchema } from "@/schema";
|
||||||
|
|
||||||
export const ListingTaskPayloadSchema = z.object({
|
export const ListingTaskPayloadSchema = z.object({
|
||||||
source_id: z.string(),
|
sourceId: z.string(),
|
||||||
env: z.string().default("development"),
|
pageRange: z.string().optional(),
|
||||||
page_range: z.string().optional().nullable(),
|
dateRange: z.string().optional(),
|
||||||
date_range: z.string().optional().nullable(),
|
category: z.string().optional(),
|
||||||
category: z.string().optional().nullable(),
|
});
|
||||||
|
|
||||||
|
export const DetailsTaskPayloadSchema = z.object({
|
||||||
|
sourceId: z.string(),
|
||||||
|
url: z.url(),
|
||||||
|
data: z.any().optional(),
|
||||||
|
page: z.number().int().nonnegative().optional(),
|
||||||
|
pageRange: PageRangeSchema.optional(),
|
||||||
|
dateRange: DateRangeSchema.optional(),
|
||||||
|
category: z.string().optional(),
|
||||||
|
});
|
||||||
|
|
||||||
|
export const ProcessingTaskPayloadSchema = z.object({
|
||||||
|
sourceId: z.string(),
|
||||||
|
article: ArticleSchema,
|
||||||
});
|
});
|
||||||
|
|
||||||
export type ListingTaskPayload = z.infer<typeof ListingTaskPayloadSchema>;
|
export type ListingTaskPayload = z.infer<typeof ListingTaskPayloadSchema>;
|
||||||
|
export type DetailsTaskPayload = z.infer<typeof DetailsTaskPayloadSchema>;
|
||||||
export const ArticleTaskPayloadSchema = z.object({
|
export type ProcessingTaskPayload = z.infer<typeof ProcessingTaskPayloadSchema>;
|
||||||
source_id: z.string(),
|
|
||||||
env: z.string().default("development"),
|
|
||||||
url: z.url(),
|
|
||||||
page: z.number().int().nonnegative().optional(),
|
|
||||||
page_range: PageRangeSchema.optional().nullable(),
|
|
||||||
date_range: DateRangeSchema.optional().nullable(),
|
|
||||||
category: z.string().optional().nullable(),
|
|
||||||
});
|
|
||||||
|
|
||||||
export type ArticleTaskPayload = z.infer<typeof ArticleTaskPayloadSchema>;
|
|
||||||
|
|
||||||
export const ProcessedTaskPayloadSchema = z.object({
|
|
||||||
source_id: z.string(),
|
|
||||||
env: z.string().default("development"),
|
|
||||||
article: z.any(),
|
|
||||||
});
|
|
||||||
|
|
||||||
export type ProcessedTaskPayload = z.infer<typeof ProcessedTaskPayloadSchema>;
|
|
||||||
|
|
||||||
export interface ListingContext {
|
|
||||||
source: AnySourceConfig;
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,171 +1,61 @@
|
|||||||
import { logger } from "@basango/logger";
|
import { logger } from "@basango/logger";
|
||||||
|
|
||||||
import {
|
import {
|
||||||
ArticleTaskPayload,
|
DetailsTaskPayloadSchema,
|
||||||
ArticleTaskPayloadSchema,
|
|
||||||
ListingTaskPayload,
|
|
||||||
ListingTaskPayloadSchema,
|
ListingTaskPayloadSchema,
|
||||||
ProcessedTaskPayload,
|
ProcessingTaskPayloadSchema,
|
||||||
ProcessedTaskPayloadSchema,
|
|
||||||
} from "@/process/async/schemas";
|
} from "@/process/async/schemas";
|
||||||
import {
|
import { createQueueManager } from "@/process/async/queue";
|
||||||
createQueueManager,
|
import * as handlers from "@/process/async/handlers";
|
||||||
QueueManager,
|
import { CrawlingOptions } from "@/process/crawler";
|
||||||
QueueSettings,
|
|
||||||
QueueSettingsInput,
|
|
||||||
} from "@/process/async/queue";
|
|
||||||
|
|
||||||
export interface CrawlerTaskHandlers {
|
|
||||||
collectListing: (payload: ListingTaskPayload) => Promise<number> | number;
|
|
||||||
collectArticle: (payload: ArticleTaskPayload) => Promise<unknown> | unknown;
|
|
||||||
forwardForProcessing: (
|
|
||||||
payload: ProcessedTaskPayload,
|
|
||||||
) => Promise<unknown> | unknown;
|
|
||||||
}
|
|
||||||
|
|
||||||
const notImplemented = (name: keyof CrawlerTaskHandlers) => () => {
|
|
||||||
throw new Error(`Crawler task handler '${name}' is not implemented`);
|
|
||||||
};
|
|
||||||
|
|
||||||
let handlers: CrawlerTaskHandlers = {
|
|
||||||
collectListing: notImplemented("collectListing"),
|
|
||||||
collectArticle: notImplemented("collectArticle"),
|
|
||||||
forwardForProcessing: notImplemented("forwardForProcessing"),
|
|
||||||
};
|
|
||||||
|
|
||||||
export const registerCrawlerTaskHandlers = (
|
|
||||||
overrides: Partial<CrawlerTaskHandlers>,
|
|
||||||
): void => {
|
|
||||||
handlers = { ...handlers, ...overrides };
|
|
||||||
};
|
|
||||||
|
|
||||||
export interface ScheduleAsyncCrawlOptions {
|
|
||||||
sourceId: string;
|
|
||||||
env?: string;
|
|
||||||
pageRange?: string | null;
|
|
||||||
dateRange?: string | null;
|
|
||||||
category?: string | null;
|
|
||||||
settings?: QueueSettings | QueueSettingsInput;
|
|
||||||
queueManager?: QueueManager;
|
|
||||||
}
|
|
||||||
|
|
||||||
export const scheduleAsyncCrawl = async ({
|
|
||||||
sourceId,
|
|
||||||
env = "development",
|
|
||||||
pageRange,
|
|
||||||
dateRange,
|
|
||||||
category,
|
|
||||||
settings,
|
|
||||||
queueManager,
|
|
||||||
}: ScheduleAsyncCrawlOptions): Promise<string> => {
|
|
||||||
const payload = ListingTaskPayloadSchema.parse({
|
|
||||||
source_id: sourceId,
|
|
||||||
env,
|
|
||||||
page_range: pageRange ?? undefined,
|
|
||||||
date_range: dateRange ?? undefined,
|
|
||||||
category: category ?? undefined,
|
|
||||||
});
|
|
||||||
|
|
||||||
const manager = queueManager ?? createQueueManager({ settings });
|
|
||||||
logger.debug(
|
|
||||||
{
|
|
||||||
sourceId,
|
|
||||||
env: payload.env,
|
|
||||||
pageRange: payload.page_range,
|
|
||||||
dateRange: payload.date_range,
|
|
||||||
category: payload.category,
|
|
||||||
},
|
|
||||||
"Scheduling listing collection job",
|
|
||||||
);
|
|
||||||
try {
|
|
||||||
const job = await manager.enqueueListing(payload);
|
|
||||||
logger.info(
|
|
||||||
{ jobId: job.id, sourceId, env: payload.env },
|
|
||||||
"Scheduled listing collection job",
|
|
||||||
);
|
|
||||||
return job.id;
|
|
||||||
} finally {
|
|
||||||
if (!queueManager) {
|
|
||||||
await manager.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
export const collectListing = async (payload: unknown): Promise<number> => {
|
export const collectListing = async (payload: unknown): Promise<number> => {
|
||||||
const data = ListingTaskPayloadSchema.parse(payload);
|
const data = ListingTaskPayloadSchema.parse(payload);
|
||||||
logger.debug(
|
logger.debug({ data }, "Collecting listing");
|
||||||
{
|
|
||||||
sourceId: data.source_id,
|
|
||||||
env: data.env,
|
|
||||||
pageRange: data.page_range,
|
|
||||||
dateRange: data.date_range,
|
|
||||||
category: data.category,
|
|
||||||
},
|
|
||||||
"Collecting listing",
|
|
||||||
);
|
|
||||||
|
|
||||||
const result = await handlers.collectListing(data);
|
const count = await handlers.collectHtmlListing(data);
|
||||||
const count = typeof result === "number" ? result : 0;
|
logger.info({ count }, "Listing collection completed");
|
||||||
|
|
||||||
logger.info(
|
|
||||||
{
|
|
||||||
sourceId: data.source_id,
|
|
||||||
env: data.env,
|
|
||||||
queuedArticles: count,
|
|
||||||
},
|
|
||||||
"Listing collection completed",
|
|
||||||
);
|
|
||||||
|
|
||||||
return count;
|
return count;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const collectArticle = async (payload: unknown): Promise<unknown> => {
|
export const collectArticle = async (payload: unknown): Promise<unknown> => {
|
||||||
const data = ArticleTaskPayloadSchema.parse(payload);
|
const data = DetailsTaskPayloadSchema.parse(payload);
|
||||||
logger.debug(
|
logger.info({ data }, "Collecting article");
|
||||||
{
|
|
||||||
sourceId: data.source_id,
|
|
||||||
env: data.env,
|
|
||||||
url: data.url,
|
|
||||||
page: data.page,
|
|
||||||
},
|
|
||||||
"Collecting article",
|
|
||||||
);
|
|
||||||
|
|
||||||
const result = await handlers.collectArticle(data);
|
const result = await handlers.collectArticle(data);
|
||||||
|
logger.info({ url: data.url }, "Article collection completed");
|
||||||
logger.info(
|
|
||||||
{
|
|
||||||
sourceId: data.source_id,
|
|
||||||
env: data.env,
|
|
||||||
url: data.url,
|
|
||||||
},
|
|
||||||
"Article collection completed",
|
|
||||||
);
|
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const forwardForProcessing = async (
|
export const forwardForProcessing = async (payload: unknown): Promise<unknown> => {
|
||||||
payload: unknown,
|
const data = ProcessingTaskPayloadSchema.parse(payload);
|
||||||
): Promise<unknown> => {
|
logger.debug({ sourceId: data.sourceId }, "Forwarding article for processing");
|
||||||
const data = ProcessedTaskPayloadSchema.parse(payload);
|
|
||||||
logger.debug(
|
|
||||||
{
|
|
||||||
sourceId: data.source_id,
|
|
||||||
env: data.env,
|
|
||||||
},
|
|
||||||
"Forwarding article for processing",
|
|
||||||
);
|
|
||||||
|
|
||||||
const result = await handlers.forwardForProcessing(data);
|
const result = await handlers.forwardForProcessing(data);
|
||||||
|
logger.info({ result }, "Article forwarded for processing");
|
||||||
logger.info(
|
|
||||||
{
|
|
||||||
sourceId: data.source_id,
|
|
||||||
env: data.env,
|
|
||||||
},
|
|
||||||
"Article forwarded for processing",
|
|
||||||
);
|
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export const scheduleAsyncCrawl = async (options: CrawlingOptions): Promise<string> => {
|
||||||
|
const payload = ListingTaskPayloadSchema.parse({
|
||||||
|
sourceId: options.sourceId,
|
||||||
|
pageRange: options.pageRange,
|
||||||
|
dateRange: options.dateRange,
|
||||||
|
category: options.category,
|
||||||
|
});
|
||||||
|
|
||||||
|
const manager = createQueueManager();
|
||||||
|
logger.info({ payload }, "Scheduling listing collection job");
|
||||||
|
|
||||||
|
try {
|
||||||
|
const job = await manager.enqueueListing(payload);
|
||||||
|
logger.info({ job }, "Scheduled listing collection job");
|
||||||
|
|
||||||
|
return job.id;
|
||||||
|
} finally {
|
||||||
|
await manager.close();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|||||||
@@ -1,27 +1,16 @@
|
|||||||
import IORedis from "ioredis";
|
import IORedis from "ioredis";
|
||||||
import { QueueEvents, Worker } from "bullmq";
|
import { QueueEvents, Worker } from "bullmq";
|
||||||
|
|
||||||
import {
|
import { QueueFactory, QueueManager } from "@/process/async/queue";
|
||||||
createQueueManager,
|
import { collectArticle, collectListing, forwardForProcessing } from "@/process/async/tasks";
|
||||||
QueueFactory,
|
|
||||||
QueueManager,
|
|
||||||
QueueSettings,
|
|
||||||
QueueSettingsInput,
|
|
||||||
} from "@/process/async/queue";
|
|
||||||
import {
|
|
||||||
collectArticle,
|
|
||||||
collectListing,
|
|
||||||
forwardForProcessing,
|
|
||||||
} from "@/process/async/tasks";
|
|
||||||
|
|
||||||
export interface WorkerOptions {
|
export interface WorkerOptions {
|
||||||
queueNames?: string[];
|
queueNames?: string[];
|
||||||
settings?: QueueSettings | QueueSettingsInput;
|
|
||||||
connection?: IORedis;
|
connection?: IORedis;
|
||||||
queueFactory?: QueueFactory;
|
queueFactory?: QueueFactory;
|
||||||
concurrency?: number;
|
concurrency?: number;
|
||||||
onError?: (error: Error) => void;
|
onError?: (error: Error) => void;
|
||||||
queueManager?: QueueManager;
|
queueManager: QueueManager;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface WorkerHandle {
|
export interface WorkerHandle {
|
||||||
@@ -30,15 +19,8 @@ export interface WorkerHandle {
|
|||||||
close: () => Promise<void>;
|
close: () => Promise<void>;
|
||||||
}
|
}
|
||||||
|
|
||||||
export const startWorker = (options: WorkerOptions = {}): WorkerHandle => {
|
export const startWorker = (options: WorkerOptions): WorkerHandle => {
|
||||||
const manager =
|
const manager = options.queueManager;
|
||||||
options.queueManager ??
|
|
||||||
createQueueManager({
|
|
||||||
settings: options.settings,
|
|
||||||
connection: options.connection,
|
|
||||||
queueFactory: options.queueFactory,
|
|
||||||
});
|
|
||||||
|
|
||||||
const queueNames = options.queueNames ?? manager.iterQueueNames();
|
const queueNames = options.queueNames ?? manager.iterQueueNames();
|
||||||
const workers: Worker[] = [];
|
const workers: Worker[] = [];
|
||||||
const events: QueueEvents[] = [];
|
const events: QueueEvents[] = [];
|
||||||
|
|||||||
@@ -1,158 +1,44 @@
|
|||||||
import { logger } from "@basango/logger";
|
import { config, FetchCrawlerConfig } from "@/config";
|
||||||
|
import { JsonlPersistor, Persistor } from "@/process/persistence";
|
||||||
|
import { AnySourceConfig } from "@/schema";
|
||||||
|
import logger from "@basango/logger";
|
||||||
|
import { createDateRange, createPageRange } from "@/utils";
|
||||||
|
|
||||||
import { PipelineConfigManager } from "@/config";
|
export interface CrawlingOptions {
|
||||||
import { JsonlPersistor, Persistor } from "@/persistence";
|
|
||||||
import {
|
|
||||||
AnySourceConfig,
|
|
||||||
ClientConfig,
|
|
||||||
CrawlerConfig,
|
|
||||||
CrawlerConfigSchema,
|
|
||||||
PipelineConfig,
|
|
||||||
SourceKind,
|
|
||||||
} from "@/schema";
|
|
||||||
import { createDateRange } from "@/utils";
|
|
||||||
import { PageRangeSchema, PageRangeSpecSchema } from "@/schema";
|
|
||||||
|
|
||||||
export interface CrawlerInstance {
|
|
||||||
fetch: () => Promise<void> | void;
|
|
||||||
close?: () => Promise<void> | void;
|
|
||||||
}
|
|
||||||
|
|
||||||
export interface CrawlerContext {
|
|
||||||
pipeline: PipelineConfig;
|
|
||||||
source: AnySourceConfig;
|
|
||||||
clientConfig: ClientConfig;
|
|
||||||
crawlerConfig: CrawlerConfig;
|
|
||||||
persistors: Persistor[];
|
|
||||||
}
|
|
||||||
|
|
||||||
export type CrawlerFactory = (context: CrawlerContext) => CrawlerInstance;
|
|
||||||
|
|
||||||
const registry = new Map<SourceKind, CrawlerFactory>();
|
|
||||||
|
|
||||||
export const registerCrawler = (kind: SourceKind, factory: CrawlerFactory): void => {
|
|
||||||
registry.set(kind, factory);
|
|
||||||
};
|
|
||||||
|
|
||||||
export const clearCrawlerRegistry = (): void => {
|
|
||||||
registry.clear();
|
|
||||||
};
|
|
||||||
|
|
||||||
export interface RunSyncCrawlOptions {
|
|
||||||
sourceId: string;
|
sourceId: string;
|
||||||
env?: string;
|
pageRange?: string | undefined;
|
||||||
pageRange?: string | null;
|
dateRange?: string | undefined;
|
||||||
dateRange?: string | null;
|
category?: string | undefined;
|
||||||
category?: string | null;
|
|
||||||
notify?: boolean;
|
|
||||||
manager?: PipelineConfigManager;
|
|
||||||
persistFactory?: (context: {
|
|
||||||
pipeline: PipelineConfig;
|
|
||||||
source: AnySourceConfig;
|
|
||||||
resolvedSourceId: string;
|
|
||||||
}) => Persistor[];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const resolvePageRange = (spec?: string | null) => {
|
export const resolveCrawlerConfig = (
|
||||||
if (!spec) return undefined;
|
|
||||||
const parsed = PageRangeSpecSchema.parse(spec);
|
|
||||||
return PageRangeSchema.parse(parsed);
|
|
||||||
};
|
|
||||||
|
|
||||||
const resolveCrawlerConfig = (
|
|
||||||
source: AnySourceConfig,
|
source: AnySourceConfig,
|
||||||
options: RunSyncCrawlOptions,
|
options: CrawlingOptions,
|
||||||
): CrawlerConfig => {
|
): FetchCrawlerConfig => {
|
||||||
const page_range = resolvePageRange(options.pageRange);
|
return {
|
||||||
const date_range = options.dateRange ? createDateRange(options.dateRange) : undefined;
|
...config.fetch.crawler,
|
||||||
|
|
||||||
return CrawlerConfigSchema.parse({
|
|
||||||
source,
|
source,
|
||||||
page_range,
|
dateRange: createDateRange(options.dateRange),
|
||||||
date_range,
|
pageRange: createPageRange(options.pageRange),
|
||||||
category: options.category ?? undefined,
|
category: options.category,
|
||||||
notify: options.notify ?? false,
|
};
|
||||||
});
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const createPersistors = (
|
export const createPersistors = (source: AnySourceConfig): Persistor[] => {
|
||||||
context: { pipeline: PipelineConfig; source: AnySourceConfig; sourceId: string },
|
|
||||||
factory?: RunSyncCrawlOptions["persistFactory"],
|
|
||||||
): Persistor[] => {
|
|
||||||
if (factory) {
|
|
||||||
return factory({
|
|
||||||
pipeline: context.pipeline,
|
|
||||||
source: context.source,
|
|
||||||
resolvedSourceId: context.sourceId,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
return [
|
return [
|
||||||
new JsonlPersistor({
|
new JsonlPersistor({
|
||||||
directory: context.pipeline.paths.data,
|
directory: config.paths.data,
|
||||||
sourceId: context.sourceId,
|
sourceId: source.sourceId,
|
||||||
}),
|
}),
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
|
||||||
export const runSyncCrawl = async (options: RunSyncCrawlOptions): Promise<void> => {
|
export const closePersistors = async (persistors: Persistor[]): Promise<void> => {
|
||||||
const env = options.env ?? "development";
|
for (const persistor of persistors) {
|
||||||
const manager = options.manager ?? new PipelineConfigManager({ env });
|
try {
|
||||||
const pipeline = manager.get(env);
|
await persistor.close();
|
||||||
manager.setupLogging(pipeline);
|
} catch (error) {
|
||||||
|
logger.warn({ error }, "Failed to close persistor");
|
||||||
const source = pipeline.sources.find(options.sourceId);
|
|
||||||
if (!source) {
|
|
||||||
throw new Error(`Source '${options.sourceId}' not found in configuration`);
|
|
||||||
}
|
|
||||||
|
|
||||||
const crawlerConfig = resolveCrawlerConfig(source, options);
|
|
||||||
const sourceId = source.source_id ?? options.sourceId;
|
|
||||||
const persistors = createPersistors({ pipeline, source, sourceId }, options.persistFactory);
|
|
||||||
|
|
||||||
const factory = registry.get(source.source_kind as SourceKind);
|
|
||||||
if (!factory) {
|
|
||||||
throw new Error(`No crawler registered for source kind '${source.source_kind}'`);
|
|
||||||
}
|
|
||||||
|
|
||||||
const context: CrawlerContext = {
|
|
||||||
pipeline,
|
|
||||||
source,
|
|
||||||
clientConfig: pipeline.fetch.client,
|
|
||||||
crawlerConfig,
|
|
||||||
persistors,
|
|
||||||
};
|
|
||||||
|
|
||||||
const crawler = factory(context);
|
|
||||||
if (!crawler || typeof crawler.fetch !== "function") {
|
|
||||||
throw new Error("Registered crawler did not return a valid instance");
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
await crawler.fetch();
|
|
||||||
logger.info(
|
|
||||||
{
|
|
||||||
sourceId: options.sourceId,
|
|
||||||
kind: source.source_kind,
|
|
||||||
env,
|
|
||||||
},
|
|
||||||
"Synchronous crawl completed",
|
|
||||||
);
|
|
||||||
} finally {
|
|
||||||
for (const persistor of persistors) {
|
|
||||||
try {
|
|
||||||
await persistor.close?.();
|
|
||||||
} catch (error) {
|
|
||||||
logger.warn({ error }, "Failed to close persistor");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (typeof crawler.close === "function") {
|
|
||||||
try {
|
|
||||||
await crawler.close();
|
|
||||||
} catch (error) {
|
|
||||||
logger.warn({ error }, "Failed to close crawler");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -0,0 +1,108 @@
|
|||||||
|
import { parse as parseHtml, HTMLElement } from "node-html-parser";
|
||||||
|
|
||||||
|
import { SyncHttpClient } from "@/http/http-client";
|
||||||
|
import { OpenGraph } from "@/http/open-graph";
|
||||||
|
import type { Persistor } from "@/process/persistence";
|
||||||
|
import { config, FetchCrawlerConfig } from "@/config";
|
||||||
|
import { AnySourceConfig, Article } from "@/schema";
|
||||||
|
|
||||||
|
/**
 * Optional collaborators handed to a crawler when it is constructed.
 */
export interface CrawlerOptions {
  /** Sinks that receive every crawled article. */
  persistors?: Array<Persistor>;
}
|
||||||
|
|
||||||
|
/**
 * Shared foundation for the source-specific crawlers.
 *
 * Keeps the crawl settings and the source bound to them, an HTTP client built
 * from `config.fetch.client`, the persistors and an Open Graph consumer, and
 * exposes small HTML helpers to subclasses.
 */
export abstract class BaseCrawler {
  protected readonly settings: FetchCrawlerConfig;
  protected readonly source: AnySourceConfig;
  protected readonly http: SyncHttpClient;
  protected readonly persistors: Persistor[];
  protected readonly openGraph: OpenGraph;

  /**
   * @param settings - Crawl settings; must carry the source to crawl
   * @param options - Optional persistors (defaults to none)
   * @throws Error when `settings.source` is missing
   */
  protected constructor(settings: FetchCrawlerConfig, options: CrawlerOptions = {}) {
    const boundSource = settings.source;
    if (!boundSource) {
      throw new Error("Crawler requires a bound source");
    }

    this.http = new SyncHttpClient(config.fetch.client);
    this.openGraph = new OpenGraph();
    this.persistors = options.persistors ?? [];
    this.settings = settings;
    this.source = boundSource as AnySourceConfig;
  }

  /**
   * Fetch and process articles from the source.
   */
  abstract fetch(): Promise<void> | void;

  /**
   * Download a URL with the crawler's HTTP client.
   * @param url - The URL to crawl
   * @returns The response body as text
   */
  async crawl(url: string): Promise<string> {
    const reply = await this.http.get(url);
    const body = await reply.text();
    return body;
  }

  /**
   * Read the trimmed text of an HTML node.
   * @param node - The HTML node (may be absent)
   * @returns The trimmed text, or null when the node is missing or its text is empty
   */
  protected textContent(node: HTMLElement | null | undefined): string | null {
    if (!node) {
      return null;
    }
    // innerText keeps spacing similar to browser rendering; fall back to `text`
    const raw = (node as any).innerText ?? node.text;
    const trimmed = (typeof raw === "string" ? raw : String(raw ?? "")).trim();
    return trimmed.length > 0 ? trimmed : null;
  }

  /**
   * Find the first element under `root` matching `selector`.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   * @returns The match, or null when there is no selector, no match, or the query throws
   */
  protected extractFirst(root: HTMLElement, selector?: string | null): HTMLElement | null {
    if (!selector) {
      return null;
    }
    try {
      const match = (root as any).querySelector?.(selector);
      return match ?? null;
    } catch {
      return null;
    }
  }

  /**
   * Find every element under `root` matching `selector`.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   * @returns The matches, or an empty array when there is no selector, no match, or the query throws
   */
  protected extractAll(root: HTMLElement, selector?: string | null): HTMLElement[] {
    if (!selector) {
      return [];
    }
    try {
      const matches = (root as any).querySelectorAll?.(selector);
      return (matches ?? []) as HTMLElement[];
    } catch {
      return [];
    }
  }

  /**
   * Parse an HTML string with node-html-parser.
   * @param html - The HTML string
   * @returns The parsed root element
   */
  protected parseHtml(html: string): HTMLElement {
    const root = parseHtml(html);
    return root as unknown as HTMLElement;
  }

  /**
   * Attach Open Graph metadata fetched from `url` to a copy of the record.
   * Best effort: without a URL, or when the lookup throws, `metadata` is undefined.
   * @param record - The article record
   * @param url - The URL to fetch Open Graph data from
   * @returns A new record carrying the `metadata` field
   */
  protected async enrichWithOpenGraph(record: Article, url?: string): Promise<Article> {
    if (!url) {
      return { ...record, metadata: undefined };
    }
    try {
      const metadata = await this.openGraph.consumeUrl(url);
      return { ...record, metadata };
    } catch {
      return { ...record, metadata: undefined };
    }
  }
}
|
||||||
@@ -0,0 +1,338 @@
|
|||||||
|
import { logger } from "@basango/logger";
|
||||||
|
import { HTMLElement } from "node-html-parser";
|
||||||
|
import { getUnixTime, isMatch as isDateMatch, parse as parseDateFns } from "date-fns";
|
||||||
|
|
||||||
|
import { isTimestampInRange, createAbsoluteUrl } from "@/utils";
|
||||||
|
import { persist, Persistor } from "@/process/persistence";
|
||||||
|
import { BaseCrawler } from "@/process/parsers/base";
|
||||||
|
import TurndownService from "turndown";
|
||||||
|
import { DateRange, HtmlSourceConfig } from "@/schema";
|
||||||
|
import { FetchCrawlerConfig } from "@/config";
|
||||||
|
|
||||||
|
// Shared HTML-to-Markdown converter: ATX headings, "---" rules and "-" bullets.
const md = new TurndownService({
  bulletListMarker: "-",
  headingStyle: "atx",
  hr: "---",
});
|
||||||
|
|
||||||
|
/**
 * Compile a pattern into a global RegExp without ever throwing.
 * @param pattern - Regular-expression source; may be missing
 * @returns The compiled expression (flag `g`), or null when the pattern is
 *          empty, missing or not a valid regular expression
 */
const safeRegExp = (pattern?: string | null): RegExp | null => {
  let compiled: RegExp | null = null;
  if (pattern) {
    try {
      compiled = new RegExp(pattern, "g");
    } catch {
      compiled = null;
    }
  }
  return compiled;
};
|
||||||
|
|
||||||
|
/**
 * Crawler for generic HTML pages.
 *
 * Walks the paginated listing of an HTML source, extracts each article node
 * with the configured selectors (optionally following the article link to its
 * detail page) and hands every article to the persistors.
 */
export class HtmlCrawler extends BaseCrawler {
  readonly source: HtmlSourceConfig;
  // Link of the article fetch() is currently processing; processArticle()
  // prefers it over re-extracting the link from the HTML it receives.
  private currentArticleUrl: string | null = null;

  /**
   * @param settings - Crawl settings bound to a source of kind "html"
   * @param options - Optional persistors
   * @throws Error when the bound source is missing or is not of kind "html"
   */
  constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
    super(settings, options);

    if (!settings.source || settings.source.sourceKind !== "html") {
      throw new Error("HtmlCrawler requires a source of kind 'html'");
    }
    this.source = this.settings.source as HtmlSourceConfig;
  }

  /**
   * Crawl every listing page of the configured (or discovered) page range and
   * persist the articles found.
   *
   * Pages that fail to download or contain no article are skipped. Articles
   * that cannot be processed (no link, detail page unreachable, unexpected
   * error) are skipped individually. The crawl stops at the first article that
   * falls outside the date range.
   */
  async fetch(): Promise<void> {
    const pageRange = this.settings.pageRange ?? (await this.getPagination());
    const dateRange = this.settings.dateRange;

    const articleSelector = this.source.sourceSelectors.articles;
    if (!articleSelector) {
      logger.error(
        { source: this.source.sourceId },
        "No article selector configured for HTML source",
      );
      return;
    }

    let stop = false;
    for (let page = pageRange.start; page <= pageRange.end; page += 1) {
      const pageUrl = this.buildPageUrl(page);
      let html: string;
      try {
        html = await this.crawl(pageUrl);
      } catch (error) {
        logger.error({ error, page, pageUrl }, "> page %s => [failed]", page);
        continue;
      }

      const root = this.parseHtml(html);
      const articles = this.extractAll(root, articleSelector);
      if (!articles.length) {
        logger.info({ page }, "No articles found on page");
        continue;
      }

      for (const node of articles) {
        try {
          this.currentArticleUrl = this.extractLink(node);
          let targetHtml = node.toString();

          if (this.source.requiresDetails) {
            if (!this.currentArticleUrl) {
              logger.debug({ page }, "Skipping article without link for details");
              continue;
            }
            try {
              targetHtml = await this.crawl(this.currentArticleUrl);
            } catch (err) {
              logger.error(
                { error: err, url: this.currentArticleUrl },
                "Failed to fetch detail page",
              );
              continue;
            }
          }

          const outcome = await this.processArticle(targetHtml, dateRange);
          // Stop early on the first out-of-range article (assumes pages are
          // sorted by date desc). An article skipped for another reason, such
          // as a missing link, must not end the crawl.
          // NOTE(review): presumably isTimestampInRange is also false for
          // articles NEWER than the range; if so, a range ending in the past
          // stops at the very first article - confirm this is intended.
          if (outcome.outOfRange) {
            stop = true;
            break;
          }
        } catch (error) {
          logger.error({ error, pageUrl }, "Failed to process article on page");
        } finally {
          this.currentArticleUrl = null;
        }
      }

      if (stop) break;
    }
  }

  /**
   * Fetch and process a single HTML article.
   * @param html - The HTML content of the article
   * @param dateRange - Optional date range for filtering
   * @returns The persisted article, or null when the article has no link or
   *          falls outside the date range
   */
  async fetchOne(html: string, dateRange?: DateRange | null) {
    const { article } = await this.processArticle(html, dateRange);
    return article;
  }

  /**
   * Extract, filter, enrich and persist one article, reporting why nothing
   * was saved so callers can tell "skip this one" from "stop crawling".
   * @param html - The HTML content of the article
   * @param dateRange - Optional date range for filtering
   * @returns `article` is the persisted record or null; `outOfRange` is true
   *          only when the article was rejected by the date range
   */
  private async processArticle(html: string, dateRange?: DateRange | null) {
    const root = this.parseHtml(html);
    const sel = this.source.sourceSelectors;

    const titleText = this.extractText(root, sel.articleTitle) ?? "Untitled";
    const link = this.currentArticleUrl ?? this.extractLink(root);
    if (!link) {
      logger.warn({ title: titleText }, "Skipping article without link");
      return { article: null, outOfRange: false };
    }

    const body = this.extractBody(root, sel.articleBody);
    const categories = this.extractCategories(root, sel.articleCategories);
    const rawDate = this.extractText(root, sel.articleDate);
    const timestamp = this.computeTimestamp(rawDate);

    if (dateRange && !isTimestampInRange(dateRange, timestamp)) {
      logger.info(
        { title: titleText, link, date: rawDate, timestamp },
        "Skipping article outside date range",
      );
      return { article: null, outOfRange: true };
    }

    const enriched = await this.enrichWithOpenGraph(
      {
        title: titleText,
        link,
        body,
        categories,
        source: this.source.sourceId,
        timestamp,
      },
      link,
    );

    const article = await persist(enriched, this.persistors);
    return { article, outOfRange: false };
  }

  /**
   * Fetch links from the target URL using the given selector.
   * @param target - The target URL to crawl
   * @param selector - The CSS selector to extract links
   * @returns The elements of the downloaded page that match the selector
   */
  async fetchLinks(target: string, selector: string) {
    const html = await this.crawl(target);
    const root = this.parseHtml(html);
    return this.extractAll(root, selector);
  }

  /**
   * Get the pagination range (start and end page numbers).
   * @returns A range starting at page 0 and ending at the detected last page
   */
  async getPagination(): Promise<{ start: number; end: number }> {
    return { start: 0, end: await this.getLastPage() };
  }

  /**
   * Determine the last page number from pagination links.
   *
   * Downloads the first listing page, takes the last element matched by the
   * pagination selector and reads a page number from its `href`: an explicit
   * `page` query parameter first, otherwise the first number in the href.
   * @returns The detected page number, or 1 whenever it cannot be determined
   */
  private async getLastPage(): Promise<number> {
    try {
      // Page 0 is where fetch() starts; going through buildPageUrl keeps the
      // URL identical to the one fetch() requests and avoids sending a
      // literal "{page}" placeholder to the server.
      const html = await this.crawl(this.buildPageUrl(0));
      const root = this.parseHtml(html);
      const links = this.extractAll(root, this.source.sourceSelectors.pagination);
      if (!links.length) return 1;

      const last = links[links.length - 1]!;
      const href = (last as any).getAttribute?.("href") as string | null;
      if (!href) return 1;

      const toPage = (value: string | null | undefined): number | null => {
        if (!value) return null;
        const page = Number.parseInt(value, 10);
        return Number.isFinite(page) && page > 0 ? page : null;
      };

      // An explicit "page" parameter is the most specific signal, so it is
      // checked before the first-number heuristic (which would otherwise
      // always win and could pick up an unrelated number such as a year).
      let pageParam: string | null = null;
      try {
        const urlObj = new URL(createAbsoluteUrl(this.source.sourceUrl, href));
        pageParam = urlObj.searchParams.get("page");
      } catch {
        pageParam = null;
      }

      return toPage(pageParam) ?? toPage(href.match(/(\d+)/)?.[1]) ?? 1;
    } catch {
      return 1;
    }
  }

  /**
   * Build the URL for a given page number.
   *
   * Substitutes `{page}` in the pagination template when present; otherwise a
   * `page=<n>` query parameter is appended for pages greater than 0.
   * @param page - The page number
   * @returns The page URL resolved against the source URL
   */
  buildPageUrl(page: number): string {
    let template = this.applyCategory(this.source.paginationTemplate);
    if (template.includes("{page}")) {
      template = template.replace("{page}", String(page));
    } else if (page > 0) {
      const sep = template.includes("?") ? "&" : "?";
      template = `${template}${sep}page=${page}`;
    }
    return createAbsoluteUrl(this.source.sourceUrl, template);
  }

  /**
   * Apply category replacement in the template if needed.
   * @param template - The URL template
   * @returns The template with `{category}` replaced by the configured
   *          category (empty string when none is set)
   */
  private applyCategory(template: string): string {
    if (template.includes("{category}")) {
      const replacement = this.settings.category ?? "";
      return template.replace("{category}", replacement);
    }
    return template;
  }

  /**
   * Extract link URL from the given node using the `articleLink` selector.
   * @param node - The HTML element
   * @returns The absolute URL read from `href`, `data-href` or `src` (in that
   *          order), or null when no selector, element or attribute is found
   */
  extractLink(node: HTMLElement): string | null {
    const selector = this.source.sourceSelectors.articleLink;
    if (!selector) return null;
    const target = this.extractFirst(node, selector);
    if (!target) return null;

    const href =
      (target.getAttribute?.("href") as string | null) ??
      ((target as any).getAttribute?.("data-href") as string | null) ??
      ((target as any).getAttribute?.("src") as string | null);
    if (!href) return null;
    return createAbsoluteUrl(this.source.sourceUrl, href);
  }

  /**
   * Extract text content from the root using the selector.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   * @returns The text of the first match (for images, a non-empty alt/title is
   *          preferred), or null when nothing usable is found
   */
  private extractText(root: HTMLElement, selector?: string | null): string | null {
    if (!selector) return null;
    const target = this.extractFirst(root, selector);
    if (!target) return null;

    // If it's an image, prefer alt/title
    const tag = (target as any).tagName?.toLowerCase?.() as string | undefined;
    if (tag === "img") {
      const alt = (target as any).getAttribute?.("alt") as string | null;
      const title = (target as any).getAttribute?.("title") as string | null;
      const pick = (alt ?? title ?? "").trim();
      if (pick.length > 0) return pick;
    }
    return this.textContent(target);
  }

  /**
   * Extract body content from the root using the selector.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   * @returns Markdown of the matched nodes joined by newlines, or Markdown of
   *          the whole root when the selector yields nothing
   */
  private extractBody(root: HTMLElement, selector?: string | null): string {
    if (selector) {
      const nodes = this.extractAll(root, selector);
      if (nodes.length) {
        const parts = nodes.map((n) => md.turndown(n.toString())).filter(Boolean);
        if (parts.length) return parts.join("\n");
      }
    }
    return md.turndown(root.toString());
  }

  /**
   * Extract categories from the root using the selector.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   * @returns Lower-cased, de-duplicated category texts in document order
   */
  private extractCategories(root: HTMLElement, selector?: string | null): string[] {
    if (!selector) return [];
    const values: string[] = [];
    for (const node of this.extractAll(root, selector)) {
      const text = this.textContent(node);
      if (!text) continue;
      const lower = text.toLowerCase();
      if (!values.includes(lower)) values.push(lower);
    }
    return values;
  }

  /**
   * Compute Unix timestamp from raw date string.
   *
   * Applies the source's optional pattern/replacement clean-up, then parses
   * with the source's date format (default "yyyy-LL-dd HH:mm"), falling back
   * to `Date.parse`. The current time is used whenever the date is missing or
   * cannot be parsed.
   * @param raw - Raw date string
   * @returns Seconds since the Unix epoch
   * @private
   */
  private computeTimestamp(raw?: string | null): number {
    if (!raw) return Math.floor(Date.now() / 1000);
    let value = raw.trim();
    const pattern = safeRegExp(this.source.sourceDate?.pattern);
    const replacement = this.source.sourceDate?.replacement ?? "";
    if (pattern) {
      try {
        value = value.replace(pattern, replacement);
      } catch {
        // ignore pattern failures
      }
    }
    const format = this.source.sourceDate?.format ?? "yyyy-LL-dd HH:mm";
    if (!isDateMatch(value, format)) {
      // fallback: try native Date.parse as last resort
      const parsed = Date.parse(value);
      return Number.isNaN(parsed) ? Math.floor(Date.now() / 1000) : Math.floor(parsed / 1000);
    }
    const date = parseDateFns(value, format, new Date());
    const ts = getUnixTime(date);
    return Number.isFinite(ts) ? ts : Math.floor(Date.now() / 1000);
  }
}
|
||||||
@@ -0,0 +1,240 @@
|
|||||||
|
import { logger } from "@basango/logger";
|
||||||
|
|
||||||
|
import { DateRange, PageRange, WordPressSourceConfig } from "@/schema";
|
||||||
|
import { BaseCrawler } from "@/process/parsers/base";
|
||||||
|
import { persist, Persistor } from "@/process/persistence";
|
||||||
|
import TurndownService from "turndown";
|
||||||
|
import { FetchCrawlerConfig } from "@/config";
|
||||||
|
|
||||||
|
// Shared HTML-to-Markdown converter: ATX headings, "---" rules and "-" bullets.
const md = new TurndownService({
  bulletListMarker: "-",
  headingStyle: "atx",
  hr: "---",
});
|
||||||
|
|
||||||
|
/**
 * Subset of a WordPress REST API post that the crawler reads.
 * Every field is optional because the payload is not validated beforehand.
 */
interface WordPressPost {
  /** URL of the post; posts without it are skipped. */
  link?: string;
  /** Slug, used as the title when no rendered title text is available. */
  slug?: string;
  /** Rendered title HTML. */
  title?: { rendered?: string };
  /** Rendered body HTML. */
  content?: { rendered?: string };
  /** Publication date string as returned by the API. */
  date?: string;
  /** Category ids, resolved to slugs by the crawler. */
  categories?: Array<number>;
}
|
||||||
|
|
||||||
|
/**
 * Crawler for WordPress sites using the REST API.
 *
 * Pages through `wp-json/wp/v2/posts` (newest first), converts each post to an
 * article and hands it to the persistors.
 */
export class WordPressCrawler extends BaseCrawler {
  readonly source: WordPressSourceConfig;
  // Category id -> slug, filled lazily by fetchCategories().
  private categoryMap: Map<number, string> = new Map();

  private static readonly POST_QUERY =
    "_fields=date,slug,link,title.rendered,content.rendered,categories&orderby=date&order=desc";
  private static readonly CATEGORY_QUERY =
    "_fields=id,slug,count&orderby=count&order=desc&per_page=100";
  private static readonly TOTAL_PAGES_HEADER = "x-wp-totalpages";
  private static readonly TOTAL_POSTS_HEADER = "x-wp-total";

  /**
   * @param settings - Crawl settings bound to a source of kind "wordpress"
   * @param options - Optional persistors
   * @throws Error when the bound source is missing or is not of kind "wordpress"
   */
  constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
    super(settings, options);

    if (!settings.source || settings.source.sourceKind !== "wordpress") {
      throw new Error("WordPressCrawler requires a source of kind 'wordpress'");
    }
    this.source = this.settings.source as WordPressSourceConfig;
  }

  /**
   * Fetch and process WordPress posts.
   *
   * Walks the configured (or discovered) page range. A page that fails is
   * logged and skipped; a post without a link is skipped; the crawl stops at
   * the first post that falls outside the date range.
   */
  async fetch(): Promise<void> {
    const pageRange = this.settings.pageRange ?? (await this.getPagination());
    const dateRange = this.settings.dateRange;

    let stop = false;
    for (let page = pageRange.start; page <= pageRange.end; page += 1) {
      const endpoint = this.postsEndpoint(page);
      try {
        const response = await this.http.get(endpoint);
        const data = (await response.json()) as unknown;
        const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
        if (!Array.isArray(data)) {
          logger.warn({ type: typeof data, page }, "Unexpected WordPress payload type");
        }

        for (const entry of articles) {
          const outcome = await this.processPost(entry, dateRange);
          // Posts are requested newest first, so the first out-of-range post
          // ends the crawl. A post skipped for another reason (no link) must
          // not end it.
          // NOTE(review): presumably isTimestampInRange is also false for
          // posts NEWER than the range; if so, a range ending in the past
          // stops at the very first post - confirm this is intended.
          if (outcome.outOfRange) {
            stop = true;
            break;
          }
        }
      } catch (error) {
        logger.error({ error, page }, "> page %s => [failed]", page);
        continue;
      }
      if (stop) break;
    }
  }

  /**
   * Fetch links from a WordPress posts endpoint.
   * @param url - The posts endpoint URL
   * @returns The decoded posts, or an empty array when the payload is not an array
   */
  async fetchLinks(url: string) {
    const response = await this.http.get(url);
    const data = (await response.json()) as unknown;
    const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
    if (!Array.isArray(data)) {
      logger.warn({ type: typeof data }, "Unexpected WordPress payload type");
    }
    return articles;
  }

  /**
   * Fetch and process a single WordPress post.
   * @param input - Decoded JSON object or raw JSON string
   * @param dateRange - Optional date range for filtering
   * @returns The persisted article, or null when the post has no link or
   *          falls outside the date range
   * @throws When the input is a string that is not valid JSON, or does not
   *         decode to an object
   */
  async fetchOne(input: unknown, dateRange?: DateRange | null) {
    const { article } = await this.processPost(input, dateRange);
    return article;
  }

  /**
   * Decode, filter, enrich and persist one post, reporting why nothing was
   * saved so callers can tell "skip this one" from "stop crawling".
   * @param input - Decoded JSON object or raw JSON string
   * @param dateRange - Optional date range for filtering
   * @returns `article` is the persisted record or null; `outOfRange` is true
   *          only when the post was rejected by the date range
   */
  private async processPost(input: unknown, dateRange?: DateRange | null) {
    // input can be the decoded JSON object or a raw JSON string
    let data: WordPressPost | null = null;
    try {
      if (typeof input === "string") {
        data = JSON.parse(input) as WordPressPost;
      } else if (input && typeof input === "object") {
        data = input as WordPressPost;
      }
    } catch (error) {
      logger.error({ error }, "Failed to decode WordPress payload");
      throw error;
    }

    if (!data || typeof data !== "object") {
      throw new Error("Unexpected WordPress payload type");
    }

    const link = data.link;
    if (!link) {
      logger.error("Skipping WordPress article without link");
      return { article: null, outOfRange: false };
    }

    const titleHtml = data.title?.rendered ?? "";
    const bodyHtml = data.content?.rendered ?? "";
    const title = this.textContent(this.parseHtml(titleHtml)) ?? data.slug ?? "Untitled";
    const timestamp = this.computeTimestamp(data.date);

    // Date range skip as in the HTML crawler. Checked before the body
    // conversion and the category lookup so rejected posts cost no extra work.
    if (dateRange) {
      const { isTimestampInRange } = await import("@/utils");
      if (!isTimestampInRange(dateRange, timestamp)) {
        logger.info(
          { title, link, date: data.date, timestamp },
          "Skipping article outside date range",
        );
        return { article: null, outOfRange: true };
      }
    }

    const body = md.turndown(bodyHtml);
    const categories = await this.mapCategories(data.categories ?? []);

    const enriched = await this.enrichWithOpenGraph(
      {
        title,
        link,
        body,
        categories,
        source: this.source.sourceId,
        timestamp,
      },
      link,
    );

    const article = await persist(enriched, this.persistors);
    return { article, outOfRange: false };
  }

  /**
   * Get pagination info from WordPress API.
   *
   * Reads the total page and post counts from the response headers of a
   * minimal posts request.
   * @returns A range starting at page 1; the end falls back to 1 when the
   *          header is missing or invalid, or when the request fails
   */
  async getPagination(): Promise<PageRange> {
    try {
      const url = `${this.baseUrl()}wp-json/wp/v2/posts?_fields=id&per_page=100`;
      const response = await this.http.get(url);
      const pages = Number.parseInt(
        response.headers.get(WordPressCrawler.TOTAL_PAGES_HEADER) ?? "1",
        10,
      );
      const posts = Number.parseInt(
        response.headers.get(WordPressCrawler.TOTAL_POSTS_HEADER) ?? "0",
        10,
      );
      logger.info({ posts, pages }, "WordPress pagination");
      const end = Number.isFinite(pages) && pages > 0 ? pages : 1;
      return { start: 1, end };
    } catch {
      return { start: 1, end: 1 };
    }
  }

  /**
   * Get base URL for WordPress REST API.
   * @returns The source URL as a string, guaranteed to end with "/"
   */
  private baseUrl(): string {
    const base = String(this.source.sourceUrl);
    return base.endsWith("/") ? base : `${base}/`;
  }

  /**
   * Construct posts endpoint URL for a given page.
   * @param page - Page number
   * @returns The posts endpoint URL requesting 100 posts of that page
   */
  postsEndpoint(page: number): string {
    return `${this.baseUrl()}wp-json/wp/v2/posts?${WordPressCrawler.POST_QUERY}&page=${page}&per_page=100`;
  }

  /**
   * Fetch and cache WordPress categories.
   * Issues a single request with `per_page=100`, so only the categories in
   * that response are cached.
   */
  private async fetchCategories(): Promise<void> {
    const url = `${this.baseUrl()}wp-json/wp/v2/categories?${WordPressCrawler.CATEGORY_QUERY}`;
    const response = await this.http.get(url);
    const list = (await response.json()) as Array<{ id: number; slug: string }>;
    for (const c of list) {
      this.categoryMap.set(c.id, c.slug);
    }
  }

  /**
   * Map category IDs to slugs.
   *
   * Loads the category cache first while it is empty; a failed load is logged
   * and the ids then resolve to nothing.
   * @param ids - Category IDs
   * @returns De-duplicated slugs of the known ids, ordered by ascending id
   */
  private async mapCategories(ids: number[]): Promise<string[]> {
    if (this.categoryMap.size === 0) {
      try {
        await this.fetchCategories();
      } catch (error) {
        logger.warn({ error }, "Failed to fetch WordPress categories");
      }
    }
    const values: string[] = [];
    // Sort a copy so the caller's array is left untouched.
    for (const id of [...ids].sort((a, b) => a - b)) {
      const slug = this.categoryMap.get(id);
      if (slug && !values.includes(slug)) values.push(slug);
    }
    return values;
  }

  /**
   * Compute UNIX timestamp from WordPress date string.
   * @param raw - Raw date string
   * @returns Seconds since the Unix epoch; the current time when the date is
   *          missing or cannot be parsed
   */
  private computeTimestamp(raw?: string | null): number {
    if (!raw) return Math.floor(Date.now() / 1000);
    // Normalize WordPress Z into +00:00 for Date parsing robustness
    const cleaned = raw.replace("Z", "+00:00");
    const parsed = Date.parse(cleaned);
    if (!Number.isNaN(parsed)) return Math.floor(parsed / 1000);
    return Math.floor(Date.now() / 1000);
  }
}
|
||||||
@@ -0,0 +1,81 @@
|
|||||||
|
import fs from "node:fs";
|
||||||
|
import path from "node:path";
|
||||||
|
import { Article } from "@/schema";
|
||||||
|
import { countTokens } from "@/utils";
|
||||||
|
import logger from "@basango/logger";
|
||||||
|
|
||||||
|
/**
 * A sink for crawled articles.
 */
export interface Persistor {
  /** Store one article; may complete synchronously or asynchronously. */
  persist(record: Article): void | Promise<void>;
  /** Finish using the persistor; may complete synchronously or asynchronously. */
  close: () => void | Promise<void>;
}
|
||||||
|
|
||||||
|
/**
 * Settings for a file-backed persistor.
 */
export interface PersistorOptions {
  /** Directory that holds the output file. */
  directory: string;
  /** Source identifier, used as the base name of the output file. */
  sourceId: string;
  /** File-name suffix appended to the source identifier. */
  suffix?: string;
  /** Text encoding used when writing. */
  encoding?: BufferEncoding;
}
|
||||||
|
|
||||||
|
/**
 * Add token statistics to an article and hand it to every persistor.
 *
 * A persistor that throws is logged and does not prevent the remaining
 * persistors from running. The success message is logged only when no
 * persistor failed; otherwise a warning reports how many failed.
 * @param payload - The article to persist
 * @param persistors - Sinks to write the article to, tried in order
 * @returns The article with its `tokenStatistics` filled in
 */
export const persist = async (payload: Article, persistors: Persistor[]): Promise<Article> => {
  const article = {
    ...payload,
    tokenStatistics: {
      title: countTokens(payload.title),
      body: countTokens(payload.body),
      // the excerpt is the first 200 characters of the body
      excerpt: countTokens(payload.body.substring(0, 200)),
      categories: countTokens(payload.categories.join(",")),
    },
  } as Article;

  let failures = 0;
  for (const persistor of persistors) {
    try {
      await persistor.persist(article);
    } catch (error) {
      failures += 1;
      logger.error({ error }, "Failed to persist article record");
    }
  }

  if (failures === 0) {
    logger.info({ url: article.link }, "article successfully persisted");
  } else {
    logger.warn(
      { url: article.link, failures, total: persistors.length },
      "article persisted with failures",
    );
  }
  return article;
};
|
||||||
|
|
||||||
|
/**
 * Persistor that appends each article as one JSON document per line (JSONL)
 * to `<directory>/<sourceId><suffix>`.
 *
 * Writes are serialized through an internal promise chain so records land in
 * the file in the order `persist` was called.
 */
export class JsonlPersistor implements Persistor {
  private readonly filePath: string;
  private readonly encoding: BufferEncoding;
  // Tail of the write queue; never rejects so one failure cannot block later writes
  private pending: Promise<void> = Promise.resolve();
  private closed = false;

  /**
   * Create the target directory and output file when they do not exist yet.
   * @param options - Output location, file naming and encoding
   */
  constructor(options: PersistorOptions) {
    const suffix = options.suffix ?? ".jsonl";
    this.encoding = options.encoding ?? "utf-8";

    fs.mkdirSync(options.directory, { recursive: true });
    this.filePath = path.join(options.directory, `${options.sourceId}${suffix}`);

    if (!fs.existsSync(this.filePath)) {
      fs.writeFileSync(this.filePath, "", { encoding: this.encoding });
    }
  }

  /**
   * Queue a record to be appended to the output file.
   * @param record - The article to serialize
   * @returns A promise that settles once this record has been written;
   *          it rejects if the persistor is closed or the write fails
   */
  persist(record: Article): Promise<void> {
    if (this.closed) {
      return Promise.reject(new Error("Persistor has been closed"));
    }

    const payload = `${JSON.stringify(record)}\n`;

    // Append rather than overwrite: the previous `mode: "a"` option set a file
    // permission mode, not the open flag, so records were never appended.
    const write = this.pending.then(() => {
      fs.appendFileSync(this.filePath, payload, { encoding: this.encoding });
    });

    // The caller sees the failure through `write`; the queue itself keeps going.
    this.pending = write.catch(() => undefined);

    return write;
  }

  /**
   * Stop accepting new records and wait for queued writes to finish.
   */
  async close(): Promise<void> {
    this.closed = true;
    await this.pending;
  }
}
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
import { resolveSourceConfig } from "@/utils";
|
||||||
|
import {
|
||||||
|
closePersistors,
|
||||||
|
CrawlingOptions,
|
||||||
|
createPersistors,
|
||||||
|
resolveCrawlerConfig,
|
||||||
|
} from "@/process/crawler";
|
||||||
|
import logger from "@basango/logger";
|
||||||
|
import { WordPressCrawler } from "@/process/parsers/wordpress";
|
||||||
|
import { HtmlCrawler } from "@/process/parsers/html";
|
||||||
|
|
||||||
|
/**
 * Crawl one source in the current process.
 *
 * Resolves the source and crawler settings, picks the crawler matching the
 * source kind, runs it, and always closes the persistors afterwards.
 * @param options - Crawling options, including the source ID
 */
export const runSyncCrawl = async (options: CrawlingOptions): Promise<void> => {
  const source = resolveSourceConfig(options.sourceId);
  const settings = resolveCrawlerConfig(source, options);
  const persistors = createPersistors(source);

  // WordPress sources get the WordPress crawler; every other kind uses the HTML crawler
  const isWordPress = source.sourceKind === "wordpress";
  const crawler = isWordPress
    ? new WordPressCrawler(settings, { persistors })
    : new HtmlCrawler(settings, { persistors });

  try {
    await crawler.fetch();
  } finally {
    // Runs whether or not the crawl succeeded
    await closePersistors(persistors);
  }

  logger.info({ ...options }, "Synchronous crawl completed");
};
|
||||||
@@ -1,51 +1,8 @@
|
|||||||
import {z} from "zod";
|
import { z } from "zod";
|
||||||
import {createSourcesConfig, resolveProjectPaths} from "@/utils";
|
|
||||||
|
|
||||||
export const UpdateDirectionSchema = z.enum(["forward", "backward"]);
|
export const UpdateDirectionSchema = z.enum(["forward", "backward"]);
|
||||||
export const SourceKindSchema = z.enum(["wordpress", "html"]);
|
export const SourceKindSchema = z.enum(["wordpress", "html"]);
|
||||||
|
|
||||||
export const SourceDateSchema = z.object({
|
|
||||||
format: z.string().default("yyyy-LL-dd HH:mm"),
|
|
||||||
pattern: z.string().nullable().optional(),
|
|
||||||
replacement: z.string().nullable().optional(),
|
|
||||||
});
|
|
||||||
|
|
||||||
export const SourceSelectorsSchema = z.object({
|
|
||||||
articles: z.string().optional().nullable(),
|
|
||||||
article_title: z.string().optional().nullable(),
|
|
||||||
article_link: z.string().optional().nullable(),
|
|
||||||
article_body: z.string().optional().nullable(),
|
|
||||||
article_date: z.string().optional().nullable(),
|
|
||||||
article_categories: z.string().optional().nullable(),
|
|
||||||
pagination: z.string().default("ul.pagination > li a"),
|
|
||||||
});
|
|
||||||
|
|
||||||
const BaseSourceSchema = z.object({
|
|
||||||
source_id: z.string(),
|
|
||||||
source_url: z.url(),
|
|
||||||
source_date: SourceDateSchema.default(SourceDateSchema.parse({})),
|
|
||||||
source_kind: SourceKindSchema,
|
|
||||||
categories: z.array(z.string()).default([]),
|
|
||||||
supports_categories: z.boolean().default(false),
|
|
||||||
requires_details: z.boolean().default(false),
|
|
||||||
requires_rate_limit: z.boolean().default(false),
|
|
||||||
});
|
|
||||||
|
|
||||||
export const HtmlSourceConfigSchema = BaseSourceSchema.extend({
|
|
||||||
source_kind: z.literal("html"),
|
|
||||||
source_selectors: SourceSelectorsSchema.default(
|
|
||||||
SourceSelectorsSchema.parse({}),
|
|
||||||
),
|
|
||||||
pagination_template: z.string(),
|
|
||||||
});
|
|
||||||
|
|
||||||
export const WordPressSourceConfigSchema = BaseSourceSchema.extend({
|
|
||||||
source_kind: z.literal("wordpress"),
|
|
||||||
source_date: SourceDateSchema.default(
|
|
||||||
SourceDateSchema.parse({format: "yyyy-LL-dd'T'HH:mm:ss"}),
|
|
||||||
),
|
|
||||||
});
|
|
||||||
|
|
||||||
export const DateRangeSchema = z
|
export const DateRangeSchema = z
|
||||||
.object({
|
.object({
|
||||||
start: z.number().int(),
|
start: z.number().int(),
|
||||||
@@ -96,102 +53,79 @@ export const DateRangeSpecSchema = z
|
|||||||
.regex(/.+:.+/, "Expected start:end format")
|
.regex(/.+:.+/, "Expected start:end format")
|
||||||
.transform((spec) => {
|
.transform((spec) => {
|
||||||
const [startRaw, endRaw] = spec.split(":");
|
const [startRaw, endRaw] = spec.split(":");
|
||||||
return {startRaw: String(startRaw), endRaw: String(endRaw)};
|
return { startRaw: String(startRaw), endRaw: String(endRaw) };
|
||||||
});
|
});
|
||||||
|
|
||||||
export const ProjectPathsSchema = z.object({
|
export const SourceDateSchema = z.object({
|
||||||
root: z.string(),
|
format: z.string().default("yyyy-LL-dd HH:mm"),
|
||||||
data: z.string(),
|
pattern: z.string().nullable().optional(),
|
||||||
logs: z.string(),
|
replacement: z.string().nullable().optional(),
|
||||||
configs: z.string(),
|
|
||||||
});
|
});
|
||||||
|
|
||||||
export const LoggingConfigSchema = z.object({
|
const BaseSourceSchema = z.object({
|
||||||
level: z.string().default("INFO"),
|
sourceId: z.string(),
|
||||||
format: z
|
sourceUrl: z.url(),
|
||||||
.string()
|
sourceDate: SourceDateSchema,
|
||||||
.default("%(asctime)s - %(name)s - %(levelname)s - %(message)s"),
|
sourceKind: SourceKindSchema,
|
||||||
console_logging: z.boolean().default(true),
|
categories: z.array(z.string()).default([]),
|
||||||
file_logging: z.boolean().default(false),
|
supportsCategories: z.boolean().default(false),
|
||||||
log_file: z.string().default("crawler.log"),
|
requiresDetails: z.boolean().default(false),
|
||||||
max_log_size: z
|
requiresRateLimit: z.boolean().default(false),
|
||||||
.number()
|
|
||||||
.int()
|
|
||||||
.positive()
|
|
||||||
.default(10 * 1024 * 1024),
|
|
||||||
backup_count: z.number().int().nonnegative().default(5),
|
|
||||||
});
|
});
|
||||||
|
|
||||||
export const ClientConfigSchema = z.object({
|
export const HtmlSourceConfigSchema = BaseSourceSchema.extend({
|
||||||
timeout: z.number().positive().default(20),
|
sourceKind: z.literal("html"),
|
||||||
user_agent: z
|
sourceSelectors: z.object({
|
||||||
.string()
|
articles: z.string(),
|
||||||
.default("Basango/0.1 (+https://github.com/bernard-ng/basango)"),
|
articleTitle: z.string(),
|
||||||
follow_redirects: z.boolean().default(true),
|
articleLink: z.string(),
|
||||||
verify_ssl: z.boolean().default(true),
|
articleBody: z.string(),
|
||||||
rotate: z.boolean().default(true),
|
articleDate: z.string(),
|
||||||
max_retries: z.number().int().nonnegative().default(3),
|
articleCategories: z.string().optional(),
|
||||||
backoff_initial: z.number().nonnegative().default(1),
|
pagination: z.string().default("ul.pagination > li a"),
|
||||||
backoff_multiplier: z.number().positive().default(2),
|
}),
|
||||||
backoff_max: z.number().nonnegative().default(30),
|
paginationTemplate: z.string(),
|
||||||
respect_retry_after: z.boolean().default(true),
|
|
||||||
});
|
});
|
||||||
|
|
||||||
export const CrawlerConfigSchema = z.object({
|
export const WordPressSourceConfigSchema = BaseSourceSchema.extend({
|
||||||
source: z
|
sourceKind: z.literal("wordpress"),
|
||||||
.union([HtmlSourceConfigSchema, WordPressSourceConfigSchema])
|
sourceDate: SourceDateSchema.default(SourceDateSchema.parse({ format: "yyyy-LL-dd'T'HH:mm:ss" })),
|
||||||
.optional(),
|
|
||||||
page_range: PageRangeSchema.optional(),
|
|
||||||
date_range: DateRangeSchema.optional(),
|
|
||||||
category: z.string().optional(),
|
|
||||||
notify: z.boolean().default(false),
|
|
||||||
is_update: z.boolean().default(false),
|
|
||||||
use_multi_threading: z.boolean().default(false),
|
|
||||||
max_workers: z.number().int().positive().default(5),
|
|
||||||
direction: UpdateDirectionSchema.default("forward"),
|
|
||||||
});
|
});
|
||||||
|
|
||||||
export const FetchConfigSchema = z.object({
|
export const ArticleMetadataSchema = z.object({
|
||||||
client: ClientConfigSchema.default(ClientConfigSchema.parse({})),
|
title: z.string().optional(),
|
||||||
crawler: CrawlerConfigSchema.default(CrawlerConfigSchema.parse({})),
|
description: z.string().optional(),
|
||||||
|
image: z.string().optional(),
|
||||||
|
url: z.url().optional(),
|
||||||
});
|
});
|
||||||
|
|
||||||
export const SourcesConfigSchema = z.object({
|
export const ArticleTokenStatisticsSchema = z.object({
|
||||||
html: z.array(HtmlSourceConfigSchema).default([]),
|
title: z.number().int().nonnegative().default(0),
|
||||||
wordpress: z.array(WordPressSourceConfigSchema).default([]),
|
body: z.number().int().nonnegative().default(0),
|
||||||
|
excerpt: z.number().int().nonnegative().default(0),
|
||||||
|
categories: z.number().int().nonnegative().default(0),
|
||||||
});
|
});
|
||||||
|
|
||||||
export const PipelineConfigSchema = z.object({
|
export const ArticleSchema = z.object({
|
||||||
paths: ProjectPathsSchema.default(resolveProjectPaths(process.cwd())),
|
title: z.string(),
|
||||||
logging: LoggingConfigSchema.default(LoggingConfigSchema.parse({})),
|
link: z.url(),
|
||||||
fetch: FetchConfigSchema.default(FetchConfigSchema.parse({})),
|
body: z.string(),
|
||||||
sources: z
|
categories: z.array(z.string()).default([]),
|
||||||
.union([SourcesConfigSchema, z.undefined()])
|
source: z.string(),
|
||||||
.transform((value) => createSourcesConfig(value ?? {})),
|
timestamp: z.number().int(),
|
||||||
|
metadata: ArticleMetadataSchema.optional(),
|
||||||
|
tokenStatistics: ArticleTokenStatisticsSchema.optional(),
|
||||||
});
|
});
|
||||||
|
|
||||||
export type UpdateDirection = z.infer<typeof UpdateDirectionSchema>;
|
export type ArticleMetadata = z.infer<typeof ArticleMetadataSchema>;
|
||||||
export type SourceKind = z.infer<typeof SourceKindSchema>;
|
export type Article = z.infer<typeof ArticleSchema>;
|
||||||
export type SourceDate = z.infer<typeof SourceDateSchema>;
|
export type DateRange = z.infer<typeof DateRangeSchema>;
|
||||||
export type SourceSelectors = z.infer<typeof SourceSelectorsSchema>;
|
export type PageRange = z.infer<typeof PageRangeSchema>;
|
||||||
export type HtmlSourceConfig = z.infer<typeof HtmlSourceConfigSchema>;
|
export type HtmlSourceConfig = z.infer<typeof HtmlSourceConfigSchema>;
|
||||||
export type WordPressSourceConfig = z.infer<typeof WordPressSourceConfigSchema>;
|
export type WordPressSourceConfig = z.infer<typeof WordPressSourceConfigSchema>;
|
||||||
export type AnySourceConfig = HtmlSourceConfig | WordPressSourceConfig;
|
export type AnySourceConfig = HtmlSourceConfig | WordPressSourceConfig;
|
||||||
export type DateRange = z.infer<typeof DateRangeSchema>;
|
|
||||||
export type PageRange = z.infer<typeof PageRangeSchema>;
|
|
||||||
|
|
||||||
export interface CreateDateRangeOptions {
|
export interface CreateDateRangeOptions {
|
||||||
format?: string;
|
format?: string;
|
||||||
separator?: string;
|
separator?: string;
|
||||||
}
|
}
|
||||||
export type SourcesConfig = z.infer<typeof SourcesConfigSchema> & {
|
|
||||||
find: (sourceId: string) => AnySourceConfig | undefined;
|
|
||||||
};
|
|
||||||
export type ProjectPaths = z.infer<typeof ProjectPathsSchema>;
|
|
||||||
export type LoggingConfig = z.infer<typeof LoggingConfigSchema>;
|
|
||||||
export type ClientConfig = z.infer<typeof ClientConfigSchema>;
|
|
||||||
export type CrawlerConfig = z.infer<typeof CrawlerConfigSchema> & {
|
|
||||||
source?: AnySourceConfig;
|
|
||||||
};
|
|
||||||
export type FetchConfig = z.infer<typeof FetchConfigSchema>;
|
|
||||||
export type PipelineConfig = z.infer<typeof PipelineConfigSchema>;
|
|
||||||
|
|||||||
@@ -0,0 +1,22 @@
|
|||||||
|
import { logger } from "@basango/logger";
|
||||||
|
import { runSyncCrawl } from "@/process/sync/tasks";
|
||||||
|
import { parseCrawlingCliArgs, CRAWLING_USAGE } from "@/scripts/utils";
|
||||||
|
|
||||||
|
/**
 * Entry point of the synchronous crawl script: parse the CLI arguments,
 * print the usage text when no source is given, otherwise run the crawl.
 * Failures are logged and reported through a non-zero exit code.
 */
const main = async (): Promise<void> => {
  const cliOptions = parseCrawlingCliArgs();

  // Without a source there is nothing to crawl
  const sourceMissing = cliOptions.sourceId === undefined;
  if (sourceMissing) {
    console.log(CRAWLING_USAGE);
    process.exitCode = 1;
    return;
  }

  try {
    await runSyncCrawl({ ...cliOptions });
  } catch (error) {
    logger.error({ error }, "Synchronous crawl failed");
    process.exitCode = 1;
  }
};
|
||||||
|
|
||||||
|
void main();
|
||||||
@@ -1,78 +1,20 @@
|
|||||||
import { parseArgs } from "node:util";
|
|
||||||
|
|
||||||
import { logger } from "@basango/logger";
|
import { logger } from "@basango/logger";
|
||||||
import { PipelineConfigManager } from "@/config";
|
|
||||||
import { createQueueSettings } from "@/process/async/queue";
|
|
||||||
import { scheduleAsyncCrawl } from "@/process/async/tasks";
|
import { scheduleAsyncCrawl } from "@/process/async/tasks";
|
||||||
|
import { parseCrawlingCliArgs, CRAWLING_USAGE } from "@/scripts/utils";
|
||||||
interface QueueCliOptions {
|
|
||||||
source?: string;
|
|
||||||
env: string;
|
|
||||||
page?: string;
|
|
||||||
date?: string;
|
|
||||||
category?: string;
|
|
||||||
"redis-url"?: string;
|
|
||||||
help?: boolean;
|
|
||||||
}
|
|
||||||
|
|
||||||
const usage = `
|
|
||||||
Usage: bun run src/scripts/queue -- --source <id> [options]
|
|
||||||
|
|
||||||
Options:
|
|
||||||
--page <range> Optional page range filter (e.g. 1:5)
|
|
||||||
--date <range> Optional date range filter (e.g. 2024-01-01:2024-01-31)
|
|
||||||
--category <slug> Optional category to crawl
|
|
||||||
--redis-url <url> Override Redis connection URL
|
|
||||||
--env <env> Environment to load (default: development)
|
|
||||||
-h, --help Show this message
|
|
||||||
`;
|
|
||||||
|
|
||||||
const parseCliArgs = (): QueueCliOptions => {
|
|
||||||
const { values } = parseArgs({
|
|
||||||
options: {
|
|
||||||
source: { type: "string" },
|
|
||||||
page: { type: "string" },
|
|
||||||
date: { type: "string" },
|
|
||||||
category: { type: "string" },
|
|
||||||
"redis-url": { type: "string" },
|
|
||||||
env: { type: "string", default: "development" },
|
|
||||||
help: { type: "boolean", short: "h" },
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
return values as QueueCliOptions;
|
|
||||||
};
|
|
||||||
|
|
||||||
const main = async (): Promise<void> => {
|
const main = async (): Promise<void> => {
|
||||||
const options = parseCliArgs();
|
const options = parseCrawlingCliArgs();
|
||||||
|
|
||||||
if (options.help || !options.source) {
|
if (options.sourceId === undefined) {
|
||||||
console.log(usage);
|
console.log(CRAWLING_USAGE);
|
||||||
if (!options.source) {
|
process.exitCode = 1;
|
||||||
process.exitCode = 1;
|
|
||||||
}
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const env = options.env ?? "development";
|
|
||||||
const manager = new PipelineConfigManager({ env });
|
|
||||||
manager.setupLogging(manager.get(env));
|
|
||||||
|
|
||||||
const settings = options["redis-url"]
|
|
||||||
? createQueueSettings({ redis_url: options["redis-url"] })
|
|
||||||
: undefined;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const id = await scheduleAsyncCrawl({
|
const id = await scheduleAsyncCrawl({ ...options });
|
||||||
sourceId: options.source,
|
|
||||||
env,
|
|
||||||
pageRange: options.page ?? null,
|
|
||||||
dateRange: options.date ?? null,
|
|
||||||
category: options.category ?? null,
|
|
||||||
settings,
|
|
||||||
});
|
|
||||||
|
|
||||||
logger.info({ id, ...options }, "Scheduled asynchronous crawl job");
|
logger.info({ id, options }, "Scheduled asynchronous crawl job");
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error({ error }, "Failed to schedule crawl job");
|
logger.error({ error }, "Failed to schedule crawl job");
|
||||||
process.exitCode = 1;
|
process.exitCode = 1;
|
||||||
|
|||||||
@@ -0,0 +1,39 @@
|
|||||||
|
import { parseArgs } from "node:util";
|
||||||
|
import { CrawlingOptions } from "@/process/crawler";
|
||||||
|
|
||||||
|
/**
 * Command-line options accepted by the worker script.
 */
interface WorkerCliOptions {
  /** Queue names given via `-q` / `--queue`; absent when the flag is not passed. */
  queue?: string[];
}
|
||||||
|
|
||||||
|
export const CRAWLING_USAGE = `
|
||||||
|
Usage: bun run crawl:[async|sync] -- --sourceId <id> [options]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--page <range> Optional page range filter (e.g. 1:5)
|
||||||
|
--date <range> Optional date range filter (e.g. 2024-01-01:2024-01-31)
|
||||||
|
--category <slug> Optional category to crawl
|
||||||
|
-h, --help Show this message
|
||||||
|
`;
|
||||||
|
|
||||||
|
/**
 * Parse the worker script's command-line flags from the process arguments.
 * `-q` / `--queue` may be repeated to name several queues.
 * @returns The parsed worker options
 */
export const parseWorkerCliArgs = (): WorkerCliOptions => {
  const parsed = parseArgs({
    options: {
      queue: { short: "q", type: "string", multiple: true },
    },
  });

  return parsed.values as WorkerCliOptions;
};
|
||||||
|
|
||||||
|
/**
 * Parse the crawl scripts' command-line flags from the process arguments.
 *
 * `-h` / `--help` is accepted because the usage text advertises it; without
 * the declaration, strict argument parsing throws on the flag. When help is
 * requested the returned options carry no `sourceId`, which is the condition
 * the calling scripts already use to print the usage text.
 * @returns The parsed crawling options
 */
export const parseCrawlingCliArgs = (): CrawlingOptions => {
  const { values } = parseArgs({
    options: {
      sourceId: { type: "string" },
      page: { type: "string" },
      date: { type: "string" },
      category: { type: "string" },
      help: { type: "boolean", short: "h" },
    },
  });

  // Keep the help flag out of the options handed to the crawl tasks
  const { help, ...crawling } = values;
  if (help) {
    return { ...crawling, sourceId: undefined } as unknown as CrawlingOptions;
  }

  return crawling as CrawlingOptions;
};
|
||||||
@@ -1,93 +1,20 @@
|
|||||||
import { parseArgs } from "node:util";
|
|
||||||
|
|
||||||
import { logger } from "@basango/logger";
|
import { logger } from "@basango/logger";
|
||||||
|
|
||||||
import { PipelineConfigManager } from "@/config";
|
import { createQueueManager } from "@/process/async/queue";
|
||||||
import { createQueueManager, createQueueSettings } from "@/process/async/queue";
|
|
||||||
import { startWorker } from "@/process/async/worker";
|
import { startWorker } from "@/process/async/worker";
|
||||||
|
import { parseWorkerCliArgs } from "@/scripts/utils";
|
||||||
interface WorkerCliOptions {
|
|
||||||
env: string;
|
|
||||||
queue?: string[];
|
|
||||||
concurrency?: string;
|
|
||||||
"redis-url"?: string;
|
|
||||||
help?: boolean;
|
|
||||||
}
|
|
||||||
|
|
||||||
const usage = `
|
|
||||||
Usage: bun run src/scripts/worker [options]
|
|
||||||
|
|
||||||
Options:
|
|
||||||
--env <env> Environment to load (default: development)
|
|
||||||
-q, --queue <name> Queue name to listen on (repeatable)
|
|
||||||
--concurrency <number> Number of concurrent jobs per worker
|
|
||||||
--redis-url <url> Override Redis connection URL
|
|
||||||
-h, --help Show this message
|
|
||||||
`;
|
|
||||||
|
|
||||||
const parseCliArgs = (): WorkerCliOptions => {
|
|
||||||
const { values } = parseArgs({
|
|
||||||
options: {
|
|
||||||
env: { type: "string", default: "development" },
|
|
||||||
queue: { type: "string", multiple: true, short: "q" },
|
|
||||||
concurrency: { type: "string" },
|
|
||||||
"redis-url": { type: "string" },
|
|
||||||
help: { type: "boolean", short: "h" },
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
return values as WorkerCliOptions;
|
|
||||||
};
|
|
||||||
|
|
||||||
const parseConcurrency = (value?: string): number | undefined => {
|
|
||||||
if (!value) {
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
|
|
||||||
const parsed = Number.parseInt(value, 10);
|
|
||||||
if (Number.isNaN(parsed) || parsed <= 0) {
|
|
||||||
throw new Error(`Invalid concurrency value: ${value}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
return parsed;
|
|
||||||
};
|
|
||||||
|
|
||||||
const main = async (): Promise<void> => {
|
const main = async (): Promise<void> => {
|
||||||
const options = parseCliArgs();
|
const options = parseWorkerCliArgs();
|
||||||
|
|
||||||
if (options.help) {
|
const manager = createQueueManager();
|
||||||
console.log(usage);
|
const queues = options.queue?.length
|
||||||
return;
|
? options.queue.map((name) => manager.queueName(name))
|
||||||
}
|
|
||||||
|
|
||||||
const env = options.env ?? "development";
|
|
||||||
const manager = new PipelineConfigManager({ env });
|
|
||||||
manager.setupLogging(manager.get(env));
|
|
||||||
|
|
||||||
let concurrency: number | undefined;
|
|
||||||
try {
|
|
||||||
concurrency = parseConcurrency(options.concurrency);
|
|
||||||
} catch (error) {
|
|
||||||
logger.error(
|
|
||||||
error instanceof Error ? error : { error },
|
|
||||||
"Invalid concurrency value provided",
|
|
||||||
);
|
|
||||||
process.exitCode = 1;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const settings = options["redis-url"]
|
|
||||||
? createQueueSettings({ redis_url: options["redis-url"] })
|
|
||||||
: undefined;
|
|
||||||
const queueManager = createQueueManager({ settings });
|
|
||||||
|
|
||||||
const queueNames = options.queue?.length
|
|
||||||
? options.queue.map((name) => queueManager.queueName(name))
|
|
||||||
: undefined;
|
: undefined;
|
||||||
|
|
||||||
const handle = startWorker({
|
const handle = startWorker({
|
||||||
queueManager,
|
queueManager: manager,
|
||||||
queueNames,
|
queueNames: queues,
|
||||||
concurrency,
|
|
||||||
});
|
});
|
||||||
|
|
||||||
const shutdown = async (signal: NodeJS.Signals) => {
|
const shutdown = async (signal: NodeJS.Signals) => {
|
||||||
@@ -95,26 +22,14 @@ const main = async (): Promise<void> => {
|
|||||||
try {
|
try {
|
||||||
await handle.close();
|
await handle.close();
|
||||||
} finally {
|
} finally {
|
||||||
await queueManager.close();
|
await manager.close();
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
process.once("SIGINT", (signal) => {
|
process.once("SIGINT", (signal) => void shutdown(signal));
|
||||||
void shutdown(signal);
|
process.once("SIGTERM", (signal) => void shutdown(signal));
|
||||||
});
|
logger.info({ queueNames: queues }, "Crawler workers started");
|
||||||
process.once("SIGTERM", (signal) => {
|
|
||||||
void shutdown(signal);
|
|
||||||
});
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
{
|
|
||||||
env,
|
|
||||||
queueNames: queueNames ?? queueManager.iterQueueNames(),
|
|
||||||
concurrency: concurrency ?? "default",
|
|
||||||
},
|
|
||||||
"Crawler workers started",
|
|
||||||
);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
void main();
|
void main();
|
||||||
|
|||||||
@@ -1,32 +1,40 @@
|
|||||||
import fs from "node:fs";
|
|
||||||
import path from "node:path";
|
|
||||||
|
|
||||||
import type { RedisOptions } from "ioredis";
|
import type { RedisOptions } from "ioredis";
|
||||||
import { get_encoding, TiktokenEncoding } from "tiktoken";
|
import { get_encoding, TiktokenEncoding } from "tiktoken";
|
||||||
import { format, getUnixTime, isMatch, parse } from "date-fns";
|
import { format, getUnixTime, isMatch, parse } from "date-fns";
|
||||||
import { z } from "zod";
|
|
||||||
|
|
||||||
import {
|
import {
|
||||||
|
AnySourceConfig,
|
||||||
CreateDateRangeOptions,
|
CreateDateRangeOptions,
|
||||||
DateRange,
|
DateRange,
|
||||||
DateRangeSchema,
|
DateRangeSchema,
|
||||||
DateRangeSpecSchema,
|
DateRangeSpecSchema,
|
||||||
PipelineConfig,
|
PageRange,
|
||||||
ProjectPaths,
|
PageRangeSchema,
|
||||||
ProjectPathsSchema,
|
PageRangeSpecSchema,
|
||||||
SourcesConfig,
|
|
||||||
SourcesConfigSchema,
|
|
||||||
} from "@/schema";
|
} from "@/schema";
|
||||||
import { DEFAULT_DATE_FORMAT } from "@/constants";
|
import { DEFAULT_DATE_FORMAT } from "@/constants";
|
||||||
|
import { config } from "@/config";
|
||||||
|
|
||||||
export const ensureDirectories = (paths: ProjectPaths): void => {
|
/**
|
||||||
for (const dir of [paths.data, paths.logs, paths.configs]) {
|
* Resolve a source configuration by its ID.
|
||||||
if (!fs.existsSync(dir)) {
|
* @param id - The source ID
|
||||||
fs.mkdirSync(dir, { recursive: true });
|
*/
|
||||||
}
|
export const resolveSourceConfig = (id: string): AnySourceConfig => {
|
||||||
|
const source =
|
||||||
|
config.sources.html.find((s) => s.sourceId === id) ||
|
||||||
|
config.sources.wordpress.find((s) => s.sourceId === id);
|
||||||
|
|
||||||
|
if (source === undefined) {
|
||||||
|
throw new Error(`Source '${id}' not found in configuration`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return source;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse a Redis URL into RedisOptions.
|
||||||
|
* @param url - The Redis URL (e.g., "redis://:password@localhost:6379/0")
|
||||||
|
*/
|
||||||
export const parseRedisUrl = (url: string): RedisOptions => {
|
export const parseRedisUrl = (url: string): RedisOptions => {
|
||||||
if (!url.startsWith("redis://")) {
|
if (!url.startsWith("redis://")) {
|
||||||
return {};
|
return {};
|
||||||
@@ -40,20 +48,11 @@ export const parseRedisUrl = (url: string): RedisOptions => {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
export const countTokens = (
|
/**
|
||||||
text: string,
|
* Parse a date string using the specified format.
|
||||||
encoding: TiktokenEncoding = "cl100k_base",
|
* @param value - The date string to parse
|
||||||
): number => {
|
* @param format - The date format
|
||||||
try {
|
*/
|
||||||
const encoder = get_encoding(encoding);
|
|
||||||
const tokens = encoder.encode(text);
|
|
||||||
encoder.free();
|
|
||||||
return tokens.length;
|
|
||||||
} catch {
|
|
||||||
return text.length;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
const parseDate = (value: string, format: string): Date => {
|
const parseDate = (value: string, format: string): Date => {
|
||||||
if (!isMatch(value, format)) {
|
if (!isMatch(value, format)) {
|
||||||
throw new Error(`Invalid date '${value}' for format '${format}'`);
|
throw new Error(`Invalid date '${value}' for format '${format}'`);
|
||||||
@@ -65,10 +64,42 @@ const parseDate = (value: string, format: string): Date => {
|
|||||||
return parsed;
|
return parsed;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Count the number of tokens in the given text using the specified encoding.
 *
 * The encoder is always released, even when encoding throws. If the encoder
 * cannot be created or used, the text's character length is returned instead.
 * @param text - The input text
 * @param encoding - The token encoding (default: "cl100k_base")
 * @returns The token count, or `text.length` as a fallback
 */
export const countTokens = (text: string, encoding: TiktokenEncoding = "cl100k_base"): number => {
  try {
    const encoder = get_encoding(encoding);
    try {
      return encoder.encode(text).length;
    } finally {
      // Release the encoder on both the success and the failure path
      encoder.free();
    }
  } catch {
    return text.length;
  }
};
|
||||||
|
|
||||||
|
/**
 * Build a page range from its textual specification.
 * @param spec - The page range specification (e.g., "1:10")
 * @returns The parsed range, or `undefined` when no specification is given
 */
export const createPageRange = (spec: string | undefined): PageRange | undefined => {
  if (!spec) {
    return undefined;
  }

  // Parse the raw string first, then pass the result through PageRangeSchema
  return PageRangeSchema.parse(PageRangeSpecSchema.parse(spec));
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a date range from a string specification.
|
||||||
|
* @param spec - The date range specification (e.g., "2023-01-01:2023-12-31")
|
||||||
|
* @param options - Options for date range creation
|
||||||
|
*/
|
||||||
export const createDateRange = (
|
export const createDateRange = (
|
||||||
spec: string,
|
spec: string | undefined,
|
||||||
options: CreateDateRangeOptions = {},
|
options: CreateDateRangeOptions = {},
|
||||||
): DateRange => {
|
): DateRange | undefined => {
|
||||||
|
if (!spec) return undefined;
|
||||||
const { format = DEFAULT_DATE_FORMAT, separator = ":" } = options;
|
const { format = DEFAULT_DATE_FORMAT, separator = ":" } = options;
|
||||||
if (!separator) {
|
if (!separator) {
|
||||||
throw new Error("Separator cannot be empty");
|
throw new Error("Separator cannot be empty");
|
||||||
@@ -88,95 +119,44 @@ export const createDateRange = (
|
|||||||
return DateRangeSchema.parse(range);
|
return DateRangeSchema.parse(range);
|
||||||
};
|
};
|
||||||
|
|
||||||
export const formatDateRange = (
|
/**
|
||||||
range: DateRange,
|
* Format a date range into a string representation.
|
||||||
fmt = DEFAULT_DATE_FORMAT,
|
* @param range - The date range
|
||||||
): string => {
|
* @param fmt - The date format (default: DEFAULT_DATE_FORMAT)
|
||||||
|
*/
|
||||||
|
export const formatDateRange = (range: DateRange, fmt = DEFAULT_DATE_FORMAT): string => {
|
||||||
const start = format(new Date(range.start * 1000), fmt);
|
const start = format(new Date(range.start * 1000), fmt);
|
||||||
const end = format(new Date(range.end * 1000), fmt);
|
const end = format(new Date(range.end * 1000), fmt);
|
||||||
return `${start}:${end}`;
|
return `${start}:${end}`;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const isTimestampInRange = (
|
/**
|
||||||
range: DateRange,
|
* Format a page range into a string representation.
|
||||||
timestamp: number,
|
* @param range - The page range
|
||||||
): boolean => {
|
*/
|
||||||
|
export const formatPageRange = (range: PageRange): string => {
  // Rendered as "<start>:<end>"
  const { start, end } = range;
  return `${start}:${end}`;
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if a timestamp is within a given date range.
|
||||||
|
* @param range - The date range
|
||||||
|
* @param timestamp - The timestamp to check
|
||||||
|
*/
|
||||||
|
export const isTimestampInRange = (range: DateRange, timestamp: number): boolean => {
|
||||||
return range.start <= timestamp && timestamp <= range.end;
|
return range.start <= timestamp && timestamp <= range.end;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const resolveProjectPaths = (rootDir: string): ProjectPaths => {
|
/**
|
||||||
return ProjectPathsSchema.parse({
|
* Convert a relative URL to an absolute URL based on the base URL.
|
||||||
root: rootDir,
|
* @param base - The base URL
|
||||||
data: path.join(rootDir, "data", "dataset"),
|
* @param href - The relative or absolute URL
|
||||||
logs: path.join(rootDir, "data", "logs"),
|
*/
|
||||||
configs: path.join(rootDir, "config"),
|
export const createAbsoluteUrl = (base: string, href: string): string => {
|
||||||
});
|
try {
|
||||||
};
|
// new URL handles relative paths with base
|
||||||
|
return new URL(href, base.endsWith("/") ? base : `${base}/`).toString();
|
||||||
export const createSourcesConfig = (input: unknown): SourcesConfig => {
|
} catch {
|
||||||
const parsed = SourcesConfigSchema.parse(input);
|
return href;
|
||||||
const resolver = (sourceId: string) =>
|
|
||||||
[...parsed.html, ...parsed.wordpress].find(
|
|
||||||
(source) => source.source_id === sourceId,
|
|
||||||
);
|
|
||||||
return Object.assign({ find: resolver }, parsed);
|
|
||||||
};
|
|
||||||
|
|
||||||
export const mergePipelineConfig = (
|
|
||||||
base: PipelineConfig,
|
|
||||||
overrides: Partial<PipelineConfig>,
|
|
||||||
): PipelineConfig => {
|
|
||||||
const paths = overrides.paths ?? base.paths;
|
|
||||||
const logging = { ...base.logging, ...(overrides.logging ?? {}) };
|
|
||||||
const fetch = {
|
|
||||||
client: { ...base.fetch.client, ...(overrides.fetch?.client ?? {}) },
|
|
||||||
crawler: { ...base.fetch.crawler, ...(overrides.fetch?.crawler ?? {}) },
|
|
||||||
};
|
|
||||||
|
|
||||||
const sources = createSourcesConfig({
|
|
||||||
html: overrides.sources?.html ?? base.sources.html,
|
|
||||||
wordpress: overrides.sources?.wordpress ?? base.sources.wordpress,
|
|
||||||
});
|
|
||||||
|
|
||||||
return {
|
|
||||||
paths,
|
|
||||||
logging,
|
|
||||||
fetch,
|
|
||||||
sources,
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
export const resolveConfigPath = (basePath: string, env?: string): string => {
|
|
||||||
if (!env || env === "development") {
|
|
||||||
return basePath;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const ext = path.extname(basePath);
|
|
||||||
const withoutExt = basePath.slice(0, basePath.length - ext.length);
|
|
||||||
return `${withoutExt}.${env}${ext}`;
|
|
||||||
};
|
|
||||||
export const schemaToJSON = <T extends z.ZodTypeAny>(schema: T): unknown => {
|
|
||||||
const toJSONSchema = (z as any).toJSONSchema as
|
|
||||||
| ((s: z.ZodTypeAny, opts?: Record<string, unknown>) => unknown)
|
|
||||||
| undefined;
|
|
||||||
|
|
||||||
if (typeof toJSONSchema === "function") {
|
|
||||||
try {
|
|
||||||
// target can be "draft-2020-12" | "draft-7" | "draft-4" | "openapi-3.0"
|
|
||||||
return toJSONSchema(schema, {
|
|
||||||
target: "draft-2020-12",
|
|
||||||
unrepresentable: "any",
|
|
||||||
});
|
|
||||||
} catch {
|
|
||||||
// fall through to minimal mapping
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (schema instanceof z.ZodObject) return { type: "object" };
|
|
||||||
if (schema instanceof z.ZodArray) return { type: "array" };
|
|
||||||
if (schema instanceof z.ZodString) return { type: "string" };
|
|
||||||
if (schema instanceof z.ZodNumber) return { type: "number" };
|
|
||||||
if (schema instanceof z.ZodBoolean) return { type: "boolean" };
|
|
||||||
|
|
||||||
return { type: "unknown" };
|
|
||||||
};
|
};
|
||||||
|
|||||||
+16
-3
@@ -10,12 +10,22 @@
|
|||||||
},
|
},
|
||||||
"formatter": {
|
"formatter": {
|
||||||
"enabled": true,
|
"enabled": true,
|
||||||
"indentStyle": "space"
|
"indentStyle": "space",
|
||||||
|
"indentWidth": 2,
|
||||||
|
"lineEnding": "lf",
|
||||||
|
"lineWidth": 100
|
||||||
},
|
},
|
||||||
"linter": {
|
"linter": {
|
||||||
"enabled": true,
|
"enabled": true,
|
||||||
"rules": {
|
"rules": {
|
||||||
"recommended": true
|
"recommended": true,
|
||||||
|
"style": {
|
||||||
|
"useImportType": "off"
|
||||||
|
},
|
||||||
|
"correctness": {
|
||||||
|
"noUnusedImports": "on",
|
||||||
|
"useImportExtensions": "off"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"javascript": {
|
"javascript": {
|
||||||
@@ -27,7 +37,10 @@
|
|||||||
"enabled": true,
|
"enabled": true,
|
||||||
"actions": {
|
"actions": {
|
||||||
"source": {
|
"source": {
|
||||||
"organizeImports": "on"
|
"organizeImports": "on",
|
||||||
|
"useSortedKeys": "on",
|
||||||
|
"useSortedAttributes": "on",
|
||||||
|
"useSortedProperties": "on"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
+34
-3
@@ -13,13 +13,15 @@
|
|||||||
},
|
},
|
||||||
"apps/crawler": {
|
"apps/crawler": {
|
||||||
"name": "@basango/crawler",
|
"name": "@basango/crawler",
|
||||||
"version": "0.1.0",
|
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@basango/logger": "workspace:*",
|
"@basango/logger": "workspace:*",
|
||||||
|
"@devscast/config": "^1.0.2",
|
||||||
"bullmq": "^4.17.0",
|
"bullmq": "^4.17.0",
|
||||||
"date-fns": "^3.6.0",
|
"date-fns": "catalog:",
|
||||||
"ioredis": "^5.3.2",
|
"ioredis": "^5.3.2",
|
||||||
|
"node-html-parser": "^7.0.1",
|
||||||
"tiktoken": "^1.0.14",
|
"tiktoken": "^1.0.14",
|
||||||
|
"turndown": "^7.2.2",
|
||||||
"zod": "catalog:",
|
"zod": "catalog:",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@@ -33,7 +35,7 @@
|
|||||||
"snakecase-keys": "^9.0.2",
|
"snakecase-keys": "^9.0.2",
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/bun": "^1.3.1",
|
"@types/bun": "catalog:",
|
||||||
"@types/pg": "^8.15.6",
|
"@types/pg": "^8.15.6",
|
||||||
"drizzle-kit": "^0.31.6",
|
"drizzle-kit": "^0.31.6",
|
||||||
"typescript": "catalog:",
|
"typescript": "catalog:",
|
||||||
@@ -56,6 +58,7 @@
|
|||||||
},
|
},
|
||||||
"catalog": {
|
"catalog": {
|
||||||
"@types/bun": "^1.3.1",
|
"@types/bun": "^1.3.1",
|
||||||
|
"date-fns": "^3.6.0",
|
||||||
"typescript": "^5.9.3",
|
"typescript": "^5.9.3",
|
||||||
"zod": "^4.0.0",
|
"zod": "^4.0.0",
|
||||||
},
|
},
|
||||||
@@ -88,6 +91,8 @@
|
|||||||
|
|
||||||
"@date-fns/utc": ["@date-fns/utc@2.1.1", "", {}, "sha512-SlJDfG6RPeEX8wEVv6ZB3kak4MmbtyiI2qX/5zuKdordbrhB/iaJ58GVMZgJ6P1sJaM1gMgENFYYeg1JWrCFrA=="],
|
"@date-fns/utc": ["@date-fns/utc@2.1.1", "", {}, "sha512-SlJDfG6RPeEX8wEVv6ZB3kak4MmbtyiI2qX/5zuKdordbrhB/iaJ58GVMZgJ6P1sJaM1gMgENFYYeg1JWrCFrA=="],
|
||||||
|
|
||||||
|
"@devscast/config": ["@devscast/config@1.0.2", "", { "peerDependencies": { "ini": "^6.0.0", "yaml": "^2.8.1", "zod": "^4.1.12" }, "optionalPeers": ["ini", "yaml"] }, "sha512-1DR8GQogAOrR4B9mtZ24YIKlEZNvKOFeovw+XepfkXVx0MB1f1fAHtPAAXppV7RPMLSyQEMFJzve17x2HbohEw=="],
|
||||||
|
|
||||||
"@drizzle-team/brocli": ["@drizzle-team/brocli@0.10.2", "", {}, "sha512-z33Il7l5dKjUgGULTqBsQBQwckHh5AbIuxhdsIxDDiZAzBOrZO6q9ogcWC65kU382AfynTfgNumVcNIjuIua6w=="],
|
"@drizzle-team/brocli": ["@drizzle-team/brocli@0.10.2", "", {}, "sha512-z33Il7l5dKjUgGULTqBsQBQwckHh5AbIuxhdsIxDDiZAzBOrZO6q9ogcWC65kU382AfynTfgNumVcNIjuIua6w=="],
|
||||||
|
|
||||||
"@esbuild-kit/core-utils": ["@esbuild-kit/core-utils@3.3.2", "", { "dependencies": { "esbuild": "~0.18.20", "source-map-support": "^0.5.21" } }, "sha512-sPRAnw9CdSsRmEtnsl2WXWdyquogVpB3yZ3dgwJfe8zrOzTsV7cJvmwrKVa+0ma5BoiGJ+BoqkMvawbayKUsqQ=="],
|
"@esbuild-kit/core-utils": ["@esbuild-kit/core-utils@3.3.2", "", { "dependencies": { "esbuild": "~0.18.20", "source-map-support": "^0.5.21" } }, "sha512-sPRAnw9CdSsRmEtnsl2WXWdyquogVpB3yZ3dgwJfe8zrOzTsV7cJvmwrKVa+0ma5BoiGJ+BoqkMvawbayKUsqQ=="],
|
||||||
@@ -156,6 +161,8 @@
|
|||||||
|
|
||||||
"@manypkg/tools": ["@manypkg/tools@2.1.0", "", { "dependencies": { "jju": "^1.4.0", "js-yaml": "^4.1.0", "tinyglobby": "^0.2.13" } }, "sha512-0FOIepYR4ugPYaHwK7hDeHDkfPOBVvayt9QpvRbi2LT/h2b0GaE/gM9Gag7fsnyYyNaTZ2IGyOuVg07IYepvYQ=="],
|
"@manypkg/tools": ["@manypkg/tools@2.1.0", "", { "dependencies": { "jju": "^1.4.0", "js-yaml": "^4.1.0", "tinyglobby": "^0.2.13" } }, "sha512-0FOIepYR4ugPYaHwK7hDeHDkfPOBVvayt9QpvRbi2LT/h2b0GaE/gM9Gag7fsnyYyNaTZ2IGyOuVg07IYepvYQ=="],
|
||||||
|
|
||||||
|
"@mixmark-io/domino": ["@mixmark-io/domino@2.2.0", "", {}, "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw=="],
|
||||||
|
|
||||||
"@msgpackr-extract/msgpackr-extract-darwin-arm64": ["@msgpackr-extract/msgpackr-extract-darwin-arm64@3.0.3", "", { "os": "darwin", "cpu": "arm64" }, "sha512-QZHtlVgbAdy2zAqNA9Gu1UpIuI8Xvsd1v8ic6B2pZmeFnFcMWiPLfWXh7TVw4eGEZ/C9TH281KwhVoeQUKbyjw=="],
|
"@msgpackr-extract/msgpackr-extract-darwin-arm64": ["@msgpackr-extract/msgpackr-extract-darwin-arm64@3.0.3", "", { "os": "darwin", "cpu": "arm64" }, "sha512-QZHtlVgbAdy2zAqNA9Gu1UpIuI8Xvsd1v8ic6B2pZmeFnFcMWiPLfWXh7TVw4eGEZ/C9TH281KwhVoeQUKbyjw=="],
|
||||||
|
|
||||||
"@msgpackr-extract/msgpackr-extract-darwin-x64": ["@msgpackr-extract/msgpackr-extract-darwin-x64@3.0.3", "", { "os": "darwin", "cpu": "x64" }, "sha512-mdzd3AVzYKuUmiWOQ8GNhl64/IoFGol569zNRdkLReh6LRLHOXxU4U8eq0JwaD8iFHdVGqSy4IjFL4reoWCDFw=="],
|
"@msgpackr-extract/msgpackr-extract-darwin-x64": ["@msgpackr-extract/msgpackr-extract-darwin-x64@3.0.3", "", { "os": "darwin", "cpu": "x64" }, "sha512-mdzd3AVzYKuUmiWOQ8GNhl64/IoFGol569zNRdkLReh6LRLHOXxU4U8eq0JwaD8iFHdVGqSy4IjFL4reoWCDFw=="],
|
||||||
@@ -190,6 +197,8 @@
|
|||||||
|
|
||||||
"balanced-match": ["balanced-match@1.0.2", "", {}, "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw=="],
|
"balanced-match": ["balanced-match@1.0.2", "", {}, "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw=="],
|
||||||
|
|
||||||
|
"boolbase": ["boolbase@1.0.0", "", {}, "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="],
|
||||||
|
|
||||||
"brace-expansion": ["brace-expansion@2.0.2", "", { "dependencies": { "balanced-match": "^1.0.0" } }, "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ=="],
|
"brace-expansion": ["brace-expansion@2.0.2", "", { "dependencies": { "balanced-match": "^1.0.0" } }, "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ=="],
|
||||||
|
|
||||||
"buffer-from": ["buffer-from@1.1.2", "", {}, "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ=="],
|
"buffer-from": ["buffer-from@1.1.2", "", {}, "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ=="],
|
||||||
@@ -208,6 +217,10 @@
|
|||||||
|
|
||||||
"cron-parser": ["cron-parser@4.9.0", "", { "dependencies": { "luxon": "^3.2.1" } }, "sha512-p0SaNjrHOnQeR8/VnfGbmg9te2kfyYSQ7Sc/j/6DtPL3JQvKxmjO9TSjNFpujqV3vEYYBvNNvXSxzyksBWAx1Q=="],
|
"cron-parser": ["cron-parser@4.9.0", "", { "dependencies": { "luxon": "^3.2.1" } }, "sha512-p0SaNjrHOnQeR8/VnfGbmg9te2kfyYSQ7Sc/j/6DtPL3JQvKxmjO9TSjNFpujqV3vEYYBvNNvXSxzyksBWAx1Q=="],
|
||||||
|
|
||||||
|
"css-select": ["css-select@5.2.2", "", { "dependencies": { "boolbase": "^1.0.0", "css-what": "^6.1.0", "domhandler": "^5.0.2", "domutils": "^3.0.1", "nth-check": "^2.0.1" } }, "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw=="],
|
||||||
|
|
||||||
|
"css-what": ["css-what@6.2.2", "", {}, "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA=="],
|
||||||
|
|
||||||
"csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="],
|
"csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="],
|
||||||
|
|
||||||
"date-fns": ["date-fns@3.6.0", "", {}, "sha512-fRHTG8g/Gif+kSh50gaGEdToemgfj74aRX3swtiouboip5JDLAyDE9F11nHMIcvOaXeOC6D7SpNhi7uFyB7Uww=="],
|
"date-fns": ["date-fns@3.6.0", "", {}, "sha512-fRHTG8g/Gif+kSh50gaGEdToemgfj74aRX3swtiouboip5JDLAyDE9F11nHMIcvOaXeOC6D7SpNhi7uFyB7Uww=="],
|
||||||
@@ -224,12 +237,22 @@
|
|||||||
|
|
||||||
"detect-libc": ["detect-libc@2.1.2", "", {}, "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ=="],
|
"detect-libc": ["detect-libc@2.1.2", "", {}, "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ=="],
|
||||||
|
|
||||||
|
"dom-serializer": ["dom-serializer@2.0.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.2", "entities": "^4.2.0" } }, "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg=="],
|
||||||
|
|
||||||
|
"domelementtype": ["domelementtype@2.3.0", "", {}, "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw=="],
|
||||||
|
|
||||||
|
"domhandler": ["domhandler@5.0.3", "", { "dependencies": { "domelementtype": "^2.3.0" } }, "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w=="],
|
||||||
|
|
||||||
|
"domutils": ["domutils@3.2.2", "", { "dependencies": { "dom-serializer": "^2.0.0", "domelementtype": "^2.3.0", "domhandler": "^5.0.3" } }, "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw=="],
|
||||||
|
|
||||||
"drizzle-kit": ["drizzle-kit@0.31.6", "", { "dependencies": { "@drizzle-team/brocli": "^0.10.2", "@esbuild-kit/esm-loader": "^2.5.5", "esbuild": "^0.25.4", "esbuild-register": "^3.5.0" }, "bin": { "drizzle-kit": "bin.cjs" } }, "sha512-/B4e/4pwnx25QwD5xXgdpo1S+077a2VZdosXbItE/oNmUgQwZydGDz9qJYmnQl/b+5IX0rLfwRhrPnroGtrg8Q=="],
|
"drizzle-kit": ["drizzle-kit@0.31.6", "", { "dependencies": { "@drizzle-team/brocli": "^0.10.2", "@esbuild-kit/esm-loader": "^2.5.5", "esbuild": "^0.25.4", "esbuild-register": "^3.5.0" }, "bin": { "drizzle-kit": "bin.cjs" } }, "sha512-/B4e/4pwnx25QwD5xXgdpo1S+077a2VZdosXbItE/oNmUgQwZydGDz9qJYmnQl/b+5IX0rLfwRhrPnroGtrg8Q=="],
|
||||||
|
|
||||||
"drizzle-orm": ["drizzle-orm@0.44.7", "", { "peerDependencies": { "@aws-sdk/client-rds-data": ">=3", "@cloudflare/workers-types": ">=4", "@electric-sql/pglite": ">=0.2.0", "@libsql/client": ">=0.10.0", "@libsql/client-wasm": ">=0.10.0", "@neondatabase/serverless": ">=0.10.0", "@op-engineering/op-sqlite": ">=2", "@opentelemetry/api": "^1.4.1", "@planetscale/database": ">=1.13", "@prisma/client": "*", "@tidbcloud/serverless": "*", "@types/better-sqlite3": "*", "@types/pg": "*", "@types/sql.js": "*", "@upstash/redis": ">=1.34.7", "@vercel/postgres": ">=0.8.0", "@xata.io/client": "*", "better-sqlite3": ">=7", "bun-types": "*", "expo-sqlite": ">=14.0.0", "gel": ">=2", "knex": "*", "kysely": "*", "mysql2": ">=2", "pg": ">=8", "postgres": ">=3", "sql.js": ">=1", "sqlite3": ">=5" }, "optionalPeers": ["@aws-sdk/client-rds-data", "@cloudflare/workers-types", "@electric-sql/pglite", "@libsql/client", "@libsql/client-wasm", "@neondatabase/serverless", "@op-engineering/op-sqlite", "@opentelemetry/api", "@planetscale/database", "@prisma/client", "@tidbcloud/serverless", "@types/better-sqlite3", "@types/pg", "@types/sql.js", "@upstash/redis", "@vercel/postgres", "@xata.io/client", "better-sqlite3", "bun-types", "expo-sqlite", "gel", "knex", "kysely", "mysql2", "pg", "postgres", "sql.js", "sqlite3"] }, "sha512-quIpnYznjU9lHshEOAYLoZ9s3jweleHlZIAWR/jX9gAWNg/JhQ1wj0KGRf7/Zm+obRrYd9GjPVJg790QY9N5AQ=="],
|
"drizzle-orm": ["drizzle-orm@0.44.7", "", { "peerDependencies": { "@aws-sdk/client-rds-data": ">=3", "@cloudflare/workers-types": ">=4", "@electric-sql/pglite": ">=0.2.0", "@libsql/client": ">=0.10.0", "@libsql/client-wasm": ">=0.10.0", "@neondatabase/serverless": ">=0.10.0", "@op-engineering/op-sqlite": ">=2", "@opentelemetry/api": "^1.4.1", "@planetscale/database": ">=1.13", "@prisma/client": "*", "@tidbcloud/serverless": "*", "@types/better-sqlite3": "*", "@types/pg": "*", "@types/sql.js": "*", "@upstash/redis": ">=1.34.7", "@vercel/postgres": ">=0.8.0", "@xata.io/client": "*", "better-sqlite3": ">=7", "bun-types": "*", "expo-sqlite": ">=14.0.0", "gel": ">=2", "knex": "*", "kysely": "*", "mysql2": ">=2", "pg": ">=8", "postgres": ">=3", "sql.js": ">=1", "sqlite3": ">=5" }, "optionalPeers": ["@aws-sdk/client-rds-data", "@cloudflare/workers-types", "@electric-sql/pglite", "@libsql/client", "@libsql/client-wasm", "@neondatabase/serverless", "@op-engineering/op-sqlite", "@opentelemetry/api", "@planetscale/database", "@prisma/client", "@tidbcloud/serverless", "@types/better-sqlite3", "@types/pg", "@types/sql.js", "@upstash/redis", "@vercel/postgres", "@xata.io/client", "better-sqlite3", "bun-types", "expo-sqlite", "gel", "knex", "kysely", "mysql2", "pg", "postgres", "sql.js", "sqlite3"] }, "sha512-quIpnYznjU9lHshEOAYLoZ9s3jweleHlZIAWR/jX9gAWNg/JhQ1wj0KGRf7/Zm+obRrYd9GjPVJg790QY9N5AQ=="],
|
||||||
|
|
||||||
"end-of-stream": ["end-of-stream@1.4.5", "", { "dependencies": { "once": "^1.4.0" } }, "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg=="],
|
"end-of-stream": ["end-of-stream@1.4.5", "", { "dependencies": { "once": "^1.4.0" } }, "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg=="],
|
||||||
|
|
||||||
|
"entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="],
|
||||||
|
|
||||||
"esbuild": ["esbuild@0.25.11", "", { "optionalDependencies": { "@esbuild/aix-ppc64": "0.25.11", "@esbuild/android-arm": "0.25.11", "@esbuild/android-arm64": "0.25.11", "@esbuild/android-x64": "0.25.11", "@esbuild/darwin-arm64": "0.25.11", "@esbuild/darwin-x64": "0.25.11", "@esbuild/freebsd-arm64": "0.25.11", "@esbuild/freebsd-x64": "0.25.11", "@esbuild/linux-arm": "0.25.11", "@esbuild/linux-arm64": "0.25.11", "@esbuild/linux-ia32": "0.25.11", "@esbuild/linux-loong64": "0.25.11", "@esbuild/linux-mips64el": "0.25.11", "@esbuild/linux-ppc64": "0.25.11", "@esbuild/linux-riscv64": "0.25.11", "@esbuild/linux-s390x": "0.25.11", "@esbuild/linux-x64": "0.25.11", "@esbuild/netbsd-arm64": "0.25.11", "@esbuild/netbsd-x64": "0.25.11", "@esbuild/openbsd-arm64": "0.25.11", "@esbuild/openbsd-x64": "0.25.11", "@esbuild/openharmony-arm64": "0.25.11", "@esbuild/sunos-x64": "0.25.11", "@esbuild/win32-arm64": "0.25.11", "@esbuild/win32-ia32": "0.25.11", "@esbuild/win32-x64": "0.25.11" }, "bin": { "esbuild": "bin/esbuild" } }, "sha512-KohQwyzrKTQmhXDW1PjCv3Tyspn9n5GcY2RTDqeORIdIJY8yKIF7sTSopFmn/wpMPW4rdPXI0UE5LJLuq3bx0Q=="],
|
"esbuild": ["esbuild@0.25.11", "", { "optionalDependencies": { "@esbuild/aix-ppc64": "0.25.11", "@esbuild/android-arm": "0.25.11", "@esbuild/android-arm64": "0.25.11", "@esbuild/android-x64": "0.25.11", "@esbuild/darwin-arm64": "0.25.11", "@esbuild/darwin-x64": "0.25.11", "@esbuild/freebsd-arm64": "0.25.11", "@esbuild/freebsd-x64": "0.25.11", "@esbuild/linux-arm": "0.25.11", "@esbuild/linux-arm64": "0.25.11", "@esbuild/linux-ia32": "0.25.11", "@esbuild/linux-loong64": "0.25.11", "@esbuild/linux-mips64el": "0.25.11", "@esbuild/linux-ppc64": "0.25.11", "@esbuild/linux-riscv64": "0.25.11", "@esbuild/linux-s390x": "0.25.11", "@esbuild/linux-x64": "0.25.11", "@esbuild/netbsd-arm64": "0.25.11", "@esbuild/netbsd-x64": "0.25.11", "@esbuild/openbsd-arm64": "0.25.11", "@esbuild/openbsd-x64": "0.25.11", "@esbuild/openharmony-arm64": "0.25.11", "@esbuild/sunos-x64": "0.25.11", "@esbuild/win32-arm64": "0.25.11", "@esbuild/win32-ia32": "0.25.11", "@esbuild/win32-x64": "0.25.11" }, "bin": { "esbuild": "bin/esbuild" } }, "sha512-KohQwyzrKTQmhXDW1PjCv3Tyspn9n5GcY2RTDqeORIdIJY8yKIF7sTSopFmn/wpMPW4rdPXI0UE5LJLuq3bx0Q=="],
|
||||||
|
|
||||||
"esbuild-register": ["esbuild-register@3.6.0", "", { "dependencies": { "debug": "^4.3.4" }, "peerDependencies": { "esbuild": ">=0.12 <1" } }, "sha512-H2/S7Pm8a9CL1uhp9OvjwrBh5Pvx0H8qVOxNu8Wed9Y7qv56MPtq+GGM8RJpq6glYJn9Wspr8uw7l55uyinNeg=="],
|
"esbuild-register": ["esbuild-register@3.6.0", "", { "dependencies": { "debug": "^4.3.4" }, "peerDependencies": { "esbuild": ">=0.12 <1" } }, "sha512-H2/S7Pm8a9CL1uhp9OvjwrBh5Pvx0H8qVOxNu8Wed9Y7qv56MPtq+GGM8RJpq6glYJn9Wspr8uw7l55uyinNeg=="],
|
||||||
@@ -248,6 +271,8 @@
|
|||||||
|
|
||||||
"graceful-fs": ["graceful-fs@4.2.10", "", {}, "sha512-9ByhssR2fPVsNZj478qUUbKfmL0+t5BDVyjShtyZZLiK7ZDAArFFfopyOTj0M05wE2tJPisA4iTnnXl2YoPvOA=="],
|
"graceful-fs": ["graceful-fs@4.2.10", "", {}, "sha512-9ByhssR2fPVsNZj478qUUbKfmL0+t5BDVyjShtyZZLiK7ZDAArFFfopyOTj0M05wE2tJPisA4iTnnXl2YoPvOA=="],
|
||||||
|
|
||||||
|
"he": ["he@1.2.0", "", { "bin": { "he": "bin/he" } }, "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw=="],
|
||||||
|
|
||||||
"help-me": ["help-me@5.0.0", "", {}, "sha512-7xgomUX6ADmcYzFik0HzAxh/73YlKR9bmFzf51CZwR+b6YtzU2m0u49hQCqV6SvlqIqsaxovfwdvbnsw3b/zpg=="],
|
"help-me": ["help-me@5.0.0", "", {}, "sha512-7xgomUX6ADmcYzFik0HzAxh/73YlKR9bmFzf51CZwR+b6YtzU2m0u49hQCqV6SvlqIqsaxovfwdvbnsw3b/zpg=="],
|
||||||
|
|
||||||
"inflight": ["inflight@1.0.6", "", { "dependencies": { "once": "^1.3.0", "wrappy": "1" } }, "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA=="],
|
"inflight": ["inflight@1.0.6", "", { "dependencies": { "once": "^1.3.0", "wrappy": "1" } }, "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA=="],
|
||||||
@@ -290,8 +315,12 @@
|
|||||||
|
|
||||||
"node-gyp-build-optional-packages": ["node-gyp-build-optional-packages@5.2.2", "", { "dependencies": { "detect-libc": "^2.0.1" }, "bin": { "node-gyp-build-optional-packages": "bin.js", "node-gyp-build-optional-packages-optional": "optional.js", "node-gyp-build-optional-packages-test": "build-test.js" } }, "sha512-s+w+rBWnpTMwSFbaE0UXsRlg7hU4FjekKU4eyAih5T8nJuNZT1nNsskXpxmeqSK9UzkBl6UgRlnKc8hz8IEqOw=="],
|
"node-gyp-build-optional-packages": ["node-gyp-build-optional-packages@5.2.2", "", { "dependencies": { "detect-libc": "^2.0.1" }, "bin": { "node-gyp-build-optional-packages": "bin.js", "node-gyp-build-optional-packages-optional": "optional.js", "node-gyp-build-optional-packages-test": "build-test.js" } }, "sha512-s+w+rBWnpTMwSFbaE0UXsRlg7hU4FjekKU4eyAih5T8nJuNZT1nNsskXpxmeqSK9UzkBl6UgRlnKc8hz8IEqOw=="],
|
||||||
|
|
||||||
|
"node-html-parser": ["node-html-parser@7.0.1", "", { "dependencies": { "css-select": "^5.1.0", "he": "1.2.0" } }, "sha512-KGtmPY2kS0thCWGK0VuPyOS+pBKhhe8gXztzA2ilAOhbUbxa9homF1bOyKvhGzMLXUoRds9IOmr/v5lr/lqNmA=="],
|
||||||
|
|
||||||
"normalize-path": ["normalize-path@3.0.0", "", {}, "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA=="],
|
"normalize-path": ["normalize-path@3.0.0", "", {}, "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA=="],
|
||||||
|
|
||||||
|
"nth-check": ["nth-check@2.1.1", "", { "dependencies": { "boolbase": "^1.0.0" } }, "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w=="],
|
||||||
|
|
||||||
"on-exit-leak-free": ["on-exit-leak-free@2.1.2", "", {}, "sha512-0eJJY6hXLGf1udHwfNftBqH+g73EU4B504nZeKpz1sYRKafAghwxEJunB2O7rDZkL4PGfsMVnTXZ2EjibbqcsA=="],
|
"on-exit-leak-free": ["on-exit-leak-free@2.1.2", "", {}, "sha512-0eJJY6hXLGf1udHwfNftBqH+g73EU4B504nZeKpz1sYRKafAghwxEJunB2O7rDZkL4PGfsMVnTXZ2EjibbqcsA=="],
|
||||||
|
|
||||||
"once": ["once@1.4.0", "", { "dependencies": { "wrappy": "1" } }, "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w=="],
|
"once": ["once@1.4.0", "", { "dependencies": { "wrappy": "1" } }, "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w=="],
|
||||||
@@ -406,6 +435,8 @@
|
|||||||
|
|
||||||
"turbo-windows-arm64": ["turbo-windows-arm64@2.5.8", "", { "os": "win32", "cpu": "arm64" }, "sha512-eFC5XzLmgXJfnAK3UMTmVECCwuBcORrWdewoiXBnUm934DY6QN8YowC/srhNnROMpaKaqNeRpoB5FxCww3eteQ=="],
|
"turbo-windows-arm64": ["turbo-windows-arm64@2.5.8", "", { "os": "win32", "cpu": "arm64" }, "sha512-eFC5XzLmgXJfnAK3UMTmVECCwuBcORrWdewoiXBnUm934DY6QN8YowC/srhNnROMpaKaqNeRpoB5FxCww3eteQ=="],
|
||||||
|
|
||||||
|
"turndown": ["turndown@7.2.2", "", { "dependencies": { "@mixmark-io/domino": "^2.2.0" } }, "sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ=="],
|
||||||
|
|
||||||
"type-fest": ["type-fest@4.41.0", "", {}, "sha512-TeTSQ6H5YHvpqVwBRcnLDCBnDOHWYu7IvGbHT6N8AOymcr9PJGjc1GTtiWZTYg0NCgYwvnYWEkVChQAr9bjfwA=="],
|
"type-fest": ["type-fest@4.41.0", "", {}, "sha512-TeTSQ6H5YHvpqVwBRcnLDCBnDOHWYu7IvGbHT6N8AOymcr9PJGjc1GTtiWZTYg0NCgYwvnYWEkVChQAr9bjfwA=="],
|
||||||
|
|
||||||
"typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
|
"typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
|
||||||
|
|||||||
@@ -29,17 +29,17 @@ export interface ArticleFilters {
|
|||||||
|
|
||||||
export interface ArticleOverviewRow {
|
export interface ArticleOverviewRow {
|
||||||
article_id: string;
|
article_id: string;
|
||||||
article_title: string;
|
articleTitle: string;
|
||||||
article_link: string;
|
articleLink: string;
|
||||||
article_categories: string | null;
|
articleCategories: string | null;
|
||||||
article_excerpt: string | null;
|
article_excerpt: string | null;
|
||||||
article_published_at: string;
|
article_published_at: string;
|
||||||
article_image: string | null;
|
article_image: string | null;
|
||||||
article_reading_time: number | null;
|
article_reading_time: number | null;
|
||||||
source_id: string;
|
sourceId: string;
|
||||||
source_display_name: string | null;
|
source_display_name: string | null;
|
||||||
source_image: string;
|
source_image: string;
|
||||||
source_url: string;
|
sourceUrl: string;
|
||||||
source_name: string;
|
source_name: string;
|
||||||
source_created_at: string;
|
source_created_at: string;
|
||||||
article_is_bookmarked: boolean;
|
article_is_bookmarked: boolean;
|
||||||
@@ -52,10 +52,10 @@ export interface ArticleOverviewResult {
|
|||||||
|
|
||||||
export interface ArticleDetailsRow {
|
export interface ArticleDetailsRow {
|
||||||
article_id: string;
|
article_id: string;
|
||||||
article_title: string;
|
articleTitle: string;
|
||||||
article_link: string;
|
articleLink: string;
|
||||||
article_categories: string | null;
|
articleCategories: string | null;
|
||||||
article_body: string;
|
articleBody: string;
|
||||||
article_hash: string;
|
article_hash: string;
|
||||||
article_published_at: string;
|
article_published_at: string;
|
||||||
article_crawled_at: string;
|
article_crawled_at: string;
|
||||||
@@ -66,10 +66,10 @@ export interface ArticleDetailsRow {
|
|||||||
article_sentiment: string;
|
article_sentiment: string;
|
||||||
article_metadata: unknown;
|
article_metadata: unknown;
|
||||||
article_reading_time: number | null;
|
article_reading_time: number | null;
|
||||||
source_id: string;
|
sourceId: string;
|
||||||
source_name: string;
|
source_name: string;
|
||||||
source_description: string | null;
|
source_description: string | null;
|
||||||
source_url: string;
|
sourceUrl: string;
|
||||||
source_updated_at: string | null;
|
source_updated_at: string | null;
|
||||||
source_display_name: string | null;
|
source_display_name: string | null;
|
||||||
source_bias: string;
|
source_bias: string;
|
||||||
@@ -269,18 +269,18 @@ async function fetchArticleOverview(
|
|||||||
|
|
||||||
const selectFields = {
|
const selectFields = {
|
||||||
article_id: articles.id,
|
article_id: articles.id,
|
||||||
article_title: articles.title,
|
articleTitle: articles.title,
|
||||||
article_link: articles.link,
|
articleLink: articles.link,
|
||||||
article_categories: sql<string | null>`array_to_string
|
articleCategories: sql<string | null>`array_to_string
|
||||||
(${articles.categories}, ',')`,
|
(${articles.categories}, ',')`,
|
||||||
article_excerpt: articles.excerpt,
|
article_excerpt: articles.excerpt,
|
||||||
article_published_at: articles.publishedAt,
|
article_published_at: articles.publishedAt,
|
||||||
article_image: articles.image,
|
article_image: articles.image,
|
||||||
article_reading_time: articles.readingTime,
|
article_reading_time: articles.readingTime,
|
||||||
source_id: sources.id,
|
sourceId: sources.id,
|
||||||
source_display_name: sources.displayName,
|
source_display_name: sources.displayName,
|
||||||
source_image: sql<string>`('${SOURCE_IMAGE_BASE}' || ${sources.name} || '.png')`,
|
source_image: sql<string>`('${SOURCE_IMAGE_BASE}' || ${sources.name} || '.png')`,
|
||||||
source_url: sources.url,
|
sourceUrl: sources.url,
|
||||||
source_name: sources.name,
|
source_name: sources.name,
|
||||||
source_created_at: sources.createdAt,
|
source_created_at: sources.createdAt,
|
||||||
article_is_bookmarked: bookmarkExpression,
|
article_is_bookmarked: bookmarkExpression,
|
||||||
@@ -405,18 +405,18 @@ export async function getBookmarkedArticleList(
|
|||||||
|
|
||||||
const selectFields = {
|
const selectFields = {
|
||||||
article_id: articles.id,
|
article_id: articles.id,
|
||||||
article_title: articles.title,
|
articleTitle: articles.title,
|
||||||
article_link: articles.link,
|
articleLink: articles.link,
|
||||||
article_categories: sql<string | null>`array_to_string
|
articleCategories: sql<string | null>`array_to_string
|
||||||
(${articles.categories}, ',')`,
|
(${articles.categories}, ',')`,
|
||||||
article_excerpt: articles.excerpt,
|
article_excerpt: articles.excerpt,
|
||||||
article_published_at: articles.publishedAt,
|
article_published_at: articles.publishedAt,
|
||||||
article_image: articles.image,
|
article_image: articles.image,
|
||||||
article_reading_time: articles.readingTime,
|
article_reading_time: articles.readingTime,
|
||||||
source_id: sources.id,
|
sourceId: sources.id,
|
||||||
source_display_name: sources.displayName,
|
source_display_name: sources.displayName,
|
||||||
source_image: sql<string>`('${SOURCE_IMAGE_BASE}' || ${sources.name} || '.png')`,
|
source_image: sql<string>`('${SOURCE_IMAGE_BASE}' || ${sources.name} || '.png')`,
|
||||||
source_url: sources.url,
|
sourceUrl: sources.url,
|
||||||
source_name: sources.name,
|
source_name: sources.name,
|
||||||
source_created_at: sources.createdAt,
|
source_created_at: sources.createdAt,
|
||||||
article_is_bookmarked: sql<boolean>`true`,
|
article_is_bookmarked: sql<boolean>`true`,
|
||||||
@@ -492,11 +492,11 @@ export async function getArticleDetails(
|
|||||||
const [row] = await db
|
const [row] = await db
|
||||||
.select({
|
.select({
|
||||||
article_id: articles.id,
|
article_id: articles.id,
|
||||||
article_title: articles.title,
|
articleTitle: articles.title,
|
||||||
article_link: articles.link,
|
articleLink: articles.link,
|
||||||
article_categories: sql<string | null>`array_to_string
|
articleCategories: sql<string | null>`array_to_string
|
||||||
(${articles.categories}, ',')`,
|
(${articles.categories}, ',')`,
|
||||||
article_body: articles.body,
|
articleBody: articles.body,
|
||||||
article_hash: articles.hash,
|
article_hash: articles.hash,
|
||||||
article_published_at: articles.publishedAt,
|
article_published_at: articles.publishedAt,
|
||||||
article_crawled_at: articles.crawledAt,
|
article_crawled_at: articles.crawledAt,
|
||||||
@@ -507,10 +507,10 @@ export async function getArticleDetails(
|
|||||||
article_sentiment: articles.sentiment,
|
article_sentiment: articles.sentiment,
|
||||||
article_metadata: articles.metadata,
|
article_metadata: articles.metadata,
|
||||||
article_reading_time: articles.readingTime,
|
article_reading_time: articles.readingTime,
|
||||||
source_id: sources.id,
|
sourceId: sources.id,
|
||||||
source_name: sources.name,
|
source_name: sources.name,
|
||||||
source_description: sources.description,
|
source_description: sources.description,
|
||||||
source_url: sources.url,
|
sourceUrl: sources.url,
|
||||||
source_updated_at: sources.updatedAt,
|
source_updated_at: sources.updatedAt,
|
||||||
source_display_name: sources.displayName,
|
source_display_name: sources.displayName,
|
||||||
source_bias: sources.bias,
|
source_bias: sources.bias,
|
||||||
|
|||||||
@@ -13,10 +13,10 @@ import {
|
|||||||
import { PUBLICATION_GRAPH_DAYS, SOURCE_IMAGE_BASE } from "@/constant";
|
import { PUBLICATION_GRAPH_DAYS, SOURCE_IMAGE_BASE } from "@/constant";
|
||||||
|
|
||||||
export interface SourceOverviewRow {
|
export interface SourceOverviewRow {
|
||||||
source_id: string;
|
sourceId: string;
|
||||||
source_display_name: string | null;
|
source_display_name: string | null;
|
||||||
source_image: string;
|
source_image: string;
|
||||||
source_url: string;
|
sourceUrl: string;
|
||||||
source_name: string;
|
source_name: string;
|
||||||
source_created_at: string;
|
source_created_at: string;
|
||||||
source_is_followed: boolean;
|
source_is_followed: boolean;
|
||||||
@@ -40,10 +40,10 @@ export interface CategoryShare {
|
|||||||
|
|
||||||
export interface SourceDetailsResult {
|
export interface SourceDetailsResult {
|
||||||
source: {
|
source: {
|
||||||
source_id: string;
|
sourceId: string;
|
||||||
source_name: string;
|
source_name: string;
|
||||||
source_description: string | null;
|
source_description: string | null;
|
||||||
source_url: string;
|
sourceUrl: string;
|
||||||
source_updated_at: string | null;
|
source_updated_at: string | null;
|
||||||
source_display_name: string | null;
|
source_display_name: string | null;
|
||||||
source_bias: string;
|
source_bias: string;
|
||||||
@@ -148,7 +148,7 @@ function buildFollowExistsExpression(userId: string): SQL<boolean> {
|
|||||||
return sql`EXISTS
|
return sql`EXISTS
|
||||||
(SELECT 1
|
(SELECT 1
|
||||||
FROM ${followedSources} f
|
FROM ${followedSources} f
|
||||||
WHERE f.source_id = ${sources.id}
|
WHERE f.sourceId = ${sources.id}
|
||||||
AND f.follower_id = ${userId})`;
|
AND f.follower_id = ${userId})`;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -161,10 +161,10 @@ export async function getSourceOverviewList(
|
|||||||
|
|
||||||
let query = db
|
let query = db
|
||||||
.select({
|
.select({
|
||||||
source_id: sources.id,
|
sourceId: sources.id,
|
||||||
source_display_name: sources.displayName,
|
source_display_name: sources.displayName,
|
||||||
source_image: sql<string>`('${SOURCE_IMAGE_BASE}' || ${sources.name} || '.png')`,
|
source_image: sql<string>`('${SOURCE_IMAGE_BASE}' || ${sources.name} || '.png')`,
|
||||||
source_url: sources.url,
|
sourceUrl: sources.url,
|
||||||
source_name: sources.name,
|
source_name: sources.name,
|
||||||
source_created_at: sources.createdAt,
|
source_created_at: sources.createdAt,
|
||||||
source_is_followed: followExpression,
|
source_is_followed: followExpression,
|
||||||
@@ -186,7 +186,7 @@ export async function getSourceOverviewList(
|
|||||||
.limit(page.limit + 1);
|
.limit(page.limit + 1);
|
||||||
|
|
||||||
return buildPaginationResult(rows, page, {
|
return buildPaginationResult(rows, page, {
|
||||||
id: "source_id",
|
id: "sourceId",
|
||||||
date: "source_created_at",
|
date: "source_created_at",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -298,10 +298,10 @@ export async function getSourceDetails(
|
|||||||
|
|
||||||
const [row] = await db
|
const [row] = await db
|
||||||
.select({
|
.select({
|
||||||
source_id: sources.id,
|
sourceId: sources.id,
|
||||||
source_name: sources.name,
|
source_name: sources.name,
|
||||||
source_description: sources.description,
|
source_description: sources.description,
|
||||||
source_url: sources.url,
|
sourceUrl: sources.url,
|
||||||
source_updated_at: sources.updatedAt,
|
source_updated_at: sources.updatedAt,
|
||||||
source_display_name: sources.displayName,
|
source_display_name: sources.displayName,
|
||||||
source_bias: sources.bias,
|
source_bias: sources.bias,
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ export const sources = pgTable(
|
|||||||
sql`lower
|
sql`lower
|
||||||
(${table.name})`,
|
(${table.name})`,
|
||||||
),
|
),
|
||||||
uniqueIndex("unq_source_url").using(
|
uniqueIndex("unq_sourceUrl").using(
|
||||||
"btree",
|
"btree",
|
||||||
sql`lower
|
sql`lower
|
||||||
(${table.url})`,
|
(${table.url})`,
|
||||||
@@ -113,7 +113,7 @@ export const articles = pgTable(
|
|||||||
"article",
|
"article",
|
||||||
{
|
{
|
||||||
id: uuid("id").notNull().defaultRandom().primaryKey(),
|
id: uuid("id").notNull().defaultRandom().primaryKey(),
|
||||||
sourceId: uuid("source_id").notNull(),
|
sourceId: uuid("sourceId").notNull(),
|
||||||
title: varchar("title", { length: 1024 }).notNull(),
|
title: varchar("title", { length: 1024 }).notNull(),
|
||||||
body: text("body").notNull(),
|
body: text("body").notNull(),
|
||||||
hash: varchar("hash", { length: 32 }).notNull(),
|
hash: varchar("hash", { length: 32 }).notNull(),
|
||||||
@@ -143,7 +143,7 @@ export const articles = pgTable(
|
|||||||
),
|
),
|
||||||
},
|
},
|
||||||
(table) => [
|
(table) => [
|
||||||
index("article_source_id_idx").on(table.sourceId),
|
index("article_sourceId_idx").on(table.sourceId),
|
||||||
index("idx_article_published_at").using("btree", table.publishedAt.desc()),
|
index("idx_article_published_at").using("btree", table.publishedAt.desc()),
|
||||||
index("idx_article_published_id").using(
|
index("idx_article_published_id").using(
|
||||||
"btree",
|
"btree",
|
||||||
@@ -152,16 +152,16 @@ export const articles = pgTable(
|
|||||||
),
|
),
|
||||||
unique("unq_article_hash").on(table.hash),
|
unique("unq_article_hash").on(table.hash),
|
||||||
index("gin_article_tsv").using("gin", table.tsv),
|
index("gin_article_tsv").using("gin", table.tsv),
|
||||||
index("gin_article_link_trgm").using("gin", table.link.op("gin_trgm_ops")),
|
index("gin_articleLink_trgm").using("gin", table.link.op("gin_trgm_ops")),
|
||||||
index("gin_article_title_trgm").using(
|
index("gin_articleTitle_trgm").using(
|
||||||
"gin",
|
"gin",
|
||||||
table.title.op("gin_trgm_ops"),
|
table.title.op("gin_trgm_ops"),
|
||||||
),
|
),
|
||||||
index("gin_article_categories").using("gin", table.categories),
|
index("gin_articleCategories").using("gin", table.categories),
|
||||||
foreignKey({
|
foreignKey({
|
||||||
columns: [table.sourceId],
|
columns: [table.sourceId],
|
||||||
foreignColumns: [sources.id],
|
foreignColumns: [sources.id],
|
||||||
name: "article_source_id_fkey",
|
name: "article_sourceId_fkey",
|
||||||
}).onDelete("cascade"),
|
}).onDelete("cascade"),
|
||||||
{
|
{
|
||||||
kind: "check",
|
kind: "check",
|
||||||
@@ -288,12 +288,12 @@ export const followedSources = pgTable(
|
|||||||
{
|
{
|
||||||
id: uuid("id").notNull().defaultRandom().primaryKey(),
|
id: uuid("id").notNull().defaultRandom().primaryKey(),
|
||||||
followerId: uuid("follower_id").notNull(),
|
followerId: uuid("follower_id").notNull(),
|
||||||
sourceId: uuid("source_id").notNull(),
|
sourceId: uuid("sourceId").notNull(),
|
||||||
createdAt: timestamp("created_at", { mode: "string" }).notNull(),
|
createdAt: timestamp("created_at", { mode: "string" }).notNull(),
|
||||||
},
|
},
|
||||||
(table) => [
|
(table) => [
|
||||||
index("followed_source_follower_idx").on(table.followerId),
|
index("followed_source_follower_idx").on(table.followerId),
|
||||||
index("followed_source_source_idx").on(table.sourceId),
|
index("followed_source_sourceIdx").on(table.sourceId),
|
||||||
index("idx_followed_source_follower_created").using(
|
index("idx_followed_source_follower_created").using(
|
||||||
"btree",
|
"btree",
|
||||||
table.followerId,
|
table.followerId,
|
||||||
@@ -307,7 +307,7 @@ export const followedSources = pgTable(
|
|||||||
foreignKey({
|
foreignKey({
|
||||||
columns: [table.sourceId],
|
columns: [table.sourceId],
|
||||||
foreignColumns: [sources.id],
|
foreignColumns: [sources.id],
|
||||||
name: "followed_source_source_id_fkey",
|
name: "followed_source_sourceId_fkey",
|
||||||
}).onDelete("cascade"),
|
}).onDelete("cascade"),
|
||||||
],
|
],
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -20,7 +20,7 @@
|
|||||||
"erasableSyntaxOnly": true,
|
"erasableSyntaxOnly": true,
|
||||||
"noFallthroughCasesInSwitch": true,
|
"noFallthroughCasesInSwitch": true,
|
||||||
"noUncheckedSideEffectImports": true,
|
"noUncheckedSideEffectImports": true,
|
||||||
"allowImportingTsExtensions": true,
|
"allowImportingTsExtensions": false,
|
||||||
"strict": true,
|
"strict": true,
|
||||||
"target": "ES2022",
|
"target": "ES2022",
|
||||||
"baseUrl": "."
|
"baseUrl": "."
|
||||||
|
|||||||
Generated
+6
@@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"name": "basango",
|
||||||
|
"lockfileVersion": 3,
|
||||||
|
"requires": true,
|
||||||
|
"packages": {}
|
||||||
|
}
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
{}
|
||||||
Reference in New Issue
Block a user