[crawler]: fix configuration format
This commit is contained in:
@@ -20,9 +20,9 @@
|
|||||||
},
|
},
|
||||||
"crawler": {
|
"crawler": {
|
||||||
"notify": false,
|
"notify": false,
|
||||||
"use_multi_threading": false,
|
"useMultiThreading": false,
|
||||||
"maxWorkers": 5,
|
"maxWorkers": 5,
|
||||||
"direction": "%env(BASANGO_CRAWLER_DEFAULT_DIRECTION)%"
|
"direction": "%env(BASANGO_CRAWLER_UPDATE_DIRECTION)%"
|
||||||
},
|
},
|
||||||
"async": {
|
"async": {
|
||||||
"redisUrl": "%env(BASANGO_CRAWLER_ASYNC_REDIS_URL)%",
|
"redisUrl": "%env(BASANGO_CRAWLER_ASYNC_REDIS_URL)%",
|
||||||
|
|||||||
@@ -2,189 +2,189 @@
|
|||||||
"sources": {
|
"sources": {
|
||||||
"html": [
|
"html": [
|
||||||
{
|
{
|
||||||
"source_id": "radiookapi.net",
|
"sourceId": "radiookapi.net",
|
||||||
"source_url": "https://www.radiookapi.net",
|
"sourceUrl": "https://www.radiookapi.net",
|
||||||
"source_date": {
|
"sourceDate": {
|
||||||
"pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
|
"pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
|
||||||
"replacement": "$3-$2-$1 $4"
|
"replacement": "$3-$2-$1 $4"
|
||||||
},
|
},
|
||||||
"source_selectors": {
|
"sourceSelectors": {
|
||||||
"articles": ".view-content > .views-row.content-row",
|
"articles": ".view-content > .views-row.content-row",
|
||||||
"article_title": "h1.page-header",
|
"articleTitle": "h1.page-header",
|
||||||
"article_link": ".views-field-title a",
|
"articleLink": ".views-field-title a",
|
||||||
"article_body": ".field-name-body",
|
"articleBody": ".field-name-body",
|
||||||
"article_date": ".views-field-created",
|
"articleDate": ".views-field-created",
|
||||||
"article_categories": ".views-field-field-cat-gorie a",
|
"articleCategories": ".views-field-field-cat-gorie a",
|
||||||
"pagination": "ul.pagination > li.pager-last > a"
|
"pagination": "ul.pagination > li.pager-last > a"
|
||||||
},
|
},
|
||||||
"pagination_template": "actualite",
|
"paginationTemplate": "actualite",
|
||||||
"supports_categories": false,
|
"supportsCategories": false,
|
||||||
"requires_details": true,
|
"requiresDetails": true,
|
||||||
"requires_rate_limit": false
|
"requiresRateLimit": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "7sur7.cd",
|
"sourceId": "7sur7.cd",
|
||||||
"source_url": "https://7sur7.cd",
|
"sourceUrl": "https://7sur7.cd",
|
||||||
"source_date": {
|
"sourceDate": {
|
||||||
"pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
|
"pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
|
||||||
"replacement": "$3-$2-$1 $4"
|
"replacement": "$3-$2-$1 $4"
|
||||||
},
|
},
|
||||||
"categories": ["politique", "economie", "culture", "sport", "societe"],
|
"categories": ["politique", "economie", "culture", "sport", "societe"],
|
||||||
"source_selectors": {
|
"sourceSelectors": {
|
||||||
"articles": ".view-content > .row.views-row",
|
"articles": ".view-content > .row.views-row",
|
||||||
"article_title": ".views-field-title a",
|
"articleTitle": ".views-field-title a",
|
||||||
"article_link": ".views-field-title a",
|
"articleLink": ".views-field-title a",
|
||||||
"article_body": ".field.field--name-body",
|
"articleBody": ".field.field--name-body",
|
||||||
"article_date": ".views-field-created",
|
"articleDate": ".views-field-created",
|
||||||
"pagination": "ul.pagination > li.pager__item.pager__item--last > a"
|
"pagination": "ul.pagination > li.pager__item.pager__item--last > a"
|
||||||
},
|
},
|
||||||
"pagination_template": "index.php/category/{category}",
|
"paginationTemplate": "index.php/category/{category}",
|
||||||
"supports_categories": true,
|
"supportsCategories": true,
|
||||||
"requires_details": false,
|
"requiresDetails": false,
|
||||||
"requires_rate_limit": false
|
"requiresRateLimit": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "mediacongo.net",
|
"sourceId": "mediacongo.net",
|
||||||
"source_url": "https://www.mediacongo.net",
|
"sourceUrl": "https://www.mediacongo.net",
|
||||||
"source_date": {
|
"sourceDate": {
|
||||||
"format": "%d.%m.%Y %H:%M"
|
"format": "%d.%m.%Y %H:%M"
|
||||||
},
|
},
|
||||||
"source_selectors": {
|
"sourceSelectors": {
|
||||||
"articles": ".for_aitems > .article_other_item",
|
"articles": ".for_aitems > .article_other_item",
|
||||||
"article_title": "img",
|
"articleTitle": "img",
|
||||||
"article_link": "a:first-child",
|
"articleLink": "a:first-child",
|
||||||
"article_categories": "a.color_link",
|
"articleCategories": "a.color_link",
|
||||||
"article_body": ".article_ttext",
|
"articleBody": ".article_ttext",
|
||||||
"article_date": ".article_other_about",
|
"articleDate": ".article_other_about",
|
||||||
"pagination": "div.pagination > div > a:last-child"
|
"pagination": "div.pagination > div > a:last-child"
|
||||||
},
|
},
|
||||||
"pagination_template": "articles.html",
|
"paginationTemplate": "articles.html",
|
||||||
"supports_categories": false,
|
"supportsCategories": false,
|
||||||
"requires_details": true,
|
"requiresDetails": true,
|
||||||
"requires_rate_limit": false
|
"requiresRateLimit": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "actualite.cd",
|
"sourceId": "actualite.cd",
|
||||||
"source_url": "https://actualite.cd",
|
"sourceUrl": "https://actualite.cd",
|
||||||
"source_date": {
|
"sourceDate": {
|
||||||
"pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/",
|
"pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/",
|
||||||
"replacement": "$4-$3-$2 $5"
|
"replacement": "$4-$3-$2 $5"
|
||||||
},
|
},
|
||||||
"source_selectors": {
|
"sourceSelectors": {
|
||||||
"articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
|
"articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
|
||||||
"article_title": "#actu-titre a",
|
"articleTitle": "#actu-titre a",
|
||||||
"article_link": "#actu-titre a",
|
"articleLink": "#actu-titre a",
|
||||||
"article_categories": "#actu-cat a",
|
"articleCategories": "#actu-cat a",
|
||||||
"article_body": ".views-field.views-field-body",
|
"articleBody": ".views-field.views-field-body",
|
||||||
"article_date": "#p-date"
|
"articleDate": "#p-date"
|
||||||
},
|
},
|
||||||
"pagination_template": "actualite",
|
"paginationTemplate": "actualite",
|
||||||
"supports_categories": false,
|
"supportsCategories": false,
|
||||||
"requires_details": true,
|
"requiresDetails": true,
|
||||||
"requires_rate_limit": false
|
"requiresRateLimit": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"wordpress": [
|
"wordpress": [
|
||||||
{
|
{
|
||||||
"source_id": "beto.cd",
|
"sourceId": "beto.cd",
|
||||||
"source_url": "https://beto.cd",
|
"sourceUrl": "https://beto.cd",
|
||||||
"requires_rate_limit": true
|
"requiresRateLimit": true
|
||||||
},
|
},
|
||||||
{ "source_id": "newscd.net", "source_url": "https://newscd.net" },
|
{ "sourceId": "newscd.net", "sourceUrl": "https://newscd.net" },
|
||||||
{
|
{
|
||||||
"source_id": "africanewsrdc.net",
|
"sourceId": "africanewsrdc.net",
|
||||||
"source_url": "https://www.africanewsrdc.net"
|
"sourceUrl": "https://www.africanewsrdc.net"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "angazainstitute.ac.cd",
|
"sourceId": "angazainstitute.ac.cd",
|
||||||
"source_url": "https://angazainstitute.ac.cd"
|
"sourceUrl": "https://angazainstitute.ac.cd"
|
||||||
},
|
},
|
||||||
{ "source_id": "b-onetv.cd", "source_url": "https://b-onetv.cd" },
|
{ "sourceId": "b-onetv.cd", "sourceUrl": "https://b-onetv.cd" },
|
||||||
{
|
{
|
||||||
"source_id": "bukavufm.com",
|
"sourceId": "bukavufm.com",
|
||||||
"source_url": "https://bukavufm.com"
|
"sourceUrl": "https://bukavufm.com"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "changement7.net",
|
"sourceId": "changement7.net",
|
||||||
"source_url": "https://changement7.net"
|
"sourceUrl": "https://changement7.net"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "congoactu.net",
|
"sourceId": "congoactu.net",
|
||||||
"source_url": "https://congoactu.net"
|
"sourceUrl": "https://congoactu.net"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "congoindependant.com",
|
"sourceId": "congoindependant.com",
|
||||||
"source_url": "https://www.congoindependant.com"
|
"sourceUrl": "https://www.congoindependant.com"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "congoquotidien.com",
|
"sourceId": "congoquotidien.com",
|
||||||
"source_url": "https://www.congoquotidien.com"
|
"sourceUrl": "https://www.congoquotidien.com"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "cumulard.cd",
|
"sourceId": "cumulard.cd",
|
||||||
"source_url": "https://www.cumulard.cd"
|
"sourceUrl": "https://www.cumulard.cd"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "environews-rdc.net",
|
"sourceId": "environews-rdc.net",
|
||||||
"source_url": "https://environews-rdc.net"
|
"sourceUrl": "https://environews-rdc.net"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "freemediardc.info",
|
"sourceId": "freemediardc.info",
|
||||||
"source_url": "https://www.freemediardc.info"
|
"sourceUrl": "https://www.freemediardc.info"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "geopolismagazine.org",
|
"sourceId": "geopolismagazine.org",
|
||||||
"source_url": "https://geopolismagazine.org"
|
"sourceUrl": "https://geopolismagazine.org"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "habarirdc.net",
|
"sourceId": "habarirdc.net",
|
||||||
"source_url": "https://habarirdc.net"
|
"sourceUrl": "https://habarirdc.net"
|
||||||
},
|
},
|
||||||
{ "source_id": "infordc.com", "source_url": "https://infordc.com" },
|
{ "sourceId": "infordc.com", "sourceUrl": "https://infordc.com" },
|
||||||
{
|
{
|
||||||
"source_id": "kilalopress.net",
|
"sourceId": "kilalopress.net",
|
||||||
"source_url": "https://kilalopress.net"
|
"sourceUrl": "https://kilalopress.net"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "laprosperiteonline.net",
|
"sourceId": "laprosperiteonline.net",
|
||||||
"source_url": "https://laprosperiteonline.net"
|
"sourceUrl": "https://laprosperiteonline.net"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "laprunellerdc.cd",
|
"sourceId": "laprunellerdc.cd",
|
||||||
"source_url": "https://laprunellerdc.cd"
|
"sourceUrl": "https://laprunellerdc.cd"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "lesmedias.net",
|
"sourceId": "lesmedias.net",
|
||||||
"source_url": "https://lesmedias.net"
|
"sourceUrl": "https://lesmedias.net"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "lesvolcansnews.net",
|
"sourceId": "lesvolcansnews.net",
|
||||||
"source_url": "https://lesvolcansnews.net"
|
"sourceUrl": "https://lesvolcansnews.net"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "netic-news.net",
|
"sourceId": "netic-news.net",
|
||||||
"source_url": "https://www.netic-news.net"
|
"sourceUrl": "https://www.netic-news.net"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "objectif-infos.cd",
|
"sourceId": "objectif-infos.cd",
|
||||||
"source_url": "https://objectif-infos.cd"
|
"sourceUrl": "https://objectif-infos.cd"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "scooprdc.net",
|
"sourceId": "scooprdc.net",
|
||||||
"source_url": "https://scooprdc.net"
|
"sourceUrl": "https://scooprdc.net"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "journaldekinshasa.com",
|
"sourceId": "journaldekinshasa.com",
|
||||||
"source_url": "https://www.journaldekinshasa.com"
|
"sourceUrl": "https://www.journaldekinshasa.com"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source_id": "lepotentiel.cd",
|
"sourceId": "lepotentiel.cd",
|
||||||
"source_url": "https://lepotentiel.cd"
|
"sourceUrl": "https://lepotentiel.cd"
|
||||||
},
|
},
|
||||||
{ "source_id": "acturdc.com", "source_url": "https://acturdc.com" },
|
{ "sourceId": "acturdc.com", "sourceUrl": "https://acturdc.com" },
|
||||||
{
|
{
|
||||||
"source_id": "matininfos.net",
|
"sourceId": "matininfos.net",
|
||||||
"source_url": "https://matininfos.net"
|
"sourceUrl": "https://matininfos.net"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,6 +22,7 @@
|
|||||||
"node-html-parser": "^7.0.1",
|
"node-html-parser": "^7.0.1",
|
||||||
"tiktoken": "^1.0.14",
|
"tiktoken": "^1.0.14",
|
||||||
"turndown": "^7.2.2",
|
"turndown": "^7.2.2",
|
||||||
|
"yaml": "^2.8.1",
|
||||||
"zod": "catalog:"
|
"zod": "catalog:"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ import {
|
|||||||
WordPressSourceConfigSchema,
|
WordPressSourceConfigSchema,
|
||||||
} from "@/schema";
|
} from "@/schema";
|
||||||
|
|
||||||
export const PROJECT_DIR = path.resolve(process.cwd(), "basango", "apps", "crawler");
|
export const PROJECT_DIR = path.resolve(__dirname, "../");
|
||||||
|
|
||||||
export const PipelineConfigSchema = z.object({
|
export const PipelineConfigSchema = z.object({
|
||||||
paths: z.object({
|
paths: z.object({
|
||||||
|
|||||||
@@ -22,6 +22,7 @@
|
|||||||
"node-html-parser": "^7.0.1",
|
"node-html-parser": "^7.0.1",
|
||||||
"tiktoken": "^1.0.14",
|
"tiktoken": "^1.0.14",
|
||||||
"turndown": "^7.2.2",
|
"turndown": "^7.2.2",
|
||||||
|
"yaml": "^2.8.1",
|
||||||
"zod": "catalog:",
|
"zod": "catalog:",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@@ -451,6 +452,8 @@
|
|||||||
|
|
||||||
"xtend": ["xtend@4.0.2", "", {}, "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ=="],
|
"xtend": ["xtend@4.0.2", "", {}, "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ=="],
|
||||||
|
|
||||||
|
"yaml": ["yaml@2.8.1", "", { "bin": { "yaml": "bin.mjs" } }, "sha512-lcYcMxX2PO9XMGvAJkJ3OsNMw+/7FKes7/hgerGUYWIoWu5j/+YQqcZr5JnPZWzOsEBgMbSbiSTn/dv/69Mkpw=="],
|
||||||
|
|
||||||
"yocto-queue": ["yocto-queue@1.2.1", "", {}, "sha512-AyeEbWOu/TAXdxlV9wmGcR0+yh2j3vYPGOECcIj2S7MkrLyC7ne+oye2BKTItt0ii2PHk4cDy+95+LshzbXnGg=="],
|
"yocto-queue": ["yocto-queue@1.2.1", "", {}, "sha512-AyeEbWOu/TAXdxlV9wmGcR0+yh2j3vYPGOECcIj2S7MkrLyC7ne+oye2BKTItt0ii2PHk4cDy+95+LshzbXnGg=="],
|
||||||
|
|
||||||
"zod": ["zod@4.1.12", "", {}, "sha512-JInaHOamG8pt5+Ey8kGmdcAcg3OL9reK8ltczgHTAwNhMys/6ThXHityHxVV2p3fkw/c+MAvBHFVYHFZDmjMCQ=="],
|
"zod": ["zod@4.1.12", "", {}, "sha512-JInaHOamG8pt5+Ey8kGmdcAcg3OL9reK8ltczgHTAwNhMys/6ThXHityHxVV2p3fkw/c+MAvBHFVYHFZDmjMCQ=="],
|
||||||
|
|||||||
Reference in New Issue
Block a user