[crawler]: fix configuration format

This commit is contained in:
2025-11-04 16:38:30 +02:00
parent dad5e343d1
commit 13dd1b09ee
5 changed files with 111 additions and 107 deletions
+2 -2
View File
@@ -20,9 +20,9 @@
},
"crawler": {
"notify": false,
"use_multi_threading": false,
"useMultiThreading": false,
"maxWorkers": 5,
"direction": "%env(BASANGO_CRAWLER_DEFAULT_DIRECTION)%"
"direction": "%env(BASANGO_CRAWLER_UPDATE_DIRECTION)%"
},
"async": {
"redisUrl": "%env(BASANGO_CRAWLER_ASYNC_REDIS_URL)%",
+104 -104
View File
@@ -2,189 +2,189 @@
"sources": {
"html": [
{
"source_id": "radiookapi.net",
"source_url": "https://www.radiookapi.net",
"source_date": {
"sourceId": "radiookapi.net",
"sourceUrl": "https://www.radiookapi.net",
"sourceDate": {
"pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$3-$2-$1 $4"
},
"source_selectors": {
"sourceSelectors": {
"articles": ".view-content > .views-row.content-row",
"article_title": "h1.page-header",
"article_link": ".views-field-title a",
"article_body": ".field-name-body",
"article_date": ".views-field-created",
"article_categories": ".views-field-field-cat-gorie a",
"articleTitle": "h1.page-header",
"articleLink": ".views-field-title a",
"articleBody": ".field-name-body",
"articleDate": ".views-field-created",
"articleCategories": ".views-field-field-cat-gorie a",
"pagination": "ul.pagination > li.pager-last > a"
},
"pagination_template": "actualite",
"supports_categories": false,
"requires_details": true,
"requires_rate_limit": false
"paginationTemplate": "actualite",
"supportsCategories": false,
"requiresDetails": true,
"requiresRateLimit": false
},
{
"source_id": "7sur7.cd",
"source_url": "https://7sur7.cd",
"source_date": {
"sourceId": "7sur7.cd",
"sourceUrl": "https://7sur7.cd",
"sourceDate": {
"pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$3-$2-$1 $4"
},
"categories": ["politique", "economie", "culture", "sport", "societe"],
"source_selectors": {
"sourceSelectors": {
"articles": ".view-content > .row.views-row",
"article_title": ".views-field-title a",
"article_link": ".views-field-title a",
"article_body": ".field.field--name-body",
"article_date": ".views-field-created",
"articleTitle": ".views-field-title a",
"articleLink": ".views-field-title a",
"articleBody": ".field.field--name-body",
"articleDate": ".views-field-created",
"pagination": "ul.pagination > li.pager__item.pager__item--last > a"
},
"pagination_template": "index.php/category/{category}",
"supports_categories": true,
"requires_details": false,
"requires_rate_limit": false
"paginationTemplate": "index.php/category/{category}",
"supportsCategories": true,
"requiresDetails": false,
"requiresRateLimit": false
},
{
"source_id": "mediacongo.net",
"source_url": "https://www.mediacongo.net",
"source_date": {
"sourceId": "mediacongo.net",
"sourceUrl": "https://www.mediacongo.net",
"sourceDate": {
"format": "%d.%m.%Y %H:%M"
},
"source_selectors": {
"sourceSelectors": {
"articles": ".for_aitems > .article_other_item",
"article_title": "img",
"article_link": "a:first-child",
"article_categories": "a.color_link",
"article_body": ".article_ttext",
"article_date": ".article_other_about",
"articleTitle": "img",
"articleLink": "a:first-child",
"articleCategories": "a.color_link",
"articleBody": ".article_ttext",
"articleDate": ".article_other_about",
"pagination": "div.pagination > div > a:last-child"
},
"pagination_template": "articles.html",
"supports_categories": false,
"requires_details": true,
"requires_rate_limit": false
"paginationTemplate": "articles.html",
"supportsCategories": false,
"requiresDetails": true,
"requiresRateLimit": false
},
{
"source_id": "actualite.cd",
"source_url": "https://actualite.cd",
"source_date": {
"sourceId": "actualite.cd",
"sourceUrl": "https://actualite.cd",
"sourceDate": {
"pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$4-$3-$2 $5"
},
"source_selectors": {
"sourceSelectors": {
"articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
"article_title": "#actu-titre a",
"article_link": "#actu-titre a",
"article_categories": "#actu-cat a",
"article_body": ".views-field.views-field-body",
"article_date": "#p-date"
"articleTitle": "#actu-titre a",
"articleLink": "#actu-titre a",
"articleCategories": "#actu-cat a",
"articleBody": ".views-field.views-field-body",
"articleDate": "#p-date"
},
"pagination_template": "actualite",
"supports_categories": false,
"requires_details": true,
"requires_rate_limit": false
"paginationTemplate": "actualite",
"supportsCategories": false,
"requiresDetails": true,
"requiresRateLimit": false
}
],
"wordpress": [
{
"source_id": "beto.cd",
"source_url": "https://beto.cd",
"requires_rate_limit": true
"sourceId": "beto.cd",
"sourceUrl": "https://beto.cd",
"requiresRateLimit": true
},
{ "source_id": "newscd.net", "source_url": "https://newscd.net" },
{ "sourceId": "newscd.net", "sourceUrl": "https://newscd.net" },
{
"source_id": "africanewsrdc.net",
"source_url": "https://www.africanewsrdc.net"
"sourceId": "africanewsrdc.net",
"sourceUrl": "https://www.africanewsrdc.net"
},
{
"source_id": "angazainstitute.ac.cd",
"source_url": "https://angazainstitute.ac.cd"
"sourceId": "angazainstitute.ac.cd",
"sourceUrl": "https://angazainstitute.ac.cd"
},
{ "source_id": "b-onetv.cd", "source_url": "https://b-onetv.cd" },
{ "sourceId": "b-onetv.cd", "sourceUrl": "https://b-onetv.cd" },
{
"source_id": "bukavufm.com",
"source_url": "https://bukavufm.com"
"sourceId": "bukavufm.com",
"sourceUrl": "https://bukavufm.com"
},
{
"source_id": "changement7.net",
"source_url": "https://changement7.net"
"sourceId": "changement7.net",
"sourceUrl": "https://changement7.net"
},
{
"source_id": "congoactu.net",
"source_url": "https://congoactu.net"
"sourceId": "congoactu.net",
"sourceUrl": "https://congoactu.net"
},
{
"source_id": "congoindependant.com",
"source_url": "https://www.congoindependant.com"
"sourceId": "congoindependant.com",
"sourceUrl": "https://www.congoindependant.com"
},
{
"source_id": "congoquotidien.com",
"source_url": "https://www.congoquotidien.com"
"sourceId": "congoquotidien.com",
"sourceUrl": "https://www.congoquotidien.com"
},
{
"source_id": "cumulard.cd",
"source_url": "https://www.cumulard.cd"
"sourceId": "cumulard.cd",
"sourceUrl": "https://www.cumulard.cd"
},
{
"source_id": "environews-rdc.net",
"source_url": "https://environews-rdc.net"
"sourceId": "environews-rdc.net",
"sourceUrl": "https://environews-rdc.net"
},
{
"source_id": "freemediardc.info",
"source_url": "https://www.freemediardc.info"
"sourceId": "freemediardc.info",
"sourceUrl": "https://www.freemediardc.info"
},
{
"source_id": "geopolismagazine.org",
"source_url": "https://geopolismagazine.org"
"sourceId": "geopolismagazine.org",
"sourceUrl": "https://geopolismagazine.org"
},
{
"source_id": "habarirdc.net",
"source_url": "https://habarirdc.net"
"sourceId": "habarirdc.net",
"sourceUrl": "https://habarirdc.net"
},
{ "source_id": "infordc.com", "source_url": "https://infordc.com" },
{ "sourceId": "infordc.com", "sourceUrl": "https://infordc.com" },
{
"source_id": "kilalopress.net",
"source_url": "https://kilalopress.net"
"sourceId": "kilalopress.net",
"sourceUrl": "https://kilalopress.net"
},
{
"source_id": "laprosperiteonline.net",
"source_url": "https://laprosperiteonline.net"
"sourceId": "laprosperiteonline.net",
"sourceUrl": "https://laprosperiteonline.net"
},
{
"source_id": "laprunellerdc.cd",
"source_url": "https://laprunellerdc.cd"
"sourceId": "laprunellerdc.cd",
"sourceUrl": "https://laprunellerdc.cd"
},
{
"source_id": "lesmedias.net",
"source_url": "https://lesmedias.net"
"sourceId": "lesmedias.net",
"sourceUrl": "https://lesmedias.net"
},
{
"source_id": "lesvolcansnews.net",
"source_url": "https://lesvolcansnews.net"
"sourceId": "lesvolcansnews.net",
"sourceUrl": "https://lesvolcansnews.net"
},
{
"source_id": "netic-news.net",
"source_url": "https://www.netic-news.net"
"sourceId": "netic-news.net",
"sourceUrl": "https://www.netic-news.net"
},
{
"source_id": "objectif-infos.cd",
"source_url": "https://objectif-infos.cd"
"sourceId": "objectif-infos.cd",
"sourceUrl": "https://objectif-infos.cd"
},
{
"source_id": "scooprdc.net",
"source_url": "https://scooprdc.net"
"sourceId": "scooprdc.net",
"sourceUrl": "https://scooprdc.net"
},
{
"source_id": "journaldekinshasa.com",
"source_url": "https://www.journaldekinshasa.com"
"sourceId": "journaldekinshasa.com",
"sourceUrl": "https://www.journaldekinshasa.com"
},
{
"source_id": "lepotentiel.cd",
"source_url": "https://lepotentiel.cd"
"sourceId": "lepotentiel.cd",
"sourceUrl": "https://lepotentiel.cd"
},
{ "source_id": "acturdc.com", "source_url": "https://acturdc.com" },
{ "sourceId": "acturdc.com", "sourceUrl": "https://acturdc.com" },
{
"source_id": "matininfos.net",
"source_url": "https://matininfos.net"
"sourceId": "matininfos.net",
"sourceUrl": "https://matininfos.net"
}
]
}
+1
View File
@@ -22,6 +22,7 @@
"node-html-parser": "^7.0.1",
"tiktoken": "^1.0.14",
"turndown": "^7.2.2",
"yaml": "^2.8.1",
"zod": "catalog:"
}
}
+1 -1
View File
@@ -10,7 +10,7 @@ import {
WordPressSourceConfigSchema,
} from "@/schema";
export const PROJECT_DIR = path.resolve(process.cwd(), "basango", "apps", "crawler");
export const PROJECT_DIR = path.resolve(__dirname, "../");
export const PipelineConfigSchema = z.object({
paths: z.object({
+3
View File
@@ -22,6 +22,7 @@
"node-html-parser": "^7.0.1",
"tiktoken": "^1.0.14",
"turndown": "^7.2.2",
"yaml": "^2.8.1",
"zod": "catalog:",
},
},
@@ -451,6 +452,8 @@
"xtend": ["xtend@4.0.2", "", {}, "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ=="],
"yaml": ["yaml@2.8.1", "", { "bin": { "yaml": "bin.mjs" } }, "sha512-lcYcMxX2PO9XMGvAJkJ3OsNMw+/7FKes7/hgerGUYWIoWu5j/+YQqcZr5JnPZWzOsEBgMbSbiSTn/dv/69Mkpw=="],
"yocto-queue": ["yocto-queue@1.2.1", "", {}, "sha512-AyeEbWOu/TAXdxlV9wmGcR0+yh2j3vYPGOECcIj2S7MkrLyC7ne+oye2BKTItt0ii2PHk4cDy+95+LshzbXnGg=="],
"zod": ["zod@4.1.12", "", {}, "sha512-JInaHOamG8pt5+Ey8kGmdcAcg3OL9reK8ltczgHTAwNhMys/6ThXHityHxVV2p3fkw/c+MAvBHFVYHFZDmjMCQ=="],