[crawler]: fix configuration format

This commit is contained in:
2025-11-04 16:38:30 +02:00
parent dad5e343d1
commit 13dd1b09ee
5 changed files with 111 additions and 107 deletions
+2 -2
View File
@@ -20,9 +20,9 @@
}, },
"crawler": { "crawler": {
"notify": false, "notify": false,
"use_multi_threading": false, "useMultiThreading": false,
"maxWorkers": 5, "maxWorkers": 5,
"direction": "%env(BASANGO_CRAWLER_DEFAULT_DIRECTION)%" "direction": "%env(BASANGO_CRAWLER_UPDATE_DIRECTION)%"
}, },
"async": { "async": {
"redisUrl": "%env(BASANGO_CRAWLER_ASYNC_REDIS_URL)%", "redisUrl": "%env(BASANGO_CRAWLER_ASYNC_REDIS_URL)%",
+104 -104
View File
@@ -2,189 +2,189 @@
"sources": { "sources": {
"html": [ "html": [
{ {
"source_id": "radiookapi.net", "sourceId": "radiookapi.net",
"source_url": "https://www.radiookapi.net", "sourceUrl": "https://www.radiookapi.net",
"source_date": { "sourceDate": {
"pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/", "pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$3-$2-$1 $4" "replacement": "$3-$2-$1 $4"
}, },
"source_selectors": { "sourceSelectors": {
"articles": ".view-content > .views-row.content-row", "articles": ".view-content > .views-row.content-row",
"article_title": "h1.page-header", "articleTitle": "h1.page-header",
"article_link": ".views-field-title a", "articleLink": ".views-field-title a",
"article_body": ".field-name-body", "articleBody": ".field-name-body",
"article_date": ".views-field-created", "articleDate": ".views-field-created",
"article_categories": ".views-field-field-cat-gorie a", "articleCategories": ".views-field-field-cat-gorie a",
"pagination": "ul.pagination > li.pager-last > a" "pagination": "ul.pagination > li.pager-last > a"
}, },
"pagination_template": "actualite", "paginationTemplate": "actualite",
"supports_categories": false, "supportsCategories": false,
"requires_details": true, "requiresDetails": true,
"requires_rate_limit": false "requiresRateLimit": false
}, },
{ {
"source_id": "7sur7.cd", "sourceId": "7sur7.cd",
"source_url": "https://7sur7.cd", "sourceUrl": "https://7sur7.cd",
"source_date": { "sourceDate": {
"pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/", "pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$3-$2-$1 $4" "replacement": "$3-$2-$1 $4"
}, },
"categories": ["politique", "economie", "culture", "sport", "societe"], "categories": ["politique", "economie", "culture", "sport", "societe"],
"source_selectors": { "sourceSelectors": {
"articles": ".view-content > .row.views-row", "articles": ".view-content > .row.views-row",
"article_title": ".views-field-title a", "articleTitle": ".views-field-title a",
"article_link": ".views-field-title a", "articleLink": ".views-field-title a",
"article_body": ".field.field--name-body", "articleBody": ".field.field--name-body",
"article_date": ".views-field-created", "articleDate": ".views-field-created",
"pagination": "ul.pagination > li.pager__item.pager__item--last > a" "pagination": "ul.pagination > li.pager__item.pager__item--last > a"
}, },
"pagination_template": "index.php/category/{category}", "paginationTemplate": "index.php/category/{category}",
"supports_categories": true, "supportsCategories": true,
"requires_details": false, "requiresDetails": false,
"requires_rate_limit": false "requiresRateLimit": false
}, },
{ {
"source_id": "mediacongo.net", "sourceId": "mediacongo.net",
"source_url": "https://www.mediacongo.net", "sourceUrl": "https://www.mediacongo.net",
"source_date": { "sourceDate": {
"format": "%d.%m.%Y %H:%M" "format": "%d.%m.%Y %H:%M"
}, },
"source_selectors": { "sourceSelectors": {
"articles": ".for_aitems > .article_other_item", "articles": ".for_aitems > .article_other_item",
"article_title": "img", "articleTitle": "img",
"article_link": "a:first-child", "articleLink": "a:first-child",
"article_categories": "a.color_link", "articleCategories": "a.color_link",
"article_body": ".article_ttext", "articleBody": ".article_ttext",
"article_date": ".article_other_about", "articleDate": ".article_other_about",
"pagination": "div.pagination > div > a:last-child" "pagination": "div.pagination > div > a:last-child"
}, },
"pagination_template": "articles.html", "paginationTemplate": "articles.html",
"supports_categories": false, "supportsCategories": false,
"requires_details": true, "requiresDetails": true,
"requires_rate_limit": false "requiresRateLimit": false
}, },
{ {
"source_id": "actualite.cd", "sourceId": "actualite.cd",
"source_url": "https://actualite.cd", "sourceUrl": "https://actualite.cd",
"source_date": { "sourceDate": {
"pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/", "pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$4-$3-$2 $5" "replacement": "$4-$3-$2 $5"
}, },
"source_selectors": { "sourceSelectors": {
"articles": "#views-bootstrap-taxonomy-term-page-2 > div > div", "articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
"article_title": "#actu-titre a", "articleTitle": "#actu-titre a",
"article_link": "#actu-titre a", "articleLink": "#actu-titre a",
"article_categories": "#actu-cat a", "articleCategories": "#actu-cat a",
"article_body": ".views-field.views-field-body", "articleBody": ".views-field.views-field-body",
"article_date": "#p-date" "articleDate": "#p-date"
}, },
"pagination_template": "actualite", "paginationTemplate": "actualite",
"supports_categories": false, "supportsCategories": false,
"requires_details": true, "requiresDetails": true,
"requires_rate_limit": false "requiresRateLimit": false
} }
], ],
"wordpress": [ "wordpress": [
{ {
"source_id": "beto.cd", "sourceId": "beto.cd",
"source_url": "https://beto.cd", "sourceUrl": "https://beto.cd",
"requires_rate_limit": true "requiresRateLimit": true
}, },
{ "source_id": "newscd.net", "source_url": "https://newscd.net" }, { "sourceId": "newscd.net", "sourceUrl": "https://newscd.net" },
{ {
"source_id": "africanewsrdc.net", "sourceId": "africanewsrdc.net",
"source_url": "https://www.africanewsrdc.net" "sourceUrl": "https://www.africanewsrdc.net"
}, },
{ {
"source_id": "angazainstitute.ac.cd", "sourceId": "angazainstitute.ac.cd",
"source_url": "https://angazainstitute.ac.cd" "sourceUrl": "https://angazainstitute.ac.cd"
}, },
{ "source_id": "b-onetv.cd", "source_url": "https://b-onetv.cd" }, { "sourceId": "b-onetv.cd", "sourceUrl": "https://b-onetv.cd" },
{ {
"source_id": "bukavufm.com", "sourceId": "bukavufm.com",
"source_url": "https://bukavufm.com" "sourceUrl": "https://bukavufm.com"
}, },
{ {
"source_id": "changement7.net", "sourceId": "changement7.net",
"source_url": "https://changement7.net" "sourceUrl": "https://changement7.net"
}, },
{ {
"source_id": "congoactu.net", "sourceId": "congoactu.net",
"source_url": "https://congoactu.net" "sourceUrl": "https://congoactu.net"
}, },
{ {
"source_id": "congoindependant.com", "sourceId": "congoindependant.com",
"source_url": "https://www.congoindependant.com" "sourceUrl": "https://www.congoindependant.com"
}, },
{ {
"source_id": "congoquotidien.com", "sourceId": "congoquotidien.com",
"source_url": "https://www.congoquotidien.com" "sourceUrl": "https://www.congoquotidien.com"
}, },
{ {
"source_id": "cumulard.cd", "sourceId": "cumulard.cd",
"source_url": "https://www.cumulard.cd" "sourceUrl": "https://www.cumulard.cd"
}, },
{ {
"source_id": "environews-rdc.net", "sourceId": "environews-rdc.net",
"source_url": "https://environews-rdc.net" "sourceUrl": "https://environews-rdc.net"
}, },
{ {
"source_id": "freemediardc.info", "sourceId": "freemediardc.info",
"source_url": "https://www.freemediardc.info" "sourceUrl": "https://www.freemediardc.info"
}, },
{ {
"source_id": "geopolismagazine.org", "sourceId": "geopolismagazine.org",
"source_url": "https://geopolismagazine.org" "sourceUrl": "https://geopolismagazine.org"
}, },
{ {
"source_id": "habarirdc.net", "sourceId": "habarirdc.net",
"source_url": "https://habarirdc.net" "sourceUrl": "https://habarirdc.net"
}, },
{ "source_id": "infordc.com", "source_url": "https://infordc.com" }, { "sourceId": "infordc.com", "sourceUrl": "https://infordc.com" },
{ {
"source_id": "kilalopress.net", "sourceId": "kilalopress.net",
"source_url": "https://kilalopress.net" "sourceUrl": "https://kilalopress.net"
}, },
{ {
"source_id": "laprosperiteonline.net", "sourceId": "laprosperiteonline.net",
"source_url": "https://laprosperiteonline.net" "sourceUrl": "https://laprosperiteonline.net"
}, },
{ {
"source_id": "laprunellerdc.cd", "sourceId": "laprunellerdc.cd",
"source_url": "https://laprunellerdc.cd" "sourceUrl": "https://laprunellerdc.cd"
}, },
{ {
"source_id": "lesmedias.net", "sourceId": "lesmedias.net",
"source_url": "https://lesmedias.net" "sourceUrl": "https://lesmedias.net"
}, },
{ {
"source_id": "lesvolcansnews.net", "sourceId": "lesvolcansnews.net",
"source_url": "https://lesvolcansnews.net" "sourceUrl": "https://lesvolcansnews.net"
}, },
{ {
"source_id": "netic-news.net", "sourceId": "netic-news.net",
"source_url": "https://www.netic-news.net" "sourceUrl": "https://www.netic-news.net"
}, },
{ {
"source_id": "objectif-infos.cd", "sourceId": "objectif-infos.cd",
"source_url": "https://objectif-infos.cd" "sourceUrl": "https://objectif-infos.cd"
}, },
{ {
"source_id": "scooprdc.net", "sourceId": "scooprdc.net",
"source_url": "https://scooprdc.net" "sourceUrl": "https://scooprdc.net"
}, },
{ {
"source_id": "journaldekinshasa.com", "sourceId": "journaldekinshasa.com",
"source_url": "https://www.journaldekinshasa.com" "sourceUrl": "https://www.journaldekinshasa.com"
}, },
{ {
"source_id": "lepotentiel.cd", "sourceId": "lepotentiel.cd",
"source_url": "https://lepotentiel.cd" "sourceUrl": "https://lepotentiel.cd"
}, },
{ "source_id": "acturdc.com", "source_url": "https://acturdc.com" }, { "sourceId": "acturdc.com", "sourceUrl": "https://acturdc.com" },
{ {
"source_id": "matininfos.net", "sourceId": "matininfos.net",
"source_url": "https://matininfos.net" "sourceUrl": "https://matininfos.net"
} }
] ]
} }
+1
View File
@@ -22,6 +22,7 @@
"node-html-parser": "^7.0.1", "node-html-parser": "^7.0.1",
"tiktoken": "^1.0.14", "tiktoken": "^1.0.14",
"turndown": "^7.2.2", "turndown": "^7.2.2",
"yaml": "^2.8.1",
"zod": "catalog:" "zod": "catalog:"
} }
} }
+1 -1
View File
@@ -10,7 +10,7 @@ import {
WordPressSourceConfigSchema, WordPressSourceConfigSchema,
} from "@/schema"; } from "@/schema";
export const PROJECT_DIR = path.resolve(process.cwd(), "basango", "apps", "crawler"); export const PROJECT_DIR = path.resolve(__dirname, "../");
export const PipelineConfigSchema = z.object({ export const PipelineConfigSchema = z.object({
paths: z.object({ paths: z.object({
+3
View File
@@ -22,6 +22,7 @@
"node-html-parser": "^7.0.1", "node-html-parser": "^7.0.1",
"tiktoken": "^1.0.14", "tiktoken": "^1.0.14",
"turndown": "^7.2.2", "turndown": "^7.2.2",
"yaml": "^2.8.1",
"zod": "catalog:", "zod": "catalog:",
}, },
}, },
@@ -451,6 +452,8 @@
"xtend": ["xtend@4.0.2", "", {}, "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ=="], "xtend": ["xtend@4.0.2", "", {}, "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ=="],
"yaml": ["yaml@2.8.1", "", { "bin": { "yaml": "bin.mjs" } }, "sha512-lcYcMxX2PO9XMGvAJkJ3OsNMw+/7FKes7/hgerGUYWIoWu5j/+YQqcZr5JnPZWzOsEBgMbSbiSTn/dv/69Mkpw=="],
"yocto-queue": ["yocto-queue@1.2.1", "", {}, "sha512-AyeEbWOu/TAXdxlV9wmGcR0+yh2j3vYPGOECcIj2S7MkrLyC7ne+oye2BKTItt0ii2PHk4cDy+95+LshzbXnGg=="], "yocto-queue": ["yocto-queue@1.2.1", "", {}, "sha512-AyeEbWOu/TAXdxlV9wmGcR0+yh2j3vYPGOECcIj2S7MkrLyC7ne+oye2BKTItt0ii2PHk4cDy+95+LshzbXnGg=="],
"zod": ["zod@4.1.12", "", {}, "sha512-JInaHOamG8pt5+Ey8kGmdcAcg3OL9reK8ltczgHTAwNhMys/6ThXHityHxVV2p3fkw/c+MAvBHFVYHFZDmjMCQ=="], "zod": ["zod@4.1.12", "", {}, "sha512-JInaHOamG8pt5+Ey8kGmdcAcg3OL9reK8ltczgHTAwNhMys/6ThXHityHxVV2p3fkw/c+MAvBHFVYHFZDmjMCQ=="],