diff --git a/basango/apps/crawler/config/pipeline.json b/basango/apps/crawler/config/pipeline.json index c173617..ab271eb 100644 --- a/basango/apps/crawler/config/pipeline.json +++ b/basango/apps/crawler/config/pipeline.json @@ -20,9 +20,9 @@ }, "crawler": { "notify": false, - "use_multi_threading": false, + "useMultiThreading": false, "maxWorkers": 5, - "direction": "%env(BASANGO_CRAWLER_DEFAULT_DIRECTION)%" + "direction": "%env(BASANGO_CRAWLER_UPDATE_DIRECTION)%" }, "async": { "redisUrl": "%env(BASANGO_CRAWLER_ASYNC_REDIS_URL)%", diff --git a/basango/apps/crawler/config/sources.json b/basango/apps/crawler/config/sources.json index 806322c..a09b7da 100644 --- a/basango/apps/crawler/config/sources.json +++ b/basango/apps/crawler/config/sources.json @@ -2,189 +2,189 @@ "sources": { "html": [ { - "source_id": "radiookapi.net", - "source_url": "https://www.radiookapi.net", - "source_date": { + "sourceId": "radiookapi.net", + "sourceUrl": "https://www.radiookapi.net", + "sourceDate": { "pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/", "replacement": "$3-$2-$1 $4" }, - "source_selectors": { + "sourceSelectors": { "articles": ".view-content > .views-row.content-row", - "article_title": "h1.page-header", - "article_link": ".views-field-title a", - "article_body": ".field-name-body", - "article_date": ".views-field-created", - "article_categories": ".views-field-field-cat-gorie a", + "articleTitle": "h1.page-header", + "articleLink": ".views-field-title a", + "articleBody": ".field-name-body", + "articleDate": ".views-field-created", + "articleCategories": ".views-field-field-cat-gorie a", "pagination": "ul.pagination > li.pager-last > a" }, - "pagination_template": "actualite", - "supports_categories": false, - "requires_details": true, - "requires_rate_limit": false + "paginationTemplate": "actualite", + "supportsCategories": false, + "requiresDetails": true, + "requiresRateLimit": false }, { - "source_id": "7sur7.cd", - "source_url": "https://7sur7.cd", - "source_date": { + "sourceId": "7sur7.cd", + "sourceUrl": "https://7sur7.cd", + "sourceDate": { "pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/", "replacement": "$3-$2-$1 $4" }, "categories": ["politique", "economie", "culture", "sport", "societe"], - "source_selectors": { + "sourceSelectors": { "articles": ".view-content > .row.views-row", - "article_title": ".views-field-title a", - "article_link": ".views-field-title a", - "article_body": ".field.field--name-body", - "article_date": ".views-field-created", + "articleTitle": ".views-field-title a", + "articleLink": ".views-field-title a", + "articleBody": ".field.field--name-body", + "articleDate": ".views-field-created", "pagination": "ul.pagination > li.pager__item.pager__item--last > a" }, - "pagination_template": "index.php/category/{category}", - "supports_categories": true, - "requires_details": false, - "requires_rate_limit": false + "paginationTemplate": "index.php/category/{category}", + "supportsCategories": true, + "requiresDetails": false, + "requiresRateLimit": false }, { - "source_id": "mediacongo.net", - "source_url": "https://www.mediacongo.net", - "source_date": { + "sourceId": "mediacongo.net", + "sourceUrl": "https://www.mediacongo.net", + "sourceDate": { "format": "%d.%m.%Y %H:%M" }, - "source_selectors": { + "sourceSelectors": { "articles": ".for_aitems > .article_other_item", - "article_title": "img", - "article_link": "a:first-child", - "article_categories": "a.color_link", - "article_body": ".article_ttext", - "article_date": ".article_other_about", + "articleTitle": "img", + "articleLink": "a:first-child", + "articleCategories": "a.color_link", + "articleBody": ".article_ttext", + "articleDate": ".article_other_about", "pagination": "div.pagination > div > a:last-child" }, - "pagination_template": "articles.html", - "supports_categories": false, - "requires_details": true, - "requires_rate_limit": false + "paginationTemplate": "articles.html", + "supportsCategories": false, + "requiresDetails": true, + "requiresRateLimit": false }, { - "source_id": "actualite.cd", - "source_url": "https://actualite.cd", - "source_date": { + "sourceId": "actualite.cd", + "sourceUrl": "https://actualite.cd", + "sourceDate": { "pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/", "replacement": "$4-$3-$2 $5" }, - "source_selectors": { + "sourceSelectors": { "articles": "#views-bootstrap-taxonomy-term-page-2 > div > div", - "article_title": "#actu-titre a", - "article_link": "#actu-titre a", - "article_categories": "#actu-cat a", - "article_body": ".views-field.views-field-body", - "article_date": "#p-date" + "articleTitle": "#actu-titre a", + "articleLink": "#actu-titre a", + "articleCategories": "#actu-cat a", + "articleBody": ".views-field.views-field-body", + "articleDate": "#p-date" }, - "pagination_template": "actualite", - "supports_categories": false, - "requires_details": true, - "requires_rate_limit": false + "paginationTemplate": "actualite", + "supportsCategories": false, + "requiresDetails": true, + "requiresRateLimit": false } ], "wordpress": [ { - "source_id": "beto.cd", - "source_url": "https://beto.cd", - "requires_rate_limit": true + "sourceId": "beto.cd", + "sourceUrl": "https://beto.cd", + "requiresRateLimit": true }, - { "source_id": "newscd.net", "source_url": "https://newscd.net" }, + { "sourceId": "newscd.net", "sourceUrl": "https://newscd.net" }, { - "source_id": "africanewsrdc.net", - "source_url": "https://www.africanewsrdc.net" + "sourceId": "africanewsrdc.net", + "sourceUrl": "https://www.africanewsrdc.net" }, { - "source_id": "angazainstitute.ac.cd", - "source_url": "https://angazainstitute.ac.cd" + "sourceId": "angazainstitute.ac.cd", + "sourceUrl": "https://angazainstitute.ac.cd" }, - { "source_id": "b-onetv.cd", "source_url": "https://b-onetv.cd" }, + { "sourceId": "b-onetv.cd", "sourceUrl": "https://b-onetv.cd" }, { - "source_id": "bukavufm.com", - "source_url": "https://bukavufm.com" + "sourceId": "bukavufm.com", + "sourceUrl": "https://bukavufm.com" }, { - "source_id": "changement7.net", - "source_url": "https://changement7.net" + "sourceId": "changement7.net", + "sourceUrl": "https://changement7.net" }, { - "source_id": "congoactu.net", - "source_url": "https://congoactu.net" + "sourceId": "congoactu.net", + "sourceUrl": "https://congoactu.net" }, { - "source_id": "congoindependant.com", - "source_url": "https://www.congoindependant.com" + "sourceId": "congoindependant.com", + "sourceUrl": "https://www.congoindependant.com" }, { - "source_id": "congoquotidien.com", - "source_url": "https://www.congoquotidien.com" + "sourceId": "congoquotidien.com", + "sourceUrl": "https://www.congoquotidien.com" }, { - "source_id": "cumulard.cd", - "source_url": "https://www.cumulard.cd" + "sourceId": "cumulard.cd", + "sourceUrl": "https://www.cumulard.cd" }, { - "source_id": "environews-rdc.net", - "source_url": "https://environews-rdc.net" + "sourceId": "environews-rdc.net", + "sourceUrl": "https://environews-rdc.net" }, { - "source_id": "freemediardc.info", - "source_url": "https://www.freemediardc.info" + "sourceId": "freemediardc.info", + "sourceUrl": "https://www.freemediardc.info" }, { - "source_id": "geopolismagazine.org", - "source_url": "https://geopolismagazine.org" + "sourceId": "geopolismagazine.org", + "sourceUrl": "https://geopolismagazine.org" }, { - "source_id": "habarirdc.net", - "source_url": "https://habarirdc.net" + "sourceId": "habarirdc.net", + "sourceUrl": "https://habarirdc.net" }, - { "source_id": "infordc.com", "source_url": "https://infordc.com" }, + { "sourceId": "infordc.com", "sourceUrl": "https://infordc.com" }, { - "source_id": "kilalopress.net", - "source_url": "https://kilalopress.net" + "sourceId": "kilalopress.net", + "sourceUrl": "https://kilalopress.net" }, { - "source_id": "laprosperiteonline.net", - "source_url": "https://laprosperiteonline.net" + "sourceId": "laprosperiteonline.net", + "sourceUrl": "https://laprosperiteonline.net" }, { - "source_id": "laprunellerdc.cd", - "source_url": "https://laprunellerdc.cd" + "sourceId": "laprunellerdc.cd", + "sourceUrl": "https://laprunellerdc.cd" }, { - "source_id": "lesmedias.net", - "source_url": "https://lesmedias.net" + "sourceId": "lesmedias.net", + "sourceUrl": "https://lesmedias.net" }, { - "source_id": "lesvolcansnews.net", - "source_url": "https://lesvolcansnews.net" + "sourceId": "lesvolcansnews.net", + "sourceUrl": "https://lesvolcansnews.net" }, { - "source_id": "netic-news.net", - "source_url": "https://www.netic-news.net" + "sourceId": "netic-news.net", + "sourceUrl": "https://www.netic-news.net" }, { - "source_id": "objectif-infos.cd", - "source_url": "https://objectif-infos.cd" + "sourceId": "objectif-infos.cd", + "sourceUrl": "https://objectif-infos.cd" }, { - "source_id": "scooprdc.net", - "source_url": "https://scooprdc.net" + "sourceId": "scooprdc.net", + "sourceUrl": "https://scooprdc.net" }, { - "source_id": "journaldekinshasa.com", - "source_url": "https://www.journaldekinshasa.com" + "sourceId": "journaldekinshasa.com", + "sourceUrl": "https://www.journaldekinshasa.com" }, { - "source_id": "lepotentiel.cd", - "source_url": "https://lepotentiel.cd" + "sourceId": "lepotentiel.cd", + "sourceUrl": "https://lepotentiel.cd" }, - { "source_id": "acturdc.com", "source_url": "https://acturdc.com" }, + { "sourceId": "acturdc.com", "sourceUrl": "https://acturdc.com" }, { - "source_id": "matininfos.net", - "source_url": "https://matininfos.net" + "sourceId": "matininfos.net", + "sourceUrl": "https://matininfos.net" } ] } diff --git a/basango/apps/crawler/package.json b/basango/apps/crawler/package.json index d4f5fc3..2f2d77f 100644 --- a/basango/apps/crawler/package.json +++ b/basango/apps/crawler/package.json @@ -22,6 +22,7 @@ "node-html-parser": "^7.0.1", "tiktoken": "^1.0.14", "turndown": "^7.2.2", + "yaml": "^2.8.1", "zod": "catalog:" } } diff --git a/basango/apps/crawler/src/config.ts b/basango/apps/crawler/src/config.ts index c7501df..e02e582 100644 --- a/basango/apps/crawler/src/config.ts +++ b/basango/apps/crawler/src/config.ts @@ -10,7 +10,7 @@ import { WordPressSourceConfigSchema, } from "@/schema"; -export const PROJECT_DIR = path.resolve(process.cwd(), "basango", "apps", "crawler"); +export const PROJECT_DIR = path.resolve(__dirname, "../"); export const PipelineConfigSchema = z.object({ paths: z.object({ diff --git a/basango/bun.lock b/basango/bun.lock index bbf026d..005b238 100644 --- a/basango/bun.lock +++ b/basango/bun.lock @@ -22,6 +22,7 @@ "node-html-parser": "^7.0.1", "tiktoken": "^1.0.14", "turndown": "^7.2.2", + "yaml": "^2.8.1", "zod": "catalog:", }, }, @@ -451,6 +452,8 @@ "xtend": ["xtend@4.0.2", "", {}, "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ=="], + "yaml": ["yaml@2.8.1", "", { "bin": { "yaml": "bin.mjs" } }, "sha512-lcYcMxX2PO9XMGvAJkJ3OsNMw+/7FKes7/hgerGUYWIoWu5j/+YQqcZr5JnPZWzOsEBgMbSbiSTn/dv/69Mkpw=="], + "yocto-queue": ["yocto-queue@1.2.1", "", {}, "sha512-AyeEbWOu/TAXdxlV9wmGcR0+yh2j3vYPGOECcIj2S7MkrLyC7ne+oye2BKTItt0ii2PHk4cDy+95+LshzbXnGg=="], "zod": ["zod@4.1.12", "", {}, "sha512-JInaHOamG8pt5+Ey8kGmdcAcg3OL9reK8ltczgHTAwNhMys/6ThXHityHxVV2p3fkw/c+MAvBHFVYHFZDmjMCQ=="],