feat(crawler): add configuration manager and cli scripts

This commit is contained in:
Bernard Ngandu
2025-11-02 12:13:41 +02:00
parent eafb8a2de9
commit c55be3989c
8 changed files with 617 additions and 16 deletions
+152
View File
@@ -0,0 +1,152 @@
{
"fetch": {
"client": {
"timeout": 20,
"user_agent": "Basango/0.1 (+https://github.com/bernard-ng/basango)",
"follow_redirects": true,
"verify_ssl": true,
"rotate": true,
"max_retries": 3,
"backoff_initial": 1,
"backoff_multiplier": 2,
"backoff_max": 30,
"respect_retry_after": true
},
"crawler": {
"notify": false,
"use_multi_threading": false,
"max_workers": 5
}
},
"logging": {
"level": "INFO",
"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
"file_logging": true,
"console_logging": true,
"log_file": "pipeline.log",
"max_log_size": 10485760,
"backup_count": 5
},
"sources": {
"html": [
{
"source_id": "radiookapi.net",
"source_url": "https://www.radiookapi.net",
"source_date": {
"pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$3-$2-$1 $4"
},
"source_selectors": {
"articles": ".view-content > .views-row.content-row",
"article_title": "h1.page-header",
"article_link": ".views-field-title a",
"article_body": ".field-name-body",
"article_date": ".views-field-created",
"article_categories": ".views-field-field-cat-gorie a",
"pagination": "ul.pagination > li.pager-last > a"
},
"pagination_template": "actualite",
"supports_categories": false,
"requires_details": true,
"requires_rate_limit": false
},
{
"source_id": "7sur7.cd",
"source_url": "https://7sur7.cd",
"source_date": {
"pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$3-$2-$1 $4"
},
"categories": [
"politique",
"economie",
"culture",
"sport",
"societe"
],
"source_selectors": {
"articles": ".view-content > .row.views-row",
"article_title": ".views-field-title a",
"article_link": ".views-field-title a",
"article_body": ".field.field--name-body",
"article_date": ".views-field-created",
"pagination": "ul.pagination > li.pager__item.pager__item--last > a"
},
"pagination_template": "index.php/category/{category}",
"supports_categories": true,
"requires_details": false,
"requires_rate_limit": false
},
{
"source_id": "mediacongo.net",
"source_url": "https://www.mediacongo.net",
"source_date": {
"format": "%d.%m.%Y %H:%M"
},
"source_selectors": {
"articles": ".for_aitems > .article_other_item",
"article_title": "img",
"article_link": "a:first-child",
"article_categories": "a.color_link",
"article_body": ".article_ttext",
"article_date": ".article_other_about",
"pagination": "div.pagination > div > a:last-child"
},
"pagination_template": "articles.html",
"supports_categories": false,
"requires_details": true,
"requires_rate_limit": false
},
{
"source_id": "actualite.cd",
"source_url": "https://actualite.cd",
"source_date": {
"pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$4-$3-$2 $5"
},
"source_selectors": {
"articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
"article_title": "#actu-titre a",
"article_link": "#actu-titre a",
"article_categories": "#actu-cat a",
"article_body": ".views-field.views-field-body",
"article_date": "#p-date"
},
"pagination_template": "actualite",
"supports_categories": false,
"requires_details": true,
"requires_rate_limit": false
}
],
"wordpress": [
{ "source_id": "beto.cd", "source_url": "https://beto.cd", "requires_rate_limit": true },
{ "source_id": "newscd.net", "source_url": "https://newscd.net" },
{ "source_id": "africanewsrdc.net", "source_url": "https://www.africanewsrdc.net" },
{ "source_id": "angazainstitute.ac.cd", "source_url": "https://angazainstitute.ac.cd" },
{ "source_id": "b-onetv.cd", "source_url": "https://b-onetv.cd" },
{ "source_id": "bukavufm.com", "source_url": "https://bukavufm.com" },
{ "source_id": "changement7.net", "source_url": "https://changement7.net" },
{ "source_id": "congoactu.net", "source_url": "https://congoactu.net" },
{ "source_id": "congoindependant.com", "source_url": "https://www.congoindependant.com" },
{ "source_id": "congoquotidien.com", "source_url": "https://www.congoquotidien.com" },
{ "source_id": "cumulard.cd", "source_url": "https://www.cumulard.cd" },
{ "source_id": "environews-rdc.net", "source_url": "https://environews-rdc.net" },
{ "source_id": "freemediardc.info", "source_url": "https://www.freemediardc.info" },
{ "source_id": "geopolismagazine.org", "source_url": "https://geopolismagazine.org" },
{ "source_id": "habarirdc.net", "source_url": "https://habarirdc.net" },
{ "source_id": "infordc.com", "source_url": "https://infordc.com" },
{ "source_id": "kilalopress.net", "source_url": "https://kilalopress.net" },
{ "source_id": "laprosperiteonline.net", "source_url": "https://laprosperiteonline.net" },
{ "source_id": "laprunellerdc.cd", "source_url": "https://laprunellerdc.cd" },
{ "source_id": "lesmedias.net", "source_url": "https://lesmedias.net" },
{ "source_id": "lesvolcansnews.net", "source_url": "https://lesvolcansnews.net" },
{ "source_id": "netic-news.net", "source_url": "https://www.netic-news.net" },
{ "source_id": "objectif-infos.cd", "source_url": "https://objectif-infos.cd" },
{ "source_id": "scooprdc.net", "source_url": "https://scooprdc.net" },
{ "source_id": "journaldekinshasa.com", "source_url": "https://www.journaldekinshasa.com" },
{ "source_id": "lepotentiel.cd", "source_url": "https://lepotentiel.cd" },
{ "source_id": "acturdc.com", "source_url": "https://acturdc.com" },
{ "source_id": "matininfos.net", "source_url": "https://matininfos.net" }
]
}
}
@@ -0,0 +1,152 @@
{
"fetch": {
"client": {
"timeout": 20,
"user_agent": "Basango/0.1 (+https://github.com/bernard-ng/basango)",
"follow_redirects": true,
"verify_ssl": true,
"rotate": true,
"max_retries": 3,
"backoff_initial": 1,
"backoff_multiplier": 2,
"backoff_max": 30,
"respect_retry_after": true
},
"crawler": {
"notify": false,
"use_multi_threading": false,
"max_workers": 5
}
},
"logging": {
"level": "ERROR",
"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
"file_logging": true,
"console_logging": true,
"log_file": "pipeline.log",
"max_log_size": 10485760,
"backup_count": 5
},
"sources": {
"html": [
{
"source_id": "radiookapi.net",
"source_url": "https://www.radiookapi.net",
"source_date": {
"pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$3-$2-$1 $4"
},
"source_selectors": {
"articles": ".view-content > .views-row.content-row",
"article_title": ".views-field-title a",
"article_link": ".views-field-title a",
"article_body": ".field-name-body",
"article_date": ".views-field-created",
"article_categories": ".views-field-field-cat-gorie a",
"pagination": "ul.pagination > li a(:last-child)"
},
"pagination_template": "/actualite?page={page}",
"supports_categories": false,
"requires_details": false,
"requires_rate_limit": false
},
{
"source_id": "7sur7.cd",
"source_url": "https://7sur7.cd",
"source_date": {
"pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$3-$2-$1 $4"
},
"categories": [
"politique",
"economie",
"culture",
"sport",
"societe"
],
"source_selectors": {
"articles": ".view-content > .row.views-row",
"article_title": ".views-field-title a",
"article_link": ".views-field-title a",
"article_body": ".field.field--name-body",
"article_date": ".views-field-created",
"pagination": "ul.pagination > li a(:last-child)"
},
"pagination_template": "/index.php/category/{category}?page={page}",
"supports_categories": true,
"requires_details": false,
"requires_rate_limit": false
},
{
"source_id": "mediacongo.net",
"source_url": "https://mediacongo.net",
"source_date": {
"format": "%d.%m.%Y %H:%M"
},
"source_selectors": {
"articles": ".for_aitems > .article_other_item",
"article_title": "img",
"article_link": "a(:first-child)",
"article_categories": "a.color_link",
"article_body": ".article_ttext",
"article_date": ".article_other_about",
"pagination": ".nav > a(:last-child)"
},
"pagination_template": "/articles.html?page={page}",
"supports_categories": false,
"requires_details": true,
"requires_rate_limit": false
},
{
"source_id": "actualite.cd",
"source_url": "https://actualite.cd",
"source_date": {
"pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/",
"replacement": "$4-$3-$2 $5"
},
"source_selectors": {
"articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
"article_title": "#actu-titre a",
"article_link": "#actu-titre a",
"article_categories": "#actu-cat a",
"article_body": ".views-field.views-field-body",
"article_date": "#p-date"
},
"pagination_template": "/actualite?page={page}",
"supports_categories": false,
"requires_details": true,
"requires_rate_limit": false
}
],
"wordpress": [
{ "source_id": "beto.cd", "source_url": "https://beto.cd", "requires_rate_limit": true },
{ "source_id": "newscd.net", "source_url": "https://newscd.net" },
{ "source_id": "africanewsrdc.net", "source_url": "https://www.africanewsrdc.net" },
{ "source_id": "angazainstitute.ac.cd", "source_url": "https://angazainstitute.ac.cd" },
{ "source_id": "b-onetv.cd", "source_url": "https://b-onetv.cd" },
{ "source_id": "bukavufm.com", "source_url": "https://bukavufm.com" },
{ "source_id": "changement7.net", "source_url": "https://changement7.net" },
{ "source_id": "congoactu.net", "source_url": "https://congoactu.net" },
{ "source_id": "congoindependant.com", "source_url": "https://www.congoindependant.com" },
{ "source_id": "congoquotidien.com", "source_url": "https://www.congoquotidien.com" },
{ "source_id": "cumulard.cd", "source_url": "https://www.cumulard.cd" },
{ "source_id": "environews-rdc.net", "source_url": "https://environews-rdc.net" },
{ "source_id": "freemediardc.info", "source_url": "https://www.freemediardc.info" },
{ "source_id": "geopolismagazine.org", "source_url": "https://geopolismagazine.org" },
{ "source_id": "habarirdc.net", "source_url": "https://habarirdc.net" },
{ "source_id": "infordc.com", "source_url": "https://infordc.com" },
{ "source_id": "kilalopress.net", "source_url": "https://kilalopress.net" },
{ "source_id": "laprosperiteonline.net", "source_url": "https://laprosperiteonline.net" },
{ "source_id": "laprunellerdc.cd", "source_url": "https://laprunellerdc.cd" },
{ "source_id": "lesmedias.net", "source_url": "https://lesmedias.net" },
{ "source_id": "lesvolcansnews.net", "source_url": "https://lesvolcansnews.net" },
{ "source_id": "netic-news.net", "source_url": "https://www.netic-news.net" },
{ "source_id": "objectif-infos.cd", "source_url": "https://objectif-infos.cd" },
{ "source_id": "scooprdc.net", "source_url": "https://scooprdc.net" },
{ "source_id": "journaldekinshasa.com", "source_url": "https://www.journaldekinshasa.com" },
{ "source_id": "lepotentiel.cd", "source_url": "https://lepotentiel.cd" },
{ "source_id": "acturdc.com", "source_url": "https://acturdc.com" },
{ "source_id": "matininfos.net", "source_url": "https://matininfos.net" }
]
}
}