Initial commit
This commit is contained in:
@@ -0,0 +1,97 @@
|
||||
# Fetching and crawling configuration
|
||||
fetch:
|
||||
client:
|
||||
timeout: 20
|
||||
user_agent: Basango/0.1 (+https://github.com/bernard-ng/basango)
|
||||
follow_redirects: true
|
||||
verify_ssl: true
|
||||
rotate_user_agent: true
|
||||
max_retries: 3
|
||||
backoff_initial: 1.0
|
||||
backoff_multiplier: 2.0
|
||||
backoff_max: 30.0
|
||||
respect_retry_after: true
|
||||
crawler:
|
||||
notify: false
|
||||
use_multi_threading: false
|
||||
max_workers: 5
|
||||
|
||||
# Source configurations
|
||||
sources:
|
||||
html:
|
||||
- source_id: radiookapi.net
|
||||
source_url: https://www.radiookapi.net
|
||||
source_date:
|
||||
pattern: "/(\\d{2})\\/(\\d{2})\\/(\\d{4}) - (\\d{2}:\\d{2})/"
|
||||
replacement: "$3-$2-$1 $4"
|
||||
source_selectors:
|
||||
articles: ".view-content > .views-row.content-row"
|
||||
article_title: ".views-field-title a"
|
||||
article_link: ".views-field-title a"
|
||||
article_body: ".field-name-body"
|
||||
article_date: ".views-field-created"
|
||||
article_categories: ".views-field-field-cat-gorie a"
|
||||
pagination: "ul.pagination > li.pager-last > a"
|
||||
pagination_template: "/actualite?page={page}"
|
||||
supports_categories: false
|
||||
requires_details: false
|
||||
requires_rate_limit: false
|
||||
|
||||
- source_id: 7sur7.cd
|
||||
source_url: https://7sur7.cd
|
||||
source_date:
|
||||
pattern: "/\\w{3} (\\d{2})\\/(\\d{2})\\/(\\d{4}) - (\\d{2}:\\d{2})/"
|
||||
replacement: "$3-$2-$1 $4"
|
||||
categories: [ "politique", "economie", "culture", "sport", "societe" ]
|
||||
source_selectors:
|
||||
articles: ".view-content > .row.views-row"
|
||||
article_title: ".views-field-title a"
|
||||
article_link: ".views-field-title a"
|
||||
article_body: ".field.field--name-body"
|
||||
article_date: ".views-field-created"
|
||||
pagination: "ul.pagination > li.pager__item.pager__item--last > a"
|
||||
pagination_template: "/index.php/category/{category}?page={page}"
|
||||
supports_categories: true
|
||||
requires_details: false
|
||||
requires_rate_limit: false
|
||||
|
||||
- source_id: mediacongo.net
|
||||
source_url: https://mediacongo.net
|
||||
source_date:
|
||||
format: "%d.%m.%Y %H:%M"
|
||||
source_selectors:
|
||||
articles: ".for_aitems > .article_other_item"
|
||||
article_title: "img"
|
||||
article_link: "a:first-child"
|
||||
article_categories: "a.color_link"
|
||||
article_body: ".article_ttext"
|
||||
article_date: ".article_other_about"
|
||||
pagination: "div.pagination > div > a:last-child"
|
||||
pagination_template: "/articles.html?page={page}"
|
||||
supports_categories: false
|
||||
requires_details: true
|
||||
requires_rate_limit: false
|
||||
|
||||
- source_id: actualite.cd
|
||||
source_url: https://actualite.cd
|
||||
source_date:
|
||||
pattern: "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/"
|
||||
replacement: "$4-$3-$2 $5"
|
||||
source_selectors:
|
||||
articles: "#views-bootstrap-taxonomy-term-page-2 > div > div"
|
||||
article_title: "#actu-titre a"
|
||||
article_link: "#actu-titre a"
|
||||
article_categories: "#actu-cat a"
|
||||
article_body: ".views-field.views-field-body"
|
||||
article_date: "#p-date"
|
||||
pagination_template: "/actualite?page={page}"
|
||||
supports_categories: false
|
||||
requires_details: true
|
||||
requires_rate_limit: false
|
||||
|
||||
wordpress:
|
||||
- source_id: beto.cd
|
||||
source_url: https://beto.cd
|
||||
requires_rate_limit: true
|
||||
- source_id: newscd.net
|
||||
source_url: https://newscd.net
|
||||
@@ -0,0 +1,160 @@
|
||||
# Fetching and crawling configuration
|
||||
fetch:
|
||||
client:
|
||||
timeout: 20
|
||||
user_agent: Basango/0.1 (+https://github.com/bernard-ng/basango)
|
||||
follow_redirects: true
|
||||
verify_ssl: true
|
||||
rotate_user_agent: true
|
||||
max_retries: 3
|
||||
backoff_initial: 1.0
|
||||
backoff_multiplier: 2.0
|
||||
backoff_max: 30.0
|
||||
respect_retry_after: true
|
||||
crawler:
|
||||
notify: false
|
||||
use_multi_threading: false
|
||||
max_workers: 5
|
||||
|
||||
# Logging configuration
|
||||
# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
|
||||
logging:
|
||||
level: "ERROR"
|
||||
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
file_logging: true # Enable logging to file
|
||||
console_logging: true # Enable logging to console
|
||||
log_file: "pipeline.log" # Log file name
|
||||
max_log_size: 10485760  # Maximum log file size in bytes before rotation (10 MiB)
|
||||
backup_count: 5 # Number of backup log files to keep
|
||||
|
||||
# Source configurations
|
||||
sources:
|
||||
html:
|
||||
- source_id: radiookapi.net
|
||||
source_url: https://www.radiookapi.net
|
||||
source_date:
|
||||
pattern: "/(\\d{2})\\/(\\d{2})\\/(\\d{4}) - (\\d{2}:\\d{2})/"
|
||||
replacement: "$3-$2-$1 $4"
|
||||
source_selectors:
|
||||
articles: ".view-content > .views-row.content-row"
|
||||
article_title: ".views-field-title a"
|
||||
article_link: ".views-field-title a"
|
||||
article_body: ".field-name-body"
|
||||
article_date: ".views-field-created"
|
||||
article_categories: ".views-field-field-cat-gorie a"
|
||||
pagination: "ul.pagination > li.pager-last > a"
|
||||
pagination_template: "/actualite?page={page}"
|
||||
supports_categories: false
|
||||
requires_details: false
|
||||
requires_rate_limit: false
|
||||
|
||||
- source_id: 7sur7.cd
|
||||
source_url: https://7sur7.cd
|
||||
source_date:
|
||||
pattern: "/\\w{3} (\\d{2})\\/(\\d{2})\\/(\\d{4}) - (\\d{2}:\\d{2})/"
|
||||
replacement: "$3-$2-$1 $4"
|
||||
categories: [ "politique", "economie", "culture", "sport", "societe" ]
|
||||
source_selectors:
|
||||
articles: ".view-content > .row.views-row"
|
||||
article_title: ".views-field-title a"
|
||||
article_link: ".views-field-title a"
|
||||
article_body: ".field.field--name-body"
|
||||
article_date: ".views-field-created"
|
||||
pagination: "ul.pagination > li.pager__item.pager__item--last > a"
|
||||
pagination_template: "/index.php/category/{category}?page={page}"
|
||||
supports_categories: true
|
||||
requires_details: false
|
||||
requires_rate_limit: false
|
||||
|
||||
- source_id: mediacongo.net
|
||||
source_url: https://mediacongo.net
|
||||
source_date:
|
||||
format: "%d.%m.%Y %H:%M"
|
||||
source_selectors:
|
||||
articles: ".for_aitems > .article_other_item"
|
||||
article_title: "img"
|
||||
article_link: "a:first-child"
|
||||
article_categories: "a.color_link"
|
||||
article_body: ".article_ttext"
|
||||
article_date: ".article_other_about"
|
||||
pagination: "div.pagination > div > a:last-child"
|
||||
pagination_template: "/articles.html?page={page}"
|
||||
supports_categories: false
|
||||
requires_details: true
|
||||
requires_rate_limit: false
|
||||
|
||||
- source_id: actualite.cd
|
||||
source_url: https://actualite.cd
|
||||
source_date:
|
||||
pattern: "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/"
|
||||
replacement: "$4-$3-$2 $5"
|
||||
source_selectors:
|
||||
articles: "#views-bootstrap-taxonomy-term-page-2 > div > div"
|
||||
article_title: "#actu-titre a"
|
||||
article_link: "#actu-titre a"
|
||||
article_categories: "#actu-cat a"
|
||||
article_body: ".views-field.views-field-body"
|
||||
article_date: "#p-date"
|
||||
pagination_template: "/actualite?page={page}"
|
||||
supports_categories: false
|
||||
requires_details: true
|
||||
requires_rate_limit: false
|
||||
|
||||
wordpress:
|
||||
- source_id: beto.cd
|
||||
source_url: https://beto.cd
|
||||
requires_rate_limit: true
|
||||
- source_id: newscd.net
|
||||
source_url: https://newscd.net
|
||||
- source_id: africanewsrdc.net
|
||||
source_url: https://www.africanewsrdc.net
|
||||
- source_id: angazainstitute.ac.cd
|
||||
source_url: https://angazainstitute.ac.cd
|
||||
- source_id: b-onetv.cd
|
||||
source_url: https://b-onetv.cd
|
||||
- source_id: bukavufm.com
|
||||
source_url: https://bukavufm.com
|
||||
- source_id: changement7.net
|
||||
source_url: https://changement7.net
|
||||
- source_id: congoactu.net
|
||||
source_url: https://congoactu.net
|
||||
- source_id: congoindependant.com
|
||||
source_url: https://www.congoindependant.com
|
||||
- source_id: congoquotidien.com
|
||||
source_url: https://www.congoquotidien.com
|
||||
- source_id: cumulard.cd
|
||||
source_url: https://www.cumulard.cd
|
||||
- source_id: environews-rdc.net
|
||||
source_url: https://environews-rdc.net
|
||||
- source_id: freemediardc.info
|
||||
source_url: https://www.freemediardc.info
|
||||
- source_id: geopolismagazine.org
|
||||
source_url: https://geopolismagazine.org
|
||||
- source_id: habarirdc.net
|
||||
source_url: https://habarirdc.net
|
||||
- source_id: infordc.com
|
||||
source_url: https://infordc.com
|
||||
- source_id: kilalopress.net
|
||||
source_url: https://kilalopress.net
|
||||
- source_id: laprosperiteonline.net
|
||||
source_url: https://laprosperiteonline.net
|
||||
- source_id: laprunellerdc.cd
|
||||
source_url: https://laprunellerdc.cd
|
||||
- source_id: lesmedias.net
|
||||
source_url: https://lesmedias.net
|
||||
- source_id: lesvolcansnews.net
|
||||
source_url: https://lesvolcansnews.net
|
||||
- source_id: netic-news.net
|
||||
source_url: https://www.netic-news.net
|
||||
- source_id: objectif-infos.cd
|
||||
source_url: https://objectif-infos.cd
|
||||
- source_id: scooprdc.net
|
||||
source_url: https://scooprdc.net
|
||||
- source_id: journaldekinshasa.com
|
||||
source_url: https://www.journaldekinshasa.com
|
||||
- source_id: lepotentiel.cd
|
||||
source_url: https://lepotentiel.cd
|
||||
- source_id: acturdc.com
|
||||
source_url: https://acturdc.com
|
||||
- source_id: matininfos.net
|
||||
source_url: https://matininfos.net
|
||||
@@ -0,0 +1,160 @@
|
||||
# Fetching and crawling configuration
|
||||
fetch:
|
||||
client:
|
||||
timeout: 20
|
||||
user_agent: Basango/0.1 (+https://github.com/bernard-ng/basango)
|
||||
follow_redirects: true
|
||||
verify_ssl: true
|
||||
rotate: true
|
||||
max_retries: 3
|
||||
backoff_initial: 1.0
|
||||
backoff_multiplier: 2.0
|
||||
backoff_max: 30.0
|
||||
respect_retry_after: true
|
||||
crawler:
|
||||
notify: false
|
||||
use_multi_threading: false
|
||||
max_workers: 5
|
||||
|
||||
# Logging configuration
|
||||
# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
|
||||
logging:
|
||||
level: "INFO"
|
||||
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
file_logging: true # Enable logging to file
|
||||
console_logging: true # Enable logging to console
|
||||
log_file: "pipeline.log" # Log file name
|
||||
max_log_size: 10485760  # Maximum log file size in bytes before rotation (10 MiB)
|
||||
backup_count: 5 # Number of backup log files to keep
|
||||
|
||||
# Source configurations
|
||||
sources:
|
||||
html:
|
||||
- source_id: radiookapi.net
|
||||
source_url: https://www.radiookapi.net
|
||||
source_date:
|
||||
pattern: "/(\\d{2})\\/(\\d{2})\\/(\\d{4}) - (\\d{2}:\\d{2})/"
|
||||
replacement: "$3-$2-$1 $4"
|
||||
source_selectors:
|
||||
articles: ".view-content > .views-row.content-row"
|
||||
article_title: ".views-field-title a"
|
||||
article_link: ".views-field-title a"
|
||||
article_body: ".field-name-body"
|
||||
article_date: ".views-field-created"
|
||||
article_categories: ".views-field-field-cat-gorie a"
|
||||
pagination: "ul.pagination > li.pager-last > a"
|
||||
pagination_template: "actualite"
|
||||
supports_categories: false
|
||||
requires_details: false
|
||||
requires_rate_limit: false
|
||||
|
||||
- source_id: 7sur7.cd
|
||||
source_url: https://7sur7.cd
|
||||
source_date:
|
||||
pattern: "/\\w{3} (\\d{2})\\/(\\d{2})\\/(\\d{4}) - (\\d{2}:\\d{2})/"
|
||||
replacement: "$3-$2-$1 $4"
|
||||
categories: [ "politique", "economie", "culture", "sport", "societe" ]
|
||||
source_selectors:
|
||||
articles: ".view-content > .row.views-row"
|
||||
article_title: ".views-field-title a"
|
||||
article_link: ".views-field-title a"
|
||||
article_body: ".field.field--name-body"
|
||||
article_date: ".views-field-created"
|
||||
pagination: "ul.pagination > li.pager__item.pager__item--last > a"
|
||||
pagination_template: "index.php/category/{category}"
|
||||
supports_categories: true
|
||||
requires_details: false
|
||||
requires_rate_limit: false
|
||||
|
||||
- source_id: mediacongo.net
|
||||
source_url: https://www.mediacongo.net
|
||||
source_date:
|
||||
format: "%d.%m.%Y %H:%M"
|
||||
source_selectors:
|
||||
articles: ".for_aitems > .article_other_item"
|
||||
article_title: "img"
|
||||
article_link: "a:first-child"
|
||||
article_categories: "a.color_link"
|
||||
article_body: ".article_ttext"
|
||||
article_date: ".article_other_about"
|
||||
pagination: "div.pagination > div > a:last-child"
|
||||
pagination_template: "articles.html"
|
||||
supports_categories: false
|
||||
requires_details: true
|
||||
requires_rate_limit: false
|
||||
|
||||
- source_id: actualite.cd
|
||||
source_url: https://actualite.cd
|
||||
source_date:
|
||||
pattern: "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/"
|
||||
replacement: "$4-$3-$2 $5"
|
||||
source_selectors:
|
||||
articles: "#views-bootstrap-taxonomy-term-page-2 > div > div"
|
||||
article_title: "#actu-titre a"
|
||||
article_link: "#actu-titre a"
|
||||
article_categories: "#actu-cat a"
|
||||
article_body: ".views-field.views-field-body"
|
||||
article_date: "#p-date"
|
||||
pagination_template: "actualite"
|
||||
supports_categories: false
|
||||
requires_details: true
|
||||
requires_rate_limit: false
|
||||
|
||||
wordpress:
|
||||
- source_id: beto.cd
|
||||
source_url: https://beto.cd
|
||||
requires_rate_limit: true
|
||||
- source_id: newscd.net
|
||||
source_url: https://newscd.net
|
||||
- source_id: africanewsrdc.net
|
||||
source_url: https://www.africanewsrdc.net
|
||||
- source_id: angazainstitute.ac.cd
|
||||
source_url: https://angazainstitute.ac.cd
|
||||
- source_id: b-onetv.cd
|
||||
source_url: https://b-onetv.cd
|
||||
- source_id: bukavufm.com
|
||||
source_url: https://bukavufm.com
|
||||
- source_id: changement7.net
|
||||
source_url: https://changement7.net
|
||||
- source_id: congoactu.net
|
||||
source_url: https://congoactu.net
|
||||
- source_id: congoindependant.com
|
||||
source_url: https://www.congoindependant.com
|
||||
- source_id: congoquotidien.com
|
||||
source_url: https://www.congoquotidien.com
|
||||
- source_id: cumulard.cd
|
||||
source_url: https://www.cumulard.cd
|
||||
- source_id: environews-rdc.net
|
||||
source_url: https://environews-rdc.net
|
||||
- source_id: freemediardc.info
|
||||
source_url: https://www.freemediardc.info
|
||||
- source_id: geopolismagazine.org
|
||||
source_url: https://geopolismagazine.org
|
||||
- source_id: habarirdc.net
|
||||
source_url: https://habarirdc.net
|
||||
- source_id: infordc.com
|
||||
source_url: https://infordc.com
|
||||
- source_id: kilalopress.net
|
||||
source_url: https://kilalopress.net
|
||||
- source_id: laprosperiteonline.net
|
||||
source_url: https://laprosperiteonline.net
|
||||
- source_id: laprunellerdc.cd
|
||||
source_url: https://laprunellerdc.cd
|
||||
- source_id: lesmedias.net
|
||||
source_url: https://lesmedias.net
|
||||
- source_id: lesvolcansnews.net
|
||||
source_url: https://lesvolcansnews.net
|
||||
- source_id: netic-news.net
|
||||
source_url: https://www.netic-news.net
|
||||
- source_id: objectif-infos.cd
|
||||
source_url: https://objectif-infos.cd
|
||||
- source_id: scooprdc.net
|
||||
source_url: https://scooprdc.net
|
||||
- source_id: journaldekinshasa.com
|
||||
source_url: https://www.journaldekinshasa.com
|
||||
- source_id: lepotentiel.cd
|
||||
source_url: https://lepotentiel.cd
|
||||
- source_id: acturdc.com
|
||||
source_url: https://acturdc.com
|
||||
- source_id: matininfos.net
|
||||
source_url: https://matininfos.net
|
||||
Reference in New Issue
Block a user