# Listing metadata (file-viewer header, not configuration): 161 lines, 5.7 KiB, YAML
# Fetching and crawling configuration
# NOTE(review): the nesting below was rebuilt from a listing that had lost its
# indentation; confirm the client/crawler grouping against the config loader.
fetch:
  # Settings for the HTTP client.
  client:
    timeout: 20  # presumably seconds - confirm against the client code
    user_agent: "Basango/0.1 (+https://github.com/bernard-ng/basango)"
    follow_redirects: true
    verify_ssl: true
    rotate: true  # NOTE(review): what gets rotated is not visible here - confirm
    # Retry/backoff tuning. NOTE(review): units and the exact backoff formula
    # are defined by the consumer, not by this file - verify there.
    max_retries: 3
    backoff_initial: 1.0
    backoff_multiplier: 2.0
    backoff_max: 30.0
    respect_retry_after: true

  # Settings for the crawler itself.
  crawler:
    notify: false
    use_multi_threading: false
    max_workers: 5

# Logging configuration
logging:
  # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
  level: 'INFO'
  # Layout of each log record.
  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  # Enable logging to file
  file_logging: true
  # Enable logging to console
  console_logging: true
  # Log file name
  log_file: 'pipeline.log'
  # Maximum size of log file before rotation: 10485760 bytes (10 * 1024 * 1024, i.e. 10MB)
  max_log_size: 10485760
  # Number of backup log files to keep
  backup_count: 5

# Source configurations
# NOTE(review): the nesting below was rebuilt from a listing that had lost its
# indentation. In particular, `pagination` is placed under `source_selectors`
# because its value is a CSS selector - confirm against the loader's schema.
#
# Regex patterns are single-quoted on purpose: inside single quotes every
# backslash is literal, so `\d` and `\/` reach the regex engine unchanged.
# In a double-quoted scalar `\/` is a YAML escape that decodes to a bare `/`,
# which silently drops the escaping of the slash inside the /.../ delimiters.
sources:
  # Sources scraped with per-site selectors.
  html:
    - source_id: radiookapi.net
      source_url: https://www.radiookapi.net
      source_date:
        # Captures: 1 = dd, 2 = mm, 3 = yyyy, 4 = HH:MM.
        pattern: '/(\d{2})\/(\d{2})\/(\d{4}) - (\d{2}:\d{2})/'
        replacement: "$3-$2-$1 $4"
      source_selectors:
        articles: ".view-content > .views-row.content-row"
        article_title: "h1.page-header"
        article_link: ".views-field-title a"
        article_body: ".field-name-body"
        article_date: ".views-field-created"
        article_categories: ".views-field-field-cat-gorie a"
        pagination: "ul.pagination > li.pager-last > a"
      pagination_template: "actualite"
      supports_categories: false
      requires_details: true
      requires_rate_limit: false

    - source_id: 7sur7.cd
      source_url: https://7sur7.cd
      source_date:
        # Leading \w{3} is matched but not captured.
        # Captures: 1 = dd, 2 = mm, 3 = yyyy, 4 = HH:MM.
        pattern: '/\w{3} (\d{2})\/(\d{2})\/(\d{4}) - (\d{2}:\d{2})/'
        replacement: "$3-$2-$1 $4"
      categories: ["politique", "economie", "culture", "sport", "societe"]
      source_selectors:
        articles: ".view-content > .row.views-row"
        article_title: ".views-field-title a"
        article_link: ".views-field-title a"
        article_body: ".field.field--name-body"
        article_date: ".views-field-created"
        pagination: "ul.pagination > li.pager__item.pager__item--last > a"
      # {category} is a placeholder; presumably filled from `categories` - confirm.
      pagination_template: "index.php/category/{category}"
      supports_categories: true
      requires_details: false
      requires_rate_limit: false

    - source_id: mediacongo.net
      source_url: https://www.mediacongo.net
      source_date:
        # This source gives a date format instead of a pattern/replacement pair.
        format: "%d.%m.%Y %H:%M"
      source_selectors:
        articles: ".for_aitems > .article_other_item"
        article_title: "img"
        article_link: "a:first-child"
        article_categories: "a.color_link"
        article_body: ".article_ttext"
        article_date: ".article_other_about"
        pagination: "div.pagination > div > a:last-child"
      pagination_template: "articles.html"
      supports_categories: false
      requires_details: true
      requires_rate_limit: false

    - source_id: actualite.cd
      source_url: https://actualite.cd
      source_date:
        # Five captures; the replacement uses 2-5 only, group 1 is discarded.
        pattern: '/(\d{1}) (\d{1,2}) (\d{2}) (\d{4}) - (\d{2}:\d{2})/'
        replacement: "$4-$3-$2 $5"
      # No `pagination` selector is listed for this source.
      source_selectors:
        articles: "#views-bootstrap-taxonomy-term-page-2 > div > div"
        article_title: "#actu-titre a"
        article_link: "#actu-titre a"
        article_categories: "#actu-cat a"
        article_body: ".views-field.views-field-body"
        article_date: "#p-date"
      pagination_template: "actualite"
      supports_categories: false
      requires_details: true
      requires_rate_limit: false

  # Sources listed with only an id and a base URL (no selectors given here).
  wordpress:
    - source_id: beto.cd
      source_url: https://beto.cd
      requires_rate_limit: true
    - source_id: newscd.net
      source_url: https://newscd.net
    - source_id: africanewsrdc.net
      source_url: https://www.africanewsrdc.net
    - source_id: angazainstitute.ac.cd
      source_url: https://angazainstitute.ac.cd
    - source_id: b-onetv.cd
      source_url: https://b-onetv.cd
    - source_id: bukavufm.com
      source_url: https://bukavufm.com
    - source_id: changement7.net
      source_url: https://changement7.net
    - source_id: congoactu.net
      source_url: https://congoactu.net
    - source_id: congoindependant.com
      source_url: https://www.congoindependant.com
    - source_id: congoquotidien.com
      source_url: https://www.congoquotidien.com
    - source_id: cumulard.cd
      source_url: https://www.cumulard.cd
    - source_id: environews-rdc.net
      source_url: https://environews-rdc.net
    - source_id: freemediardc.info
      source_url: https://www.freemediardc.info
    - source_id: geopolismagazine.org
      source_url: https://geopolismagazine.org
    - source_id: habarirdc.net
      source_url: https://habarirdc.net
    - source_id: infordc.com
      source_url: https://infordc.com
    - source_id: kilalopress.net
      source_url: https://kilalopress.net
    - source_id: laprosperiteonline.net
      source_url: https://laprosperiteonline.net
    - source_id: laprunellerdc.cd
      source_url: https://laprunellerdc.cd
    - source_id: lesmedias.net
      source_url: https://lesmedias.net
    - source_id: lesvolcansnews.net
      source_url: https://lesvolcansnews.net
    - source_id: netic-news.net
      source_url: https://www.netic-news.net
    - source_id: objectif-infos.cd
      source_url: https://objectif-infos.cd
    - source_id: scooprdc.net
      source_url: https://scooprdc.net
    - source_id: journaldekinshasa.com
      source_url: https://www.journaldekinshasa.com
    - source_id: lepotentiel.cd
      source_url: https://lepotentiel.cd
    - source_id: acturdc.com
      source_url: https://acturdc.com
    - source_id: matininfos.net
      source_url: https://matininfos.net