# basango/projects/crawler/config/pipeline.prod.yaml
# Viewer metadata: 2025-10-05 14:42:25 +02:00, 161 lines, 5.8 KiB, YAML

# Fetching and crawling configuration
fetch:
  # HTTP client settings.
  client:
    timeout: 20  # presumably seconds - TODO confirm against the consumer
    user_agent: Basango/0.1 (+https://github.com/bernard-ng/basango)
    follow_redirects: true
    verify_ssl: true
    rotate_user_agent: true
    # Retry policy: up to max_retries attempts; the backoff_* values look like
    # an exponential backoff (initial delay, growth factor, cap) - TODO confirm
    # units and exact semantics against the client implementation.
    max_retries: 3
    backoff_initial: 1.0
    backoff_multiplier: 2.0
    backoff_max: 30.0
    respect_retry_after: true
  # Crawler execution settings.
  crawler:
    notify: false
    use_multi_threading: false
    # NOTE(review): max_workers presumably only applies when
    # use_multi_threading is true - verify.
    max_workers: 5
# Logging configuration
logging:
  # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
  level: "ERROR"
  # %-style record format (timestamp - logger name - level - message).
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  file_logging: true  # Enable logging to file
  console_logging: true  # Enable logging to console
  log_file: "pipeline.log"  # Log file name
  max_log_size: 10485760  # Maximum size of log file before rotation (10MB)
  backup_count: 5  # Number of backup log files to keep
# Source configurations
#
# NOTE(review): nesting was reconstructed from key names. Confirm against the
# consumer's schema that `pagination` belongs under `source_selectors` and that
# `categories` / `pagination_template` / `supports_categories` /
# `requires_details` / `requires_rate_limit` are per-source keys.
sources:
  # Sources described with explicit selectors for listing pages and articles.
  html:
    - source_id: radiookapi.net
      source_url: https://www.radiookapi.net
      source_date:
        # NOTE(review): inside a double-quoted scalar `\/` decodes to a plain
        # `/` (YAML 1.2 escape). If a literal backslash is needed in the regex
        # it would have to be written `\\/` - TODO confirm intent.
        pattern: "/(\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
        replacement: "$3-$2-$1 $4"
      source_selectors:
        articles: ".view-content > .views-row.content-row"
        article_title: ".views-field-title a"
        article_link: ".views-field-title a"
        article_body: ".field-name-body"
        article_date: ".views-field-created"
        article_categories: ".views-field-field-cat-gorie a"
        pagination: "ul.pagination > li a(:last-child)"
      pagination_template: "/actualite?page={page}"
      supports_categories: false
      requires_details: false
      requires_rate_limit: false

    - source_id: 7sur7.cd
      source_url: https://7sur7.cd
      source_date:
        # Same `\/` escape caveat as noted in the file-level review comment:
        # the parsed pattern contains plain `/` characters.
        pattern: "/\\w{3} (\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
        replacement: "$3-$2-$1 $4"
      # Values substituted for {category} in pagination_template (presumably).
      categories: ["politique", "economie", "culture", "sport", "societe"]
      source_selectors:
        articles: ".view-content > .row.views-row"
        article_title: ".views-field-title a"
        article_link: ".views-field-title a"
        article_body: ".field.field--name-body"
        article_date: ".views-field-created"
        pagination: "ul.pagination > li a(:last-child)"
      pagination_template: "/index.php/category/{category}?page={page}"
      supports_categories: true
      requires_details: false
      requires_rate_limit: false

    - source_id: mediacongo.net
      source_url: https://mediacongo.net
      source_date:
        # This source uses a strptime-style format instead of a
        # pattern/replacement pair.
        format: "%d.%m.%Y %H:%M"
      source_selectors:
        articles: ".for_aitems > .article_other_item"
        article_title: "img"
        article_link: "a(:first-child)"
        article_categories: "a.color_link"
        article_body: ".article_ttext"
        article_date: ".article_other_about"
        pagination: ".nav > a(:last-child)"
      pagination_template: "/articles.html?page={page}"
      supports_categories: false
      requires_details: true
      requires_rate_limit: false

    - source_id: actualite.cd
      source_url: https://actualite.cd
      source_date:
        pattern: "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/"
        replacement: "$4-$3-$2 $5"
      # No `pagination` selector is defined for this source.
      source_selectors:
        articles: "#views-bootstrap-taxonomy-term-page-2 > div > div"
        article_title: "#actu-titre a"
        article_link: "#actu-titre a"
        article_categories: "#actu-cat a"
        article_body: ".views-field.views-field-body"
        article_date: "#p-date"
      pagination_template: "/actualite?page={page}"
      supports_categories: false
      requires_details: true
      requires_rate_limit: false

  # Sources identified only by id and base URL; no selectors are configured.
  wordpress:
    - source_id: beto.cd
      source_url: https://beto.cd
      requires_rate_limit: true
    - source_id: newscd.net
      source_url: https://newscd.net
    - source_id: africanewsrdc.net
      source_url: https://www.africanewsrdc.net
    - source_id: angazainstitute.ac.cd
      source_url: https://angazainstitute.ac.cd
    - source_id: b-onetv.cd
      source_url: https://b-onetv.cd
    - source_id: bukavufm.com
      source_url: https://bukavufm.com
    - source_id: changement7.net
      source_url: https://changement7.net
    - source_id: congoactu.net
      source_url: https://congoactu.net
    - source_id: congoindependant.com
      source_url: https://www.congoindependant.com
    - source_id: congoquotidien.com
      source_url: https://www.congoquotidien.com
    - source_id: cumulard.cd
      source_url: https://www.cumulard.cd
    - source_id: environews-rdc.net
      source_url: https://environews-rdc.net
    - source_id: freemediardc.info
      source_url: https://www.freemediardc.info
    - source_id: geopolismagazine.org
      source_url: https://geopolismagazine.org
    - source_id: habarirdc.net
      source_url: https://habarirdc.net
    - source_id: infordc.com
      source_url: https://infordc.com
    - source_id: kilalopress.net
      source_url: https://kilalopress.net
    - source_id: laprosperiteonline.net
      source_url: https://laprosperiteonline.net
    - source_id: laprunellerdc.cd
      source_url: https://laprunellerdc.cd
    - source_id: lesmedias.net
      source_url: https://lesmedias.net
    - source_id: lesvolcansnews.net
      source_url: https://lesvolcansnews.net
    - source_id: netic-news.net
      source_url: https://www.netic-news.net
    - source_id: objectif-infos.cd
      source_url: https://objectif-infos.cd
    - source_id: scooprdc.net
      source_url: https://scooprdc.net
    - source_id: journaldekinshasa.com
      source_url: https://www.journaldekinshasa.com
    - source_id: lepotentiel.cd
      source_url: https://lepotentiel.cd
    - source_id: acturdc.com
      source_url: https://acturdc.com
    - source_id: matininfos.net
      source_url: https://matininfos.net