[crawler] remove python implementation
@@ -1,23 +0,0 @@
# Ignore Python cache files
__pycache__/
*.pyc

# Ignore virtual environments
.venv/

# Ignore local environment files
.env.local
.env.*.local

# Ignore logs
*.log

# Ignore Docker-related files
Dockerfile
docker-compose.yml

# Ignore other unnecessary files
*.swp
.idea/
.vscode/
.DS_Store
@@ -1,7 +0,0 @@
BASANGO_CRAWLER_TOKEN=some-token
BASANGO_API_ENDPOINT=http://localhost:8000/api/aggregator/articles?token=dev
BASANGO_REDIS_URL=redis://localhost:6379/0
BASANGO_QUEUE_PREFIX=basango
BASANGO_QUEUE_TIMEOUT=30
BASANGO_QUEUE_RESULT_TTL=3600
BASANGO_QUEUE_FAILURE_TTL=86400
@@ -1,22 +0,0 @@
.idea/
.vscode/
.ipynb_checkpoints/
*.pyc
.env.local
.env.*.local
var/
.DS_Store

# Python-generated files
__pycache__/
.pytest_cache/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv

data/
@@ -1,6 +0,0 @@
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.12.12
    hooks:
      - id: ruff-check
      - id: ruff-format
@@ -1 +0,0 @@
3.13
@@ -1,34 +0,0 @@
# Use the official Python image as a base
FROM python:3.13-slim

COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# Install the project into `/app`
WORKDIR /app

# Enable bytecode compilation
ENV UV_COMPILE_BYTECODE=1

# Copy from the cache instead of linking since it's a mounted volume
ENV UV_LINK_MODE=copy

# Ensure installed tools can be executed out of the box
ENV UV_TOOL_BIN_DIR=/usr/local/bin

# Install the project's dependencies using the lockfile and settings
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync --locked --no-install-project --no-dev

# Then, add the rest of the project source code and install it
# Installing separately from its dependencies allows optimal layer caching
COPY . /app
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --locked --no-dev

# Place executables in the environment at the front of the path
ENV PATH="/app/.venv/bin:$PATH"

# Reset the entrypoint, don't invoke `uv`
ENTRYPOINT []
@@ -1,46 +0,0 @@
# Crawler

[crawler audit](https://github.com/bernard-ng/basango/actions/workflows/crawler_audit.yml)
[crawler quality](https://github.com/bernard-ng/basango/actions/workflows/crawler_quality.yml)
[crawler tests](https://github.com/bernard-ng/basango/actions/workflows/crawler_tests.yml)
[ruff](https://github.com/astral-sh/ruff)
[bandit](https://github.com/PyCQA/bandit)

---

### Usage

- Install the project in your virtualenv so the `basango` CLI is available:
  - With uv: `uv run --with . basango --help`
  - Or install it locally: `uv sync`, then `basango --help`

#### Sync crawl (in-process)

- Crawl a configured source by id and write the results to CSV/JSON:
  - `basango crawl --source-id my-source`
- Limit by page range: `basango crawl --source-id my-source -p 1:3`
- Limit by date range: `basango crawl --source-id my-source -d 2024-10-01:2024-10-31`
- Filter by category, when the source supports it: `basango crawl --source-id my-source -g tech`

#### Async crawl (Redis + RQ)

- Enqueue a crawl job and return immediately:
  - `basango crawl --source-id my-source --async`
- Start one or more workers to process the queues:
  - Articles only (the default): `basango worker`
  - Multiple queues: `basango worker -q listing -q articles -q processed`
  - macOS friendly (no forking): `basango worker --simple`
  - One-shot draining for CI: `basango worker --burst`

#### Environment

- `BASANGO_REDIS_URL` (default `redis://localhost:6379/0`)
- `BASANGO_QUEUE_PREFIX` (default `crawler`)
- `BASANGO_QUEUE_TIMEOUT` (default `600` seconds)
- `BASANGO_QUEUE_RESULT_TTL` (default `3600` seconds)
- `BASANGO_QUEUE_FAILURE_TTL` (default `3600` seconds)

#### Configuration

- See `config/pipeline.*.yaml` for source definitions and HTTP client settings.
- Use `-c/--env` to select which pipeline to load (default `development`).
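The async flow can also be driven from Python rather than the CLI; below is a minimal sketch using `schedule_async_crawl` (defined in `basango.services.crawler.async_api`, shown later in this diff), assuming the package is installed, Redis is reachable, and the source id exists in the pipeline config:

```python
# Sketch: programmatic equivalent of `basango crawl --source-id ... --async`.
# Assumes Redis is running at BASANGO_REDIS_URL and the source id is defined
# in config/pipeline.*.yaml.
from basango.services.crawler.async_api import schedule_async_crawl

job_id = schedule_async_crawl(
    source_id="radiookapi.net",          # any source_id from the pipeline config
    env="development",
    page_range="1:3",                    # same "start:end" spec as the -p option
    date_range="2024-10-01:2024-10-31",  # same spec as the -d option
)
print(f"Enqueued listing job {job_id}")
```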
@@ -1,38 +0,0 @@
services:
  basango:
    build: .
    container_name: basango-app
    restart: unless-stopped
    networks:
      - basango-network

  redis:
    image: redis:7-alpine
    container_name: basango-redis
    restart: unless-stopped
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/var/redis
    command: redis-server --appendonly yes
    networks:
      - basango-network

  redis-commander:
    image: rediscommander/redis-commander:latest
    container_name: basango-redis-commander
    restart: unless-stopped
    ports:
      - "8081:8081"
    environment:
      - REDIS_HOSTS=local:redis:6379
    depends_on:
      - redis
    networks:
      - basango-network

networks:
  basango-network:

volumes:
  redis_data:
@@ -1,97 +0,0 @@
# Fetching and crawling configuration
fetch:
  client:
    timeout: 20
    user_agent: Basango/0.1 (+https://github.com/bernard-ng/basango)
    follow_redirects: true
    verify_ssl: true
    rotate_user_agent: true
    max_retries: 3
    backoff_initial: 1.0
    backoff_multiplier: 2.0
    backoff_max: 30.0
    respect_retry_after: true
  crawler:
    notify: false
    use_multi_threading: false
    max_workers: 5

# Source configurations
sources:
  html:
    - source_id: radiookapi.net
      source_url: https://www.radiookapi.net
      source_date:
        pattern: "/(\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
        replacement: "$3-$2-$1 $4"
      source_selectors:
        articles: ".view-content > .views-row.content-row"
        article_title: ".views-field-title a"
        article_link: ".views-field-title a"
        article_body: ".field-name-body"
        article_date: ".views-field-created"
        article_categories: ".views-field-field-cat-gorie a"
        pagination: "ul.pagination > li a(:last-child)"
      pagination_template: "/actualite?page={page}"
      supports_categories: false
      requires_details: false
      requires_rate_limit: false

    - source_id: 7sur7.cd
      source_url: https://7sur7.cd
      source_date:
        pattern: "/\\w{3} (\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
        replacement: "$3-$2-$1 $4"
      categories: [ "politique", "economie", "culture", "sport", "societe" ]
      source_selectors:
        articles: ".view-content > .row.views-row"
        article_title: ".views-field-title a"
        article_link: ".views-field-title a"
        article_body: ".field.field--name-body"
        article_date: ".views-field-created"
        pagination: "ul.pagination > li a(:last-child)"
      pagination_template: "/index.php/category/{category}?page={page}"
      supports_categories: true
      requires_details: false
      requires_rate_limit: false

    - source_id: mediacongo.net
      source_url: https://mediacongo.net
      source_date:
        format: "%d.%m.%Y %H:%M"
      source_selectors:
        articles: ".for_aitems > .article_other_item"
        article_title: "img"
        article_link: "a(:first-child)"
        article_categories: "a.color_link"
        article_body: ".article_ttext"
        article_date: ".article_other_about"
        pagination: ".nav > a(:last-child)"
      pagination_template: "/articles.html?page={page}"
      supports_categories: false
      requires_details: true
      requires_rate_limit: false

    - source_id: actualite.cd
      source_url: https://actualite.cd
      source_date:
        pattern: "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/"
        replacement: "$4-$3-$2 $5"
      source_selectors:
        articles: "#views-bootstrap-taxonomy-term-page-2 > div > div"
        article_title: "#actu-titre a"
        article_link: "#actu-titre a"
        article_categories: "#actu-cat a"
        article_body: ".views-field.views-field-body"
        article_date: "#p-date"
      pagination_template: "/actualite?page={page}"
      supports_categories: false
      requires_details: true
      requires_rate_limit: false

  wordpress:
    - source_id: beto.cd
      source_url: https://beto.cd
      requires_rate_limit: true
    - source_id: newscd.net
      source_url: https://newscd.net
@@ -1,160 +0,0 @@
# Fetching and crawling configuration
fetch:
  client:
    timeout: 20
    user_agent: Basango/0.1 (+https://github.com/bernard-ng/basango)
    follow_redirects: true
    verify_ssl: true
    rotate_user_agent: true
    max_retries: 3
    backoff_initial: 1.0
    backoff_multiplier: 2.0
    backoff_max: 30.0
    respect_retry_after: true
  crawler:
    notify: false
    use_multi_threading: false
    max_workers: 5

# Logging configuration
# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
logging:
  level: "ERROR"
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  file_logging: true        # Enable logging to file
  console_logging: true     # Enable logging to console
  log_file: "pipeline.log"  # Log file name
  max_log_size: 10485760    # Maximum size of log file before rotation (10MB)
  backup_count: 5           # Number of backup log files to keep

# Source configurations
sources:
  html:
    - source_id: radiookapi.net
      source_url: https://www.radiookapi.net
      source_date:
        pattern: "/(\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
        replacement: "$3-$2-$1 $4"
      source_selectors:
        articles: ".view-content > .views-row.content-row"
        article_title: ".views-field-title a"
        article_link: ".views-field-title a"
        article_body: ".field-name-body"
        article_date: ".views-field-created"
        article_categories: ".views-field-field-cat-gorie a"
        pagination: "ul.pagination > li a(:last-child)"
      pagination_template: "/actualite?page={page}"
      supports_categories: false
      requires_details: false
      requires_rate_limit: false

    - source_id: 7sur7.cd
      source_url: https://7sur7.cd
      source_date:
        pattern: "/\\w{3} (\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
        replacement: "$3-$2-$1 $4"
      categories: [ "politique", "economie", "culture", "sport", "societe" ]
      source_selectors:
        articles: ".view-content > .row.views-row"
        article_title: ".views-field-title a"
        article_link: ".views-field-title a"
        article_body: ".field.field--name-body"
        article_date: ".views-field-created"
        pagination: "ul.pagination > li a(:last-child)"
      pagination_template: "/index.php/category/{category}?page={page}"
      supports_categories: true
      requires_details: false
      requires_rate_limit: false

    - source_id: mediacongo.net
      source_url: https://mediacongo.net
      source_date:
        format: "%d.%m.%Y %H:%M"
      source_selectors:
        articles: ".for_aitems > .article_other_item"
        article_title: "img"
        article_link: "a(:first-child)"
        article_categories: "a.color_link"
        article_body: ".article_ttext"
        article_date: ".article_other_about"
        pagination: ".nav > a(:last-child)"
      pagination_template: "/articles.html?page={page}"
      supports_categories: false
      requires_details: true
      requires_rate_limit: false

    - source_id: actualite.cd
      source_url: https://actualite.cd
      source_date:
        pattern: "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/"
        replacement: "$4-$3-$2 $5"
      source_selectors:
        articles: "#views-bootstrap-taxonomy-term-page-2 > div > div"
        article_title: "#actu-titre a"
        article_link: "#actu-titre a"
        article_categories: "#actu-cat a"
        article_body: ".views-field.views-field-body"
        article_date: "#p-date"
      pagination_template: "/actualite?page={page}"
      supports_categories: false
      requires_details: true
      requires_rate_limit: false

  wordpress:
    - source_id: beto.cd
      source_url: https://beto.cd
      requires_rate_limit: true
    - source_id: newscd.net
      source_url: https://newscd.net
    - source_id: africanewsrdc.net
      source_url: https://www.africanewsrdc.net
    - source_id: angazainstitute.ac.cd
      source_url: https://angazainstitute.ac.cd
    - source_id: b-onetv.cd
      source_url: https://b-onetv.cd
    - source_id: bukavufm.com
      source_url: https://bukavufm.com
    - source_id: changement7.net
      source_url: https://changement7.net
    - source_id: congoactu.net
      source_url: https://congoactu.net
    - source_id: congoindependant.com
      source_url: https://www.congoindependant.com
    - source_id: congoquotidien.com
      source_url: https://www.congoquotidien.com
    - source_id: cumulard.cd
      source_url: https://www.cumulard.cd
    - source_id: environews-rdc.net
      source_url: https://environews-rdc.net
    - source_id: freemediardc.info
      source_url: https://www.freemediardc.info
    - source_id: geopolismagazine.org
      source_url: https://geopolismagazine.org
    - source_id: habarirdc.net
      source_url: https://habarirdc.net
    - source_id: infordc.com
      source_url: https://infordc.com
    - source_id: kilalopress.net
      source_url: https://kilalopress.net
    - source_id: laprosperiteonline.net
      source_url: https://laprosperiteonline.net
    - source_id: laprunellerdc.cd
      source_url: https://laprunellerdc.cd
    - source_id: lesmedias.net
      source_url: https://lesmedias.net
    - source_id: lesvolcansnews.net
      source_url: https://lesvolcansnews.net
    - source_id: netic-news.net
      source_url: https://www.netic-news.net
    - source_id: objectif-infos.cd
      source_url: https://objectif-infos.cd
    - source_id: scooprdc.net
      source_url: https://scooprdc.net
    - source_id: journaldekinshasa.com
      source_url: https://www.journaldekinshasa.com
    - source_id: lepotentiel.cd
      source_url: https://lepotentiel.cd
    - source_id: acturdc.com
      source_url: https://acturdc.com
    - source_id: matininfos.net
      source_url: https://matininfos.net
@@ -1,160 +0,0 @@
# Fetching and crawling configuration
fetch:
  client:
    timeout: 20
    user_agent: Basango/0.1 (+https://github.com/bernard-ng/basango)
    follow_redirects: true
    verify_ssl: true
    rotate: true
    max_retries: 3
    backoff_initial: 1.0
    backoff_multiplier: 2.0
    backoff_max: 30.0
    respect_retry_after: true
  crawler:
    notify: false
    use_multi_threading: false
    max_workers: 5

# Logging configuration
# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
logging:
  level: "INFO"
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  file_logging: true        # Enable logging to file
  console_logging: true     # Enable logging to console
  log_file: "pipeline.log"  # Log file name
  max_log_size: 10485760    # Maximum size of log file before rotation (10MB)
  backup_count: 5           # Number of backup log files to keep

# Source configurations
sources:
  html:
    - source_id: radiookapi.net
      source_url: https://www.radiookapi.net
      source_date:
        pattern: "/(\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
        replacement: "$3-$2-$1 $4"
      source_selectors:
        articles: ".view-content > .views-row.content-row"
        article_title: "h1.page-header"
        article_link: ".views-field-title a"
        article_body: ".field-name-body"
        article_date: ".views-field-created"
        article_categories: ".views-field-field-cat-gorie a"
        pagination: "ul.pagination > li.pager-last > a"
      pagination_template: "actualite"
      supports_categories: false
      requires_details: true
      requires_rate_limit: false

    - source_id: 7sur7.cd
      source_url: https://7sur7.cd
      source_date:
        pattern: "/\\w{3} (\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
        replacement: "$3-$2-$1 $4"
      categories: [ "politique", "economie", "culture", "sport", "societe" ]
      source_selectors:
        articles: ".view-content > .row.views-row"
        article_title: ".views-field-title a"
        article_link: ".views-field-title a"
        article_body: ".field.field--name-body"
        article_date: ".views-field-created"
        pagination: "ul.pagination > li.pager__item.pager__item--last > a"
      pagination_template: "index.php/category/{category}"
      supports_categories: true
      requires_details: false
      requires_rate_limit: false

    - source_id: mediacongo.net
      source_url: https://www.mediacongo.net
      source_date:
        format: "%d.%m.%Y %H:%M"
      source_selectors:
        articles: ".for_aitems > .article_other_item"
        article_title: "img"
        article_link: "a:first-child"
        article_categories: "a.color_link"
        article_body: ".article_ttext"
        article_date: ".article_other_about"
        pagination: "div.pagination > div > a:last-child"
      pagination_template: "articles.html"
      supports_categories: false
      requires_details: true
      requires_rate_limit: false

    - source_id: actualite.cd
      source_url: https://actualite.cd
      source_date:
        pattern: "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/"
        replacement: "$4-$3-$2 $5"
      source_selectors:
        articles: "#views-bootstrap-taxonomy-term-page-2 > div > div"
        article_title: "#actu-titre a"
        article_link: "#actu-titre a"
        article_categories: "#actu-cat a"
        article_body: ".views-field.views-field-body"
        article_date: "#p-date"
      pagination_template: "actualite"
      supports_categories: false
      requires_details: true
      requires_rate_limit: false

  wordpress:
    - source_id: beto.cd
      source_url: https://beto.cd
      requires_rate_limit: true
    - source_id: newscd.net
      source_url: https://newscd.net
    - source_id: africanewsrdc.net
      source_url: https://www.africanewsrdc.net
    - source_id: angazainstitute.ac.cd
      source_url: https://angazainstitute.ac.cd
    - source_id: b-onetv.cd
      source_url: https://b-onetv.cd
    - source_id: bukavufm.com
      source_url: https://bukavufm.com
    - source_id: changement7.net
      source_url: https://changement7.net
    - source_id: congoactu.net
      source_url: https://congoactu.net
    - source_id: congoindependant.com
      source_url: https://www.congoindependant.com
    - source_id: congoquotidien.com
      source_url: https://www.congoquotidien.com
    - source_id: cumulard.cd
      source_url: https://www.cumulard.cd
    - source_id: environews-rdc.net
      source_url: https://environews-rdc.net
    - source_id: freemediardc.info
      source_url: https://www.freemediardc.info
    - source_id: geopolismagazine.org
      source_url: https://geopolismagazine.org
    - source_id: habarirdc.net
      source_url: https://habarirdc.net
    - source_id: infordc.com
      source_url: https://infordc.com
    - source_id: kilalopress.net
      source_url: https://kilalopress.net
    - source_id: laprosperiteonline.net
      source_url: https://laprosperiteonline.net
    - source_id: laprunellerdc.cd
      source_url: https://laprunellerdc.cd
    - source_id: lesmedias.net
      source_url: https://lesmedias.net
    - source_id: lesvolcansnews.net
      source_url: https://lesvolcansnews.net
    - source_id: netic-news.net
      source_url: https://www.netic-news.net
    - source_id: objectif-infos.cd
      source_url: https://objectif-infos.cd
    - source_id: scooprdc.net
      source_url: https://scooprdc.net
    - source_id: journaldekinshasa.com
      source_url: https://www.journaldekinshasa.com
    - source_id: lepotentiel.cd
      source_url: https://lepotentiel.cd
    - source_id: acturdc.com
      source_url: https://acturdc.com
    - source_id: matininfos.net
      source_url: https://matininfos.net
@@ -1,39 +0,0 @@
[project]
name = "basango"
version = "0.1.0"
description = "Basango: Web Scraper for DRC News"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "pydantic>=2.11.7",
    "pydantic-settings>=2.10.1",
    "rq>=2.5.0",
    "typer>=0.16.1",
    "uv-build>=0.8.12,<0.9.0",
    "pyyaml>=6.0.2",
    "httpx>=0.27.2",
    "trafilatura>=1.7.0",
    "selectolax>=0.3.20",
    "markdownify>=0.13.1",
    "readability-lxml>=0.8.1",
    "beautifulsoup4>=4.13.5",
    "tiktoken>=0.12.0",
]

[dependency-groups]
dev = [
    "bandit>=1.8.6",
    "pyright>=1.1.404",
    "pytest>=8.4.1",
    "ruff>=0.12.9",
]

[project.scripts]
basango = "basango:main"

[build-system]
requires = ["uv_build>=0.8.12,<0.9.0"]
build-backend = "uv_build"

[tool.pytest.ini_options]
testpaths = ["tests"]
@@ -1,9 +0,0 @@
def main() -> None:
    # Lazy import to avoid importing CLI deps during package import
    from basango.cli import app

    app()


if __name__ == "__main__":  # pragma: no cover
    main()
@@ -1,186 +0,0 @@
"""
CLI entry points for crawling and worker management.

Sync vs async usage
- Synchronous crawl: runs the selected crawler in-process and writes results
  via configured persistors (CSV/JSON). Suitable for local development or
  small runs.
- Asynchronous crawl: enqueues a listing job in Redis (RQ) and returns
  immediately. One or more RQ workers must be running to process jobs.

Examples
- Sync: `basango crawl --source-id my-source --page 1:3`
- Async: `basango crawl --source-id my-source --async`
- Worker (macOS friendly): `basango worker --simple -q articles`

Environment
- `BASANGO_REDIS_URL` points the worker/queues to Redis.
- `BASANGO_QUEUE_PREFIX` namespaces queues (default: `crawler`).
"""

from typing import List, Optional
from enum import Enum

import typer

from basango.core.config import CrawlerConfig
from basango.core.config_manager import ConfigManager
from basango.domain import DateRange, PageRange, UpdateDirection
from basango.services import JsonPersistor
from basango.services.crawler.async_api import (
    QueueSettings,
    schedule_async_crawl,
    start_worker,
)
from basango.services.crawler.html_crawler import HtmlCrawler
from basango.services.crawler.wordpress_crawler import WordpressCrawler

app = typer.Typer(no_args_is_help=True, add_completion=False)


class QueueName(str, Enum):
    listing = "listing"
    articles = "articles"
    processed = "processed"


@app.command("crawl")
def crawl_cmd(
    source_id: str = typer.Option(
        ..., help="Source id to crawl (as defined in config)"
    ),
    page: str = typer.Option(None, "--page", "-p", help="Page range e.g. '1:10'"),
    date: str = typer.Option(
        None, "--date", "-d", help="Date range e.g. '2024-10-01:2024-10-31'"
    ),
    category: str = typer.Option(None, "--category", "-g", help="Optional category"),
    notify: bool = typer.Option(False, "--notify", "-n", help="Enable notifications"),
    env: str = typer.Option("development", "--env", "-c", help="Environment"),
    async_mode: bool = typer.Option(
        False,
        "--async/--no-async",
        help="Schedule crawl through Redis queues instead of running synchronously.",
    ),
) -> None:
    """Crawl a single source, either synchronously or via the async queue.

    Technical notes
    - When `--async` is set, we only enqueue a job (no crawling happens here).
      This keeps the CLI responsive and leaves fault-tolerance to RQ workers.
    - Persistors (CSV/JSON) are instantiated only for the sync path; the async
      path assigns them inside worker tasks to avoid importing heavy deps in
      the CLI process and to better isolate failures.
    """
    manager = ConfigManager()
    pipeline = manager.get(env)
    manager.ensure_directories(pipeline)
    manager.setup_logging(pipeline)

    source = pipeline.sources.find(source_id)
    if source is None:
        raise typer.BadParameter(f"Source '{source_id}' not found in config")

    if async_mode:
        job_id = schedule_async_crawl(
            source_id=source_id,
            env=env,
            page_range=page,
            date_range=date,
            category=category,
        )
        typer.echo(
            f"Scheduled async crawl job {job_id} for source '{source_id}' on queue"
        )
        return

    crawler_config = CrawlerConfig(
        source=source,
        page_range=PageRange.create(page) if page else None,
        date_range=DateRange.create(date) if date else None,
        category=category,
        notify=notify,
        direction=UpdateDirection.FORWARD,
    )

    crawlers = [
        HtmlCrawler,
        WordpressCrawler,
    ]

    source_identifier = getattr(source, "source_id", source_id) or source_id
    persistors = [
        JsonPersistor(
            data_dir=pipeline.paths.data,
            source_id=str(source_identifier),
        ),
    ]

    for crawler_cls in crawlers:
        if crawler_cls.supports() == source.source_kind:
            crawler = crawler_cls(
                crawler_config,
                pipeline.fetch.client,
                persistors=persistors,
            )
            crawler.fetch()
            break


@app.command("worker")
def worker_cmd(
    queue: Optional[List[QueueName]] = typer.Option(
        None,
        "--queue",
        "-q",
        help=(
            "Queue name(s) (without prefix). Choices: listing, articles, processed. "
            "Provide multiple times to listen to more than one queue."
        ),
    ),
    simple: bool = typer.Option(
        False,
        "--simple/--no-simple",
        help=(
            "Run jobs in-process using RQ SimpleWorker (no forking). "
            "Recommended on macOS to avoid fork-related crashes."
        ),
    ),
    burst: bool = typer.Option(
        False,
        "--burst",
        help="Process available jobs and exit instead of running continuously.",
    ),
    redis_url: str = typer.Option(
        None,
        "--redis-url",
        help="Redis connection URL. Defaults to BASANGO_REDIS_URL.",
    ),
    env: str = typer.Option(
        "development",
        "--env",
        "-c",
        help="Environment used to configure logging before starting the worker.",
    ),
) -> None:
    """Run an RQ worker that consumes crawler queues.

    Notes
    - By default the worker listens to the `articles` queue (detail jobs). Use
      `-q listing -q articles -q processed` to listen to multiple.
    - `--simple` uses RQ's SimpleWorker (no forking). On macOS this avoids
      fork-related crashes when libraries aren't fork-safe.
    - Use `--burst` to drain the queue and exit, useful for CI or one-off runs.
    """
    manager = ConfigManager()
    pipeline = manager.get(env)
    manager.ensure_directories(pipeline)
    manager.setup_logging(pipeline)

    settings = QueueSettings(redis_url=redis_url) if redis_url else QueueSettings()
    queue_names = [q.value for q in queue] if queue else None
    start_worker(
        queue_names=queue_names,
        settings=settings,
        burst=burst,
        simple=simple,
    )
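As a quick sanity check, the Typer app above can be exercised in-process; a minimal sketch using `CliRunner` from Typer's public testing API (the assertion on `--source-id` assumes Typer's default option naming):

```python
# Sketch: invoke the CLI in-process, e.g. from a pytest test.
from typer.testing import CliRunner

from basango.cli import app

runner = CliRunner()
result = runner.invoke(app, ["crawl", "--help"])
assert result.exit_code == 0
assert "--source-id" in result.output  # option derived from the source_id param
```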
@@ -1,19 +0,0 @@
from .fetch_config import ClientConfig, FetchConfig, CrawlerConfig
from .logging_config import LoggingConfig
from .pipeline_config import PipelineConfig
from .source_config import (
    WordPressSourceConfig,
    HtmlSourceConfig,
    SourcesConfig,
)

__all__ = [
    "ClientConfig",
    "FetchConfig",
    "CrawlerConfig",
    "LoggingConfig",
    "PipelineConfig",
    "WordPressSourceConfig",
    "HtmlSourceConfig",
    "SourcesConfig",
]
@@ -1,71 +0,0 @@
from typing import Optional, Union

from pydantic import BaseModel, Field

from basango.domain import PageRange, DateRange, UpdateDirection
from basango.core.config.source_config import HtmlSourceConfig, WordPressSourceConfig


class ClientConfig(BaseModel):
    timeout: float = Field(default=20.0, description="Request timeout in seconds")
    user_agent: str = Field(
        default="Basango/0.1 (+https://github.com/bernard-ng/basango)"
    )
    follow_redirects: bool = Field(default=True, description="Follow HTTP redirects")
    verify_ssl: bool = Field(default=True, description="Verify SSL certificates")
    rotate: bool = Field(default=True, description="Rotate User-Agent header")
    max_retries: int = Field(
        default=3, description="Maximum number of retries on failure"
    )
    backoff_initial: float = Field(
        default=1.0, description="Initial backoff delay in seconds"
    )
    backoff_multiplier: float = Field(default=2.0, description="Backoff multiplier")
    backoff_max: float = Field(
        default=30.0, description="Maximum backoff delay in seconds"
    )
    respect_retry_after: bool = Field(
        default=True, description="Respect Retry-After header if present"
    )


class CrawlerConfig(BaseModel):
    source: Optional[Union[HtmlSourceConfig, WordPressSourceConfig]] = Field(
        default=None, description="Source configuration to crawl"
    )
    page_range: Optional[PageRange] = Field(
        default=None, description="Page range to crawl, e.g: 1:10"
    )
    date_range: Optional[DateRange] = Field(
        default=None,
        description="Date range to filter articles, e.g: 2024-10-01:2024-10-31",
    )
    category: Optional[str] = Field(
        default=None, description="Optional category to filter articles"
    )
    notify: bool = Field(
        default=False, description="Enable notifications after crawling"
    )

    is_update: bool = Field(
        default=False,
        description="Whether this crawl is an update (True) or a full crawl (False)",
    )
    use_multi_threading: bool = Field(
        default=False, description="Enable multiprocessing for concurrent crawling"
    )
    max_workers: int = Field(
        default=5, description="Maximum number of concurrent crawling workers"
    )
    direction: UpdateDirection = Field(
        default=UpdateDirection.FORWARD, description="Crawling direction"
    )


class FetchConfig(BaseModel):
    client: ClientConfig = Field(
        default_factory=ClientConfig, description="Http client configuration"
    )
    crawler: CrawlerConfig = Field(
        default_factory=CrawlerConfig, description="Crawler configuration"
    )
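Because these are ordinary pydantic models, overriding one field leaves the remaining defaults intact; a small sketch of how they compose:

```python
# Sketch: building the fetch configuration directly instead of from YAML.
from basango.core.config import ClientConfig, FetchConfig

fetch = FetchConfig(client=ClientConfig(timeout=10, max_retries=5))
assert fetch.client.backoff_multiplier == 2.0  # untouched fields keep defaults
assert fetch.crawler.max_workers == 5          # nested defaults via default_factory

# model_dump() round-trips to the dict shape used in config/pipeline.*.yaml
print(fetch.model_dump()["client"]["timeout"])  # 10.0
```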
@@ -1,11 +0,0 @@
from pydantic import BaseModel


class LoggingConfig(BaseModel):
    level: str = "INFO"
    format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    file_logging: bool = True
    console_logging: bool = True
    log_file: str = "pipeline.log"
    max_log_size: int = 10 * 1024 * 1024  # 10MB
    backup_count: int = 5
@@ -1,25 +0,0 @@
from pathlib import Path
from pydantic import Field, BaseModel

from basango.core.config.fetch_config import FetchConfig
from basango.core.config.logging_config import LoggingConfig
from basango.core.config.source_config import SourcesConfig
from basango.core.project_paths import ProjectPaths


def _default_project_paths() -> ProjectPaths:
    """Create default project paths relative to the project root."""
    root = Path.cwd()
    return ProjectPaths(
        root=root,
        configs=root / "config",
        data=root / "data" / "dataset",
        logs=root / "data" / "logs",
    )


class PipelineConfig(BaseModel):
    paths: ProjectPaths = Field(default_factory=_default_project_paths, alias="paths")
    logging: LoggingConfig = Field(default_factory=LoggingConfig)
    fetch: FetchConfig = Field(default_factory=FetchConfig)
    sources: SourcesConfig = Field(default_factory=SourcesConfig)
@@ -1,66 +0,0 @@
from typing import Optional, Union

from pydantic import BaseModel, Field, HttpUrl

from basango.domain import SourceDate, SourceKind, SourceSelectors


class SourceConfigBase(BaseModel):
    source_id: str = Field(..., description="Unique identifier for the source")
    source_url: HttpUrl = Field(..., description="URL of the source")
    source_date: SourceDate = Field(
        default_factory=SourceDate, description="Date extraction schema"
    )
    source_kind: SourceKind = Field(
        ..., description="Type of the source, e.g., 'wordpress' or 'html'"
    )
    categories: list[str] = Field(
        default_factory=list, description="List of categories to filter articles"
    )

    supports_categories: bool = Field(
        default=False, description="the source supports categories"
    )
    requires_details: bool = Field(
        default=False, description="detailed article is required to compute date range"
    )
    requires_rate_limit: bool = Field(
        default=False, description="requires rate limit to avoid being blocked"
    )


class WordPressSourceConfig(SourceConfigBase):
    source_kind: SourceKind = Field(
        default=SourceKind.WORDPRESS, description="Type of the source"
    )
    source_date: SourceDate = SourceDate(
        format="%Y-%m-%dT%H:%M:%S", pattern=None, replacement=None
    )


class HtmlSourceConfig(SourceConfigBase):
    source_kind: SourceKind = Field(
        default=SourceKind.HTML, description="Type of the source"
    )
    source_selectors: SourceSelectors = Field(
        default_factory=lambda: SourceSelectors(),
        description="CSS selectors for extracting articles",
    )
    pagination_template: str = Field(
        ..., description="Template URL for pagination, e.g., '/actualite?page={page}'"
    )


class SourcesConfig(BaseModel):
    html: list[HtmlSourceConfig] = Field(
        default_factory=list, description="List of source configurations"
    )
    wordpress: list[WordPressSourceConfig] = Field(
        default_factory=list, description="List of source configurations"
    )

    def find(
        self, source_id: str
    ) -> Optional[Union[HtmlSourceConfig, WordPressSourceConfig]]:
        # Return None for unknown ids: callers (the CLI and the worker tasks)
        # check `if source is None` rather than catching an exception.
        for source in self.html + self.wordpress:
            if source.source_id == source_id:
                return source
        return None
@@ -1,149 +0,0 @@
import logging
import sys
from pathlib import Path
from typing import Optional, Union, Dict

import yaml

from basango.core.config import PipelineConfig
from basango.core.project_paths import ProjectPaths


def _ensure_utf8_stream(stream):
    try:
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")
    except (AttributeError, ValueError):
        return stream
    return stream


class ConfigManager:
    def __init__(self, config_path: Optional[Union[str, Path]] = None):
        self.config_path = Path(config_path) if config_path else self._find_config()
        self._config: Optional[PipelineConfig] = None
        self._setup_paths()

    def get(self, env: Optional[str] = None) -> PipelineConfig:
        if env:
            path = self.config_path.parent / f"pipeline.{env}.yaml"

            if path.exists():
                base = self.load().model_dump()
                self._override(base, self.load(path).model_dump())
                return PipelineConfig(**base)

        if self._config is None:
            self._config = self.load()
        return self._config

    def load(self, config_path: Optional[Path] = None) -> PipelineConfig:
        """Load configuration from file"""
        self.config_path = config_path if config_path else self._find_config()

        if not self.config_path.exists():
            logging.warning(
                f"Config file not found: {self.config_path}. Using defaults."
            )
            return self._create_default()

        try:
            with open(self.config_path, "r") as f:
                config_data = yaml.safe_load(f)

            if "paths" not in config_data:
                config_data["paths"] = self.default_paths.model_dump()

            self._config = PipelineConfig(**config_data)
            return self._config

        except Exception as e:
            logging.error(f"Failed to load config from {self.config_path}: {e}")
            return self._create_default()

    @classmethod
    def ensure_directories(cls, cfg: PipelineConfig) -> None:
        directories = [cfg.paths.data, cfg.paths.logs, cfg.paths.configs]

        for directory in directories:
            Path(directory).mkdir(parents=True, exist_ok=True)

        logging.info("Ensured all required directories exist")

    @classmethod
    def setup_logging(cls, cfg: PipelineConfig):
        logs_path = cfg.paths.logs
        logs_path.mkdir(parents=True, exist_ok=True)

        # Setup logging configuration
        log_level = getattr(logging, cfg.logging.level.upper(), logging.INFO)

        # Create formatter
        formatter = logging.Formatter(cfg.logging.format)

        # Setup root logger
        root_logger = logging.getLogger()
        root_logger.setLevel(log_level)

        # Clear existing handlers
        root_logger.handlers.clear()

        _ensure_utf8_stream(sys.stdout)
        _ensure_utf8_stream(sys.stderr)
        # Console handler
        if cfg.logging.console_logging:
            console_handler = logging.StreamHandler(
                stream=_ensure_utf8_stream(sys.stderr)
            )
            console_handler.setFormatter(formatter)
            root_logger.addHandler(console_handler)

        # File handler
        if cfg.logging.file_logging:
            from logging.handlers import RotatingFileHandler

            log_file_path = logs_path / cfg.logging.log_file
            file_handler = RotatingFileHandler(
                log_file_path,
                maxBytes=cfg.logging.max_log_size,
                backupCount=cfg.logging.backup_count,
                encoding="utf-8",
            )
            file_handler.setFormatter(formatter)
            root_logger.addHandler(file_handler)

    @classmethod
    def _find_config(cls) -> Path:
        possible_paths = [
            Path.cwd() / "config" / "pipeline.yaml",
            Path.cwd() / "config" / "pipeline.yml",
            Path.cwd() / "pipeline.yaml",
            Path(__file__).parent.parent.parent.parent / "config" / "pipeline.yaml",
        ]

        for path in possible_paths:
            if path.exists():
                return path

        raise FileNotFoundError(
            "No configuration file found in the expected locations."
        )

    def _setup_paths(self) -> None:
        root = Path(__file__).parent.parent.parent.parent
        self.default_paths = ProjectPaths(
            root=root,
            configs=root / "config",
            data=root / "data" / "dataset",
            logs=root / "data" / "logs",
        )

    def _create_default(self) -> PipelineConfig:
        return PipelineConfig(paths=self.default_paths)

    def _override(self, base: Dict, update: Dict):
        for key, value in update.items():
            if key in base and isinstance(base[key], dict) and isinstance(value, dict):
                self._override(base[key], value)
            else:
                base[key] = value
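The `_override` method is what gives `pipeline.<env>.yaml` its overlay semantics: nested keys are merged recursively, so an environment file only needs the keys it changes. A standalone sketch of the same merge logic (reimplemented here so it runs without a config file on disk):

```python
# Sketch of ConfigManager._override's deep-merge behaviour.
def override(base: dict, update: dict) -> None:
    for key, value in update.items():
        if key in base and isinstance(base[key], dict) and isinstance(value, dict):
            override(base[key], value)  # recurse into shared nested dicts
        else:
            base[key] = value           # scalars and new keys replace outright

base = {"fetch": {"client": {"timeout": 20, "max_retries": 3}}}
override(base, {"fetch": {"client": {"timeout": 5}}})
assert base["fetch"]["client"] == {"timeout": 5, "max_retries": 3}
```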
@@ -1,26 +0,0 @@
from pathlib import Path

from pydantic import BaseModel, field_validator, ConfigDict


class ProjectPaths(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    root: Path
    data: Path
    logs: Path
    configs: Path

    # field_validator must be the outermost decorator (pydantic v2 convention)
    @field_validator("*", mode="before")
    @classmethod
    def convert_to_path(cls, v):
        return Path(v) if not isinstance(v, Path) else v

    def get_data_path(self, filename: str) -> Path:
        return self.data / filename

    def get_logs_path(self, filename: str) -> Path:
        return self.logs / filename

    def get_config_path(self, filename: str) -> Path:
        return self.configs / filename
@@ -1,15 +0,0 @@
from .article import Article
from .date_range import DateRange
from .page_range import PageRange
from .source import SourceKind, SourceDate, SourceSelectors
from .update_direction import UpdateDirection

__all__ = [
    "Article",
    "DateRange",
    "PageRange",
    "SourceKind",
    "SourceDate",
    "SourceSelectors",
    "UpdateDirection",
]
||||
@@ -1,30 +0,0 @@
|
||||
from datetime import datetime
|
||||
from typing import Any, Optional
|
||||
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from .token_statistics import TokenStatistics
|
||||
|
||||
|
||||
class Article(BaseModel):
|
||||
title: str
|
||||
link: HttpUrl
|
||||
body: str
|
||||
categories: list[str]
|
||||
source: str
|
||||
timestamp: datetime
|
||||
metadata: Optional[dict[str, Any]] = None
|
||||
token_statistics: Optional["TokenStatistics"] = None
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"title": self.title,
|
||||
"link": str(self.link),
|
||||
"body": self.body,
|
||||
"categories": self.categories,
|
||||
"source": self.source,
|
||||
"timestamp": int(self.timestamp.timestamp()),
|
||||
"metadata": self.metadata,
|
||||
"tokenStatistics": self.token_statistics.to_dict()
|
||||
if self.token_statistics
|
||||
else "",
|
||||
}
|
||||
@@ -1,64 +0,0 @@
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def _ensure_utc(dt: datetime) -> datetime:
|
||||
if dt.tzinfo is None:
|
||||
return dt.replace(tzinfo=timezone.utc)
|
||||
return dt
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class DateRange:
|
||||
start: int # Unix timestamp
|
||||
end: int # Unix timestamp
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
assert self.start != 0, "[DateRange] Start timestamp cannot be 0"
|
||||
assert self.end != 0, "[DateRange] End timestamp cannot be 0"
|
||||
assert self.end >= self.start, (
|
||||
"[DateRange] End must be greater than or equal to start"
|
||||
)
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"{self.start}:{self.end}"
|
||||
|
||||
def in_range(self, ts: int) -> bool:
|
||||
return self.start <= ts <= self.end
|
||||
|
||||
def out_range(self, ts: int) -> bool:
|
||||
return ts < self.start or ts > self.end
|
||||
|
||||
def format(self, fmt: str = "%Y-%m-%d") -> str:
|
||||
start = datetime.fromtimestamp(self.start, tz=timezone.utc).strftime(fmt)
|
||||
end = datetime.fromtimestamp(self.end, tz=timezone.utc).strftime(fmt)
|
||||
return f"{start}:{end}"
|
||||
|
||||
@classmethod
|
||||
def create(
|
||||
cls, spec: str, fmt: str = "%Y-%m-%d", separator: str = ":"
|
||||
) -> "DateRange":
|
||||
assert separator != "", "[DateRange] Separator cannot be empty"
|
||||
assert separator in spec, f"[DateRange] {separator} must be in {spec}"
|
||||
|
||||
parts = spec.split(separator)
|
||||
assert len(parts) == 2, f"[DateRange] Invalid date interval: {spec}"
|
||||
|
||||
start = _ensure_utc(datetime.strptime(parts[0], fmt))
|
||||
end = _ensure_utc(datetime.strptime(parts[1], fmt))
|
||||
return cls(int(start.timestamp()), int(end.timestamp()))
|
||||
|
||||
@classmethod
|
||||
def backward(cls, date: Optional[datetime] = None, days: int = 30) -> "DateRange":
|
||||
base = _ensure_utc(date or datetime.now(timezone.utc))
|
||||
|
||||
start = base - timedelta(days=days)
|
||||
end = base + timedelta(days=1) # in future to avoid timezone issues
|
||||
return cls(int(start.timestamp()), int(end.timestamp()))
|
||||
|
||||
@classmethod
|
||||
def forward(cls, date: datetime) -> "DateRange":
|
||||
start = _ensure_utc(date)
|
||||
end = datetime.now(timezone.utc) + timedelta(days=1)
|
||||
return cls(int(start.timestamp()), int(end.timestamp()))
|
||||
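A sketch of how the `-d` spec from the CLI maps onto this class, parsing the date pair into UTC Unix timestamps:

```python
# Sketch: DateRange.create parses "start:end" date specs into UTC epochs.
from basango.domain import DateRange

r = DateRange.create("2024-10-01:2024-10-31")
assert r.start == 1727740800              # 2024-10-01T00:00:00Z
assert r.in_range(r.start)
assert not r.out_range(r.end)
assert r.format() == "2024-10-01:2024-10-31"
```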
@@ -1,18 +0,0 @@
from basango.domain import DateRange


class ArticleNotFoundError(Exception):
    pass


class ArticleOutOfRange(Exception):
    def __init__(self, timestamp: str, date_range: DateRange):
        self.timestamp = timestamp
        self.date_range = date_range
        super().__init__(
            f"Article with timestamp {timestamp} is out of range {date_range}"
        )

    @classmethod
    def create(cls, timestamp: str, date_range: DateRange) -> "ArticleOutOfRange":
        return cls(timestamp, date_range)
@@ -1,20 +0,0 @@
from dataclasses import dataclass


@dataclass(frozen=True)
class PageRange:
    start: int
    end: int

    @staticmethod
    def create(spec: str) -> "PageRange":
        parts = spec.split(":")
        assert len(parts) == 2, f"[PageRange] Invalid page range: {spec}"

        start, end = int(parts[0]), int(parts[1])
        assert start >= 0, f"[PageRange] Invalid page range: {spec}"
        assert end >= start, f"[PageRange] Invalid page range: {spec}"
        return PageRange(start=start, end=end)

    def __str__(self):
        return f"{self.start}:{self.end}"
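Page specs use the same "start:end" shape as the CLI's `-p` option; a one-line sketch:

```python
# Sketch: parsing the -p option value.
from basango.domain import PageRange

pages = PageRange.create("1:3")
assert (pages.start, pages.end) == (1, 3) and str(pages) == "1:3"
```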
@@ -1,41 +0,0 @@
from enum import StrEnum
from typing import Optional

from pydantic import BaseModel, Field


class SourceKind(StrEnum):
    WORDPRESS = "wordpress"
    HTML = "html"


class SourceDate(BaseModel):
    format: str = "%Y-%m-%d %H:%M"
    pattern: Optional[str] = None
    replacement: Optional[str] = None


class SourceSelectors(BaseModel):
    articles: Optional[str] = Field(
        default=None, description="CSS selector for the list of articles within a page"
    )
    article_title: Optional[str] = Field(
        default=None, description="CSS selector for the article title"
    )
    article_link: Optional[str] = Field(
        default=None, description="CSS selector for the article link"
    )
    article_body: Optional[str] = Field(
        default=None, description="CSS selector for the article body/content"
    )
    article_date: Optional[str] = Field(
        default=None, description="CSS selector for the article date"
    )
    article_categories: Optional[str] = Field(
        default=None, description="CSS selector for the article categories"
    )

    pagination: str = Field(
        default="ul.pagination > li a",
        description="CSS selector for the pagination links",
    )
@@ -1,19 +0,0 @@
from dataclasses import dataclass


@dataclass
class TokenStatistics:
    """Counts of tokens for different article sections."""

    title: int
    body: int
    excerpt: int
    categories: int

    def to_dict(self) -> dict[str, int]:
        return {
            "title": self.title,
            "body": self.body,
            "excerpt": self.excerpt,
            "categories": self.categories,
        }
@@ -1,6 +0,0 @@
from enum import StrEnum


class UpdateDirection(StrEnum):
    FORWARD = "forward"
    BACKWARD = "backward"
@@ -1,22 +0,0 @@
from .date_parser import DateParser
from .http_client import BaseHttpClient, SyncHttpClient, AsyncHttpClient
from .open_graph import OpenGraphProvider
from .persistence import BasePersistor, CsvPersistor, JsonPersistor
from .user_agents import UserAgents
from .tokenizer import Tokenizer

HttpClient = SyncHttpClient

__all__ = [
    "DateParser",
    "BaseHttpClient",
    "SyncHttpClient",
    "AsyncHttpClient",
    "HttpClient",
    "OpenGraphProvider",
    "UserAgents",
    "BasePersistor",
    "CsvPersistor",
    "JsonPersistor",
    "Tokenizer",
]
@@ -1,22 +0,0 @@
from .queue import QueueManager, QueueSettings
from .schemas import ListingTaskPayload, ArticleTaskPayload, ProcessedTaskPayload
from .tasks import (
    schedule_async_crawl,
    collect_listing,
    collect_article,
    forward_for_processing,
)
from .worker import start_worker

__all__ = [
    "QueueManager",
    "QueueSettings",
    "ListingTaskPayload",
    "ArticleTaskPayload",
    "ProcessedTaskPayload",
    "schedule_async_crawl",
    "collect_listing",
    "collect_article",
    "forward_for_processing",
    "start_worker",
]
@@ -1,92 +0,0 @@
"""
RQ queue configuration and helpers.

Design choices
- Queue names are prefixed (e.g. `crawler:articles`) so multiple environments
  can share the same Redis. Configure via `BASANGO_QUEUE_PREFIX`.
- Job default timeouts and TTLs are centrally configured to avoid per-enqueue
  tuning. Environment variables allow ops to adjust at runtime.
- Task callables are referenced by dotted string path when enqueuing so that
  RQ workers can import them without importing this module and creating cycles.
"""

import os
from dataclasses import dataclass, field
from typing import Iterable

from redis import Redis
from rq import Queue

from .schemas import (
    ArticleTaskPayload,
    ListingTaskPayload,
    ProcessedTaskPayload,
)


@dataclass(slots=True)
class QueueSettings:
    redis_url: str = field(
        default_factory=lambda: os.getenv(  # type: ignore[arg-type]
            "BASANGO_REDIS_URL", "redis://localhost:6379/0"
        )
    )
    prefix: str = field(
        default_factory=lambda: os.getenv("BASANGO_QUEUE_PREFIX", "crawler")
    )
    default_timeout: int = field(
        default_factory=lambda: int(os.getenv("BASANGO_QUEUE_TIMEOUT", "600"))
    )
    result_ttl: int = field(
        default_factory=lambda: int(os.getenv("BASANGO_QUEUE_RESULT_TTL", "3600"))
    )
    failure_ttl: int = field(
        default_factory=lambda: int(os.getenv("BASANGO_QUEUE_FAILURE_TTL", "3600"))
    )
    listing_queue: str = "listing"
    article_queue: str = "articles"
    processed_queue: str = "processed"


class QueueManager:
    def __init__(self, settings: QueueSettings | None = None) -> None:
        self.settings = settings or QueueSettings()
        self.connection = Redis.from_url(self.settings.redis_url)
        self.listing_queue = self._build_queue(self.settings.listing_queue)
        self.article_queue = self._build_queue(self.settings.article_queue)
        self.processed_queue = self._build_queue(self.settings.processed_queue)

    def _build_queue(self, suffix: str) -> Queue:
        return Queue(
            self.queue_name(suffix),
            connection=self.connection,
            default_timeout=self.settings.default_timeout,
            result_ttl=self.settings.result_ttl,
            failure_ttl=self.settings.failure_ttl,
        )

    def queue_name(self, suffix: str) -> str:
        return f"{self.settings.prefix}:{suffix}"

    # The dotted paths below must name the importable package (`async_api`,
    # as imported by the CLI); `async` is a Python keyword and would not be
    # importable as a regular package name.
    def enqueue_listing(self, payload: ListingTaskPayload):
        return self.listing_queue.enqueue(
            "basango.services.crawler.async_api.tasks.collect_listing",
            payload.to_dict(),
        )

    def enqueue_article(self, payload: ArticleTaskPayload):
        return self.article_queue.enqueue(
            "basango.services.crawler.async_api.tasks.collect_article",
            payload.to_dict(),
        )

    def enqueue_processed(self, payload: ProcessedTaskPayload):
        return self.processed_queue.enqueue(
            "basango.services.crawler.async_api.tasks.forward_for_processing",
            payload.to_dict(),
        )

    def iter_queue_names(self) -> Iterable[str]:
        yield self.queue_name(self.settings.listing_queue)
        yield self.queue_name(self.settings.article_queue)
        yield self.queue_name(self.settings.processed_queue)
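A sketch of the prefixing scheme in isolation; computing the names requires no Redis connection, so two deployments sharing one Redis can be kept apart purely via `BASANGO_QUEUE_PREFIX`:

```python
# Sketch: queue names are "<prefix>:<suffix>", e.g. "staging:articles".
from basango.services.crawler.async_api import QueueSettings

settings = QueueSettings(prefix="staging")
assert f"{settings.prefix}:{settings.article_queue}" == "staging:articles"
assert f"{settings.prefix}:{settings.listing_queue}" == "staging:listing"
```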
@@ -1,64 +0,0 @@
"""
Lightweight task payload schemas.

Notes
- Use dataclasses with `slots=True` for low overhead and predictable fields.
- `_coerce_kwargs` filters unknown keys so payloads are resilient to schema
  changes when workers and producers are not updated in lockstep.
"""

from dataclasses import asdict, dataclass, fields
from typing import Any, Mapping

from basango.domain.article import Article


def _coerce_kwargs(cls, data: Mapping[str, Any]) -> dict[str, Any]:
    # Keep only known field names, and leave missing keys out entirely so the
    # dataclass defaults (e.g. env="development") still apply.
    names = {field.name for field in fields(cls)}
    return {key: value for key, value in data.items() if key in names}


@dataclass(slots=True)
class ListingTaskPayload:
    source_id: str
    env: str = "development"
    page_range: str | None = None
    date_range: str | None = None
    category: str | None = None

    def to_dict(self) -> dict[str, Any]:
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Mapping[str, Any]) -> "ListingTaskPayload":
        return cls(**_coerce_kwargs(cls, data))


@dataclass(slots=True)
class ArticleTaskPayload:
    source_id: str
    env: str = "development"
    url: str | None = None
    data: Any | None = None
    date_range: str | None = None
    category: str | None = None

    def to_dict(self) -> dict[str, Any]:
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Mapping[str, Any]) -> "ArticleTaskPayload":
        return cls(**_coerce_kwargs(cls, data))


@dataclass(slots=True)
class ProcessedTaskPayload:
    source_id: str
    article: Article
    env: str = "development"

    def to_dict(self) -> dict[str, Any]:
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Mapping[str, Any]) -> "ProcessedTaskPayload":
        return cls(**_coerce_kwargs(cls, data))
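A sketch of the resilience the docstring describes: `from_dict` drops unknown keys and lets dataclass defaults fill the missing ones, so producers and workers can evolve independently:

```python
# Sketch: round-tripping a payload that carries an unrecognised extra key.
from basango.services.crawler.async_api import ListingTaskPayload

payload = ListingTaskPayload.from_dict(
    {"source_id": "radiookapi.net", "page_range": "1:3", "added_later": True}
)
assert payload.page_range == "1:3"
assert payload.env == "development"          # default applied for missing key
assert "added_later" not in payload.to_dict()  # unknown key silently dropped
```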
@@ -1,305 +0,0 @@
"""
RQ task functions for the asynchronous crawl pipeline.

Pipeline
- schedule_async_crawl: seeds a listing job for a source
- collect_listing: enumerates listing pages and enqueues detail jobs
- collect_article: extracts and persists article data, then forwards
- forward_for_processing: hands the record to the downstream system (HTTP API)

Rationale
- Split listing vs article work to keep jobs small and retryable.
- Use ConfigManager to reconstruct the same pipeline/env in workers.
- Persist locally (CSV/JSON) before forwarding to decouple pipelines.
"""

import os
import logging
from typing import Any

from basango.domain.article import Article
from basango.services import SyncHttpClient
from basango.core.config import CrawlerConfig
from basango.core.config_manager import ConfigManager
from basango.domain import DateRange, PageRange, SourceKind, UpdateDirection
from basango.services import JsonPersistor
from basango.services.crawler.html_crawler import HtmlCrawler
from basango.services.crawler.wordpress_crawler import WordpressCrawler

from .queue import QueueManager, QueueSettings
from .schemas import (
    ArticleTaskPayload,
    ListingTaskPayload,
    ProcessedTaskPayload,
)


logger = logging.getLogger(__name__)


def schedule_async_crawl(
    *,
    source_id: str,
    env: str = "development",
    page_range: str | None = None,
    date_range: str | None = None,
    category: str | None = None,
    settings: QueueSettings | None = None,
):
    # Keep payload serialisable and minimal; workers reconstruct config objects.
    payload = ListingTaskPayload(
        source_id=source_id,
        env=env,
        page_range=page_range,
        date_range=date_range,
        category=category,
    )
    manager = QueueManager(settings=settings)
    job = manager.enqueue_listing(payload)
    logger.info("Scheduled listing collection job %s for source %s", job.id, source_id)
    return job.id


def collect_listing(payload: dict[str, Any]) -> int:
    data = ListingTaskPayload.from_dict(payload)
    manager = ConfigManager()
    pipeline = manager.get(data.env)
    source = pipeline.sources.find(data.source_id)
    if source is None:
        logger.error("Unknown source id %s", data.source_id)
        return 0

    crawler_config = CrawlerConfig(
        source=source,
        page_range=PageRange.create(data.page_range) if data.page_range else None,
        date_range=DateRange.create(data.date_range) if data.date_range else None,
        category=data.category,
        notify=False,
        direction=UpdateDirection.FORWARD,
    )
    client_config = pipeline.fetch.client
    queue_manager = QueueManager()

    # Branch by source kind to reuse the same high-level flow with different
    # extraction strategies.
    if source.source_kind == SourceKind.HTML:
        crawler = HtmlCrawler(crawler_config, client_config)
        queued = _collect_html_listing(crawler, data, queue_manager)
    elif source.source_kind == SourceKind.WORDPRESS:
        crawler = WordpressCrawler(crawler_config, client_config)
        queued = _collect_wordpress_listing(crawler, data, queue_manager)
    else:
        logger.warning(
            "Async crawling not supported for source kind %s", source.source_kind
        )
        queued = 0

    logger.info("Queued %s article detail jobs for source %s", queued, data.source_id)
    return queued


def collect_article(payload: dict[str, Any]) -> Article | None:
    data = ArticleTaskPayload.from_dict(payload)
    manager = ConfigManager()
    pipeline = manager.get(data.env)
    source = pipeline.sources.find(data.source_id)
    if source is None:
        logger.error("Unknown source id %s", data.source_id)
        return None

    crawler_config = CrawlerConfig(
        source=source,
        date_range=DateRange.create(data.date_range) if data.date_range else None,
        category=data.category,
        notify=False,
        direction=UpdateDirection.FORWARD,
    )

    # Persist locally first to keep an auditable trail and enable
    # replay/recovery independent of downstream availability.
    persistors = [
        JsonPersistor(
            data_dir=pipeline.paths.data,
            source_id=str(source.source_id),
        ),
    ]

    try:
        if source.source_kind == SourceKind.HTML:
            article = _collect_html_article(
                HtmlCrawler(
                    crawler_config, pipeline.fetch.client, persistors=persistors
                ),
                data,
            )
        else:
            article = _collect_wordpress_article(
                WordpressCrawler(
                    crawler_config, pipeline.fetch.client, persistors=persistors
                ),
                data,
            )

        queue_manager = QueueManager()
        queue_manager.enqueue_processed(
            ProcessedTaskPayload(
                source_id=data.source_id,
                env=data.env,
                article=article,
            )
        )

        logger.info(
            "Persisted article %s and forwarded to processed queue", article.link
        )
        return article
    except Exception as exc:  # noqa: BLE001
        logger.error(
            "Failed to collect article for source %s url %s: %s",
            data.source_id,
            data.url,
            exc,
        )
        return None


def forward_for_processing(payload: dict[str, Any]) -> Article | None:
    data = ProcessedTaskPayload.from_dict(payload)
    manager = ConfigManager()
    pipeline = manager.get(data.env)

    article = data.article
    logger.info(
        "Ready for downstream processing: source=%s link=%s",
        data.source_id,
        article.link,
    )

    try:
        client = SyncHttpClient(client_config=pipeline.fetch.client)
        client.post(
            os.getenv(
                "BASANGO_API_ENDPOINT",
                "http://localhost:8000/api/aggregator/articles?token=dev",
            ),
            json=article.to_dict(),
        )

        logger.info("Forwarded article %s to API", article.link)
        return article
    except Exception as exc:  # noqa: BLE001
        logger.error(
            "Failed to forward article %s to API: %s",
            article.link,
            exc,
        )
        return None


def _collect_html_listing(
    crawler: HtmlCrawler,
    payload: ListingTaskPayload,
    queue_manager: QueueManager,
) -> int:
    source = crawler.source
    selector = source.source_selectors.articles
    if not selector:
        logger.warning(
            "No article selector configured for HTML source %s",
            source.source_id,
        )
        return 0

    page_range = crawler.config.page_range or crawler.get_pagination()
    queued = 0

    for page in range(page_range.start, page_range.end + 1):
        page_url = crawler._build_page_url(page)
        try:
            soup = crawler.crawl(page_url, page)
        except Exception as exc:  # noqa: BLE001
            logger.exception("Failed to crawl page %s: %s", page_url, exc)
            continue

        for node in soup.select(selector):
            link = crawler._extract_link(node)
            if not link:
                continue
            queue_manager.enqueue_article(
                ArticleTaskPayload(
                    source_id=payload.source_id,
                    env=payload.env,
                    url=link,
                    date_range=payload.date_range,
                    category=payload.category,
                )
            )
            queued += 1

    return queued


def _collect_wordpress_listing(
    crawler: WordpressCrawler,
    payload: ListingTaskPayload,
    queue_manager: QueueManager,
) -> int:
    page_range = crawler.config.page_range or crawler.get_pagination()
    queued = 0

    for page in range(page_range.start, page_range.end + 1):
        endpoint = crawler._posts_endpoint(page)
        try:
            response = crawler.client.get(endpoint)
            articles = response.json()
        except Exception as exc:  # noqa: BLE001
            logger.exception("Failed to fetch WordPress page %s: %s", endpoint, exc)
            continue

        if not isinstance(articles, list):
            logger.warning("Unexpected WordPress payload type: %s", type(articles))
            continue

        for entry in articles:
            queue_manager.enqueue_article(
                ArticleTaskPayload(
                    source_id=payload.source_id,
                    env=payload.env,
                    url=entry.get("link"),
                    data=entry,
                    date_range=payload.date_range,
                    category=payload.category,
                )
            )
            queued += 1

    return queued


def _collect_html_article(
    crawler: HtmlCrawler,
    payload: ArticleTaskPayload,
) -> Article:
    if not payload.url:
        logger.warning("Missing article url for HTML source %s", payload.source_id)
        raise ValueError("Missing article url")

    crawler._current_article_url = payload.url  # type: ignore[attr-defined]
    try:
        soup = crawler.crawl(payload.url)
    except Exception:  # noqa: BLE001
        logger.exception("Failed to crawl article %s", payload.url)
        raise

    return crawler.fetch_one(str(soup), crawler.config.date_range)


def _collect_wordpress_article(
    crawler: WordpressCrawler,
    payload: ArticleTaskPayload,
) -> Article:
    if payload.data is None:
        logger.warning("Missing WordPress payload for source %s", payload.source_id)
        raise ValueError("Missing WordPress payload")

    return crawler.fetch_one(payload.data, crawler.config.date_range)
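End to end, the pipeline was seeded with a single call; this sketch assumes a running Redis and a configured source id:

# Workers then fan out: collect_listing -> collect_article -> forward_for_processing.
job_id = schedule_async_crawl(
    source_id="radiookapi",
    env="development",
    page_range="1:5",
    date_range="2024-10-01:2024-10-08",
)
print(f"seeded listing job {job_id}")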
@@ -1,41 +0,0 @@
"""
Worker bootstrap for RQ queues.

Defaults to the `articles` queue to prioritise article detail processing.
`SimpleWorker` is exposed for environments where forking is unstable (e.g.,
some macOS setups). Use `burst=True` for CI or one-shot consumption.
"""

import logging
from typing import Sequence

from rq import Queue, Worker, SimpleWorker

from .queue import QueueManager, QueueSettings


logger = logging.getLogger(__name__)


def start_worker(
    queue_names: Sequence[str] | None = None,
    *,
    settings: QueueSettings | None = None,
    burst: bool = False,
    simple: bool = False,
) -> None:
    manager = QueueManager(settings=settings)
    if queue_names is None or not list(queue_names):
        queue_names = [manager.settings.article_queue]

    resolved = [manager.queue_name(name) for name in queue_names]
    queues = [Queue(name, connection=manager.connection) for name in resolved]

    worker_cls = SimpleWorker if simple else Worker
    logger.info(
        "Starting RQ %s for queues %s",
        worker_cls.__name__,
        ", ".join(resolved),
    )
    worker = worker_cls(queues, connection=manager.connection)
    worker.work(burst=burst)
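A one-shot consumer for CI, per the docstring above; "articles" assumes the default queue name from QueueSettings:

# Drain the article queue once without forking, then exit.
start_worker(["articles"], burst=True, simple=True)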
@@ -1,39 +0,0 @@
"""
Thin indirection layer around async components (queues, tasks, worker).

We import symbols dynamically to avoid importing optional runtime dependencies
like RQ and Redis at module import time. This keeps regular (sync) crawling
usable even if async deps aren't installed, and avoids circular imports when
RQ workers import task callables by string path.
"""

from importlib import import_module

_async_queue = import_module("basango.services.crawler.async.queue")
_async_tasks = import_module("basango.services.crawler.async.tasks")
_async_worker = import_module("basango.services.crawler.async.worker")
_async_schemas = import_module("basango.services.crawler.async.schemas")

QueueManager = getattr(_async_queue, "QueueManager")
QueueSettings = getattr(_async_queue, "QueueSettings")
ListingTaskPayload = getattr(_async_schemas, "ListingTaskPayload")
ArticleTaskPayload = getattr(_async_schemas, "ArticleTaskPayload")
ProcessedTaskPayload = getattr(_async_schemas, "ProcessedTaskPayload")
schedule_async_crawl = getattr(_async_tasks, "schedule_async_crawl")
collect_listing = getattr(_async_tasks, "collect_listing")
collect_article = getattr(_async_tasks, "collect_article")
forward_for_processing = getattr(_async_tasks, "forward_for_processing")
start_worker = getattr(_async_worker, "start_worker")

__all__ = [
    "QueueManager",
    "QueueSettings",
    "ListingTaskPayload",
    "ArticleTaskPayload",
    "ProcessedTaskPayload",
    "schedule_async_crawl",
    "collect_listing",
    "collect_article",
    "forward_for_processing",
    "start_worker",
]
@@ -1,161 +0,0 @@
import logging
from abc import ABC, abstractmethod
from dataclasses import asdict, is_dataclass
from datetime import datetime
from typing import Optional, Any, Dict, List, Sequence

from basango.domain.article import Article
from bs4 import BeautifulSoup
from pydantic import HttpUrl

from basango.core.config import CrawlerConfig, ClientConfig
from basango.domain import DateRange, SourceKind, PageRange
from basango.domain.exception import ArticleOutOfRange
from basango.services import (
    HttpClient,
    DateParser,
    OpenGraphProvider,
    BasePersistor,
    Tokenizer,
)


class BaseCrawler(ABC):
    """
    Base building blocks shared by concrete crawlers.

    Notable conventions
    - `skip`: raises `ArticleOutOfRange` when an item falls outside the desired
      date range. Callers catch it to stop pagination early.
    - `save_article`: normalises metadata (including dataclasses) before
      handing off to persistors.
    """

    def __init__(
        self,
        crawler_config: CrawlerConfig,
        client_config: ClientConfig,
        persistors: Sequence[BasePersistor] | None = None,
    ) -> None:
        self.config = crawler_config
        self.source = crawler_config.source
        self.client = HttpClient(client_config=client_config)
        self.persistors: list[BasePersistor] = list(persistors) if persistors else []
        self.date_parser = DateParser()
        self.open_graph = OpenGraphProvider()
        self.tokenizer = Tokenizer()

    @abstractmethod
    def fetch(self) -> None:
        pass

    def crawl(self, url: str, page: Optional[int] = None) -> BeautifulSoup:
        if page is not None:
            logging.info(f"> Page {page}")

        response = self.client.get(url).text
        return BeautifulSoup(response, "html.parser")

    def save_article(
        self,
        *,
        title: str,
        link: str,
        body: str,
        categories: List[str],
        timestamp: int,
        metadata: Any = None,
    ) -> Article:
        if metadata is None:
            metadata_value = None
        elif is_dataclass(metadata) and not isinstance(metadata, type):
            metadata_value = asdict(metadata)
        elif isinstance(metadata, dict):
            metadata_value = metadata
        else:
            metadata_value = None

        # Get source_id and ensure it's a string
        source_id = getattr(self.source, "source_id", None)
        if source_id is None:
            source_id = "unknown"

        article = Article(
            title=title,
            link=HttpUrl(link),  # Convert str to HttpUrl
            body=body,
            categories=categories,
            source=source_id,  # Ensure it's a string, not None
            timestamp=datetime.fromtimestamp(
                timestamp
            ),  # Convert int timestamp to datetime
            metadata=metadata_value,
        )
        article.token_statistics = self.tokenizer.count_tokens(
            article.title, article.body, article.categories
        )

        self._persist(article.to_dict())
        logging.info("> %s [saved]", article.title)

        return article

    @abstractmethod
    def fetch_one(
        self, html: str, date_range: Optional[DateRange] = None
    ) -> Article | None:
        pass

    @abstractmethod
    def get_pagination(self) -> PageRange:
        pass

    def get_last_page(self) -> int:
        return 1

    @staticmethod
    @abstractmethod
    def supports() -> SourceKind:
        pass

    @classmethod
    def initialize(cls) -> None:
        logging.info("Initializing Crawler")

    def completed(self, notify: bool = False) -> None:
        logging.info("Crawling completed")
        if notify:
            logging.info("Sending notification about completion")
            # TODO: Implement notification logic here
        self._shutdown_persistors()

    @classmethod
    def skip(cls, date_range: DateRange, timestamp: str, title: str, date: str) -> None:
        if date_range.out_range(int(timestamp)):
            # Use an exception to unwind to the crawl loop and stop as soon as
            # we detect items beyond the configured range.
            raise ArticleOutOfRange.create(timestamp, date_range)

        logging.warning(f"> {title} [Skipped {date}]")

    def _persist(self, article: Dict[str, Any]) -> None:
        for persistor in self.persistors:
            try:
                persistor.persist(article)
            except Exception as exc:  # noqa: BLE001
                logging.exception(
                    "Failed to persist article via %s: %s",
                    persistor.__class__.__name__,
                    exc,
                )

    def _shutdown_persistors(self) -> None:
        for persistor in self.persistors:
            try:
                persistor.close()
            except Exception as exc:  # noqa: BLE001
                logging.exception(
                    "Failed to close persistor %s: %s",
                    persistor.__class__.__name__,
                    exc,
                )
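The abstract surface is small; a do-nothing subclass makes the contract explicit (the class name and return values are illustrative):

class NullCrawler(BaseCrawler):
    def fetch(self) -> None:
        # A real crawler walks listing pages and calls fetch_one per article.
        self.completed(self.config.notify)

    def fetch_one(
        self, html: str, date_range: Optional[DateRange] = None
    ) -> Article | None:
        return None

    def get_pagination(self) -> PageRange:
        return PageRange.create("1:1")

    @staticmethod
    def supports() -> SourceKind:
        return SourceKind.HTML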
@@ -1,322 +0,0 @@
import logging
import re
from datetime import datetime, timezone
from typing import Optional, cast, override, Sequence
from urllib.parse import parse_qs, urljoin, urlparse

from basango.domain.article import Article
from bs4 import BeautifulSoup, Tag
from markdownify import markdownify

from basango.core.config import CrawlerConfig, ClientConfig
from basango.core.config.source_config import HtmlSourceConfig
from basango.domain import DateRange, PageRange, SourceKind
from basango.domain.exception import ArticleOutOfRange
from basango.services.crawler.base_crawler import BaseCrawler
from basango.services import BasePersistor


class HtmlCrawler(BaseCrawler):
    """
    Generic HTML crawler driven by CSS selectors.

    Strategy
    - Listing pages are iterated to extract per-article links or blocks.
    - When `requires_details` is set, a second request fetches the article page
      to extract full content; otherwise the article block is parsed inline.
    - Pagination is inferred from a template and last-page discovery heuristics
      (regex or query string `page` fallback).
    """

    def __init__(
        self,
        crawler_config: CrawlerConfig,
        client_config: ClientConfig,
        persistors: Sequence[BasePersistor] | None = None,
    ) -> None:
        super().__init__(crawler_config, client_config, persistors=persistors)
        if not self.source or self.source.source_kind != SourceKind.HTML:
            raise ValueError("HtmlCrawler requires a source of kind HTML")

        self.source = cast(HtmlSourceConfig, self.source)
        self._current_article_url: Optional[str] = None

    @override
    def fetch(self) -> None:
        self.initialize()
        page_range = self.config.page_range or self.get_pagination()
        date_range = self.config.date_range

        article_selector = self.source.source_selectors.articles
        if not article_selector:
            logging.error(
                "No article selector configured for HTML source %s",
                self.source.source_id,
            )
            return

        stop = False
        for page_number in range(page_range.start, page_range.end + 1):
            page_url = self._build_page_url(page_number)
            try:
                soup = self.crawl(page_url, page_number)
            except Exception as exc:  # noqa: BLE001
                logging.error(
                    "> page %s => %s [failed]",
                    page_number,
                    exc,
                )
                continue

            articles = soup.select(article_selector)
            if not articles:
                logging.info("No articles found on page %s", page_number)
                continue

            for article in articles:
                try:
                    self._current_article_url = self._extract_link(article)
                    target_html = str(article)

                    if self.source.requires_details:
                        if not self._current_article_url:
                            logging.debug(
                                "Skipping article without link for details on page %s",
                                page_number,
                            )
                            continue
                        try:
                            detail_soup = self.crawl(self._current_article_url)
                            target_html = str(detail_soup)
                        except Exception as detail_exc:  # noqa: BLE001
                            logging.error(
                                "Failed to fetch detail page %s: %s",
                                self._current_article_url,
                                detail_exc,
                            )
                            continue

                    self.fetch_one(target_html, date_range)
                except ArticleOutOfRange:
                    # Using an exception to short-circuit nested loops keeps the
                    # happy path tidy (no extra flags at each extraction site).
                    logging.info("No more articles to fetch in this range.")
                    stop = True
                    break
                except Exception as exc:  # noqa: BLE001
                    logging.error(
                        "Failed to process article on %s: %s",
                        page_url,
                        exc,
                    )
                finally:
                    self._current_article_url = None

            if stop:
                break

        self.completed(self.config.notify)

    @override
    def fetch_one(self, html: str, date_range: Optional[DateRange] = None) -> Article:
        soup = BeautifulSoup(html, "html.parser")
        selectors = self.source.source_selectors

        title = self._extract_text(soup, selectors.article_title) or "Untitled"
        link = self._current_article_url or self._extract_link(soup)
        if not link:
            logging.warning("Skipping article '%s' without link", title)
            raise ValueError("Missing article link")

        body = self._extract_body(soup, selectors.article_body)
        categories = self._extract_categories(soup, selectors.article_categories)
        if not categories and self.config.category:
            categories = [self.config.category]

        raw_date = self._extract_text(soup, selectors.article_date)
        timestamp = self._compute_timestamp(raw_date)

        if date_range and not date_range.in_range(timestamp):
            self.skip(date_range, str(timestamp), title, raw_date or "")

        metadata = self.open_graph.consume_html(html)

        return self.save_article(
            title=title,
            link=link,
            body=body,
            categories=categories,
            timestamp=timestamp,
            metadata=metadata,
        )

    @override
    def get_pagination(self) -> PageRange:
        return PageRange.create(f"0:{self.get_last_page()}")

    @override
    def get_last_page(self) -> int:
        if not self.source:
            return 1

        if self.source.supports_categories and self.config.category:
            path = self.source.pagination_template.replace(
                "{category}", self.config.category
            )
        else:
            path = self.source.pagination_template

        links = self.crawl(f"{self.source.source_url}{path}").select(
            self.source.source_selectors.pagination
        )
        if not links:
            return 1

        href = links[-1].get("href")
        if not href or not isinstance(href, str):
            return 1

        # Heuristic: last pagination link either contains the page number
        # directly or as a `page` query param. Prefer regex first to support
        # path-style pagination (e.g., /page/4/).
        match = re.search(r"(\d+)", href)
        if match:
            return int(match.group(1))

        queries = parse_qs(urlparse(href).query)
        if "page" in queries and queries["page"]:
            try:
                return int(queries["page"][0])
            except ValueError:
                return 1
        return 1

    @staticmethod
    @override
    def supports() -> SourceKind:
        return SourceKind.HTML

    def _build_page_url(self, page: int) -> str:
        template = self._apply_category(self.source.pagination_template)
        if "{page}" in template:
            template = template.format(page=page)
        elif page > 0:
            separator = "&" if "?" in template else "?"
            template = f"{template}{separator}page={page}"

        base = str(self.source.source_url)
        if not base.endswith("/"):
            base = f"{base}/"
        return urljoin(base, template.lstrip("/"))

    def _apply_category(self, template: str) -> str:
        if "{category}" in template:
            replacement = self.config.category or ""
            return template.replace("{category}", replacement)
        return template

    def _extract_link(self, node: BeautifulSoup | Tag) -> Optional[str]:
        selector = self.source.source_selectors.article_link
        if not selector:
            return None

        target = node.select_one(selector)
        if not target:
            return None

        # Support a few common attributes for link-like elements (href,
        # data-href, src) to tolerate variations in markup without custom code.
        raw_href = target.get("href") or target.get("data-href") or target.get("src")
        href: Optional[str]
        if isinstance(raw_href, str):
            href = raw_href.strip() or None
        elif isinstance(raw_href, list):
            href = next(
                (
                    item.strip()
                    for item in raw_href
                    if isinstance(item, str) and item.strip()
                ),
                None,
            )
        else:
            href = None
        if not href:
            return None
        return self._to_absolute_url(href)

    def _to_absolute_url(self, href: str) -> str:
        base = str(self.source.source_url)
        if not base.endswith("/"):
            base = f"{base}/"
        return urljoin(base, href)

    @staticmethod
    def _extract_text(
        node: BeautifulSoup | Tag, selector: Optional[str]
    ) -> Optional[str]:
        if not selector:
            return None
        target = node.select_one(selector)
        if not target:
            return None

        if target.name == "img":
            for attr in ("alt", "title"):
                value = target.get(attr)
                if isinstance(value, str):
                    stripped = value.strip()
                    if stripped:
                        return stripped
                elif isinstance(value, list):
                    for item in value:
                        if isinstance(item, str):
                            stripped = item.strip()
                            if stripped:
                                return stripped

        text = target.get_text(" ", strip=True)
        return text or None

    @staticmethod
    def _extract_body(node: BeautifulSoup | Tag, selector: Optional[str]) -> str:
        if selector:
            matches = node.select(selector)
            if matches:
                parts = [
                    markdownify(item.get_text(" ", strip=False), heading_style="ATX")
                    for item in matches
                    if item.get_text(strip=True)
                ]
                if parts:
                    # Join the matched blocks with newlines; callers can
                    # post-process if needed, but this preserves maximum fidelity.
                    return "\n".join(parts)
        return markdownify(node.get_text(" ", strip=False), heading_style="ATX")

    @staticmethod
    def _extract_categories(
        node: BeautifulSoup | Tag, selector: Optional[str]
    ) -> list[str]:
        if not selector:
            return []

        values: list[str] = []
        for item in node.select(selector):
            text = item.get_text(" ", strip=True)
            if text:
                lower = text.lower()
                if lower not in values:
                    values.append(lower)
        return values

    def _compute_timestamp(self, raw_date: Optional[str]) -> int:
        if not raw_date:
            return int(datetime.now(timezone.utc).timestamp())

        return self.date_parser.create_timestamp(
            raw_date.strip(),
            fmt=self.source.source_date.format,
            pattern=self.source.source_date.pattern,
            replacement=self.source.source_date.replacement,
        )
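The _build_page_url rules in isolation (base URL and templates are hypothetical):

from urllib.parse import urljoin

base = "https://example.com/"
# An explicit {page} placeholder is substituted directly...
print(urljoin(base, "actualite/{page}".format(page=3)))  # https://example.com/actualite/3
# ...otherwise ?page=N is appended for pages > 0, and page 0 stays bare.
print(urljoin(base, "actualite") + "?page=3")  # https://example.com/actualite?page=3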
@@ -1,187 +0,0 @@
import json
import logging
from datetime import datetime, timezone
from typing import Optional, override, cast, Final, Any, Sequence

from markdownify import markdownify

from basango.domain.article import Article
from bs4 import BeautifulSoup

from basango.core.config import WordPressSourceConfig, CrawlerConfig, ClientConfig
from basango.domain import SourceKind, PageRange, DateRange
from basango.domain.exception import ArticleOutOfRange
from basango.services.crawler.base_crawler import BaseCrawler
from basango.services import BasePersistor


class WordpressCrawler(BaseCrawler):
    """
    WordPress REST API crawler.

    It uses the `/wp-json/wp/v2/posts` endpoints and limits fields to reduce
    payload size. Pagination is driven by WordPress headers `x-wp-totalpages`
    and `x-wp-total`. Category IDs are mapped to slugs via a secondary endpoint
    and cached per run.
    """

    def __init__(
        self,
        crawler_config: CrawlerConfig,
        client_config: ClientConfig,
        persistors: Sequence[BasePersistor] | None = None,
    ) -> None:
        super().__init__(crawler_config, client_config, persistors=persistors)
        if not self.source or self.source.source_kind != SourceKind.WORDPRESS:
            raise ValueError("WordpressCrawler requires a source of kind WORDPRESS")

        self.source = cast(WordPressSourceConfig, self.source)
        self.category_map: dict[int, str] = {}

    POST_QUERY: Final = "_fields=date,slug,link,title.rendered,content.rendered,categories&orderby=date&order=desc"
    CATEGORY_QUERY: Final = (
        "_fields=id,slug,count&orderby=count&order=desc&per_page=100"
    )
    TOTAL_PAGES_HEADER: Final = "x-wp-totalpages"
    TOTAL_POSTS_HEADER: Final = "x-wp-total"

    @override
    def fetch(self) -> None:
        self.initialize()
        page_range = self.config.page_range or self.get_pagination()
        date_range = self.config.date_range

        stop = False
        for page_number in range(page_range.start, page_range.end + 1):
            endpoint = self._posts_endpoint(page_number)
            try:
                response = self.client.get(endpoint)
                payload = response.text
                articles = json.loads(payload)
            except Exception as exc:  # noqa: BLE001
                logging.error(
                    "> page %s => %s [failed]",
                    page_number,
                    exc,
                )
                continue

            for article in articles:
                try:
                    self.fetch_one(article, date_range)
                except ArticleOutOfRange:
                    # Same early-exit semantic as HtmlCrawler
                    logging.info("No more articles to fetch in this range.")
                    stop = True
                    break
                except Exception as exc:  # noqa: BLE001
                    logging.error(
                        "Failed to process WordPress article on page %s: %s",
                        page_number,
                        exc,
                    )
            if stop:
                break

        self.completed(self.config.notify)

    @override
    def fetch_one(self, html: Any, date_range: Optional[DateRange] = None) -> Article:
        try:
            data = json.loads(html) if isinstance(html, str) else html
        except json.JSONDecodeError:
            logging.error("Failed to decode WordPress payload")
            raise

        if not isinstance(data, dict):
            logging.error("Skipping unexpected WordPress payload: %s", type(data))
            raise ValueError("Unexpected WordPress payload type")

        link = data.get("link")
        if not link:
            logging.error("Skipping WordPress article without link")
            raise ValueError("WordPress article without link")

        title_html = data.get("title", {}).get("rendered", "")
        body_html = data.get("content", {}).get("rendered", "")

        title = BeautifulSoup(title_html, "html.parser").get_text(" ", strip=True)
        body = markdownify(
            BeautifulSoup(body_html, "html.parser").get_text(" ", strip=False),
            heading_style="ATX",
        )
        timestamp = self._compute_timestamp(data.get("date"))

        categories_value = self._map_categories(data.get("categories", []))
        categories = [item for item in categories_value.split(",") if item]

        if date_range and not date_range.in_range(timestamp):
            self.skip(date_range, str(timestamp), title, data.get("date", ""))

        metadata = self.open_graph.consume_url(link)

        return self.save_article(
            title=title or data.get("slug", "Untitled"),
            link=link,
            body=body,
            categories=categories,
            timestamp=timestamp,
            metadata=metadata,
        )

    @override
    def get_pagination(self) -> PageRange:
        response = self.client.get(
            f"{self.source.source_url}wp-json/wp/v2/posts?_fields=id&per_page=100"
        )
        pages = int(response.headers.get(self.TOTAL_PAGES_HEADER, "1"))
        posts = int(response.headers.get(self.TOTAL_POSTS_HEADER, "0"))

        logging.info("WordPress Pagination %s posts in %s pages", posts, pages)
        return PageRange.create(f"1:{pages}")

    def _fetch_categories(self) -> None:
        response = self.client.get(
            f"{self.source.source_url}wp-json/wp/v2/categories?{self.CATEGORY_QUERY}"
        )
        for category in response.json():
            self.category_map[int(category["id"])] = category["slug"]

    def _map_categories(self, categories: list[int]) -> str:
        if not self.category_map:
            self._fetch_categories()
        return ",".join(
            self.category_map[category]
            for category in sorted(categories)
            if category in self.category_map
        )

    def _posts_endpoint(self, page: int) -> str:
        base = str(self.source.source_url)
        if not base.endswith("/"):
            base = f"{base}/"
        return f"{base}wp-json/wp/v2/posts?{self.POST_QUERY}&page={page}&per_page=100"

    @staticmethod
    def _compute_timestamp(raw: Optional[str]) -> int:
        if not raw:
            return int(datetime.now(timezone.utc).timestamp())

        cleaned = raw.replace("Z", "+00:00")
        try:
            dt = datetime.fromisoformat(cleaned)
        except ValueError:
            return int(datetime.now(timezone.utc).timestamp())

        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return int(dt.timestamp())

    @override
    def get_last_page(self) -> int:
        return 1

    @staticmethod
    @override
    def supports() -> SourceKind:
        return SourceKind.WORDPRESS
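The endpoint produced by _posts_endpoint for page 2 of a hypothetical site; _fields trims the response to exactly what fetch_one reads:

base = "https://example.com/"
query = "_fields=date,slug,link,title.rendered,content.rendered,categories&orderby=date&order=desc"
print(f"{base}wp-json/wp/v2/posts?{query}&page=2&per_page=100")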
@@ -1,82 +0,0 @@
import logging
import re
from datetime import datetime, timezone
from typing import Optional


class DateParser:
    MONTHS = {
        "janvier": "01",
        "février": "02",
        "mars": "03",
        "avril": "04",
        "mai": "05",
        "juin": "06",
        "juillet": "07",
        "août": "08",
        "septembre": "09",
        "octobre": "10",
        "novembre": "11",
        "décembre": "12",
    }

    DAYS = {
        "dimanche": "0",
        "lundi": "1",
        "mardi": "2",
        "mercredi": "3",
        "jeudi": "4",
        "vendredi": "5",
        "samedi": "6",
    }

    DEFAULT_DATE_FORMAT = "%Y-%m-%d %H:%M"

    @classmethod
    def _apply_substitution(
        cls, date: str, pattern: Optional[str], replacement: Optional[str]
    ) -> str:
        if not pattern or replacement is None:
            return date

        # Accept PHP-like patterns with leading/trailing slashes
        if len(pattern) >= 2 and pattern[0] == "/" and pattern.rfind("/") > 0:
            pattern = pattern[1 : pattern.rfind("/")]

        # Convert $1 to \1 for Python
        replacement = re.sub(r"\$(\d+)", r"\\\1", replacement)
        try:
            return re.sub(pattern, replacement, date)
        except re.error:
            logging.error(f"[DateParser] Could not convert {pattern} to {replacement}")
            return date

    def create_timestamp(
        self,
        date: str,
        fmt: Optional[str] = None,
        pattern: Optional[str] = None,
        replacement: Optional[str] = None,
    ) -> int:
        # Normalize and translate French day/month words
        date = date.lower()
        for k, v in self.DAYS.items():
            date = date.replace(k, v)
        for k, v in self.MONTHS.items():
            date = date.replace(k, v)

        # Optional regex transform
        date = self._apply_substitution(date, pattern, replacement)
        fmt = fmt or self.DEFAULT_DATE_FORMAT

        try:
            dt = datetime.strptime(date, fmt).replace(tzinfo=timezone.utc)
            return int(dt.timestamp())
        except Exception as e:
            logging.error(
                f"[DateParser] Could not parse date '{date}' with format '{fmt}': {e}"
            )
            dt = datetime.now(timezone.utc).replace(
                hour=0, minute=0, second=0, microsecond=0
            )
            return int(dt.timestamp())
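A usage sketch: French month names are rewritten to numbers before strptime, so a format with numeric placeholders parses them (the input string is hypothetical):

parser = DateParser()
ts = parser.create_timestamp("12 août 2024 10:30", fmt="%d %m %Y %H:%M")
# "12 août 2024 10:30" -> "12 08 2024 10:30", parsed as UTC.
print(ts)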
@@ -1,9 +0,0 @@
from .base_http_client import BaseHttpClient
from .sync_http_client import SyncHttpClient
from .async_http_client import AsyncHttpClient

__all__ = [
    "BaseHttpClient",
    "SyncHttpClient",
    "AsyncHttpClient",
]
@@ -1,121 +0,0 @@
import asyncio
from dataclasses import dataclass, field

import httpx

from .base_http_client import (
    BaseHttpClient,
    HttpData,
    HttpHeaders,
    HttpParams,
    TRANSIENT_STATUSES,
)


@dataclass
class AsyncHttpClient(BaseHttpClient):
    _client: httpx.AsyncClient = field(init=False, repr=False)

    def __post_init__(self) -> None:
        super().__post_init__()
        self._client = httpx.AsyncClient(
            follow_redirects=self.client_config.follow_redirects,
            max_redirects=5,
            verify=self.client_config.verify_ssl,
            timeout=self.client_config.timeout,
            headers=dict(self._headers),
        )

    async def __aenter__(self) -> "AsyncHttpClient":
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        await self.aclose()

    def close(self) -> None:
        if self._client.is_closed:
            return
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:  # no running loop
            asyncio.run(self.aclose())
        else:
            loop.create_task(self.aclose())

    async def aclose(self) -> None:
        try:
            await self._client.aclose()
        except Exception:  # noqa: BLE001
            pass

    async def _request(
        self,
        method: str,
        url: str,
        *,
        headers: HttpHeaders = None,
        params: HttpParams = None,
        data: HttpData = None,
        json: HttpData = None,
    ) -> httpx.Response:
        attempt = 0
        while True:
            try:
                response = await self._client.request(
                    method,
                    url,
                    headers=self._build_headers(headers),
                    params=params,
                    data=data,
                    json=json,
                )
                if (
                    response.status_code in TRANSIENT_STATUSES
                ) and attempt < self.client_config.max_retries:
                    await asyncio.sleep(self._retry_delay(attempt, response))
                    attempt += 1
                    continue
                response.raise_for_status()
                return response
            except httpx.HTTPStatusError as exc:
                status = exc.response.status_code if exc.response else 0
                if (
                    status in TRANSIENT_STATUSES
                ) and attempt < self.client_config.max_retries:
                    await asyncio.sleep(self._retry_delay(attempt, exc.response))
                    attempt += 1
                    continue
                raise
            except httpx.RequestError:
                if attempt < self.client_config.max_retries:
                    await asyncio.sleep(self._compute_backoff(attempt))
                    attempt += 1
                    continue
                raise

    async def get(
        self,
        url: str,
        *,
        headers: HttpHeaders = None,
        params: HttpParams = None,
    ) -> httpx.Response:
        return await self._request("GET", url, headers=headers, params=params)

    async def post(
        self,
        url: str,
        *,
        headers: HttpHeaders = None,
        params: HttpParams = None,
        data: HttpData = None,
        json: HttpData = None,
    ) -> httpx.Response:
        return await self._request(
            "POST",
            url,
            headers=headers,
            params=params,
            data=data,
            json=json,
        )
@@ -1,87 +0,0 @@
import random
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from typing import Any, Optional, TypeAlias

import httpx

from basango.core.config import ClientConfig
from basango.services.user_agents import UserAgents

HttpHeaders: TypeAlias = dict[str, str] | None
HttpParams: TypeAlias = dict[str, Any] | None
HttpData: TypeAlias = Any | None

TRANSIENT_STATUSES = (429, 500, 502, 503, 504)


@dataclass
class BaseHttpClient(ABC):
    client_config: ClientConfig
    user_agent_provider: UserAgents | None = None
    default_headers: HttpHeaders = None
    _user_agent: str = field(init=False, repr=False)
    _headers: dict[str, str] = field(init=False, repr=False)

    def __post_init__(self) -> None:
        provider = self.user_agent_provider or UserAgents(
            rotate=self.client_config.rotate,
            fallback=self.client_config.user_agent,
        )
        user_agent = provider.get()
        self._user_agent = user_agent if user_agent else self.client_config.user_agent

        headers = {"User-Agent": self._user_agent}
        if self.default_headers:
            headers.update(self.default_headers)
        self._headers = headers

    def _compute_backoff(self, attempt: int) -> float:
        base = min(
            self.client_config.backoff_initial
            * (self.client_config.backoff_multiplier**attempt),
            self.client_config.backoff_max,
        )
        jitter = random.uniform(0, base * 0.25)
        return base + jitter

    def _retry_delay(
        self, attempt: int, response: Optional[httpx.Response] = None
    ) -> float:
        delay = 0.0
        if response is not None and self.client_config.respect_retry_after:
            retry_after = (
                response.headers.get("Retry-After") if response.headers else None
            )
            if retry_after:
                delay = self._parse_retry_after(retry_after)

        if delay == 0.0:
            delay = self._compute_backoff(attempt)
        return delay

    @staticmethod
    def _parse_retry_after(header_value: str) -> float:
        try:
            return max(0.0, float(int(header_value)))
        except (TypeError, ValueError):
            try:
                dt = parsedate_to_datetime(header_value)
                if dt.tzinfo is None:
                    dt = dt.replace(tzinfo=timezone.utc)
                now = datetime.now(timezone.utc)
                return max(0.0, (dt - now).total_seconds())
            except Exception:  # noqa: BLE001
                return 0.0

    def _build_headers(self, headers: HttpHeaders = None) -> dict[str, str]:
        merged = dict(self._headers)
        if headers:
            merged.update(headers)
        return merged

    @abstractmethod
    def close(self) -> None:  # pragma: no cover - enforced by subclasses
        """Close the underlying HTTPX client."""
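A standalone rendering of the backoff formula with hypothetical config values (initial 0.5s, multiplier 2, cap 8s):

import random

def backoff(attempt: int, initial: float = 0.5, multiplier: float = 2.0, cap: float = 8.0) -> float:
    # Mirrors _compute_backoff: capped exponential base plus up to 25% jitter.
    base = min(initial * (multiplier**attempt), cap)
    return base + random.uniform(0, base * 0.25)

print([round(backoff(n), 2) for n in range(5)])  # e.g. [0.61, 1.13, 2.42, 4.37, 9.3]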
@@ -1,107 +0,0 @@
import time
from dataclasses import dataclass, field

import httpx

from .base_http_client import (
    BaseHttpClient,
    HttpData,
    HttpHeaders,
    HttpParams,
    TRANSIENT_STATUSES,
)


@dataclass
class SyncHttpClient(BaseHttpClient):
    _client: httpx.Client = field(init=False, repr=False)

    def __post_init__(self) -> None:
        super().__post_init__()
        self._client = httpx.Client(
            follow_redirects=self.client_config.follow_redirects,
            max_redirects=5,
            verify=self.client_config.verify_ssl,
            timeout=self.client_config.timeout,
            headers=dict(self._headers),
        )

    def __enter__(self) -> "SyncHttpClient":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def close(self) -> None:
        try:
            self._client.close()
        except Exception:  # noqa: BLE001
            pass

    def _request(
        self,
        method: str,
        url: str,
        *,
        headers: HttpHeaders = None,
        params: HttpParams = None,
        data: HttpData = None,
        json: HttpData = None,
    ) -> httpx.Response:
        attempt = 0
        while True:
            try:
                response = self._client.request(
                    method,
                    url,
                    headers=self._build_headers(headers),
                    params=params,
                    data=data,
                    json=json,
                )
                if (
                    response.status_code in TRANSIENT_STATUSES
                ) and attempt < self.client_config.max_retries:
                    time.sleep(self._retry_delay(attempt, response))
                    attempt += 1
                    continue
                response.raise_for_status()
                return response
            except httpx.HTTPStatusError as exc:
                status = exc.response.status_code if exc.response else 0
                if (
                    status in TRANSIENT_STATUSES
                ) and attempt < self.client_config.max_retries:
                    time.sleep(self._retry_delay(attempt, exc.response))
                    attempt += 1
                    continue
                raise
            except httpx.RequestError:
                if attempt < self.client_config.max_retries:
                    time.sleep(self._compute_backoff(attempt))
                    attempt += 1
                    continue
                raise

    def get(
        self, url: str, *, headers: HttpHeaders = None, params: HttpParams = None
    ) -> httpx.Response:
        return self._request("GET", url, headers=headers, params=params)

    def post(
        self,
        url: str,
        *,
        headers: HttpHeaders = None,
        params: HttpParams = None,
        data: HttpData = None,
        json: HttpData = None,
    ) -> httpx.Response:
        return self._request(
            "POST",
            url,
            headers=headers,
            params=params,
            data=data,
            json=json,
        )
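A usage sketch; the ClientConfig defaults are assumed adequate for a one-off request:

from basango.core.config import ClientConfig

with SyncHttpClient(client_config=ClientConfig()) as client:
    response = client.get("https://example.com")
    print(response.status_code)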
@@ -1,55 +0,0 @@
import logging
from dataclasses import dataclass
from typing import Optional

import trafilatura

from basango.core.config import ClientConfig
from basango.services.http_client import SyncHttpClient
from basango.services.user_agents import UserAgents


@dataclass
class OpenGraphObject:
    title: Optional[str] = None
    description: Optional[str] = None
    image: Optional[str] = None
    url: Optional[str] = None


class OpenGraphProvider:
    def __init__(
        self, user_agent_provider: UserAgents = UserAgents(rotate=False)
    ) -> None:
        self._user_agent = user_agent_provider.og()
        self._http_client = SyncHttpClient(
            client_config=ClientConfig(),
            default_headers={"User-Agent": self._user_agent},
        )

    def consume_url(self, url: str) -> OpenGraphObject | None:
        try:
            logging.info(f"[OpenGraphProvider] Consuming url: {url}")
            html = self._http_client.get(url).text
            return self.consume_html(html, url)
        except Exception as e:
            logging.exception(f"[OpenGraphProvider] Failed to consume url: {e}")
            return None

    @classmethod
    def consume_html(
        cls, html: str, url: Optional[str] = None
    ) -> OpenGraphObject | None:
        try:
            meta = trafilatura.extract_metadata(html, default_url=url)
            if not meta:
                return None
            return OpenGraphObject(
                title=meta.title or None,
                description=meta.description or None,
                image=meta.image or None,
                url=url,
            )
        except Exception as e:
            logging.error(f"[OpenGraphProvider] Failed to extract metadata: {e}")
            return None
@@ -1,9 +0,0 @@
from .base_persistor import BasePersistor
from .csv_persistor import CsvPersistor
from .json_persistor import JsonPersistor

__all__ = [
    "BasePersistor",
    "CsvPersistor",
    "JsonPersistor",
]
@@ -1,14 +0,0 @@
from abc import ABC, abstractmethod
from typing import Mapping, Any


class BasePersistor(ABC):
    """Abstract interface for article persistence backends."""

    @abstractmethod
    def persist(self, article: Mapping[str, Any]) -> None:
        """Persist a single article payload."""

    def close(self) -> None:  # pragma: no cover - optional override
        """Hook for subclasses that need explicit shutdown."""
        return None
@@ -1,79 +0,0 @@
import csv
import json
from dataclasses import dataclass, field
from pathlib import Path
from threading import Lock
from typing import Any, Mapping, Sequence

from .base_persistor import BasePersistor


DEFAULT_FIELDS = (
    "title",
    "link",
    "body",
    "categories",
    "source",
    "timestamp",
    "metadata",
)


@dataclass
class CsvPersistor(BasePersistor):
    data_dir: Path
    source_id: str
    fieldnames: Sequence[str] = DEFAULT_FIELDS
    encoding: str = "utf-8"
    _file_path: Path = field(init=False, repr=False)
    _lock: Lock = field(default_factory=Lock, init=False, repr=False)
    _header_written: bool = field(default=False, init=False, repr=False)

    def __post_init__(self) -> None:
        # Pre-create output directory and detect existing header to avoid
        # re-writing it across process restarts.
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self._file_path = self.data_dir / f"{self.source_id}.csv"
        if self._file_path.exists() and self._file_path.stat().st_size > 0:
            self._header_written = True

    def persist(self, article: Mapping[str, Any]) -> None:
        record = self._serialise(article)
        # File writes are guarded by a process-local lock to tolerate threads
        # sharing the same persistor instance.
        with self._lock:
            needs_header = not self._header_written or not self._file_path.exists()
            with self._file_path.open(
                "a", newline="", encoding=self.encoding
            ) as handle:
                writer = csv.DictWriter(
                    handle,
                    fieldnames=self.fieldnames,
                    quoting=csv.QUOTE_ALL,
                    lineterminator="\n",
                )
                if needs_header:
                    writer.writeheader()
                    self._header_written = True
                writer.writerow(record)

    def _serialise(self, article: Mapping[str, Any]) -> dict[str, Any]:
        categories = article.get("categories")
        if isinstance(categories, (list, tuple)):
            serialised_categories = ";".join(str(item) for item in categories)
        else:
            serialised_categories = categories

        metadata = article.get("metadata")
        if metadata is None or isinstance(metadata, str):
            serialised_metadata = metadata
        else:
            # JSON-encode metadata to a compact, CSV-safe string; csv will quote it.
            serialised_metadata = json.dumps(
                metadata, ensure_ascii=True, separators=(",", ":"), sort_keys=True
            )

        # "name" avoids shadowing the dataclasses `field` imported above.
        record = {name: article.get(name) for name in self.fieldnames}
        record["categories"] = serialised_categories
        record["metadata"] = serialised_metadata
        return record
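An append sketch; the path and payload values are illustrative:

from pathlib import Path

persistor = CsvPersistor(data_dir=Path("var/data"), source_id="radiookapi")
persistor.persist(
    {
        "title": "Example",
        "link": "https://example.com/a",
        "body": "...",
        "categories": ["politique", "kinshasa"],  # stored as "politique;kinshasa"
        "source": "radiookapi",
        "timestamp": "2024-10-01T00:00:00",
        "metadata": {"image": None},  # JSON-encoded, then CSV-quoted
    }
)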
@@ -1,28 +0,0 @@
import json
from dataclasses import dataclass, field
from pathlib import Path
from threading import Lock
from typing import Any, Mapping

from .base_persistor import BasePersistor


@dataclass
class JsonPersistor(BasePersistor):
    data_dir: Path
    source_id: str
    suffix: str = ".jsonl"
    encoding: str = "utf-8"
    _file_path: Path = field(init=False, repr=False)
    _lock: Lock = field(default_factory=Lock, init=False, repr=False)

    def __post_init__(self) -> None:
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self._file_path = self.data_dir / f"{self.source_id}{self.suffix}"

    def persist(self, article: Mapping[str, Any]) -> None:
        payload = json.dumps(article, ensure_ascii=False)
        with self._lock:
            with self._file_path.open("a", encoding=self.encoding) as handle:
                handle.write(payload)
                handle.write("\n")
@@ -1,56 +0,0 @@
"""
Tokenizer utilities for counting and encoding article text.

This module wraps the `tiktoken` encoder to provide simple helpers for:
- encoding/decoding text to token ids
- counting tokens for different parts of an Article

The `Tokenizer` can be constructed with either a specific `model` (preferred)
or an `encoding` name fallback.
"""

import logging

import tiktoken
from typing import Optional

from basango.domain.token_statistics import TokenStatistics


class Tokenizer:
    """Thin wrapper around tiktoken encoder for token operations."""

    def __init__(
        self, encoding: str = "cl100k_base", model: Optional[str] = None
    ) -> None:
        self.encoding = encoding
        # Prefer model-based encoding lookup if a model is provided.
        self.tokenizer = (
            tiktoken.encoding_for_model(model)
            if model
            else tiktoken.get_encoding(encoding)
        )

    def encode(self, text: str) -> list[int]:
        """Encode text into a list of token ids."""
        return self.tokenizer.encode(text)

    def decode(self, tokens: list[int]) -> str:
        """Decode a list of token ids back into a string."""
        return self.tokenizer.decode(tokens)

    def count_tokens(
        self, title: str, body: str, categories: list[str]
    ) -> TokenStatistics:
        """Return token counts for the provided Article.

        The excerpt count is computed on the first 200 characters of the body
        to give a quick estimate of a short preview's token length.
        """
        logging.info(f"[Tokenizer] tokenizing {title}...")
        return TokenStatistics(
            title=len(self.encode(title)),
            body=len(self.encode(body)),
            excerpt=len(self.encode(body[:200])),
            categories=len(self.encode(", ".join(categories))),
        )
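A counting sketch; the default cl100k_base encoding is fetched by tiktoken on first use:

tokenizer = Tokenizer()
stats = tokenizer.count_tokens("Example title", "Example body text", ["politique"])
print(stats)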
@@ -1,28 +0,0 @@
import random
from dataclasses import dataclass


@dataclass
class UserAgents:
    USER_AGENTS = [
        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_4_8; like Mac OS X) AppleWebKit/603.39 (KHTML, like Gecko) Chrome/52.0.3638.271 Mobile Safari/537.5",
        "Mozilla/5.0 (Linux; U; Linux x86_64; en-US) Gecko/20130401 Firefox/52.7",
        "Mozilla/5.0 (Linux; U; Android 5.0; SM-P815 Build/LRX22G) AppleWebKit/600.4 (KHTML, like Gecko) Chrome/48.0.1562.260 Mobile Safari/600.0",
        "Mozilla/5.0 (Windows; U; Windows NT 6.3;) AppleWebKit/533.34 (KHTML, like Gecko) Chrome/51.0.1883.215 Safari/533",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.3; x64; en-US Trident/4.0)",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_10_3) Gecko/20100101 Firefox/63.4",
        "Mozilla/5.0 (Linux; Linux x86_64; en-US) AppleWebKit/603.50 (KHTML, like Gecko) Chrome/55.0.2226.116 Safari/601",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 7_8_3; en-US) Gecko/20100101 Firefox/68.9",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 8_9_8; like Mac OS X) AppleWebKit/603.34 (KHTML, like Gecko) Chrome/47.0.1126.107 Mobile Safari/602.7",
        "Mozilla/5.0 (iPod; CPU iPod OS 8_2_0; like Mac OS X) AppleWebKit/601.40 (KHTML, like Gecko) Chrome/47.0.1590.178 Mobile Safari/535.2",
    ]

    rotate: bool = True
    fallback: str = "Basango/0.1 (+https://github.com/bernard-ng/basango)"

    def get(self) -> str:
        return random.choice(self.USER_AGENTS) if self.rotate else self.fallback

    @classmethod
    def og(cls) -> str:
        return "facebookexternalhit/1.1"
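In use, the dataclass gives a rotating browser-like header for regular crawling, a declared fallback UA when rotation is off, and a fixed Facebook crawler UA for OpenGraph fetches:

```python
agents = UserAgents()                         # rotate=True by default
headers = {"User-Agent": agents.get()}        # random entry from USER_AGENTS

static = UserAgents(rotate=False)
headers = {"User-Agent": static.get()}        # the declared Basango fallback UA

og_headers = {"User-Agent": UserAgents.og()}  # "facebookexternalhit/1.1"
```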
@@ -1,57 +0,0 @@
from datetime import datetime, timezone

import pytest

from basango.domain import DateRange


def ts(y: int, m: int, d: int, hh: int = 0, mm: int = 0, ss: int = 0) -> int:
    return int(datetime(y, m, d, hh, mm, ss, tzinfo=timezone.utc).timestamp())


def test_from_parses_two_dates_with_default_format() -> None:
    dr = DateRange.create("2024-10-01:2024-10-08")
    assert dr.start == ts(2024, 10, 1)
    assert dr.end == ts(2024, 10, 8)


def test_str_and_format_roundtrip() -> None:
    dr = DateRange.create("2024-10-01:2024-10-02")
    assert str(dr) == f"{ts(2024, 10, 1)}:{ts(2024, 10, 2)}"
    assert dr.format("%Y-%m-%d") == "2024-10-01:2024-10-02"


def test_in_range_out_range_inclusive_boundaries() -> None:
    dr = DateRange.create("2024-10-01:2024-10-02")
    start = ts(2024, 10, 1)
    end = ts(2024, 10, 2)
    before = start - 1
    after = end + 1
    midday_end = ts(2024, 10, 2, 12, 0, 0)

    assert dr.in_range(start) is True
    assert dr.in_range(end) is True
    assert dr.out_range(before) is True
    # End is at 00:00 of end day; times later that day are outside
    assert dr.out_range(midday_end) is True
    assert dr.out_range(after) is True


def test_backward_uses_days_and_next_day_end() -> None:
    base = datetime(2024, 10, 31, tzinfo=timezone.utc)
    dr = DateRange.backward(date=base, days=10)
    assert dr.start == ts(2024, 10, 21)
    assert dr.end == ts(2024, 11, 1)


def test_from_raises_on_invalid_separator_or_spec() -> None:
    with pytest.raises(AssertionError):
        DateRange.create("2024-10-01:2024-10-08", separator="")
    with pytest.raises(AssertionError):
        DateRange.create("2024-10-01", separator=":")


def test_from_accepts_python_format_string() -> None:
    dr = DateRange.create("2024/10/01|2024/10/02", fmt="%Y/%m/%d", separator="|")
    assert dr.start == ts(2024, 10, 1)
    assert dr.end == ts(2024, 10, 2)
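These tests pin down the range semantics: both boundaries are inclusive timestamps at 00:00 UTC, and `backward()` ends at 00:00 of the day after the anchor date so the anchor day itself is covered. A minimal sketch consistent with the assertions above (field and method names come from the tests; the rest is an assumption, not the project's implementation):

```python
from dataclasses import dataclass
from datetime import datetime, timedelta


@dataclass
class DateRange:
    start: int  # inclusive unix timestamp (UTC)
    end: int    # inclusive unix timestamp (UTC)

    @classmethod
    def backward(cls, date: datetime, days: int) -> "DateRange":
        start = date - timedelta(days=days)
        end = date + timedelta(days=1)  # 00:00 of the next day covers the anchor day
        return cls(start=int(start.timestamp()), end=int(end.timestamp()))

    def in_range(self, timestamp: int) -> bool:
        return self.start <= timestamp <= self.end

    def out_range(self, timestamp: int) -> bool:
        return not self.in_range(timestamp)
```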
@@ -1,19 +0,0 @@
import pytest

from basango.domain import PageRange


def test_it_should_create_page_range():
    pr = PageRange.create("1:10")
    assert pr.start == 1
    assert pr.end == 10


def test_end_page_should_be_greater_than_start_page():
    with pytest.raises(AssertionError):
        PageRange.create("10:1")


def test_non_negative_pages():
    with pytest.raises(AssertionError):
        PageRange.create("-1:-10")
@@ -1,291 +0,0 @@
from unittest.mock import patch

import pytest
from bs4 import BeautifulSoup
from pydantic import HttpUrl

from basango.core.config import WordPressSourceConfig
from basango.core.config.fetch_config import CrawlerConfig, ClientConfig
from basango.core.config.source_config import HtmlSourceConfig, SourceSelectors
from basango.domain import SourceKind, PageRange
from basango.services.crawler.html_crawler import HtmlCrawler


class TestHtmlCrawler:
    """Test suite for HtmlCrawler."""

    @pytest.fixture
    def mock_client_config(self):
        return ClientConfig()

    @pytest.fixture
    def mock_html_source_config(self):
        return HtmlSourceConfig(
            source_id="test_source",
            source_url=HttpUrl("https://example.com"),
            pagination_template="news",
            source_selectors=SourceSelectors(pagination="ul.pagination > li a"),
            supports_categories=True,
        )

    @pytest.fixture
    def mock_crawler_config(self, mock_html_source_config):
        return CrawlerConfig(source=mock_html_source_config, category="tech")

    @pytest.fixture
    def html_crawler(self, mock_crawler_config, mock_client_config):
        return HtmlCrawler(mock_crawler_config, mock_client_config)

    def test_with_valid_html_source(self, html_crawler):
        """Test __init__ with valid HTML source config."""
        assert html_crawler.source.source_kind == SourceKind.HTML
        assert isinstance(html_crawler.source, HtmlSourceConfig)

    def test_with_invalid_source_kind_raises_error(self, mock_client_config):
        """Test __init__ raises ValueError when source kind is not HTML."""
        wordpress_source = WordPressSourceConfig(
            source_id="test_wordpress",
            source_url=HttpUrl("https://example.com"),
        )
        config = CrawlerConfig(source=wordpress_source)

        with pytest.raises(
            ValueError, match="HtmlCrawler requires a source of kind HTML"
        ):
            HtmlCrawler(config, mock_client_config)

    def test_with_no_source_raises_error(self, mock_client_config):
        """Test __init__ raises ValueError when no source is provided."""
        config = CrawlerConfig(source=None)

        with pytest.raises(
            ValueError, match="HtmlCrawler requires a source of kind HTML"
        ):
            HtmlCrawler(config, mock_client_config)

    def test_get_pagination_returns_valid_page_range(self, html_crawler):
        """Test that get_pagination returns a valid PageRange."""
        with patch.object(html_crawler, "get_last_page", return_value=5):
            result = html_crawler.get_pagination()

        assert isinstance(result, PageRange)
        assert result.start == 0
        assert result.end == 5
        assert str(result) == "0:5"

    def test_get_last_page_with_valid_pagination_links(self, html_crawler):
        """Test get_last_page extracts page number from pagination links."""
        # Mock HTML with pagination links
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=1">1</a></li>
            <li><a href="/news?page=2">2</a></li>
            <li><a href="/news?page=3">3</a></li>
            <li><a href="/news?page=10">10</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 10

    def test_get_last_page_with_no_pagination_links(self, html_crawler):
        """Test get_last_page returns 1 when no pagination links found."""
        mock_html = "<div>No pagination here</div>"
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 1

    def test_get_last_page_with_empty_href(self, html_crawler):
        """Test get_last_page returns 1 when href is empty or None."""
        mock_html = """
        <ul class="pagination">
            <li><a>No href</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 1

    def test_get_last_page_with_regex_extraction(self, html_crawler):
        """Test get_last_page extracts page number using regex."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/articles/page/25/">Page 25</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 25

    def test_get_last_page_with_query_parameters(self, html_crawler):
        """Test get_last_page extracts page number from query parameters."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?category=tech&page=15&sort=date">Last</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 15

    def test_get_last_page_with_invalid_page_parameter(self, html_crawler):
        """Test get_last_page returns 1 when page parameter is invalid."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=invalid">Last</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 1

    def test_get_last_page_with_category_support(self, html_crawler):
        """Test get_last_page uses category in URL when supported."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?category=tech&page=8">8</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl") as mock_crawl:
            mock_crawl.return_value = mock_soup
            html_crawler.get_last_page()

            # The URL construction concatenates source_url with the path.
            # Since the template doesn't contain {category}, it should remain unchanged.
            expected_url = "https://example.com/news"
            mock_crawl.assert_called_once_with(expected_url)

    def test_get_last_page_with_category_template(self, mock_client_config):
        """Test get_last_page uses category replacement when template contains {category}."""
        source_config = HtmlSourceConfig(
            source_id="test_source",
            source_url=HttpUrl("https://example.com"),
            pagination_template="news/{category}",
            source_selectors=SourceSelectors(pagination="ul.pagination > li a"),
            supports_categories=True,
        )
        crawler_config = CrawlerConfig(source=source_config, category="tech")
        crawler = HtmlCrawler(crawler_config, mock_client_config)

        mock_html = """
        <ul class="pagination">
            <li><a href="/news/tech?page=5">5</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(crawler, "crawl") as mock_crawl:
            mock_crawl.return_value = mock_soup
            crawler.get_last_page()

            expected_url = "https://example.com/news/tech"
            mock_crawl.assert_called_once_with(expected_url)

    def test_get_last_page_without_category_support(self, html_crawler):
        """Test get_last_page uses default template when categories not supported."""
        # Modify source to not support categories
        html_crawler.source.supports_categories = False

        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=5">5</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl") as mock_crawl:
            mock_crawl.return_value = mock_soup
            html_crawler.get_last_page()

            # Verify the URL was constructed without category replacement
            expected_url = "https://example.com/news"
            mock_crawl.assert_called_once_with(expected_url)

    def test_get_last_page_without_category_in_config(
        self, mock_client_config, mock_html_source_config
    ):
        """Test get_last_page uses default template when no category in config."""
        config = CrawlerConfig(source=mock_html_source_config, category=None)
        crawler = HtmlCrawler(config, mock_client_config)

        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=3">3</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(crawler, "crawl") as mock_crawl:
            mock_crawl.return_value = mock_soup
            crawler.get_last_page()

            # Verify the URL was constructed without category replacement
            expected_url = "https://example.com/news"
            mock_crawl.assert_called_once_with(expected_url)

    def test_get_last_page_with_multiple_numbers_in_href(self, html_crawler):
        """Test get_last_page extracts the first number when multiple numbers are present."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news/2024/page/42/comments/100">Last</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            # Should extract the first number found (2024)
            assert result == 2024

    def test_supports_html_source_kind(self):
        """Test that supports() returns the HTML source kind."""
        assert HtmlCrawler.supports() is SourceKind.HTML

    def test_get_pagination_integration(self, html_crawler):
        """Integration test for get_pagination calling get_last_page."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=7">7</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_pagination()

        assert isinstance(result, PageRange)
        assert result.start == 0
        assert result.end == 7

    def test_get_last_page_with_non_string_href(self, html_crawler):
        """Test get_last_page handles missing or non-string href attributes."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=5">5</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        # Remove the href to simulate a link without a usable string attribute
        pagination_link = mock_soup.select("ul.pagination > li a")[-1]
        del pagination_link.attrs["href"]

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 1
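Taken together, these tests pin down how the last page number is recovered from a pagination link. A sketch of that behaviour under the tests' assumptions, not the actual implementation: an explicit `page` query parameter wins; otherwise the first integer anywhere in the href is used; anything missing or unparsable falls back to 1 (the helper name below is illustrative):

```python
import re
from urllib.parse import parse_qs, urlparse


def last_page_from_href(href: str | None) -> int:
    if not href:
        return 1
    params = parse_qs(urlparse(href).query)
    if "page" in params:
        try:
            return int(params["page"][0])  # "/news?page=15" -> 15
        except ValueError:
            return 1                       # "?page=invalid" -> 1
    match = re.search(r"\d+", href)        # "/news/2024/page/42" -> 2024
    return int(match.group()) if match else 1
```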
@@ -1,239 +0,0 @@
from unittest.mock import Mock, patch

import pytest
from pydantic import HttpUrl

from basango.core.config.fetch_config import CrawlerConfig, ClientConfig
from basango.core.config.source_config import (
    WordPressSourceConfig,
    HtmlSourceConfig,
    SourceSelectors,
)
from basango.domain import SourceKind, PageRange
from basango.services.crawler.wordpress_crawler import WordpressCrawler


class TestWordPressCrawler:
    """Test suite for WordPressCrawler."""

    @pytest.fixture
    def mock_client_config(self):
        return ClientConfig()

    @pytest.fixture
    def mock_wordpress_source_config(self):
        return WordPressSourceConfig(
            source_id="test_wordpress_source",
            source_url=HttpUrl("https://example.com/"),
            supports_categories=True,
            categories=["tech", "news"],
        )

    @pytest.fixture
    def mock_crawler_config(self, mock_wordpress_source_config):
        return CrawlerConfig(source=mock_wordpress_source_config, category="tech")

    @pytest.fixture
    def wordpress_crawler(self, mock_crawler_config, mock_client_config):
        return WordpressCrawler(mock_crawler_config, mock_client_config)

    @pytest.fixture
    def mock_response_with_headers(self):
        response = Mock()
        response.headers = {
            WordpressCrawler.TOTAL_PAGES_HEADER: "5",
            WordpressCrawler.TOTAL_POSTS_HEADER: "47",
        }
        return response

    def test_with_valid_wordpress_source(self, wordpress_crawler):
        """Test __init__ with valid WordPress source config."""
        assert wordpress_crawler.source.source_kind == SourceKind.WORDPRESS
        assert isinstance(wordpress_crawler.source, WordPressSourceConfig)

    def test_with_invalid_source_kind_raises_error(self, mock_client_config):
        """Test __init__ raises ValueError when source kind is not WORDPRESS."""
        html_source = HtmlSourceConfig(
            source_id="test_html",
            source_url=HttpUrl("https://example.com"),
            pagination_template="news",
            source_selectors=SourceSelectors(),
        )
        config = CrawlerConfig(source=html_source)

        with pytest.raises(
            ValueError, match="WordpressCrawler requires a source of kind WORDPRESS"
        ):
            WordpressCrawler(config, mock_client_config)

    def test_with_no_source_raises_error(self, mock_client_config):
        """Test __init__ raises ValueError when source is None."""
        config = CrawlerConfig(source=None)

        with pytest.raises(
            ValueError, match="WordpressCrawler requires a source of kind WORDPRESS"
        ):
            WordpressCrawler(config, mock_client_config)

    def test_get_pagination_returns_valid_page_range(
        self, wordpress_crawler, mock_response_with_headers
    ):
        """Test get_pagination returns correct PageRange from WordPress API headers."""
        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_response_with_headers
        ):
            result = wordpress_crawler.get_pagination()

        assert isinstance(result, PageRange)
        assert result.start == 1
        assert result.end == 5
        assert str(result) == "1:5"

    def test_get_pagination_with_default_headers(self, wordpress_crawler):
        """Test get_pagination with default values when WordPress headers are missing."""
        mock_response = Mock()
        mock_response.headers = {}  # No WordPress headers

        with patch.object(wordpress_crawler.client, "get", return_value=mock_response):
            result = wordpress_crawler.get_pagination()

        assert isinstance(result, PageRange)
        assert result.start == 1
        assert result.end == 1  # Default when no headers

    def test_get_pagination_makes_correct_api_call(self, wordpress_crawler):
        """Test get_pagination makes the correct WordPress API call."""
        mock_response = Mock()
        mock_response.headers = {
            WordpressCrawler.TOTAL_PAGES_HEADER: "3",
            WordpressCrawler.TOTAL_POSTS_HEADER: "25",
        }

        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_response
        ) as mock_get:
            wordpress_crawler.get_pagination()

            expected_url = f"{wordpress_crawler.source.source_url}wp-json/wp/v2/posts?_fields=id&per_page=100"
            mock_get.assert_called_once_with(expected_url)

    def test_fetch_categories_populates_category_map(self, wordpress_crawler):
        """Test _fetch_categories populates the category_map correctly."""
        mock_categories_response = Mock()
        mock_categories_response.json.return_value = [
            {"id": 1, "slug": "technology", "count": 15},
            {"id": 2, "slug": "business", "count": 10},
            {"id": 3, "slug": "sports", "count": 8},
        ]

        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_categories_response
        ):
            wordpress_crawler._fetch_categories()

        assert len(wordpress_crawler.category_map) == 3
        assert wordpress_crawler.category_map[1] == "technology"
        assert wordpress_crawler.category_map[2] == "business"
        assert wordpress_crawler.category_map[3] == "sports"

    def test_fetch_categories_makes_correct_api_call(self, wordpress_crawler):
        """Test _fetch_categories makes the correct WordPress API call."""
        mock_response = Mock()
        mock_response.json.return_value = []

        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_response
        ) as mock_get:
            wordpress_crawler._fetch_categories()

            expected_url = f"{wordpress_crawler.source.source_url}wp-json/wp/v2/categories?{WordpressCrawler.CATEGORY_QUERY}"
            mock_get.assert_called_once_with(expected_url)

    def test_map_categories_with_populated_category_map(self, wordpress_crawler):
        """Test _map_categories returns correct comma-separated string."""
        # Pre-populate category map
        wordpress_crawler.category_map = {
            1: "technology",
            2: "business",
            3: "sports",
            4: "lifestyle",
        }

        result = wordpress_crawler._map_categories([2, 1, 4])

        # Should be sorted by category ID
        assert result == "technology,business,lifestyle"

    def test_map_categories_with_empty_category_map_fetches_categories(
        self, wordpress_crawler
    ):
        """Test _map_categories fetches categories when category_map is empty."""
        mock_categories_response = Mock()
        mock_categories_response.json.return_value = [
            {"id": 1, "slug": "tech", "count": 15},
            {"id": 2, "slug": "news", "count": 10},
        ]

        wordpress_crawler.category_map = {}
        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_categories_response
        ):
            result = wordpress_crawler._map_categories([1, 2])

        assert result == "tech,news"
        assert len(wordpress_crawler.category_map) == 2

    def test_map_categories_filters_unknown_category_ids(self, wordpress_crawler):
        """Test _map_categories filters out unknown category IDs."""
        wordpress_crawler.category_map = {1: "technology", 2: "business"}

        result = wordpress_crawler._map_categories([1, 99, 2, 100])

        # Should only include known categories
        assert result == "technology,business"

    def test_map_categories_with_empty_category_list(self, wordpress_crawler):
        """Test _map_categories returns empty string for empty category list."""
        wordpress_crawler.category_map = {1: "tech", 2: "news"}

        result = wordpress_crawler._map_categories([])

        assert result == ""

    def test_map_categories_sorts_by_category_id(self, wordpress_crawler):
        """Test _map_categories sorts categories by ID."""
        wordpress_crawler.category_map = {3: "charlie", 1: "alpha", 2: "beta"}

        result = wordpress_crawler._map_categories([3, 1, 2])

        # Should be sorted by ID: 1, 2, 3
        assert result == "alpha,beta,charlie"

    def test_supports_wordpress_source_kind(self):
        """Test that supports() returns the WordPress source kind."""
        assert WordpressCrawler.supports() is SourceKind.WORDPRESS

    @pytest.mark.parametrize(
        "pages,posts,expected_start,expected_end",
        [
            ("1", "10", 1, 1),
            ("5", "47", 1, 5),
            ("10", "100", 1, 10),
        ],
    )
    def test_get_pagination_with_various_header_values(
        self, wordpress_crawler, pages, posts, expected_start, expected_end
    ):
        """Test get_pagination with various header values."""
        mock_response = Mock()
        mock_response.headers = {
            WordpressCrawler.TOTAL_PAGES_HEADER: pages,
            WordpressCrawler.TOTAL_POSTS_HEADER: posts,
        }

        with patch.object(wordpress_crawler.client, "get", return_value=mock_response):
            result = wordpress_crawler.get_pagination()

        assert result.start == expected_start
        assert result.end == expected_end
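These tests read the page count from response headers rather than the body. The WordPress REST API reports pagination in the `X-WP-TotalPages` and `X-WP-Total` headers, which is presumably what `TOTAL_PAGES_HEADER` and `TOTAL_POSTS_HEADER` map to; that mapping is an assumption here. A minimal sketch of reading them directly:

```python
import httpx

# Assuming TOTAL_PAGES_HEADER = "X-WP-TotalPages" and TOTAL_POSTS_HEADER = "X-WP-Total",
# the standard pagination headers of the WordPress REST API.
resp = httpx.get("https://example.com/wp-json/wp/v2/posts?_fields=id&per_page=100")
total_pages = int(resp.headers.get("X-WP-TotalPages", 1))
total_posts = int(resp.headers.get("X-WP-Total", 0))
```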
@@ -1,70 +0,0 @@
from datetime import datetime, timezone

import pytest

from basango.services.date_parser import DateParser


@pytest.mark.parametrize(
    "date_str, fmt, pattern, replacement, expected",
    [
        (
            "2004-02-12T15:19:21",
            "%Y-%m-%dT%H:%M:%S",
            None,
            None,
            1076599161,  # 2004-02-12 15:19:21 UTC
        ),
        (
            "08/10/2024 - 00:00",
            "%Y-%m-%d %H:%M",
            r"/(\d{2})\/(\d{2})\/(\d{4}) - (\d{2}:\d{2})/",
            r"$3-$2-$1 $4",
            1728345600,  # 2024-10-08 00:00:00 UTC
        ),
        (
            "mar 08/10/2024 - 00:00",
            "%Y-%m-%d %H:%M",
            r"/\w{3} (\d{2})\/(\d{2})\/(\d{4}) - (\d{2}:\d{2})/",
            r"$3-$2-$1 $4",
            1728345600,  # 2024-10-08 00:00:00 UTC
        ),
        (
            "Mardi 8 octobre 2024 - 00:00",
            "%Y-%m-%d %H:%M",
            r"/(\d{1}) (\d{1,2}) (\d{2}) (\d{4}) - (\d{2}:\d{2})/",
            r"$4-$3-$2 $5",
            1728345600,  # 2024-10-08 00:00:00 UTC
        ),
        (
            "8.10.2024 00:00",
            "%d.%m.%Y %H:%M",
            None,
            None,
            1728345600,  # 2024-10-08 00:00:00 UTC
        ),
    ],
)
def test_create_timestamp_with_valid_dates(
    date_str: str,
    fmt: str | None,
    pattern: str | None,
    replacement: str | None,
    expected: int,
) -> None:
    dr = DateParser()
    result = dr.create_timestamp(date_str, fmt, pattern, replacement)
    assert result == expected


def test_create_timestamp_with_invalid_date_falls_back_to_midnight_today() -> None:
    dr = DateParser()

    # Compute expected midnight (UTC) before invoking the parser to avoid edge cases.
    now = datetime.now(timezone.utc)
    expected_midnight = int(
        now.replace(hour=0, minute=0, second=0, microsecond=0).timestamp()
    )

    result = dr.create_timestamp("invalid date string", None, None, None)
    assert result == expected_midnight
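The pattern/replacement pairs above use PHP preg-style syntax (slash-delimited patterns, `$n` backreferences), so the parser presumably strips the delimiters and rewrites `$n` into Python group references before applying `re.sub` and `strptime`. A minimal sketch of that normalization step under those assumptions (the helper name is illustrative, not the project's API):

```python
import re


def normalize(date_str: str, pattern: str, replacement: str) -> str:
    regex = pattern.strip("/")                           # drop PHP-style delimiters
    repl = re.sub(r"\$(\d+)", r"\\g<\1>", replacement)   # "$3" -> "\g<3>"
    return re.sub(regex, repl, date_str)


# normalize("08/10/2024 - 00:00",
#           r"/(\d{2})\/(\d{2})\/(\d{4}) - (\d{2}:\d{2})/",
#           r"$3-$2-$1 $4")
# -> "2024-10-08 00:00", ready for strptime with "%Y-%m-%d %H:%M"
```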
@@ -1,9 +0,0 @@
import os
import sys


# Ensure 'src' is on sys.path so `import basango...` works in tests
ROOT = os.path.dirname(os.path.dirname(__file__))
SRC = os.path.join(ROOT, "src")
if SRC not in sys.path:
    sys.path.insert(0, SRC)
Generated (-948 lines)
@@ -1,948 +0,0 @@
|
||||
version = 1
|
||||
revision = 3
|
||||
requires-python = ">=3.13"
|
||||
|
||||
[[package]]
|
||||
name = "annotated-types"
|
||||
version = "0.7.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyio"
|
||||
version = "4.10.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "idna" },
|
||||
{ name = "sniffio" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f1/b4/636b3b65173d3ce9a38ef5f0522789614e590dab6a8d505340a4efe4c567/anyio-4.10.0.tar.gz", hash = "sha256:3f3fae35c96039744587aa5b8371e7e8e603c0702999535961dd336026973ba6", size = 213252, upload-time = "2025-08-04T08:54:26.451Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/6f/12/e5e0282d673bb9746bacfb6e2dba8719989d3660cdb2ea79aee9a9651afb/anyio-4.10.0-py3-none-any.whl", hash = "sha256:60e474ac86736bbfd6f210f7a61218939c318f43f9972497381f1c5e930ed3d1", size = 107213, upload-time = "2025-08-04T08:54:24.882Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "babel"
|
||||
version = "2.17.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/7d/6b/d52e42361e1aa00709585ecc30b3f9684b3ab62530771402248b1b1d6240/babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d", size = 9951852, upload-time = "2025-02-01T15:17:41.026Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bandit"
|
||||
version = "1.8.6"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
||||
{ name = "pyyaml" },
|
||||
{ name = "rich" },
|
||||
{ name = "stevedore" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/fb/b5/7eb834e213d6f73aace21938e5e90425c92e5f42abafaf8a6d5d21beed51/bandit-1.8.6.tar.gz", hash = "sha256:dbfe9c25fc6961c2078593de55fd19f2559f9e45b99f1272341f5b95dea4e56b", size = 4240271, upload-time = "2025-07-06T03:10:50.9Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/48/ca/ba5f909b40ea12ec542d5d7bdd13ee31c4d65f3beed20211ef81c18fa1f3/bandit-1.8.6-py3-none-any.whl", hash = "sha256:3348e934d736fcdb68b6aa4030487097e23a501adf3e7827b63658df464dddd0", size = 133808, upload-time = "2025-07-06T03:10:49.134Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "basango"
|
||||
version = "0.1.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "beautifulsoup4" },
|
||||
{ name = "httpx" },
|
||||
{ name = "markdownify" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "pydantic-settings" },
|
||||
{ name = "pyyaml" },
|
||||
{ name = "readability-lxml" },
|
||||
{ name = "rq" },
|
||||
{ name = "selectolax" },
|
||||
{ name = "tiktoken" },
|
||||
{ name = "trafilatura" },
|
||||
{ name = "typer" },
|
||||
{ name = "uv-build" },
|
||||
]
|
||||
|
||||
[package.dev-dependencies]
|
||||
dev = [
|
||||
{ name = "bandit" },
|
||||
{ name = "pyright" },
|
||||
{ name = "pytest" },
|
||||
{ name = "ruff" },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "beautifulsoup4", specifier = ">=4.13.5" },
|
||||
{ name = "httpx", specifier = ">=0.27.2" },
|
||||
{ name = "markdownify", specifier = ">=0.13.1" },
|
||||
{ name = "pydantic", specifier = ">=2.11.7" },
|
||||
{ name = "pydantic-settings", specifier = ">=2.10.1" },
|
||||
{ name = "pyyaml", specifier = ">=6.0.2" },
|
||||
{ name = "readability-lxml", specifier = ">=0.8.1" },
|
||||
{ name = "rq", specifier = ">=2.5.0" },
|
||||
{ name = "selectolax", specifier = ">=0.3.20" },
|
||||
{ name = "tiktoken", specifier = ">=0.12.0" },
|
||||
{ name = "trafilatura", specifier = ">=1.7.0" },
|
||||
{ name = "typer", specifier = ">=0.16.1" },
|
||||
{ name = "uv-build", specifier = ">=0.8.12,<0.9.0" },
|
||||
]
|
||||
|
||||
[package.metadata.requires-dev]
|
||||
dev = [
|
||||
{ name = "bandit", specifier = ">=1.8.6" },
|
||||
{ name = "pyright", specifier = ">=1.1.404" },
|
||||
{ name = "pytest", specifier = ">=8.4.1" },
|
||||
{ name = "ruff", specifier = ">=0.12.9" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "beautifulsoup4"
|
||||
version = "4.13.5"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "soupsieve" },
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/85/2e/3e5079847e653b1f6dc647aa24549d68c6addb4c595cc0d902d1b19308ad/beautifulsoup4-4.13.5.tar.gz", hash = "sha256:5e70131382930e7c3de33450a2f54a63d5e4b19386eab43a5b34d594268f3695", size = 622954, upload-time = "2025-08-24T14:06:13.168Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/04/eb/f4151e0c7377a6e08a38108609ba5cede57986802757848688aeedd1b9e8/beautifulsoup4-4.13.5-py3-none-any.whl", hash = "sha256:642085eaa22233aceadff9c69651bc51e8bf3f874fb6d7104ece2beb24b47c4a", size = 105113, upload-time = "2025-08-24T14:06:14.884Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2025.8.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/dc/67/960ebe6bf230a96cda2e0abcf73af550ec4f090005363542f0765df162e0/certifi-2025.8.3.tar.gz", hash = "sha256:e564105f78ded564e3ae7c923924435e1daa7463faeab5bb932bc53ffae63407", size = 162386, upload-time = "2025-08-03T03:07:47.08Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl", hash = "sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5", size = 161216, upload-time = "2025-08-03T03:07:45.777Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chardet"
|
||||
version = "5.2.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618, upload-time = "2023-08-01T19:23:02.662Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385, upload-time = "2023-08-01T19:23:00.661Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "charset-normalizer"
|
||||
version = "3.4.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/83/2d/5fd176ceb9b2fc619e63405525573493ca23441330fcdaee6bef9460e924/charset_normalizer-3.4.3.tar.gz", hash = "sha256:6fce4b8500244f6fcb71465d4a4930d132ba9ab8e71a7859e6a5d59851068d14", size = 122371, upload-time = "2025-08-09T07:57:28.46Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/65/ca/2135ac97709b400c7654b4b764daf5c5567c2da45a30cdd20f9eefe2d658/charset_normalizer-3.4.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:14c2a87c65b351109f6abfc424cab3927b3bdece6f706e4d12faaf3d52ee5efe", size = 205326, upload-time = "2025-08-09T07:56:24.721Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/71/11/98a04c3c97dd34e49c7d247083af03645ca3730809a5509443f3c37f7c99/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:41d1fc408ff5fdfb910200ec0e74abc40387bccb3252f3f27c0676731df2b2c8", size = 146008, upload-time = "2025-08-09T07:56:26.004Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/60/f5/4659a4cb3c4ec146bec80c32d8bb16033752574c20b1252ee842a95d1a1e/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1bb60174149316da1c35fa5233681f7c0f9f514509b8e399ab70fea5f17e45c9", size = 159196, upload-time = "2025-08-09T07:56:27.25Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/86/9e/f552f7a00611f168b9a5865a1414179b2c6de8235a4fa40189f6f79a1753/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:30d006f98569de3459c2fc1f2acde170b7b2bd265dc1943e87e1a4efe1b67c31", size = 156819, upload-time = "2025-08-09T07:56:28.515Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7e/95/42aa2156235cbc8fa61208aded06ef46111c4d3f0de233107b3f38631803/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:416175faf02e4b0810f1f38bcb54682878a4af94059a1cd63b8747244420801f", size = 151350, upload-time = "2025-08-09T07:56:29.716Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c2/a9/3865b02c56f300a6f94fc631ef54f0a8a29da74fb45a773dfd3dcd380af7/charset_normalizer-3.4.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6aab0f181c486f973bc7262a97f5aca3ee7e1437011ef0c2ec04b5a11d16c927", size = 148644, upload-time = "2025-08-09T07:56:30.984Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/77/d9/cbcf1a2a5c7d7856f11e7ac2d782aec12bdfea60d104e60e0aa1c97849dc/charset_normalizer-3.4.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:fdabf8315679312cfa71302f9bd509ded4f2f263fb5b765cf1433b39106c3cc9", size = 160468, upload-time = "2025-08-09T07:56:32.252Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f6/42/6f45efee8697b89fda4d50580f292b8f7f9306cb2971d4b53f8914e4d890/charset_normalizer-3.4.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:bd28b817ea8c70215401f657edef3a8aa83c29d447fb0b622c35403780ba11d5", size = 158187, upload-time = "2025-08-09T07:56:33.481Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/70/99/f1c3bdcfaa9c45b3ce96f70b14f070411366fa19549c1d4832c935d8e2c3/charset_normalizer-3.4.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:18343b2d246dc6761a249ba1fb13f9ee9a2bcd95decc767319506056ea4ad4dc", size = 152699, upload-time = "2025-08-09T07:56:34.739Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a3/ad/b0081f2f99a4b194bcbb1934ef3b12aa4d9702ced80a37026b7607c72e58/charset_normalizer-3.4.3-cp313-cp313-win32.whl", hash = "sha256:6fb70de56f1859a3f71261cbe41005f56a7842cc348d3aeb26237560bfa5e0ce", size = 99580, upload-time = "2025-08-09T07:56:35.981Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9a/8f/ae790790c7b64f925e5c953b924aaa42a243fb778fed9e41f147b2a5715a/charset_normalizer-3.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:cf1ebb7d78e1ad8ec2a8c4732c7be2e736f6e5123a4146c5b89c9d1f585f8cef", size = 107366, upload-time = "2025-08-09T07:56:37.339Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8e/91/b5a06ad970ddc7a0e513112d40113e834638f4ca1120eb727a249fb2715e/charset_normalizer-3.4.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3cd35b7e8aedeb9e34c41385fda4f73ba609e561faedfae0a9e75e44ac558a15", size = 204342, upload-time = "2025-08-09T07:56:38.687Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ce/ec/1edc30a377f0a02689342f214455c3f6c2fbedd896a1d2f856c002fc3062/charset_normalizer-3.4.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b89bc04de1d83006373429975f8ef9e7932534b8cc9ca582e4db7d20d91816db", size = 145995, upload-time = "2025-08-09T07:56:40.048Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/17/e5/5e67ab85e6d22b04641acb5399c8684f4d37caf7558a53859f0283a650e9/charset_normalizer-3.4.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2001a39612b241dae17b4687898843f254f8748b796a2e16f1051a17078d991d", size = 158640, upload-time = "2025-08-09T07:56:41.311Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f1/e5/38421987f6c697ee3722981289d554957c4be652f963d71c5e46a262e135/charset_normalizer-3.4.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8dcfc373f888e4fb39a7bc57e93e3b845e7f462dacc008d9749568b1c4ece096", size = 156636, upload-time = "2025-08-09T07:56:43.195Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a0/e4/5a075de8daa3ec0745a9a3b54467e0c2967daaaf2cec04c845f73493e9a1/charset_normalizer-3.4.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18b97b8404387b96cdbd30ad660f6407799126d26a39ca65729162fd810a99aa", size = 150939, upload-time = "2025-08-09T07:56:44.819Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/02/f7/3611b32318b30974131db62b4043f335861d4d9b49adc6d57c1149cc49d4/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ccf600859c183d70eb47e05a44cd80a4ce77394d1ac0f79dbd2dd90a69a3a049", size = 148580, upload-time = "2025-08-09T07:56:46.684Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7e/61/19b36f4bd67f2793ab6a99b979b4e4f3d8fc754cbdffb805335df4337126/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:53cd68b185d98dde4ad8990e56a58dea83a4162161b1ea9272e5c9182ce415e0", size = 159870, upload-time = "2025-08-09T07:56:47.941Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/06/57/84722eefdd338c04cf3030ada66889298eaedf3e7a30a624201e0cbe424a/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:30a96e1e1f865f78b030d65241c1ee850cdf422d869e9028e2fc1d5e4db73b92", size = 157797, upload-time = "2025-08-09T07:56:49.756Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/72/2a/aff5dd112b2f14bcc3462c312dce5445806bfc8ab3a7328555da95330e4b/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d716a916938e03231e86e43782ca7878fb602a125a91e7acb8b5112e2e96ac16", size = 152224, upload-time = "2025-08-09T07:56:51.369Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b7/8c/9839225320046ed279c6e839d51f028342eb77c91c89b8ef2549f951f3ec/charset_normalizer-3.4.3-cp314-cp314-win32.whl", hash = "sha256:c6dbd0ccdda3a2ba7c2ecd9d77b37f3b5831687d8dc1b6ca5f56a4880cc7b7ce", size = 100086, upload-time = "2025-08-09T07:56:52.722Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ee/7a/36fbcf646e41f710ce0a563c1c9a343c6edf9be80786edeb15b6f62e17db/charset_normalizer-3.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:73dc19b562516fc9bcf6e5d6e596df0b4eb98d87e4f79f3ae71840e6ed21361c", size = 107400, upload-time = "2025-08-09T07:56:55.172Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8a/1f/f041989e93b001bc4e44bb1669ccdcf54d3f00e628229a85b08d330615c5/charset_normalizer-3.4.3-py3-none-any.whl", hash = "sha256:ce571ab16d890d23b5c278547ba694193a45011ff86a9162a71307ed9f86759a", size = 53175, upload-time = "2025-08-09T07:57:26.864Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "click"
|
||||
version = "8.2.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/60/6c/8ca2efa64cf75a977a0d7fac081354553ebe483345c734fb6b6515d96bbc/click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202", size = 286342, upload-time = "2025-05-20T23:19:49.832Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215, upload-time = "2025-05-20T23:19:47.796Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.6"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "courlan"
|
||||
version = "1.3.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "babel" },
|
||||
{ name = "tld" },
|
||||
{ name = "urllib3" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/6f/54/6d6ceeff4bed42e7a10d6064d35ee43a810e7b3e8beb4abeae8cff4713ae/courlan-1.3.2.tar.gz", hash = "sha256:0b66f4db3a9c39a6e22dd247c72cfaa57d68ea660e94bb2c84ec7db8712af190", size = 206382, upload-time = "2024-10-29T16:40:20.994Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/8e/ca/6a667ccbe649856dcd3458bab80b016681b274399d6211187c6ab969fc50/courlan-1.3.2-py3-none-any.whl", hash = "sha256:d0dab52cf5b5b1000ee2839fbc2837e93b2514d3cb5bb61ae158a55b7a04c6be", size = 33848, upload-time = "2024-10-29T16:40:18.325Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "croniter"
|
||||
version = "6.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "python-dateutil" },
|
||||
{ name = "pytz" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/ad/2f/44d1ae153a0e27be56be43465e5cb39b9650c781e001e7864389deb25090/croniter-6.0.0.tar.gz", hash = "sha256:37c504b313956114a983ece2c2b07790b1f1094fe9d81cc94739214748255577", size = 64481, upload-time = "2024-12-17T17:17:47.32Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/07/4b/290b4c3efd6417a8b0c284896de19b1d5855e6dbdb97d2a35e68fa42de85/croniter-6.0.0-py2.py3-none-any.whl", hash = "sha256:2f878c3856f17896979b2a4379ba1f09c83e374931ea15cc835c5dd2eee9b368", size = 25468, upload-time = "2024-12-17T17:17:45.359Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cssselect"
|
||||
version = "1.3.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/72/0a/c3ea9573b1dc2e151abfe88c7fe0c26d1892fe6ed02d0cdb30f0d57029d5/cssselect-1.3.0.tar.gz", hash = "sha256:57f8a99424cfab289a1b6a816a43075a4b00948c86b4dcf3ef4ee7e15f7ab0c7", size = 42870, upload-time = "2025-03-10T09:30:29.638Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/ee/58/257350f7db99b4ae12b614a36256d9cc870d71d9e451e79c2dc3b23d7c3c/cssselect-1.3.0-py3-none-any.whl", hash = "sha256:56d1bf3e198080cc1667e137bc51de9cadfca259f03c2d4e09037b3e01e30f0d", size = 18786, upload-time = "2025-03-10T09:30:28.048Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dateparser"
|
||||
version = "1.2.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "python-dateutil" },
|
||||
{ name = "pytz" },
|
||||
{ name = "regex" },
|
||||
{ name = "tzlocal" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a9/30/064144f0df1749e7bb5faaa7f52b007d7c2d08ec08fed8411aba87207f68/dateparser-1.2.2.tar.gz", hash = "sha256:986316f17cb8cdc23ea8ce563027c5ef12fc725b6fb1d137c14ca08777c5ecf7", size = 329840, upload-time = "2025-06-26T09:29:23.211Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/87/22/f020c047ae1346613db9322638186468238bcfa8849b4668a22b97faad65/dateparser-1.2.2-py3-none-any.whl", hash = "sha256:5a5d7211a09013499867547023a2a0c91d5a27d15dd4dbcea676ea9fe66f2482", size = 315453, upload-time = "2025-06-26T09:29:21.412Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "h11"
|
||||
version = "0.16.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "htmldate"
|
||||
version = "1.9.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "charset-normalizer" },
|
||||
{ name = "dateparser" },
|
||||
{ name = "lxml" },
|
||||
{ name = "python-dateutil" },
|
||||
{ name = "urllib3" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a5/26/aaae4cab984f0b7dd0f5f1b823fa2ed2fd4a2bb50acd5bd2f0d217562678/htmldate-1.9.3.tar.gz", hash = "sha256:ac0caf4628c3ded4042011e2d60dc68dfb314c77b106587dd307a80d77e708e9", size = 44913, upload-time = "2024-12-30T12:52:35.206Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/05/49/8872130016209c20436ce0c1067de1cf630755d0443d068a5bc17fa95015/htmldate-1.9.3-py3-none-any.whl", hash = "sha256:3fadc422cf3c10a5cdb5e1b914daf37ec7270400a80a1b37e2673ff84faaaff8", size = 31565, upload-time = "2024-12-30T12:52:32.145Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "httpcore"
|
||||
version = "1.0.9"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "certifi" },
|
||||
{ name = "h11" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "httpx"
|
||||
version = "0.28.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "anyio" },
|
||||
{ name = "certifi" },
|
||||
{ name = "httpcore" },
|
||||
{ name = "idna" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "3.10"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iniconfig"
|
||||
version = "2.1.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload-time = "2025-03-19T20:09:59.721Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "justext"
|
||||
version = "3.0.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "lxml", extra = ["html-clean"] },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/49/f3/45890c1b314f0d04e19c1c83d534e611513150939a7cf039664d9ab1e649/justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05", size = 828521, upload-time = "2025-02-25T20:21:49.934Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f2/ac/52f4e86d1924a7fc05af3aeb34488570eccc39b4af90530dd6acecdf16b5/justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7", size = 837940, upload-time = "2025-02-25T20:21:44.179Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lxml"
|
||||
version = "5.4.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/76/3d/14e82fc7c8fb1b7761f7e748fd47e2ec8276d137b6acfe5a4bb73853e08f/lxml-5.4.0.tar.gz", hash = "sha256:d12832e1dbea4be280b22fd0ea7c9b87f0d8fc51ba06e92dc62d52f804f78ebd", size = 3679479, upload-time = "2025-04-23T01:50:29.322Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/87/cb/2ba1e9dd953415f58548506fa5549a7f373ae55e80c61c9041b7fd09a38a/lxml-5.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:773e27b62920199c6197130632c18fb7ead3257fce1ffb7d286912e56ddb79e0", size = 8110086, upload-time = "2025-04-23T01:46:52.218Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b5/3e/6602a4dca3ae344e8609914d6ab22e52ce42e3e1638c10967568c5c1450d/lxml-5.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ce9c671845de9699904b1e9df95acfe8dfc183f2310f163cdaa91a3535af95de", size = 4404613, upload-time = "2025-04-23T01:46:55.281Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4c/72/bf00988477d3bb452bef9436e45aeea82bb40cdfb4684b83c967c53909c7/lxml-5.4.0-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9454b8d8200ec99a224df8854786262b1bd6461f4280064c807303c642c05e76", size = 5012008, upload-time = "2025-04-23T01:46:57.817Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/92/1f/93e42d93e9e7a44b2d3354c462cd784dbaaf350f7976b5d7c3f85d68d1b1/lxml-5.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cccd007d5c95279e529c146d095f1d39ac05139de26c098166c4beb9374b0f4d", size = 4760915, upload-time = "2025-04-23T01:47:00.745Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/45/0b/363009390d0b461cf9976a499e83b68f792e4c32ecef092f3f9ef9c4ba54/lxml-5.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0fce1294a0497edb034cb416ad3e77ecc89b313cff7adbee5334e4dc0d11f422", size = 5283890, upload-time = "2025-04-23T01:47:04.702Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/19/dc/6056c332f9378ab476c88e301e6549a0454dbee8f0ae16847414f0eccb74/lxml-5.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:24974f774f3a78ac12b95e3a20ef0931795ff04dbb16db81a90c37f589819551", size = 4812644, upload-time = "2025-04-23T01:47:07.833Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ee/8a/f8c66bbb23ecb9048a46a5ef9b495fd23f7543df642dabeebcb2eeb66592/lxml-5.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:497cab4d8254c2a90bf988f162ace2ddbfdd806fce3bda3f581b9d24c852e03c", size = 4921817, upload-time = "2025-04-23T01:47:10.317Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/04/57/2e537083c3f381f83d05d9b176f0d838a9e8961f7ed8ddce3f0217179ce3/lxml-5.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:e794f698ae4c5084414efea0f5cc9f4ac562ec02d66e1484ff822ef97c2cadff", size = 4753916, upload-time = "2025-04-23T01:47:12.823Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d8/80/ea8c4072109a350848f1157ce83ccd9439601274035cd045ac31f47f3417/lxml-5.4.0-cp313-cp313-manylinux_2_28_ppc64le.whl", hash = "sha256:2c62891b1ea3094bb12097822b3d44b93fc6c325f2043c4d2736a8ff09e65f60", size = 5289274, upload-time = "2025-04-23T01:47:15.916Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b3/47/c4be287c48cdc304483457878a3f22999098b9a95f455e3c4bda7ec7fc72/lxml-5.4.0-cp313-cp313-manylinux_2_28_s390x.whl", hash = "sha256:142accb3e4d1edae4b392bd165a9abdee8a3c432a2cca193df995bc3886249c8", size = 4874757, upload-time = "2025-04-23T01:47:19.793Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2f/04/6ef935dc74e729932e39478e44d8cfe6a83550552eaa072b7c05f6f22488/lxml-5.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1a42b3a19346e5601d1b8296ff6ef3d76038058f311902edd574461e9c036982", size = 4947028, upload-time = "2025-04-23T01:47:22.401Z" },
{ url = "https://files.pythonhosted.org/packages/cb/f9/c33fc8daa373ef8a7daddb53175289024512b6619bc9de36d77dca3df44b/lxml-5.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4291d3c409a17febf817259cb37bc62cb7eb398bcc95c1356947e2871911ae61", size = 4834487, upload-time = "2025-04-23T01:47:25.513Z" },
{ url = "https://files.pythonhosted.org/packages/8d/30/fc92bb595bcb878311e01b418b57d13900f84c2b94f6eca9e5073ea756e6/lxml-5.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4f5322cf38fe0e21c2d73901abf68e6329dc02a4994e483adbcf92b568a09a54", size = 5381688, upload-time = "2025-04-23T01:47:28.454Z" },
{ url = "https://files.pythonhosted.org/packages/43/d1/3ba7bd978ce28bba8e3da2c2e9d5ae3f8f521ad3f0ca6ea4788d086ba00d/lxml-5.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:0be91891bdb06ebe65122aa6bf3fc94489960cf7e03033c6f83a90863b23c58b", size = 5242043, upload-time = "2025-04-23T01:47:31.208Z" },
{ url = "https://files.pythonhosted.org/packages/ee/cd/95fa2201041a610c4d08ddaf31d43b98ecc4b1d74b1e7245b1abdab443cb/lxml-5.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:15a665ad90054a3d4f397bc40f73948d48e36e4c09f9bcffc7d90c87410e478a", size = 5021569, upload-time = "2025-04-23T01:47:33.805Z" },
{ url = "https://files.pythonhosted.org/packages/2d/a6/31da006fead660b9512d08d23d31e93ad3477dd47cc42e3285f143443176/lxml-5.4.0-cp313-cp313-win32.whl", hash = "sha256:d5663bc1b471c79f5c833cffbc9b87d7bf13f87e055a5c86c363ccd2348d7e82", size = 3485270, upload-time = "2025-04-23T01:47:36.133Z" },
{ url = "https://files.pythonhosted.org/packages/fc/14/c115516c62a7d2499781d2d3d7215218c0731b2c940753bf9f9b7b73924d/lxml-5.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:bcb7a1096b4b6b24ce1ac24d4942ad98f983cd3810f9711bcd0293f43a9d8b9f", size = 3814606, upload-time = "2025-04-23T01:47:39.028Z" },
]
[package.optional-dependencies]
html-clean = [
{ name = "lxml-html-clean" },
]
[[package]]
name = "lxml-html-clean"
version = "0.4.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "lxml" },
]
sdist = { url = "https://files.pythonhosted.org/packages/79/b6/466e71db127950fb8d172026a8f0a9f0dc6f64c8e78e2ca79f252e5790b8/lxml_html_clean-0.4.2.tar.gz", hash = "sha256:91291e7b5db95430abf461bc53440964d58e06cc468950f9e47db64976cebcb3", size = 21622, upload-time = "2025-04-09T11:33:59.432Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/4e/0b/942cb7278d6caad79343ad2ddd636ed204a47909b969d19114a3097f5aa3/lxml_html_clean-0.4.2-py3-none-any.whl", hash = "sha256:74ccfba277adcfea87a1e9294f47dd86b05d65b4da7c5b07966e3d5f3be8a505", size = 14184, upload-time = "2025-04-09T11:33:57.988Z" },
]
[[package]]
name = "markdown-it-py"
version = "4.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "mdurl" },
]
sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" },
]
[[package]]
name = "markdownify"
version = "1.2.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "six" },
]
sdist = { url = "https://files.pythonhosted.org/packages/83/1b/6f2697b51eaca81f08852fd2734745af15718fea10222a1d40f8a239c4ea/markdownify-1.2.0.tar.gz", hash = "sha256:f6c367c54eb24ee953921804dfe6d6575c5e5b42c643955e7242034435de634c", size = 18771, upload-time = "2025-08-09T17:44:15.302Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/6a/e2/7af643acb4cae0741dffffaa7f3f7c9e7ab4046724543ba1777c401d821c/markdownify-1.2.0-py3-none-any.whl", hash = "sha256:48e150a1c4993d4d50f282f725c0111bd9eb25645d41fa2f543708fd44161351", size = 15561, upload-time = "2025-08-09T17:44:14.074Z" },
]
[[package]]
name = "mdurl"
version = "0.1.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" },
]
[[package]]
name = "nodeenv"
version = "1.9.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" },
]
[[package]]
name = "packaging"
version = "25.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload-time = "2025-04-19T11:48:59.673Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" },
]
[[package]]
name = "pbr"
version = "7.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "setuptools" },
]
sdist = { url = "https://files.pythonhosted.org/packages/80/88/baf6b45d064271f19fefac7def6a030a893f912f430de0024dd595ced61f/pbr-7.0.0.tar.gz", hash = "sha256:cf4127298723dafbce3afd13775ccf3885be5d3c8435751b867f9a6a10b71a39", size = 129146, upload-time = "2025-08-13T09:16:41.654Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d6/98/120c3e21bf3fc0ef397a3906465ee9f5c76996c52811e65455eadc12d68a/pbr-7.0.0-py2.py3-none-any.whl", hash = "sha256:b447e63a2bc04fd975fc0480b8d5ebf979179e2c0ae203bf1eff9ea20073bc38", size = 125109, upload-time = "2025-08-13T09:16:40.269Z" },
]
[[package]]
name = "pluggy"
version = "1.6.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
]
[[package]]
name = "pydantic"
version = "2.11.7"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "annotated-types" },
{ name = "pydantic-core" },
{ name = "typing-extensions" },
{ name = "typing-inspection" },
]
sdist = { url = "https://files.pythonhosted.org/packages/00/dd/4325abf92c39ba8623b5af936ddb36ffcfe0beae70405d456ab1fb2f5b8c/pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db", size = 788350, upload-time = "2025-06-14T08:33:17.137Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" },
]
[[package]]
name = "pydantic-core"
version = "2.33.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/46/8c/99040727b41f56616573a28771b1bfa08a3d3fe74d3d513f01251f79f172/pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f", size = 2015688, upload-time = "2025-04-23T18:31:53.175Z" },
{ url = "https://files.pythonhosted.org/packages/3a/cc/5999d1eb705a6cefc31f0b4a90e9f7fc400539b1a1030529700cc1b51838/pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6", size = 1844808, upload-time = "2025-04-23T18:31:54.79Z" },
{ url = "https://files.pythonhosted.org/packages/6f/5e/a0a7b8885c98889a18b6e376f344da1ef323d270b44edf8174d6bce4d622/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef", size = 1885580, upload-time = "2025-04-23T18:31:57.393Z" },
{ url = "https://files.pythonhosted.org/packages/3b/2a/953581f343c7d11a304581156618c3f592435523dd9d79865903272c256a/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a", size = 1973859, upload-time = "2025-04-23T18:31:59.065Z" },
{ url = "https://files.pythonhosted.org/packages/e6/55/f1a813904771c03a3f97f676c62cca0c0a4138654107c1b61f19c644868b/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916", size = 2120810, upload-time = "2025-04-23T18:32:00.78Z" },
{ url = "https://files.pythonhosted.org/packages/aa/c3/053389835a996e18853ba107a63caae0b9deb4a276c6b472931ea9ae6e48/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a", size = 2676498, upload-time = "2025-04-23T18:32:02.418Z" },
{ url = "https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d", size = 2000611, upload-time = "2025-04-23T18:32:04.152Z" },
{ url = "https://files.pythonhosted.org/packages/59/a7/63ef2fed1837d1121a894d0ce88439fe3e3b3e48c7543b2a4479eb99c2bd/pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56", size = 2107924, upload-time = "2025-04-23T18:32:06.129Z" },
{ url = "https://files.pythonhosted.org/packages/04/8f/2551964ef045669801675f1cfc3b0d74147f4901c3ffa42be2ddb1f0efc4/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5", size = 2063196, upload-time = "2025-04-23T18:32:08.178Z" },
{ url = "https://files.pythonhosted.org/packages/26/bd/d9602777e77fc6dbb0c7db9ad356e9a985825547dce5ad1d30ee04903918/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e", size = 2236389, upload-time = "2025-04-23T18:32:10.242Z" },
{ url = "https://files.pythonhosted.org/packages/42/db/0e950daa7e2230423ab342ae918a794964b053bec24ba8af013fc7c94846/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162", size = 2239223, upload-time = "2025-04-23T18:32:12.382Z" },
{ url = "https://files.pythonhosted.org/packages/58/4d/4f937099c545a8a17eb52cb67fe0447fd9a373b348ccfa9a87f141eeb00f/pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849", size = 1900473, upload-time = "2025-04-23T18:32:14.034Z" },
{ url = "https://files.pythonhosted.org/packages/a0/75/4a0a9bac998d78d889def5e4ef2b065acba8cae8c93696906c3a91f310ca/pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9", size = 1955269, upload-time = "2025-04-23T18:32:15.783Z" },
{ url = "https://files.pythonhosted.org/packages/f9/86/1beda0576969592f1497b4ce8e7bc8cbdf614c352426271b1b10d5f0aa64/pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9", size = 1893921, upload-time = "2025-04-23T18:32:18.473Z" },
{ url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" },
{ url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" },
{ url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" },
]
[[package]]
name = "pydantic-settings"
version = "2.10.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pydantic" },
{ name = "python-dotenv" },
{ name = "typing-inspection" },
]
sdist = { url = "https://files.pythonhosted.org/packages/68/85/1ea668bbab3c50071ca613c6ab30047fb36ab0da1b92fa8f17bbc38fd36c/pydantic_settings-2.10.1.tar.gz", hash = "sha256:06f0062169818d0f5524420a360d632d5857b83cffd4d42fe29597807a1614ee", size = 172583, upload-time = "2025-06-24T13:26:46.841Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/58/f0/427018098906416f580e3cf1366d3b1abfb408a0652e9f31600c24a1903c/pydantic_settings-2.10.1-py3-none-any.whl", hash = "sha256:a60952460b99cf661dc25c29c0ef171721f98bfcb52ef8d9ea4c943d7c8cc796", size = 45235, upload-time = "2025-06-24T13:26:45.485Z" },
]
[[package]]
name = "pygments"
version = "2.19.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" },
]
[[package]]
name = "pyright"
version = "1.1.404"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "nodeenv" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e2/6e/026be64c43af681d5632722acd100b06d3d39f383ec382ff50a71a6d5bce/pyright-1.1.404.tar.gz", hash = "sha256:455e881a558ca6be9ecca0b30ce08aa78343ecc031d37a198ffa9a7a1abeb63e", size = 4065679, upload-time = "2025-08-20T18:46:14.029Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/84/30/89aa7f7d7a875bbb9a577d4b1dc5a3e404e3d2ae2657354808e905e358e0/pyright-1.1.404-py3-none-any.whl", hash = "sha256:c7b7ff1fdb7219c643079e4c3e7d4125f0dafcc19d253b47e898d130ea426419", size = 5902951, upload-time = "2025-08-20T18:46:12.096Z" },
]
[[package]]
name = "pytest"
version = "8.4.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" },
{ name = "iniconfig" },
{ name = "packaging" },
{ name = "pluggy" },
{ name = "pygments" },
]
sdist = { url = "https://files.pythonhosted.org/packages/08/ba/45911d754e8eba3d5a841a5ce61a65a685ff1798421ac054f85aa8747dfb/pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c", size = 1517714, upload-time = "2025-06-18T05:48:06.109Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474, upload-time = "2025-06-18T05:48:03.955Z" },
]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "six" },
]
sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
]
[[package]]
name = "python-dotenv"
version = "1.1.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f6/b0/4bc07ccd3572a2f9df7e6782f52b0c6c90dcbb803ac4a167702d7d0dfe1e/python_dotenv-1.1.1.tar.gz", hash = "sha256:a8a6399716257f45be6a007360200409fce5cda2661e3dec71d23dc15f6189ab", size = 41978, upload-time = "2025-06-24T04:21:07.341Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5f/ed/539768cf28c661b5b068d66d96a2f155c4971a5d55684a514c1a0e0dec2f/python_dotenv-1.1.1-py3-none-any.whl", hash = "sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc", size = 20556, upload-time = "2025-06-24T04:21:06.073Z" },
]
[[package]]
name = "pytz"
version = "2025.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
]
[[package]]
name = "pyyaml"
version = "6.0.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631, upload-time = "2024-08-06T20:33:50.674Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309, upload-time = "2024-08-06T20:32:43.4Z" },
{ url = "https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679, upload-time = "2024-08-06T20:32:44.801Z" },
{ url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428, upload-time = "2024-08-06T20:32:46.432Z" },
{ url = "https://files.pythonhosted.org/packages/a3/69/864fbe19e6c18ea3cc196cbe5d392175b4cf3d5d0ac1403ec3f2d237ebb5/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484", size = 763361, upload-time = "2024-08-06T20:32:51.188Z" },
{ url = "https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5", size = 759523, upload-time = "2024-08-06T20:32:53.019Z" },
{ url = "https://files.pythonhosted.org/packages/2b/b2/e3234f59ba06559c6ff63c4e10baea10e5e7df868092bf9ab40e5b9c56b6/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc", size = 726660, upload-time = "2024-08-06T20:32:54.708Z" },
{ url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597, upload-time = "2024-08-06T20:32:56.985Z" },
{ url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527, upload-time = "2024-08-06T20:33:03.001Z" },
{ url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" },
]
[[package]]
name = "readability-lxml"
version = "0.8.4.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "chardet" },
{ name = "cssselect" },
{ name = "lxml", extra = ["html-clean"] },
]
sdist = { url = "https://files.pythonhosted.org/packages/55/3e/dc87d97532ddad58af786ec89c7036182e352574c1cba37bf2bf783d2b15/readability_lxml-0.8.4.1.tar.gz", hash = "sha256:9d2924f5942dd7f37fb4da353263b22a3e877ccf922d0e45e348e4177b035a53", size = 22874, upload-time = "2025-05-03T21:11:45.493Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/75/2cc58965097e351415af420be81c4665cf80da52a17ef43c01ffbe2caf91/readability_lxml-0.8.4.1-py3-none-any.whl", hash = "sha256:874c0cea22c3bf2b78c7f8df831bfaad3c0a89b7301d45a188db581652b4b465", size = 19912, upload-time = "2025-05-03T21:11:43.993Z" },
]
[[package]]
name = "redis"
version = "6.4.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0d/d6/e8b92798a5bd67d659d51a18170e91c16ac3b59738d91894651ee255ed49/redis-6.4.0.tar.gz", hash = "sha256:b01bc7282b8444e28ec36b261df5375183bb47a07eb9c603f284e89cbc5ef010", size = 4647399, upload-time = "2025-08-07T08:10:11.441Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e8/02/89e2ed7e85db6c93dfa9e8f691c5087df4e3551ab39081a4d7c6d1f90e05/redis-6.4.0-py3-none-any.whl", hash = "sha256:f0544fa9604264e9464cdf4814e7d4830f74b165d52f2a330a760a88dd248b7f", size = 279847, upload-time = "2025-08-07T08:10:09.84Z" },
]
[[package]]
name = "regex"
version = "2025.9.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b2/5a/4c63457fbcaf19d138d72b2e9b39405954f98c0349b31c601bfcb151582c/regex-2025.9.1.tar.gz", hash = "sha256:88ac07b38d20b54d79e704e38aa3bd2c0f8027432164226bdee201a1c0c9c9ff", size = 400852, upload-time = "2025-09-01T22:10:10.479Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/98/25/b2959ce90c6138c5142fe5264ee1f9b71a0c502ca4c7959302a749407c79/regex-2025.9.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:bc6834727d1b98d710a63e6c823edf6ffbf5792eba35d3fa119531349d4142ef", size = 485932, upload-time = "2025-09-01T22:08:57.913Z" },
{ url = "https://files.pythonhosted.org/packages/49/2e/6507a2a85f3f2be6643438b7bd976e67ad73223692d6988eb1ff444106d3/regex-2025.9.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c3dc05b6d579875719bccc5f3037b4dc80433d64e94681a0061845bd8863c025", size = 289568, upload-time = "2025-09-01T22:08:59.258Z" },
{ url = "https://files.pythonhosted.org/packages/c7/d8/de4a4b57215d99868f1640e062a7907e185ec7476b4b689e2345487c1ff4/regex-2025.9.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:22213527df4c985ec4a729b055a8306272d41d2f45908d7bacb79be0fa7a75ad", size = 286984, upload-time = "2025-09-01T22:09:00.835Z" },
{ url = "https://files.pythonhosted.org/packages/03/15/e8cb403403a57ed316e80661db0e54d7aa2efcd85cb6156f33cc18746922/regex-2025.9.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8e3f6e3c5a5a1adc3f7ea1b5aec89abfc2f4fbfba55dafb4343cd1d084f715b2", size = 797514, upload-time = "2025-09-01T22:09:02.538Z" },
{ url = "https://files.pythonhosted.org/packages/e4/26/2446f2b9585fed61faaa7e2bbce3aca7dd8df6554c32addee4c4caecf24a/regex-2025.9.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:bcb89c02a0d6c2bec9b0bb2d8c78782699afe8434493bfa6b4021cc51503f249", size = 862586, upload-time = "2025-09-01T22:09:04.322Z" },
{ url = "https://files.pythonhosted.org/packages/fd/b8/82ffbe9c0992c31bbe6ae1c4b4e21269a5df2559102b90543c9b56724c3c/regex-2025.9.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b0e2f95413eb0c651cd1516a670036315b91b71767af83bc8525350d4375ccba", size = 910815, upload-time = "2025-09-01T22:09:05.978Z" },
{ url = "https://files.pythonhosted.org/packages/2f/d8/7303ea38911759c1ee30cc5bc623ee85d3196b733c51fd6703c34290a8d9/regex-2025.9.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:09a41dc039e1c97d3c2ed3e26523f748e58c4de3ea7a31f95e1cf9ff973fff5a", size = 802042, upload-time = "2025-09-01T22:09:07.865Z" },
{ url = "https://files.pythonhosted.org/packages/fc/0e/6ad51a55ed4b5af512bb3299a05d33309bda1c1d1e1808fa869a0bed31bc/regex-2025.9.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4f0b4258b161094f66857a26ee938d3fe7b8a5063861e44571215c44fbf0e5df", size = 786764, upload-time = "2025-09-01T22:09:09.362Z" },
{ url = "https://files.pythonhosted.org/packages/8d/d5/394e3ffae6baa5a9217bbd14d96e0e5da47bb069d0dbb8278e2681a2b938/regex-2025.9.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bf70e18ac390e6977ea7e56f921768002cb0fa359c4199606c7219854ae332e0", size = 856557, upload-time = "2025-09-01T22:09:11.129Z" },
{ url = "https://files.pythonhosted.org/packages/cd/80/b288d3910c41194ad081b9fb4b371b76b0bbfdce93e7709fc98df27b37dc/regex-2025.9.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:b84036511e1d2bb0a4ff1aec26951caa2dea8772b223c9e8a19ed8885b32dbac", size = 849108, upload-time = "2025-09-01T22:09:12.877Z" },
{ url = "https://files.pythonhosted.org/packages/d1/cd/5ec76bf626d0d5abdc277b7a1734696f5f3d14fbb4a3e2540665bc305d85/regex-2025.9.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c2e05dcdfe224047f2a59e70408274c325d019aad96227ab959403ba7d58d2d7", size = 788201, upload-time = "2025-09-01T22:09:14.561Z" },
{ url = "https://files.pythonhosted.org/packages/b5/36/674672f3fdead107565a2499f3007788b878188acec6d42bc141c5366c2c/regex-2025.9.1-cp313-cp313-win32.whl", hash = "sha256:3b9a62107a7441b81ca98261808fed30ae36ba06c8b7ee435308806bd53c1ed8", size = 264508, upload-time = "2025-09-01T22:09:16.193Z" },
{ url = "https://files.pythonhosted.org/packages/83/ad/931134539515eb64ce36c24457a98b83c1b2e2d45adf3254b94df3735a76/regex-2025.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:b38afecc10c177eb34cfae68d669d5161880849ba70c05cbfbe409f08cc939d7", size = 275469, upload-time = "2025-09-01T22:09:17.462Z" },
{ url = "https://files.pythonhosted.org/packages/24/8c/96d34e61c0e4e9248836bf86d69cb224fd222f270fa9045b24e218b65604/regex-2025.9.1-cp313-cp313-win_arm64.whl", hash = "sha256:ec329890ad5e7ed9fc292858554d28d58d56bf62cf964faf0aa57964b21155a0", size = 268586, upload-time = "2025-09-01T22:09:18.948Z" },
{ url = "https://files.pythonhosted.org/packages/21/b1/453cbea5323b049181ec6344a803777914074b9726c9c5dc76749966d12d/regex-2025.9.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:72fb7a016467d364546f22b5ae86c45680a4e0de6b2a6f67441d22172ff641f1", size = 486111, upload-time = "2025-09-01T22:09:20.734Z" },
{ url = "https://files.pythonhosted.org/packages/f6/0e/92577f197bd2f7652c5e2857f399936c1876978474ecc5b068c6d8a79c86/regex-2025.9.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:c9527fa74eba53f98ad86be2ba003b3ebe97e94b6eb2b916b31b5f055622ef03", size = 289520, upload-time = "2025-09-01T22:09:22.249Z" },
{ url = "https://files.pythonhosted.org/packages/af/c6/b472398116cca7ea5a6c4d5ccd0fc543f7fd2492cb0c48d2852a11972f73/regex-2025.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c905d925d194c83a63f92422af7544ec188301451b292c8b487f0543726107ca", size = 287215, upload-time = "2025-09-01T22:09:23.657Z" },
{ url = "https://files.pythonhosted.org/packages/cf/11/f12ecb0cf9ca792a32bb92f758589a84149017467a544f2f6bfb45c0356d/regex-2025.9.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:74df7c74a63adcad314426b1f4ea6054a5ab25d05b0244f0c07ff9ce640fa597", size = 797855, upload-time = "2025-09-01T22:09:25.197Z" },
{ url = "https://files.pythonhosted.org/packages/46/88/bbb848f719a540fb5997e71310f16f0b33a92c5d4b4d72d4311487fff2a3/regex-2025.9.1-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4f6e935e98ea48c7a2e8be44494de337b57a204470e7f9c9c42f912c414cd6f5", size = 863363, upload-time = "2025-09-01T22:09:26.705Z" },
{ url = "https://files.pythonhosted.org/packages/54/a9/2321eb3e2838f575a78d48e03c1e83ea61bd08b74b7ebbdeca8abc50fc25/regex-2025.9.1-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4a62d033cd9ebefc7c5e466731a508dfabee827d80b13f455de68a50d3c2543d", size = 910202, upload-time = "2025-09-01T22:09:28.906Z" },
{ url = "https://files.pythonhosted.org/packages/33/07/d1d70835d7d11b7e126181f316f7213c4572ecf5c5c97bdbb969fb1f38a2/regex-2025.9.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef971ebf2b93bdc88d8337238be4dfb851cc97ed6808eb04870ef67589415171", size = 801808, upload-time = "2025-09-01T22:09:30.733Z" },
{ url = "https://files.pythonhosted.org/packages/13/d1/29e4d1bed514ef2bf3a4ead3cb8bb88ca8af94130239a4e68aa765c35b1c/regex-2025.9.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d936a1db208bdca0eca1f2bb2c1ba1d8370b226785c1e6db76e32a228ffd0ad5", size = 786824, upload-time = "2025-09-01T22:09:32.61Z" },
{ url = "https://files.pythonhosted.org/packages/33/27/20d8ccb1bee460faaa851e6e7cc4cfe852a42b70caa1dca22721ba19f02f/regex-2025.9.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:7e786d9e4469698fc63815b8de08a89165a0aa851720eb99f5e0ea9d51dd2b6a", size = 857406, upload-time = "2025-09-01T22:09:34.117Z" },
{ url = "https://files.pythonhosted.org/packages/74/fe/60c6132262dc36430d51e0c46c49927d113d3a38c1aba6a26c7744c84cf3/regex-2025.9.1-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:6b81d7dbc5466ad2c57ce3a0ddb717858fe1a29535c8866f8514d785fdb9fc5b", size = 848593, upload-time = "2025-09-01T22:09:35.598Z" },
{ url = "https://files.pythonhosted.org/packages/cc/ae/2d4ff915622fabbef1af28387bf71e7f2f4944a348b8460d061e85e29bf0/regex-2025.9.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:cd4890e184a6feb0ef195338a6ce68906a8903a0f2eb7e0ab727dbc0a3156273", size = 787951, upload-time = "2025-09-01T22:09:37.139Z" },
{ url = "https://files.pythonhosted.org/packages/85/37/dc127703a9e715a284cc2f7dbdd8a9776fd813c85c126eddbcbdd1ca5fec/regex-2025.9.1-cp314-cp314-win32.whl", hash = "sha256:34679a86230e46164c9e0396b56cab13c0505972343880b9e705083cc5b8ec86", size = 269833, upload-time = "2025-09-01T22:09:39.245Z" },
{ url = "https://files.pythonhosted.org/packages/83/bf/4bed4d3d0570e16771defd5f8f15f7ea2311edcbe91077436d6908956c4a/regex-2025.9.1-cp314-cp314-win_amd64.whl", hash = "sha256:a1196e530a6bfa5f4bde029ac5b0295a6ecfaaffbfffede4bbaf4061d9455b70", size = 278742, upload-time = "2025-09-01T22:09:40.651Z" },
{ url = "https://files.pythonhosted.org/packages/cf/3e/7d7ac6fd085023312421e0d69dfabdfb28e116e513fadbe9afe710c01893/regex-2025.9.1-cp314-cp314-win_arm64.whl", hash = "sha256:f46d525934871ea772930e997d577d48c6983e50f206ff7b66d4ac5f8941e993", size = 271860, upload-time = "2025-09-01T22:09:42.413Z" },
]
[[package]]
name = "requests"
version = "2.32.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "charset-normalizer" },
{ name = "idna" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" },
]
[[package]]
name = "rich"
version = "14.1.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "markdown-it-py" },
{ name = "pygments" },
]
sdist = { url = "https://files.pythonhosted.org/packages/fe/75/af448d8e52bf1d8fa6a9d089ca6c07ff4453d86c65c145d0a300bb073b9b/rich-14.1.0.tar.gz", hash = "sha256:e497a48b844b0320d45007cdebfeaeed8db2a4f4bcf49f15e455cfc4af11eaa8", size = 224441, upload-time = "2025-07-25T07:32:58.125Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e3/30/3c4d035596d3cf444529e0b2953ad0466f6049528a879d27534700580395/rich-14.1.0-py3-none-any.whl", hash = "sha256:536f5f1785986d6dbdea3c75205c473f970777b4a0d6c6dd1b696aa05a3fa04f", size = 243368, upload-time = "2025-07-25T07:32:56.73Z" },
]
[[package]]
name = "rq"
version = "2.5.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
{ name = "croniter" },
{ name = "redis" },
]
sdist = { url = "https://files.pythonhosted.org/packages/48/1c/1c390fd8594e7367c1ee672297f7a877c0982b9c26877242c5a509ad27c0/rq-2.5.0.tar.gz", hash = "sha256:b55d328fcaeaf25823b8b8450283225f8048bd1c52abaaca192c99201ab5c687", size = 666978, upload-time = "2025-08-15T10:41:34.84Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/14/36/8917bcfc9794cbc4dd984962feb401f2dfeee0d89e1e40e3367420996f42/rq-2.5.0-py3-none-any.whl", hash = "sha256:90c74eb5b5793ff08e6c3391fd6deb7151f308ac8f04b6831580b38e90688155", size = 108377, upload-time = "2025-08-15T10:41:21.792Z" },
]
[[package]]
name = "ruff"
version = "0.12.9"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/4a/45/2e403fa7007816b5fbb324cb4f8ed3c7402a927a0a0cb2b6279879a8bfdc/ruff-0.12.9.tar.gz", hash = "sha256:fbd94b2e3c623f659962934e52c2bea6fc6da11f667a427a368adaf3af2c866a", size = 5254702, upload-time = "2025-08-14T16:08:55.2Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ad/20/53bf098537adb7b6a97d98fcdebf6e916fcd11b2e21d15f8c171507909cc/ruff-0.12.9-py3-none-linux_armv6l.whl", hash = "sha256:fcebc6c79fcae3f220d05585229463621f5dbf24d79fdc4936d9302e177cfa3e", size = 11759705, upload-time = "2025-08-14T16:08:12.968Z" },
{ url = "https://files.pythonhosted.org/packages/20/4d/c764ee423002aac1ec66b9d541285dd29d2c0640a8086c87de59ebbe80d5/ruff-0.12.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:aed9d15f8c5755c0e74467731a007fcad41f19bcce41cd75f768bbd687f8535f", size = 12527042, upload-time = "2025-08-14T16:08:16.54Z" },
{ url = "https://files.pythonhosted.org/packages/8b/45/cfcdf6d3eb5fc78a5b419e7e616d6ccba0013dc5b180522920af2897e1be/ruff-0.12.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5b15ea354c6ff0d7423814ba6d44be2807644d0c05e9ed60caca87e963e93f70", size = 11724457, upload-time = "2025-08-14T16:08:18.686Z" },
{ url = "https://files.pythonhosted.org/packages/72/e6/44615c754b55662200c48bebb02196dbb14111b6e266ab071b7e7297b4ec/ruff-0.12.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d596c2d0393c2502eaabfef723bd74ca35348a8dac4267d18a94910087807c53", size = 11949446, upload-time = "2025-08-14T16:08:21.059Z" },
{ url = "https://files.pythonhosted.org/packages/fd/d1/9b7d46625d617c7df520d40d5ac6cdcdf20cbccb88fad4b5ecd476a6bb8d/ruff-0.12.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1b15599931a1a7a03c388b9c5df1bfa62be7ede6eb7ef753b272381f39c3d0ff", size = 11566350, upload-time = "2025-08-14T16:08:23.433Z" },
{ url = "https://files.pythonhosted.org/packages/59/20/b73132f66f2856bc29d2d263c6ca457f8476b0bbbe064dac3ac3337a270f/ruff-0.12.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3d02faa2977fb6f3f32ddb7828e212b7dd499c59eb896ae6c03ea5c303575756", size = 13270430, upload-time = "2025-08-14T16:08:25.837Z" },
{ url = "https://files.pythonhosted.org/packages/a2/21/eaf3806f0a3d4c6be0a69d435646fba775b65f3f2097d54898b0fd4bb12e/ruff-0.12.9-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:17d5b6b0b3a25259b69ebcba87908496e6830e03acfb929ef9fd4c58675fa2ea", size = 14264717, upload-time = "2025-08-14T16:08:27.907Z" },
{ url = "https://files.pythonhosted.org/packages/d2/82/1d0c53bd37dcb582b2c521d352fbf4876b1e28bc0d8894344198f6c9950d/ruff-0.12.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:72db7521860e246adbb43f6ef464dd2a532ef2ef1f5dd0d470455b8d9f1773e0", size = 13684331, upload-time = "2025-08-14T16:08:30.352Z" },
{ url = "https://files.pythonhosted.org/packages/3b/2f/1c5cf6d8f656306d42a686f1e207f71d7cebdcbe7b2aa18e4e8a0cb74da3/ruff-0.12.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a03242c1522b4e0885af63320ad754d53983c9599157ee33e77d748363c561ce", size = 12739151, upload-time = "2025-08-14T16:08:32.55Z" },
{ url = "https://files.pythonhosted.org/packages/47/09/25033198bff89b24d734e6479e39b1968e4c992e82262d61cdccaf11afb9/ruff-0.12.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fc83e4e9751e6c13b5046d7162f205d0a7bac5840183c5beebf824b08a27340", size = 12954992, upload-time = "2025-08-14T16:08:34.816Z" },
{ url = "https://files.pythonhosted.org/packages/52/8e/d0dbf2f9dca66c2d7131feefc386523404014968cd6d22f057763935ab32/ruff-0.12.9-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:881465ed56ba4dd26a691954650de6ad389a2d1fdb130fe51ff18a25639fe4bb", size = 12899569, upload-time = "2025-08-14T16:08:36.852Z" },
{ url = "https://files.pythonhosted.org/packages/a0/bd/b614d7c08515b1428ed4d3f1d4e3d687deffb2479703b90237682586fa66/ruff-0.12.9-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:43f07a3ccfc62cdb4d3a3348bf0588358a66da756aa113e071b8ca8c3b9826af", size = 11751983, upload-time = "2025-08-14T16:08:39.314Z" },
{ url = "https://files.pythonhosted.org/packages/58/d6/383e9f818a2441b1a0ed898d7875f11273f10882f997388b2b51cb2ae8b5/ruff-0.12.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:07adb221c54b6bba24387911e5734357f042e5669fa5718920ee728aba3cbadc", size = 11538635, upload-time = "2025-08-14T16:08:41.297Z" },
{ url = "https://files.pythonhosted.org/packages/20/9c/56f869d314edaa9fc1f491706d1d8a47747b9d714130368fbd69ce9024e9/ruff-0.12.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f5cd34fabfdea3933ab85d72359f118035882a01bff15bd1d2b15261d85d5f66", size = 12534346, upload-time = "2025-08-14T16:08:43.39Z" },
{ url = "https://files.pythonhosted.org/packages/bd/4b/d8b95c6795a6c93b439bc913ee7a94fda42bb30a79285d47b80074003ee7/ruff-0.12.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:f6be1d2ca0686c54564da8e7ee9e25f93bdd6868263805f8c0b8fc6a449db6d7", size = 13017021, upload-time = "2025-08-14T16:08:45.889Z" },
{ url = "https://files.pythonhosted.org/packages/c7/c1/5f9a839a697ce1acd7af44836f7c2181cdae5accd17a5cb85fcbd694075e/ruff-0.12.9-py3-none-win32.whl", hash = "sha256:cc7a37bd2509974379d0115cc5608a1a4a6c4bff1b452ea69db83c8855d53f93", size = 11734785, upload-time = "2025-08-14T16:08:48.062Z" },
{ url = "https://files.pythonhosted.org/packages/fa/66/cdddc2d1d9a9f677520b7cfc490d234336f523d4b429c1298de359a3be08/ruff-0.12.9-py3-none-win_amd64.whl", hash = "sha256:6fb15b1977309741d7d098c8a3cb7a30bc112760a00fb6efb7abc85f00ba5908", size = 12840654, upload-time = "2025-08-14T16:08:50.158Z" },
{ url = "https://files.pythonhosted.org/packages/ac/fd/669816bc6b5b93b9586f3c1d87cd6bc05028470b3ecfebb5938252c47a35/ruff-0.12.9-py3-none-win_arm64.whl", hash = "sha256:63c8c819739d86b96d500cce885956a1a48ab056bbcbc61b747ad494b2485089", size = 11949623, upload-time = "2025-08-14T16:08:52.233Z" },
]
[[package]]
name = "selectolax"
version = "0.3.34"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/bf/8c/8bbe1b17098b4e2a63a251361870303c37ad4c3170536277096575c24ca4/selectolax-0.3.34.tar.gz", hash = "sha256:c2cdb30b60994f1e0b74574dd408f1336d2fadd68a3ebab8ea573740dcbf17e2", size = 4706599, upload-time = "2025-08-28T23:17:44.131Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d0/29/eeb77d1a77599023387d4d00655960dfa3d760557b42a65ef347e29b40b0/selectolax-0.3.34-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2bb74e079098d758bd3d5c77b1c66c90098de305e4084b60981e561acf52c12a", size = 2001199, upload-time = "2025-08-28T23:16:59.467Z" },
{ url = "https://files.pythonhosted.org/packages/21/80/326b9dd2901b64c3c654db9e8841ddc412b9c2af0047b7d43290bbb276be/selectolax-0.3.34-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cc39822f714e6e434ceb893e1ccff873f3f88c8db8226ba2f8a5f4a7a0e2aa29", size = 1994171, upload-time = "2025-08-28T23:17:01.206Z" },
{ url = "https://files.pythonhosted.org/packages/15/af/1265e4f9429b3c3cf098ba08cb3264d7e16990ed3029d89e9890012aae76/selectolax-0.3.34-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:181b67949ec23b4f11b6f2e426ba9904dd25c73d12c2cb22caf8fae21a363e99", size = 2196092, upload-time = "2025-08-28T23:17:02.574Z" },
{ url = "https://files.pythonhosted.org/packages/1c/41/e67100abd8b0b2a5e1d5d7fa864c31d31e9a2c0bbd08ce4e951235f13143/selectolax-0.3.34-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0b09f9d7b22bbb633966ac2019ec059caf735a5bdb4a5784bab0f4db2198fd6a", size = 2233674, upload-time = "2025-08-28T23:17:03.928Z" },
{ url = "https://files.pythonhosted.org/packages/3a/24/7ad043805c9292b4f535071c223d10aad7703b4460d68de1dce9dcf21d3f/selectolax-0.3.34-cp313-cp313-win32.whl", hash = "sha256:6e2ae8a984f82c9373e8a5ec0450f67603fde843fed73675f5187986e9e45b59", size = 1686489, upload-time = "2025-08-28T23:17:05.341Z" },
{ url = "https://files.pythonhosted.org/packages/6b/79/62666fbfcd847c0cfc2b75b496bfa8382d765e7a3d5a2c792004760a6e61/selectolax-0.3.34-cp313-cp313-win_amd64.whl", hash = "sha256:96acd5414aaf0bb8677258ff7b0f494953b2621f71be1e3d69e01743545509ec", size = 1789924, upload-time = "2025-08-28T23:17:06.708Z" },
{ url = "https://files.pythonhosted.org/packages/5d/b5/0bb579210a7de36d97c359016e77119513d3e810c61e99ade72089bc1b4d/selectolax-0.3.34-cp313-cp313-win_arm64.whl", hash = "sha256:1d309fd17ba72bb46a282154f75752ed7746de6f00e2c1eec4cd421dcdadf008", size = 1737480, upload-time = "2025-08-28T23:17:08.575Z" },
{ url = "https://files.pythonhosted.org/packages/b8/5c/ab87e8ecb3c6aa1053d1c6d1eba0e47e292cc72aff0f6fbb89d920d4d87c/selectolax-0.3.34-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:3e9c4197563c9b62b56dd7545bfd993ce071fd40b8779736e9bc59813f014c23", size = 2000587, upload-time = "2025-08-28T23:17:10.327Z" },
{ url = "https://files.pythonhosted.org/packages/72/8e/5c08bd5628f73ab582696f8349138a569115a0fd6ab71842e4115ceec4ff/selectolax-0.3.34-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f96eaa0da764a4b9e08e792c0f17cce98749f1406ffad35e6d4835194570bdbf", size = 1994327, upload-time = "2025-08-28T23:17:11.709Z" },
{ url = "https://files.pythonhosted.org/packages/ac/29/02b22eff289b29ee3f869a85e4be4f7f3cf4b480d429bb18aab014848917/selectolax-0.3.34-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:412ce46d963444cd378e9f3197a2f30b05d858722677a361fc44ad244d2bb7db", size = 2201620, upload-time = "2025-08-28T23:17:13.538Z" },
{ url = "https://files.pythonhosted.org/packages/6d/d3/bdd3a94bb1276be4ef4371dbfd254137b22f5c54a94d051a8d72c3956dc6/selectolax-0.3.34-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:58dd7dc062b0424adb001817bf9b05476d165a4db1885a69cac66ca16b313035", size = 2233487, upload-time = "2025-08-28T23:17:14.921Z" },
{ url = "https://files.pythonhosted.org/packages/e6/6a/5d551c570f29bfca5815f45fa6e6a3310cc5bc6c9b1073a968d71f73612b/selectolax-0.3.34-cp314-cp314-win32.whl", hash = "sha256:4255558fa48e3685a13f3d9dfc84586146c7b0b86e44c899ac2ac263357c987f", size = 1779755, upload-time = "2025-08-28T23:17:16.322Z" },
{ url = "https://files.pythonhosted.org/packages/cc/dc/5def41b07cb3b917841022489e6bd6c3277363c23b44eca00a0ada93221c/selectolax-0.3.34-cp314-cp314-win_amd64.whl", hash = "sha256:6cbf2707d79afd7e15083f3f32c11c9b6e39a39026c8b362ce25959842a837b6", size = 1877332, upload-time = "2025-08-28T23:17:17.766Z" },
{ url = "https://files.pythonhosted.org/packages/19/0f/63da99be8f78bbfca0cb3f9ad71b7475ab97383f830c86a9abd29c6d3f25/selectolax-0.3.34-cp314-cp314-win_arm64.whl", hash = "sha256:3aa83e4d1f5f5534c9d9e44fc53640c82edc7d0eef6fca0829830cccc8df9568", size = 1831124, upload-time = "2025-08-28T23:17:19.744Z" },
{ url = "https://files.pythonhosted.org/packages/39/5c/07d8031c6c106de10ff42b4440ad7fa6a038650942bb2e194e4eb9ffec6d/selectolax-0.3.34-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:bb0b9002974ec7052f7eb1439b8e404e11a00a26affcbdd73fc53fc55beec809", size = 2023889, upload-time = "2025-08-28T23:17:21.222Z" },
{ url = "https://files.pythonhosted.org/packages/fd/80/fa8220c2eae44928b5ae73eccd44baedb328109f115c948d796c46d11048/selectolax-0.3.34-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:38e5fdffab6d08800a19671ac9641ff9ca6738fad42090f4dd0da76e4db29582", size = 2011882, upload-time = "2025-08-28T23:17:22.844Z" },
{ url = "https://files.pythonhosted.org/packages/f6/02/657089f68f59308bd90137102a7f6da0c3770128ae7245e1290e99f5a48d/selectolax-0.3.34-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:871d35e19dfde9ee83c1df139940c2e5cdf6a50ef3d147a0e9acf382b63b5b3e", size = 2221871, upload-time = "2025-08-28T23:17:24.259Z" },
{ url = "https://files.pythonhosted.org/packages/d2/56/1ad7877f9b2b12f616a8847eca0a3047c6b5ed14588f21fe1f6915357efb/selectolax-0.3.34-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f3f269bc53bc84ccc166704263712f4448130ec827a38a0df230cffe3dc46a9", size = 2241032, upload-time = "2025-08-28T23:17:25.76Z" },
{ url = "https://files.pythonhosted.org/packages/60/c0/30ce665b7382f663fdbb282748ddee392a61c85f51862776b128d8644d45/selectolax-0.3.34-cp314-cp314t-win32.whl", hash = "sha256:b957d105c2f3d86de872f61be1c9a92e1d84580a5ec89a413282f60ffb3f7bc1", size = 1828494, upload-time = "2025-08-28T23:17:27.447Z" },
{ url = "https://files.pythonhosted.org/packages/a4/9e/11d023ad74d0d1a48cefdddbb2d00365c4d9a97735d7c24c0f206cd1babb/selectolax-0.3.34-cp314-cp314t-win_amd64.whl", hash = "sha256:9c609d639ce09154d688063bb830dc351fb944fa52629e25717dbab45ad04327", size = 1951608, upload-time = "2025-08-28T23:17:29.327Z" },
{ url = "https://files.pythonhosted.org/packages/cc/20/a5f93b84e3e6de9756dc82465c0dff57b1c8a25b1815bca0817e4342494c/selectolax-0.3.34-cp314-cp314t-win_arm64.whl", hash = "sha256:6359e94d66fb4fce9fb7c9d18252c3d8cba28b90f7412da8ce610bd77746f750", size = 1852855, upload-time = "2025-08-28T23:17:30.746Z" },
]
[[package]]
name = "setuptools"
version = "80.9.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/18/5d/3bf57dcd21979b887f014ea83c24ae194cfcd12b9e0fda66b957c69d1fca/setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c", size = 1319958, upload-time = "2025-05-27T00:56:51.443Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922", size = 1201486, upload-time = "2025-05-27T00:56:49.664Z" },
]
[[package]]
name = "shellingham"
version = "1.5.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" },
]
[[package]]
name = "six"
version = "1.17.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
]
[[package]]
name = "sniffio"
version = "1.3.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
]
[[package]]
name = "soupsieve"
version = "2.8"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/6d/e6/21ccce3262dd4889aa3332e5a119a3491a95e8f60939870a3a035aabac0d/soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f", size = 103472, upload-time = "2025-08-27T15:39:51.78Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/14/a0/bb38d3b76b8cae341dad93a2dd83ab7462e6dbcdd84d43f54ee60a8dc167/soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c", size = 36679, upload-time = "2025-08-27T15:39:50.179Z" },
]
[[package]]
name = "stevedore"
version = "5.4.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pbr" },
]
sdist = { url = "https://files.pythonhosted.org/packages/28/3f/13cacea96900bbd31bb05c6b74135f85d15564fc583802be56976c940470/stevedore-5.4.1.tar.gz", hash = "sha256:3135b5ae50fe12816ef291baff420acb727fcd356106e3e9cbfa9e5985cd6f4b", size = 513858, upload-time = "2025-02-20T14:03:57.285Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f7/45/8c4ebc0c460e6ec38e62ab245ad3c7fc10b210116cea7c16d61602aa9558/stevedore-5.4.1-py3-none-any.whl", hash = "sha256:d10a31c7b86cba16c1f6e8d15416955fc797052351a56af15e608ad20811fcfe", size = 49533, upload-time = "2025-02-20T14:03:55.849Z" },
]
[[package]]
name = "tiktoken"
version = "0.12.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "regex" },
{ name = "requests" },
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/00/61/441588ee21e6b5cdf59d6870f86beb9789e532ee9718c251b391b70c68d6/tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3", size = 1050802, upload-time = "2025-10-06T20:22:00.96Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1f/05/dcf94486d5c5c8d34496abe271ac76c5b785507c8eae71b3708f1ad9b45a/tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160", size = 993995, upload-time = "2025-10-06T20:22:02.788Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a0/70/5163fe5359b943f8db9946b62f19be2305de8c3d78a16f629d4165e2f40e/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa", size = 1128948, upload-time = "2025-10-06T20:22:03.814Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0c/da/c028aa0babf77315e1cef357d4d768800c5f8a6de04d0eac0f377cb619fa/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be", size = 1151986, upload-time = "2025-10-06T20:22:05.173Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a0/5a/886b108b766aa53e295f7216b509be95eb7d60b166049ce2c58416b25f2a/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a", size = 1194222, upload-time = "2025-10-06T20:22:06.265Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f4/f8/4db272048397636ac7a078d22773dd2795b1becee7bc4922fe6207288d57/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3", size = 1255097, upload-time = "2025-10-06T20:22:07.403Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8e/32/45d02e2e0ea2be3a9ed22afc47d93741247e75018aac967b713b2941f8ea/tiktoken-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697", size = 879117, upload-time = "2025-10-06T20:22:08.418Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ce/76/994fc868f88e016e6d05b0da5ac24582a14c47893f4474c3e9744283f1d5/tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16", size = 1050309, upload-time = "2025-10-06T20:22:10.939Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f6/b8/57ef1456504c43a849821920d582a738a461b76a047f352f18c0b26c6516/tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a", size = 993712, upload-time = "2025-10-06T20:22:12.115Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/72/90/13da56f664286ffbae9dbcfadcc625439142675845baa62715e49b87b68b/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27", size = 1128725, upload-time = "2025-10-06T20:22:13.541Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/05/df/4f80030d44682235bdaecd7346c90f67ae87ec8f3df4a3442cb53834f7e4/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb", size = 1151875, upload-time = "2025-10-06T20:22:14.559Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/22/1f/ae535223a8c4ef4c0c1192e3f9b82da660be9eb66b9279e95c99288e9dab/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e", size = 1194451, upload-time = "2025-10-06T20:22:15.545Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/78/a7/f8ead382fce0243cb625c4f266e66c27f65ae65ee9e77f59ea1653b6d730/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25", size = 1253794, upload-time = "2025-10-06T20:22:16.624Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/93/e0/6cc82a562bc6365785a3ff0af27a2a092d57c47d7a81d9e2295d8c36f011/tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f", size = 878777, upload-time = "2025-10-06T20:22:18.036Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/72/05/3abc1db5d2c9aadc4d2c76fa5640134e475e58d9fbb82b5c535dc0de9b01/tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646", size = 1050188, upload-time = "2025-10-06T20:22:19.563Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e3/7b/50c2f060412202d6c95f32b20755c7a6273543b125c0985d6fa9465105af/tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88", size = 993978, upload-time = "2025-10-06T20:22:20.702Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/14/27/bf795595a2b897e271771cd31cb847d479073497344c637966bdf2853da1/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff", size = 1129271, upload-time = "2025-10-06T20:22:22.06Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f5/de/9341a6d7a8f1b448573bbf3425fa57669ac58258a667eb48a25dfe916d70/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830", size = 1151216, upload-time = "2025-10-06T20:22:23.085Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/75/0d/881866647b8d1be4d67cb24e50d0c26f9f807f994aa1510cb9ba2fe5f612/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b", size = 1194860, upload-time = "2025-10-06T20:22:24.602Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b3/1e/b651ec3059474dab649b8d5b69f5c65cd8fcd8918568c1935bd4136c9392/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b", size = 1254567, upload-time = "2025-10-06T20:22:25.671Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/80/57/ce64fd16ac390fafde001268c364d559447ba09b509181b2808622420eec/tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3", size = 921067, upload-time = "2025-10-06T20:22:26.753Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ac/a4/72eed53e8976a099539cdd5eb36f241987212c29629d0a52c305173e0a68/tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365", size = 1050473, upload-time = "2025-10-06T20:22:27.775Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e6/d7/0110b8f54c008466b19672c615f2168896b83706a6611ba6e47313dbc6e9/tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e", size = 993855, upload-time = "2025-10-06T20:22:28.799Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5f/77/4f268c41a3957c418b084dd576ea2fad2e95da0d8e1ab705372892c2ca22/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63", size = 1129022, upload-time = "2025-10-06T20:22:29.981Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4e/2b/fc46c90fe5028bd094cd6ee25a7db321cb91d45dc87531e2bdbb26b4867a/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0", size = 1150736, upload-time = "2025-10-06T20:22:30.996Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/28/c0/3c7a39ff68022ddfd7d93f3337ad90389a342f761c4d71de99a3ccc57857/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a", size = 1194908, upload-time = "2025-10-06T20:22:32.073Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706, upload-time = "2025-10-06T20:22:33.385Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tld"
|
||||
version = "0.13.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/df/a1/5723b07a70c1841a80afc9ac572fdf53488306848d844cd70519391b0d26/tld-0.13.1.tar.gz", hash = "sha256:75ec00936cbcf564f67361c41713363440b6c4ef0f0c1592b5b0fbe72c17a350", size = 462000, upload-time = "2025-05-21T22:18:29.341Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/dc/70/b2f38360c3fc4bc9b5e8ef429e1fde63749144ac583c2dbdf7e21e27a9ad/tld-0.13.1-py2.py3-none-any.whl", hash = "sha256:a2d35109433ac83486ddf87e3c4539ab2c5c2478230e5d9c060a18af4b03aa7c", size = 274718, upload-time = "2025-05-21T22:18:25.811Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "trafilatura"
|
||||
version = "2.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "certifi" },
|
||||
{ name = "charset-normalizer" },
|
||||
{ name = "courlan" },
|
||||
{ name = "htmldate" },
|
||||
{ name = "justext" },
|
||||
{ name = "lxml" },
|
||||
{ name = "urllib3" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/06/25/e3ebeefdebfdfae8c4a4396f5a6ea51fc6fa0831d63ce338e5090a8003dc/trafilatura-2.0.0.tar.gz", hash = "sha256:ceb7094a6ecc97e72fea73c7dba36714c5c5b577b6470e4520dca893706d6247", size = 253404, upload-time = "2024-12-03T15:23:24.16Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/8a/b6/097367f180b6383a3581ca1b86fcae284e52075fa941d1232df35293363c/trafilatura-2.0.0-py3-none-any.whl", hash = "sha256:77eb5d1e993747f6f20938e1de2d840020719735690c840b9a1024803a4cd51d", size = 132557, upload-time = "2024-12-03T15:23:21.41Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typer"
|
||||
version = "0.16.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "click" },
|
||||
{ name = "rich" },
|
||||
{ name = "shellingham" },
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/43/78/d90f616bf5f88f8710ad067c1f8705bf7618059836ca084e5bb2a0855d75/typer-0.16.1.tar.gz", hash = "sha256:d358c65a464a7a90f338e3bb7ff0c74ac081449e53884b12ba658cbd72990614", size = 102836, upload-time = "2025-08-18T19:18:22.898Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/2d/76/06dbe78f39b2203d2a47d5facc5df5102d0561e2807396471b5f7c5a30a1/typer-0.16.1-py3-none-any.whl", hash = "sha256:90ee01cb02d9b8395ae21ee3368421faf21fa138cb2a541ed369c08cec5237c9", size = 46397, upload-time = "2025-08-18T19:18:21.663Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typing-extensions"
|
||||
version = "4.14.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/98/5a/da40306b885cc8c09109dc2e1abd358d5684b1425678151cdaed4731c822/typing_extensions-4.14.1.tar.gz", hash = "sha256:38b39f4aeeab64884ce9f74c94263ef78f3c22467c8724005483154c26648d36", size = 107673, upload-time = "2025-07-04T13:28:34.16Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl", hash = "sha256:d1e1e3b58374dc93031d6eda2420a48ea44a36c2b4766a4fdeb3710755731d76", size = 43906, upload-time = "2025-07-04T13:28:32.743Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typing-inspection"
|
||||
version = "0.4.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f8/b1/0c11f5058406b3af7609f121aaa6b609744687f1d158b3c3a5bf4cc94238/typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28", size = 75726, upload-time = "2025-05-21T18:55:23.885Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51", size = 14552, upload-time = "2025-05-21T18:55:22.152Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tzdata"
|
||||
version = "2025.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tzlocal"
|
||||
version = "5.3.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "tzdata", marker = "sys_platform == 'win32'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/8b/2e/c14812d3d4d9cd1773c6be938f89e5735a1f11a9f184ac3639b93cef35d5/tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd", size = 30761, upload-time = "2025-03-05T21:17:41.549Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/c2/14/e2a54fabd4f08cd7af1c07030603c3356b74da07f7cc056e600436edfa17/tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d", size = 18026, upload-time = "2025-03-05T21:17:39.857Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "urllib3"
|
||||
version = "2.5.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "uv-build"
|
||||
version = "0.8.12"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/58/1d/109827cffcdd2430783450591083a3cc9b80c8d34f962ff86e00a7d73eaf/uv_build-0.8.12.tar.gz", hash = "sha256:49666685059bf5c62e5634371b00b2012ebe3e4e4d0f479cff0400bf66ad1e3a", size = 322245, upload-time = "2025-08-18T23:59:48.408Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/6d/6e/75995ef959314680fc127c3d947bc2dec1fed57a0fb400b81270dda01132/uv_build-0.8.12-py3-none-linux_armv6l.whl", hash = "sha256:03cd118ae8731aeca7994a48d6f23a5d4aacef5ee9c88bc60daf99ad698cefae", size = 1318465, upload-time = "2025-08-18T23:59:19.615Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fc/55/fa65b463af6b2c1738b81d6153975ca3b1a07056552f0993c2cf7b324018/uv_build-0.8.12-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:23d3d46cd619640b4b3e2977cfe629fb898586d21b8b641c9385021b1755fde5", size = 1299484, upload-time = "2025-08-18T23:59:23.737Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/55/21/14fb0309c64e324f13f309460fc5a1ebf4872c1f91be89d50039c8e3a91c/uv_build-0.8.12-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a6676b94db118f4b3e903acf52f4acc6e8b558330d576a8438181726b47bad15", size = 1177028, upload-time = "2025-08-18T23:59:25.052Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/dc/ae/61ebacd6b43f97300409412ba99d274305919bbda367c44ea4b114c91ac5/uv_build-0.8.12-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:2135094eab1657c121a74176a41f2ad30066962f476dac11b6c48ad6cb279392", size = 1367327, upload-time = "2025-08-18T23:59:26.676Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d4/f7/d8c29e322ecb569774e90f3e9a1b8018465a4c88e62c6083aa91f7c53de9/uv_build-0.8.12-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:20199b48eebf3a07046d5988b4eca8c3a8c83e50299e8e6bba085bf8f2e02611", size = 1274839, upload-time = "2025-08-18T23:59:28.034Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a3/be/63ef8eb542b98d3d4536b8519f9e4d4dbf8f52443975740be9f833fa4985/uv_build-0.8.12-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9fdd226820cfdba719779f4ccbf594258177f67ef1907141a8b959757c26d55c", size = 1426207, upload-time = "2025-08-18T23:59:29.687Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/80/b0/3ea05c1cdbc32fd13e0e97d56e8b3be4cd350ed5e6d9aa137ebe65afb5ae/uv_build-0.8.12-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:9c76003c6af6c6949f796448458bb104c5d3f7d9a1ced3f3aeed613e2f47677e", size = 1577750, upload-time = "2025-08-18T23:59:30.983Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/53/ed/1391d420efdbeb07353db1404e34830a322fe2efb64853c0d4fcda315276/uv_build-0.8.12-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dfe32cdb94c85981597d40efc08c01ff30267db18935df50ffcef1258e091d52", size = 1481257, upload-time = "2025-08-18T23:59:32.248Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/26/28/bc6c7d00fb3a4713f85359c8687067111021542f379d5ff49136cfbe9b64/uv_build-0.8.12-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a96aa67f8071a025b41abc661ddd0cec2731d1530095479f2b810b1c04a09252", size = 1418075, upload-time = "2025-08-18T23:59:33.961Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/23/05/39236c6e86a5d49a0d4c80064907665db34a8c180ba3110bca436ddbb8f3/uv_build-0.8.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6878f2179dafb1053a413ad41f2f9640655489972bec6211aaf8d492b49614af", size = 1421678, upload-time = "2025-08-18T23:59:35.653Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/66/d7/731bec1f5955de6ea33cffcf568a81375dfe80e17215dd66cdf659fcd28c/uv_build-0.8.12-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:7bce23578e8abbb40fd70aebed1afd27d132915e451551322f10aa304dd8bf26", size = 1365561, upload-time = "2025-08-18T23:59:37.664Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/bb/b8/1219fa9d21c1deacd8d8b9f4b4193596ea6cdbef718e299b371354c19897/uv_build-0.8.12-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:a9c57674dd757f8208b6e4929abd5bcb6b63bab1ea5fab0f3feaa4c40236c7dd", size = 1375369, upload-time = "2025-08-18T23:59:38.948Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ae/31/700da060b59d4bb163f146d2f673292937595efa77e71a73842b945e49c7/uv_build-0.8.12-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:021a75dec60bf14f0bebdf10aafa08a03ad5d2c9bfd82565b77ac56a82316911", size = 1290573, upload-time = "2025-08-18T23:59:40.223Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d5/9b/711a875605583bed36ff18ccd5351f2582cafedef4720a667e90e6023e3a/uv_build-0.8.12-py3-none-musllinux_1_1_i686.whl", hash = "sha256:2884df52ef9c47bccebf0f616380b281078a4e50fd29a6d44e841f2e2532f687", size = 1380155, upload-time = "2025-08-18T23:59:41.868Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/67/31/4b0269dbebd18e406ec565ead0c0b05909d255cd4650dfac1b198542e92d/uv_build-0.8.12-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:c8072519032f4c90e36ea4650fa4a86a30a6d3355082a31f996e7c9e6a6e92f6", size = 1462583, upload-time = "2025-08-18T23:59:43.164Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f3/01/2d47a047109ac53d40c3912d15a4aeadfa67c3937dcd7cd854f865e25fef/uv_build-0.8.12-py3-none-win32.whl", hash = "sha256:45830715e022b85994c06db03ea1a337684cef441ab3ecd38d4b03071845f662", size = 1251560, upload-time = "2025-08-18T23:59:44.425Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c6/11/d8a0a1b87e4cca37abbeb3756119260d9f84bc954cec0bfb04447138a19e/uv_build-0.8.12-py3-none-win_amd64.whl", hash = "sha256:b549a205e1a7487f278baa5fd59dae6901955be7af024dea9d17615e64312cf4", size = 1329565, upload-time = "2025-08-18T23:59:45.932Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d4/0d/c2b30dd90d9fbd0ddef6db4b0fc60e80643d0ef2501229078dcff79067f1/uv_build-0.8.12-py3-none-win_arm64.whl", hash = "sha256:f0c05d62de6c8cb59eb686ac8c6a4e9549f81603864df4f853923eefc850f674", size = 1236604, upload-time = "2025-08-18T23:59:47.094Z" },
|
||||
]