diff --git a/projects/crawler/.dockerignore b/projects/crawler/.dockerignore
deleted file mode 100644
index 9bbcac5..0000000
--- a/projects/crawler/.dockerignore
+++ /dev/null
@@ -1,23 +0,0 @@
-# Ignore Python cache files
-__pycache__/
-*.pyc
-
-# Ignore virtual environments
-.venv/
-
-# Ignore local environment files
-.env.local
-.env.*.local
-
-# Ignore logs
-*.log
-
-# Ignore Docker-related files
-Dockerfile
-docker-compose.yml
-
-# Ignore other unnecessary files
-*.swp
-.idea/
-.vscode/
-.DS_Store
diff --git a/projects/crawler/.env b/projects/crawler/.env
deleted file mode 100644
index 741a97d..0000000
--- a/projects/crawler/.env
+++ /dev/null
@@ -1,7 +0,0 @@
-BASANGO_CRAWLER_TOKEN=some-token
-BASANGO_API_ENDPOINT=http://localhost:8000/api/aggregator/articles?token=dev
-BASANGO_REDIS_URL=redis://localhost:6379/0
-BASANGO_QUEUE_PREFIX=basango
-BASANGO_QUEUE_TIMEOUT=30
-BASANGO_QUEUE_RESULT_TTL=3600
-BASANGO_QUEUE_FAILURE_TTL=86400
diff --git a/projects/crawler/.gitignore b/projects/crawler/.gitignore
deleted file mode 100644
index 09649c3..0000000
--- a/projects/crawler/.gitignore
+++ /dev/null
@@ -1,22 +0,0 @@
-.idea/
-.vscode/
-.ipynb_checkpoints/
-*.pyc
-.env.local
-.env.*.local
-var/
-.DS_Store
-
-# Python-generated files
-__pycache__/
-.pytest_cache/
-*.py[oc]
-build/
-dist/
-wheels/
-*.egg-info
-
-# Virtual environments
-.venv
-
-data/
diff --git a/projects/crawler/.pre-commit-config.yaml b/projects/crawler/.pre-commit-config.yaml
deleted file mode 100644
index 70d5c0b..0000000
--- a/projects/crawler/.pre-commit-config.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-repos:
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.12
-    hooks:
-      - id: ruff-check
-      - id: ruff-format
diff --git a/projects/crawler/.python-version b/projects/crawler/.python-version
deleted file mode 100644
index 24ee5b1..0000000
--- a/projects/crawler/.python-version
+++ /dev/null
@@ -1 +0,0 @@
-3.13
diff --git a/projects/crawler/Dockerfile b/projects/crawler/Dockerfile
deleted file mode 100644
index 1be8dc3..0000000
--- a/projects/crawler/Dockerfile
+++ /dev/null
@@ -1,34 +0,0 @@
-# Use the official Python image as a base
-FROM python:3.13-slim
-
-COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
-
-# Install the project into `/app`
-WORKDIR /app
-
-# Enable bytecode compilation
-ENV UV_COMPILE_BYTECODE=1
-
-# Copy from the cache instead of linking since it's a mounted volume
-ENV UV_LINK_MODE=copy
-
-# Ensure installed tools can be executed out of the box
-ENV UV_TOOL_BIN_DIR=/usr/local/bin
-
-# Install the project's dependencies using the lockfile and settings
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=uv.lock,target=uv.lock \
-    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
-    uv sync --locked --no-install-project --no-dev
-
-# Then, add the rest of the project source code and install it
-# Installing separately from its dependencies allows optimal layer caching
-COPY . /app
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv sync --locked --no-dev
-
-# Place executables in the environment at the front of the path
-ENV PATH="/app/.venv/bin:$PATH"
-
-# Reset the entrypoint, don't invoke `uv`
-ENTRYPOINT []
diff --git a/projects/crawler/README.md b/projects/crawler/README.md
deleted file mode 100644
index 66da142..0000000
--- a/projects/crawler/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-# Crawler
-
-[![crawler audit](https://github.com/bernard-ng/basango/actions/workflows/crawler_audit.yml/badge.svg)](https://github.com/bernard-ng/basango/actions/workflows/crawler_audit.yml)
-[![crawler quality](https://github.com/bernard-ng/basango/actions/workflows/crawler_quality.yml/badge.svg)](https://github.com/bernard-ng/basango/actions/workflows/crawler_quality.yml)
-[![crawler tests](https://github.com/bernard-ng/basango/actions/workflows/crawler_tests.yml/badge.svg)](https://github.com/bernard-ng/basango/actions/workflows/crawler_tests.yml)
-[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
-[![security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit)
-
----
-
-### Usage
-
-- Install the project in your virtualenv so the `basango` CLI is available:
-  - With uv: `uv run --with . basango --help`
-  - Or install locally: `uv sync` then `basango --help`
-
-#### Sync crawl (in-process)
-
-- Crawl a configured source by id and write to CSV/JSON:
-  - `basango crawl --source-id my-source`
-  - Limit by page range: `basango crawl --source-id my-source -p 1:3`
-  - Limit by date range: `basango crawl --source-id my-source -d 2024-10-01:2024-10-31`
-  - Category, when supported: `basango crawl --source-id my-source -g tech`
-
-#### Async crawl (Redis + RQ)
-
-- Enqueue a crawl job and return immediately:
-  - `basango crawl --source-id my-source --async`
-- Start one or more workers to process queues:
-  - Article-only (default): `basango worker`
-  - Multiple queues: `basango worker -q listing -q articles -q processed`
-  - macOS friendly (no forking): `basango worker --simple`
-  - One-shot draining for CI: `basango worker --burst`
-
-#### Environment
-
-- `BASANGO_REDIS_URL` (default `redis://localhost:6379/0`)
-- `BASANGO_QUEUE_PREFIX` (default `crawler`)
-- `BASANGO_QUEUE_TIMEOUT` (default `600` seconds)
-- `BASANGO_QUEUE_RESULT_TTL` (default `3600` seconds)
-- `BASANGO_QUEUE_FAILURE_TTL` (default `3600` seconds)
-
-#### Configuration
-
-- See `config/pipeline.*.yaml` for source definitions and HTTP client settings.
-- Use `-c/--env` to select which pipeline to load (default `development`).
diff --git a/projects/crawler/compose.yaml b/projects/crawler/compose.yaml
deleted file mode 100644
index 5dc3b22..0000000
--- a/projects/crawler/compose.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-services:
-  basango:
-    build: .
-    container_name: basango-app
-    restart: unless-stopped
-    networks:
-      - basango-network
-
-  redis:
-    image: redis:7-alpine
-    container_name: basango-redis
-    restart: unless-stopped
-    ports:
-      - "6379:6379"
-    volumes:
-      - redis_data:/data
-    command: redis-server --appendonly yes
-    networks:
-      - basango-network
-
-  redis-commander:
-    image: rediscommander/redis-commander:latest
-    container_name: basango-redis-commander
-    restart: unless-stopped
-    ports:
-      - "8081:8081"
-    environment:
-      - REDIS_HOSTS=local:redis:6379
-    depends_on:
-      - redis
-    networks:
-      - basango-network
-
-networks:
-  basango-network:
-
-volumes:
-  redis_data:
diff --git a/projects/crawler/config/pipeline.dev.yaml b/projects/crawler/config/pipeline.dev.yaml
deleted file mode 100644
index e3d35fa..0000000
--- a/projects/crawler/config/pipeline.dev.yaml
+++ /dev/null
@@ -1,97 +0,0 @@
-# Fetching and crawling configuration
-fetch:
-  client:
-    timeout: 20
-    user_agent: Basango/0.1 (+https://github.com/bernard-ng/basango)
-    follow_redirects: true
-    verify_ssl: true
-    rotate: true
-    max_retries: 3
-    backoff_initial: 1.0
-    backoff_multiplier: 2.0
-    backoff_max: 30.0
-    respect_retry_after: true
-  crawler:
-    notify: false
-    use_multi_threading: false
-    max_workers: 5
-
-# Source configurations
-sources:
-  html:
-    - source_id: radiookapi.net
-      source_url: https://www.radiookapi.net
-      source_date:
-        pattern: "/(\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
-        replacement: "$3-$2-$1 $4"
-      source_selectors:
-        articles: ".view-content > .views-row.content-row"
-        article_title: ".views-field-title a"
-        article_link: ".views-field-title a"
-        article_body: ".field-name-body"
-        article_date: ".views-field-created"
-        article_categories: ".views-field-field-cat-gorie a"
-        pagination: "ul.pagination > li a:last-child"
-      pagination_template: "/actualite?page={page}"
-      supports_categories: false
-      requires_details: false
-      requires_rate_limit: false
-
-    - source_id: 7sur7.cd
-      source_url: https://7sur7.cd
-      source_date:
-        pattern: "/\\w{3} (\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
-        replacement: "$3-$2-$1 $4"
-      categories: [ "politique", "economie", "culture", "sport", "societe" ]
-      source_selectors:
-        articles: ".view-content > .row.views-row"
-        article_title: ".views-field-title a"
-        article_link: ".views-field-title a"
-        article_body: ".field.field--name-body"
-        article_date: ".views-field-created"
-        pagination: "ul.pagination > li a:last-child"
-      pagination_template: "/index.php/category/{category}?page={page}"
-      supports_categories: true
-      requires_details: false
-      requires_rate_limit: false
-
-    - source_id: mediacongo.net
-      source_url: https://mediacongo.net
-      source_date:
-        format: "%d.%m.%Y %H:%M"
-      source_selectors:
-        articles: ".for_aitems > .article_other_item"
-        article_title: "img"
-        article_link: "a:first-child"
-        article_categories: "a.color_link"
-        article_body: ".article_ttext"
-        article_date: ".article_other_about"
-        pagination: ".nav > a:last-child"
-      pagination_template: "/articles.html?page={page}"
-      supports_categories: false
-      requires_details: true
-      requires_rate_limit: false
-
-    - source_id: actualite.cd
-      source_url: https://actualite.cd
-      source_date:
-        pattern: "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/"
-        replacement: "$4-$3-$2 $5"
-      source_selectors:
-        articles: "#views-bootstrap-taxonomy-term-page-2 > div > div"
-        article_title: "#actu-titre a"
-        article_link: "#actu-titre a"
-        article_categories: "#actu-cat a"
-        article_body: ".views-field.views-field-body"
-        article_date: "#p-date"
-      pagination_template: "/actualite?page={page}"
-      supports_categories: false
-      requires_details: true
-      requires_rate_limit: false
-
-  wordpress:
-    - source_id: beto.cd
-      source_url: https://beto.cd
-      requires_rate_limit: true
-    - source_id: newscd.net
-      source_url: https://newscd.net
diff --git a/projects/crawler/config/pipeline.prod.yaml b/projects/crawler/config/pipeline.prod.yaml
deleted file mode 100644
index 86b09bc..0000000
--- a/projects/crawler/config/pipeline.prod.yaml
+++ /dev/null
@@ -1,160 +0,0 @@
-# Fetching and crawling configuration
-fetch:
-  client:
-    timeout: 20
-    user_agent: Basango/0.1 (+https://github.com/bernard-ng/basango)
-    follow_redirects: true
-    verify_ssl: true
-    rotate: true
-    max_retries: 3
-    backoff_initial: 1.0
-    backoff_multiplier: 2.0
-    backoff_max: 30.0
-    respect_retry_after: true
-  crawler:
-    notify: false
-    use_multi_threading: false
-    max_workers: 5
-
-# Logging configuration
-# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
-logging:
-  level: "ERROR"
-  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-  file_logging: true # Enable logging to file
-  console_logging: true # Enable logging to console
-  log_file: "pipeline.log" # Log file name
-  max_log_size: 10485760 # Maximum size of log file before rotation (10MB)
-  backup_count: 5 # Number of backup log files to keep
-
-# Source configurations
-sources:
-  html:
-    - source_id: radiookapi.net
-      source_url: https://www.radiookapi.net
-      source_date:
-        pattern: "/(\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
-        replacement: "$3-$2-$1 $4"
-      source_selectors:
-        articles: ".view-content > .views-row.content-row"
-        article_title: ".views-field-title a"
-        article_link: ".views-field-title a"
-        article_body: ".field-name-body"
-        article_date: ".views-field-created"
-        article_categories: ".views-field-field-cat-gorie a"
-        pagination: "ul.pagination > li a:last-child"
-      pagination_template: "/actualite?page={page}"
-      supports_categories: false
-      requires_details: false
-      requires_rate_limit: false
-
-    - source_id: 7sur7.cd
-      source_url: https://7sur7.cd
-      source_date:
-        pattern: "/\\w{3} (\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
-        replacement: "$3-$2-$1 $4"
-      categories: [ "politique", "economie", "culture", "sport", "societe" ]
-      source_selectors:
-        articles: ".view-content > .row.views-row"
-        article_title: ".views-field-title a"
-        article_link: ".views-field-title a"
-        article_body: ".field.field--name-body"
-        article_date: ".views-field-created"
-        pagination: "ul.pagination > li a:last-child"
-      pagination_template: "/index.php/category/{category}?page={page}"
-      supports_categories: true
-      requires_details: false
-      requires_rate_limit: false
-
-    - source_id: mediacongo.net
-      source_url: https://mediacongo.net
-      source_date:
-        format: "%d.%m.%Y %H:%M"
-      source_selectors:
-        articles: ".for_aitems > .article_other_item"
-        article_title: "img"
-        article_link: "a:first-child"
-        article_categories: "a.color_link"
-        article_body: ".article_ttext"
-        article_date: ".article_other_about"
-        pagination: ".nav > a:last-child"
-      pagination_template: "/articles.html?page={page}"
-      supports_categories: false
-      requires_details: true
-      requires_rate_limit: false
-
-    - source_id: actualite.cd
-      source_url: https://actualite.cd
-      source_date:
-        pattern: "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/"
-        replacement: "$4-$3-$2 $5"
-      source_selectors:
-        articles: "#views-bootstrap-taxonomy-term-page-2 > div > div"
-        article_title: "#actu-titre a"
-        article_link: "#actu-titre a"
-        article_categories: "#actu-cat a"
-        article_body: ".views-field.views-field-body"
-        article_date: "#p-date"
-      pagination_template: "/actualite?page={page}"
-      supports_categories: false
-      requires_details: true
-      requires_rate_limit: false
-
-  wordpress:
-    - source_id: beto.cd
-      source_url: https://beto.cd
-      requires_rate_limit: true
-    - source_id: newscd.net
-      source_url: https://newscd.net
-    - source_id: africanewsrdc.net
-      source_url: https://www.africanewsrdc.net
-    - source_id: angazainstitute.ac.cd
-      source_url: https://angazainstitute.ac.cd
-    - source_id: b-onetv.cd
-      source_url: https://b-onetv.cd
-    - source_id: bukavufm.com
-      source_url: https://bukavufm.com
-    - source_id: changement7.net
-      source_url: https://changement7.net
-    - source_id: congoactu.net
-      source_url: https://congoactu.net
-    - source_id: congoindependant.com
-      source_url: https://www.congoindependant.com
-    - source_id: congoquotidien.com
-      source_url: https://www.congoquotidien.com
-    - source_id: cumulard.cd
-      source_url: https://www.cumulard.cd
-    - source_id: environews-rdc.net
-      source_url: https://environews-rdc.net
-    - source_id: freemediardc.info
-      source_url: https://www.freemediardc.info
-    - source_id: geopolismagazine.org
-      source_url: https://geopolismagazine.org
-    - source_id: habarirdc.net
-      source_url: https://habarirdc.net
-    - source_id: infordc.com
-      source_url: https://infordc.com
-    - source_id: kilalopress.net
-      source_url: https://kilalopress.net
-    - source_id: laprosperiteonline.net
-      source_url: https://laprosperiteonline.net
-    - source_id: laprunellerdc.cd
-      source_url: https://laprunellerdc.cd
-    - source_id: lesmedias.net
-      source_url: https://lesmedias.net
-    - source_id: lesvolcansnews.net
-      source_url: https://lesvolcansnews.net
-    - source_id: netic-news.net
-      source_url: https://www.netic-news.net
-    - source_id: objectif-infos.cd
-      source_url: https://objectif-infos.cd
-    - source_id: scooprdc.net
-      source_url: https://scooprdc.net
-    - source_id: journaldekinshasa.com
-      source_url: https://www.journaldekinshasa.com
-    - source_id: lepotentiel.cd
-      source_url: https://lepotentiel.cd
-    - source_id: acturdc.com
-      source_url: https://acturdc.com
-    - source_id: matininfos.net
-      source_url: https://matininfos.net
diff --git a/projects/crawler/config/pipeline.yaml b/projects/crawler/config/pipeline.yaml
deleted file mode 100644
index c1ed33b..0000000
--- a/projects/crawler/config/pipeline.yaml
+++ /dev/null
@@ -1,160 +0,0 @@
-# Fetching and crawling configuration
-fetch:
-  client:
-    timeout: 20
-    user_agent: Basango/0.1 (+https://github.com/bernard-ng/basango)
-    follow_redirects: true
-    verify_ssl: true
-    rotate: true
-    max_retries: 3
-    backoff_initial: 1.0
-    backoff_multiplier: 2.0
-    backoff_max: 30.0
-    respect_retry_after: true
-  crawler:
-    notify: false
-    use_multi_threading: false
-    max_workers: 5
-
-# Logging configuration
-# Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
-logging:
-  level: "INFO"
-  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-  file_logging: true # Enable logging to file
-  console_logging: true # Enable logging to console
-  log_file: "pipeline.log" # Log file name
-  max_log_size: 10485760 # Maximum size of log file before rotation (10MB)
-  backup_count: 5 # Number of backup log files to keep
-
-# Source configurations
-sources:
-  html:
-    - source_id: radiookapi.net
-      source_url: https://www.radiookapi.net
-      source_date:
-        pattern: "/(\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
-        replacement: "$3-$2-$1 $4"
-      source_selectors:
-        articles: ".view-content > .views-row.content-row"
-        article_title: "h1.page-header"
-        article_link: ".views-field-title a"
-        article_body: ".field-name-body"
-        article_date: ".views-field-created"
-        article_categories: ".views-field-field-cat-gorie a"
-        pagination: "ul.pagination > li.pager-last > a"
-      pagination_template: "actualite"
-      supports_categories: false
-      requires_details: true
-      requires_rate_limit: false
-
-    - source_id: 7sur7.cd
-      source_url: https://7sur7.cd
-      source_date:
-        pattern: "/\\w{3} (\\d{2})\/(\\d{2})\/(\\d{4}) - (\\d{2}:\\d{2})/"
-        replacement: "$3-$2-$1 $4"
-      categories: [ "politique", "economie", "culture", "sport", "societe" ]
-      source_selectors:
-        articles: ".view-content > .row.views-row"
-        article_title: ".views-field-title a"
-        article_link: ".views-field-title a"
-        article_body: ".field.field--name-body"
-        article_date: ".views-field-created"
-        pagination: "ul.pagination > li.pager__item.pager__item--last > a"
-      pagination_template: "index.php/category/{category}"
-      supports_categories: true
-      requires_details: false
-      requires_rate_limit: false
-
-    - source_id: mediacongo.net
-      source_url: https://www.mediacongo.net
-      source_date:
-        format: "%d.%m.%Y %H:%M"
-      source_selectors:
-        articles: ".for_aitems > .article_other_item"
-        article_title: "img"
-        article_link: "a:first-child"
-        article_categories: "a.color_link"
-        article_body: ".article_ttext"
-        article_date: ".article_other_about"
-        pagination: "div.pagination > div > a:last-child"
-      pagination_template: "articles.html"
-      supports_categories: false
-      requires_details: true
-      requires_rate_limit: false
-
-    - source_id: actualite.cd
-      source_url: https://actualite.cd
-      source_date:
-        pattern: "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/"
-        replacement: "$4-$3-$2 $5"
-      source_selectors:
-        articles: "#views-bootstrap-taxonomy-term-page-2 > div > div"
-        article_title: "#actu-titre a"
-        article_link: "#actu-titre a"
-        article_categories: "#actu-cat a"
-        article_body: ".views-field.views-field-body"
-        article_date: "#p-date"
-      pagination_template: "actualite"
-      supports_categories: false
-      requires_details: true
-      requires_rate_limit: false
-
-  wordpress:
-    - source_id: beto.cd
-      source_url: https://beto.cd
-      requires_rate_limit: true
-    - source_id: newscd.net
-      source_url: https://newscd.net
-    - source_id: africanewsrdc.net
-      source_url: https://www.africanewsrdc.net
-    - source_id: angazainstitute.ac.cd
-      source_url: https://angazainstitute.ac.cd
-    - source_id: b-onetv.cd
-      source_url: https://b-onetv.cd
-    - source_id: bukavufm.com
-      source_url: https://bukavufm.com
-    - source_id: changement7.net
-      source_url: https://changement7.net
-    - source_id: congoactu.net
-      source_url: https://congoactu.net
-    - source_id: congoindependant.com
-      source_url: https://www.congoindependant.com
-    - source_id: congoquotidien.com
-      source_url: https://www.congoquotidien.com
-    - source_id: cumulard.cd
-      source_url: https://www.cumulard.cd
-    - source_id: environews-rdc.net
-      source_url: https://environews-rdc.net
-    - source_id: freemediardc.info
-      source_url: https://www.freemediardc.info
-    - source_id: geopolismagazine.org
-      source_url: https://geopolismagazine.org
-    - source_id: habarirdc.net
-      source_url: https://habarirdc.net
-    - source_id: infordc.com
-      source_url: https://infordc.com
-    - source_id: kilalopress.net
-      source_url: https://kilalopress.net
-    - source_id: laprosperiteonline.net
-      source_url: https://laprosperiteonline.net
-    - source_id: laprunellerdc.cd
-      source_url: https://laprunellerdc.cd
-    - source_id: lesmedias.net
-      source_url: https://lesmedias.net
-    - source_id: lesvolcansnews.net
-      source_url: https://lesvolcansnews.net
-    - source_id: netic-news.net
-      source_url: https://www.netic-news.net
-    - source_id: objectif-infos.cd
-      source_url: https://objectif-infos.cd
-    - source_id: scooprdc.net
-      source_url: https://scooprdc.net
-    - source_id: journaldekinshasa.com
-      source_url: https://www.journaldekinshasa.com
-    - source_id: lepotentiel.cd
-      source_url: https://lepotentiel.cd
-    - source_id: acturdc.com
-      source_url: https://acturdc.com
-    - source_id: matininfos.net
-      source_url: https://matininfos.net
diff --git a/projects/crawler/pyproject.toml b/projects/crawler/pyproject.toml
deleted file mode 100644
index e7c587e..0000000
--- a/projects/crawler/pyproject.toml
+++ /dev/null
@@ -1,39 +0,0 @@
-[project]
-name = "basango"
-version = "0.1.0"
-description = "Basango : Web Scraper for DRC News"
-readme = "README.md"
-requires-python = ">=3.13"
-dependencies = [
-    "pydantic>=2.11.7",
-    "pydantic-settings>=2.10.1",
-    "rq>=2.5.0",
-    "typer>=0.16.1",
-    "uv-build>=0.8.12,<0.9.0",
-    "pyyaml>=6.0.2",
-    "httpx>=0.27.2",
-    "trafilatura>=1.7.0",
-    "selectolax>=0.3.20",
-    "markdownify>=0.13.1",
-    "readability-lxml>=0.8.1",
-    "beautifulsoup4>=4.13.5",
-    "tiktoken>=0.12.0",
-]
-
-[dependency-groups]
-dev = [
-    "bandit>=1.8.6",
-    "pyright>=1.1.404",
-    "pytest>=8.4.1",
-    "ruff>=0.12.9",
-]
-
-[project.scripts]
-basango = "basango:main"
-
-[build-system]
-requires = ["uv_build>=0.8.12,<0.9.0"]
-build-backend = "uv_build"
-
-[tool.pytest.ini_options]
-testpaths = ["tests"]
diff --git a/projects/crawler/src/basango/__init__.py b/projects/crawler/src/basango/__init__.py
deleted file mode 100644
index 9de631d..0000000
--- a/projects/crawler/src/basango/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-def main() -> None:
-    # Lazy import to avoid importing CLI deps during package import
-    from basango.cli import app
-
-    app()
-
-
-if __name__ == "__main__":  # pragma: no cover
-    main()
diff --git a/projects/crawler/src/basango/cli.py b/projects/crawler/src/basango/cli.py
deleted file mode 100644
index bebaac1..0000000
--- a/projects/crawler/src/basango/cli.py
+++ /dev/null
@@ -1,186 +0,0 @@
-"""
-CLI entry points for crawling and worker management.
-
-Sync vs async usage
-- Synchronous crawl: runs the selected crawler in-process and writes results
-  via configured persistors (CSV/JSON). Suitable for local development or
-  small runs.
-- Asynchronous crawl: enqueues a listing job in Redis (RQ) and returns
-  immediately. One or more RQ workers must be running to process jobs.
-
-Examples
-- Sync: `basango crawl --source-id my-source --page 1:3`
-- Async: `basango crawl --source-id my-source --async`
-- Worker (macOS friendly): `basango worker --simple -q articles`
-
-Environment
-- `BASANGO_REDIS_URL` points the worker/queues to Redis.
-- `BASANGO_QUEUE_PREFIX` namespaces queues (default: `crawler`).
-"""
-
-from typing import List, Optional
-from enum import Enum
-
-import typer
-
-from basango.core.config import CrawlerConfig
-from basango.core.config_manager import ConfigManager
-from basango.domain import DateRange, PageRange, UpdateDirection
-from basango.services import JsonPersistor
-from basango.services.crawler.async_api import (
-    QueueSettings,
-    schedule_async_crawl,
-    start_worker,
-)
-from basango.services.crawler.html_crawler import HtmlCrawler
-from basango.services.crawler.wordpress_crawler import WordpressCrawler
-
-app = typer.Typer(no_args_is_help=True, add_completion=False)
-
-
-class QueueName(str, Enum):
-    listing = "listing"
-    articles = "articles"
-    processed = "processed"
-
-
-@app.command("crawl")
-def crawl_cmd(
-    source_id: str = typer.Option(
-        ..., help="Source id to crawl (as defined in config)"
-    ),
-    page: str = typer.Option(None, "--page", "-p", help="Page range e.g. '1:10'"),
-    date: str = typer.Option(
-        None, "--date", "-d", help="Date range e.g. '2024-10-01:2024-10-31'"
-    ),
-    category: str = typer.Option(None, "--category", "-g", help="Optional category"),
-    notify: bool = typer.Option(False, "--notify", "-n", help="Enable notifications"),
-    env: str = typer.Option("development", "--env", "-c", help="Environment"),
-    async_mode: bool = typer.Option(
-        False,
-        "--async/--no-async",
-        help="Schedule crawl through Redis queues instead of running synchronously.",
-    ),
-) -> None:
-    """Crawl a single source, either synchronously or via the async queue.
-
-    Technical notes
-    - When `--async` is set, we only enqueue a job (no crawling happens here).
-      This keeps the CLI responsive and leaves fault-tolerance to RQ workers.
-    - Persistors (CSV/JSON) are instantiated only for the sync path; the async
-      path assigns them inside worker tasks to avoid importing heavy deps in the
-      CLI process and to better isolate failures.
-    """
-    manager = ConfigManager()
-    pipeline = manager.get(env)
-    manager.ensure_directories(pipeline)
-    manager.setup_logging(pipeline)
-
-    source = pipeline.sources.find(source_id)
-    if source is None:
-        raise typer.BadParameter(f"Source '{source_id}' not found in config")
-
-    if async_mode:
-        job_id = schedule_async_crawl(
-            source_id=source_id,
-            env=env,
-            page_range=page,
-            date_range=date,
-            category=category,
-        )
-        typer.echo(
-            f"Scheduled async crawl job {job_id} for source '{source_id}' "
-            "on the listing queue"
-        )
-        return
-
-    crawler_config = CrawlerConfig(
-        source=source,
-        page_range=PageRange.create(page) if page else None,
-        date_range=DateRange.create(date) if date else None,
-        category=category,
-        notify=notify,
-        direction=UpdateDirection.FORWARD,
-    )
-
-    crawlers = [
-        HtmlCrawler,
-        WordpressCrawler,
-    ]
-
-    source_identifier = getattr(source, "source_id", source_id) or source_id
-    persistors = [
-        JsonPersistor(
-            data_dir=pipeline.paths.data,
-            source_id=str(source_identifier),
-        ),
-    ]
-
-    for crawler_cls in crawlers:
-        if crawler_cls.supports() == source.source_kind:
-            crawler = crawler_cls(
-                crawler_config,
-                pipeline.fetch.client,
-                persistors=persistors,
-            )
-            crawler.fetch()
-            break
-
-
-@app.command("worker")
-def worker_cmd(
-    queue: Optional[List[QueueName]] = typer.Option(
-        None,
-        "--queue",
-        "-q",
-        help=(
-            "Queue name(s) (without prefix). Choices: listing, articles, processed. "
-            "Provide multiple times to listen to more than one queue."
-        ),
-    ),
-    simple: bool = typer.Option(
-        False,
-        "--simple/--no-simple",
-        help=(
-            "Run jobs in-process using RQ SimpleWorker (no forking). "
-            "Recommended on macOS to avoid fork-related crashes."
-        ),
-    ),
-    burst: bool = typer.Option(
-        False,
-        "--burst",
-        help="Process available jobs and exit instead of running continuously.",
-    ),
-    redis_url: str = typer.Option(
-        None,
-        "--redis-url",
-        help="Redis connection URL. Defaults to BASANGO_REDIS_URL.",
-    ),
-    env: str = typer.Option(
-        "development",
-        "--env",
-        "-c",
-        help="Environment used to configure logging before starting the worker.",
-    ),
-) -> None:
-    """Run an RQ worker that consumes crawler queues.
-
-    Notes
-    - By default the worker listens to the `articles` queue (detail jobs). Use
-      `-q listing -q articles -q processed` to listen to multiple.
-    - `--simple` uses RQ's SimpleWorker (no forking). On macOS this avoids
-      fork-related crashes when libraries aren't fork-safe.
-    - Use `--burst` to drain the queue and exit, useful for CI or one-off runs.
-    """
-    manager = ConfigManager()
-    pipeline = manager.get(env)
-    manager.ensure_directories(pipeline)
-    manager.setup_logging(pipeline)
-
-    settings = QueueSettings(redis_url=redis_url) if redis_url else QueueSettings()
-    queue_names = [q.value for q in queue] if queue else None
-    start_worker(
-        queue_names=queue_names,
-        settings=settings,
-        burst=burst,
-        simple=simple,
-    )
configuration to crawl" - ) - page_range: Optional[PageRange] = Field( - default=None, description="Page range to crawl, e.g: 1:10" - ) - date_range: Optional[DateRange] = Field( - default=None, - description="Date range to filter articles, e.g: 2024-10-01:2024-10-31", - ) - category: Optional[str] = Field( - default=None, description="Optional category to filter articles" - ) - notify: bool = Field( - default=False, description="Enable notifications after crawling" - ) - - is_update: bool = Field( - default=False, - description="Whether this crawl is an update (True) or a full crawl (False)", - ) - use_multi_threading: bool = Field( - default=False, description="Enable multiprocessing for concurrent crawling" - ) - max_workers: int = Field( - default=5, description="Maximum number of concurrent crawling workers" - ) - direction: UpdateDirection = Field( - default=UpdateDirection.FORWARD, description="Crawling direction" - ) - - -class FetchConfig(BaseModel): - client: ClientConfig = Field( - default_factory=ClientConfig, description="Http client configuration" - ) - crawler: CrawlerConfig = Field( - default_factory=CrawlerConfig, description="Crawler configuration" - ) diff --git a/projects/crawler/src/basango/core/config/logging_config.py b/projects/crawler/src/basango/core/config/logging_config.py deleted file mode 100644 index 8fa08e1..0000000 --- a/projects/crawler/src/basango/core/config/logging_config.py +++ /dev/null @@ -1,11 +0,0 @@ -from pydantic import BaseModel - - -class LoggingConfig(BaseModel): - level: str = "INFO" - format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - file_logging: bool = True - console_logging: bool = True - log_file: str = "pipeline.log" - max_log_size: int = 10 * 1024 * 1024 # 10MB - backup_count: int = 5 diff --git a/projects/crawler/src/basango/core/config/pipeline_config.py b/projects/crawler/src/basango/core/config/pipeline_config.py deleted file mode 100644 index 645f4f1..0000000 --- a/projects/crawler/src/basango/core/config/pipeline_config.py +++ /dev/null @@ -1,25 +0,0 @@ -from pathlib import Path -from pydantic import Field, BaseModel - -from basango.core.config.fetch_config import FetchConfig -from basango.core.config.logging_config import LoggingConfig -from basango.core.config.source_config import SourcesConfig -from basango.core.project_paths import ProjectPaths - - -def _default_project_paths() -> ProjectPaths: - """Create default project paths relative to the project root.""" - root = Path.cwd() - return ProjectPaths( - root=root, - configs=root / "config", - data=root / "data" / "dataset", - logs=root / "data" / "logs", - ) - - -class PipelineConfig(BaseModel): - paths: ProjectPaths = Field(default_factory=_default_project_paths, alias="paths") - logging: LoggingConfig = Field(default_factory=LoggingConfig) - fetch: FetchConfig = Field(default_factory=FetchConfig) - sources: SourcesConfig = Field(default_factory=SourcesConfig) diff --git a/projects/crawler/src/basango/core/config/source_config.py b/projects/crawler/src/basango/core/config/source_config.py deleted file mode 100644 index bfaa204..0000000 --- a/projects/crawler/src/basango/core/config/source_config.py +++ /dev/null @@ -1,66 +0,0 @@ -from typing import Union - -from pydantic import BaseModel, Field, HttpUrl - -from basango.domain import SourceDate, SourceKind, SourceSelectors - - -class SourceConfigBase(BaseModel): - source_id: str = Field(..., description="Unique identifier for the source") - source_url: HttpUrl = Field(..., description="URL of the 
source") - source_date: SourceDate = Field( - default_factory=SourceDate, description="Date extraction schema" - ) - source_kind: SourceKind = Field( - ..., description="Type of the source, e.g., 'wordpress' or 'html'" - ) - categories: list[str] = Field( - default_factory=list, description="List of categories to filter articles" - ) - - supports_categories: bool = Field( - default=False, description="the source supports categories" - ) - requires_details: bool = Field( - default=False, description="detailed article is required to compute date range" - ) - requires_rate_limit: bool = Field( - default=False, description="requires rate limit to avoid being blocked" - ) - - -class WordPressSourceConfig(SourceConfigBase): - source_kind: SourceKind = Field( - default=SourceKind.WORDPRESS, description="Type of the source" - ) - source_date: SourceDate = SourceDate( - format="%Y-%m-%dT%H:%M:%S", pattern=None, replacement=None - ) - - -class HtmlSourceConfig(SourceConfigBase): - source_kind: SourceKind = Field( - default=SourceKind.HTML, description="Type of the source" - ) - source_selectors: SourceSelectors = Field( - default_factory=lambda: SourceSelectors(), - description="CSS selectors for extracting articles", - ) - pagination_template: str = Field( - ..., description="Template URL for pagination, e.g., '/actualite?page={page}'" - ) - - -class SourcesConfig(BaseModel): - html: list[HtmlSourceConfig] = Field( - default_factory=list, description="List of source configurations" - ) - wordpress: list[WordPressSourceConfig] = Field( - default_factory=list, description="List of source configurations" - ) - - def find(self, source_id: str) -> Union[HtmlSourceConfig, WordPressSourceConfig]: - for source in self.html + self.wordpress: - if source.source_id == source_id: - return source - raise ValueError(f"Source with id '{source_id}' not found") diff --git a/projects/crawler/src/basango/core/config_manager.py b/projects/crawler/src/basango/core/config_manager.py deleted file mode 100644 index bd144ff..0000000 --- a/projects/crawler/src/basango/core/config_manager.py +++ /dev/null @@ -1,149 +0,0 @@ -import logging -import sys -from pathlib import Path -from typing import Optional, Union, Dict - -import yaml - -from basango.core.config import PipelineConfig -from basango.core.project_paths import ProjectPaths - - -def _ensure_utf8_stream(stream): - try: - if hasattr(stream, "reconfigure"): - stream.reconfigure(encoding="utf-8", errors="replace") - except (AttributeError, ValueError): - return stream - return stream - - -class ConfigManager: - def __init__(self, config_path: Optional[Union[str, Path]] = None): - self.config_path = Path(config_path) if config_path else self._find_config() - self._config: Optional[PipelineConfig] = None - self._setup_paths() - - def get(self, env: Optional[str] = None) -> PipelineConfig: - if env: - path = self.config_path.parent / f"pipeline.{env}.yaml" - - if path.exists(): - base = self.load().model_dump() - self._override(base, self.load(path).model_dump()) - return PipelineConfig(**base) - - if self._config is None: - self._config = self.load() - return self._config - - def load(self, config_path: Optional[Path] = None) -> PipelineConfig: - """Load configuration from file""" - self.config_path = config_path if config_path else self._find_config() - - if not self.config_path.exists(): - logging.warning( - f"Config file not found: {self.config_path}. Using defaults." 
- ) - return self._create_default() - - try: - with open(self.config_path, "r") as f: - config_data = yaml.safe_load(f) - - if "paths" not in config_data: - config_data["paths"] = self.default_paths.model_dump() - - self._config = PipelineConfig(**config_data) - return self._config - - except Exception as e: - logging.error(f"Failed to load config from {self.config_path}: {e}") - return self._create_default() - - @classmethod - def ensure_directories(cls, cfg: PipelineConfig) -> None: - directories = [cfg.paths.data, cfg.paths.logs, cfg.paths.configs] - - for directory in directories: - Path(directory).mkdir(parents=True, exist_ok=True) - - logging.info("Ensured all required directories exist") - - @classmethod - def setup_logging(cls, cfg: PipelineConfig): - logs_path = cfg.paths.logs - logs_path.mkdir(parents=True, exist_ok=True) - - # Setup logging configuration - log_level = getattr(logging, cfg.logging.level.upper(), logging.INFO) - - # Create formatter - formatter = logging.Formatter(cfg.logging.format) - - # Setup root logger - root_logger = logging.getLogger() - root_logger.setLevel(log_level) - - # Clear existing handlers - root_logger.handlers.clear() - - _ensure_utf8_stream(sys.stdout) - _ensure_utf8_stream(sys.stderr) - # Console handler - if cfg.logging.console_logging: - console_handler = logging.StreamHandler( - stream=_ensure_utf8_stream(sys.stderr) - ) - console_handler.setFormatter(formatter) - root_logger.addHandler(console_handler) - - # File handler - if cfg.logging.file_logging: - from logging.handlers import RotatingFileHandler - - log_file_path = logs_path / cfg.logging.log_file - file_handler = RotatingFileHandler( - log_file_path, - maxBytes=cfg.logging.max_log_size, - backupCount=cfg.logging.backup_count, - encoding="utf-8", - ) - file_handler.setFormatter(formatter) - root_logger.addHandler(file_handler) - - @classmethod - def _find_config(cls) -> Path: - possible_paths = [ - Path.cwd() / "config" / "pipeline.yaml", - Path.cwd() / "config" / "pipeline.yml", - Path.cwd() / "pipeline.yaml", - Path(__file__).parent.parent.parent.parent / "config" / "pipeline.yaml", - ] - - for path in possible_paths: - if path.exists(): - return path - - raise FileNotFoundError( - "No configuration file found in the expected locations." 
- ) - - def _setup_paths(self) -> None: - root = Path(__file__).parent.parent.parent.parent - self.default_paths = ProjectPaths( - root=root, - configs=root / "config", - data=root / "data" / "dataset", - logs=root / "data" / "logs", - ) - - def _create_default(self) -> PipelineConfig: - return PipelineConfig(paths=self.default_paths) - - def _override(self, base: Dict, update: Dict): - for key, value in update.items(): - if key in base and isinstance(base[key], dict) and isinstance(value, dict): - self._override(base[key], value) - else: - base[key] = value diff --git a/projects/crawler/src/basango/core/project_paths.py b/projects/crawler/src/basango/core/project_paths.py deleted file mode 100644 index 1793a81..0000000 --- a/projects/crawler/src/basango/core/project_paths.py +++ /dev/null @@ -1,26 +0,0 @@ -from pathlib import Path - -from pydantic import BaseModel, field_validator, ConfigDict - - -class ProjectPaths(BaseModel): - model_config = ConfigDict(arbitrary_types_allowed=True) - - root: Path - data: Path - logs: Path - configs: Path - - @classmethod - @field_validator("*", mode="before") - def convert_to_path(cls, v): - return Path(v) if not isinstance(v, Path) else v - - def get_data_path(self, filename: str) -> Path: - return self.data / filename - - def get_logs_path(self, filename: str) -> Path: - return self.logs / filename - - def get_config_path(self, filename: str) -> Path: - return self.configs / filename diff --git a/projects/crawler/src/basango/domain/__init__.py b/projects/crawler/src/basango/domain/__init__.py deleted file mode 100644 index 0c9acc4..0000000 --- a/projects/crawler/src/basango/domain/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from .article import Article -from .date_range import DateRange -from .page_range import PageRange -from .source import SourceKind, SourceDate, SourceSelectors -from .update_direction import UpdateDirection - -__all__ = [ - "Article", - "DateRange", - "PageRange", - "SourceKind", - "SourceDate", - "SourceSelectors", - "UpdateDirection", -] diff --git a/projects/crawler/src/basango/domain/article.py b/projects/crawler/src/basango/domain/article.py deleted file mode 100644 index ea5f214..0000000 --- a/projects/crawler/src/basango/domain/article.py +++ /dev/null @@ -1,30 +0,0 @@ -from datetime import datetime -from typing import Any, Optional - -from pydantic import BaseModel, HttpUrl -from .token_statistics import TokenStatistics - - -class Article(BaseModel): - title: str - link: HttpUrl - body: str - categories: list[str] - source: str - timestamp: datetime - metadata: Optional[dict[str, Any]] = None - token_statistics: Optional["TokenStatistics"] = None - - def to_dict(self) -> dict[str, Any]: - return { - "title": self.title, - "link": str(self.link), - "body": self.body, - "categories": self.categories, - "source": self.source, - "timestamp": int(self.timestamp.timestamp()), - "metadata": self.metadata, - "tokenStatistics": self.token_statistics.to_dict() - if self.token_statistics - else "", - } diff --git a/projects/crawler/src/basango/domain/date_range.py b/projects/crawler/src/basango/domain/date_range.py deleted file mode 100644 index 47fc7ed..0000000 --- a/projects/crawler/src/basango/domain/date_range.py +++ /dev/null @@ -1,64 +0,0 @@ -from dataclasses import dataclass -from datetime import datetime, timezone, timedelta -from typing import Optional - - -def _ensure_utc(dt: datetime) -> datetime: - if dt.tzinfo is None: - return dt.replace(tzinfo=timezone.utc) - return dt - - -@dataclass(frozen=True) -class DateRange: - 
diff --git a/projects/crawler/src/basango/core/project_paths.py b/projects/crawler/src/basango/core/project_paths.py
deleted file mode 100644
index 1793a81..0000000
--- a/projects/crawler/src/basango/core/project_paths.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from pathlib import Path
-
-from pydantic import BaseModel, field_validator, ConfigDict
-
-
-class ProjectPaths(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    root: Path
-    data: Path
-    logs: Path
-    configs: Path
-
-    @field_validator("*", mode="before")
-    @classmethod
-    def convert_to_path(cls, v):
-        return Path(v) if not isinstance(v, Path) else v
-
-    def get_data_path(self, filename: str) -> Path:
-        return self.data / filename
-
-    def get_logs_path(self, filename: str) -> Path:
-        return self.logs / filename
-
-    def get_config_path(self, filename: str) -> Path:
-        return self.configs / filename
diff --git a/projects/crawler/src/basango/domain/__init__.py b/projects/crawler/src/basango/domain/__init__.py
deleted file mode 100644
index 0c9acc4..0000000
--- a/projects/crawler/src/basango/domain/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from .article import Article
-from .date_range import DateRange
-from .page_range import PageRange
-from .source import SourceKind, SourceDate, SourceSelectors
-from .update_direction import UpdateDirection
-
-__all__ = [
-    "Article",
-    "DateRange",
-    "PageRange",
-    "SourceKind",
-    "SourceDate",
-    "SourceSelectors",
-    "UpdateDirection",
-]
diff --git a/projects/crawler/src/basango/domain/article.py b/projects/crawler/src/basango/domain/article.py
deleted file mode 100644
index ea5f214..0000000
--- a/projects/crawler/src/basango/domain/article.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from datetime import datetime
-from typing import Any, Optional
-
-from pydantic import BaseModel, HttpUrl
-from .token_statistics import TokenStatistics
-
-
-class Article(BaseModel):
-    title: str
-    link: HttpUrl
-    body: str
-    categories: list[str]
-    source: str
-    timestamp: datetime
-    metadata: Optional[dict[str, Any]] = None
-    token_statistics: Optional["TokenStatistics"] = None
-
-    def to_dict(self) -> dict[str, Any]:
-        return {
-            "title": self.title,
-            "link": str(self.link),
-            "body": self.body,
-            "categories": self.categories,
-            "source": self.source,
-            "timestamp": int(self.timestamp.timestamp()),
-            "metadata": self.metadata,
-            "tokenStatistics": self.token_statistics.to_dict()
-            if self.token_statistics
-            else "",
-        }
diff --git a/projects/crawler/src/basango/domain/date_range.py b/projects/crawler/src/basango/domain/date_range.py
deleted file mode 100644
index 47fc7ed..0000000
--- a/projects/crawler/src/basango/domain/date_range.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from dataclasses import dataclass
-from datetime import datetime, timezone, timedelta
-from typing import Optional
-
-
-def _ensure_utc(dt: datetime) -> datetime:
-    if dt.tzinfo is None:
-        return dt.replace(tzinfo=timezone.utc)
-    return dt
-
-
-@dataclass(frozen=True)
-class DateRange:
-    start: int  # Unix timestamp
-    end: int  # Unix timestamp
-
-    def __post_init__(self) -> None:
-        assert self.start != 0, "[DateRange] Start timestamp cannot be 0"
-        assert self.end != 0, "[DateRange] End timestamp cannot be 0"
-        assert self.end >= self.start, (
-            "[DateRange] End must be greater than or equal to start"
-        )
-
-    def __str__(self) -> str:
-        return f"{self.start}:{self.end}"
-
-    def in_range(self, ts: int) -> bool:
-        return self.start <= ts <= self.end
-
-    def out_range(self, ts: int) -> bool:
-        return ts < self.start or ts > self.end
-
-    def format(self, fmt: str = "%Y-%m-%d") -> str:
-        start = datetime.fromtimestamp(self.start, tz=timezone.utc).strftime(fmt)
-        end = datetime.fromtimestamp(self.end, tz=timezone.utc).strftime(fmt)
-        return f"{start}:{end}"
-
-    @classmethod
-    def create(
-        cls, spec: str, fmt: str = "%Y-%m-%d", separator: str = ":"
-    ) -> "DateRange":
-        assert separator != "", "[DateRange] Separator cannot be empty"
-        assert separator in spec, f"[DateRange] {separator} must be in {spec}"
-
-        parts = spec.split(separator)
-        assert len(parts) == 2, f"[DateRange] Invalid date interval: {spec}"
-
-        start = _ensure_utc(datetime.strptime(parts[0], fmt))
-        end = _ensure_utc(datetime.strptime(parts[1], fmt))
-        return cls(int(start.timestamp()), int(end.timestamp()))
-
-    @classmethod
-    def backward(cls, date: Optional[datetime] = None, days: int = 30) -> "DateRange":
-        base = _ensure_utc(date or datetime.now(timezone.utc))
-
-        start = base - timedelta(days=days)
-        end = base + timedelta(days=1)  # in future to avoid timezone issues
-        return cls(int(start.timestamp()), int(end.timestamp()))
-
-    @classmethod
-    def forward(cls, date: datetime) -> "DateRange":
-        start = _ensure_utc(date)
-        end = datetime.now(timezone.utc) + timedelta(days=1)
-        return cls(int(start.timestamp()), int(end.timestamp()))
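A quick sanity check of the `DateRange` semantics, assuming the package is importable: `"start:end"` specs are parsed as UTC and stored as Unix timestamps, and both bounds checks are inclusive.

```python
from basango.domain import DateRange

window = DateRange.create("2024-10-01:2024-10-31")

# in_range/out_range treat both endpoints as part of the window.
assert window.in_range(window.start) and window.in_range(window.end)
assert window.out_range(window.end + 1)
print(window.format())  # 2024-10-01:2024-10-31
```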
a/projects/crawler/src/basango/domain/source.py
+++ /dev/null
@@ -1,41 +0,0 @@
-from enum import StrEnum
-from typing import Optional
-
-from pydantic import BaseModel, Field
-
-
-class SourceKind(StrEnum):
-    WORDPRESS = "wordpress"
-    HTML = "html"
-
-
-class SourceDate(BaseModel):
-    format: str = "%Y-%m-%d %H:%M"
-    pattern: Optional[str] = None
-    replacement: Optional[str] = None
-
-
-class SourceSelectors(BaseModel):
-    articles: Optional[str] = Field(
-        default=None, description="CSS selector for the list of articles within a page"
-    )
-    article_title: Optional[str] = Field(
-        default=None, description="CSS selector for the article title"
-    )
-    article_link: Optional[str] = Field(
-        default=None, description="CSS selector for the article link"
-    )
-    article_body: Optional[str] = Field(
-        default=None, description="CSS selector for the article body/content"
-    )
-    article_date: Optional[str] = Field(
-        default=None, description="CSS selector for the article date"
-    )
-    article_categories: Optional[str] = Field(
-        default=None, description="CSS selector for the article categories"
-    )
-
-    pagination: str = Field(
-        default="ul.pagination > li a",
-        description="CSS selector for the pagination links",
-    )
diff --git a/projects/crawler/src/basango/domain/token_statistics.py b/projects/crawler/src/basango/domain/token_statistics.py
deleted file mode 100644
index 5d4abf6..0000000
--- a/projects/crawler/src/basango/domain/token_statistics.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from dataclasses import dataclass
-
-
-@dataclass
-class TokenStatistics:
-    """Counts of tokens for different article sections."""
-
-    title: int
-    body: int
-    excerpt: int
-    categories: int
-
-    def to_dict(self) -> dict[str, int]:
-        return {
-            "title": self.title,
-            "body": self.body,
-            "excerpt": self.excerpt,
-            "categories": self.categories,
-        }
diff --git a/projects/crawler/src/basango/domain/update_direction.py b/projects/crawler/src/basango/domain/update_direction.py
deleted file mode 100644
index f73c483..0000000
--- a/projects/crawler/src/basango/domain/update_direction.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from enum import StrEnum
-
-
-class UpdateDirection(StrEnum):
-    FORWARD = "forward"
-    BACKWARD = "backward"
diff --git a/projects/crawler/src/basango/services/__init__.py b/projects/crawler/src/basango/services/__init__.py
deleted file mode 100644
index 42e55cc..0000000
--- a/projects/crawler/src/basango/services/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from .date_parser import DateParser
-from .http_client import BaseHttpClient, SyncHttpClient, AsyncHttpClient
-from .open_graph import OpenGraphProvider
-from .persistence import BasePersistor, CsvPersistor, JsonPersistor
-from .user_agents import UserAgents
-from .tokenizer import Tokenizer
-
-HttpClient = SyncHttpClient
-
-__all__ = [
-    "DateParser",
-    "BaseHttpClient",
-    "SyncHttpClient",
-    "AsyncHttpClient",
-    "HttpClient",
-    "OpenGraphProvider",
-    "UserAgents",
-    "BasePersistor",
-    "CsvPersistor",
-    "JsonPersistor",
-    "Tokenizer",
-]
diff --git a/projects/crawler/src/basango/services/crawler/__init__.py b/projects/crawler/src/basango/services/crawler/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/projects/crawler/src/basango/services/crawler/async/__init__.py b/projects/crawler/src/basango/services/crawler/async/__init__.py
deleted file mode 100644
index 72407df..0000000
--- a/projects/crawler/src/basango/services/crawler/async/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from .queue import QueueManager, QueueSettings
-from .schemas import ListingTaskPayload, ArticleTaskPayload, ProcessedTaskPayload
-from .tasks import (
-    schedule_async_crawl,
-    collect_listing,
-    collect_article,
-    forward_for_processing,
-)
-from .worker import start_worker
-
-__all__ = [
-    "QueueManager",
-    "QueueSettings",
-    "ListingTaskPayload",
-    "ArticleTaskPayload",
-    "ProcessedTaskPayload",
-    "schedule_async_crawl",
-    "collect_listing",
-    "collect_article",
-    "forward_for_processing",
-    "start_worker",
-]
diff --git a/projects/crawler/src/basango/services/crawler/async/queue.py b/projects/crawler/src/basango/services/crawler/async/queue.py
deleted file mode 100644
index efd9084..0000000
--- a/projects/crawler/src/basango/services/crawler/async/queue.py
+++ /dev/null
@@ -1,92 +0,0 @@
-"""
-RQ queue configuration and helpers.
-
-Design choices
-- Queue names are prefixed (e.g. `crawler:articles`) so multiple environments
-  can share the same Redis. Configure via `BASANGO_QUEUE_PREFIX`.
-- Job default timeouts and TTLs are centrally configured to avoid per-enqueue
-  tuning. Environment variables allow ops to adjust at runtime.
-- Task callables are referenced by dotted string path when enqueuing to ensure
-  RQ workers can import them without importing this module and creating cycles.
-"""
-
-import os
-from dataclasses import dataclass, field
-from typing import Iterable
-
-from redis import Redis
-from rq import Queue
-
-from .schemas import (
-    ArticleTaskPayload,
-    ListingTaskPayload,
-    ProcessedTaskPayload,
-)
-
-
-@dataclass(slots=True)
-class QueueSettings:
-    redis_url: str = field(
-        default_factory=lambda: os.getenv(  # type: ignore[arg-type]
-            "BASANGO_REDIS_URL", "redis://localhost:6379/0"
-        )
-    )
-    prefix: str = field(
-        default_factory=lambda: os.getenv("BASANGO_QUEUE_PREFIX", "crawler")
-    )
-    default_timeout: int = field(
-        default_factory=lambda: int(os.getenv("BASANGO_QUEUE_TIMEOUT", "600"))
-    )
-    result_ttl: int = field(
-        default_factory=lambda: int(os.getenv("BASANGO_QUEUE_RESULT_TTL", "3600"))
-    )
-    failure_ttl: int = field(
-        default_factory=lambda: int(os.getenv("BASANGO_QUEUE_FAILURE_TTL", "3600"))
-    )
-    listing_queue: str = "listing"
-    article_queue: str = "articles"
-    processed_queue: str = "processed"
-
-
-class QueueManager:
-    def __init__(self, settings: QueueSettings | None = None) -> None:
-        self.settings = settings or QueueSettings()
-        self.connection = Redis.from_url(self.settings.redis_url)
-        self.listing_queue = self._build_queue(self.settings.listing_queue)
-        self.article_queue = self._build_queue(self.settings.article_queue)
-        self.processed_queue = self._build_queue(self.settings.processed_queue)
-
-    def _build_queue(self, suffix: str) -> Queue:
-        return Queue(
-            self.queue_name(suffix),
-            connection=self.connection,
-            default_timeout=self.settings.default_timeout,
-            result_ttl=self.settings.result_ttl,
-            failure_ttl=self.settings.failure_ttl,
-        )
-
-    def queue_name(self, suffix: str) -> str:
-        return f"{self.settings.prefix}:{suffix}"
-
-    def enqueue_listing(self, payload: ListingTaskPayload):
-        return self.listing_queue.enqueue(
-            "basango.services.crawler.async.tasks.collect_listing",
-            payload.to_dict(),
-        )
-
-    def enqueue_article(self, payload: ArticleTaskPayload):
-        return self.article_queue.enqueue(
-            "basango.services.crawler.async.tasks.collect_article",
-            payload.to_dict(),
-        )
-
-    def enqueue_processed(self, payload: ProcessedTaskPayload):
-        return self.processed_queue.enqueue(
-            "basango.services.crawler.async.tasks.forward_for_processing",
-            payload.to_dict(),
-        )
-
-    def iter_queue_names(self) -> Iterable[str]:
-        yield self.queue_name(self.settings.listing_queue)
-        yield self.queue_name(self.settings.article_queue)
-        yield self.queue_name(self.settings.processed_queue)
diff --git a/projects/crawler/src/basango/services/crawler/async/schemas.py b/projects/crawler/src/basango/services/crawler/async/schemas.py
deleted file mode 100644
index ca78cb2..0000000
--- a/projects/crawler/src/basango/services/crawler/async/schemas.py
+++ /dev/null
@@ -1,64 +0,0 @@
-"""
-Lightweight task payload schemas.
-
-Notes
-- Use dataclasses with `slots=True` for low overhead and predictable fields.
-- `_coerce_kwargs` drops unknown keys and leaves missing keys to their declared
-  defaults, so payloads are resilient to schema changes when workers and
-  producers are not updated in lockstep.
-"""
-
-from dataclasses import asdict, dataclass, fields
-from typing import Any, Mapping
-
-from basango.domain.article import Article
-
-
-def _coerce_kwargs(cls, data: Mapping[str, Any]) -> dict[str, Any]:
-    # Only pass through keys the dataclass declares; anything absent keeps
-    # its default instead of being clobbered with None.
-    return {
-        field.name: data[field.name] for field in fields(cls) if field.name in data
-    }
-
-
-@dataclass(slots=True)
-class ListingTaskPayload:
-    source_id: str
-    env: str = "development"
-    page_range: str | None = None
-    date_range: str | None = None
-    category: str | None = None
-
-    def to_dict(self) -> dict[str, Any]:
-        return asdict(self)
-
-    @classmethod
-    def from_dict(cls, data: Mapping[str, Any]) -> "ListingTaskPayload":
-        return cls(**_coerce_kwargs(cls, data))
-
-
-@dataclass(slots=True)
-class ArticleTaskPayload:
-    source_id: str
-    env: str = "development"
-    url: str | None = None
-    data: Any | None = None
-    date_range: str | None = None
-    category: str | None = None
-
-    def to_dict(self) -> dict[str, Any]:
-        return asdict(self)
-
-    @classmethod
-    def from_dict(cls, data: Mapping[str, Any]) -> "ArticleTaskPayload":
-        return cls(**_coerce_kwargs(cls, data))
-
-
-@dataclass(slots=True)
-class ProcessedTaskPayload:
-    source_id: str
-    article: Article
-    env: str = "development"
-
-    def to_dict(self) -> dict[str, Any]:
-        return asdict(self)
-
-    @classmethod
-    def from_dict(cls, data: Mapping[str, Any]) -> "ProcessedTaskPayload":
-        return cls(**_coerce_kwargs(cls, data))
diff --git a/projects/crawler/src/basango/services/crawler/async/tasks.py b/projects/crawler/src/basango/services/crawler/async/tasks.py
deleted file mode 100644
index f1f99db..0000000
--- a/projects/crawler/src/basango/services/crawler/async/tasks.py
+++ /dev/null
@@ -1,305 +0,0 @@
-"""
-RQ task functions for the asynchronous crawl pipeline.
-
-Pipeline
-- schedule_async_crawl: seeds a listing job for a source
-- collect_listing: enumerates listing pages and enqueues detail jobs
-- collect_article: extracts and persists article data, then forwards
-- forward_for_processing: hands the record to downstream system (HTTP API)
-
-Rationale
-- Split listing vs article work to keep jobs small and retryable.
-- Use ConfigManager to reconstruct the same pipeline/env in workers.
-- Persist locally (CSV/JSON) before forwarding to decouple pipelines.
-"""
-
-import os
-import logging
-from typing import Any
-
-from basango.domain.article import Article
-from basango.services import SyncHttpClient
-from basango.core.config import CrawlerConfig
-from basango.core.config_manager import ConfigManager
-from basango.domain import DateRange, PageRange, SourceKind, UpdateDirection
-from basango.services import JsonPersistor
-from basango.services.crawler.html_crawler import HtmlCrawler
-from basango.services.crawler.wordpress_crawler import WordpressCrawler
-
-from .queue import QueueManager, QueueSettings
-from .schemas import (
-    ArticleTaskPayload,
-    ListingTaskPayload,
-    ProcessedTaskPayload,
-)
-
-
-logger = logging.getLogger(__name__)
-
-
-def schedule_async_crawl(
-    *,
-    source_id: str,
-    env: str = "development",
-    page_range: str | None = None,
-    date_range: str | None = None,
-    category: str | None = None,
-    settings: QueueSettings | None = None,
-):
-    # Keep payload serialisable and minimal; workers reconstruct config objects.
-    payload = ListingTaskPayload(
-        source_id=source_id,
-        env=env,
-        page_range=page_range,
-        date_range=date_range,
-        category=category,
-    )
-    manager = QueueManager(settings=settings)
-    job = manager.enqueue_listing(payload)
-    logger.info("Scheduled listing collection job %s for source %s", job.id, source_id)
-    return job.id
-
-
-def collect_listing(payload: dict[str, Any]) -> int:
-    data = ListingTaskPayload.from_dict(payload)
-    manager = ConfigManager()
-    pipeline = manager.get(data.env)
-    source = pipeline.sources.find(data.source_id)
-    if source is None:
-        logger.error("Unknown source id %s", data.source_id)
-        return 0
-
-    crawler_config = CrawlerConfig(
-        source=source,
-        page_range=PageRange.create(data.page_range) if data.page_range else None,
-        date_range=DateRange.create(data.date_range) if data.date_range else None,
-        category=data.category,
-        notify=False,
-        direction=UpdateDirection.FORWARD,
-    )
-    client_config = pipeline.fetch.client
-    queue_manager = QueueManager()
-
-    # Branch by source kind to reuse the same high-level flow with different
-    # extraction strategies.
-    if source.source_kind == SourceKind.HTML:
-        crawler = HtmlCrawler(crawler_config, client_config)
-        queued = _collect_html_listing(crawler, data, queue_manager)
-    elif source.source_kind == SourceKind.WORDPRESS:
-        crawler = WordpressCrawler(crawler_config, client_config)
-        queued = _collect_wordpress_listing(crawler, data, queue_manager)
-    else:
-        logger.warning(
-            "Async crawling not supported for source kind %s", source.source_kind
-        )
-        queued = 0
-
-    logger.info("Queued %s article detail jobs for source %s", queued, data.source_id)
-    return queued
-
-
-def collect_article(payload: dict[str, Any]) -> Article | None:
-    data = ArticleTaskPayload.from_dict(payload)
-    manager = ConfigManager()
-    pipeline = manager.get(data.env)
-    source = pipeline.sources.find(data.source_id)
-    if source is None:
-        logger.error("Unknown source id %s", data.source_id)
-        return None
-
-    crawler_config = CrawlerConfig(
-        source=source,
-        date_range=DateRange.create(data.date_range) if data.date_range else None,
-        category=data.category,
-        notify=False,
-        direction=UpdateDirection.FORWARD,
-    )
-
-    # Persist locally first to keep an auditable trail and enable
-    # replay/recovery independent of downstream availability.
-    persistors = [
-        JsonPersistor(
-            data_dir=pipeline.paths.data,
-            source_id=str(source.source_id),
-        ),
-    ]
-
-    try:
-        if source.source_kind == SourceKind.HTML:
-            article = _collect_html_article(
-                HtmlCrawler(
-                    crawler_config, pipeline.fetch.client, persistors=persistors
-                ),
-                data,
-            )
-        else:
-            article = _collect_wordpress_article(
-                WordpressCrawler(
-                    crawler_config, pipeline.fetch.client, persistors=persistors
-                ),
-                data,
-            )
-
-        queue_manager = QueueManager()
-        queue_manager.enqueue_processed(
-            ProcessedTaskPayload(
-                source_id=data.source_id,
-                env=data.env,
-                article=article,
-            )
-        )
-
-        logger.info(
-            "Persisted article %s and forwarded to processed queue", article.link
-        )
-        return article
-    except Exception as exc:  # noqa: BLE001
-        logger.error(
-            "Failed to collect article for source %s url %s: %s",
-            data.source_id,
-            data.url,
-            exc,
-        )
-        return None
-
-
-def forward_for_processing(payload: dict[str, Any]) -> Article | None:
-    data = ProcessedTaskPayload.from_dict(payload)
-    manager = ConfigManager()
-    pipeline = manager.get(data.env)
-
-    article = data.article
-    logger.info(
-        "Ready for downstream processing: source=%s link=%s",
-        data.source_id,
-        article.link,
-    )
-
-    try:
-        client = SyncHttpClient(client_config=pipeline.fetch.client)
-        client.post(
-            os.getenv(
-                "BASANGO_API_ENDPOINT",
-                "http://localhost:8000/api/aggregator/articles?token=dev",
-            ),
-            json=article.to_dict(),
-        )
-
-        logger.info("Forwarded article %s to API", article.link)
-        return article
-    except Exception as exc:  # noqa: BLE001
-        logger.error(
-            "Failed to forward article %s to API: %s",
-            article.link,
-            exc,
-        )
-        return None
-
-
-def _collect_html_listing(
-    crawler: HtmlCrawler,
-    payload: ListingTaskPayload,
-    queue_manager: QueueManager,
-) -> int:
-    source = crawler.source
-    selector = source.source_selectors.articles
-    if not selector:
-        logger.warning(
-            "No article selector configured for HTML source %s",
-            source.source_id,
-        )
-        return 0
-
-    page_range = crawler.config.page_range or crawler.get_pagination()
-    queued = 0
-
-    for page in range(page_range.start, page_range.end + 1):
-        page_url = crawler._build_page_url(page)
-        try:
-            soup = crawler.crawl(page_url, page)
-        except Exception as exc:  # noqa: BLE001
-            logger.exception("Failed to crawl page %s: %s", page_url, exc)
-            continue
-
-        for node in soup.select(selector):
-            link = crawler._extract_link(node)
-            if not link:
-                continue
-            queue_manager.enqueue_article(
-                ArticleTaskPayload(
-                    source_id=payload.source_id,
-                    env=payload.env,
-                    url=link,
-                    date_range=payload.date_range,
-                    category=payload.category,
-                )
-            )
-            queued += 1
-
-    return queued
-
-
-def _collect_wordpress_listing(
-    crawler: WordpressCrawler,
-    payload: ListingTaskPayload,
-    queue_manager: QueueManager,
-) -> int:
-    page_range = crawler.config.page_range or crawler.get_pagination()
-    queued = 0
-
-    for page in range(page_range.start, page_range.end + 1):
-        endpoint = crawler._posts_endpoint(page)
-        try:
-            response = crawler.client.get(endpoint)
-            articles = response.json()
-        except Exception as exc:  # noqa: BLE001
-            logger.exception("Failed to fetch WordPress page %s: %s", endpoint, exc)
-            continue
-
-        if not isinstance(articles, list):
-            logger.warning("Unexpected WordPress payload type: %s", type(articles))
-            continue
-
-        for entry in articles:
-            queue_manager.enqueue_article(
-                ArticleTaskPayload(
-                    source_id=payload.source_id,
-                    env=payload.env,
-                    url=entry.get("link"),
-                    data=entry,
-                    date_range=payload.date_range,
-                    category=payload.category,
-                )
-            )
-            queued += 1
-
-    return queued
-
-
-def _collect_html_article(
-    crawler: HtmlCrawler,
-    payload: ArticleTaskPayload,
-) -> Article:
-    if not payload.url:
-        logger.warning("Missing article url for HTML source %s", payload.source_id)
-        raise ValueError("Missing article url")
-
-    crawler._current_article_url = payload.url  # type: ignore[attr-defined]
-    try:
-        soup = crawler.crawl(payload.url)
-    except Exception as exc:  # noqa: BLE001
-        logger.exception("Failed to crawl article %s: %s", payload.url, exc)
-        raise exc
-
-    return crawler.fetch_one(str(soup), crawler.config.date_range)
-
-
-def _collect_wordpress_article(
-    crawler: WordpressCrawler,
-    payload: ArticleTaskPayload,
-) -> Article:
-    if payload.data is None:
-        logger.warning("Missing WordPress payload for source %s", payload.source_id)
-        raise ValueError("Missing WordPress payload")
-
-    return crawler.fetch_one(payload.data, crawler.config.date_range)
queued += 1 - - return queued - - -def _collect_html_article( - crawler: HtmlCrawler, - payload: ArticleTaskPayload, -) -> Article: - if not payload.url: - logger.warning("Missing article url for HTML source %s", payload.source_id) - raise ValueError("Missing article url") - - crawler._current_article_url = payload.url # type: ignore[attr-defined] - try: - soup = crawler.crawl(payload.url) - except Exception as exc: # noqa: BLE001 - logger.exception("Failed to crawl article %s: %s", payload.url, exc) - raise exc - - return crawler.fetch_one(str(soup), crawler.config.date_range) - - -def _collect_wordpress_article( - crawler: WordpressCrawler, - payload: ArticleTaskPayload, -) -> Article: - if payload.data is None: - logger.warning("Missing WordPress payload for source %s", payload.source_id) - raise ValueError("Missing WordPress payload") - - return crawler.fetch_one(payload.data, crawler.config.date_range) diff --git a/projects/crawler/src/basango/services/crawler/async/worker.py b/projects/crawler/src/basango/services/crawler/async/worker.py deleted file mode 100644 index 0903c9b..0000000 --- a/projects/crawler/src/basango/services/crawler/async/worker.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Worker bootstrap for RQ queues. - -Defaults to the `articles` queue to prioritise article detail processing. -`SimpleWorker` is exposed for environments where forking is unstable (e.g., -some macOS setups). Use `burst=True` for CI or one-shot consumption. -""" - -import logging -from typing import Sequence - -from rq import Queue, Worker, SimpleWorker - -from .queue import QueueManager, QueueSettings - - -logger = logging.getLogger(__name__) - - -def start_worker( - queue_names: Sequence[str] | None = None, - *, - settings: QueueSettings | None = None, - burst: bool = False, - simple: bool = False, -) -> None: - manager = QueueManager(settings=settings) - if queue_names is None or not list(queue_names): - queue_names = [manager.settings.article_queue] - - resolved = [manager.queue_name(name) for name in queue_names] - queues = [Queue(name, connection=manager.connection) for name in resolved] - - worker_cls = SimpleWorker if simple else Worker - logger.info( - "Starting RQ %s for queues %s", - worker_cls.__name__, - ", ".join(resolved), - ) - worker = worker_cls(queues, connection=manager.connection) - worker.work(burst=burst) diff --git a/projects/crawler/src/basango/services/crawler/async_api.py b/projects/crawler/src/basango/services/crawler/async_api.py deleted file mode 100644 index 7b98ddc..0000000 --- a/projects/crawler/src/basango/services/crawler/async_api.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Thin indirection layer around async components (queues, tasks, worker). - -We import symbols dynamically to avoid importing optional runtime dependencies -like RQ and Redis at module import time. This keeps regular (sync) crawling -usable even if async deps aren't installed, and avoids circular imports when -RQ workers import task callables by string path. 
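`async` is also a Python keyword, so these modules cannot be reached through
regular import syntax at all; `import_module` string lookups are required.
For example, RQ resolves task callables from dotted paths at execution time
(a sketch; `queue` and `payload` are illustrative names):

    queue.enqueue("basango.services.crawler.async.tasks.collect_article", payload)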
-""" - -from importlib import import_module - -_async_queue = import_module("basango.services.crawler.async.queue") -_async_tasks = import_module("basango.services.crawler.async.tasks") -_async_worker = import_module("basango.services.crawler.async.worker") -_async_schemas = import_module("basango.services.crawler.async.schemas") - -QueueManager = getattr(_async_queue, "QueueManager") -QueueSettings = getattr(_async_queue, "QueueSettings") -ListingTaskPayload = getattr(_async_schemas, "ListingTaskPayload") -ArticleTaskPayload = getattr(_async_schemas, "ArticleTaskPayload") -ProcessedTaskPayload = getattr(_async_schemas, "ProcessedTaskPayload") -schedule_async_crawl = getattr(_async_tasks, "schedule_async_crawl") -collect_listing = getattr(_async_tasks, "collect_listing") -collect_article = getattr(_async_tasks, "collect_article") -forward_for_processing = getattr(_async_tasks, "forward_for_processing") -start_worker = getattr(_async_worker, "start_worker") - -__all__ = [ - "QueueManager", - "QueueSettings", - "ListingTaskPayload", - "ArticleTaskPayload", - "ProcessedTaskPayload", - "schedule_async_crawl", - "collect_listing", - "collect_article", - "forward_for_processing", - "start_worker", -] diff --git a/projects/crawler/src/basango/services/crawler/base_crawler.py b/projects/crawler/src/basango/services/crawler/base_crawler.py deleted file mode 100644 index 0584610..0000000 --- a/projects/crawler/src/basango/services/crawler/base_crawler.py +++ /dev/null @@ -1,161 +0,0 @@ -import logging -from abc import ABC, abstractmethod -from dataclasses import asdict, is_dataclass -from datetime import datetime -from typing import Optional, Any, Dict, List, Sequence - -from basango.domain.article import Article -from bs4 import BeautifulSoup -from pydantic import HttpUrl - -from basango.core.config import CrawlerConfig, ClientConfig -from basango.domain import DateRange, SourceKind, PageRange -from basango.domain.exception import ArticleOutOfRange -from basango.services import ( - HttpClient, - DateParser, - OpenGraphProvider, - BasePersistor, - Tokenizer, -) - - -class BaseCrawler(ABC): - """ - Base building blocks shared by concrete crawlers. - - Notable conventions - - `skip`: raises `ArticleOutOfRange` when an item falls outside the desired - date range. Callers catch it to stop pagination early. - - `record_article`: normalises metadata (including dataclasses) before - handing off to persistors. 
- """ - - def __init__( - self, - crawler_config: CrawlerConfig, - client_config: ClientConfig, - persistors: Sequence[BasePersistor] | None = None, - ) -> None: - self.config = crawler_config - self.source = crawler_config.source - self.client = HttpClient(client_config=client_config) - self.persistors: list[BasePersistor] = list(persistors) if persistors else [] - self.date_parser = DateParser() - self.open_graph = OpenGraphProvider() - self.tokenizer = Tokenizer() - - @abstractmethod - def fetch(self) -> None: - pass - - def crawl(self, url: str, page: Optional[int] = None) -> BeautifulSoup: - if page is not None: - logging.info(f"> Page {page}") - - response = self.client.get(url).text - return BeautifulSoup(response, "html.parser") - - def save_article( - self, - *, - title: str, - link: str, - body: str, - categories: List[str], - timestamp: int, - metadata: Any = None, - ) -> Article: - if metadata is None: - metadata_value = None - elif is_dataclass(metadata) and not isinstance(metadata, type): - metadata_value = asdict(metadata) - elif isinstance(metadata, dict): - metadata_value = metadata - else: - metadata_value = None - - # Get source_id and ensure it's a string - source_id = getattr(self.source, "source_id", None) - if source_id is None: - source_id = "unknown" - - article = Article( - title=title, - link=HttpUrl(link), # Convert str to HttpUrl - body=body, - categories=categories, - source=source_id, # Ensure it's a string, not None - timestamp=datetime.fromtimestamp( - timestamp - ), # Convert int timestamp to datetime - metadata=metadata_value, - ) - article.token_statistics = self.tokenizer.count_tokens( - article.title, article.body, article.categories - ) - - self._persist(article.to_dict()) - logging.info("> %s [saved]", article.title) - - return article - - @abstractmethod - def fetch_one( - self, html: str, date_range: Optional[DateRange] = None - ) -> Article | None: - pass - - @abstractmethod - def get_pagination(self) -> PageRange: - pass - - def get_last_page(self) -> int: - return 1 - - @staticmethod - @abstractmethod - def supports() -> SourceKind: - pass - - @classmethod - def initialize(cls) -> None: - logging.info("Initializing Crawler") - - def completed(self, notify: bool = False) -> None: - logging.info("Crawling completed") - if notify: - logging.info("Sending notification about completion") - # TODO: Implement notification logic here - self._shutdown_persistors() - - @classmethod - def skip(cls, date_range: DateRange, timestamp: str, title: str, date: str) -> None: - if date_range.out_range(int(timestamp)): - # Use an exception to unwind to the crawl loop and stop as soon as - # we detect items beyond the configured range. 
- raise ArticleOutOfRange.create(timestamp, date_range) - - logging.warning(f"> {title} [Skipped {date}]") - - def _persist(self, article: Dict[str, Any]) -> None: - for persistor in self.persistors: - try: - persistor.persist(article) - except Exception as exc: # noqa: BLE001 - logging.exception( - "Failed to persist article via %s: %s", - persistor.__class__.__name__, - exc, - ) - - def _shutdown_persistors(self) -> None: - for persistor in self.persistors: - try: - persistor.close() - except Exception as exc: # noqa: BLE001 - logging.exception( - "Failed to close persistor %s: %s", - persistor.__class__.__name__, - exc, - ) diff --git a/projects/crawler/src/basango/services/crawler/html_crawler.py b/projects/crawler/src/basango/services/crawler/html_crawler.py deleted file mode 100644 index afe75c9..0000000 --- a/projects/crawler/src/basango/services/crawler/html_crawler.py +++ /dev/null @@ -1,322 +0,0 @@ -import logging -import re -from datetime import datetime, timezone -from typing import Optional, cast, override, Sequence -from urllib.parse import parse_qs, urljoin, urlparse - -from basango.domain.article import Article -from bs4 import BeautifulSoup, Tag -from markdownify import markdownify - -from basango.core.config import CrawlerConfig, ClientConfig -from basango.core.config.source_config import HtmlSourceConfig -from basango.domain import DateRange, PageRange, SourceKind -from basango.domain.exception import ArticleOutOfRange -from basango.services.crawler.base_crawler import BaseCrawler -from basango.services import BasePersistor - - -class HtmlCrawler(BaseCrawler): - """ - Generic HTML crawler driven by CSS selectors. - - Strategy - - Listing pages are iterated to extract per-article links or blocks. - - When `requires_details` is set, a second request fetches the article page - to extract full content; otherwise the article block is parsed inline. - - Pagination is inferred from a template and last-page discovery heuristics - (regex or query string `page` fallback). 
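A matching source definition might look like (values are illustrative):

    HtmlSourceConfig(
        source_id="example",
        source_url=HttpUrl("https://example.com"),
        pagination_template="news?page={page}",
        source_selectors=SourceSelectors(
            articles="div.post",
            article_link="a",
            pagination="ul.pagination > li a",
        ),
    )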
- """ - - def __init__( - self, - crawler_config: CrawlerConfig, - client_config: ClientConfig, - persistors: Sequence[BasePersistor] | None = None, - ) -> None: - super().__init__(crawler_config, client_config, persistors=persistors) - if not self.source or self.source.source_kind != SourceKind.HTML: - raise ValueError("HtmlCrawler requires a source of kind HTML") - - self.source = cast(HtmlSourceConfig, self.source) - self._current_article_url: Optional[str] = None - - @override - def fetch(self) -> None: - self.initialize() - page_range = self.config.page_range or self.get_pagination() - date_range = self.config.date_range - - article_selector = self.source.source_selectors.articles - if not article_selector: - logging.error( - "No article selector configured for HTML source %s", - self.source.source_id, - ) - return - - stop = False - for page_number in range(page_range.start, page_range.end + 1): - page_url = self._build_page_url(page_number) - try: - soup = self.crawl(page_url, page_number) - except Exception as exc: # noqa: BLE001 - logging.error( - "> page %s => %s [failed]", - page_number, - exc, - ) - continue - - articles = soup.select(article_selector) - if not articles: - logging.info("No articles found on page %s", page_number) - continue - - for article in articles: - try: - self._current_article_url = self._extract_link(article) - target_html = str(article) - - if self.source.requires_details: - if not self._current_article_url: - logging.debug( - "Skipping article without link for details on page %s", - page_number, - ) - continue - try: - detail_soup = self.crawl(self._current_article_url) - target_html = str(detail_soup) - except Exception as detail_exc: # noqa: BLE001 - logging.error( - "Failed to fetch detail page %s: %s", - self._current_article_url, - detail_exc, - ) - continue - - self.fetch_one(target_html, date_range) - except ArticleOutOfRange: - # Using an exception to short-circuit nested loops keeps the - # happy path tidy (no extra flags at each extraction site). 
- logging.info("No more articles to fetch in this range.") - stop = True - break - except Exception as exc: # noqa: BLE001 - logging.error( - "Failed to process article on %s: %s", - page_url, - exc, - ) - finally: - self._current_article_url = None - - if stop: - break - - self.completed(self.config.notify) - - @override - def fetch_one(self, html: str, date_range: Optional[DateRange] = None) -> Article: - soup = BeautifulSoup(html, "html.parser") - selectors = self.source.source_selectors - - title = self._extract_text(soup, selectors.article_title) or "Untitled" - link = self._current_article_url or self._extract_link(soup) - if not link: - logging.warning("Skipping article '%s' without link", title) - raise ValueError("Missing article link") - - body = self._extract_body(soup, selectors.article_body) - categories = self._extract_categories(soup, selectors.article_categories) - if not categories and self.config.category: - categories = [self.config.category] - - raw_date = self._extract_text(soup, selectors.article_date) - timestamp = self._compute_timestamp(raw_date) - - if date_range and not date_range.in_range(timestamp): - self.skip(date_range, str(timestamp), title, raw_date or "") - - metadata = self.open_graph.consume_html(html) - - return self.save_article( - title=title, - link=link, - body=body, - categories=categories, - timestamp=timestamp, - metadata=metadata, - ) - - @override - def get_pagination(self) -> PageRange: - return PageRange.create(f"0:{self.get_last_page()}") - - @override - def get_last_page(self) -> int: - if not self.source: - return 1 - - if self.source.supports_categories and self.config.category: - path = self.source.pagination_template.replace( - "{category}", self.config.category - ) - else: - path = self.source.pagination_template - - links = self.crawl(f"{self.source.source_url}{path}").select( - self.source.source_selectors.pagination - ) - if not links: - return 1 - - href = links[-1].get("href") - if not href or not isinstance(href, str): - return 1 - - # Heuristic: last pagination link either contains the page number - # directly or as a `page` query param. Prefer regex first to support - # path-style pagination (e.g., /page/4/). - match = re.search(r"(\d+)", href) - if match: - return int(match.group(1)) - - queries = parse_qs(urlparse(href).query) - if "page" in queries and queries["page"]: - try: - return int(queries["page"][0]) - except ValueError: - return 1 - return 1 - - @staticmethod - @override - def supports() -> SourceKind: - return SourceKind.HTML - - def _build_page_url(self, page: int) -> str: - template = self._apply_category(self.source.pagination_template) - if "{page}" in template: - template = template.format(page=page) - elif page > 0: - separator = "&" if "?" in template else "?" - template = f"{template}{separator}page={page}" - - base = str(self.source.source_url) - if not base.endswith("/"): - base = f"{base}/" - return urljoin(base, template.lstrip("/")) - - def _apply_category(self, template: str) -> str: - if "{category}" in template: - replacement = self.config.category or "" - return template.replace("{category}", replacement) - return template - - def _extract_link(self, node: BeautifulSoup | Tag) -> Optional[str]: - selector = self.source.source_selectors.article_link - if not selector: - return None - - target = node.select_one(selector) - if not target: - return None - - # Support a few common attributes for link-like elements (href, - # data-href, src) to tolerate variations in markup without custom code. 
- raw_href = target.get("href") or target.get("data-href") or target.get("src") - href: Optional[str] - if isinstance(raw_href, str): - href = raw_href.strip() or None - elif isinstance(raw_href, list): - href = next( - ( - item.strip() - for item in raw_href - if isinstance(item, str) and item.strip() - ), - None, - ) - else: - href = None - if not href: - return None - return self._to_absolute_url(href) - - def _to_absolute_url(self, href: str) -> str: - base = str(self.source.source_url) - if not base.endswith("/"): - base = f"{base}/" - return urljoin(base, href) - - @staticmethod - def _extract_text( - node: BeautifulSoup | Tag, selector: Optional[str] - ) -> Optional[str]: - if not selector: - return None - target = node.select_one(selector) - if not target: - return None - - if target.name == "img": - for attr in ("alt", "title"): - value = target.get(attr) - if isinstance(value, str): - stripped = value.strip() - if stripped: - return stripped - elif isinstance(value, list): - for item in value: - if isinstance(item, str): - stripped = item.strip() - if stripped: - return stripped - - text = target.get_text(" ", strip=True) - return text or None - - @staticmethod - def _extract_body(node: BeautifulSoup | Tag, selector: Optional[str]) -> str: - if selector: - matches = node.select(selector) - if matches: - parts = [ - markdownify(item.get_text(" ", strip=False), heading_style="ATX") - for item in matches - if item.get_text(strip=True) - ] - if parts: - # Join without separators: callers can post-process if - # needed, but this preserves maximum fidelity. - return "\n".join(parts) - return markdownify(node.get_text(" ", strip=False), heading_style="ATX") - - @staticmethod - def _extract_categories( - node: BeautifulSoup | Tag, selector: Optional[str] - ) -> list[str]: - if not selector: - return [] - - values: list[str] = [] - for item in node.select(selector): - text = item.get_text(" ", strip=True) - if text: - lower = text.lower() - if lower not in values: - values.append(lower) - return values - - def _compute_timestamp(self, raw_date: Optional[str]) -> int: - if not raw_date: - return int(datetime.now(timezone.utc).timestamp()) - - return self.date_parser.create_timestamp( - raw_date.strip(), - fmt=self.source.source_date.format, - pattern=self.source.source_date.pattern, - replacement=self.source.source_date.replacement, - ) diff --git a/projects/crawler/src/basango/services/crawler/wordpress_crawler.py b/projects/crawler/src/basango/services/crawler/wordpress_crawler.py deleted file mode 100644 index 2bd17bc..0000000 --- a/projects/crawler/src/basango/services/crawler/wordpress_crawler.py +++ /dev/null @@ -1,187 +0,0 @@ -import json -import logging -from datetime import datetime, timezone -from typing import Optional, override, cast, Final, Any, Sequence - -from markdownify import markdownify - -from basango.domain.article import Article -from bs4 import BeautifulSoup - -from basango.core.config import WordPressSourceConfig, CrawlerConfig, ClientConfig -from basango.domain import SourceKind, PageRange, DateRange -from basango.domain.exception import ArticleOutOfRange -from basango.services.crawler.base_crawler import BaseCrawler -from basango.services import BasePersistor - - -class WordpressCrawler(BaseCrawler): - """ - WordPress REST API crawler. - - It uses the `/wp-json/wp/v2/posts` endpoints and limits fields to reduce - payload size. Pagination is driven by WordPress headers `x-wp-totalpages` - and `x-wp-total`. 
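A listing request therefore looks like (host is illustrative):

    https://example.com/wp-json/wp/v2/posts?_fields=date,slug,link,title.rendered,content.rendered,categories&orderby=date&order=desc&page=1&per_page=100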
Category IDs are mapped to slugs via a secondary endpoint - and cached per run. - """ - - def __init__( - self, - crawler_config: CrawlerConfig, - client_config: ClientConfig, - persistors: Sequence[BasePersistor] | None = None, - ) -> None: - super().__init__(crawler_config, client_config, persistors=persistors) - if not self.source or self.source.source_kind != SourceKind.WORDPRESS: - raise ValueError("WordpressCrawler requires a source of kind WORDPRESS") - - self.source = cast(WordPressSourceConfig, self.source) - self.category_map: dict[int, str] = {} - - POST_QUERY: Final = "_fields=date,slug,link,title.rendered,content.rendered,categories&orderby=date&order=desc" - CATEGORY_QUERY: Final = ( - "_fields=id,slug,count&orderby=count&order=desc&per_page=100" - ) - TOTAL_PAGES_HEADER: Final = "x-wp-totalpages" - TOTAL_POSTS_HEADER: Final = "x-wp-total" - - @override - def fetch(self) -> None: - self.initialize() - page_range = self.config.page_range or self.get_pagination() - date_range = self.config.date_range - - stop = False - for page_number in range(page_range.start, page_range.end + 1): - endpoint = self._posts_endpoint(page_number) - try: - response = self.client.get(endpoint) - payload = response.text - articles = json.loads(payload) - except Exception as exc: # noqa: BLE001 - logging.error( - "> page %s => %s [failed]", - page_number, - exc, - ) - continue - - for article in articles: - try: - self.fetch_one(article, date_range) - except ArticleOutOfRange: - # Same early-exit semantic as HtmlCrawler - logging.info("No more articles to fetch in this range.") - stop = True - break - except Exception as exc: # noqa: BLE001 - logging.error( - "Failed to process WordPress article on page %s: %s", - page_number, - exc, - ) - if stop: - break - - self.completed(self.config.notify) - - @override - def fetch_one(self, html: Any, date_range: Optional[DateRange] = None) -> Article: - try: - data = json.loads(html) if isinstance(html, str) else html - except json.JSONDecodeError as exc: - logging.error("Failed to decode WordPress payload: %s", exc) - raise exc - - if not isinstance(data, dict): - logging.error("Skipping unexpected WordPress payload: %s", type(data)) - raise ValueError("Unexpected WordPress payload type") - - link = data.get("link") - if not link: - logging.error("Skipping WordPress article without link") - raise ValueError("WordPress article without link") - - title_html = data.get("title", {}).get("rendered", "") - body_html = data.get("content", {}).get("rendered", "") - - title = BeautifulSoup(title_html, "html.parser").get_text(" ", strip=True) - body = markdownify( - BeautifulSoup(body_html, "html.parser").get_text(" ", strip=False), - heading_style="ATX", - ) - timestamp = self._compute_timestamp(data.get("date")) - - categories_value = self._map_categories(data.get("categories", [])) - categories = [item for item in categories_value.split(",") if item] - - if date_range and not date_range.in_range(timestamp): - self.skip(date_range, str(timestamp), title, data.get("date", "")) - - metadata = self.open_graph.consume_url(link) - - return self.save_article( - title=title or data.get("slug", "Untitled"), - link=link, - body=body, - categories=categories, - timestamp=timestamp, - metadata=metadata, - ) - - @override - def get_pagination(self) -> PageRange: - response = self.client.get( - f"{self.source.source_url}wp-json/wp/v2/posts?_fields=id&per_page=100" - ) - pages = int(response.headers.get(self.TOTAL_PAGES_HEADER, "1")) - posts = 
int(response.headers.get(self.TOTAL_POSTS_HEADER, "0")) - - logging.info("WordPress Pagination %s posts in %s pages", posts, pages) - return PageRange.create(f"1:{pages}") - - def _fetch_categories(self) -> None: - response = self.client.get( - f"{self.source.source_url}wp-json/wp/v2/categories?{self.CATEGORY_QUERY}" - ) - for category in response.json(): - self.category_map[int(category["id"])] = category["slug"] - - def _map_categories(self, categories: list[int]) -> str: - if not self.category_map: - self._fetch_categories() - return ",".join( - self.category_map[category] - for category in sorted(categories) - if category in self.category_map - ) - - def _posts_endpoint(self, page: int) -> str: - base = str(self.source.source_url) - if not base.endswith("/"): - base = f"{base}/" - return f"{base}wp-json/wp/v2/posts?{self.POST_QUERY}&page={page}&per_page=100" - - @staticmethod - def _compute_timestamp(raw: Optional[str]) -> int: - if not raw: - return int(datetime.now(timezone.utc).timestamp()) - - cleaned = raw.replace("Z", "+00:00") - try: - dt = datetime.fromisoformat(cleaned) - except ValueError: - return int(datetime.now(timezone.utc).timestamp()) - - if dt.tzinfo is None: - dt = dt.replace(tzinfo=timezone.utc) - return int(dt.timestamp()) - - @override - def get_last_page(self) -> int: - return 1 - - @staticmethod - @override - def supports() -> SourceKind: - return SourceKind.WORDPRESS diff --git a/projects/crawler/src/basango/services/date_parser.py b/projects/crawler/src/basango/services/date_parser.py deleted file mode 100644 index c6e6ee1..0000000 --- a/projects/crawler/src/basango/services/date_parser.py +++ /dev/null @@ -1,82 +0,0 @@ -import logging -import re -from datetime import datetime, timezone -from typing import Optional - - -class DateParser: - MONTHS = { - "janvier": "01", - "février": "02", - "mars": "03", - "avril": "04", - "mai": "05", - "juin": "06", - "juillet": "07", - "août": "08", - "septembre": "09", - "octobre": "10", - "novembre": "11", - "décembre": "12", - } - - DAYS = { - "dimanche": "0", - "lundi": "1", - "mardi": "2", - "mercredi": "3", - "jeudi": "4", - "vendredi": "5", - "samedi": "6", - } - - DEFAULT_DATE_FORMAT = "%Y-%m-%d %H:%M" - - @classmethod - def _apply_substitution( - cls, date: str, pattern: Optional[str], replacement: Optional[str] - ) -> str: - if not pattern or replacement is None: - return date - - # Accept PHP-like patterns with leading/trailing slashes - if len(pattern) >= 2 and pattern[0] == "/" and pattern.rfind("/") > 0: - pattern = pattern[1 : pattern.rfind("/")] - - # Convert $1 to \1 for Python - replacement = re.sub(r"\$(\d+)", r"\\\1", replacement) - try: - return re.sub(pattern, replacement, date) - except re.error: - logging.error(f"[DateParser] Could not convert {pattern} to {replacement}") - return date - - def create_timestamp( - self, - date: str, - fmt: Optional[str] = None, - pattern: Optional[str] = None, - replacement: Optional[str] = None, - ) -> int: - # Normalize and translate French day/month words - date = date.lower() - for k, v in self.DAYS.items(): - date = date.replace(k, v) - for k, v in self.MONTHS.items(): - date = date.replace(k, v) - - # Optional regex transform - date = self._apply_substitution(date, pattern, replacement) - fmt = fmt or self.DEFAULT_DATE_FORMAT - - try: - dt = datetime.strptime(date, fmt).replace(tzinfo=timezone.utc) - return int(dt.timestamp()) - except Exception as e: - logging.error( - f"[DateParser] Could not parse date '{date}' with format '{fmt}': {e}" - ) - dt = 
datetime.now(timezone.utc).replace( - hour=0, minute=0, second=0, microsecond=0 - ) - return int(dt.timestamp()) diff --git a/projects/crawler/src/basango/services/http_client/__init__.py b/projects/crawler/src/basango/services/http_client/__init__.py deleted file mode 100644 index d0f18a2..0000000 --- a/projects/crawler/src/basango/services/http_client/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .base_http_client import BaseHttpClient -from .sync_http_client import SyncHttpClient -from .async_http_client import AsyncHttpClient - -__all__ = [ - "BaseHttpClient", - "SyncHttpClient", - "AsyncHttpClient", -] diff --git a/projects/crawler/src/basango/services/http_client/async_http_client.py b/projects/crawler/src/basango/services/http_client/async_http_client.py deleted file mode 100644 index c405d63..0000000 --- a/projects/crawler/src/basango/services/http_client/async_http_client.py +++ /dev/null @@ -1,121 +0,0 @@ -import asyncio -from dataclasses import dataclass, field - -import httpx - -from .base_http_client import ( - BaseHttpClient, - HttpData, - HttpHeaders, - HttpParams, - TRANSIENT_STATUSES, -) - - -@dataclass -class AsyncHttpClient(BaseHttpClient): - _client: httpx.AsyncClient = field(init=False, repr=False) - - def __post_init__(self) -> None: - super().__post_init__() - self._client = httpx.AsyncClient( - follow_redirects=self.client_config.follow_redirects, - max_redirects=5, - verify=self.client_config.verify_ssl, - timeout=self.client_config.timeout, - headers=dict(self._headers), - ) - - async def __aenter__(self) -> "AsyncHttpClient": - return self - - async def __aexit__(self, exc_type, exc, tb) -> None: - await self.aclose() - - def close(self) -> None: - if self._client.is_closed: - return - try: - loop = asyncio.get_running_loop() - except RuntimeError: # no running loop - asyncio.run(self.aclose()) - else: - loop.create_task(self.aclose()) - - async def aclose(self) -> None: - try: - await self._client.aclose() - except Exception: # noqa: BLE001 - pass - - async def _request( - self, - method: str, - url: str, - *, - headers: HttpHeaders = None, - params: HttpParams = None, - data: HttpData = None, - json: HttpData = None, - ) -> httpx.Response: - attempt = 0 - while True: - try: - response = await self._client.request( - method, - url, - headers=self._build_headers(headers), - params=params, - data=data, - json=json, - ) - if ( - response.status_code in TRANSIENT_STATUSES - ) and attempt < self.client_config.max_retries: - await asyncio.sleep(self._retry_delay(attempt, response)) - attempt += 1 - continue - response.raise_for_status() - return response - except httpx.HTTPStatusError as exc: - status = exc.response.status_code if exc.response else 0 - if ( - status in TRANSIENT_STATUSES - ) and attempt < self.client_config.max_retries: - await asyncio.sleep(self._retry_delay(attempt, exc.response)) - attempt += 1 - continue - raise - except httpx.RequestError: - if attempt < self.client_config.max_retries: - await asyncio.sleep(self._compute_backoff(attempt)) - attempt += 1 - continue - raise - - async def get( - self, - url: str, - *, - headers: HttpHeaders = None, - params: HttpParams = None, - ) -> httpx.Response: - return await self._request("GET", url, headers=headers, params=params) - - async def post( - self, - url: str, - *, - headers: HttpHeaders = None, - params: HttpParams = None, - data: HttpData = None, - json: HttpData = None, - ) -> httpx.Response: - return await self._request( - "POST", - url, - headers=headers, - params=params, - data=data, - 
json=json, - ) diff --git a/projects/crawler/src/basango/services/http_client/base_http_client.py b/projects/crawler/src/basango/services/http_client/base_http_client.py deleted file mode 100644 index 90e0994..0000000 --- a/projects/crawler/src/basango/services/http_client/base_http_client.py +++ /dev/null @@ -1,87 +0,0 @@ -import random -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from datetime import datetime, timezone -from email.utils import parsedate_to_datetime -from typing import Any, Optional, TypeAlias - -import httpx - -from basango.core.config import ClientConfig -from basango.services.user_agents import UserAgents - -HttpHeaders: TypeAlias = dict[str, str] | None -HttpParams: TypeAlias = dict[str, Any] | None -HttpData: TypeAlias = Any | None - -TRANSIENT_STATUSES = (429, 500, 502, 503, 504) - - -@dataclass -class BaseHttpClient(ABC): - client_config: ClientConfig - user_agent_provider: UserAgents | None = None - default_headers: HttpHeaders = None - _user_agent: str = field(init=False, repr=False) - _headers: dict[str, str] = field(init=False, repr=False) - - def __post_init__(self) -> None: - provider = self.user_agent_provider or UserAgents( - rotate=self.client_config.rotate, - fallback=self.client_config.user_agent, - ) - user_agent = provider.get() - self._user_agent = user_agent if user_agent else self.client_config.user_agent - - headers = {"User-Agent": self._user_agent} - if self.default_headers: - headers.update(self.default_headers) - self._headers = headers - - def _compute_backoff(self, attempt: int) -> float: - base = min( - self.client_config.backoff_initial - * (self.client_config.backoff_multiplier**attempt), - self.client_config.backoff_max, - ) - jitter = random.uniform(0, base * 0.25) - return base + jitter - - def _retry_delay( - self, attempt: int, response: Optional[httpx.Response] = None - ) -> float: - delay = 0.0 - if response is not None and self.client_config.respect_retry_after: - retry_after = ( - response.headers.get("Retry-After") if response.headers else None - ) - if retry_after: - delay = self._parse_retry_after(retry_after) - - if delay == 0.0: - delay = self._compute_backoff(attempt) - return delay - - @staticmethod - def _parse_retry_after(header_value: str) -> float: - try: - return max(0.0, float(int(header_value))) - except (TypeError, ValueError): - try: - dt = parsedate_to_datetime(header_value) - if dt.tzinfo is None: - dt = dt.replace(tzinfo=timezone.utc) - now = datetime.now(timezone.utc) - return max(0.0, (dt - now).total_seconds()) - except Exception: # noqa: BLE001 - return 0.0 - - def _build_headers(self, headers: HttpHeaders = None) -> dict[str, str]: - merged = dict(self._headers) - if headers: - merged.update(headers) - return merged - - @abstractmethod - def close(self) -> None: # pragma: no cover - enforced by subclasses - """Close the underlying HTTPX client.""" diff --git a/projects/crawler/src/basango/services/http_client/sync_http_client.py b/projects/crawler/src/basango/services/http_client/sync_http_client.py deleted file mode 100644 index eeae8a7..0000000 --- a/projects/crawler/src/basango/services/http_client/sync_http_client.py +++ /dev/null @@ -1,107 +0,0 @@ -import time -from dataclasses import dataclass, field - -import httpx - -from .base_http_client import ( - BaseHttpClient, - HttpData, - HttpHeaders, - HttpParams, - TRANSIENT_STATUSES, -) - - -@dataclass -class SyncHttpClient(BaseHttpClient): - _client: httpx.Client = field(init=False, repr=False) - - def 
__post_init__(self) -> None: - super().__post_init__() - self._client = httpx.Client( - follow_redirects=self.client_config.follow_redirects, - max_redirects=5, - verify=self.client_config.verify_ssl, - timeout=self.client_config.timeout, - headers=dict(self._headers), - ) - - def __enter__(self) -> "SyncHttpClient": - return self - - def __exit__(self, exc_type, exc, tb) -> None: - self.close() - - def close(self) -> None: - try: - self._client.close() - except Exception: # noqa: BLE001 - pass - - def _request( - self, - method: str, - url: str, - *, - headers: HttpHeaders = None, - params: HttpParams = None, - data: HttpData = None, - json: HttpData = None, - ) -> httpx.Response: - attempt = 0 - while True: - try: - response = self._client.request( - method, - url, - headers=self._build_headers(headers), - params=params, - data=data, - json=json, - ) - if ( - response.status_code in TRANSIENT_STATUSES - ) and attempt < self.client_config.max_retries: - time.sleep(self._retry_delay(attempt, response)) - attempt += 1 - continue - response.raise_for_status() - return response - except httpx.HTTPStatusError as exc: - status = exc.response.status_code if exc.response else 0 - if ( - status in TRANSIENT_STATUSES - ) and attempt < self.client_config.max_retries: - time.sleep(self._retry_delay(attempt, exc.response)) - attempt += 1 - continue - raise - except httpx.RequestError: - if attempt < self.client_config.max_retries: - time.sleep(self._compute_backoff(attempt)) - attempt += 1 - continue - raise - - def get( - self, url: str, *, headers: HttpHeaders = None, params: HttpParams = None - ) -> httpx.Response: - return self._request("GET", url, headers=headers, params=params) - - def post( - self, - url: str, - *, - headers: HttpHeaders = None, - params: HttpParams = None, - data: HttpData = None, - json: HttpData = None, - ) -> httpx.Response: - return self._request( - "POST", - url, - headers=headers, - params=params, - data=data, - json=json, - ) diff --git a/projects/crawler/src/basango/services/open_graph.py b/projects/crawler/src/basango/services/open_graph.py deleted file mode 100644 index aa2e504..0000000 --- a/projects/crawler/src/basango/services/open_graph.py +++ /dev/null @@ -1,55 +0,0 @@ -import logging -from dataclasses import dataclass -from typing import Optional - -import trafilatura - -from basango.core.config import ClientConfig -from basango.services.http_client import SyncHttpClient -from basango.services.user_agents import UserAgents - - -@dataclass -class OpenGraphObject: - title: Optional[str] = None - description: Optional[str] = None - image: Optional[str] = None - url: Optional[str] = None - - -class OpenGraphProvider: - def __init__( - self, user_agent_provider: UserAgents = UserAgents(rotate=False) - ) -> None: - self._user_agent = user_agent_provider.og() - self._http_client = SyncHttpClient( - client_config=ClientConfig(), - default_headers={"User-Agent": self._user_agent}, - ) - - def consume_url(self, url: str) -> OpenGraphObject | None: - try: - logging.info(f"[OpenGraphProvider] Consuming url: {url}") - html = self._http_client.get(url).text - return self.consume_html(html, url) - except Exception as e: - logging.exception(f"[OpenGraphProvider] Failed to consume url: {e}") - return None - - @classmethod - def consume_html( - cls, html: str, url: Optional[str] = None - ) -> OpenGraphObject | None: - try: - meta = trafilatura.extract_metadata(html, default_url=url) - if not meta: - return None - return OpenGraphObject( - title=meta.title or None, - 
description=meta.description or None, - image=meta.image or None, - url=url, - ) - except Exception as e: - logging.error(f"[OpenGraphProvider] Failed to extract metadata: {e}") - return None diff --git a/projects/crawler/src/basango/services/persistence/__init__.py b/projects/crawler/src/basango/services/persistence/__init__.py deleted file mode 100644 index a529a77..0000000 --- a/projects/crawler/src/basango/services/persistence/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .base_persistor import BasePersistor -from .csv_persistor import CsvPersistor -from .json_persistor import JsonPersistor - -__all__ = [ - "BasePersistor", - "CsvPersistor", - "JsonPersistor", -] diff --git a/projects/crawler/src/basango/services/persistence/base_persistor.py b/projects/crawler/src/basango/services/persistence/base_persistor.py deleted file mode 100644 index f4376b3..0000000 --- a/projects/crawler/src/basango/services/persistence/base_persistor.py +++ /dev/null @@ -1,14 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Mapping, Any - - -class BasePersistor(ABC): - """Abstract interface for article persistence backends.""" - - @abstractmethod - def persist(self, article: Mapping[str, Any]) -> None: - """Persist a single article payload.""" - - def close(self) -> None: # pragma: no cover - optional override - """Hook for subclasses that need explicit shutdown.""" - return None diff --git a/projects/crawler/src/basango/services/persistence/csv_persistor.py b/projects/crawler/src/basango/services/persistence/csv_persistor.py deleted file mode 100644 index 36daaf4..0000000 --- a/projects/crawler/src/basango/services/persistence/csv_persistor.py +++ /dev/null @@ -1,79 +0,0 @@ -import csv -import json -from dataclasses import dataclass, field -from pathlib import Path -from threading import Lock -from typing import Any, Mapping, Sequence - -from .base_persistor import BasePersistor - - -DEFAULT_FIELDS = ( - "title", - "link", - "body", - "categories", - "source", - "timestamp", - "metadata", -) - - -@dataclass -class CsvPersistor(BasePersistor): - data_dir: Path - source_id: str - fieldnames: Sequence[str] = DEFAULT_FIELDS - encoding: str = "utf-8" - _file_path: Path = field(init=False, repr=False) - _lock: Lock = field(default_factory=Lock, init=False, repr=False) - _header_written: bool = field(default=False, init=False, repr=False) - - def __post_init__(self) -> None: - # Pre-create output directory and detect existing header to avoid - # re-writing it across process restarts. - self.data_dir.mkdir(parents=True, exist_ok=True) - self._file_path = self.data_dir / f"{self.source_id}.csv" - if self._file_path.exists() and self._file_path.stat().st_size > 0: - self._header_written = True - - def persist(self, article: Mapping[str, Any]) -> None: - record = self._serialise(article) - # File writes are guarded by a process-local lock to tolerate threads - # sharing the same persistor instance. 
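# The Lock is per-instance and per-process only; coordinating multiple
# worker processes appending to the same CSV file is out of scope here.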
- with self._lock: - needs_header = not self._header_written or not self._file_path.exists() - with self._file_path.open( - "a", newline="", encoding=self.encoding - ) as handle: - writer = csv.DictWriter( - handle, - fieldnames=self.fieldnames, - quoting=csv.QUOTE_ALL, - lineterminator="\n", - ) - if needs_header: - writer.writeheader() - self._header_written = True - writer.writerow(record) - - def _serialise(self, article: Mapping[str, Any]) -> dict[str, Any]: - categories = article.get("categories") - if isinstance(categories, (list, tuple)): - serialised_categories = ";".join(str(item) for item in categories) - else: - serialised_categories = categories - - metadata = article.get("metadata") - if metadata is None or isinstance(metadata, str): - serialised_metadata = metadata - else: - # JSON-encode metadata to a compact, CSV-safe string; csv will quote it. - serialised_metadata = json.dumps( - metadata, ensure_ascii=True, separators=(",", ":"), sort_keys=True - ) - - record = {field: article.get(field) for field in self.fieldnames} - record["categories"] = serialised_categories - record["metadata"] = serialised_metadata - return record diff --git a/projects/crawler/src/basango/services/persistence/json_persistor.py b/projects/crawler/src/basango/services/persistence/json_persistor.py deleted file mode 100644 index 5ea729d..0000000 --- a/projects/crawler/src/basango/services/persistence/json_persistor.py +++ /dev/null @@ -1,28 +0,0 @@ -import json -from dataclasses import dataclass, field -from pathlib import Path -from threading import Lock -from typing import Any, Mapping - -from .base_persistor import BasePersistor - - -@dataclass -class JsonPersistor(BasePersistor): - data_dir: Path - source_id: str - suffix: str = ".jsonl" - encoding: str = "utf-8" - _file_path: Path = field(init=False, repr=False) - _lock: Lock = field(default_factory=Lock, init=False, repr=False) - - def __post_init__(self) -> None: - self.data_dir.mkdir(parents=True, exist_ok=True) - self._file_path = self.data_dir / f"{self.source_id}{self.suffix}" - - def persist(self, article: Mapping[str, Any]) -> None: - payload = json.dumps(article, ensure_ascii=False) - with self._lock: - with self._file_path.open("a", encoding=self.encoding) as handle: - handle.write(payload) - handle.write("\n") diff --git a/projects/crawler/src/basango/services/tokenizer.py b/projects/crawler/src/basango/services/tokenizer.py deleted file mode 100644 index c9db73d..0000000 --- a/projects/crawler/src/basango/services/tokenizer.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Tokenizer utilities for counting and encoding article text. - -This module wraps the `tiktoken` encoder to provide simple helpers for: -- encoding/decoding text to token ids -- counting tokens for different parts of an Article - -The `Tokenizer` can be constructed with either a specific `model` (preferred) -or an `encoding` name fallback. -""" - -import logging - -import tiktoken -from typing import Optional - -from basango.domain.token_statistics import TokenStatistics - - -class Tokenizer: - """Thin wrapper around tiktoken encoder for token operations.""" - - def __init__( - self, encoding: str = "cl100k_base", model: Optional[str] = None - ) -> None: - self.encoding = encoding - # Prefer model-based encoding lookup if a model is provided. 
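# e.g. Tokenizer(model="gpt-4o") resolves via tiktoken.encoding_for_model,
# while Tokenizer() keeps the cl100k_base default.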
- self.tokenizer = ( - tiktoken.encoding_for_model(model) - if model - else tiktoken.get_encoding(encoding) - ) - - def encode(self, text: str) -> list[int]: - """Encode text into a list of token ids.""" - return self.tokenizer.encode(text) - - def decode(self, tokens: list[int]) -> str: - """Decode a list of token ids back into a string.""" - return self.tokenizer.decode(tokens) - - def count_tokens( - self, title: str, body: str, categories: list[str] - ) -> TokenStatistics: - """Return token counts for the provided Article. - - The excerpt count is computed on the first 200 characters of the body - to give a quick estimate of a short preview's token length. - """ - logging.info(f"[Tokenizer] tokenizing {title}...") - return TokenStatistics( - title=len(self.encode(title)), - body=len(self.encode(body)), - excerpt=len(self.encode(body[:200])), - categories=len(self.encode(", ".join(categories))), - ) diff --git a/projects/crawler/src/basango/services/user_agents.py b/projects/crawler/src/basango/services/user_agents.py deleted file mode 100644 index ecf4f75..0000000 --- a/projects/crawler/src/basango/services/user_agents.py +++ /dev/null @@ -1,28 +0,0 @@ -import random -from dataclasses import dataclass - - -@dataclass -class UserAgents: - USER_AGENTS = [ - "Mozilla/5.0 (iPhone; CPU iPhone OS 10_4_8; like Mac OS X) AppleWebKit/603.39 (KHTML, like Gecko) Chrome/52.0.3638.271 Mobile Safari/537.5", - "Mozilla/5.0 (Linux; U; Linux x86_64; en-US) Gecko/20130401 Firefox/52.7", - "Mozilla/5.0 (Linux; U; Android 5.0; SM-P815 Build/LRX22G) AppleWebKit/600.4 (KHTML, like Gecko) Chrome/48.0.1562.260 Mobile Safari/600.0", - "Mozilla/5.0 (Windows; U; Windows NT 6.3;) AppleWebKit/533.34 (KHTML, like Gecko) Chrome/51.0.1883.215 Safari/533", - "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.3; x64; en-US Trident/4.0)", - "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_10_3) Gecko/20100101 Firefox/63.4", - "Mozilla/5.0 (Linux; Linux x86_64; en-US) AppleWebKit/603.50 (KHTML, like Gecko) Chrome/55.0.2226.116 Safari/601", - "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 7_8_3; en-US) Gecko/20100101 Firefox/68.9", - "Mozilla/5.0 (iPhone; CPU iPhone OS 8_9_8; like Mac OS X) AppleWebKit/603.34 (KHTML, like Gecko) Chrome/47.0.1126.107 Mobile Safari/602.7", - "Mozilla/5.0 (iPod; CPU iPod OS 8_2_0; like Mac OS X) AppleWebKit/601.40 (KHTML, like Gecko) Chrome/47.0.1590.178 Mobile Safari/535.2", - ] - - rotate: bool = True - fallback: str = "Basango/0.1 (+https://github.com/bernard-ng/basango)" - - def get(self) -> str: - return random.choice(self.USER_AGENTS) if self.rotate else self.fallback - - @classmethod - def og(cls) -> str: - return "facebookexternalhit/1.1" diff --git a/projects/crawler/tests/basango/domain/test_date_range.py b/projects/crawler/tests/basango/domain/test_date_range.py deleted file mode 100644 index d09d821..0000000 --- a/projects/crawler/tests/basango/domain/test_date_range.py +++ /dev/null @@ -1,57 +0,0 @@ -from datetime import datetime, timezone - -import pytest - -from basango.domain import DateRange - - -def ts(y: int, m: int, d: int, hh: int = 0, mm: int = 0, ss: int = 0) -> int: - return int(datetime(y, m, d, hh, mm, ss, tzinfo=timezone.utc).timestamp()) - - -def test_from_parses_two_dates_with_default_format() -> None: - dr = DateRange.create("2024-10-01:2024-10-08") - assert dr.start == ts(2024, 10, 1) - assert dr.end == ts(2024, 10, 8) - - -def test_str_and_format_roundtrip() -> None: - dr = DateRange.create("2024-10-01:2024-10-02") - assert str(dr) == f"{ts(2024, 10, 1)}:{ts(2024, 10, 
2)}" - assert dr.format("%Y-%m-%d") == "2024-10-01:2024-10-02" - - -def test_in_range_out_range_inclusive_boundaries() -> None: - dr = DateRange.create("2024-10-01:2024-10-02") - start = ts(2024, 10, 1) - end = ts(2024, 10, 2) - before = start - 1 - after = end + 1 - midday_end = ts(2024, 10, 2, 12, 0, 0) - - assert dr.in_range(start) is True - assert dr.in_range(end) is True - assert dr.out_range(before) is True - # End is at 00:00 of end day; times later that day are outside - assert dr.out_range(midday_end) is True - assert dr.out_range(after) is True - - -def test_backward_uses_days_and_next_day_end() -> None: - base = datetime(2024, 10, 31, tzinfo=timezone.utc) - dr = DateRange.backward(date=base, days=10) - assert dr.start == ts(2024, 10, 21) - assert dr.end == ts(2024, 11, 1) - - -def test_from_raises_on_invalid_separator_or_spec() -> None: - with pytest.raises(AssertionError): - DateRange.create("2024-10-01:2024-10-08", separator="") - with pytest.raises(AssertionError): - DateRange.create("2024-10-01", separator=":") - - -def test_from_accepts_python_format_string() -> None: - dr = DateRange.create("2024/10/01|2024/10/02", fmt="%Y/%m/%d", separator="|") - assert dr.start == ts(2024, 10, 1) - assert dr.end == ts(2024, 10, 2) diff --git a/projects/crawler/tests/basango/domain/test_page_range.py b/projects/crawler/tests/basango/domain/test_page_range.py deleted file mode 100644 index a089766..0000000 --- a/projects/crawler/tests/basango/domain/test_page_range.py +++ /dev/null @@ -1,19 +0,0 @@ -import pytest - -from basango.domain import PageRange - - -def test_it_should_create_page_range(): - pr = PageRange.create("1:10") - assert pr.start == 1 - assert pr.end == 10 - - -def test_end_page_should_be_greater_than_start_page(): - with pytest.raises(AssertionError): - PageRange.create("10:1") - - -def test_non_negative_pages(): - with pytest.raises(AssertionError): - PageRange.create("-1:-10") diff --git a/projects/crawler/tests/basango/services/crawler/test_html_crawler.py b/projects/crawler/tests/basango/services/crawler/test_html_crawler.py deleted file mode 100644 index 9fe7267..0000000 --- a/projects/crawler/tests/basango/services/crawler/test_html_crawler.py +++ /dev/null @@ -1,291 +0,0 @@ -from unittest.mock import patch - -import pytest -from bs4 import BeautifulSoup -from pydantic import HttpUrl - -from basango.core.config import WordPressSourceConfig -from basango.core.config.fetch_config import CrawlerConfig, ClientConfig -from basango.core.config.source_config import HtmlSourceConfig, SourceSelectors -from basango.domain import SourceKind, PageRange -from basango.services.crawler.html_crawler import HtmlCrawler - - -class TestHtmlCrawler: - """Test suite for HtmlCrawler.""" - - @pytest.fixture - def mock_client_config(self): - return ClientConfig() - - @pytest.fixture - def mock_html_source_config(self): - return HtmlSourceConfig( - source_id="test_source", - source_url=HttpUrl("https://example.com"), - pagination_template="news", - source_selectors=SourceSelectors(pagination="ul.pagination > li a"), - supports_categories=True, - ) - - @pytest.fixture - def mock_crawler_config(self, mock_html_source_config): - return CrawlerConfig(source=mock_html_source_config, category="tech") - - @pytest.fixture - def html_crawler(self, mock_crawler_config, mock_client_config): - return HtmlCrawler(mock_crawler_config, mock_client_config) - - def test_with_valid_html_source(self, html_crawler): - """Test __init__ with valid HTML source config.""" - assert html_crawler.source.source_kind == 
SourceKind.HTML - assert isinstance(html_crawler.source, HtmlSourceConfig) - - def test_with_invalid_source_kind_raises_error(self, mock_client_config): - """Test __init__ raises ValueError when source kind is not HTML.""" - wordpress_source = WordPressSourceConfig( - source_id="test_wordpress", - source_url=HttpUrl("https://example.com"), - ) - config = CrawlerConfig(source=wordpress_source) - - with pytest.raises( - ValueError, match="HtmlCrawler requires a source of kind HTML" - ): - HtmlCrawler(config, mock_client_config) - - def test_with_no_source_raises_error(self, mock_client_config): - """Test __init__ raises ValueError when no source is provided.""" - config = CrawlerConfig(source=None) - - with pytest.raises( - ValueError, match="HtmlCrawler requires a source of kind HTML" - ): - HtmlCrawler(config, mock_client_config) - - def test_get_pagination_returns_valid_page_range(self, html_crawler): - """Test that get_pagination returns a valid PageRange.""" - with patch.object(html_crawler, "get_last_page", return_value=5): - result = html_crawler.get_pagination() - - assert isinstance(result, PageRange) - assert result.start == 0 - assert result.end == 5 - assert str(result) == "0:5" - - def test_get_last_page_with_valid_pagination_links(self, html_crawler): - """Test get_last_page extracts page number from pagination links.""" - # Mock HTML with pagination links - mock_html = """ - - """ - mock_soup = BeautifulSoup(mock_html, "html.parser") - - with patch.object(html_crawler, "crawl", return_value=mock_soup): - result = html_crawler.get_last_page() - assert result == 10 - - def test_get_last_page_with_no_pagination_links(self, html_crawler): - """Test get_last_page returns 1 when no pagination links found.""" - mock_html = "
<div>No pagination here</div>
" - mock_soup = BeautifulSoup(mock_html, "html.parser") - - with patch.object(html_crawler, "crawl", return_value=mock_soup): - result = html_crawler.get_last_page() - assert result == 1 - - def test_get_last_page_with_empty_href(self, html_crawler): - """Test get_last_page returns 1 when href is empty or None.""" - mock_html = """ - - """ - mock_soup = BeautifulSoup(mock_html, "html.parser") - - with patch.object(html_crawler, "crawl", return_value=mock_soup): - result = html_crawler.get_last_page() - assert result == 1 - - def test_get_last_page_with_regex_extraction(self, html_crawler): - """Test get_last_page extracts page number using regex.""" - mock_html = """ - - """ - mock_soup = BeautifulSoup(mock_html, "html.parser") - - with patch.object(html_crawler, "crawl", return_value=mock_soup): - result = html_crawler.get_last_page() - assert result == 25 - - def test_get_last_page_with_query_parameters(self, html_crawler): - """Test get_last_page extracts page number from query parameters.""" - mock_html = """ - - """ - mock_soup = BeautifulSoup(mock_html, "html.parser") - - with patch.object(html_crawler, "crawl", return_value=mock_soup): - result = html_crawler.get_last_page() - assert result == 15 - - def test_get_last_page_with_invalid_page_parameter(self, html_crawler): - """Test get_last_page returns 1 when page parameter is invalid.""" - mock_html = """ - - """ - mock_soup = BeautifulSoup(mock_html, "html.parser") - - with patch.object(html_crawler, "crawl", return_value=mock_soup): - result = html_crawler.get_last_page() - assert result == 1 - - def test_get_last_page_with_category_support(self, html_crawler): - """Test get_last_page uses category in URL when supported.""" - mock_html = """ - - """ - mock_soup = BeautifulSoup(mock_html, "html.parser") - - with patch.object(html_crawler, "crawl") as mock_crawl: - mock_crawl.return_value = mock_soup - html_crawler.get_last_page() - - # The URL construction concatenates source_url with the path - # Since the template doesn't contain {category}, it should remain unchanged - expected_url = "https://example.com/news" - mock_crawl.assert_called_once_with(expected_url) - - def test_get_last_page_with_category_template(self, mock_client_config): - """Test get_last_page uses category replacement when template contains {category}.""" - source_config = HtmlSourceConfig( - source_id="test_source", - source_url=HttpUrl("https://example.com"), - pagination_template="news/{category}", - source_selectors=SourceSelectors(pagination="ul.pagination > li a"), - supports_categories=True, - ) - crawler_config = CrawlerConfig(source=source_config, category="tech") - crawler = HtmlCrawler(crawler_config, mock_client_config) - - mock_html = """ - - """ - mock_soup = BeautifulSoup(mock_html, "html.parser") - - with patch.object(crawler, "crawl") as mock_crawl: - mock_crawl.return_value = mock_soup - crawler.get_last_page() - - expected_url = "https://example.com/news/tech" - mock_crawl.assert_called_once_with(expected_url) - - def test_get_last_page_without_category_support(self, html_crawler): - """Test get_last_page uses default template when categories not supported.""" - # Modify source to not support categories - html_crawler.source.supports_categories = False - - mock_html = """ - - """ - mock_soup = BeautifulSoup(mock_html, "html.parser") - - with patch.object(html_crawler, "crawl") as mock_crawl: - mock_crawl.return_value = mock_soup - html_crawler.get_last_page() - - # Verify the URL was constructed without category replacement - expected_url = 
"https://example.com/news" - mock_crawl.assert_called_once_with(expected_url) - - def test_get_last_page_without_category_in_config( - self, mock_client_config, mock_html_source_config - ): - """Test get_last_page uses default template when no category in config.""" - config = CrawlerConfig(source=mock_html_source_config, category=None) - crawler = HtmlCrawler(config, mock_client_config) - - mock_html = """ - - """ - mock_soup = BeautifulSoup(mock_html, "html.parser") - - with patch.object(crawler, "crawl") as mock_crawl: - mock_crawl.return_value = mock_soup - crawler.get_last_page() - - # Verify the URL was constructed without category replacement - expected_url = "https://example.com/news" - mock_crawl.assert_called_once_with(expected_url) - - def test_get_last_page_with_multiple_numbers_in_href(self, html_crawler): - """Test get_last_page extracts first number when multiple numbers present.""" - mock_html = """ - - """ - mock_soup = BeautifulSoup(mock_html, "html.parser") - - with patch.object(html_crawler, "crawl", return_value=mock_soup): - result = html_crawler.get_last_page() - # Should extract the first number found (2024) - assert result == 2024 - - def test_supports_html_source_kind(self): - """Test that supports method returns True for HTML source kind.""" - assert HtmlCrawler.supports() is SourceKind.HTML - - def test_get_pagination_integration(self, html_crawler): - """Integration test for get_pagination calling get_last_page.""" - mock_html = """ - - """ - mock_soup = BeautifulSoup(mock_html, "html.parser") - - with patch.object(html_crawler, "crawl", return_value=mock_soup): - result = html_crawler.get_pagination() - - assert isinstance(result, PageRange) - assert result.start == 0 - assert result.end == 7 - - def test_get_last_page_with_non_string_href(self, html_crawler): - """Test get_last_page handles non-string href attributes.""" - # Create a mock element with href as a list (AttributeValueList) - mock_html = """ - - """ - mock_soup = BeautifulSoup(mock_html, "html.parser") - - # Modify the href to simulate a non-string type by removing it - pagination_link = mock_soup.select("ul.pagination > li a")[-1] - # Instead of setting href to a list, let's test with missing href - del pagination_link.attrs["href"] - - with patch.object(html_crawler, "crawl", return_value=mock_soup): - result = html_crawler.get_last_page() - assert result == 1 diff --git a/projects/crawler/tests/basango/services/crawler/test_wordpress_crawler.py b/projects/crawler/tests/basango/services/crawler/test_wordpress_crawler.py deleted file mode 100644 index e69c573..0000000 --- a/projects/crawler/tests/basango/services/crawler/test_wordpress_crawler.py +++ /dev/null @@ -1,239 +0,0 @@ -from unittest.mock import Mock, patch - -import pytest -from pydantic import HttpUrl - -from basango.core.config.fetch_config import CrawlerConfig, ClientConfig -from basango.core.config.source_config import ( - WordPressSourceConfig, - HtmlSourceConfig, - SourceSelectors, -) -from basango.domain import SourceKind, PageRange -from basango.services.crawler.wordpress_crawler import WordpressCrawler - - -class TestWordPressCrawler: - """Test suite for WordPressCrawler.""" - - @pytest.fixture - def mock_client_config(self): - return ClientConfig() - - @pytest.fixture - def mock_wordpress_source_config(self): - return WordPressSourceConfig( - source_id="test_wordpress_source", - source_url=HttpUrl("https://example.com/"), - supports_categories=True, - categories=["tech", "news"], - ) - - @pytest.fixture - def 
mock_crawler_config(self, mock_wordpress_source_config): - return CrawlerConfig(source=mock_wordpress_source_config, category="tech") - - @pytest.fixture - def wordpress_crawler(self, mock_crawler_config, mock_client_config): - return WordpressCrawler(mock_crawler_config, mock_client_config) - - @pytest.fixture - def mock_response_with_headers(self): - response = Mock() - response.headers = { - WordpressCrawler.TOTAL_PAGES_HEADER: "5", - WordpressCrawler.TOTAL_POSTS_HEADER: "47", - } - return response - - def test_with_valid_wordpress_source(self, wordpress_crawler): - """Test __init__ with valid WordPress source config.""" - assert wordpress_crawler.source.source_kind == SourceKind.WORDPRESS - assert isinstance(wordpress_crawler.source, WordPressSourceConfig) - - def test_with_invalid_source_kind_raises_error(self, mock_client_config): - """Test __init__ raises ValueError when source kind is not WORDPRESS.""" - html_source = HtmlSourceConfig( - source_id="test_html", - source_url=HttpUrl("https://example.com"), - pagination_template="news", - source_selectors=SourceSelectors(), - ) - config = CrawlerConfig(source=html_source) - - with pytest.raises( - ValueError, match="WordpressCrawler requires a source of kind WORDPRESS" - ): - WordpressCrawler(config, mock_client_config) - - def test_with_no_source_raises_error(self, mock_client_config): - """Test __init__ raises ValueError when source is None.""" - config = CrawlerConfig(source=None) - - with pytest.raises( - ValueError, match="WordpressCrawler requires a source of kind WORDPRESS" - ): - WordpressCrawler(config, mock_client_config) - - def test_get_pagination_returns_valid_page_range( - self, wordpress_crawler, mock_response_with_headers - ): - """Test get_pagination returns correct PageRange from WordPress API headers.""" - with patch.object( - wordpress_crawler.client, "get", return_value=mock_response_with_headers - ): - result = wordpress_crawler.get_pagination() - - assert isinstance(result, PageRange) - assert result.start == 1 - assert result.end == 5 - assert str(result) == "1:5" - - def test_get_pagination_with_default_headers(self, wordpress_crawler): - """Test get_pagination with default headers when WordPress headers are missing.""" - mock_response = Mock() - mock_response.headers = {} # No WordPress headers - - with patch.object(wordpress_crawler.client, "get", return_value=mock_response): - result = wordpress_crawler.get_pagination() - - assert isinstance(result, PageRange) - assert result.start == 1 - assert result.end == 1 # Default when no headers - - def test_get_pagination_makes_correct_api_call(self, wordpress_crawler): - """Test get_pagination makes the correct WordPress API call.""" - mock_response = Mock() - mock_response.headers = { - WordpressCrawler.TOTAL_PAGES_HEADER: "3", - WordpressCrawler.TOTAL_POSTS_HEADER: "25", - } - - with patch.object( - wordpress_crawler.client, "get", return_value=mock_response - ) as mock_get: - wordpress_crawler.get_pagination() - - expected_url = f"{wordpress_crawler.source.source_url}wp-json/wp/v2/posts?_fields=id&per_page=100" - mock_get.assert_called_once_with(expected_url) - - def test_fetch_categories_populates_category_map(self, wordpress_crawler): - """Test _fetch_categories populates the category_map correctly.""" - mock_categories_response = Mock() - mock_categories_response.json.return_value = [ - {"id": 1, "slug": "technology", "count": 15}, - {"id": 2, "slug": "business", "count": 10}, - {"id": 3, "slug": "sports", "count": 8}, - ] - - with patch.object( - 
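
The get_pagination fixtures encode the standard WordPress REST convention: totals live in response headers, not the body. A minimal sketch, assuming the crawler's header constants correspond to the usual `X-WP-TotalPages` and `X-WP-Total` headers (the endpoint and query string are copied from the assertions below; the tuple stands in for `PageRange`):

```python
import httpx


def wp_pagination(source_url: str) -> tuple[int, int]:
    """Illustrative: request one cheap page and read the totals from headers."""
    url = f"{source_url}wp-json/wp/v2/posts?_fields=id&per_page=100"
    response = httpx.get(url)
    # X-WP-Total carries the post count; a missing header defaults to one page
    total_pages = int(response.headers.get("X-WP-TotalPages", "1"))
    return (1, total_pages)  # stands in for PageRange(start=1, end=total_pages)
```
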
wordpress_crawler.client, "get", return_value=mock_categories_response - ): - wordpress_crawler._fetch_categories() - - assert len(wordpress_crawler.category_map) == 3 - assert wordpress_crawler.category_map[1] == "technology" - assert wordpress_crawler.category_map[2] == "business" - assert wordpress_crawler.category_map[3] == "sports" - - def test_fetch_categories_makes_correct_api_call(self, wordpress_crawler): - """Test _fetch_categories makes the correct WordPress API call.""" - mock_response = Mock() - mock_response.json.return_value = [] - - with patch.object( - wordpress_crawler.client, "get", return_value=mock_response - ) as mock_get: - wordpress_crawler._fetch_categories() - - expected_url = f"{wordpress_crawler.source.source_url}wp-json/wp/v2/categories?{WordpressCrawler.CATEGORY_QUERY}" - mock_get.assert_called_once_with(expected_url) - - def test_map_categories_with_populated_category_map(self, wordpress_crawler): - """Test _map_categories returns correct comma-separated string.""" - - # Pre-populate category map - wordpress_crawler.category_map = { - 1: "technology", - 2: "business", - 3: "sports", - 4: "lifestyle", - } - - result = wordpress_crawler._map_categories([2, 1, 4]) - - # Should be sorted by category ID - assert result == "technology,business,lifestyle" - - def test_map_categories_with_empty_category_map_fetches_categories( - self, wordpress_crawler - ): - """Test _map_categories fetches categories when category_map is empty.""" - mock_categories_response = Mock() - mock_categories_response.json.return_value = [ - {"id": 1, "slug": "tech", "count": 15}, - {"id": 2, "slug": "news", "count": 10}, - ] - - wordpress_crawler.category_map = {} - with patch.object( - wordpress_crawler.client, "get", return_value=mock_categories_response - ): - result = wordpress_crawler._map_categories([1, 2]) - - assert result == "tech,news" - assert len(wordpress_crawler.category_map) == 2 - - def test_map_categories_filters_unknown_category_ids(self, wordpress_crawler): - """Test _map_categories filters out unknown category IDs.""" - wordpress_crawler.category_map = {1: "technology", 2: "business"} - - result = wordpress_crawler._map_categories([1, 99, 2, 100]) - - # Should only include known categories - assert result == "technology,business" - - def test_map_categories_with_empty_category_list(self, wordpress_crawler): - """Test _map_categories returns empty string for empty category list.""" - wordpress_crawler.category_map = {1: "tech", 2: "news"} - - result = wordpress_crawler._map_categories([]) - - assert result == "" - - def test_map_categories_sorts_by_category_id(self, wordpress_crawler): - """Test _map_categories sorts categories by ID.""" - wordpress_crawler.category_map = {3: "charlie", 1: "alpha", 2: "beta"} - - result = wordpress_crawler._map_categories([3, 1, 2]) - - # Should be sorted by ID: 1, 2, 3 - assert result == "alpha,beta,charlie" - - def test_supports_wordpress_source_kind(self): - """Test supports method returns True for WordPress source kind.""" - assert WordpressCrawler.supports() is SourceKind.WORDPRESS - - @pytest.mark.parametrize( - "pages,posts,expected_start,expected_end", - [ - ("1", "10", 1, 1), - ("5", "47", 1, 5), - ("10", "100", 1, 10), - ], - ) - def test_get_pagination_with_various_header_values( - self, wordpress_crawler, pages, posts, expected_start, expected_end - ): - """Test get_pagination with various header values.""" - mock_response = Mock() - mock_response.headers = { - WordpressCrawler.TOTAL_PAGES_HEADER: pages, - 
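
Taken together these cases specify `_map_categories` completely: lazily load the id-to-slug map, drop unknown ids, sort by id, join with commas, and return an empty string for empty input. A self-contained sketch of that contract (names are illustrative; the map is static here rather than fetched from `wp-json/wp/v2/categories`):

```python
def map_categories(ids: list[int], category_map: dict[int, str]) -> str:
    """Illustrative: resolve WordPress category ids to a comma-separated slug list."""
    known = sorted(i for i in ids if i in category_map)  # drop unknown ids, sort by id
    return ",".join(category_map[i] for i in known)


category_map = {3: "charlie", 1: "alpha", 2: "beta"}
assert map_categories([3, 1, 2], category_map) == "alpha,beta,charlie"
assert map_categories([1, 99], category_map) == "alpha"
assert map_categories([], category_map) == ""
```
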
WordpressCrawler.TOTAL_POSTS_HEADER: posts, - } - - with patch.object(wordpress_crawler.client, "get", return_value=mock_response): - result = wordpress_crawler.get_pagination() - - assert result.start == expected_start - assert result.end == expected_end diff --git a/projects/crawler/tests/basango/services/test_date_parser.py b/projects/crawler/tests/basango/services/test_date_parser.py deleted file mode 100644 index 9fb7fe6..0000000 --- a/projects/crawler/tests/basango/services/test_date_parser.py +++ /dev/null @@ -1,70 +0,0 @@ -from datetime import datetime, timezone - -import pytest - -from basango.services.date_parser import DateParser - - -@pytest.mark.parametrize( - "date_str, fmt, pattern, replacement, expected", - [ - ( - "2004-02-12T15:19:21", - "%Y-%m-%dT%H:%M:%S", - None, - None, - 1076599161, # 2004-02-12 15:19:21 UTC - ), - ( - "08/10/2024 - 00:00", - "%Y-%m-%d %H:%M", - r"/(\d{2})\/(\d{2})\/(\d{4}) - (\d{2}:\d{2})/", - r"$3-$2-$1 $4", - 1728345600, # 2024-10-08 00:00:00 UTC - ), - ( - "mar 08/10/2024 - 00:00", - "%Y-%m-%d %H:%M", - r"/\w{3} (\d{2})\/(\d{2})\/(\d{4}) - (\d{2}:\d{2})/", - r"$3-$2-$1 $4", - 1728345600, # 2024-10-08 00:00:00 UTC - ), - ( - "Mardi 8 octobre 2024 - 00:00", - "%Y-%m-%d %H:%M", - r"/(\d{1}) (\d{1,2}) (\d{2}) (\d{4}) - (\d{2}:\d{2})/", - r"$4-$3-$2 $5", - 1728345600, # 2024-10-08 00:00:00 UTC - ), - ( - "8.10.2024 00:00", - "%d.%m.%Y %H:%M", - None, - None, - 1728345600, # 2024-10-08 00:00:00 UTC - ), - ], -) -def test_create_timestamp_with_valid_dates( - date_str: str, - fmt: str | None, - pattern: str | None, - replacement: str | None, - expected: int, -) -> None: - dr = DateParser() - result = dr.create_timestamp(date_str, fmt, pattern, replacement) - assert result == expected - - -def test_create_timestamp_with_invalid_date_falls_back_to_midnight_today() -> None: - dr = DateParser() - - # Compute expected midnight (UTC) before invoking the parser to avoid edge cases. 
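
The parametrized date cases use PHP-style `/…/` delimiters and `$n` backreferences, so a Python implementation has to normalize both before calling `re.sub`. A hedged sketch of that normalization plus the midnight-UTC fallback the last test asserts (the production `DateParser` is not shown here; the strptime and timezone choices are inferred from the expected epoch values):

```python
import re
from datetime import datetime, timezone


def create_timestamp(
    date_str: str,
    fmt: str | None = None,
    pattern: str | None = None,
    replacement: str | None = None,
) -> int:
    """Illustrative: normalize a PHP-style pattern, parse as UTC, fall back to midnight today."""
    try:
        if pattern and replacement:
            regex = pattern.strip("/")  # drop the /.../ delimiters
            repl = re.sub(r"\$(\d+)", r"\\\1", replacement)  # $3 -> \3
            date_str = re.sub(regex, repl, date_str)
        parsed = datetime.strptime(date_str, fmt or "%Y-%m-%dT%H:%M:%S")
        return int(parsed.replace(tzinfo=timezone.utc).timestamp())
    except ValueError:
        midnight = datetime.now(timezone.utc).replace(
            hour=0, minute=0, second=0, microsecond=0
        )
        return int(midnight.timestamp())


assert create_timestamp("8.10.2024 00:00", "%d.%m.%Y %H:%M") == 1728345600
```
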
- now = datetime.now(timezone.utc) - expected_midnight = int( - now.replace(hour=0, minute=0, second=0, microsecond=0).timestamp() - ) - - result = dr.create_timestamp("invalid date string", None, None, None) - assert result == expected_midnight diff --git a/projects/crawler/tests/conftest.py b/projects/crawler/tests/conftest.py deleted file mode 100644 index 81c79b8..0000000 --- a/projects/crawler/tests/conftest.py +++ /dev/null @@ -1,9 +0,0 @@ -import os -import sys - - -# Ensure 'src' is on sys.path so `import basango...` works in tests -ROOT = os.path.dirname(os.path.dirname(__file__)) -SRC = os.path.join(ROOT, "src") -if SRC not in sys.path: - sys.path.insert(0, SRC) diff --git a/projects/crawler/uv.lock b/projects/crawler/uv.lock deleted file mode 100644 index da03417..0000000 --- a/projects/crawler/uv.lock +++ /dev/null @@ -1,948 +0,0 @@ -version = 1 -revision = 3 -requires-python = ">=3.13" - -[[package]] -name = "annotated-types" -version = "0.7.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, -] - -[[package]] -name = "anyio" -version = "4.10.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "idna" }, - { name = "sniffio" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f1/b4/636b3b65173d3ce9a38ef5f0522789614e590dab6a8d505340a4efe4c567/anyio-4.10.0.tar.gz", hash = "sha256:3f3fae35c96039744587aa5b8371e7e8e603c0702999535961dd336026973ba6", size = 213252, upload-time = "2025-08-04T08:54:26.451Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6f/12/e5e0282d673bb9746bacfb6e2dba8719989d3660cdb2ea79aee9a9651afb/anyio-4.10.0-py3-none-any.whl", hash = "sha256:60e474ac86736bbfd6f210f7a61218939c318f43f9972497381f1c5e930ed3d1", size = 107213, upload-time = "2025-08-04T08:54:24.882Z" }, -] - -[[package]] -name = "babel" -version = "2.17.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7d/6b/d52e42361e1aa00709585ecc30b3f9684b3ab62530771402248b1b1d6240/babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d", size = 9951852, upload-time = "2025-02-01T15:17:41.026Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" }, -] - -[[package]] -name = "bandit" -version = "1.8.6" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "pyyaml" }, - { name = "rich" }, - { name = "stevedore" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/fb/b5/7eb834e213d6f73aace21938e5e90425c92e5f42abafaf8a6d5d21beed51/bandit-1.8.6.tar.gz", hash = 
"sha256:dbfe9c25fc6961c2078593de55fd19f2559f9e45b99f1272341f5b95dea4e56b", size = 4240271, upload-time = "2025-07-06T03:10:50.9Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/48/ca/ba5f909b40ea12ec542d5d7bdd13ee31c4d65f3beed20211ef81c18fa1f3/bandit-1.8.6-py3-none-any.whl", hash = "sha256:3348e934d736fcdb68b6aa4030487097e23a501adf3e7827b63658df464dddd0", size = 133808, upload-time = "2025-07-06T03:10:49.134Z" }, -] - -[[package]] -name = "basango" -version = "0.1.0" -source = { editable = "." } -dependencies = [ - { name = "beautifulsoup4" }, - { name = "httpx" }, - { name = "markdownify" }, - { name = "pydantic" }, - { name = "pydantic-settings" }, - { name = "pyyaml" }, - { name = "readability-lxml" }, - { name = "rq" }, - { name = "selectolax" }, - { name = "tiktoken" }, - { name = "trafilatura" }, - { name = "typer" }, - { name = "uv-build" }, -] - -[package.dev-dependencies] -dev = [ - { name = "bandit" }, - { name = "pyright" }, - { name = "pytest" }, - { name = "ruff" }, -] - -[package.metadata] -requires-dist = [ - { name = "beautifulsoup4", specifier = ">=4.13.5" }, - { name = "httpx", specifier = ">=0.27.2" }, - { name = "markdownify", specifier = ">=0.13.1" }, - { name = "pydantic", specifier = ">=2.11.7" }, - { name = "pydantic-settings", specifier = ">=2.10.1" }, - { name = "pyyaml", specifier = ">=6.0.2" }, - { name = "readability-lxml", specifier = ">=0.8.1" }, - { name = "rq", specifier = ">=2.5.0" }, - { name = "selectolax", specifier = ">=0.3.20" }, - { name = "tiktoken", specifier = ">=0.12.0" }, - { name = "trafilatura", specifier = ">=1.7.0" }, - { name = "typer", specifier = ">=0.16.1" }, - { name = "uv-build", specifier = ">=0.8.12,<0.9.0" }, -] - -[package.metadata.requires-dev] -dev = [ - { name = "bandit", specifier = ">=1.8.6" }, - { name = "pyright", specifier = ">=1.1.404" }, - { name = "pytest", specifier = ">=8.4.1" }, - { name = "ruff", specifier = ">=0.12.9" }, -] - -[[package]] -name = "beautifulsoup4" -version = "4.13.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "soupsieve" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/85/2e/3e5079847e653b1f6dc647aa24549d68c6addb4c595cc0d902d1b19308ad/beautifulsoup4-4.13.5.tar.gz", hash = "sha256:5e70131382930e7c3de33450a2f54a63d5e4b19386eab43a5b34d594268f3695", size = 622954, upload-time = "2025-08-24T14:06:13.168Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/eb/f4151e0c7377a6e08a38108609ba5cede57986802757848688aeedd1b9e8/beautifulsoup4-4.13.5-py3-none-any.whl", hash = "sha256:642085eaa22233aceadff9c69651bc51e8bf3f874fb6d7104ece2beb24b47c4a", size = 105113, upload-time = "2025-08-24T14:06:14.884Z" }, -] - -[[package]] -name = "certifi" -version = "2025.8.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/dc/67/960ebe6bf230a96cda2e0abcf73af550ec4f090005363542f0765df162e0/certifi-2025.8.3.tar.gz", hash = "sha256:e564105f78ded564e3ae7c923924435e1daa7463faeab5bb932bc53ffae63407", size = 162386, upload-time = "2025-08-03T03:07:47.08Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl", hash = "sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5", size = 161216, upload-time = "2025-08-03T03:07:45.777Z" }, -] - -[[package]] -name = "chardet" -version = "5.2.0" -source = { registry = 
"https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618, upload-time = "2023-08-01T19:23:02.662Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385, upload-time = "2023-08-01T19:23:00.661Z" }, -] - -[[package]] -name = "charset-normalizer" -version = "3.4.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/83/2d/5fd176ceb9b2fc619e63405525573493ca23441330fcdaee6bef9460e924/charset_normalizer-3.4.3.tar.gz", hash = "sha256:6fce4b8500244f6fcb71465d4a4930d132ba9ab8e71a7859e6a5d59851068d14", size = 122371, upload-time = "2025-08-09T07:57:28.46Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/65/ca/2135ac97709b400c7654b4b764daf5c5567c2da45a30cdd20f9eefe2d658/charset_normalizer-3.4.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:14c2a87c65b351109f6abfc424cab3927b3bdece6f706e4d12faaf3d52ee5efe", size = 205326, upload-time = "2025-08-09T07:56:24.721Z" }, - { url = "https://files.pythonhosted.org/packages/71/11/98a04c3c97dd34e49c7d247083af03645ca3730809a5509443f3c37f7c99/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:41d1fc408ff5fdfb910200ec0e74abc40387bccb3252f3f27c0676731df2b2c8", size = 146008, upload-time = "2025-08-09T07:56:26.004Z" }, - { url = "https://files.pythonhosted.org/packages/60/f5/4659a4cb3c4ec146bec80c32d8bb16033752574c20b1252ee842a95d1a1e/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1bb60174149316da1c35fa5233681f7c0f9f514509b8e399ab70fea5f17e45c9", size = 159196, upload-time = "2025-08-09T07:56:27.25Z" }, - { url = "https://files.pythonhosted.org/packages/86/9e/f552f7a00611f168b9a5865a1414179b2c6de8235a4fa40189f6f79a1753/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:30d006f98569de3459c2fc1f2acde170b7b2bd265dc1943e87e1a4efe1b67c31", size = 156819, upload-time = "2025-08-09T07:56:28.515Z" }, - { url = "https://files.pythonhosted.org/packages/7e/95/42aa2156235cbc8fa61208aded06ef46111c4d3f0de233107b3f38631803/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:416175faf02e4b0810f1f38bcb54682878a4af94059a1cd63b8747244420801f", size = 151350, upload-time = "2025-08-09T07:56:29.716Z" }, - { url = "https://files.pythonhosted.org/packages/c2/a9/3865b02c56f300a6f94fc631ef54f0a8a29da74fb45a773dfd3dcd380af7/charset_normalizer-3.4.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6aab0f181c486f973bc7262a97f5aca3ee7e1437011ef0c2ec04b5a11d16c927", size = 148644, upload-time = "2025-08-09T07:56:30.984Z" }, - { url = "https://files.pythonhosted.org/packages/77/d9/cbcf1a2a5c7d7856f11e7ac2d782aec12bdfea60d104e60e0aa1c97849dc/charset_normalizer-3.4.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:fdabf8315679312cfa71302f9bd509ded4f2f263fb5b765cf1433b39106c3cc9", size = 160468, upload-time = "2025-08-09T07:56:32.252Z" }, - { url = 
"https://files.pythonhosted.org/packages/f6/42/6f45efee8697b89fda4d50580f292b8f7f9306cb2971d4b53f8914e4d890/charset_normalizer-3.4.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:bd28b817ea8c70215401f657edef3a8aa83c29d447fb0b622c35403780ba11d5", size = 158187, upload-time = "2025-08-09T07:56:33.481Z" }, - { url = "https://files.pythonhosted.org/packages/70/99/f1c3bdcfaa9c45b3ce96f70b14f070411366fa19549c1d4832c935d8e2c3/charset_normalizer-3.4.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:18343b2d246dc6761a249ba1fb13f9ee9a2bcd95decc767319506056ea4ad4dc", size = 152699, upload-time = "2025-08-09T07:56:34.739Z" }, - { url = "https://files.pythonhosted.org/packages/a3/ad/b0081f2f99a4b194bcbb1934ef3b12aa4d9702ced80a37026b7607c72e58/charset_normalizer-3.4.3-cp313-cp313-win32.whl", hash = "sha256:6fb70de56f1859a3f71261cbe41005f56a7842cc348d3aeb26237560bfa5e0ce", size = 99580, upload-time = "2025-08-09T07:56:35.981Z" }, - { url = "https://files.pythonhosted.org/packages/9a/8f/ae790790c7b64f925e5c953b924aaa42a243fb778fed9e41f147b2a5715a/charset_normalizer-3.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:cf1ebb7d78e1ad8ec2a8c4732c7be2e736f6e5123a4146c5b89c9d1f585f8cef", size = 107366, upload-time = "2025-08-09T07:56:37.339Z" }, - { url = "https://files.pythonhosted.org/packages/8e/91/b5a06ad970ddc7a0e513112d40113e834638f4ca1120eb727a249fb2715e/charset_normalizer-3.4.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3cd35b7e8aedeb9e34c41385fda4f73ba609e561faedfae0a9e75e44ac558a15", size = 204342, upload-time = "2025-08-09T07:56:38.687Z" }, - { url = "https://files.pythonhosted.org/packages/ce/ec/1edc30a377f0a02689342f214455c3f6c2fbedd896a1d2f856c002fc3062/charset_normalizer-3.4.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b89bc04de1d83006373429975f8ef9e7932534b8cc9ca582e4db7d20d91816db", size = 145995, upload-time = "2025-08-09T07:56:40.048Z" }, - { url = "https://files.pythonhosted.org/packages/17/e5/5e67ab85e6d22b04641acb5399c8684f4d37caf7558a53859f0283a650e9/charset_normalizer-3.4.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2001a39612b241dae17b4687898843f254f8748b796a2e16f1051a17078d991d", size = 158640, upload-time = "2025-08-09T07:56:41.311Z" }, - { url = "https://files.pythonhosted.org/packages/f1/e5/38421987f6c697ee3722981289d554957c4be652f963d71c5e46a262e135/charset_normalizer-3.4.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8dcfc373f888e4fb39a7bc57e93e3b845e7f462dacc008d9749568b1c4ece096", size = 156636, upload-time = "2025-08-09T07:56:43.195Z" }, - { url = "https://files.pythonhosted.org/packages/a0/e4/5a075de8daa3ec0745a9a3b54467e0c2967daaaf2cec04c845f73493e9a1/charset_normalizer-3.4.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18b97b8404387b96cdbd30ad660f6407799126d26a39ca65729162fd810a99aa", size = 150939, upload-time = "2025-08-09T07:56:44.819Z" }, - { url = "https://files.pythonhosted.org/packages/02/f7/3611b32318b30974131db62b4043f335861d4d9b49adc6d57c1149cc49d4/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ccf600859c183d70eb47e05a44cd80a4ce77394d1ac0f79dbd2dd90a69a3a049", size = 148580, upload-time = "2025-08-09T07:56:46.684Z" }, - { url = 
"https://files.pythonhosted.org/packages/7e/61/19b36f4bd67f2793ab6a99b979b4e4f3d8fc754cbdffb805335df4337126/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:53cd68b185d98dde4ad8990e56a58dea83a4162161b1ea9272e5c9182ce415e0", size = 159870, upload-time = "2025-08-09T07:56:47.941Z" }, - { url = "https://files.pythonhosted.org/packages/06/57/84722eefdd338c04cf3030ada66889298eaedf3e7a30a624201e0cbe424a/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:30a96e1e1f865f78b030d65241c1ee850cdf422d869e9028e2fc1d5e4db73b92", size = 157797, upload-time = "2025-08-09T07:56:49.756Z" }, - { url = "https://files.pythonhosted.org/packages/72/2a/aff5dd112b2f14bcc3462c312dce5445806bfc8ab3a7328555da95330e4b/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d716a916938e03231e86e43782ca7878fb602a125a91e7acb8b5112e2e96ac16", size = 152224, upload-time = "2025-08-09T07:56:51.369Z" }, - { url = "https://files.pythonhosted.org/packages/b7/8c/9839225320046ed279c6e839d51f028342eb77c91c89b8ef2549f951f3ec/charset_normalizer-3.4.3-cp314-cp314-win32.whl", hash = "sha256:c6dbd0ccdda3a2ba7c2ecd9d77b37f3b5831687d8dc1b6ca5f56a4880cc7b7ce", size = 100086, upload-time = "2025-08-09T07:56:52.722Z" }, - { url = "https://files.pythonhosted.org/packages/ee/7a/36fbcf646e41f710ce0a563c1c9a343c6edf9be80786edeb15b6f62e17db/charset_normalizer-3.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:73dc19b562516fc9bcf6e5d6e596df0b4eb98d87e4f79f3ae71840e6ed21361c", size = 107400, upload-time = "2025-08-09T07:56:55.172Z" }, - { url = "https://files.pythonhosted.org/packages/8a/1f/f041989e93b001bc4e44bb1669ccdcf54d3f00e628229a85b08d330615c5/charset_normalizer-3.4.3-py3-none-any.whl", hash = "sha256:ce571ab16d890d23b5c278547ba694193a45011ff86a9162a71307ed9f86759a", size = 53175, upload-time = "2025-08-09T07:57:26.864Z" }, -] - -[[package]] -name = "click" -version = "8.2.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/60/6c/8ca2efa64cf75a977a0d7fac081354553ebe483345c734fb6b6515d96bbc/click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202", size = 286342, upload-time = "2025-05-20T23:19:49.832Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215, upload-time = "2025-05-20T23:19:47.796Z" }, -] - -[[package]] -name = "colorama" -version = "0.4.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, -] - -[[package]] -name = "courlan" -version = "1.3.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "babel" }, - { name = "tld" }, - { name = "urllib3" }, -] 
-sdist = { url = "https://files.pythonhosted.org/packages/6f/54/6d6ceeff4bed42e7a10d6064d35ee43a810e7b3e8beb4abeae8cff4713ae/courlan-1.3.2.tar.gz", hash = "sha256:0b66f4db3a9c39a6e22dd247c72cfaa57d68ea660e94bb2c84ec7db8712af190", size = 206382, upload-time = "2024-10-29T16:40:20.994Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8e/ca/6a667ccbe649856dcd3458bab80b016681b274399d6211187c6ab969fc50/courlan-1.3.2-py3-none-any.whl", hash = "sha256:d0dab52cf5b5b1000ee2839fbc2837e93b2514d3cb5bb61ae158a55b7a04c6be", size = 33848, upload-time = "2024-10-29T16:40:18.325Z" }, -] - -[[package]] -name = "croniter" -version = "6.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "python-dateutil" }, - { name = "pytz" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ad/2f/44d1ae153a0e27be56be43465e5cb39b9650c781e001e7864389deb25090/croniter-6.0.0.tar.gz", hash = "sha256:37c504b313956114a983ece2c2b07790b1f1094fe9d81cc94739214748255577", size = 64481, upload-time = "2024-12-17T17:17:47.32Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/07/4b/290b4c3efd6417a8b0c284896de19b1d5855e6dbdb97d2a35e68fa42de85/croniter-6.0.0-py2.py3-none-any.whl", hash = "sha256:2f878c3856f17896979b2a4379ba1f09c83e374931ea15cc835c5dd2eee9b368", size = 25468, upload-time = "2024-12-17T17:17:45.359Z" }, -] - -[[package]] -name = "cssselect" -version = "1.3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/72/0a/c3ea9573b1dc2e151abfe88c7fe0c26d1892fe6ed02d0cdb30f0d57029d5/cssselect-1.3.0.tar.gz", hash = "sha256:57f8a99424cfab289a1b6a816a43075a4b00948c86b4dcf3ef4ee7e15f7ab0c7", size = 42870, upload-time = "2025-03-10T09:30:29.638Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ee/58/257350f7db99b4ae12b614a36256d9cc870d71d9e451e79c2dc3b23d7c3c/cssselect-1.3.0-py3-none-any.whl", hash = "sha256:56d1bf3e198080cc1667e137bc51de9cadfca259f03c2d4e09037b3e01e30f0d", size = 18786, upload-time = "2025-03-10T09:30:28.048Z" }, -] - -[[package]] -name = "dateparser" -version = "1.2.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "python-dateutil" }, - { name = "pytz" }, - { name = "regex" }, - { name = "tzlocal" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a9/30/064144f0df1749e7bb5faaa7f52b007d7c2d08ec08fed8411aba87207f68/dateparser-1.2.2.tar.gz", hash = "sha256:986316f17cb8cdc23ea8ce563027c5ef12fc725b6fb1d137c14ca08777c5ecf7", size = 329840, upload-time = "2025-06-26T09:29:23.211Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/87/22/f020c047ae1346613db9322638186468238bcfa8849b4668a22b97faad65/dateparser-1.2.2-py3-none-any.whl", hash = "sha256:5a5d7211a09013499867547023a2a0c91d5a27d15dd4dbcea676ea9fe66f2482", size = 315453, upload-time = "2025-06-26T09:29:21.412Z" }, -] - -[[package]] -name = "h11" -version = "0.16.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = 
"2025-04-24T03:35:24.344Z" }, -] - -[[package]] -name = "htmldate" -version = "1.9.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "charset-normalizer" }, - { name = "dateparser" }, - { name = "lxml" }, - { name = "python-dateutil" }, - { name = "urllib3" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a5/26/aaae4cab984f0b7dd0f5f1b823fa2ed2fd4a2bb50acd5bd2f0d217562678/htmldate-1.9.3.tar.gz", hash = "sha256:ac0caf4628c3ded4042011e2d60dc68dfb314c77b106587dd307a80d77e708e9", size = 44913, upload-time = "2024-12-30T12:52:35.206Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/05/49/8872130016209c20436ce0c1067de1cf630755d0443d068a5bc17fa95015/htmldate-1.9.3-py3-none-any.whl", hash = "sha256:3fadc422cf3c10a5cdb5e1b914daf37ec7270400a80a1b37e2673ff84faaaff8", size = 31565, upload-time = "2024-12-30T12:52:32.145Z" }, -] - -[[package]] -name = "httpcore" -version = "1.0.9" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "h11" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, -] - -[[package]] -name = "httpx" -version = "0.28.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "anyio" }, - { name = "certifi" }, - { name = "httpcore" }, - { name = "idna" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, -] - -[[package]] -name = "idna" -version = "3.10" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, -] - -[[package]] -name = "iniconfig" -version = "2.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload-time = "2025-03-19T20:09:59.721Z" } -wheels = [ - 
{ url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, -] - -[[package]] -name = "justext" -version = "3.0.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "lxml", extra = ["html-clean"] }, -] -sdist = { url = "https://files.pythonhosted.org/packages/49/f3/45890c1b314f0d04e19c1c83d534e611513150939a7cf039664d9ab1e649/justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05", size = 828521, upload-time = "2025-02-25T20:21:49.934Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f2/ac/52f4e86d1924a7fc05af3aeb34488570eccc39b4af90530dd6acecdf16b5/justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7", size = 837940, upload-time = "2025-02-25T20:21:44.179Z" }, -] - -[[package]] -name = "lxml" -version = "5.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/76/3d/14e82fc7c8fb1b7761f7e748fd47e2ec8276d137b6acfe5a4bb73853e08f/lxml-5.4.0.tar.gz", hash = "sha256:d12832e1dbea4be280b22fd0ea7c9b87f0d8fc51ba06e92dc62d52f804f78ebd", size = 3679479, upload-time = "2025-04-23T01:50:29.322Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/87/cb/2ba1e9dd953415f58548506fa5549a7f373ae55e80c61c9041b7fd09a38a/lxml-5.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:773e27b62920199c6197130632c18fb7ead3257fce1ffb7d286912e56ddb79e0", size = 8110086, upload-time = "2025-04-23T01:46:52.218Z" }, - { url = "https://files.pythonhosted.org/packages/b5/3e/6602a4dca3ae344e8609914d6ab22e52ce42e3e1638c10967568c5c1450d/lxml-5.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ce9c671845de9699904b1e9df95acfe8dfc183f2310f163cdaa91a3535af95de", size = 4404613, upload-time = "2025-04-23T01:46:55.281Z" }, - { url = "https://files.pythonhosted.org/packages/4c/72/bf00988477d3bb452bef9436e45aeea82bb40cdfb4684b83c967c53909c7/lxml-5.4.0-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9454b8d8200ec99a224df8854786262b1bd6461f4280064c807303c642c05e76", size = 5012008, upload-time = "2025-04-23T01:46:57.817Z" }, - { url = "https://files.pythonhosted.org/packages/92/1f/93e42d93e9e7a44b2d3354c462cd784dbaaf350f7976b5d7c3f85d68d1b1/lxml-5.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cccd007d5c95279e529c146d095f1d39ac05139de26c098166c4beb9374b0f4d", size = 4760915, upload-time = "2025-04-23T01:47:00.745Z" }, - { url = "https://files.pythonhosted.org/packages/45/0b/363009390d0b461cf9976a499e83b68f792e4c32ecef092f3f9ef9c4ba54/lxml-5.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0fce1294a0497edb034cb416ad3e77ecc89b313cff7adbee5334e4dc0d11f422", size = 5283890, upload-time = "2025-04-23T01:47:04.702Z" }, - { url = "https://files.pythonhosted.org/packages/19/dc/6056c332f9378ab476c88e301e6549a0454dbee8f0ae16847414f0eccb74/lxml-5.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:24974f774f3a78ac12b95e3a20ef0931795ff04dbb16db81a90c37f589819551", size = 4812644, upload-time = "2025-04-23T01:47:07.833Z" }, - { url = 
"https://files.pythonhosted.org/packages/ee/8a/f8c66bbb23ecb9048a46a5ef9b495fd23f7543df642dabeebcb2eeb66592/lxml-5.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:497cab4d8254c2a90bf988f162ace2ddbfdd806fce3bda3f581b9d24c852e03c", size = 4921817, upload-time = "2025-04-23T01:47:10.317Z" }, - { url = "https://files.pythonhosted.org/packages/04/57/2e537083c3f381f83d05d9b176f0d838a9e8961f7ed8ddce3f0217179ce3/lxml-5.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:e794f698ae4c5084414efea0f5cc9f4ac562ec02d66e1484ff822ef97c2cadff", size = 4753916, upload-time = "2025-04-23T01:47:12.823Z" }, - { url = "https://files.pythonhosted.org/packages/d8/80/ea8c4072109a350848f1157ce83ccd9439601274035cd045ac31f47f3417/lxml-5.4.0-cp313-cp313-manylinux_2_28_ppc64le.whl", hash = "sha256:2c62891b1ea3094bb12097822b3d44b93fc6c325f2043c4d2736a8ff09e65f60", size = 5289274, upload-time = "2025-04-23T01:47:15.916Z" }, - { url = "https://files.pythonhosted.org/packages/b3/47/c4be287c48cdc304483457878a3f22999098b9a95f455e3c4bda7ec7fc72/lxml-5.4.0-cp313-cp313-manylinux_2_28_s390x.whl", hash = "sha256:142accb3e4d1edae4b392bd165a9abdee8a3c432a2cca193df995bc3886249c8", size = 4874757, upload-time = "2025-04-23T01:47:19.793Z" }, - { url = "https://files.pythonhosted.org/packages/2f/04/6ef935dc74e729932e39478e44d8cfe6a83550552eaa072b7c05f6f22488/lxml-5.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1a42b3a19346e5601d1b8296ff6ef3d76038058f311902edd574461e9c036982", size = 4947028, upload-time = "2025-04-23T01:47:22.401Z" }, - { url = "https://files.pythonhosted.org/packages/cb/f9/c33fc8daa373ef8a7daddb53175289024512b6619bc9de36d77dca3df44b/lxml-5.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4291d3c409a17febf817259cb37bc62cb7eb398bcc95c1356947e2871911ae61", size = 4834487, upload-time = "2025-04-23T01:47:25.513Z" }, - { url = "https://files.pythonhosted.org/packages/8d/30/fc92bb595bcb878311e01b418b57d13900f84c2b94f6eca9e5073ea756e6/lxml-5.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4f5322cf38fe0e21c2d73901abf68e6329dc02a4994e483adbcf92b568a09a54", size = 5381688, upload-time = "2025-04-23T01:47:28.454Z" }, - { url = "https://files.pythonhosted.org/packages/43/d1/3ba7bd978ce28bba8e3da2c2e9d5ae3f8f521ad3f0ca6ea4788d086ba00d/lxml-5.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:0be91891bdb06ebe65122aa6bf3fc94489960cf7e03033c6f83a90863b23c58b", size = 5242043, upload-time = "2025-04-23T01:47:31.208Z" }, - { url = "https://files.pythonhosted.org/packages/ee/cd/95fa2201041a610c4d08ddaf31d43b98ecc4b1d74b1e7245b1abdab443cb/lxml-5.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:15a665ad90054a3d4f397bc40f73948d48e36e4c09f9bcffc7d90c87410e478a", size = 5021569, upload-time = "2025-04-23T01:47:33.805Z" }, - { url = "https://files.pythonhosted.org/packages/2d/a6/31da006fead660b9512d08d23d31e93ad3477dd47cc42e3285f143443176/lxml-5.4.0-cp313-cp313-win32.whl", hash = "sha256:d5663bc1b471c79f5c833cffbc9b87d7bf13f87e055a5c86c363ccd2348d7e82", size = 3485270, upload-time = "2025-04-23T01:47:36.133Z" }, - { url = "https://files.pythonhosted.org/packages/fc/14/c115516c62a7d2499781d2d3d7215218c0731b2c940753bf9f9b7b73924d/lxml-5.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:bcb7a1096b4b6b24ce1ac24d4942ad98f983cd3810f9711bcd0293f43a9d8b9f", size = 3814606, upload-time = "2025-04-23T01:47:39.028Z" }, -] - -[package.optional-dependencies] -html-clean = [ - { name = "lxml-html-clean" }, -] - -[[package]] -name = "lxml-html-clean" -version = "0.4.2" 
-source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "lxml" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/79/b6/466e71db127950fb8d172026a8f0a9f0dc6f64c8e78e2ca79f252e5790b8/lxml_html_clean-0.4.2.tar.gz", hash = "sha256:91291e7b5db95430abf461bc53440964d58e06cc468950f9e47db64976cebcb3", size = 21622, upload-time = "2025-04-09T11:33:59.432Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4e/0b/942cb7278d6caad79343ad2ddd636ed204a47909b969d19114a3097f5aa3/lxml_html_clean-0.4.2-py3-none-any.whl", hash = "sha256:74ccfba277adcfea87a1e9294f47dd86b05d65b4da7c5b07966e3d5f3be8a505", size = 14184, upload-time = "2025-04-09T11:33:57.988Z" }, -] - -[[package]] -name = "markdown-it-py" -version = "4.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mdurl" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, -] - -[[package]] -name = "markdownify" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "beautifulsoup4" }, - { name = "six" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/83/1b/6f2697b51eaca81f08852fd2734745af15718fea10222a1d40f8a239c4ea/markdownify-1.2.0.tar.gz", hash = "sha256:f6c367c54eb24ee953921804dfe6d6575c5e5b42c643955e7242034435de634c", size = 18771, upload-time = "2025-08-09T17:44:15.302Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6a/e2/7af643acb4cae0741dffffaa7f3f7c9e7ab4046724543ba1777c401d821c/markdownify-1.2.0-py3-none-any.whl", hash = "sha256:48e150a1c4993d4d50f282f725c0111bd9eb25645d41fa2f543708fd44161351", size = 15561, upload-time = "2025-08-09T17:44:14.074Z" }, -] - -[[package]] -name = "mdurl" -version = "0.1.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, -] - -[[package]] -name = "nodeenv" -version = "1.9.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = 
"sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, -] - -[[package]] -name = "packaging" -version = "25.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload-time = "2025-04-19T11:48:59.673Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, -] - -[[package]] -name = "pbr" -version = "7.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "setuptools" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/80/88/baf6b45d064271f19fefac7def6a030a893f912f430de0024dd595ced61f/pbr-7.0.0.tar.gz", hash = "sha256:cf4127298723dafbce3afd13775ccf3885be5d3c8435751b867f9a6a10b71a39", size = 129146, upload-time = "2025-08-13T09:16:41.654Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d6/98/120c3e21bf3fc0ef397a3906465ee9f5c76996c52811e65455eadc12d68a/pbr-7.0.0-py2.py3-none-any.whl", hash = "sha256:b447e63a2bc04fd975fc0480b8d5ebf979179e2c0ae203bf1eff9ea20073bc38", size = 125109, upload-time = "2025-08-13T09:16:40.269Z" }, -] - -[[package]] -name = "pluggy" -version = "1.6.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, -] - -[[package]] -name = "pydantic" -version = "2.11.7" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "annotated-types" }, - { name = "pydantic-core" }, - { name = "typing-extensions" }, - { name = "typing-inspection" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/00/dd/4325abf92c39ba8623b5af936ddb36ffcfe0beae70405d456ab1fb2f5b8c/pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db", size = 788350, upload-time = "2025-06-14T08:33:17.137Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" }, -] - -[[package]] -name = "pydantic-core" -version = "2.33.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = 
"2025-04-23T18:33:52.104Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/46/8c/99040727b41f56616573a28771b1bfa08a3d3fe74d3d513f01251f79f172/pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f", size = 2015688, upload-time = "2025-04-23T18:31:53.175Z" }, - { url = "https://files.pythonhosted.org/packages/3a/cc/5999d1eb705a6cefc31f0b4a90e9f7fc400539b1a1030529700cc1b51838/pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6", size = 1844808, upload-time = "2025-04-23T18:31:54.79Z" }, - { url = "https://files.pythonhosted.org/packages/6f/5e/a0a7b8885c98889a18b6e376f344da1ef323d270b44edf8174d6bce4d622/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef", size = 1885580, upload-time = "2025-04-23T18:31:57.393Z" }, - { url = "https://files.pythonhosted.org/packages/3b/2a/953581f343c7d11a304581156618c3f592435523dd9d79865903272c256a/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a", size = 1973859, upload-time = "2025-04-23T18:31:59.065Z" }, - { url = "https://files.pythonhosted.org/packages/e6/55/f1a813904771c03a3f97f676c62cca0c0a4138654107c1b61f19c644868b/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916", size = 2120810, upload-time = "2025-04-23T18:32:00.78Z" }, - { url = "https://files.pythonhosted.org/packages/aa/c3/053389835a996e18853ba107a63caae0b9deb4a276c6b472931ea9ae6e48/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a", size = 2676498, upload-time = "2025-04-23T18:32:02.418Z" }, - { url = "https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d", size = 2000611, upload-time = "2025-04-23T18:32:04.152Z" }, - { url = "https://files.pythonhosted.org/packages/59/a7/63ef2fed1837d1121a894d0ce88439fe3e3b3e48c7543b2a4479eb99c2bd/pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56", size = 2107924, upload-time = "2025-04-23T18:32:06.129Z" }, - { url = "https://files.pythonhosted.org/packages/04/8f/2551964ef045669801675f1cfc3b0d74147f4901c3ffa42be2ddb1f0efc4/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5", size = 2063196, upload-time = "2025-04-23T18:32:08.178Z" }, - { url = "https://files.pythonhosted.org/packages/26/bd/d9602777e77fc6dbb0c7db9ad356e9a985825547dce5ad1d30ee04903918/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e", size = 2236389, upload-time = "2025-04-23T18:32:10.242Z" }, - { url = "https://files.pythonhosted.org/packages/42/db/0e950daa7e2230423ab342ae918a794964b053bec24ba8af013fc7c94846/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = 
"sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162", size = 2239223, upload-time = "2025-04-23T18:32:12.382Z" }, - { url = "https://files.pythonhosted.org/packages/58/4d/4f937099c545a8a17eb52cb67fe0447fd9a373b348ccfa9a87f141eeb00f/pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849", size = 1900473, upload-time = "2025-04-23T18:32:14.034Z" }, - { url = "https://files.pythonhosted.org/packages/a0/75/4a0a9bac998d78d889def5e4ef2b065acba8cae8c93696906c3a91f310ca/pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9", size = 1955269, upload-time = "2025-04-23T18:32:15.783Z" }, - { url = "https://files.pythonhosted.org/packages/f9/86/1beda0576969592f1497b4ce8e7bc8cbdf614c352426271b1b10d5f0aa64/pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9", size = 1893921, upload-time = "2025-04-23T18:32:18.473Z" }, - { url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" }, - { url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" }, - { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" }, -] - -[[package]] -name = "pydantic-settings" -version = "2.10.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pydantic" }, - { name = "python-dotenv" }, - { name = "typing-inspection" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/68/85/1ea668bbab3c50071ca613c6ab30047fb36ab0da1b92fa8f17bbc38fd36c/pydantic_settings-2.10.1.tar.gz", hash = "sha256:06f0062169818d0f5524420a360d632d5857b83cffd4d42fe29597807a1614ee", size = 172583, upload-time = "2025-06-24T13:26:46.841Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/58/f0/427018098906416f580e3cf1366d3b1abfb408a0652e9f31600c24a1903c/pydantic_settings-2.10.1-py3-none-any.whl", hash = "sha256:a60952460b99cf661dc25c29c0ef171721f98bfcb52ef8d9ea4c943d7c8cc796", size = 45235, upload-time = "2025-06-24T13:26:45.485Z" }, -] - -[[package]] -name = "pygments" -version = "2.19.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = 
"2025-06-21T13:39:07.939Z" }, -] - -[[package]] -name = "pyright" -version = "1.1.404" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "nodeenv" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e2/6e/026be64c43af681d5632722acd100b06d3d39f383ec382ff50a71a6d5bce/pyright-1.1.404.tar.gz", hash = "sha256:455e881a558ca6be9ecca0b30ce08aa78343ecc031d37a198ffa9a7a1abeb63e", size = 4065679, upload-time = "2025-08-20T18:46:14.029Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/84/30/89aa7f7d7a875bbb9a577d4b1dc5a3e404e3d2ae2657354808e905e358e0/pyright-1.1.404-py3-none-any.whl", hash = "sha256:c7b7ff1fdb7219c643079e4c3e7d4125f0dafcc19d253b47e898d130ea426419", size = 5902951, upload-time = "2025-08-20T18:46:12.096Z" }, -] - -[[package]] -name = "pytest" -version = "8.4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "iniconfig" }, - { name = "packaging" }, - { name = "pluggy" }, - { name = "pygments" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/08/ba/45911d754e8eba3d5a841a5ce61a65a685ff1798421ac054f85aa8747dfb/pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c", size = 1517714, upload-time = "2025-06-18T05:48:06.109Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474, upload-time = "2025-06-18T05:48:03.955Z" }, -] - -[[package]] -name = "python-dateutil" -version = "2.9.0.post0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "six" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, -] - -[[package]] -name = "python-dotenv" -version = "1.1.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f6/b0/4bc07ccd3572a2f9df7e6782f52b0c6c90dcbb803ac4a167702d7d0dfe1e/python_dotenv-1.1.1.tar.gz", hash = "sha256:a8a6399716257f45be6a007360200409fce5cda2661e3dec71d23dc15f6189ab", size = 41978, upload-time = "2025-06-24T04:21:07.341Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5f/ed/539768cf28c661b5b068d66d96a2f155c4971a5d55684a514c1a0e0dec2f/python_dotenv-1.1.1-py3-none-any.whl", hash = "sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc", size = 20556, upload-time = "2025-06-24T04:21:06.073Z" }, -] - -[[package]] -name = "pytz" -version = "2025.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = 
"2025-03-25T02:25:00.538Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, -] - -[[package]] -name = "pyyaml" -version = "6.0.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631, upload-time = "2024-08-06T20:33:50.674Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309, upload-time = "2024-08-06T20:32:43.4Z" }, - { url = "https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679, upload-time = "2024-08-06T20:32:44.801Z" }, - { url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428, upload-time = "2024-08-06T20:32:46.432Z" }, - { url = "https://files.pythonhosted.org/packages/a3/69/864fbe19e6c18ea3cc196cbe5d392175b4cf3d5d0ac1403ec3f2d237ebb5/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484", size = 763361, upload-time = "2024-08-06T20:32:51.188Z" }, - { url = "https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5", size = 759523, upload-time = "2024-08-06T20:32:53.019Z" }, - { url = "https://files.pythonhosted.org/packages/2b/b2/e3234f59ba06559c6ff63c4e10baea10e5e7df868092bf9ab40e5b9c56b6/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc", size = 726660, upload-time = "2024-08-06T20:32:54.708Z" }, - { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597, upload-time = "2024-08-06T20:32:56.985Z" }, - { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527, upload-time = "2024-08-06T20:33:03.001Z" }, - { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = 
"2024-08-06T20:33:04.33Z" }, -] - -[[package]] -name = "readability-lxml" -version = "0.8.4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "chardet" }, - { name = "cssselect" }, - { name = "lxml", extra = ["html-clean"] }, -] -sdist = { url = "https://files.pythonhosted.org/packages/55/3e/dc87d97532ddad58af786ec89c7036182e352574c1cba37bf2bf783d2b15/readability_lxml-0.8.4.1.tar.gz", hash = "sha256:9d2924f5942dd7f37fb4da353263b22a3e877ccf922d0e45e348e4177b035a53", size = 22874, upload-time = "2025-05-03T21:11:45.493Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/75/2cc58965097e351415af420be81c4665cf80da52a17ef43c01ffbe2caf91/readability_lxml-0.8.4.1-py3-none-any.whl", hash = "sha256:874c0cea22c3bf2b78c7f8df831bfaad3c0a89b7301d45a188db581652b4b465", size = 19912, upload-time = "2025-05-03T21:11:43.993Z" }, -] - -[[package]] -name = "redis" -version = "6.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0d/d6/e8b92798a5bd67d659d51a18170e91c16ac3b59738d91894651ee255ed49/redis-6.4.0.tar.gz", hash = "sha256:b01bc7282b8444e28ec36b261df5375183bb47a07eb9c603f284e89cbc5ef010", size = 4647399, upload-time = "2025-08-07T08:10:11.441Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e8/02/89e2ed7e85db6c93dfa9e8f691c5087df4e3551ab39081a4d7c6d1f90e05/redis-6.4.0-py3-none-any.whl", hash = "sha256:f0544fa9604264e9464cdf4814e7d4830f74b165d52f2a330a760a88dd248b7f", size = 279847, upload-time = "2025-08-07T08:10:09.84Z" }, -] - -[[package]] -name = "regex" -version = "2025.9.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b2/5a/4c63457fbcaf19d138d72b2e9b39405954f98c0349b31c601bfcb151582c/regex-2025.9.1.tar.gz", hash = "sha256:88ac07b38d20b54d79e704e38aa3bd2c0f8027432164226bdee201a1c0c9c9ff", size = 400852, upload-time = "2025-09-01T22:10:10.479Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/98/25/b2959ce90c6138c5142fe5264ee1f9b71a0c502ca4c7959302a749407c79/regex-2025.9.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:bc6834727d1b98d710a63e6c823edf6ffbf5792eba35d3fa119531349d4142ef", size = 485932, upload-time = "2025-09-01T22:08:57.913Z" }, - { url = "https://files.pythonhosted.org/packages/49/2e/6507a2a85f3f2be6643438b7bd976e67ad73223692d6988eb1ff444106d3/regex-2025.9.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c3dc05b6d579875719bccc5f3037b4dc80433d64e94681a0061845bd8863c025", size = 289568, upload-time = "2025-09-01T22:08:59.258Z" }, - { url = "https://files.pythonhosted.org/packages/c7/d8/de4a4b57215d99868f1640e062a7907e185ec7476b4b689e2345487c1ff4/regex-2025.9.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:22213527df4c985ec4a729b055a8306272d41d2f45908d7bacb79be0fa7a75ad", size = 286984, upload-time = "2025-09-01T22:09:00.835Z" }, - { url = "https://files.pythonhosted.org/packages/03/15/e8cb403403a57ed316e80661db0e54d7aa2efcd85cb6156f33cc18746922/regex-2025.9.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8e3f6e3c5a5a1adc3f7ea1b5aec89abfc2f4fbfba55dafb4343cd1d084f715b2", size = 797514, upload-time = "2025-09-01T22:09:02.538Z" }, - { url = "https://files.pythonhosted.org/packages/e4/26/2446f2b9585fed61faaa7e2bbce3aca7dd8df6554c32addee4c4caecf24a/regex-2025.9.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:bcb89c02a0d6c2bec9b0bb2d8c78782699afe8434493bfa6b4021cc51503f249", size = 862586, upload-time = "2025-09-01T22:09:04.322Z" }, - { url = "https://files.pythonhosted.org/packages/fd/b8/82ffbe9c0992c31bbe6ae1c4b4e21269a5df2559102b90543c9b56724c3c/regex-2025.9.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b0e2f95413eb0c651cd1516a670036315b91b71767af83bc8525350d4375ccba", size = 910815, upload-time = "2025-09-01T22:09:05.978Z" }, - { url = "https://files.pythonhosted.org/packages/2f/d8/7303ea38911759c1ee30cc5bc623ee85d3196b733c51fd6703c34290a8d9/regex-2025.9.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:09a41dc039e1c97d3c2ed3e26523f748e58c4de3ea7a31f95e1cf9ff973fff5a", size = 802042, upload-time = "2025-09-01T22:09:07.865Z" }, - { url = "https://files.pythonhosted.org/packages/fc/0e/6ad51a55ed4b5af512bb3299a05d33309bda1c1d1e1808fa869a0bed31bc/regex-2025.9.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4f0b4258b161094f66857a26ee938d3fe7b8a5063861e44571215c44fbf0e5df", size = 786764, upload-time = "2025-09-01T22:09:09.362Z" }, - { url = "https://files.pythonhosted.org/packages/8d/d5/394e3ffae6baa5a9217bbd14d96e0e5da47bb069d0dbb8278e2681a2b938/regex-2025.9.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bf70e18ac390e6977ea7e56f921768002cb0fa359c4199606c7219854ae332e0", size = 856557, upload-time = "2025-09-01T22:09:11.129Z" }, - { url = "https://files.pythonhosted.org/packages/cd/80/b288d3910c41194ad081b9fb4b371b76b0bbfdce93e7709fc98df27b37dc/regex-2025.9.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:b84036511e1d2bb0a4ff1aec26951caa2dea8772b223c9e8a19ed8885b32dbac", size = 849108, upload-time = "2025-09-01T22:09:12.877Z" }, - { url = "https://files.pythonhosted.org/packages/d1/cd/5ec76bf626d0d5abdc277b7a1734696f5f3d14fbb4a3e2540665bc305d85/regex-2025.9.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c2e05dcdfe224047f2a59e70408274c325d019aad96227ab959403ba7d58d2d7", size = 788201, upload-time = "2025-09-01T22:09:14.561Z" }, - { url = "https://files.pythonhosted.org/packages/b5/36/674672f3fdead107565a2499f3007788b878188acec6d42bc141c5366c2c/regex-2025.9.1-cp313-cp313-win32.whl", hash = "sha256:3b9a62107a7441b81ca98261808fed30ae36ba06c8b7ee435308806bd53c1ed8", size = 264508, upload-time = "2025-09-01T22:09:16.193Z" }, - { url = "https://files.pythonhosted.org/packages/83/ad/931134539515eb64ce36c24457a98b83c1b2e2d45adf3254b94df3735a76/regex-2025.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:b38afecc10c177eb34cfae68d669d5161880849ba70c05cbfbe409f08cc939d7", size = 275469, upload-time = "2025-09-01T22:09:17.462Z" }, - { url = "https://files.pythonhosted.org/packages/24/8c/96d34e61c0e4e9248836bf86d69cb224fd222f270fa9045b24e218b65604/regex-2025.9.1-cp313-cp313-win_arm64.whl", hash = "sha256:ec329890ad5e7ed9fc292858554d28d58d56bf62cf964faf0aa57964b21155a0", size = 268586, upload-time = "2025-09-01T22:09:18.948Z" }, - { url = "https://files.pythonhosted.org/packages/21/b1/453cbea5323b049181ec6344a803777914074b9726c9c5dc76749966d12d/regex-2025.9.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:72fb7a016467d364546f22b5ae86c45680a4e0de6b2a6f67441d22172ff641f1", size = 486111, upload-time = "2025-09-01T22:09:20.734Z" }, - { url = "https://files.pythonhosted.org/packages/f6/0e/92577f197bd2f7652c5e2857f399936c1876978474ecc5b068c6d8a79c86/regex-2025.9.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = 
"sha256:c9527fa74eba53f98ad86be2ba003b3ebe97e94b6eb2b916b31b5f055622ef03", size = 289520, upload-time = "2025-09-01T22:09:22.249Z" }, - { url = "https://files.pythonhosted.org/packages/af/c6/b472398116cca7ea5a6c4d5ccd0fc543f7fd2492cb0c48d2852a11972f73/regex-2025.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c905d925d194c83a63f92422af7544ec188301451b292c8b487f0543726107ca", size = 287215, upload-time = "2025-09-01T22:09:23.657Z" }, - { url = "https://files.pythonhosted.org/packages/cf/11/f12ecb0cf9ca792a32bb92f758589a84149017467a544f2f6bfb45c0356d/regex-2025.9.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:74df7c74a63adcad314426b1f4ea6054a5ab25d05b0244f0c07ff9ce640fa597", size = 797855, upload-time = "2025-09-01T22:09:25.197Z" }, - { url = "https://files.pythonhosted.org/packages/46/88/bbb848f719a540fb5997e71310f16f0b33a92c5d4b4d72d4311487fff2a3/regex-2025.9.1-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4f6e935e98ea48c7a2e8be44494de337b57a204470e7f9c9c42f912c414cd6f5", size = 863363, upload-time = "2025-09-01T22:09:26.705Z" }, - { url = "https://files.pythonhosted.org/packages/54/a9/2321eb3e2838f575a78d48e03c1e83ea61bd08b74b7ebbdeca8abc50fc25/regex-2025.9.1-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4a62d033cd9ebefc7c5e466731a508dfabee827d80b13f455de68a50d3c2543d", size = 910202, upload-time = "2025-09-01T22:09:28.906Z" }, - { url = "https://files.pythonhosted.org/packages/33/07/d1d70835d7d11b7e126181f316f7213c4572ecf5c5c97bdbb969fb1f38a2/regex-2025.9.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef971ebf2b93bdc88d8337238be4dfb851cc97ed6808eb04870ef67589415171", size = 801808, upload-time = "2025-09-01T22:09:30.733Z" }, - { url = "https://files.pythonhosted.org/packages/13/d1/29e4d1bed514ef2bf3a4ead3cb8bb88ca8af94130239a4e68aa765c35b1c/regex-2025.9.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d936a1db208bdca0eca1f2bb2c1ba1d8370b226785c1e6db76e32a228ffd0ad5", size = 786824, upload-time = "2025-09-01T22:09:32.61Z" }, - { url = "https://files.pythonhosted.org/packages/33/27/20d8ccb1bee460faaa851e6e7cc4cfe852a42b70caa1dca22721ba19f02f/regex-2025.9.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:7e786d9e4469698fc63815b8de08a89165a0aa851720eb99f5e0ea9d51dd2b6a", size = 857406, upload-time = "2025-09-01T22:09:34.117Z" }, - { url = "https://files.pythonhosted.org/packages/74/fe/60c6132262dc36430d51e0c46c49927d113d3a38c1aba6a26c7744c84cf3/regex-2025.9.1-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:6b81d7dbc5466ad2c57ce3a0ddb717858fe1a29535c8866f8514d785fdb9fc5b", size = 848593, upload-time = "2025-09-01T22:09:35.598Z" }, - { url = "https://files.pythonhosted.org/packages/cc/ae/2d4ff915622fabbef1af28387bf71e7f2f4944a348b8460d061e85e29bf0/regex-2025.9.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:cd4890e184a6feb0ef195338a6ce68906a8903a0f2eb7e0ab727dbc0a3156273", size = 787951, upload-time = "2025-09-01T22:09:37.139Z" }, - { url = "https://files.pythonhosted.org/packages/85/37/dc127703a9e715a284cc2f7dbdd8a9776fd813c85c126eddbcbdd1ca5fec/regex-2025.9.1-cp314-cp314-win32.whl", hash = "sha256:34679a86230e46164c9e0396b56cab13c0505972343880b9e705083cc5b8ec86", size = 269833, upload-time = "2025-09-01T22:09:39.245Z" }, - { url = 
"https://files.pythonhosted.org/packages/83/bf/4bed4d3d0570e16771defd5f8f15f7ea2311edcbe91077436d6908956c4a/regex-2025.9.1-cp314-cp314-win_amd64.whl", hash = "sha256:a1196e530a6bfa5f4bde029ac5b0295a6ecfaaffbfffede4bbaf4061d9455b70", size = 278742, upload-time = "2025-09-01T22:09:40.651Z" }, - { url = "https://files.pythonhosted.org/packages/cf/3e/7d7ac6fd085023312421e0d69dfabdfb28e116e513fadbe9afe710c01893/regex-2025.9.1-cp314-cp314-win_arm64.whl", hash = "sha256:f46d525934871ea772930e997d577d48c6983e50f206ff7b66d4ac5f8941e993", size = 271860, upload-time = "2025-09-01T22:09:42.413Z" }, -] - -[[package]] -name = "requests" -version = "2.32.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "charset-normalizer" }, - { name = "idna" }, - { name = "urllib3" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, -] - -[[package]] -name = "rich" -version = "14.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markdown-it-py" }, - { name = "pygments" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/fe/75/af448d8e52bf1d8fa6a9d089ca6c07ff4453d86c65c145d0a300bb073b9b/rich-14.1.0.tar.gz", hash = "sha256:e497a48b844b0320d45007cdebfeaeed8db2a4f4bcf49f15e455cfc4af11eaa8", size = 224441, upload-time = "2025-07-25T07:32:58.125Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e3/30/3c4d035596d3cf444529e0b2953ad0466f6049528a879d27534700580395/rich-14.1.0-py3-none-any.whl", hash = "sha256:536f5f1785986d6dbdea3c75205c473f970777b4a0d6c6dd1b696aa05a3fa04f", size = 243368, upload-time = "2025-07-25T07:32:56.73Z" }, -] - -[[package]] -name = "rq" -version = "2.5.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "croniter" }, - { name = "redis" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/48/1c/1c390fd8594e7367c1ee672297f7a877c0982b9c26877242c5a509ad27c0/rq-2.5.0.tar.gz", hash = "sha256:b55d328fcaeaf25823b8b8450283225f8048bd1c52abaaca192c99201ab5c687", size = 666978, upload-time = "2025-08-15T10:41:34.84Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/14/36/8917bcfc9794cbc4dd984962feb401f2dfeee0d89e1e40e3367420996f42/rq-2.5.0-py3-none-any.whl", hash = "sha256:90c74eb5b5793ff08e6c3391fd6deb7151f308ac8f04b6831580b38e90688155", size = 108377, upload-time = "2025-08-15T10:41:21.792Z" }, -] - -[[package]] -name = "ruff" -version = "0.12.9" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/4a/45/2e403fa7007816b5fbb324cb4f8ed3c7402a927a0a0cb2b6279879a8bfdc/ruff-0.12.9.tar.gz", hash = "sha256:fbd94b2e3c623f659962934e52c2bea6fc6da11f667a427a368adaf3af2c866a", size = 5254702, upload-time = "2025-08-14T16:08:55.2Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ad/20/53bf098537adb7b6a97d98fcdebf6e916fcd11b2e21d15f8c171507909cc/ruff-0.12.9-py3-none-linux_armv6l.whl", hash = 
"sha256:fcebc6c79fcae3f220d05585229463621f5dbf24d79fdc4936d9302e177cfa3e", size = 11759705, upload-time = "2025-08-14T16:08:12.968Z" }, - { url = "https://files.pythonhosted.org/packages/20/4d/c764ee423002aac1ec66b9d541285dd29d2c0640a8086c87de59ebbe80d5/ruff-0.12.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:aed9d15f8c5755c0e74467731a007fcad41f19bcce41cd75f768bbd687f8535f", size = 12527042, upload-time = "2025-08-14T16:08:16.54Z" }, - { url = "https://files.pythonhosted.org/packages/8b/45/cfcdf6d3eb5fc78a5b419e7e616d6ccba0013dc5b180522920af2897e1be/ruff-0.12.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5b15ea354c6ff0d7423814ba6d44be2807644d0c05e9ed60caca87e963e93f70", size = 11724457, upload-time = "2025-08-14T16:08:18.686Z" }, - { url = "https://files.pythonhosted.org/packages/72/e6/44615c754b55662200c48bebb02196dbb14111b6e266ab071b7e7297b4ec/ruff-0.12.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d596c2d0393c2502eaabfef723bd74ca35348a8dac4267d18a94910087807c53", size = 11949446, upload-time = "2025-08-14T16:08:21.059Z" }, - { url = "https://files.pythonhosted.org/packages/fd/d1/9b7d46625d617c7df520d40d5ac6cdcdf20cbccb88fad4b5ecd476a6bb8d/ruff-0.12.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1b15599931a1a7a03c388b9c5df1bfa62be7ede6eb7ef753b272381f39c3d0ff", size = 11566350, upload-time = "2025-08-14T16:08:23.433Z" }, - { url = "https://files.pythonhosted.org/packages/59/20/b73132f66f2856bc29d2d263c6ca457f8476b0bbbe064dac3ac3337a270f/ruff-0.12.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3d02faa2977fb6f3f32ddb7828e212b7dd499c59eb896ae6c03ea5c303575756", size = 13270430, upload-time = "2025-08-14T16:08:25.837Z" }, - { url = "https://files.pythonhosted.org/packages/a2/21/eaf3806f0a3d4c6be0a69d435646fba775b65f3f2097d54898b0fd4bb12e/ruff-0.12.9-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:17d5b6b0b3a25259b69ebcba87908496e6830e03acfb929ef9fd4c58675fa2ea", size = 14264717, upload-time = "2025-08-14T16:08:27.907Z" }, - { url = "https://files.pythonhosted.org/packages/d2/82/1d0c53bd37dcb582b2c521d352fbf4876b1e28bc0d8894344198f6c9950d/ruff-0.12.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:72db7521860e246adbb43f6ef464dd2a532ef2ef1f5dd0d470455b8d9f1773e0", size = 13684331, upload-time = "2025-08-14T16:08:30.352Z" }, - { url = "https://files.pythonhosted.org/packages/3b/2f/1c5cf6d8f656306d42a686f1e207f71d7cebdcbe7b2aa18e4e8a0cb74da3/ruff-0.12.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a03242c1522b4e0885af63320ad754d53983c9599157ee33e77d748363c561ce", size = 12739151, upload-time = "2025-08-14T16:08:32.55Z" }, - { url = "https://files.pythonhosted.org/packages/47/09/25033198bff89b24d734e6479e39b1968e4c992e82262d61cdccaf11afb9/ruff-0.12.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fc83e4e9751e6c13b5046d7162f205d0a7bac5840183c5beebf824b08a27340", size = 12954992, upload-time = "2025-08-14T16:08:34.816Z" }, - { url = "https://files.pythonhosted.org/packages/52/8e/d0dbf2f9dca66c2d7131feefc386523404014968cd6d22f057763935ab32/ruff-0.12.9-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:881465ed56ba4dd26a691954650de6ad389a2d1fdb130fe51ff18a25639fe4bb", size = 12899569, upload-time = "2025-08-14T16:08:36.852Z" }, - { url = "https://files.pythonhosted.org/packages/a0/bd/b614d7c08515b1428ed4d3f1d4e3d687deffb2479703b90237682586fa66/ruff-0.12.9-py3-none-musllinux_1_2_aarch64.whl", hash = 
"sha256:43f07a3ccfc62cdb4d3a3348bf0588358a66da756aa113e071b8ca8c3b9826af", size = 11751983, upload-time = "2025-08-14T16:08:39.314Z" }, - { url = "https://files.pythonhosted.org/packages/58/d6/383e9f818a2441b1a0ed898d7875f11273f10882f997388b2b51cb2ae8b5/ruff-0.12.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:07adb221c54b6bba24387911e5734357f042e5669fa5718920ee728aba3cbadc", size = 11538635, upload-time = "2025-08-14T16:08:41.297Z" }, - { url = "https://files.pythonhosted.org/packages/20/9c/56f869d314edaa9fc1f491706d1d8a47747b9d714130368fbd69ce9024e9/ruff-0.12.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f5cd34fabfdea3933ab85d72359f118035882a01bff15bd1d2b15261d85d5f66", size = 12534346, upload-time = "2025-08-14T16:08:43.39Z" }, - { url = "https://files.pythonhosted.org/packages/bd/4b/d8b95c6795a6c93b439bc913ee7a94fda42bb30a79285d47b80074003ee7/ruff-0.12.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:f6be1d2ca0686c54564da8e7ee9e25f93bdd6868263805f8c0b8fc6a449db6d7", size = 13017021, upload-time = "2025-08-14T16:08:45.889Z" }, - { url = "https://files.pythonhosted.org/packages/c7/c1/5f9a839a697ce1acd7af44836f7c2181cdae5accd17a5cb85fcbd694075e/ruff-0.12.9-py3-none-win32.whl", hash = "sha256:cc7a37bd2509974379d0115cc5608a1a4a6c4bff1b452ea69db83c8855d53f93", size = 11734785, upload-time = "2025-08-14T16:08:48.062Z" }, - { url = "https://files.pythonhosted.org/packages/fa/66/cdddc2d1d9a9f677520b7cfc490d234336f523d4b429c1298de359a3be08/ruff-0.12.9-py3-none-win_amd64.whl", hash = "sha256:6fb15b1977309741d7d098c8a3cb7a30bc112760a00fb6efb7abc85f00ba5908", size = 12840654, upload-time = "2025-08-14T16:08:50.158Z" }, - { url = "https://files.pythonhosted.org/packages/ac/fd/669816bc6b5b93b9586f3c1d87cd6bc05028470b3ecfebb5938252c47a35/ruff-0.12.9-py3-none-win_arm64.whl", hash = "sha256:63c8c819739d86b96d500cce885956a1a48ab056bbcbc61b747ad494b2485089", size = 11949623, upload-time = "2025-08-14T16:08:52.233Z" }, -] - -[[package]] -name = "selectolax" -version = "0.3.34" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/bf/8c/8bbe1b17098b4e2a63a251361870303c37ad4c3170536277096575c24ca4/selectolax-0.3.34.tar.gz", hash = "sha256:c2cdb30b60994f1e0b74574dd408f1336d2fadd68a3ebab8ea573740dcbf17e2", size = 4706599, upload-time = "2025-08-28T23:17:44.131Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d0/29/eeb77d1a77599023387d4d00655960dfa3d760557b42a65ef347e29b40b0/selectolax-0.3.34-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2bb74e079098d758bd3d5c77b1c66c90098de305e4084b60981e561acf52c12a", size = 2001199, upload-time = "2025-08-28T23:16:59.467Z" }, - { url = "https://files.pythonhosted.org/packages/21/80/326b9dd2901b64c3c654db9e8841ddc412b9c2af0047b7d43290bbb276be/selectolax-0.3.34-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cc39822f714e6e434ceb893e1ccff873f3f88c8db8226ba2f8a5f4a7a0e2aa29", size = 1994171, upload-time = "2025-08-28T23:17:01.206Z" }, - { url = "https://files.pythonhosted.org/packages/15/af/1265e4f9429b3c3cf098ba08cb3264d7e16990ed3029d89e9890012aae76/selectolax-0.3.34-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:181b67949ec23b4f11b6f2e426ba9904dd25c73d12c2cb22caf8fae21a363e99", size = 2196092, upload-time = "2025-08-28T23:17:02.574Z" }, - { url = 
"https://files.pythonhosted.org/packages/1c/41/e67100abd8b0b2a5e1d5d7fa864c31d31e9a2c0bbd08ce4e951235f13143/selectolax-0.3.34-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0b09f9d7b22bbb633966ac2019ec059caf735a5bdb4a5784bab0f4db2198fd6a", size = 2233674, upload-time = "2025-08-28T23:17:03.928Z" }, - { url = "https://files.pythonhosted.org/packages/3a/24/7ad043805c9292b4f535071c223d10aad7703b4460d68de1dce9dcf21d3f/selectolax-0.3.34-cp313-cp313-win32.whl", hash = "sha256:6e2ae8a984f82c9373e8a5ec0450f67603fde843fed73675f5187986e9e45b59", size = 1686489, upload-time = "2025-08-28T23:17:05.341Z" }, - { url = "https://files.pythonhosted.org/packages/6b/79/62666fbfcd847c0cfc2b75b496bfa8382d765e7a3d5a2c792004760a6e61/selectolax-0.3.34-cp313-cp313-win_amd64.whl", hash = "sha256:96acd5414aaf0bb8677258ff7b0f494953b2621f71be1e3d69e01743545509ec", size = 1789924, upload-time = "2025-08-28T23:17:06.708Z" }, - { url = "https://files.pythonhosted.org/packages/5d/b5/0bb579210a7de36d97c359016e77119513d3e810c61e99ade72089bc1b4d/selectolax-0.3.34-cp313-cp313-win_arm64.whl", hash = "sha256:1d309fd17ba72bb46a282154f75752ed7746de6f00e2c1eec4cd421dcdadf008", size = 1737480, upload-time = "2025-08-28T23:17:08.575Z" }, - { url = "https://files.pythonhosted.org/packages/b8/5c/ab87e8ecb3c6aa1053d1c6d1eba0e47e292cc72aff0f6fbb89d920d4d87c/selectolax-0.3.34-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:3e9c4197563c9b62b56dd7545bfd993ce071fd40b8779736e9bc59813f014c23", size = 2000587, upload-time = "2025-08-28T23:17:10.327Z" }, - { url = "https://files.pythonhosted.org/packages/72/8e/5c08bd5628f73ab582696f8349138a569115a0fd6ab71842e4115ceec4ff/selectolax-0.3.34-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f96eaa0da764a4b9e08e792c0f17cce98749f1406ffad35e6d4835194570bdbf", size = 1994327, upload-time = "2025-08-28T23:17:11.709Z" }, - { url = "https://files.pythonhosted.org/packages/ac/29/02b22eff289b29ee3f869a85e4be4f7f3cf4b480d429bb18aab014848917/selectolax-0.3.34-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:412ce46d963444cd378e9f3197a2f30b05d858722677a361fc44ad244d2bb7db", size = 2201620, upload-time = "2025-08-28T23:17:13.538Z" }, - { url = "https://files.pythonhosted.org/packages/6d/d3/bdd3a94bb1276be4ef4371dbfd254137b22f5c54a94d051a8d72c3956dc6/selectolax-0.3.34-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:58dd7dc062b0424adb001817bf9b05476d165a4db1885a69cac66ca16b313035", size = 2233487, upload-time = "2025-08-28T23:17:14.921Z" }, - { url = "https://files.pythonhosted.org/packages/e6/6a/5d551c570f29bfca5815f45fa6e6a3310cc5bc6c9b1073a968d71f73612b/selectolax-0.3.34-cp314-cp314-win32.whl", hash = "sha256:4255558fa48e3685a13f3d9dfc84586146c7b0b86e44c899ac2ac263357c987f", size = 1779755, upload-time = "2025-08-28T23:17:16.322Z" }, - { url = "https://files.pythonhosted.org/packages/cc/dc/5def41b07cb3b917841022489e6bd6c3277363c23b44eca00a0ada93221c/selectolax-0.3.34-cp314-cp314-win_amd64.whl", hash = "sha256:6cbf2707d79afd7e15083f3f32c11c9b6e39a39026c8b362ce25959842a837b6", size = 1877332, upload-time = "2025-08-28T23:17:17.766Z" }, - { url = "https://files.pythonhosted.org/packages/19/0f/63da99be8f78bbfca0cb3f9ad71b7475ab97383f830c86a9abd29c6d3f25/selectolax-0.3.34-cp314-cp314-win_arm64.whl", hash = "sha256:3aa83e4d1f5f5534c9d9e44fc53640c82edc7d0eef6fca0829830cccc8df9568", size = 1831124, upload-time = "2025-08-28T23:17:19.744Z" }, - { url = 
"https://files.pythonhosted.org/packages/39/5c/07d8031c6c106de10ff42b4440ad7fa6a038650942bb2e194e4eb9ffec6d/selectolax-0.3.34-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:bb0b9002974ec7052f7eb1439b8e404e11a00a26affcbdd73fc53fc55beec809", size = 2023889, upload-time = "2025-08-28T23:17:21.222Z" }, - { url = "https://files.pythonhosted.org/packages/fd/80/fa8220c2eae44928b5ae73eccd44baedb328109f115c948d796c46d11048/selectolax-0.3.34-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:38e5fdffab6d08800a19671ac9641ff9ca6738fad42090f4dd0da76e4db29582", size = 2011882, upload-time = "2025-08-28T23:17:22.844Z" }, - { url = "https://files.pythonhosted.org/packages/f6/02/657089f68f59308bd90137102a7f6da0c3770128ae7245e1290e99f5a48d/selectolax-0.3.34-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:871d35e19dfde9ee83c1df139940c2e5cdf6a50ef3d147a0e9acf382b63b5b3e", size = 2221871, upload-time = "2025-08-28T23:17:24.259Z" }, - { url = "https://files.pythonhosted.org/packages/d2/56/1ad7877f9b2b12f616a8847eca0a3047c6b5ed14588f21fe1f6915357efb/selectolax-0.3.34-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f3f269bc53bc84ccc166704263712f4448130ec827a38a0df230cffe3dc46a9", size = 2241032, upload-time = "2025-08-28T23:17:25.76Z" }, - { url = "https://files.pythonhosted.org/packages/60/c0/30ce665b7382f663fdbb282748ddee392a61c85f51862776b128d8644d45/selectolax-0.3.34-cp314-cp314t-win32.whl", hash = "sha256:b957d105c2f3d86de872f61be1c9a92e1d84580a5ec89a413282f60ffb3f7bc1", size = 1828494, upload-time = "2025-08-28T23:17:27.447Z" }, - { url = "https://files.pythonhosted.org/packages/a4/9e/11d023ad74d0d1a48cefdddbb2d00365c4d9a97735d7c24c0f206cd1babb/selectolax-0.3.34-cp314-cp314t-win_amd64.whl", hash = "sha256:9c609d639ce09154d688063bb830dc351fb944fa52629e25717dbab45ad04327", size = 1951608, upload-time = "2025-08-28T23:17:29.327Z" }, - { url = "https://files.pythonhosted.org/packages/cc/20/a5f93b84e3e6de9756dc82465c0dff57b1c8a25b1815bca0817e4342494c/selectolax-0.3.34-cp314-cp314t-win_arm64.whl", hash = "sha256:6359e94d66fb4fce9fb7c9d18252c3d8cba28b90f7412da8ce610bd77746f750", size = 1852855, upload-time = "2025-08-28T23:17:30.746Z" }, -] - -[[package]] -name = "setuptools" -version = "80.9.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/18/5d/3bf57dcd21979b887f014ea83c24ae194cfcd12b9e0fda66b957c69d1fca/setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c", size = 1319958, upload-time = "2025-05-27T00:56:51.443Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922", size = 1201486, upload-time = "2025-05-27T00:56:49.664Z" }, -] - -[[package]] -name = "shellingham" -version = "1.5.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash 
= "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, -] - -[[package]] -name = "six" -version = "1.17.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, -] - -[[package]] -name = "sniffio" -version = "1.3.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, -] - -[[package]] -name = "soupsieve" -version = "2.8" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6d/e6/21ccce3262dd4889aa3332e5a119a3491a95e8f60939870a3a035aabac0d/soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f", size = 103472, upload-time = "2025-08-27T15:39:51.78Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/14/a0/bb38d3b76b8cae341dad93a2dd83ab7462e6dbcdd84d43f54ee60a8dc167/soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c", size = 36679, upload-time = "2025-08-27T15:39:50.179Z" }, -] - -[[package]] -name = "stevedore" -version = "5.4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pbr" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/28/3f/13cacea96900bbd31bb05c6b74135f85d15564fc583802be56976c940470/stevedore-5.4.1.tar.gz", hash = "sha256:3135b5ae50fe12816ef291baff420acb727fcd356106e3e9cbfa9e5985cd6f4b", size = 513858, upload-time = "2025-02-20T14:03:57.285Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f7/45/8c4ebc0c460e6ec38e62ab245ad3c7fc10b210116cea7c16d61602aa9558/stevedore-5.4.1-py3-none-any.whl", hash = "sha256:d10a31c7b86cba16c1f6e8d15416955fc797052351a56af15e608ad20811fcfe", size = 49533, upload-time = "2025-02-20T14:03:55.849Z" }, -] - -[[package]] -name = "tiktoken" -version = "0.12.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "regex" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/00/61/441588ee21e6b5cdf59d6870f86beb9789e532ee9718c251b391b70c68d6/tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3", size = 1050802, upload-time = "2025-10-06T20:22:00.96Z" }, - { url = "https://files.pythonhosted.org/packages/1f/05/dcf94486d5c5c8d34496abe271ac76c5b785507c8eae71b3708f1ad9b45a/tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160", size = 993995, upload-time = "2025-10-06T20:22:02.788Z" }, - { url = "https://files.pythonhosted.org/packages/a0/70/5163fe5359b943f8db9946b62f19be2305de8c3d78a16f629d4165e2f40e/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa", size = 1128948, upload-time = "2025-10-06T20:22:03.814Z" }, - { url = "https://files.pythonhosted.org/packages/0c/da/c028aa0babf77315e1cef357d4d768800c5f8a6de04d0eac0f377cb619fa/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be", size = 1151986, upload-time = "2025-10-06T20:22:05.173Z" }, - { url = "https://files.pythonhosted.org/packages/a0/5a/886b108b766aa53e295f7216b509be95eb7d60b166049ce2c58416b25f2a/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a", size = 1194222, upload-time = "2025-10-06T20:22:06.265Z" }, - { url = "https://files.pythonhosted.org/packages/f4/f8/4db272048397636ac7a078d22773dd2795b1becee7bc4922fe6207288d57/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3", size = 1255097, upload-time = "2025-10-06T20:22:07.403Z" }, - { url = "https://files.pythonhosted.org/packages/8e/32/45d02e2e0ea2be3a9ed22afc47d93741247e75018aac967b713b2941f8ea/tiktoken-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697", size = 879117, upload-time = "2025-10-06T20:22:08.418Z" }, - { url = "https://files.pythonhosted.org/packages/ce/76/994fc868f88e016e6d05b0da5ac24582a14c47893f4474c3e9744283f1d5/tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16", size = 1050309, upload-time = "2025-10-06T20:22:10.939Z" }, - { url = "https://files.pythonhosted.org/packages/f6/b8/57ef1456504c43a849821920d582a738a461b76a047f352f18c0b26c6516/tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a", size = 993712, upload-time = "2025-10-06T20:22:12.115Z" }, - { url = "https://files.pythonhosted.org/packages/72/90/13da56f664286ffbae9dbcfadcc625439142675845baa62715e49b87b68b/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27", size = 1128725, upload-time = "2025-10-06T20:22:13.541Z" }, - { url = "https://files.pythonhosted.org/packages/05/df/4f80030d44682235bdaecd7346c90f67ae87ec8f3df4a3442cb53834f7e4/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb", size = 1151875, upload-time = "2025-10-06T20:22:14.559Z" }, - { url = 
"https://files.pythonhosted.org/packages/22/1f/ae535223a8c4ef4c0c1192e3f9b82da660be9eb66b9279e95c99288e9dab/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e", size = 1194451, upload-time = "2025-10-06T20:22:15.545Z" }, - { url = "https://files.pythonhosted.org/packages/78/a7/f8ead382fce0243cb625c4f266e66c27f65ae65ee9e77f59ea1653b6d730/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25", size = 1253794, upload-time = "2025-10-06T20:22:16.624Z" }, - { url = "https://files.pythonhosted.org/packages/93/e0/6cc82a562bc6365785a3ff0af27a2a092d57c47d7a81d9e2295d8c36f011/tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f", size = 878777, upload-time = "2025-10-06T20:22:18.036Z" }, - { url = "https://files.pythonhosted.org/packages/72/05/3abc1db5d2c9aadc4d2c76fa5640134e475e58d9fbb82b5c535dc0de9b01/tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646", size = 1050188, upload-time = "2025-10-06T20:22:19.563Z" }, - { url = "https://files.pythonhosted.org/packages/e3/7b/50c2f060412202d6c95f32b20755c7a6273543b125c0985d6fa9465105af/tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88", size = 993978, upload-time = "2025-10-06T20:22:20.702Z" }, - { url = "https://files.pythonhosted.org/packages/14/27/bf795595a2b897e271771cd31cb847d479073497344c637966bdf2853da1/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff", size = 1129271, upload-time = "2025-10-06T20:22:22.06Z" }, - { url = "https://files.pythonhosted.org/packages/f5/de/9341a6d7a8f1b448573bbf3425fa57669ac58258a667eb48a25dfe916d70/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830", size = 1151216, upload-time = "2025-10-06T20:22:23.085Z" }, - { url = "https://files.pythonhosted.org/packages/75/0d/881866647b8d1be4d67cb24e50d0c26f9f807f994aa1510cb9ba2fe5f612/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b", size = 1194860, upload-time = "2025-10-06T20:22:24.602Z" }, - { url = "https://files.pythonhosted.org/packages/b3/1e/b651ec3059474dab649b8d5b69f5c65cd8fcd8918568c1935bd4136c9392/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b", size = 1254567, upload-time = "2025-10-06T20:22:25.671Z" }, - { url = "https://files.pythonhosted.org/packages/80/57/ce64fd16ac390fafde001268c364d559447ba09b509181b2808622420eec/tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3", size = 921067, upload-time = "2025-10-06T20:22:26.753Z" }, - { url = "https://files.pythonhosted.org/packages/ac/a4/72eed53e8976a099539cdd5eb36f241987212c29629d0a52c305173e0a68/tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365", size = 1050473, upload-time = "2025-10-06T20:22:27.775Z" }, - { url = 
"https://files.pythonhosted.org/packages/e6/d7/0110b8f54c008466b19672c615f2168896b83706a6611ba6e47313dbc6e9/tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e", size = 993855, upload-time = "2025-10-06T20:22:28.799Z" }, - { url = "https://files.pythonhosted.org/packages/5f/77/4f268c41a3957c418b084dd576ea2fad2e95da0d8e1ab705372892c2ca22/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63", size = 1129022, upload-time = "2025-10-06T20:22:29.981Z" }, - { url = "https://files.pythonhosted.org/packages/4e/2b/fc46c90fe5028bd094cd6ee25a7db321cb91d45dc87531e2bdbb26b4867a/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0", size = 1150736, upload-time = "2025-10-06T20:22:30.996Z" }, - { url = "https://files.pythonhosted.org/packages/28/c0/3c7a39ff68022ddfd7d93f3337ad90389a342f761c4d71de99a3ccc57857/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a", size = 1194908, upload-time = "2025-10-06T20:22:32.073Z" }, - { url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706, upload-time = "2025-10-06T20:22:33.385Z" }, - { url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" }, -] - -[[package]] -name = "tld" -version = "0.13.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/df/a1/5723b07a70c1841a80afc9ac572fdf53488306848d844cd70519391b0d26/tld-0.13.1.tar.gz", hash = "sha256:75ec00936cbcf564f67361c41713363440b6c4ef0f0c1592b5b0fbe72c17a350", size = 462000, upload-time = "2025-05-21T22:18:29.341Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/70/b2f38360c3fc4bc9b5e8ef429e1fde63749144ac583c2dbdf7e21e27a9ad/tld-0.13.1-py2.py3-none-any.whl", hash = "sha256:a2d35109433ac83486ddf87e3c4539ab2c5c2478230e5d9c060a18af4b03aa7c", size = 274718, upload-time = "2025-05-21T22:18:25.811Z" }, -] - -[[package]] -name = "trafilatura" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "charset-normalizer" }, - { name = "courlan" }, - { name = "htmldate" }, - { name = "justext" }, - { name = "lxml" }, - { name = "urllib3" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/06/25/e3ebeefdebfdfae8c4a4396f5a6ea51fc6fa0831d63ce338e5090a8003dc/trafilatura-2.0.0.tar.gz", hash = "sha256:ceb7094a6ecc97e72fea73c7dba36714c5c5b577b6470e4520dca893706d6247", size = 253404, upload-time = "2024-12-03T15:23:24.16Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8a/b6/097367f180b6383a3581ca1b86fcae284e52075fa941d1232df35293363c/trafilatura-2.0.0-py3-none-any.whl", hash = "sha256:77eb5d1e993747f6f20938e1de2d840020719735690c840b9a1024803a4cd51d", size = 132557, upload-time = "2024-12-03T15:23:21.41Z" }, -] - -[[package]] -name = "typer" -version = "0.16.1" -source = { registry = 
"https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "rich" }, - { name = "shellingham" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/43/78/d90f616bf5f88f8710ad067c1f8705bf7618059836ca084e5bb2a0855d75/typer-0.16.1.tar.gz", hash = "sha256:d358c65a464a7a90f338e3bb7ff0c74ac081449e53884b12ba658cbd72990614", size = 102836, upload-time = "2025-08-18T19:18:22.898Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2d/76/06dbe78f39b2203d2a47d5facc5df5102d0561e2807396471b5f7c5a30a1/typer-0.16.1-py3-none-any.whl", hash = "sha256:90ee01cb02d9b8395ae21ee3368421faf21fa138cb2a541ed369c08cec5237c9", size = 46397, upload-time = "2025-08-18T19:18:21.663Z" }, -] - -[[package]] -name = "typing-extensions" -version = "4.14.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/98/5a/da40306b885cc8c09109dc2e1abd358d5684b1425678151cdaed4731c822/typing_extensions-4.14.1.tar.gz", hash = "sha256:38b39f4aeeab64884ce9f74c94263ef78f3c22467c8724005483154c26648d36", size = 107673, upload-time = "2025-07-04T13:28:34.16Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl", hash = "sha256:d1e1e3b58374dc93031d6eda2420a48ea44a36c2b4766a4fdeb3710755731d76", size = 43906, upload-time = "2025-07-04T13:28:32.743Z" }, -] - -[[package]] -name = "typing-inspection" -version = "0.4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f8/b1/0c11f5058406b3af7609f121aaa6b609744687f1d158b3c3a5bf4cc94238/typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28", size = 75726, upload-time = "2025-05-21T18:55:23.885Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51", size = 14552, upload-time = "2025-05-21T18:55:22.152Z" }, -] - -[[package]] -name = "tzdata" -version = "2025.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, -] - -[[package]] -name = "tzlocal" -version = "5.3.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "tzdata", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8b/2e/c14812d3d4d9cd1773c6be938f89e5735a1f11a9f184ac3639b93cef35d5/tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd", size = 30761, upload-time = "2025-03-05T21:17:41.549Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/c2/14/e2a54fabd4f08cd7af1c07030603c3356b74da07f7cc056e600436edfa17/tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d", size = 18026, upload-time = "2025-03-05T21:17:39.857Z" }, -] - -[[package]] -name = "urllib3" -version = "2.5.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, -] - -[[package]] -name = "uv-build" -version = "0.8.12" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/58/1d/109827cffcdd2430783450591083a3cc9b80c8d34f962ff86e00a7d73eaf/uv_build-0.8.12.tar.gz", hash = "sha256:49666685059bf5c62e5634371b00b2012ebe3e4e4d0f479cff0400bf66ad1e3a", size = 322245, upload-time = "2025-08-18T23:59:48.408Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6d/6e/75995ef959314680fc127c3d947bc2dec1fed57a0fb400b81270dda01132/uv_build-0.8.12-py3-none-linux_armv6l.whl", hash = "sha256:03cd118ae8731aeca7994a48d6f23a5d4aacef5ee9c88bc60daf99ad698cefae", size = 1318465, upload-time = "2025-08-18T23:59:19.615Z" }, - { url = "https://files.pythonhosted.org/packages/fc/55/fa65b463af6b2c1738b81d6153975ca3b1a07056552f0993c2cf7b324018/uv_build-0.8.12-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:23d3d46cd619640b4b3e2977cfe629fb898586d21b8b641c9385021b1755fde5", size = 1299484, upload-time = "2025-08-18T23:59:23.737Z" }, - { url = "https://files.pythonhosted.org/packages/55/21/14fb0309c64e324f13f309460fc5a1ebf4872c1f91be89d50039c8e3a91c/uv_build-0.8.12-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a6676b94db118f4b3e903acf52f4acc6e8b558330d576a8438181726b47bad15", size = 1177028, upload-time = "2025-08-18T23:59:25.052Z" }, - { url = "https://files.pythonhosted.org/packages/dc/ae/61ebacd6b43f97300409412ba99d274305919bbda367c44ea4b114c91ac5/uv_build-0.8.12-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:2135094eab1657c121a74176a41f2ad30066962f476dac11b6c48ad6cb279392", size = 1367327, upload-time = "2025-08-18T23:59:26.676Z" }, - { url = "https://files.pythonhosted.org/packages/d4/f7/d8c29e322ecb569774e90f3e9a1b8018465a4c88e62c6083aa91f7c53de9/uv_build-0.8.12-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:20199b48eebf3a07046d5988b4eca8c3a8c83e50299e8e6bba085bf8f2e02611", size = 1274839, upload-time = "2025-08-18T23:59:28.034Z" }, - { url = "https://files.pythonhosted.org/packages/a3/be/63ef8eb542b98d3d4536b8519f9e4d4dbf8f52443975740be9f833fa4985/uv_build-0.8.12-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9fdd226820cfdba719779f4ccbf594258177f67ef1907141a8b959757c26d55c", size = 1426207, upload-time = "2025-08-18T23:59:29.687Z" }, - { url = "https://files.pythonhosted.org/packages/80/b0/3ea05c1cdbc32fd13e0e97d56e8b3be4cd350ed5e6d9aa137ebe65afb5ae/uv_build-0.8.12-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = 
"sha256:9c76003c6af6c6949f796448458bb104c5d3f7d9a1ced3f3aeed613e2f47677e", size = 1577750, upload-time = "2025-08-18T23:59:30.983Z" }, - { url = "https://files.pythonhosted.org/packages/53/ed/1391d420efdbeb07353db1404e34830a322fe2efb64853c0d4fcda315276/uv_build-0.8.12-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dfe32cdb94c85981597d40efc08c01ff30267db18935df50ffcef1258e091d52", size = 1481257, upload-time = "2025-08-18T23:59:32.248Z" }, - { url = "https://files.pythonhosted.org/packages/26/28/bc6c7d00fb3a4713f85359c8687067111021542f379d5ff49136cfbe9b64/uv_build-0.8.12-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a96aa67f8071a025b41abc661ddd0cec2731d1530095479f2b810b1c04a09252", size = 1418075, upload-time = "2025-08-18T23:59:33.961Z" }, - { url = "https://files.pythonhosted.org/packages/23/05/39236c6e86a5d49a0d4c80064907665db34a8c180ba3110bca436ddbb8f3/uv_build-0.8.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6878f2179dafb1053a413ad41f2f9640655489972bec6211aaf8d492b49614af", size = 1421678, upload-time = "2025-08-18T23:59:35.653Z" }, - { url = "https://files.pythonhosted.org/packages/66/d7/731bec1f5955de6ea33cffcf568a81375dfe80e17215dd66cdf659fcd28c/uv_build-0.8.12-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:7bce23578e8abbb40fd70aebed1afd27d132915e451551322f10aa304dd8bf26", size = 1365561, upload-time = "2025-08-18T23:59:37.664Z" }, - { url = "https://files.pythonhosted.org/packages/bb/b8/1219fa9d21c1deacd8d8b9f4b4193596ea6cdbef718e299b371354c19897/uv_build-0.8.12-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:a9c57674dd757f8208b6e4929abd5bcb6b63bab1ea5fab0f3feaa4c40236c7dd", size = 1375369, upload-time = "2025-08-18T23:59:38.948Z" }, - { url = "https://files.pythonhosted.org/packages/ae/31/700da060b59d4bb163f146d2f673292937595efa77e71a73842b945e49c7/uv_build-0.8.12-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:021a75dec60bf14f0bebdf10aafa08a03ad5d2c9bfd82565b77ac56a82316911", size = 1290573, upload-time = "2025-08-18T23:59:40.223Z" }, - { url = "https://files.pythonhosted.org/packages/d5/9b/711a875605583bed36ff18ccd5351f2582cafedef4720a667e90e6023e3a/uv_build-0.8.12-py3-none-musllinux_1_1_i686.whl", hash = "sha256:2884df52ef9c47bccebf0f616380b281078a4e50fd29a6d44e841f2e2532f687", size = 1380155, upload-time = "2025-08-18T23:59:41.868Z" }, - { url = "https://files.pythonhosted.org/packages/67/31/4b0269dbebd18e406ec565ead0c0b05909d255cd4650dfac1b198542e92d/uv_build-0.8.12-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:c8072519032f4c90e36ea4650fa4a86a30a6d3355082a31f996e7c9e6a6e92f6", size = 1462583, upload-time = "2025-08-18T23:59:43.164Z" }, - { url = "https://files.pythonhosted.org/packages/f3/01/2d47a047109ac53d40c3912d15a4aeadfa67c3937dcd7cd854f865e25fef/uv_build-0.8.12-py3-none-win32.whl", hash = "sha256:45830715e022b85994c06db03ea1a337684cef441ab3ecd38d4b03071845f662", size = 1251560, upload-time = "2025-08-18T23:59:44.425Z" }, - { url = "https://files.pythonhosted.org/packages/c6/11/d8a0a1b87e4cca37abbeb3756119260d9f84bc954cec0bfb04447138a19e/uv_build-0.8.12-py3-none-win_amd64.whl", hash = "sha256:b549a205e1a7487f278baa5fd59dae6901955be7af024dea9d17615e64312cf4", size = 1329565, upload-time = "2025-08-18T23:59:45.932Z" }, - { url = "https://files.pythonhosted.org/packages/d4/0d/c2b30dd90d9fbd0ddef6db4b0fc60e80643d0ef2501229078dcff79067f1/uv_build-0.8.12-py3-none-win_arm64.whl", hash = "sha256:f0c05d62de6c8cb59eb686ac8c6a4e9549f81603864df4f853923eefc850f674", size = 1236604, 
upload-time = "2025-08-18T23:59:47.094Z" }, -]