[backend, crawler] feat: support token statistics

This commit is contained in:
2025-10-25 03:23:15 +02:00
parent 8e456cff75
commit 799cda6e06
32 changed files with 414 additions and 60 deletions
+1 -1
View File
@@ -12,7 +12,7 @@
- Install the project in your virtualenv so the `basango` CLI is available:
- With uv: `uv run --with . basango --help`
- Or install locally: `pip install -e .` then `basango --help`
- Or install locally: `uv sync` then `basango --help`
#### Sync crawl (in-process)
+2 -2
View File
@@ -37,7 +37,7 @@ sources:
replacement: "$3-$2-$1 $4"
source_selectors:
articles: ".view-content > .views-row.content-row"
article_title: ".views-field-title a"
article_title: "h1.page-header"
article_link: ".views-field-title a"
article_body: ".field-name-body"
article_date: ".views-field-created"
@@ -45,7 +45,7 @@ sources:
pagination: "ul.pagination > li.pager-last > a"
pagination_template: "actualite"
supports_categories: false
requires_details: false
requires_details: true
requires_rate_limit: false
- source_id: 7sur7.cd
+1
View File
@@ -17,6 +17,7 @@ dependencies = [
"markdownify>=0.13.1",
"readability-lxml>=0.8.1",
"beautifulsoup4>=4.13.5",
"tiktoken>=0.12.0",
]
[dependency-groups]
@@ -2,6 +2,7 @@ from datetime import datetime
from typing import Any, Optional
from pydantic import BaseModel, HttpUrl
from .token_statistics import TokenStatistics
class Article(BaseModel):
@@ -12,6 +13,7 @@ class Article(BaseModel):
source: str
timestamp: datetime
metadata: Optional[dict[str, Any]] = None
token_statistics: Optional["TokenStatistics"] = None
def to_dict(self) -> dict[str, Any]:
return {
@@ -22,4 +24,7 @@ class Article(BaseModel):
"source": self.source,
"timestamp": int(self.timestamp.timestamp()),
"metadata": self.metadata,
"tokenStatistics": self.token_statistics.to_dict()
if self.token_statistics
else "",
}
@@ -0,0 +1,19 @@
from dataclasses import dataclass
@dataclass
class TokenStatistics:
    """Per-section token counts recorded for a single article."""

    # Each field holds the number of tokens counted for that section.
    title: int
    body: int
    excerpt: int
    categories: int

    def to_dict(self) -> dict[str, int]:
        """Return the four counters as a plain dict keyed by section name."""
        section_names = ("title", "body", "excerpt", "categories")
        return {section: getattr(self, section) for section in section_names}
@@ -3,6 +3,7 @@ from .http_client import BaseHttpClient, SyncHttpClient, AsyncHttpClient
from .open_graph import OpenGraphProvider
from .persistence import BasePersistor, CsvPersistor, JsonPersistor
from .user_agents import UserAgentProvider
from .tokenizer import Tokenizer
HttpClient = SyncHttpClient
@@ -17,4 +18,5 @@ __all__ = [
"BasePersistor",
"CsvPersistor",
"JsonPersistor",
"Tokenizer",
]
@@ -1,15 +1,23 @@
import logging
from abc import ABC, abstractmethod
from dataclasses import asdict, is_dataclass
from datetime import datetime
from typing import Optional, Any, Dict, List, Sequence
from basango.domain.article import Article
from bs4 import BeautifulSoup
from pydantic import HttpUrl
from basango.core.config import CrawlerConfig, ClientConfig
from basango.domain import DateRange, SourceKind, PageRange
from basango.domain.exception import ArticleOutOfRange
from basango.services import HttpClient, DateParser, OpenGraphProvider, BasePersistor
from basango.services import (
HttpClient,
DateParser,
OpenGraphProvider,
BasePersistor,
Tokenizer,
)
class BaseCrawler(ABC):
@@ -35,6 +43,7 @@ class BaseCrawler(ABC):
self.persistors: list[BasePersistor] = list(persistors) if persistors else []
self.date_parser = DateParser()
self.open_graph = OpenGraphProvider()
self.tokenizer = Tokenizer()
@abstractmethod
def fetch(self) -> None:
@@ -61,23 +70,35 @@ class BaseCrawler(ABC):
metadata_value = None
elif is_dataclass(metadata) and not isinstance(metadata, type):
metadata_value = asdict(metadata)
else:
elif isinstance(metadata, dict):
metadata_value = metadata
else:
metadata_value = None
article = {
"title": title,
"link": link,
"body": body,
"categories": categories,
"source": getattr(self.source, "source_id", None),
"timestamp": timestamp,
"metadata": metadata_value,
}
# Get source_id and ensure it's a string
source_id = getattr(self.source, "source_id", None)
if source_id is None:
source_id = "unknown"
self._persist(article)
logging.info(f"> {article['title']} [saved]")
article = Article(
title=title,
link=HttpUrl(link), # Convert str to HttpUrl
body=body,
categories=categories,
source=source_id, # Ensure it's a string, not None
timestamp=datetime.fromtimestamp(
timestamp
), # Convert int timestamp to datetime
metadata=metadata_value,
)
article.token_statistics = self.tokenizer.count_tokens(
article.title, article.body, article.categories
)
return Article(**article)
self._persist(article.to_dict())
logging.info("> %s [saved]", article.title)
return article
@abstractmethod
def fetch_one(
@@ -6,6 +6,7 @@ from urllib.parse import parse_qs, urljoin, urlparse
from basango.domain.article import Article
from bs4 import BeautifulSoup, Tag
from markdownify import markdownify
from basango.core.config import CrawlerConfig, ClientConfig
from basango.core.config.source_config import HtmlSourceConfig
@@ -283,15 +284,15 @@ class HtmlCrawler(BaseCrawler):
matches = node.select(selector)
if matches:
parts = [
item.get_text(" ", strip=True)
markdownify(item.get_text(" ", strip=False), heading_style="ATX")
for item in matches
if item.get_text(strip=True)
]
if parts:
# Join the parts with newlines so each matched node stays on its own
# line; callers can post-process the result if needed.
return "".join(parts)
return node.get_text(" ", strip=True)
return "\n".join(parts)
return markdownify(node.get_text(" ", strip=False), heading_style="ATX")
@staticmethod
def _extract_categories(
@@ -3,6 +3,8 @@ import logging
from datetime import datetime, timezone
from typing import Optional, override, cast, Final, Any, Sequence
from markdownify import markdownify
from basango.domain.article import Article
from bs4 import BeautifulSoup
@@ -104,7 +106,10 @@ class WordpressCrawler(BaseCrawler):
body_html = data.get("content", {}).get("rendered", "")
title = BeautifulSoup(title_html, "html.parser").get_text(" ", strip=True)
body = BeautifulSoup(body_html, "html.parser").get_text(" ", strip=True)
body = markdownify(
BeautifulSoup(body_html, "html.parser").get_text(" ", strip=False),
heading_style="ATX",
)
timestamp = self._compute_timestamp(data.get("date"))
categories_value = self._map_categories(data.get("categories", []))
@@ -0,0 +1,56 @@
"""
Tokenizer utilities for counting and encoding article text.
This module wraps the `tiktoken` encoder to provide simple helpers for:
- encoding/decoding text to token ids
- counting tokens for different parts of an Article
The `Tokenizer` can be constructed with either a specific `model` (preferred)
or an `encoding` name fallback.
"""
import logging
import tiktoken
from typing import Optional
from basango.domain.token_statistics import TokenStatistics
class Tokenizer:
    """Thin wrapper around a tiktoken encoder for token operations.

    Attributes:
        encoding: Name of the encoding in use. When a ``model`` is given this
            is the name of the encoding resolved for that model, not the
            (unused) ``encoding`` argument.
        tokenizer: The underlying tiktoken encoding object.
    """

    # Number of leading body characters used to estimate the excerpt size.
    EXCERPT_CHARS = 200

    def __init__(
        self, encoding: str = "cl100k_base", model: Optional[str] = None
    ) -> None:
        """Build the encoder.

        Args:
            encoding: Encoding name used when no ``model`` is provided.
            model: Optional model name; when truthy it takes precedence over
                ``encoding`` and the encoder is looked up by model.
        """
        # Prefer model-based encoding lookup if a model is provided.
        self.tokenizer = (
            tiktoken.encoding_for_model(model)
            if model
            else tiktoken.get_encoding(encoding)
        )
        # Report the encoding actually in use: with a model, the `encoding`
        # argument is ignored, so storing it verbatim would be misleading.
        self.encoding = self.tokenizer.name if model else encoding

    def encode(self, text: str) -> list[int]:
        """Encode text into a list of token ids.

        Special-token markers that appear in the text (e.g. "<|endoftext|>")
        are encoded as ordinary text. Crawled articles are untrusted input,
        and tiktoken's default is to raise when it meets such a marker.
        """
        return self.tokenizer.encode(text, disallowed_special=())

    def decode(self, tokens: list[int]) -> str:
        """Decode a list of token ids back into a string."""
        return self.tokenizer.decode(tokens)

    def count_tokens(
        self, title: str, body: str, categories: list[str]
    ) -> TokenStatistics:
        """Return token counts for the given article sections.

        Args:
            title: Article title.
            body: Article body text.
            categories: Category labels; they are joined with ", " and
                counted as a single string.

        Returns:
            A ``TokenStatistics`` with one count per section. The excerpt
            count is computed on the first ``EXCERPT_CHARS`` (200) characters
            of the body to give a quick estimate of a short preview's token
            length.
        """
        # Lazy %-style arguments: the message is only formatted when emitted.
        logging.info("[Tokenizer] tokenizing %s...", title)

        return TokenStatistics(
            title=len(self.encode(title)),
            body=len(self.encode(body)),
            excerpt=len(self.encode(body[: self.EXCERPT_CHARS])),
            categories=len(self.encode(", ".join(categories))),
        )
+57
View File
@@ -62,6 +62,7 @@ dependencies = [
{ name = "readability-lxml" },
{ name = "rq" },
{ name = "selectolax" },
{ name = "tiktoken" },
{ name = "trafilatura" },
{ name = "typer" },
{ name = "uv-build" },
@@ -86,6 +87,7 @@ requires-dist = [
{ name = "readability-lxml", specifier = ">=0.8.1" },
{ name = "rq", specifier = ">=2.5.0" },
{ name = "selectolax", specifier = ">=0.3.20" },
{ name = "tiktoken", specifier = ">=0.12.0" },
{ name = "trafilatura", specifier = ">=1.7.0" },
{ name = "typer", specifier = ">=0.16.1" },
{ name = "uv-build", specifier = ">=0.8.12,<0.9.0" },
@@ -632,6 +634,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cf/3e/7d7ac6fd085023312421e0d69dfabdfb28e116e513fadbe9afe710c01893/regex-2025.9.1-cp314-cp314-win_arm64.whl", hash = "sha256:f46d525934871ea772930e997d577d48c6983e50f206ff7b66d4ac5f8941e993", size = 271860, upload-time = "2025-09-01T22:09:42.413Z" },
]
[[package]]
name = "requests"
version = "2.32.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "charset-normalizer" },
{ name = "idna" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" },
]
[[package]]
name = "rich"
version = "14.1.0"
@@ -771,6 +788,46 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f7/45/8c4ebc0c460e6ec38e62ab245ad3c7fc10b210116cea7c16d61602aa9558/stevedore-5.4.1-py3-none-any.whl", hash = "sha256:d10a31c7b86cba16c1f6e8d15416955fc797052351a56af15e608ad20811fcfe", size = 49533, upload-time = "2025-02-20T14:03:55.849Z" },
]
[[package]]
name = "tiktoken"
version = "0.12.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "regex" },
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/00/61/441588ee21e6b5cdf59d6870f86beb9789e532ee9718c251b391b70c68d6/tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3", size = 1050802, upload-time = "2025-10-06T20:22:00.96Z" },
{ url = "https://files.pythonhosted.org/packages/1f/05/dcf94486d5c5c8d34496abe271ac76c5b785507c8eae71b3708f1ad9b45a/tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160", size = 993995, upload-time = "2025-10-06T20:22:02.788Z" },
{ url = "https://files.pythonhosted.org/packages/a0/70/5163fe5359b943f8db9946b62f19be2305de8c3d78a16f629d4165e2f40e/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa", size = 1128948, upload-time = "2025-10-06T20:22:03.814Z" },
{ url = "https://files.pythonhosted.org/packages/0c/da/c028aa0babf77315e1cef357d4d768800c5f8a6de04d0eac0f377cb619fa/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be", size = 1151986, upload-time = "2025-10-06T20:22:05.173Z" },
{ url = "https://files.pythonhosted.org/packages/a0/5a/886b108b766aa53e295f7216b509be95eb7d60b166049ce2c58416b25f2a/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a", size = 1194222, upload-time = "2025-10-06T20:22:06.265Z" },
{ url = "https://files.pythonhosted.org/packages/f4/f8/4db272048397636ac7a078d22773dd2795b1becee7bc4922fe6207288d57/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3", size = 1255097, upload-time = "2025-10-06T20:22:07.403Z" },
{ url = "https://files.pythonhosted.org/packages/8e/32/45d02e2e0ea2be3a9ed22afc47d93741247e75018aac967b713b2941f8ea/tiktoken-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697", size = 879117, upload-time = "2025-10-06T20:22:08.418Z" },
{ url = "https://files.pythonhosted.org/packages/ce/76/994fc868f88e016e6d05b0da5ac24582a14c47893f4474c3e9744283f1d5/tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16", size = 1050309, upload-time = "2025-10-06T20:22:10.939Z" },
{ url = "https://files.pythonhosted.org/packages/f6/b8/57ef1456504c43a849821920d582a738a461b76a047f352f18c0b26c6516/tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a", size = 993712, upload-time = "2025-10-06T20:22:12.115Z" },
{ url = "https://files.pythonhosted.org/packages/72/90/13da56f664286ffbae9dbcfadcc625439142675845baa62715e49b87b68b/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27", size = 1128725, upload-time = "2025-10-06T20:22:13.541Z" },
{ url = "https://files.pythonhosted.org/packages/05/df/4f80030d44682235bdaecd7346c90f67ae87ec8f3df4a3442cb53834f7e4/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb", size = 1151875, upload-time = "2025-10-06T20:22:14.559Z" },
{ url = "https://files.pythonhosted.org/packages/22/1f/ae535223a8c4ef4c0c1192e3f9b82da660be9eb66b9279e95c99288e9dab/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e", size = 1194451, upload-time = "2025-10-06T20:22:15.545Z" },
{ url = "https://files.pythonhosted.org/packages/78/a7/f8ead382fce0243cb625c4f266e66c27f65ae65ee9e77f59ea1653b6d730/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25", size = 1253794, upload-time = "2025-10-06T20:22:16.624Z" },
{ url = "https://files.pythonhosted.org/packages/93/e0/6cc82a562bc6365785a3ff0af27a2a092d57c47d7a81d9e2295d8c36f011/tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f", size = 878777, upload-time = "2025-10-06T20:22:18.036Z" },
{ url = "https://files.pythonhosted.org/packages/72/05/3abc1db5d2c9aadc4d2c76fa5640134e475e58d9fbb82b5c535dc0de9b01/tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646", size = 1050188, upload-time = "2025-10-06T20:22:19.563Z" },
{ url = "https://files.pythonhosted.org/packages/e3/7b/50c2f060412202d6c95f32b20755c7a6273543b125c0985d6fa9465105af/tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88", size = 993978, upload-time = "2025-10-06T20:22:20.702Z" },
{ url = "https://files.pythonhosted.org/packages/14/27/bf795595a2b897e271771cd31cb847d479073497344c637966bdf2853da1/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff", size = 1129271, upload-time = "2025-10-06T20:22:22.06Z" },
{ url = "https://files.pythonhosted.org/packages/f5/de/9341a6d7a8f1b448573bbf3425fa57669ac58258a667eb48a25dfe916d70/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830", size = 1151216, upload-time = "2025-10-06T20:22:23.085Z" },
{ url = "https://files.pythonhosted.org/packages/75/0d/881866647b8d1be4d67cb24e50d0c26f9f807f994aa1510cb9ba2fe5f612/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b", size = 1194860, upload-time = "2025-10-06T20:22:24.602Z" },
{ url = "https://files.pythonhosted.org/packages/b3/1e/b651ec3059474dab649b8d5b69f5c65cd8fcd8918568c1935bd4136c9392/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b", size = 1254567, upload-time = "2025-10-06T20:22:25.671Z" },
{ url = "https://files.pythonhosted.org/packages/80/57/ce64fd16ac390fafde001268c364d559447ba09b509181b2808622420eec/tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3", size = 921067, upload-time = "2025-10-06T20:22:26.753Z" },
{ url = "https://files.pythonhosted.org/packages/ac/a4/72eed53e8976a099539cdd5eb36f241987212c29629d0a52c305173e0a68/tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365", size = 1050473, upload-time = "2025-10-06T20:22:27.775Z" },
{ url = "https://files.pythonhosted.org/packages/e6/d7/0110b8f54c008466b19672c615f2168896b83706a6611ba6e47313dbc6e9/tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e", size = 993855, upload-time = "2025-10-06T20:22:28.799Z" },
{ url = "https://files.pythonhosted.org/packages/5f/77/4f268c41a3957c418b084dd576ea2fad2e95da0d8e1ab705372892c2ca22/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63", size = 1129022, upload-time = "2025-10-06T20:22:29.981Z" },
{ url = "https://files.pythonhosted.org/packages/4e/2b/fc46c90fe5028bd094cd6ee25a7db321cb91d45dc87531e2bdbb26b4867a/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0", size = 1150736, upload-time = "2025-10-06T20:22:30.996Z" },
{ url = "https://files.pythonhosted.org/packages/28/c0/3c7a39ff68022ddfd7d93f3337ad90389a342f761c4d71de99a3ccc57857/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a", size = 1194908, upload-time = "2025-10-06T20:22:32.073Z" },
{ url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706, upload-time = "2025-10-06T20:22:33.385Z" },
{ url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" },
]
[[package]]
name = "tld"
version = "0.13.1"