diff --git a/README.md b/README.md
index a7b0606..ee22eb3 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Basango : Towards a scalable and intelligent system for Congolese News curation
+# Basango: Towards a scalable and intelligent system for Congolese News curation
[](https://github.com/bernard-ng/basango/actions/workflows/backend_audit.yaml)
[](https://github.com/bernard-ng/basango/actions/workflows/backend_deploy.yaml)
diff --git a/projects/backend/CITATION.cff b/projects/backend/CITATION.cff
index 25fa00b..749193f 100644
--- a/projects/backend/CITATION.cff
+++ b/projects/backend/CITATION.cff
@@ -2,7 +2,7 @@
# Visit https://bit.ly/cffinit to generate yours today!
cff-version: 1.2.0
-title: DRC News Corpus
+title: Basango
message: >-
If you use this software, please cite it using the
metadata from this file.
@@ -14,11 +14,11 @@ authors:
email: bernard@devscast.tech
affiliation: Devscast Community
orcid: 'https://orcid.org/0009-0003-9777-6349'
-repository-code: 'https://github.com/bernard-ng/drc-news-corpus'
+repository-code: 'https://github.com/bernard-ng/basango'
repository: >-
- https://www.huggingface.c0/datasets/bernard-ng/drc-news-corpus
+ https://www.huggingface.co/datasets/bernard-ng/basango
abstract: >-
- The "DRC News Corpus" is a curated collection of news
+ "Basango" is a curated collection of news
articles sourced from major media outlets covering a wide
spectrum of topics related to the Democratic Republic of
Congo (DRC). This dataset encompasses a diverse range of
diff --git a/projects/backend/README.md b/projects/backend/README.md
index 6ed4522..1d7590a 100644
--- a/projects/backend/README.md
+++ b/projects/backend/README.md
@@ -1,24 +1,24 @@
# Core and Backend
-
-
-
-
+
+
+
+
| Scope | Link |
|-------------------|------------------------------------------------------------|
-| core and backend | https://github.com/bernard-ng/drc-news-corpus |
+| core and backend | https://github.com/bernard-ng/basango |
| ML models | https://github.com/bernard-ng/drc-news-ml |
| Mobile App | https://github.com/bernard-ng/basango |
-| Dataset (partial) | https://huggingface.co/datasets/bernard-ng/drc-news-corpus |
+| Dataset (partial) | https://huggingface.co/datasets/bernard-ng/basango |
---
-## DRC News Corpus : Towards a scalable and intelligent system for Congolese News curation
+## Basango: Towards a scalable and intelligent system for Congolese News curation
### Introduction
-The **"DRC News Corpus"** is a structured and scalable dataset of news articles sourced from major media outlets covering diverse aspects of the Democratic Republic of Congo (DRC). Designed for efficiency, this system enables the automated collection, processing, and organization of news stories spanning politics, economy, society, culture, environment, and international affairs.
+**"Basango"** is a structured and scalable dataset of news articles sourced from major media outlets covering diverse aspects of the Democratic Republic of Congo (DRC). Designed for efficiency, this system enables the automated collection, processing, and organization of news stories spanning politics, economy, society, culture, environment, and international affairs.
### Scalability and Use Cases:
@@ -45,7 +45,7 @@ If you want to rebuild the dataset follow the steps bellow :
#### Installation
```bash
-git clone https://github.com/bernard-ng/drc-news-corpus.git && cd drc-news-corpus
+git clone https://github.com/bernard-ng/basango.git && cd basango
make build
make start
```
@@ -104,5 +104,5 @@ a CSV file will be generated in the `data` directory.
### Acknowledgment:
-The compilation and curation of the "DRC News Corpus" were conducted by Tshabu Ngandu Bernard with the primary objective of facilitating research and analysis related to the Democratic Republic of Congo.
+The compilation and curation of "Basango" were conducted by Tshabu Ngandu Bernard with the primary objective of facilitating research and analysis related to the Democratic Republic of Congo.
I do not own the content of the articles, and all rights belong to the respective publishers. The dataset is intended for non-commercial research purposes only.
diff --git a/projects/backend/api/bruno.json b/projects/backend/api/bruno.json
index d43c349..c15b8c0 100644
--- a/projects/backend/api/bruno.json
+++ b/projects/backend/api/bruno.json
@@ -1,6 +1,6 @@
{
"version": "1",
- "name": "drc-news-corpus",
+ "name": "basango",
"type": "collection",
"ignore": [
"node_modules",
diff --git a/projects/backend/config/doctrine/Aggregator/Entity.Article.orm.xml b/projects/backend/config/doctrine/Aggregator/Entity.Article.orm.xml
index 4dab811..3608637 100644
--- a/projects/backend/config/doctrine/Aggregator/Entity.Article.orm.xml
+++ b/projects/backend/config/doctrine/Aggregator/Entity.Article.orm.xml
@@ -18,7 +18,7 @@
-
+
@@ -30,6 +30,7 @@
+
+
+
+
+
+
+
+
diff --git a/projects/backend/config/migrations/Version20251024234318.php b/projects/backend/config/migrations/Version20251024234318.php
new file mode 100644
index 0000000..a4aedec
--- /dev/null
+++ b/projects/backend/config/migrations/Version20251024234318.php
@@ -0,0 +1,31 @@
+
+ */
+final class Version20251024234318 extends AbstractMigration
+{
+    /**
+     * Short human-readable summary of what this migration changes.
+     */
+    public function getDescription(): string
+    {
+        return 'add token statistics to article';
+    }
+
+    /**
+     * Adds the `token_statistics` JSONB column (defaulting to NULL) to the `article` table.
+     */
+    public function up(Schema $schema): void
+    {
+        $this->addSql('ALTER TABLE article ADD token_statistics JSONB DEFAULT NULL');
+    }
+
+    /**
+     * Reverts up() by dropping the `token_statistics` column from the `article` table.
+     */
+    public function down(Schema $schema): void
+    {
+        $this->addSql('ALTER TABLE article DROP token_statistics');
+    }
+}
diff --git a/projects/backend/config/packages/doctrine.yaml b/projects/backend/config/packages/doctrine.yaml
index 2d257c7..0f47178 100644
--- a/projects/backend/config/packages/doctrine.yaml
+++ b/projects/backend/config/packages/doctrine.yaml
@@ -70,6 +70,7 @@ doctrine:
article_id: Basango\Aggregator\Infrastructure\Persistence\Doctrine\DBAL\Types\ArticleIdType
source_id: Basango\Aggregator\Infrastructure\Persistence\Doctrine\DBAL\Types\SourceIdType
open_graph: Basango\Aggregator\Infrastructure\Persistence\Doctrine\DBAL\Types\OpenGraphType
+ token_statistics: Basango\Aggregator\Infrastructure\Persistence\Doctrine\DBAL\Types\TokenStatisticsType
# Identity and Access
user_id: Basango\IdentityAndAccess\Infrastructure\Persistence\Doctrine\DBAL\Types\UserIdType
@@ -125,6 +126,7 @@ doctrine:
orm:
auto_generate_proxy_classes: true
enable_lazy_ghost_objects: true
+ enable_native_lazy_objects: true
entity_managers:
default:
validate_xml_mapping: false
diff --git a/projects/backend/src/Aggregator/Application/UseCase/Command/CreateArticle.php b/projects/backend/src/Aggregator/Application/UseCase/Command/CreateArticle.php
index 965bc5a..53a146d 100644
--- a/projects/backend/src/Aggregator/Application/UseCase/Command/CreateArticle.php
+++ b/projects/backend/src/Aggregator/Application/UseCase/Command/CreateArticle.php
@@ -6,6 +6,7 @@ namespace Basango\Aggregator\Application\UseCase\Command;
use Basango\Aggregator\Domain\Model\ValueObject\Link;
use Basango\Aggregator\Domain\Model\ValueObject\OpenGraph;
+use Basango\Aggregator\Domain\Model\ValueObject\TokenStatistics;
/**
* Class Save.
@@ -17,11 +18,12 @@ final readonly class CreateArticle
public function __construct(
public string $title,
public Link $link,
- public string $categories,
+ public array $categories,
public string $body,
public string $source,
public int $timestamp,
- public ?OpenGraph $metadata = null
+ public ?OpenGraph $metadata = null,
+ public ?TokenStatistics $tokenStatistics = null
) {
}
}
diff --git a/projects/backend/src/Aggregator/Application/UseCase/CommandHandler/CreateArticleHandler.php b/projects/backend/src/Aggregator/Application/UseCase/CommandHandler/CreateArticleHandler.php
index 144b5fa..967e2c1 100644
--- a/projects/backend/src/Aggregator/Application/UseCase/CommandHandler/CreateArticleHandler.php
+++ b/projects/backend/src/Aggregator/Application/UseCase/CommandHandler/CreateArticleHandler.php
@@ -43,12 +43,13 @@ final readonly class CreateArticleHandler implements CommandHandler
link: $command->link,
body: $command->body,
hash: $hash,
- categories: mb_strtolower($command->categories),
+ categories: $command->categories,
source: $source,
publishedAt: $publishedAt
);
$article
->defineOpenGraph($command->metadata)
+ ->defineTokenStatistics($command->tokenStatistics)
->computeReadingTime();
$this->articleRepository->add($article);
diff --git a/projects/backend/src/Aggregator/Domain/Model/Entity/Article.php b/projects/backend/src/Aggregator/Domain/Model/Entity/Article.php
index 94cc038..9eee4b7 100644
--- a/projects/backend/src/Aggregator/Domain/Model/Entity/Article.php
+++ b/projects/backend/src/Aggregator/Domain/Model/Entity/Article.php
@@ -10,6 +10,7 @@ use Basango\Aggregator\Domain\Model\ValueObject\OpenGraph;
use Basango\Aggregator\Domain\Model\ValueObject\ReadingTime;
use Basango\Aggregator\Domain\Model\ValueObject\Scoring\Credibility;
use Basango\Aggregator\Domain\Model\ValueObject\Scoring\Sentiment;
+use Basango\Aggregator\Domain\Model\ValueObject\TokenStatistics;
/**
* Class Article.
@@ -25,13 +26,14 @@ class Article
public readonly Link $link,
public readonly string $body,
public readonly string $hash,
- private(set) string $categories,
+ private(set) array $categories,
public readonly Source $source,
public readonly \DateTimeImmutable $publishedAt,
public readonly \DateTimeImmutable $crawledAt = new \DateTimeImmutable(),
private(set) Credibility $credibility = new Credibility(),
private(set) Sentiment $sentiment = Sentiment::NEUTRAL,
private(set) ?OpenGraph $metadata = null,
+ private(set) ?TokenStatistics $tokenStatistics = null,
private(set) ?ReadingTime $readingTime = null,
private(set) ?\DateTimeImmutable $updatedAt = null,
public readonly ?string $image = null,
@@ -56,7 +58,7 @@ class Article
return $this;
}
- public function assignCategories(string $categories): self
+ public function assignCategories(array $categories): self
{
$this->categories = $categories;
$this->updatedAt = new \DateTimeImmutable();
@@ -83,4 +85,11 @@ class Article
return $this;
}
+
+ public function defineTokenStatistics(?TokenStatistics $statistics): self
+ {
+ $this->tokenStatistics = $statistics;
+
+ return $this;
+ }
}
diff --git a/projects/backend/src/Aggregator/Domain/Model/ValueObject/TokenStatistics.php b/projects/backend/src/Aggregator/Domain/Model/ValueObject/TokenStatistics.php
new file mode 100644
index 0000000..f384fd0
--- /dev/null
+++ b/projects/backend/src/Aggregator/Domain/Model/ValueObject/TokenStatistics.php
@@ -0,0 +1,62 @@
+
+ */
+final class TokenStatistics implements \JsonSerializable
+{
+    /**
+     * Sum of the four section counts. A section whose count is null
+     * contributes 0, so this value is never null.
+     */
+    public int $total {
+        get {
+            return ($this->title ?? 0)
+                + ($this->body ?? 0)
+                + ($this->excerpt ?? 0)
+                + ($this->categories ?? 0);
+        }
+    }
+
+    /**
+     * @param int|null $title      token count for the title, or null when not provided
+     * @param int|null $body       token count for the body, or null when not provided
+     * @param int|null $excerpt    token count for the excerpt, or null when not provided
+     * @param int|null $categories token count for the categories, or null when not provided
+     */
+    public function __construct(
+        public readonly ?int $title = null,
+        public readonly ?int $body = null,
+        public readonly ?int $excerpt = null,
+        public readonly ?int $categories = null,
+    ) {
+    }
+
+    /**
+     * Builds an instance from a JSON string, or returns null when that is not possible.
+     *
+     * Null is returned for null input, for malformed JSON, for JSON that does not
+     * decode to an array, and for any error raised while constructing the object
+     * (for example a count that cannot be used as an int). Keys missing from the
+     * decoded array become null counts; a "total" key, if present, is ignored
+     * because the total is always recomputed.
+     */
+    public static function tryFrom(?string $value): ?self
+    {
+        if ($value === null) {
+            return null;
+        }
+
+        try {
+            $object = \json_decode($value, true, 512, JSON_THROW_ON_ERROR);
+
+            // Valid JSON scalars and the JSON literal null (e.g. `42`, `"text"`,
+            // `null`) carry no statistics; without this guard they would produce
+            // an all-null instance instead of null.
+            if (! \is_array($object)) {
+                return null;
+            }
+
+            return new self(
+                $object['title'] ?? null,
+                $object['body'] ?? null,
+                $object['excerpt'] ?? null,
+                $object['categories'] ?? null,
+            );
+        } catch (\Throwable) {
+            // Deliberate best-effort: a "try" factory reports failure as null.
+            return null;
+        }
+    }
+
+    /**
+     * Serializes the four section counts together with the computed total.
+     *
+     * @return array{title: int|null, body: int|null, excerpt: int|null, categories: int|null, total: int}
+     */
+    #[\Override]
+    public function jsonSerialize(): array
+    {
+        return [
+            'title' => $this->title,
+            'body' => $this->body,
+            'excerpt' => $this->excerpt,
+            'categories' => $this->categories,
+            'total' => $this->total,
+        ];
+    }
+}
diff --git a/projects/backend/src/Aggregator/Infrastructure/Persistence/Doctrine/DBAL/Types/TokenStatisticsType.php b/projects/backend/src/Aggregator/Infrastructure/Persistence/Doctrine/DBAL/Types/TokenStatisticsType.php
new file mode 100644
index 0000000..ae98942
--- /dev/null
+++ b/projects/backend/src/Aggregator/Infrastructure/Persistence/Doctrine/DBAL/Types/TokenStatisticsType.php
@@ -0,0 +1,67 @@
+
+ */
+final class TokenStatisticsType extends Type
+{
+    /**
+     * Asks the platform for its JSON column declaration, with the
+     * `nullable` and `jsonb` options set. The $column definition is not consulted.
+     */
+    public function getSQLDeclaration(array $column, AbstractPlatform $platform): string
+    {
+        $options = [
+            'nullable' => true,
+            'jsonb' => true,
+        ];
+
+        return $platform->getJsonTypeDeclarationSQL($options);
+    }
+
+    /**
+     * Name under which this type is referenced in mappings.
+     */
+    public function getName(): string
+    {
+        return 'token_statistics';
+    }
+
+    /**
+     * Turns the stored string into a TokenStatistics value object.
+     *
+     * Null stays null; any non-string value is rejected with a ConversionException.
+     */
+    #[\Override]
+    public function convertToPHPValue(mixed $value, AbstractPlatform $platform): ?TokenStatistics
+    {
+        if ($value === null) {
+            return null;
+        }
+
+        if (\is_string($value)) {
+            try {
+                return TokenStatistics::tryFrom($value);
+            } catch (\Throwable $previous) {
+                throw ConversionException::conversionFailed($value, $this->getName(), $previous);
+            }
+        }
+
+        throw ConversionException::conversionFailedInvalidType($value, $this->getName(), ['null', 'string', TokenStatistics::class]);
+    }
+
+    /**
+     * Turns a TokenStatistics value object into its JSON string for storage.
+     *
+     * Null and the empty string are stored as null. Anything else that is not a
+     * TokenStatistics instance is rejected with a ConversionException.
+     */
+    #[\Override]
+    public function convertToDatabaseValue($value, AbstractPlatform $platform): ?string
+    {
+        if ($value === null || $value === '') {
+            return null;
+        }
+
+        if ($value instanceof TokenStatistics) {
+            $encoded = json_encode($value);
+
+            return $encoded ?: null;
+        }
+
+        // Non-empty strings and every other type are both invalid here; they
+        // only differ in which ConversionException factory reports them.
+        throw \is_string($value)
+            ? ConversionException::conversionFailed($value, $this->getName())
+            : ConversionException::conversionFailedInvalidType($value, $this->getName(), ['null', 'string', TokenStatistics::class]);
+    }
+}
diff --git a/projects/backend/src/Aggregator/Presentation/Web/Controller/AddArticleController.php b/projects/backend/src/Aggregator/Presentation/Web/Controller/AddArticleController.php
index c572338..aacf9ee 100644
--- a/projects/backend/src/Aggregator/Presentation/Web/Controller/AddArticleController.php
+++ b/projects/backend/src/Aggregator/Presentation/Web/Controller/AddArticleController.php
@@ -47,11 +47,12 @@ final class AddArticleController extends AbstractController
$this->handleCommand(new CreateArticle(
$model->title,
Link::from($model->link),
- implode(', ', $model->categories),
+ $model->categories,
$model->body,
$model->source,
$model->timestamp,
$model->metadata,
+ $model->tokenStatistics
));
return new JsonResponse(status: Response::HTTP_CREATED);
diff --git a/projects/backend/src/Aggregator/Presentation/WriteModel/AddArticleModel.php b/projects/backend/src/Aggregator/Presentation/WriteModel/AddArticleModel.php
index a15a29e..90ab6f1 100644
--- a/projects/backend/src/Aggregator/Presentation/WriteModel/AddArticleModel.php
+++ b/projects/backend/src/Aggregator/Presentation/WriteModel/AddArticleModel.php
@@ -5,6 +5,7 @@ declare(strict_types=1);
namespace Basango\Aggregator\Presentation\WriteModel;
use Basango\Aggregator\Domain\Model\ValueObject\OpenGraph;
+use Basango\Aggregator\Domain\Model\ValueObject\TokenStatistics;
use Symfony\Component\Validator\Constraints as Assert;
/**
@@ -32,4 +33,6 @@ final class AddArticleModel
public array $categories = [];
public ?OpenGraph $metadata = null;
+
+ public ?TokenStatistics $tokenStatistics = null;
}
diff --git a/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/GetArticleOverviewListDbalHandler.php b/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/GetArticleOverviewListDbalHandler.php
index 1105190..921d210 100644
--- a/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/GetArticleOverviewListDbalHandler.php
+++ b/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/GetArticleOverviewListDbalHandler.php
@@ -42,7 +42,7 @@ final readonly class GetArticleOverviewListDbalHandler implements GetArticleOver
$qb->from('article', 'a')
->innerJoin('a', 'source', 's', 'a.source_id = s.id')
- //->orderBy('a.published_at', $query->filters->sortDirection->value)
+ ->orderBy('a.published_at', $query->filters->sortDirection->value)
->setParameter('userId', $query->userId->toString())
;
diff --git a/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/Queries/ArticleQuery.php b/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/Queries/ArticleQuery.php
index 94d424f..e37076d 100644
--- a/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/Queries/ArticleQuery.php
+++ b/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/Queries/ArticleQuery.php
@@ -62,15 +62,17 @@ trait ArticleQuery
private function applyArticleFilters(QueryBuilder $qb, ArticleFilters $filters): QueryBuilder
{
if ($filters->category !== null) {
- // PostgreSQL array containment for single value
$qb->andWhere(':category = ANY(a.categories)')
->setParameter('category', $filters->category);
}
if ($filters->search !== null) {
- // Case-insensitive search in PostgreSQL
- $qb->andWhere('a.title ILIKE :search')
- ->setParameter('search', sprintf('%%%s%%', $filters->search));
+ $qb
+ ->addSelect("ts_rank(a.tsv, to_tsquery('french', :search)) AS rank")
+ ->andWhere("a.tsv @@ to_tsquery('french', :search)")
+ ->setParameter('search', $filters->search)
+ ->resetOrderBy()
+ ->orderBy('rank', $filters->sortDirection->value);
}
if ($filters->dateRange instanceof DateRange) {
diff --git a/projects/backend/src/IdentityAndAccess/Domain/Model/Entity/RefreshToken.php b/projects/backend/src/IdentityAndAccess/Domain/Model/Entity/RefreshToken.php
index b9d9660..373351c 100644
--- a/projects/backend/src/IdentityAndAccess/Domain/Model/Entity/RefreshToken.php
+++ b/projects/backend/src/IdentityAndAccess/Domain/Model/Entity/RefreshToken.php
@@ -4,8 +4,8 @@ declare(strict_types=1);
namespace Basango\IdentityAndAccess\Domain\Model\Entity;
-use Gesdinet\JWTRefreshTokenBundle\Entity\RefreshToken as BaseRefreshToken;
+use Gesdinet\JWTRefreshTokenBundle\Model\AbstractRefreshToken;
-class RefreshToken extends BaseRefreshToken
+class RefreshToken extends AbstractRefreshToken
{
}
diff --git a/projects/backend/src/SharedKernel/Domain/Application.php b/projects/backend/src/SharedKernel/Domain/Application.php
index 215caaf..9d01770 100644
--- a/projects/backend/src/SharedKernel/Domain/Application.php
+++ b/projects/backend/src/SharedKernel/Domain/Application.php
@@ -11,15 +11,15 @@ namespace Basango\SharedKernel\Domain;
*/
final class Application
{
- public string $name = 'DRC News Corpus';
+ public string $name = 'Basango';
- public string $website = 'https://research.devscast.org/drc-news-corpus';
+ public string $website = 'https://basango.ngandu.dev';
public string $emailAddress = 'contact@devscast.tech';
public string $infoAddress = 'contact@devscast.tech';
- public string $emailName = 'DRC News Corpus';
+ public string $emailName = 'Basango';
public string $legalName = 'Devscast Software SàSu';
diff --git a/projects/backend/src/SharedKernel/Infrastructure/Persistence/Doctrine/DBAL/Features/PaginationQuery.php b/projects/backend/src/SharedKernel/Infrastructure/Persistence/Doctrine/DBAL/Features/PaginationQuery.php
index d1e60a5..eeceac7 100644
--- a/projects/backend/src/SharedKernel/Infrastructure/Persistence/Doctrine/DBAL/Features/PaginationQuery.php
+++ b/projects/backend/src/SharedKernel/Infrastructure/Persistence/Doctrine/DBAL/Features/PaginationQuery.php
@@ -48,14 +48,13 @@ trait PaginationQuery
PaginatorKeyset $keyset,
SortDirection $direction = SortDirection::DESC
): QueryBuilder {
- $orderDirection = strtoupper($direction->value);
$comparisonOperator = $direction === SortDirection::ASC ? '>' : '<';
if ($keyset->date !== null) {
- $qb->addOrderBy($keyset->date, $orderDirection);
+ $qb->addOrderBy($keyset->date, $direction->value);
}
- $qb->addOrderBy($keyset->id, $orderDirection);
+ $qb->addOrderBy($keyset->id, $direction->value);
$cursor = PaginationCursor::decode($page->cursor);
if (! $cursor instanceof PaginationCursor) {
diff --git a/projects/backend/src/SharedKernel/Presentation/Web/Controller/DefaultController.php b/projects/backend/src/SharedKernel/Presentation/Web/Controller/DefaultController.php
index af99a74..03760b7 100644
--- a/projects/backend/src/SharedKernel/Presentation/Web/Controller/DefaultController.php
+++ b/projects/backend/src/SharedKernel/Presentation/Web/Controller/DefaultController.php
@@ -22,9 +22,9 @@ final class DefaultController extends AbstractController
public function __invoke(): JsonResponse
{
return $this->json([
- 'repository' => 'https://github.com/bernard-ng/drc-news-corpus',
- 'title' => 'DRC News Corpus : Towards a scalable and intelligent system for Congolese News curation',
- 'description' => 'The DRC News Corpus is a structured and scalable dataset of news articles sourced from major media outlets covering diverse aspects of the Democratic Republic of Congo (DRC). Designed for efficiency, this system enables the automated collection, processing, and organization of news stories spanning politics, economy, society, culture, environment, and international affairs.',
+ 'repository' => 'https://github.com/bernard-ng/basango',
+ 'title' => 'Basango : Towards a scalable and intelligent system for Congolese News curation',
+ 'description' => 'The Basango is a structured and scalable dataset of news articles sourced from major media outlets covering diverse aspects of the Democratic Republic of Congo (DRC). Designed for efficiency, this system enables the automated collection, processing, and organization of news stories spanning politics, economy, society, culture, environment, and international affairs.',
'status' => 200,
]);
}
diff --git a/projects/crawler/README.md b/projects/crawler/README.md
index 9af3938..66da142 100644
--- a/projects/crawler/README.md
+++ b/projects/crawler/README.md
@@ -12,7 +12,7 @@
- Install the project in your virtualenv so the `basango` CLI is available:
- With uv: `uv run --with . basango --help`
- - Or install locally: `pip install -e .` then `basango --help`
+ - Or install locally: `uv sync` then `basango --help`
#### Sync crawl (in-process)
diff --git a/projects/crawler/config/pipeline.yaml b/projects/crawler/config/pipeline.yaml
index a48b52b..c1ed33b 100644
--- a/projects/crawler/config/pipeline.yaml
+++ b/projects/crawler/config/pipeline.yaml
@@ -37,7 +37,7 @@ sources:
replacement: "$3-$2-$1 $4"
source_selectors:
articles: ".view-content > .views-row.content-row"
- article_title: ".views-field-title a"
+ article_title: "h1.page-header"
article_link: ".views-field-title a"
article_body: ".field-name-body"
article_date: ".views-field-created"
@@ -45,7 +45,7 @@ sources:
pagination: "ul.pagination > li.pager-last > a"
pagination_template: "actualite"
supports_categories: false
- requires_details: false
+ requires_details: true
requires_rate_limit: false
- source_id: 7sur7.cd
diff --git a/projects/crawler/pyproject.toml b/projects/crawler/pyproject.toml
index 594e064..e7c587e 100644
--- a/projects/crawler/pyproject.toml
+++ b/projects/crawler/pyproject.toml
@@ -17,6 +17,7 @@ dependencies = [
"markdownify>=0.13.1",
"readability-lxml>=0.8.1",
"beautifulsoup4>=4.13.5",
+ "tiktoken>=0.12.0",
]
[dependency-groups]
diff --git a/projects/crawler/src/basango/domain/article.py b/projects/crawler/src/basango/domain/article.py
index 7ad2f26..ea5f214 100644
--- a/projects/crawler/src/basango/domain/article.py
+++ b/projects/crawler/src/basango/domain/article.py
@@ -2,6 +2,7 @@ from datetime import datetime
from typing import Any, Optional
from pydantic import BaseModel, HttpUrl
+from .token_statistics import TokenStatistics
class Article(BaseModel):
@@ -12,6 +13,7 @@ class Article(BaseModel):
source: str
timestamp: datetime
metadata: Optional[dict[str, Any]] = None
+ token_statistics: Optional["TokenStatistics"] = None
def to_dict(self) -> dict[str, Any]:
return {
@@ -22,4 +24,7 @@ class Article(BaseModel):
"source": self.source,
"timestamp": int(self.timestamp.timestamp()),
"metadata": self.metadata,
+ "tokenStatistics": self.token_statistics.to_dict()
+ if self.token_statistics
+ else "",
}
diff --git a/projects/crawler/src/basango/domain/token_statistics.py b/projects/crawler/src/basango/domain/token_statistics.py
new file mode 100644
index 0000000..5d4abf6
--- /dev/null
+++ b/projects/crawler/src/basango/domain/token_statistics.py
@@ -0,0 +1,19 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class TokenStatistics:
+    """Per-section token counts computed for a single article."""
+
+    title: int
+    body: int
+    excerpt: int
+    categories: int
+
+    def to_dict(self) -> dict[str, int]:
+        """Return the counts as a plain dict keyed by section name.
+
+        Keys appear in field order: title, body, excerpt, categories.
+        """
+        sections = ("title", "body", "excerpt", "categories")
+        return {section: getattr(self, section) for section in sections}
diff --git a/projects/crawler/src/basango/services/__init__.py b/projects/crawler/src/basango/services/__init__.py
index bc33387..5ceebff 100644
--- a/projects/crawler/src/basango/services/__init__.py
+++ b/projects/crawler/src/basango/services/__init__.py
@@ -3,6 +3,7 @@ from .http_client import BaseHttpClient, SyncHttpClient, AsyncHttpClient
from .open_graph import OpenGraphProvider
from .persistence import BasePersistor, CsvPersistor, JsonPersistor
from .user_agents import UserAgentProvider
+from .tokenizer import Tokenizer
HttpClient = SyncHttpClient
@@ -17,4 +18,5 @@ __all__ = [
"BasePersistor",
"CsvPersistor",
"JsonPersistor",
+ "Tokenizer",
]
diff --git a/projects/crawler/src/basango/services/crawler/base_crawler.py b/projects/crawler/src/basango/services/crawler/base_crawler.py
index 9241722..0584610 100644
--- a/projects/crawler/src/basango/services/crawler/base_crawler.py
+++ b/projects/crawler/src/basango/services/crawler/base_crawler.py
@@ -1,15 +1,23 @@
import logging
from abc import ABC, abstractmethod
from dataclasses import asdict, is_dataclass
+from datetime import datetime
from typing import Optional, Any, Dict, List, Sequence
from basango.domain.article import Article
from bs4 import BeautifulSoup
+from pydantic import HttpUrl
from basango.core.config import CrawlerConfig, ClientConfig
from basango.domain import DateRange, SourceKind, PageRange
from basango.domain.exception import ArticleOutOfRange
-from basango.services import HttpClient, DateParser, OpenGraphProvider, BasePersistor
+from basango.services import (
+ HttpClient,
+ DateParser,
+ OpenGraphProvider,
+ BasePersistor,
+ Tokenizer,
+)
class BaseCrawler(ABC):
@@ -35,6 +43,7 @@ class BaseCrawler(ABC):
self.persistors: list[BasePersistor] = list(persistors) if persistors else []
self.date_parser = DateParser()
self.open_graph = OpenGraphProvider()
+ self.tokenizer = Tokenizer()
@abstractmethod
def fetch(self) -> None:
@@ -61,23 +70,35 @@ class BaseCrawler(ABC):
metadata_value = None
elif is_dataclass(metadata) and not isinstance(metadata, type):
metadata_value = asdict(metadata)
- else:
+ elif isinstance(metadata, dict):
metadata_value = metadata
+ else:
+ metadata_value = None
- article = {
- "title": title,
- "link": link,
- "body": body,
- "categories": categories,
- "source": getattr(self.source, "source_id", None),
- "timestamp": timestamp,
- "metadata": metadata_value,
- }
+ # Get source_id and ensure it's a string
+ source_id = getattr(self.source, "source_id", None)
+ if source_id is None:
+ source_id = "unknown"
- self._persist(article)
- logging.info(f"> {article['title']} [saved]")
+ article = Article(
+ title=title,
+ link=HttpUrl(link), # Convert str to HttpUrl
+ body=body,
+ categories=categories,
+ source=source_id, # Ensure it's a string, not None
+ timestamp=datetime.fromtimestamp(
+ timestamp
+ ), # Convert int timestamp to datetime
+ metadata=metadata_value,
+ )
+ article.token_statistics = self.tokenizer.count_tokens(
+ article.title, article.body, article.categories
+ )
- return Article(**article)
+ self._persist(article.to_dict())
+ logging.info("> %s [saved]", article.title)
+
+ return article
@abstractmethod
def fetch_one(
diff --git a/projects/crawler/src/basango/services/crawler/html_crawler.py b/projects/crawler/src/basango/services/crawler/html_crawler.py
index 1011798..afe75c9 100644
--- a/projects/crawler/src/basango/services/crawler/html_crawler.py
+++ b/projects/crawler/src/basango/services/crawler/html_crawler.py
@@ -6,6 +6,7 @@ from urllib.parse import parse_qs, urljoin, urlparse
from basango.domain.article import Article
from bs4 import BeautifulSoup, Tag
+from markdownify import markdownify
from basango.core.config import CrawlerConfig, ClientConfig
from basango.core.config.source_config import HtmlSourceConfig
@@ -283,15 +284,15 @@ class HtmlCrawler(BaseCrawler):
matches = node.select(selector)
if matches:
parts = [
- item.get_text(" ", strip=True)
+ markdownify(item.get_text(" ", strip=False), heading_style="ATX")
for item in matches
if item.get_text(strip=True)
]
if parts:
# Join without separators: callers can post-process if
# needed, but this preserves maximum fidelity.
- return "".join(parts)
- return node.get_text(" ", strip=True)
+ return "\n".join(parts)
+ return markdownify(node.get_text(" ", strip=False), heading_style="ATX")
@staticmethod
def _extract_categories(
diff --git a/projects/crawler/src/basango/services/crawler/wordpress_crawler.py b/projects/crawler/src/basango/services/crawler/wordpress_crawler.py
index 821c5f6..2bd17bc 100644
--- a/projects/crawler/src/basango/services/crawler/wordpress_crawler.py
+++ b/projects/crawler/src/basango/services/crawler/wordpress_crawler.py
@@ -3,6 +3,8 @@ import logging
from datetime import datetime, timezone
from typing import Optional, override, cast, Final, Any, Sequence
+from markdownify import markdownify
+
from basango.domain.article import Article
from bs4 import BeautifulSoup
@@ -104,7 +106,10 @@ class WordpressCrawler(BaseCrawler):
body_html = data.get("content", {}).get("rendered", "")
title = BeautifulSoup(title_html, "html.parser").get_text(" ", strip=True)
- body = BeautifulSoup(body_html, "html.parser").get_text(" ", strip=True)
+ body = markdownify(
+ BeautifulSoup(body_html, "html.parser").get_text(" ", strip=False),
+ heading_style="ATX",
+ )
timestamp = self._compute_timestamp(data.get("date"))
categories_value = self._map_categories(data.get("categories", []))
diff --git a/projects/crawler/src/basango/services/tokenizer.py b/projects/crawler/src/basango/services/tokenizer.py
new file mode 100644
index 0000000..c9db73d
--- /dev/null
+++ b/projects/crawler/src/basango/services/tokenizer.py
@@ -0,0 +1,56 @@
+"""
+Tokenizer utilities for counting and encoding article text.
+
+This module wraps the `tiktoken` encoder to provide simple helpers for:
+- encoding/decoding text to token ids
+- counting tokens for different parts of an Article
+
+The `Tokenizer` can be constructed with either a specific `model` (preferred)
+or an `encoding` name fallback.
+"""
+
+import logging
+
+import tiktoken
+from typing import Optional
+
+from basango.domain.token_statistics import TokenStatistics
+
+
+class Tokenizer:
+    """Thin wrapper around a tiktoken encoder for token operations."""
+
+    def __init__(
+        self, encoding: str = "cl100k_base", model: Optional[str] = None
+    ) -> None:
+        # Requested encoding name; it is not used for lookup when `model` is set.
+        self.encoding = encoding
+        self.tokenizer = (
+            tiktoken.encoding_for_model(model)
+            if model
+            else tiktoken.get_encoding(encoding)
+        )
+
+    def encode(self, text: str) -> list[int]:
+        """Encode text to token ids; special tokens are treated as plain text."""
+        # tiktoken's default raises ValueError on literals like "<|endoftext|>".
+        return self.tokenizer.encode(text, disallowed_special=())
+
+    def decode(self, tokens: list[int]) -> str:
+        """Decode a list of token ids back into a string."""
+        return self.tokenizer.decode(tokens)
+
+    def count_tokens(
+        self, title: str, body: str, categories: list[str]
+    ) -> TokenStatistics:
+        """Return token counts for an article's title, body and categories.
+
+        The excerpt count uses the first 200 characters of the body.
+        """
+        logging.info("[Tokenizer] tokenizing %s...", title)
+        return TokenStatistics(
+            title=len(self.encode(title)),
+            body=len(self.encode(body)),
+            excerpt=len(self.encode(body[:200])),
+            categories=len(self.encode(", ".join(categories))),
+        )
diff --git a/projects/crawler/uv.lock b/projects/crawler/uv.lock
index dae5f8b..da03417 100644
--- a/projects/crawler/uv.lock
+++ b/projects/crawler/uv.lock
@@ -62,6 +62,7 @@ dependencies = [
{ name = "readability-lxml" },
{ name = "rq" },
{ name = "selectolax" },
+ { name = "tiktoken" },
{ name = "trafilatura" },
{ name = "typer" },
{ name = "uv-build" },
@@ -86,6 +87,7 @@ requires-dist = [
{ name = "readability-lxml", specifier = ">=0.8.1" },
{ name = "rq", specifier = ">=2.5.0" },
{ name = "selectolax", specifier = ">=0.3.20" },
+ { name = "tiktoken", specifier = ">=0.12.0" },
{ name = "trafilatura", specifier = ">=1.7.0" },
{ name = "typer", specifier = ">=0.16.1" },
{ name = "uv-build", specifier = ">=0.8.12,<0.9.0" },
@@ -632,6 +634,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cf/3e/7d7ac6fd085023312421e0d69dfabdfb28e116e513fadbe9afe710c01893/regex-2025.9.1-cp314-cp314-win_arm64.whl", hash = "sha256:f46d525934871ea772930e997d577d48c6983e50f206ff7b66d4ac5f8941e993", size = 271860, upload-time = "2025-09-01T22:09:42.413Z" },
]
+[[package]]
+name = "requests"
+version = "2.32.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "certifi" },
+ { name = "charset-normalizer" },
+ { name = "idna" },
+ { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" },
+]
+
[[package]]
name = "rich"
version = "14.1.0"
@@ -771,6 +788,46 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f7/45/8c4ebc0c460e6ec38e62ab245ad3c7fc10b210116cea7c16d61602aa9558/stevedore-5.4.1-py3-none-any.whl", hash = "sha256:d10a31c7b86cba16c1f6e8d15416955fc797052351a56af15e608ad20811fcfe", size = 49533, upload-time = "2025-02-20T14:03:55.849Z" },
]
+[[package]]
+name = "tiktoken"
+version = "0.12.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "regex" },
+ { name = "requests" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/00/61/441588ee21e6b5cdf59d6870f86beb9789e532ee9718c251b391b70c68d6/tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3", size = 1050802, upload-time = "2025-10-06T20:22:00.96Z" },
+ { url = "https://files.pythonhosted.org/packages/1f/05/dcf94486d5c5c8d34496abe271ac76c5b785507c8eae71b3708f1ad9b45a/tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160", size = 993995, upload-time = "2025-10-06T20:22:02.788Z" },
+ { url = "https://files.pythonhosted.org/packages/a0/70/5163fe5359b943f8db9946b62f19be2305de8c3d78a16f629d4165e2f40e/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa", size = 1128948, upload-time = "2025-10-06T20:22:03.814Z" },
+ { url = "https://files.pythonhosted.org/packages/0c/da/c028aa0babf77315e1cef357d4d768800c5f8a6de04d0eac0f377cb619fa/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be", size = 1151986, upload-time = "2025-10-06T20:22:05.173Z" },
+ { url = "https://files.pythonhosted.org/packages/a0/5a/886b108b766aa53e295f7216b509be95eb7d60b166049ce2c58416b25f2a/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a", size = 1194222, upload-time = "2025-10-06T20:22:06.265Z" },
+ { url = "https://files.pythonhosted.org/packages/f4/f8/4db272048397636ac7a078d22773dd2795b1becee7bc4922fe6207288d57/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3", size = 1255097, upload-time = "2025-10-06T20:22:07.403Z" },
+ { url = "https://files.pythonhosted.org/packages/8e/32/45d02e2e0ea2be3a9ed22afc47d93741247e75018aac967b713b2941f8ea/tiktoken-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697", size = 879117, upload-time = "2025-10-06T20:22:08.418Z" },
+ { url = "https://files.pythonhosted.org/packages/ce/76/994fc868f88e016e6d05b0da5ac24582a14c47893f4474c3e9744283f1d5/tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16", size = 1050309, upload-time = "2025-10-06T20:22:10.939Z" },
+ { url = "https://files.pythonhosted.org/packages/f6/b8/57ef1456504c43a849821920d582a738a461b76a047f352f18c0b26c6516/tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a", size = 993712, upload-time = "2025-10-06T20:22:12.115Z" },
+ { url = "https://files.pythonhosted.org/packages/72/90/13da56f664286ffbae9dbcfadcc625439142675845baa62715e49b87b68b/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27", size = 1128725, upload-time = "2025-10-06T20:22:13.541Z" },
+ { url = "https://files.pythonhosted.org/packages/05/df/4f80030d44682235bdaecd7346c90f67ae87ec8f3df4a3442cb53834f7e4/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb", size = 1151875, upload-time = "2025-10-06T20:22:14.559Z" },
+ { url = "https://files.pythonhosted.org/packages/22/1f/ae535223a8c4ef4c0c1192e3f9b82da660be9eb66b9279e95c99288e9dab/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e", size = 1194451, upload-time = "2025-10-06T20:22:15.545Z" },
+ { url = "https://files.pythonhosted.org/packages/78/a7/f8ead382fce0243cb625c4f266e66c27f65ae65ee9e77f59ea1653b6d730/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25", size = 1253794, upload-time = "2025-10-06T20:22:16.624Z" },
+ { url = "https://files.pythonhosted.org/packages/93/e0/6cc82a562bc6365785a3ff0af27a2a092d57c47d7a81d9e2295d8c36f011/tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f", size = 878777, upload-time = "2025-10-06T20:22:18.036Z" },
+ { url = "https://files.pythonhosted.org/packages/72/05/3abc1db5d2c9aadc4d2c76fa5640134e475e58d9fbb82b5c535dc0de9b01/tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646", size = 1050188, upload-time = "2025-10-06T20:22:19.563Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/7b/50c2f060412202d6c95f32b20755c7a6273543b125c0985d6fa9465105af/tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88", size = 993978, upload-time = "2025-10-06T20:22:20.702Z" },
+ { url = "https://files.pythonhosted.org/packages/14/27/bf795595a2b897e271771cd31cb847d479073497344c637966bdf2853da1/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff", size = 1129271, upload-time = "2025-10-06T20:22:22.06Z" },
+ { url = "https://files.pythonhosted.org/packages/f5/de/9341a6d7a8f1b448573bbf3425fa57669ac58258a667eb48a25dfe916d70/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830", size = 1151216, upload-time = "2025-10-06T20:22:23.085Z" },
+ { url = "https://files.pythonhosted.org/packages/75/0d/881866647b8d1be4d67cb24e50d0c26f9f807f994aa1510cb9ba2fe5f612/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b", size = 1194860, upload-time = "2025-10-06T20:22:24.602Z" },
+ { url = "https://files.pythonhosted.org/packages/b3/1e/b651ec3059474dab649b8d5b69f5c65cd8fcd8918568c1935bd4136c9392/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b", size = 1254567, upload-time = "2025-10-06T20:22:25.671Z" },
+ { url = "https://files.pythonhosted.org/packages/80/57/ce64fd16ac390fafde001268c364d559447ba09b509181b2808622420eec/tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3", size = 921067, upload-time = "2025-10-06T20:22:26.753Z" },
+ { url = "https://files.pythonhosted.org/packages/ac/a4/72eed53e8976a099539cdd5eb36f241987212c29629d0a52c305173e0a68/tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365", size = 1050473, upload-time = "2025-10-06T20:22:27.775Z" },
+ { url = "https://files.pythonhosted.org/packages/e6/d7/0110b8f54c008466b19672c615f2168896b83706a6611ba6e47313dbc6e9/tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e", size = 993855, upload-time = "2025-10-06T20:22:28.799Z" },
+ { url = "https://files.pythonhosted.org/packages/5f/77/4f268c41a3957c418b084dd576ea2fad2e95da0d8e1ab705372892c2ca22/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63", size = 1129022, upload-time = "2025-10-06T20:22:29.981Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/2b/fc46c90fe5028bd094cd6ee25a7db321cb91d45dc87531e2bdbb26b4867a/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0", size = 1150736, upload-time = "2025-10-06T20:22:30.996Z" },
+ { url = "https://files.pythonhosted.org/packages/28/c0/3c7a39ff68022ddfd7d93f3337ad90389a342f761c4d71de99a3ccc57857/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a", size = 1194908, upload-time = "2025-10-06T20:22:32.073Z" },
+ { url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706, upload-time = "2025-10-06T20:22:33.385Z" },
+ { url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" },
+]
+
[[package]]
name = "tld"
version = "0.13.1"