From 799cda6e0677b83ee09e71975cf3095615584f05 Mon Sep 17 00:00:00 2001 From: bernard-ng Date: Sat, 25 Oct 2025 03:23:15 +0200 Subject: [PATCH] [backend, crawler] feat: support token statistics --- README.md | 2 +- projects/backend/CITATION.cff | 8 +-- projects/backend/README.md | 20 +++--- projects/backend/api/bruno.json | 2 +- .../Aggregator/Entity.Article.orm.xml | 3 +- .../Entity.RefreshToken.orm.xml | 7 ++ .../migrations/Version20251024234318.php | 31 +++++++++ .../backend/config/packages/doctrine.yaml | 2 + .../UseCase/Command/CreateArticle.php | 6 +- .../CommandHandler/CreateArticleHandler.php | 3 +- .../Domain/Model/Entity/Article.php | 13 +++- .../Model/ValueObject/TokenStatistics.php | 62 +++++++++++++++++ .../DBAL/Types/TokenStatisticsType.php | 67 +++++++++++++++++++ .../Web/Controller/AddArticleController.php | 3 +- .../WriteModel/AddArticleModel.php | 3 + .../GetArticleOverviewListDbalHandler.php | 2 +- .../Doctrine/DBAL/Queries/ArticleQuery.php | 10 +-- .../Domain/Model/Entity/RefreshToken.php | 4 +- .../src/SharedKernel/Domain/Application.php | 6 +- .../DBAL/Features/PaginationQuery.php | 5 +- .../Web/Controller/DefaultController.php | 6 +- projects/crawler/README.md | 2 +- projects/crawler/config/pipeline.yaml | 4 +- projects/crawler/pyproject.toml | 1 + .../crawler/src/basango/domain/article.py | 5 ++ .../src/basango/domain/token_statistics.py | 19 ++++++ .../crawler/src/basango/services/__init__.py | 2 + .../basango/services/crawler/base_crawler.py | 49 ++++++++++---- .../basango/services/crawler/html_crawler.py | 7 +- .../services/crawler/wordpress_crawler.py | 7 +- .../crawler/src/basango/services/tokenizer.py | 56 ++++++++++++++++ projects/crawler/uv.lock | 57 ++++++++++++++++ 32 files changed, 414 insertions(+), 60 deletions(-) create mode 100644 projects/backend/config/migrations/Version20251024234318.php create mode 100644 projects/backend/src/Aggregator/Domain/Model/ValueObject/TokenStatistics.php create mode 100644 
projects/backend/src/Aggregator/Infrastructure/Persistence/Doctrine/DBAL/Types/TokenStatisticsType.php create mode 100644 projects/crawler/src/basango/domain/token_statistics.py create mode 100644 projects/crawler/src/basango/services/tokenizer.py diff --git a/README.md b/README.md index a7b0606..ee22eb3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Basango : Towards a scalable and intelligent system for Congolese News curation +# Basango: Towards a scalable and intelligent system for Congolese News curation [![backend audit](https://github.com/bernard-ng/basango/actions/workflows/backend_audit.yaml/badge.svg)](https://github.com/bernard-ng/basango/actions/workflows/backend_audit.yaml) [![backend deploy](https://github.com/bernard-ng/basango/actions/workflows/backend_deploy.yaml/badge.svg)](https://github.com/bernard-ng/basango/actions/workflows/backend_deploy.yaml) diff --git a/projects/backend/CITATION.cff b/projects/backend/CITATION.cff index 25fa00b..749193f 100644 --- a/projects/backend/CITATION.cff +++ b/projects/backend/CITATION.cff @@ -2,7 +2,7 @@ # Visit https://bit.ly/cffinit to generate yours today! cff-version: 1.2.0 -title: DRC News Corpus +title: Basango message: >- If you use this software, please cite it using the metadata from this file. @@ -14,11 +14,11 @@ authors: email: bernard@devscast.tech affiliation: Devscast Community orcid: 'https://orcid.org/0009-0003-9777-6349' -repository-code: 'https://github.com/bernard-ng/drc-news-corpus' +repository-code: 'https://github.com/bernard-ng/basango' repository: >- - https://www.huggingface.c0/datasets/bernard-ng/drc-news-corpus + https://www.huggingface.c0/datasets/bernard-ng/basango abstract: >- - The "DRC News Corpus" is a curated collection of news + The "Basango" is a curated collection of news articles sourced from major media outlets covering a wide spectrum of topics related to the Democratic Republic of Congo (DRC). 
This dataset encompasses a diverse range of diff --git a/projects/backend/README.md b/projects/backend/README.md index 6ed4522..1d7590a 100644 --- a/projects/backend/README.md +++ b/projects/backend/README.md @@ -1,24 +1,24 @@ # Core and Backend -![Deployed](https://github.com/bernard-ng/drc-news-corpus/actions/workflows/deploy.yaml/badge.svg) -![Coding Standard](https://github.com/bernard-ng/drc-news-corpus/actions/workflows/quality.yaml/badge.svg) -![Tests](https://github.com/bernard-ng/drc-news-corpus/actions/workflows/tests.yaml/badge.svg) -![Security](https://github.com/bernard-ng/drc-news-corpus/actions/workflows/audit.yaml/badge.svg) +![Deployed](https://github.com/bernard-ng/basango/actions/workflows/deploy.yaml/badge.svg) +![Coding Standard](https://github.com/bernard-ng/basango/actions/workflows/quality.yaml/badge.svg) +![Tests](https://github.com/bernard-ng/basango/actions/workflows/tests.yaml/badge.svg) +![Security](https://github.com/bernard-ng/basango/actions/workflows/audit.yaml/badge.svg) | Scope | Link | |-------------------|------------------------------------------------------------| -| core and backend | https://github.com/bernard-ng/drc-news-corpus | +| core and backend | https://github.com/bernard-ng/basango | | ML models | https://github.com/bernard-ng/drc-news-ml | | Mobile App | https://github.com/bernard-ng/basango | -| Dataset (partial) | https://huggingface.co/datasets/bernard-ng/drc-news-corpus | +| Dataset (partial) | https://huggingface.co/datasets/bernard-ng/basango | --- -## DRC News Corpus : Towards a scalable and intelligent system for Congolese News curation +## Basango : Towards a scalable and intelligent system for Congolese News curation ### Introduction -The **"DRC News Corpus"** is a structured and scalable dataset of news articles sourced from major media outlets covering diverse aspects of the Democratic Republic of Congo (DRC). 
Designed for efficiency, this system enables the automated collection, processing, and organization of news stories spanning politics, economy, society, culture, environment, and international affairs. +The **"Basango"** is a structured and scalable dataset of news articles sourced from major media outlets covering diverse aspects of the Democratic Republic of Congo (DRC). Designed for efficiency, this system enables the automated collection, processing, and organization of news stories spanning politics, economy, society, culture, environment, and international affairs. ### Scalability and Use Cases: @@ -45,7 +45,7 @@ If you want to rebuild the dataset follow the steps bellow : #### Installation ```bash -git clone https://github.com/bernard-ng/drc-news-corpus.git && cd drc-news-corpus +git clone https://github.com/bernard-ng/basango.git && cd basango make build make start ``` @@ -104,5 +104,5 @@ a CSV file will be generated in the `data` directory. ### Acknowledgment: -The compilation and curation of the "DRC News Corpus" were conducted by Tshabu Ngandu Bernard with the primary objective of facilitating research and analysis related to the Democratic Republic of Congo. +The compilation and curation of the "Basango" were conducted by Tshabu Ngandu Bernard with the primary objective of facilitating research and analysis related to the Democratic Republic of Congo. I do not own the content of the articles, and all rights belong to the respective publishers. The dataset is intended for non-commercial research purposes only. 
diff --git a/projects/backend/api/bruno.json b/projects/backend/api/bruno.json index d43c349..c15b8c0 100644 --- a/projects/backend/api/bruno.json +++ b/projects/backend/api/bruno.json @@ -1,6 +1,6 @@ { "version": "1", - "name": "drc-news-corpus", + "name": "basango", "type": "collection", "ignore": [ "node_modules", diff --git a/projects/backend/config/doctrine/Aggregator/Entity.Article.orm.xml b/projects/backend/config/doctrine/Aggregator/Entity.Article.orm.xml index 4dab811..3608637 100644 --- a/projects/backend/config/doctrine/Aggregator/Entity.Article.orm.xml +++ b/projects/backend/config/doctrine/Aggregator/Entity.Article.orm.xml @@ -18,7 +18,7 @@ - + @@ -30,6 +30,7 @@ + + + + + + + + diff --git a/projects/backend/config/migrations/Version20251024234318.php b/projects/backend/config/migrations/Version20251024234318.php new file mode 100644 index 0000000..a4aedec --- /dev/null +++ b/projects/backend/config/migrations/Version20251024234318.php @@ -0,0 +1,31 @@ + + */ +final class Version20251024234318 extends AbstractMigration +{ + public function getDescription(): string + { + return 'add token statistics to article'; + } + + public function up(Schema $schema): void + { + $this->addSql('ALTER TABLE article ADD token_statistics JSONB DEFAULT NULL'); + } + + public function down(Schema $schema): void + { + $this->addSql('ALTER TABLE article DROP token_statistics'); + } +} diff --git a/projects/backend/config/packages/doctrine.yaml b/projects/backend/config/packages/doctrine.yaml index 2d257c7..0f47178 100644 --- a/projects/backend/config/packages/doctrine.yaml +++ b/projects/backend/config/packages/doctrine.yaml @@ -70,6 +70,7 @@ doctrine: article_id: Basango\Aggregator\Infrastructure\Persistence\Doctrine\DBAL\Types\ArticleIdType source_id: Basango\Aggregator\Infrastructure\Persistence\Doctrine\DBAL\Types\SourceIdType open_graph: Basango\Aggregator\Infrastructure\Persistence\Doctrine\DBAL\Types\OpenGraphType + token_statistics: 
Basango\Aggregator\Infrastructure\Persistence\Doctrine\DBAL\Types\TokenStatisticsType # Identity and Access user_id: Basango\IdentityAndAccess\Infrastructure\Persistence\Doctrine\DBAL\Types\UserIdType @@ -125,6 +126,7 @@ doctrine: orm: auto_generate_proxy_classes: true enable_lazy_ghost_objects: true + enable_native_lazy_objects: true entity_managers: default: validate_xml_mapping: false diff --git a/projects/backend/src/Aggregator/Application/UseCase/Command/CreateArticle.php b/projects/backend/src/Aggregator/Application/UseCase/Command/CreateArticle.php index 965bc5a..53a146d 100644 --- a/projects/backend/src/Aggregator/Application/UseCase/Command/CreateArticle.php +++ b/projects/backend/src/Aggregator/Application/UseCase/Command/CreateArticle.php @@ -6,6 +6,7 @@ namespace Basango\Aggregator\Application\UseCase\Command; use Basango\Aggregator\Domain\Model\ValueObject\Link; use Basango\Aggregator\Domain\Model\ValueObject\OpenGraph; +use Basango\Aggregator\Domain\Model\ValueObject\TokenStatistics; /** * Class Save. 
@@ -17,11 +18,12 @@ final readonly class CreateArticle public function __construct( public string $title, public Link $link, - public string $categories, + public array $categories, public string $body, public string $source, public int $timestamp, - public ?OpenGraph $metadata = null + public ?OpenGraph $metadata = null, + public ?TokenStatistics $tokenStatistics = null ) { } } diff --git a/projects/backend/src/Aggregator/Application/UseCase/CommandHandler/CreateArticleHandler.php b/projects/backend/src/Aggregator/Application/UseCase/CommandHandler/CreateArticleHandler.php index 144b5fa..967e2c1 100644 --- a/projects/backend/src/Aggregator/Application/UseCase/CommandHandler/CreateArticleHandler.php +++ b/projects/backend/src/Aggregator/Application/UseCase/CommandHandler/CreateArticleHandler.php @@ -43,12 +43,13 @@ final readonly class CreateArticleHandler implements CommandHandler link: $command->link, body: $command->body, hash: $hash, - categories: mb_strtolower($command->categories), + categories: $command->categories, source: $source, publishedAt: $publishedAt ); $article ->defineOpenGraph($command->metadata) + ->defineTokenStatistics($command->tokenStatistics) ->computeReadingTime(); $this->articleRepository->add($article); diff --git a/projects/backend/src/Aggregator/Domain/Model/Entity/Article.php b/projects/backend/src/Aggregator/Domain/Model/Entity/Article.php index 94cc038..9eee4b7 100644 --- a/projects/backend/src/Aggregator/Domain/Model/Entity/Article.php +++ b/projects/backend/src/Aggregator/Domain/Model/Entity/Article.php @@ -10,6 +10,7 @@ use Basango\Aggregator\Domain\Model\ValueObject\OpenGraph; use Basango\Aggregator\Domain\Model\ValueObject\ReadingTime; use Basango\Aggregator\Domain\Model\ValueObject\Scoring\Credibility; use Basango\Aggregator\Domain\Model\ValueObject\Scoring\Sentiment; +use Basango\Aggregator\Domain\Model\ValueObject\TokenStatistics; /** * Class Article. 
@@ -25,13 +26,14 @@ class Article public readonly Link $link, public readonly string $body, public readonly string $hash, - private(set) string $categories, + private(set) array $categories, public readonly Source $source, public readonly \DateTimeImmutable $publishedAt, public readonly \DateTimeImmutable $crawledAt = new \DateTimeImmutable(), private(set) Credibility $credibility = new Credibility(), private(set) Sentiment $sentiment = Sentiment::NEUTRAL, private(set) ?OpenGraph $metadata = null, + private(set) ?TokenStatistics $tokenStatistics = null, private(set) ?ReadingTime $readingTime = null, private(set) ?\DateTimeImmutable $updatedAt = null, public readonly ?string $image = null, @@ -56,7 +58,7 @@ class Article return $this; } - public function assignCategories(string $categories): self + public function assignCategories(array $categories): self { $this->categories = $categories; $this->updatedAt = new \DateTimeImmutable(); @@ -83,4 +85,11 @@ class Article return $this; } + + public function defineTokenStatistics(?TokenStatistics $statistics): self + { + $this->tokenStatistics = $statistics; + + return $this; + } } diff --git a/projects/backend/src/Aggregator/Domain/Model/ValueObject/TokenStatistics.php b/projects/backend/src/Aggregator/Domain/Model/ValueObject/TokenStatistics.php new file mode 100644 index 0000000..f384fd0 --- /dev/null +++ b/projects/backend/src/Aggregator/Domain/Model/ValueObject/TokenStatistics.php @@ -0,0 +1,62 @@ + + */ +final class TokenStatistics implements \JsonSerializable +{ + public ?int $total { + get { + return ($this->title ?? 0) + + ($this->body ?? 0) + + ($this->excerpt ?? 0) + + ($this->categories ?? 
0); + } + } + + public function __construct( + public readonly ?int $title = null, + public readonly ?int $body = null, + public readonly ?int $excerpt = null, + public readonly ?int $categories = null, + ) { + } + + public static function tryFrom(?string $value): ?self + { + if ($value === null) { + return null; + } + + try { + $object = \json_decode($value, true, 512, JSON_THROW_ON_ERROR); + + return new self( + $object['title'] ?? null, + $object['body'] ?? null, + $object['excerpt'] ?? null, + $object['categories'] ?? null, + ); + } catch (\Throwable) { + return null; + } + } + + #[\Override] + public function jsonSerialize(): array + { + return [ + 'title' => $this->title, + 'body' => $this->body, + 'excerpt' => $this->excerpt, + 'categories' => $this->categories, + 'total' => $this->total, + ]; + } +} diff --git a/projects/backend/src/Aggregator/Infrastructure/Persistence/Doctrine/DBAL/Types/TokenStatisticsType.php b/projects/backend/src/Aggregator/Infrastructure/Persistence/Doctrine/DBAL/Types/TokenStatisticsType.php new file mode 100644 index 0000000..ae98942 --- /dev/null +++ b/projects/backend/src/Aggregator/Infrastructure/Persistence/Doctrine/DBAL/Types/TokenStatisticsType.php @@ -0,0 +1,67 @@ + + */ +final class TokenStatisticsType extends Type +{ + public function getSQLDeclaration(array $column, AbstractPlatform $platform): string + { + return $platform->getJsonTypeDeclarationSQL([ + 'nullable' => true, + 'jsonb' => true, + ]); + } + + public function getName(): string + { + return 'token_statistics'; + } + + #[\Override] + public function convertToPHPValue(mixed $value, AbstractPlatform $platform): ?TokenStatistics + { + if ($value === null) { + return null; + } + + if (! 
\is_string($value)) { + throw ConversionException::conversionFailedInvalidType($value, $this->getName(), ['null', 'string', TokenStatistics::class]); + } + + try { + return TokenStatistics::tryFrom($value); + } catch (\Throwable $e) { + throw ConversionException::conversionFailed($value, $this->getName(), $e); + } + } + + #[\Override] + public function convertToDatabaseValue($value, AbstractPlatform $platform): ?string + { + if ($value instanceof TokenStatistics) { + return json_encode($value) ?: null; + } + + if ($value === null || $value === '') { + return null; + } + + if (! \is_string($value)) { + throw ConversionException::conversionFailedInvalidType($value, $this->getName(), ['null', 'string', TokenStatistics::class]); + } + + throw ConversionException::conversionFailed($value, $this->getName()); + } +} diff --git a/projects/backend/src/Aggregator/Presentation/Web/Controller/AddArticleController.php b/projects/backend/src/Aggregator/Presentation/Web/Controller/AddArticleController.php index c572338..aacf9ee 100644 --- a/projects/backend/src/Aggregator/Presentation/Web/Controller/AddArticleController.php +++ b/projects/backend/src/Aggregator/Presentation/Web/Controller/AddArticleController.php @@ -47,11 +47,12 @@ final class AddArticleController extends AbstractController $this->handleCommand(new CreateArticle( $model->title, Link::from($model->link), - implode(', ', $model->categories), + $model->categories, $model->body, $model->source, $model->timestamp, $model->metadata, + $model->tokenStatistics )); return new JsonResponse(status: Response::HTTP_CREATED); diff --git a/projects/backend/src/Aggregator/Presentation/WriteModel/AddArticleModel.php b/projects/backend/src/Aggregator/Presentation/WriteModel/AddArticleModel.php index a15a29e..90ab6f1 100644 --- a/projects/backend/src/Aggregator/Presentation/WriteModel/AddArticleModel.php +++ b/projects/backend/src/Aggregator/Presentation/WriteModel/AddArticleModel.php @@ -5,6 +5,7 @@ declare(strict_types=1); 
namespace Basango\Aggregator\Presentation\WriteModel; use Basango\Aggregator\Domain\Model\ValueObject\OpenGraph; +use Basango\Aggregator\Domain\Model\ValueObject\TokenStatistics; use Symfony\Component\Validator\Constraints as Assert; /** @@ -32,4 +33,6 @@ final class AddArticleModel public array $categories = []; public ?OpenGraph $metadata = null; + + public ?TokenStatistics $tokenStatistics = null; } diff --git a/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/GetArticleOverviewListDbalHandler.php b/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/GetArticleOverviewListDbalHandler.php index 1105190..921d210 100644 --- a/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/GetArticleOverviewListDbalHandler.php +++ b/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/GetArticleOverviewListDbalHandler.php @@ -42,7 +42,7 @@ final readonly class GetArticleOverviewListDbalHandler implements GetArticleOver $qb->from('article', 'a') ->innerJoin('a', 'source', 's', 'a.source_id = s.id') - //->orderBy('a.published_at', $query->filters->sortDirection->value) + ->orderBy('a.published_at', $query->filters->sortDirection->value) ->setParameter('userId', $query->userId->toString()) ; diff --git a/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/Queries/ArticleQuery.php b/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/Queries/ArticleQuery.php index 94d424f..e37076d 100644 --- a/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/Queries/ArticleQuery.php +++ b/projects/backend/src/FeedManagement/Infrastructure/Persistence/Doctrine/DBAL/Queries/ArticleQuery.php @@ -62,15 +62,17 @@ trait ArticleQuery private function applyArticleFilters(QueryBuilder $qb, ArticleFilters $filters): QueryBuilder { if ($filters->category !== null) { - // PostgreSQL array containment for single value 
$qb->andWhere(':category = ANY(a.categories)') ->setParameter('category', $filters->category); } if ($filters->search !== null) { - // Case-insensitive search in PostgreSQL - $qb->andWhere('a.title ILIKE :search') - ->setParameter('search', sprintf('%%%s%%', $filters->search)); + $qb + ->addSelect("ts_rank(a.tsv, to_tsquery('french', :search)) AS rank") + ->andWhere("a.tsv @@ to_tsquery('french', :search)") + ->setParameter('search', $filters->search) + ->resetOrderBy() + ->orderBy('rank', $filters->sortDirection->value); } if ($filters->dateRange instanceof DateRange) { diff --git a/projects/backend/src/IdentityAndAccess/Domain/Model/Entity/RefreshToken.php b/projects/backend/src/IdentityAndAccess/Domain/Model/Entity/RefreshToken.php index b9d9660..373351c 100644 --- a/projects/backend/src/IdentityAndAccess/Domain/Model/Entity/RefreshToken.php +++ b/projects/backend/src/IdentityAndAccess/Domain/Model/Entity/RefreshToken.php @@ -4,8 +4,8 @@ declare(strict_types=1); namespace Basango\IdentityAndAccess\Domain\Model\Entity; -use Gesdinet\JWTRefreshTokenBundle\Entity\RefreshToken as BaseRefreshToken; +use Gesdinet\JWTRefreshTokenBundle\Model\AbstractRefreshToken; -class RefreshToken extends BaseRefreshToken +class RefreshToken extends AbstractRefreshToken { } diff --git a/projects/backend/src/SharedKernel/Domain/Application.php b/projects/backend/src/SharedKernel/Domain/Application.php index 215caaf..9d01770 100644 --- a/projects/backend/src/SharedKernel/Domain/Application.php +++ b/projects/backend/src/SharedKernel/Domain/Application.php @@ -11,15 +11,15 @@ namespace Basango\SharedKernel\Domain; */ final class Application { - public string $name = 'DRC News Corpus'; + public string $name = 'Basango'; - public string $website = 'https://research.devscast.org/drc-news-corpus'; + public string $website = 'https://basango.ngandu.dev'; public string $emailAddress = 'contact@devscast.tech'; public string $infoAddress = 'contact@devscast.tech'; - public string $emailName = 
'DRC News Corpus'; + public string $emailName = 'Basango'; public string $legalName = 'Devscast Software SàSu'; diff --git a/projects/backend/src/SharedKernel/Infrastructure/Persistence/Doctrine/DBAL/Features/PaginationQuery.php b/projects/backend/src/SharedKernel/Infrastructure/Persistence/Doctrine/DBAL/Features/PaginationQuery.php index d1e60a5..eeceac7 100644 --- a/projects/backend/src/SharedKernel/Infrastructure/Persistence/Doctrine/DBAL/Features/PaginationQuery.php +++ b/projects/backend/src/SharedKernel/Infrastructure/Persistence/Doctrine/DBAL/Features/PaginationQuery.php @@ -48,14 +48,13 @@ trait PaginationQuery PaginatorKeyset $keyset, SortDirection $direction = SortDirection::DESC ): QueryBuilder { - $orderDirection = strtoupper($direction->value); $comparisonOperator = $direction === SortDirection::ASC ? '>' : '<'; if ($keyset->date !== null) { - $qb->addOrderBy($keyset->date, $orderDirection); + $qb->addOrderBy($keyset->date, $direction->value); } - $qb->addOrderBy($keyset->id, $orderDirection); + $qb->addOrderBy($keyset->id, $direction->value); $cursor = PaginationCursor::decode($page->cursor); if (! 
$cursor instanceof PaginationCursor) { diff --git a/projects/backend/src/SharedKernel/Presentation/Web/Controller/DefaultController.php b/projects/backend/src/SharedKernel/Presentation/Web/Controller/DefaultController.php index af99a74..03760b7 100644 --- a/projects/backend/src/SharedKernel/Presentation/Web/Controller/DefaultController.php +++ b/projects/backend/src/SharedKernel/Presentation/Web/Controller/DefaultController.php @@ -22,9 +22,9 @@ final class DefaultController extends AbstractController public function __invoke(): JsonResponse { return $this->json([ - 'repository' => 'https://github.com/bernard-ng/drc-news-corpus', - 'title' => 'DRC News Corpus : Towards a scalable and intelligent system for Congolese News curation', - 'description' => 'The DRC News Corpus is a structured and scalable dataset of news articles sourced from major media outlets covering diverse aspects of the Democratic Republic of Congo (DRC). Designed for efficiency, this system enables the automated collection, processing, and organization of news stories spanning politics, economy, society, culture, environment, and international affairs.', + 'repository' => 'https://github.com/bernard-ng/basango', + 'title' => 'Basango : Towards a scalable and intelligent system for Congolese News curation', + 'description' => 'The Basango is a structured and scalable dataset of news articles sourced from major media outlets covering diverse aspects of the Democratic Republic of Congo (DRC). Designed for efficiency, this system enables the automated collection, processing, and organization of news stories spanning politics, economy, society, culture, environment, and international affairs.', 'status' => 200, ]); } diff --git a/projects/crawler/README.md b/projects/crawler/README.md index 9af3938..66da142 100644 --- a/projects/crawler/README.md +++ b/projects/crawler/README.md @@ -12,7 +12,7 @@ - Install the project in your virtualenv so the `basango` CLI is available: - With uv: `uv run --with . 
basango --help` - - Or install locally: `pip install -e .` then `basango --help` + - Or install locally: `uv sync` then `basango --help` #### Sync crawl (in-process) diff --git a/projects/crawler/config/pipeline.yaml b/projects/crawler/config/pipeline.yaml index a48b52b..c1ed33b 100644 --- a/projects/crawler/config/pipeline.yaml +++ b/projects/crawler/config/pipeline.yaml @@ -37,7 +37,7 @@ sources: replacement: "$3-$2-$1 $4" source_selectors: articles: ".view-content > .views-row.content-row" - article_title: ".views-field-title a" + article_title: "h1.page-header" article_link: ".views-field-title a" article_body: ".field-name-body" article_date: ".views-field-created" @@ -45,7 +45,7 @@ sources: pagination: "ul.pagination > li.pager-last > a" pagination_template: "actualite" supports_categories: false - requires_details: false + requires_details: true requires_rate_limit: false - source_id: 7sur7.cd diff --git a/projects/crawler/pyproject.toml b/projects/crawler/pyproject.toml index 594e064..e7c587e 100644 --- a/projects/crawler/pyproject.toml +++ b/projects/crawler/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "markdownify>=0.13.1", "readability-lxml>=0.8.1", "beautifulsoup4>=4.13.5", + "tiktoken>=0.12.0", ] [dependency-groups] diff --git a/projects/crawler/src/basango/domain/article.py b/projects/crawler/src/basango/domain/article.py index 7ad2f26..ea5f214 100644 --- a/projects/crawler/src/basango/domain/article.py +++ b/projects/crawler/src/basango/domain/article.py @@ -2,6 +2,7 @@ from datetime import datetime from typing import Any, Optional from pydantic import BaseModel, HttpUrl +from .token_statistics import TokenStatistics class Article(BaseModel): @@ -12,6 +13,7 @@ class Article(BaseModel): source: str timestamp: datetime metadata: Optional[dict[str, Any]] = None + token_statistics: Optional["TokenStatistics"] = None def to_dict(self) -> dict[str, Any]: return { @@ -22,4 +24,7 @@ class Article(BaseModel): "source": self.source, "timestamp": 
int(self.timestamp.timestamp()), "metadata": self.metadata, + "tokenStatistics": self.token_statistics.to_dict() + if self.token_statistics + else "", } diff --git a/projects/crawler/src/basango/domain/token_statistics.py b/projects/crawler/src/basango/domain/token_statistics.py new file mode 100644 index 0000000..5d4abf6 --- /dev/null +++ b/projects/crawler/src/basango/domain/token_statistics.py @@ -0,0 +1,19 @@ +from dataclasses import dataclass + + +@dataclass +class TokenStatistics: + """Counts of tokens for different article sections.""" + + title: int + body: int + excerpt: int + categories: int + + def to_dict(self) -> dict[str, int]: + return { + "title": self.title, + "body": self.body, + "excerpt": self.excerpt, + "categories": self.categories, + } diff --git a/projects/crawler/src/basango/services/__init__.py b/projects/crawler/src/basango/services/__init__.py index bc33387..5ceebff 100644 --- a/projects/crawler/src/basango/services/__init__.py +++ b/projects/crawler/src/basango/services/__init__.py @@ -3,6 +3,7 @@ from .http_client import BaseHttpClient, SyncHttpClient, AsyncHttpClient from .open_graph import OpenGraphProvider from .persistence import BasePersistor, CsvPersistor, JsonPersistor from .user_agents import UserAgentProvider +from .tokenizer import Tokenizer HttpClient = SyncHttpClient @@ -17,4 +18,5 @@ __all__ = [ "BasePersistor", "CsvPersistor", "JsonPersistor", + "Tokenizer", ] diff --git a/projects/crawler/src/basango/services/crawler/base_crawler.py b/projects/crawler/src/basango/services/crawler/base_crawler.py index 9241722..0584610 100644 --- a/projects/crawler/src/basango/services/crawler/base_crawler.py +++ b/projects/crawler/src/basango/services/crawler/base_crawler.py @@ -1,15 +1,23 @@ import logging from abc import ABC, abstractmethod from dataclasses import asdict, is_dataclass +from datetime import datetime from typing import Optional, Any, Dict, List, Sequence from basango.domain.article import Article from bs4 import 
BeautifulSoup +from pydantic import HttpUrl from basango.core.config import CrawlerConfig, ClientConfig from basango.domain import DateRange, SourceKind, PageRange from basango.domain.exception import ArticleOutOfRange -from basango.services import HttpClient, DateParser, OpenGraphProvider, BasePersistor +from basango.services import ( + HttpClient, + DateParser, + OpenGraphProvider, + BasePersistor, + Tokenizer, +) class BaseCrawler(ABC): @@ -35,6 +43,7 @@ class BaseCrawler(ABC): self.persistors: list[BasePersistor] = list(persistors) if persistors else [] self.date_parser = DateParser() self.open_graph = OpenGraphProvider() + self.tokenizer = Tokenizer() @abstractmethod def fetch(self) -> None: @@ -61,23 +70,35 @@ class BaseCrawler(ABC): metadata_value = None elif is_dataclass(metadata) and not isinstance(metadata, type): metadata_value = asdict(metadata) - else: + elif isinstance(metadata, dict): metadata_value = metadata + else: + metadata_value = None - article = { - "title": title, - "link": link, - "body": body, - "categories": categories, - "source": getattr(self.source, "source_id", None), - "timestamp": timestamp, - "metadata": metadata_value, - } + # Get source_id and ensure it's a string + source_id = getattr(self.source, "source_id", None) + if source_id is None: + source_id = "unknown" - self._persist(article) - logging.info(f"> {article['title']} [saved]") + article = Article( + title=title, + link=HttpUrl(link), # Convert str to HttpUrl + body=body, + categories=categories, + source=source_id, # Ensure it's a string, not None + timestamp=datetime.fromtimestamp( + timestamp + ), # Convert int timestamp to datetime + metadata=metadata_value, + ) + article.token_statistics = self.tokenizer.count_tokens( + article.title, article.body, article.categories + ) - return Article(**article) + self._persist(article.to_dict()) + logging.info("> %s [saved]", article.title) + + return article @abstractmethod def fetch_one( diff --git 
a/projects/crawler/src/basango/services/crawler/html_crawler.py b/projects/crawler/src/basango/services/crawler/html_crawler.py index 1011798..afe75c9 100644 --- a/projects/crawler/src/basango/services/crawler/html_crawler.py +++ b/projects/crawler/src/basango/services/crawler/html_crawler.py @@ -6,6 +6,7 @@ from urllib.parse import parse_qs, urljoin, urlparse from basango.domain.article import Article from bs4 import BeautifulSoup, Tag +from markdownify import markdownify from basango.core.config import CrawlerConfig, ClientConfig from basango.core.config.source_config import HtmlSourceConfig @@ -283,15 +284,15 @@ class HtmlCrawler(BaseCrawler): matches = node.select(selector) if matches: parts = [ - item.get_text(" ", strip=True) + markdownify(item.get_text(" ", strip=False), heading_style="ATX") for item in matches if item.get_text(strip=True) ] if parts: # Join without separators: callers can post-process if # needed, but this preserves maximum fidelity. - return "".join(parts) - return node.get_text(" ", strip=True) + return "\n".join(parts) + return markdownify(node.get_text(" ", strip=False), heading_style="ATX") @staticmethod def _extract_categories( diff --git a/projects/crawler/src/basango/services/crawler/wordpress_crawler.py b/projects/crawler/src/basango/services/crawler/wordpress_crawler.py index 821c5f6..2bd17bc 100644 --- a/projects/crawler/src/basango/services/crawler/wordpress_crawler.py +++ b/projects/crawler/src/basango/services/crawler/wordpress_crawler.py @@ -3,6 +3,8 @@ import logging from datetime import datetime, timezone from typing import Optional, override, cast, Final, Any, Sequence +from markdownify import markdownify + from basango.domain.article import Article from bs4 import BeautifulSoup @@ -104,7 +106,10 @@ class WordpressCrawler(BaseCrawler): body_html = data.get("content", {}).get("rendered", "") title = BeautifulSoup(title_html, "html.parser").get_text(" ", strip=True) - body = BeautifulSoup(body_html, "html.parser").get_text(" 
", strip=True)
+        body = markdownify(
+            BeautifulSoup(body_html, "html.parser").get_text(" ", strip=False),
+            heading_style="ATX",
+        )
         timestamp = self._compute_timestamp(data.get("date"))
         categories_value = self._map_categories(data.get("categories", []))
 
diff --git a/projects/crawler/src/basango/services/tokenizer.py b/projects/crawler/src/basango/services/tokenizer.py
new file mode 100644
index 0000000..c9db73d
--- /dev/null
+++ b/projects/crawler/src/basango/services/tokenizer.py
@@ -0,0 +1,84 @@
+"""
+Tokenizer utilities for counting and encoding article text.
+
+This module wraps the `tiktoken` encoder to provide simple helpers for:
+- encoding/decoding text to token ids
+- counting tokens for different parts of an Article
+
+The `Tokenizer` can be constructed with either a specific `model` (preferred)
+or an `encoding` name fallback.
+"""
+
+import logging
+from typing import Optional
+
+import tiktoken
+
+from basango.domain.token_statistics import TokenStatistics
+
+
+class Tokenizer:
+    """Thin wrapper around a tiktoken encoder for token operations."""
+
+    def __init__(
+        self, encoding: str = "cl100k_base", model: Optional[str] = None
+    ) -> None:
+        """Build the underlying tiktoken encoder.
+
+        Args:
+            encoding: tiktoken encoding name, used when no model is given.
+            model: Optional model name; when truthy, its encoding is used
+                instead of ``encoding``.
+        """
+        # Prefer model-based encoding lookup if a model is provided.
+        self.tokenizer = (
+            tiktoken.encoding_for_model(model)
+            if model
+            else tiktoken.get_encoding(encoding)
+        )
+        # Record the encoding actually in use: with a model it may differ
+        # from the `encoding` argument, which is only a fallback.
+        self.encoding = self.tokenizer.name
+
+    def encode(self, text: str) -> list[int]:
+        """Encode text into a list of token ids.
+
+        Raises:
+            ValueError: If ``text`` contains a special-token marker such as
+                ``<|endoftext|>``, which tiktoken rejects by default.
+        """
+        return self.tokenizer.encode(text)
+
+    def decode(self, tokens: list[int]) -> str:
+        """Decode a list of token ids back into a string."""
+        return self.tokenizer.decode(tokens)
+
+    def count_tokens(
+        self, title: str, body: str, categories: list[str]
+    ) -> TokenStatistics:
+        """Return token counts for the parts of an article.
+
+        Crawled text is untrusted, so counting uses ``encode_ordinary``:
+        special-token markers found in an article are counted as plain text
+        instead of raising ``ValueError``.
+
+        Args:
+            title: Article title.
+            body: Article body; the excerpt count uses its first 200
+                characters as a quick estimate of a short preview.
+            categories: Category names, counted as one ", "-joined string.
+
+        Returns:
+            Token counts for the title, body, excerpt and categories.
+        """
+        logging.info("[Tokenizer] tokenizing %s...", title)
+        return TokenStatistics(
+            title=self._count(title),
+            body=self._count(body),
+            excerpt=self._count(body[:200]),
+            categories=self._count(", ".join(categories)),
+        )
+
+    def _count(self, text: str) -> int:
+        """Count tokens in ``text``; special-token markers count as text."""
+        return len(self.tokenizer.encode_ordinary(text))
diff --git a/projects/crawler/uv.lock b/projects/crawler/uv.lock
index dae5f8b..da03417 100644
--- a/projects/crawler/uv.lock
+++ b/projects/crawler/uv.lock
@@ -62,6 +62,7 @@ dependencies = [
     { name = "readability-lxml" },
     { name = "rq" },
     { name = "selectolax" },
+    { name = "tiktoken" },
     { name = "trafilatura" },
     { name = "typer" },
     { name = "uv-build" },
@@ -86,6 +87,7 @@ requires-dist = [
     { name = "readability-lxml", specifier = ">=0.8.1" },
     { name = "rq", specifier = ">=2.5.0" },
     { name = "selectolax", specifier = ">=0.3.20" },
+    { name = "tiktoken", specifier = ">=0.12.0" },
     { name = "trafilatura", specifier = ">=1.7.0" },
     { name = "typer", specifier = ">=0.16.1" },
     { name = "uv-build", specifier = ">=0.8.12,<0.9.0" },
@@ -632,6 +634,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cf/3e/7d7ac6fd085023312421e0d69dfabdfb28e116e513fadbe9afe710c01893/regex-2025.9.1-cp314-cp314-win_arm64.whl", hash = "sha256:f46d525934871ea772930e997d577d48c6983e50f206ff7b66d4ac5f8941e993", size = 271860, upload-time = "2025-09-01T22:09:42.413Z" },
 ]
 
+[[package]]
+name = "requests"
+version = "2.32.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "certifi" },
+    { name = "charset-normalizer" },
+    { name = "idna" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl",
hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, +] + [[package]] name = "rich" version = "14.1.0" @@ -771,6 +788,46 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f7/45/8c4ebc0c460e6ec38e62ab245ad3c7fc10b210116cea7c16d61602aa9558/stevedore-5.4.1-py3-none-any.whl", hash = "sha256:d10a31c7b86cba16c1f6e8d15416955fc797052351a56af15e608ad20811fcfe", size = 49533, upload-time = "2025-02-20T14:03:55.849Z" }, ] +[[package]] +name = "tiktoken" +version = "0.12.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "regex" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/61/441588ee21e6b5cdf59d6870f86beb9789e532ee9718c251b391b70c68d6/tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3", size = 1050802, upload-time = "2025-10-06T20:22:00.96Z" }, + { url = "https://files.pythonhosted.org/packages/1f/05/dcf94486d5c5c8d34496abe271ac76c5b785507c8eae71b3708f1ad9b45a/tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160", size = 993995, upload-time = "2025-10-06T20:22:02.788Z" }, + { url = "https://files.pythonhosted.org/packages/a0/70/5163fe5359b943f8db9946b62f19be2305de8c3d78a16f629d4165e2f40e/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa", size = 1128948, upload-time = "2025-10-06T20:22:03.814Z" }, + { url = 
"https://files.pythonhosted.org/packages/0c/da/c028aa0babf77315e1cef357d4d768800c5f8a6de04d0eac0f377cb619fa/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be", size = 1151986, upload-time = "2025-10-06T20:22:05.173Z" }, + { url = "https://files.pythonhosted.org/packages/a0/5a/886b108b766aa53e295f7216b509be95eb7d60b166049ce2c58416b25f2a/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a", size = 1194222, upload-time = "2025-10-06T20:22:06.265Z" }, + { url = "https://files.pythonhosted.org/packages/f4/f8/4db272048397636ac7a078d22773dd2795b1becee7bc4922fe6207288d57/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3", size = 1255097, upload-time = "2025-10-06T20:22:07.403Z" }, + { url = "https://files.pythonhosted.org/packages/8e/32/45d02e2e0ea2be3a9ed22afc47d93741247e75018aac967b713b2941f8ea/tiktoken-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697", size = 879117, upload-time = "2025-10-06T20:22:08.418Z" }, + { url = "https://files.pythonhosted.org/packages/ce/76/994fc868f88e016e6d05b0da5ac24582a14c47893f4474c3e9744283f1d5/tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16", size = 1050309, upload-time = "2025-10-06T20:22:10.939Z" }, + { url = "https://files.pythonhosted.org/packages/f6/b8/57ef1456504c43a849821920d582a738a461b76a047f352f18c0b26c6516/tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a", size = 993712, upload-time = "2025-10-06T20:22:12.115Z" }, + { url = 
"https://files.pythonhosted.org/packages/72/90/13da56f664286ffbae9dbcfadcc625439142675845baa62715e49b87b68b/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27", size = 1128725, upload-time = "2025-10-06T20:22:13.541Z" }, + { url = "https://files.pythonhosted.org/packages/05/df/4f80030d44682235bdaecd7346c90f67ae87ec8f3df4a3442cb53834f7e4/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb", size = 1151875, upload-time = "2025-10-06T20:22:14.559Z" }, + { url = "https://files.pythonhosted.org/packages/22/1f/ae535223a8c4ef4c0c1192e3f9b82da660be9eb66b9279e95c99288e9dab/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e", size = 1194451, upload-time = "2025-10-06T20:22:15.545Z" }, + { url = "https://files.pythonhosted.org/packages/78/a7/f8ead382fce0243cb625c4f266e66c27f65ae65ee9e77f59ea1653b6d730/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25", size = 1253794, upload-time = "2025-10-06T20:22:16.624Z" }, + { url = "https://files.pythonhosted.org/packages/93/e0/6cc82a562bc6365785a3ff0af27a2a092d57c47d7a81d9e2295d8c36f011/tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f", size = 878777, upload-time = "2025-10-06T20:22:18.036Z" }, + { url = "https://files.pythonhosted.org/packages/72/05/3abc1db5d2c9aadc4d2c76fa5640134e475e58d9fbb82b5c535dc0de9b01/tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646", size = 1050188, upload-time = "2025-10-06T20:22:19.563Z" }, + { url = 
"https://files.pythonhosted.org/packages/e3/7b/50c2f060412202d6c95f32b20755c7a6273543b125c0985d6fa9465105af/tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88", size = 993978, upload-time = "2025-10-06T20:22:20.702Z" }, + { url = "https://files.pythonhosted.org/packages/14/27/bf795595a2b897e271771cd31cb847d479073497344c637966bdf2853da1/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff", size = 1129271, upload-time = "2025-10-06T20:22:22.06Z" }, + { url = "https://files.pythonhosted.org/packages/f5/de/9341a6d7a8f1b448573bbf3425fa57669ac58258a667eb48a25dfe916d70/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830", size = 1151216, upload-time = "2025-10-06T20:22:23.085Z" }, + { url = "https://files.pythonhosted.org/packages/75/0d/881866647b8d1be4d67cb24e50d0c26f9f807f994aa1510cb9ba2fe5f612/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b", size = 1194860, upload-time = "2025-10-06T20:22:24.602Z" }, + { url = "https://files.pythonhosted.org/packages/b3/1e/b651ec3059474dab649b8d5b69f5c65cd8fcd8918568c1935bd4136c9392/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b", size = 1254567, upload-time = "2025-10-06T20:22:25.671Z" }, + { url = "https://files.pythonhosted.org/packages/80/57/ce64fd16ac390fafde001268c364d559447ba09b509181b2808622420eec/tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3", size = 921067, upload-time = "2025-10-06T20:22:26.753Z" }, + { url = 
"https://files.pythonhosted.org/packages/ac/a4/72eed53e8976a099539cdd5eb36f241987212c29629d0a52c305173e0a68/tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365", size = 1050473, upload-time = "2025-10-06T20:22:27.775Z" }, + { url = "https://files.pythonhosted.org/packages/e6/d7/0110b8f54c008466b19672c615f2168896b83706a6611ba6e47313dbc6e9/tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e", size = 993855, upload-time = "2025-10-06T20:22:28.799Z" }, + { url = "https://files.pythonhosted.org/packages/5f/77/4f268c41a3957c418b084dd576ea2fad2e95da0d8e1ab705372892c2ca22/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63", size = 1129022, upload-time = "2025-10-06T20:22:29.981Z" }, + { url = "https://files.pythonhosted.org/packages/4e/2b/fc46c90fe5028bd094cd6ee25a7db321cb91d45dc87531e2bdbb26b4867a/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0", size = 1150736, upload-time = "2025-10-06T20:22:30.996Z" }, + { url = "https://files.pythonhosted.org/packages/28/c0/3c7a39ff68022ddfd7d93f3337ad90389a342f761c4d71de99a3ccc57857/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a", size = 1194908, upload-time = "2025-10-06T20:22:32.073Z" }, + { url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706, upload-time = "2025-10-06T20:22:33.385Z" }, + { url = 
"https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" }, +] + [[package]] name = "tld" version = "0.13.1"