[backend, crawler] feat: support token statistics
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
# Basango : Towards a scalable and intelligent system for Congolese News curation
|
||||
# Basango: Towards a scalable and intelligent system for Congolese News curation
|
||||
|
||||
[](https://github.com/bernard-ng/basango/actions/workflows/backend_audit.yaml)
|
||||
[](https://github.com/bernard-ng/basango/actions/workflows/backend_deploy.yaml)
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# Visit https://bit.ly/cffinit to generate yours today!
|
||||
|
||||
cff-version: 1.2.0
|
||||
title: DRC News Corpus
|
||||
title: Basango
|
||||
message: >-
|
||||
If you use this software, please cite it using the
|
||||
metadata from this file.
|
||||
@@ -14,11 +14,11 @@ authors:
|
||||
email: bernard@devscast.tech
|
||||
affiliation: Devscast Community
|
||||
orcid: 'https://orcid.org/0009-0003-9777-6349'
|
||||
repository-code: 'https://github.com/bernard-ng/drc-news-corpus'
|
||||
repository-code: 'https://github.com/bernard-ng/basango'
|
||||
repository: >-
|
||||
https://www.huggingface.c0/datasets/bernard-ng/drc-news-corpus
|
||||
https://www.huggingface.co/datasets/bernard-ng/basango
|
||||
abstract: >-
|
||||
The "DRC News Corpus" is a curated collection of news
|
||||
The "Basango" dataset is a curated collection of news
|
||||
articles sourced from major media outlets covering a wide
|
||||
spectrum of topics related to the Democratic Republic of
|
||||
Congo (DRC). This dataset encompasses a diverse range of
|
||||
|
||||
+10
-10
@@ -1,24 +1,24 @@
|
||||
# Core and Backend
|
||||
|
||||

|
||||

|
||||

|
||||

|
||||

|
||||

|
||||

|
||||

|
||||
|
||||
| Scope | Link |
|
||||
|-------------------|------------------------------------------------------------|
|
||||
| core and backend | https://github.com/bernard-ng/drc-news-corpus |
|
||||
| core and backend | https://github.com/bernard-ng/basango |
|
||||
| ML models | https://github.com/bernard-ng/drc-news-ml |
|
||||
| Mobile App | https://github.com/bernard-ng/basango |
|
||||
| Dataset (partial) | https://huggingface.co/datasets/bernard-ng/drc-news-corpus |
|
||||
| Dataset (partial) | https://huggingface.co/datasets/bernard-ng/basango |
|
||||
|
||||
---
|
||||
|
||||
## DRC News Corpus : Towards a scalable and intelligent system for Congolese News curation
|
||||
## Basango: Towards a scalable and intelligent system for Congolese News curation
|
||||
|
||||
### Introduction
|
||||
|
||||
The **"DRC News Corpus"** is a structured and scalable dataset of news articles sourced from major media outlets covering diverse aspects of the Democratic Republic of Congo (DRC). Designed for efficiency, this system enables the automated collection, processing, and organization of news stories spanning politics, economy, society, culture, environment, and international affairs.
|
||||
**"Basango"** is a structured and scalable dataset of news articles sourced from major media outlets covering diverse aspects of the Democratic Republic of Congo (DRC). Designed for efficiency, this system enables the automated collection, processing, and organization of news stories spanning politics, economy, society, culture, environment, and international affairs.
|
||||
|
||||
### Scalability and Use Cases:
|
||||
|
||||
@@ -45,7 +45,7 @@ If you want to rebuild the dataset follow the steps bellow :
|
||||
|
||||
#### Installation
|
||||
```bash
|
||||
git clone https://github.com/bernard-ng/drc-news-corpus.git && cd drc-news-corpus
|
||||
git clone https://github.com/bernard-ng/basango.git && cd basango
|
||||
make build
|
||||
make start
|
||||
```
|
||||
@@ -104,5 +104,5 @@ a CSV file will be generated in the `data` directory.
|
||||
|
||||
|
||||
### Acknowledgment:
|
||||
The compilation and curation of the "DRC News Corpus" were conducted by Tshabu Ngandu Bernard with the primary objective of facilitating research and analysis related to the Democratic Republic of Congo.
|
||||
The compilation and curation of the "Basango" dataset were conducted by Tshabu Ngandu Bernard with the primary objective of facilitating research and analysis related to the Democratic Republic of Congo.
|
||||
I do not own the content of the articles, and all rights belong to the respective publishers. The dataset is intended for non-commercial research purposes only.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"version": "1",
|
||||
"name": "drc-news-corpus",
|
||||
"name": "basango",
|
||||
"type": "collection",
|
||||
"ignore": [
|
||||
"node_modules",
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
<field name="hash" length="32" />
|
||||
<field name="categories" type="text[]" nullable="true" />
|
||||
|
||||
<many-to-one field="source" target-entity="Basango\Aggregator\Domain\Model\Entity\Source">
|
||||
<many-to-one field="source" target-entity="Basango\Aggregator\Domain\Model\Entity\Source" fetch="EAGER">
|
||||
<join-column nullable="false" on-delete="CASCADE" />
|
||||
</many-to-one>
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
</field>
|
||||
<field name="metadata" type="open_graph" nullable="true" />
|
||||
<embedded name="readingTime" class="Basango\Aggregator\Domain\Model\ValueObject\ReadingTime" use-column-prefix="false" />
|
||||
<field name="tokenStatistics" type="token_statistics" nullable="true" />
|
||||
|
||||
<field name="image"
|
||||
insertable="false"
|
||||
|
||||
@@ -8,5 +8,12 @@
|
||||
repository-class="Gesdinet\JWTRefreshTokenBundle\Entity\RefreshTokenRepository"
|
||||
table="refresh_tokens"
|
||||
>
|
||||
<id name="id" type="integer">
|
||||
<generator strategy="SEQUENCE" />
|
||||
<sequence-generator sequence-name="refresh_tokens_id_seq" allocation-size="100" initial-value="1" />
|
||||
</id>
|
||||
<field name="refreshToken" type="string" column="refresh_token" length="128" unique="true"/>
|
||||
<field name="username" type="string" length="255" column="username"/>
|
||||
<field name="valid" type="datetime"/>
|
||||
</entity>
|
||||
</doctrine-mapping>
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace DoctrineMigrations;
|
||||
|
||||
use Doctrine\DBAL\Schema\Schema;
|
||||
use Doctrine\Migrations\AbstractMigration;
|
||||
|
||||
/**
|
||||
* Class Version20251024234318.
|
||||
*
|
||||
* @author bernard-ng <bernard@devscast.tech>
|
||||
*/
|
||||
final class Version20251024234318 extends AbstractMigration
{
    /**
     * Short description of what this migration does.
     */
    public function getDescription(): string
    {
        return 'add token statistics to article';
    }

    /**
     * Adds the `token_statistics` JSONB column (NULL by default) to the `article` table.
     */
    public function up(Schema $schema): void
    {
        $statement = 'ALTER TABLE article ADD token_statistics JSONB DEFAULT NULL';

        $this->addSql($statement);
    }

    /**
     * Reverts up() by dropping the `token_statistics` column from the `article` table.
     */
    public function down(Schema $schema): void
    {
        $statement = 'ALTER TABLE article DROP token_statistics';

        $this->addSql($statement);
    }
}
|
||||
@@ -70,6 +70,7 @@ doctrine:
|
||||
article_id: Basango\Aggregator\Infrastructure\Persistence\Doctrine\DBAL\Types\ArticleIdType
|
||||
source_id: Basango\Aggregator\Infrastructure\Persistence\Doctrine\DBAL\Types\SourceIdType
|
||||
open_graph: Basango\Aggregator\Infrastructure\Persistence\Doctrine\DBAL\Types\OpenGraphType
|
||||
token_statistics: Basango\Aggregator\Infrastructure\Persistence\Doctrine\DBAL\Types\TokenStatisticsType
|
||||
|
||||
# Identity and Access
|
||||
user_id: Basango\IdentityAndAccess\Infrastructure\Persistence\Doctrine\DBAL\Types\UserIdType
|
||||
@@ -125,6 +126,7 @@ doctrine:
|
||||
orm:
|
||||
auto_generate_proxy_classes: true
|
||||
enable_lazy_ghost_objects: true
|
||||
enable_native_lazy_objects: true
|
||||
entity_managers:
|
||||
default:
|
||||
validate_xml_mapping: false
|
||||
|
||||
@@ -6,6 +6,7 @@ namespace Basango\Aggregator\Application\UseCase\Command;
|
||||
|
||||
use Basango\Aggregator\Domain\Model\ValueObject\Link;
|
||||
use Basango\Aggregator\Domain\Model\ValueObject\OpenGraph;
|
||||
use Basango\Aggregator\Domain\Model\ValueObject\TokenStatistics;
|
||||
|
||||
/**
|
||||
* Class Save.
|
||||
@@ -17,11 +18,12 @@ final readonly class CreateArticle
|
||||
public function __construct(
|
||||
public string $title,
|
||||
public Link $link,
|
||||
public string $categories,
|
||||
public array $categories,
|
||||
public string $body,
|
||||
public string $source,
|
||||
public int $timestamp,
|
||||
public ?OpenGraph $metadata = null
|
||||
public ?OpenGraph $metadata = null,
|
||||
public ?TokenStatistics $tokenStatistics = null
|
||||
) {
|
||||
}
|
||||
}
|
||||
|
||||
+2
-1
@@ -43,12 +43,13 @@ final readonly class CreateArticleHandler implements CommandHandler
|
||||
link: $command->link,
|
||||
body: $command->body,
|
||||
hash: $hash,
|
||||
categories: mb_strtolower($command->categories),
|
||||
categories: $command->categories,
|
||||
source: $source,
|
||||
publishedAt: $publishedAt
|
||||
);
|
||||
$article
|
||||
->defineOpenGraph($command->metadata)
|
||||
->defineTokenStatistics($command->tokenStatistics)
|
||||
->computeReadingTime();
|
||||
|
||||
$this->articleRepository->add($article);
|
||||
|
||||
@@ -10,6 +10,7 @@ use Basango\Aggregator\Domain\Model\ValueObject\OpenGraph;
|
||||
use Basango\Aggregator\Domain\Model\ValueObject\ReadingTime;
|
||||
use Basango\Aggregator\Domain\Model\ValueObject\Scoring\Credibility;
|
||||
use Basango\Aggregator\Domain\Model\ValueObject\Scoring\Sentiment;
|
||||
use Basango\Aggregator\Domain\Model\ValueObject\TokenStatistics;
|
||||
|
||||
/**
|
||||
* Class Article.
|
||||
@@ -25,13 +26,14 @@ class Article
|
||||
public readonly Link $link,
|
||||
public readonly string $body,
|
||||
public readonly string $hash,
|
||||
private(set) string $categories,
|
||||
private(set) array $categories,
|
||||
public readonly Source $source,
|
||||
public readonly \DateTimeImmutable $publishedAt,
|
||||
public readonly \DateTimeImmutable $crawledAt = new \DateTimeImmutable(),
|
||||
private(set) Credibility $credibility = new Credibility(),
|
||||
private(set) Sentiment $sentiment = Sentiment::NEUTRAL,
|
||||
private(set) ?OpenGraph $metadata = null,
|
||||
private(set) ?TokenStatistics $tokenStatistics = null,
|
||||
private(set) ?ReadingTime $readingTime = null,
|
||||
private(set) ?\DateTimeImmutable $updatedAt = null,
|
||||
public readonly ?string $image = null,
|
||||
@@ -56,7 +58,7 @@ class Article
|
||||
return $this;
|
||||
}
|
||||
|
||||
public function assignCategories(string $categories): self
|
||||
public function assignCategories(array $categories): self
|
||||
{
|
||||
$this->categories = $categories;
|
||||
$this->updatedAt = new \DateTimeImmutable();
|
||||
@@ -83,4 +85,11 @@ class Article
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
public function defineTokenStatistics(?TokenStatistics $statistics): self
|
||||
{
|
||||
$this->tokenStatistics = $statistics;
|
||||
|
||||
return $this;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,62 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Basango\Aggregator\Domain\Model\ValueObject;
|
||||
|
||||
/**
|
||||
* Class TokenStatistics.
|
||||
*
|
||||
* @author bernard-ng <bernard@devscast.tech>
|
||||
*/
|
||||
final class TokenStatistics implements \JsonSerializable
{
    /**
     * Sum of the four section counts; a section that is null contributes 0.
     */
    public ?int $total {
        get {
            return ($this->title ?? 0)
                + ($this->body ?? 0)
                + ($this->excerpt ?? 0)
                + ($this->categories ?? 0);
        }
    }

    /**
     * @param int|null $title      token count of the title
     * @param int|null $body       token count of the body
     * @param int|null $excerpt    token count of the excerpt
     * @param int|null $categories token count of the categories
     */
    public function __construct(
        public readonly ?int $title = null,
        public readonly ?int $body = null,
        public readonly ?int $excerpt = null,
        public readonly ?int $categories = null,
    ) {
    }

    /**
     * Builds an instance from a JSON object string.
     *
     * Returns null when the input is null, is not valid JSON, does not decode
     * to an array, or carries values the constructor rejects (e.g. non-integer
     * counts under strict types). Unknown keys such as "total" are ignored.
     */
    public static function tryFrom(?string $value): ?self
    {
        if ($value === null) {
            return null;
        }

        try {
            $object = \json_decode($value, true, 512, JSON_THROW_ON_ERROR);

            // Valid JSON scalars ("5", "true", "null") are not a statistics
            // payload; without this guard they would yield an all-null object.
            if (! \is_array($object)) {
                return null;
            }

            return new self(
                $object['title'] ?? null,
                $object['body'] ?? null,
                $object['excerpt'] ?? null,
                $object['categories'] ?? null,
            );
        } catch (\Throwable) {
            // Malformed JSON or wrongly-typed values: treated as "no statistics".
            return null;
        }
    }

    /**
     * JSON representation: the four section counts plus the computed total.
     */
    #[\Override]
    public function jsonSerialize(): array
    {
        return [
            'title' => $this->title,
            'body' => $this->body,
            'excerpt' => $this->excerpt,
            'categories' => $this->categories,
            'total' => $this->total,
        ];
    }
}
|
||||
+67
@@ -0,0 +1,67 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Basango\Aggregator\Infrastructure\Persistence\Doctrine\DBAL\Types;
|
||||
|
||||
use Basango\Aggregator\Domain\Model\ValueObject\TokenStatistics;
|
||||
use Doctrine\DBAL\Platforms\AbstractPlatform;
|
||||
use Doctrine\DBAL\Types\ConversionException;
|
||||
use Doctrine\DBAL\Types\Type;
|
||||
|
||||
/**
|
||||
* Class TokenStatisticsType.
|
||||
*
|
||||
* @author bernard-ng <bernard@devscast.tech>
|
||||
*/
|
||||
final class TokenStatisticsType extends Type
{
    /**
     * Delegates the column declaration to the platform's JSON type, passing
     * the `nullable` and `jsonb` options.
     */
    public function getSQLDeclaration(array $column, AbstractPlatform $platform): string
    {
        $options = [
            'nullable' => true,
            'jsonb' => true,
        ];

        return $platform->getJsonTypeDeclarationSQL($options);
    }

    /**
     * Name under which this type is referenced ("token_statistics").
     */
    public function getName(): string
    {
        return 'token_statistics';
    }

    /**
     * Converts the stored value into a TokenStatistics object.
     *
     * Null stays null; a string is handed to TokenStatistics::tryFrom();
     * anything else is rejected as an invalid type.
     *
     * @throws ConversionException when the value is neither null nor a string,
     *                             or when building the value object throws
     */
    #[\Override]
    public function convertToPHPValue(mixed $value, AbstractPlatform $platform): ?TokenStatistics
    {
        if ($value === null) {
            return null;
        }

        if (\is_string($value)) {
            try {
                return TokenStatistics::tryFrom($value);
            } catch (\Throwable $e) {
                throw ConversionException::conversionFailed($value, $this->getName(), $e);
            }
        }

        throw ConversionException::conversionFailedInvalidType($value, $this->getName(), ['null', 'string', TokenStatistics::class]);
    }

    /**
     * Converts a TokenStatistics object into its JSON string for storage.
     *
     * Null and the empty string are stored as null; a failed encoding also
     * yields null. Any other string, or any other type, is rejected.
     *
     * @throws ConversionException for non-empty strings and unsupported types
     */
    #[\Override]
    public function convertToDatabaseValue($value, AbstractPlatform $platform): ?string
    {
        return match (true) {
            $value instanceof TokenStatistics => json_encode($value) ?: null,
            $value === null, $value === '' => null,
            \is_string($value) => throw ConversionException::conversionFailed($value, $this->getName()),
            default => throw ConversionException::conversionFailedInvalidType($value, $this->getName(), ['null', 'string', TokenStatistics::class]),
        };
    }
}
|
||||
+2
-1
@@ -47,11 +47,12 @@ final class AddArticleController extends AbstractController
|
||||
$this->handleCommand(new CreateArticle(
|
||||
$model->title,
|
||||
Link::from($model->link),
|
||||
implode(', ', $model->categories),
|
||||
$model->categories,
|
||||
$model->body,
|
||||
$model->source,
|
||||
$model->timestamp,
|
||||
$model->metadata,
|
||||
$model->tokenStatistics
|
||||
));
|
||||
|
||||
return new JsonResponse(status: Response::HTTP_CREATED);
|
||||
|
||||
@@ -5,6 +5,7 @@ declare(strict_types=1);
|
||||
namespace Basango\Aggregator\Presentation\WriteModel;
|
||||
|
||||
use Basango\Aggregator\Domain\Model\ValueObject\OpenGraph;
|
||||
use Basango\Aggregator\Domain\Model\ValueObject\TokenStatistics;
|
||||
use Symfony\Component\Validator\Constraints as Assert;
|
||||
|
||||
/**
|
||||
@@ -32,4 +33,6 @@ final class AddArticleModel
|
||||
public array $categories = [];
|
||||
|
||||
public ?OpenGraph $metadata = null;
|
||||
|
||||
public ?TokenStatistics $tokenStatistics = null;
|
||||
}
|
||||
|
||||
+1
-1
@@ -42,7 +42,7 @@ final readonly class GetArticleOverviewListDbalHandler implements GetArticleOver
|
||||
|
||||
$qb->from('article', 'a')
|
||||
->innerJoin('a', 'source', 's', 'a.source_id = s.id')
|
||||
//->orderBy('a.published_at', $query->filters->sortDirection->value)
|
||||
->orderBy('a.published_at', $query->filters->sortDirection->value)
|
||||
->setParameter('userId', $query->userId->toString())
|
||||
;
|
||||
|
||||
|
||||
+6
-4
@@ -62,15 +62,17 @@ trait ArticleQuery
|
||||
private function applyArticleFilters(QueryBuilder $qb, ArticleFilters $filters): QueryBuilder
|
||||
{
|
||||
if ($filters->category !== null) {
|
||||
// PostgreSQL array containment for single value
|
||||
$qb->andWhere(':category = ANY(a.categories)')
|
||||
->setParameter('category', $filters->category);
|
||||
}
|
||||
|
||||
if ($filters->search !== null) {
|
||||
// Case-insensitive search in PostgreSQL
|
||||
$qb->andWhere('a.title ILIKE :search')
|
||||
->setParameter('search', sprintf('%%%s%%', $filters->search));
|
||||
$qb
|
||||
->addSelect("ts_rank(a.tsv, to_tsquery('french', :search)) AS rank")
|
||||
->andWhere("a.tsv @@ to_tsquery('french', :search)")
|
||||
->setParameter('search', $filters->search)
|
||||
->resetOrderBy()
|
||||
->orderBy('rank', $filters->sortDirection->value);
|
||||
}
|
||||
|
||||
if ($filters->dateRange instanceof DateRange) {
|
||||
|
||||
@@ -4,8 +4,8 @@ declare(strict_types=1);
|
||||
|
||||
namespace Basango\IdentityAndAccess\Domain\Model\Entity;
|
||||
|
||||
use Gesdinet\JWTRefreshTokenBundle\Entity\RefreshToken as BaseRefreshToken;
|
||||
use Gesdinet\JWTRefreshTokenBundle\Model\AbstractRefreshToken;
|
||||
|
||||
class RefreshToken extends BaseRefreshToken
|
||||
class RefreshToken extends AbstractRefreshToken
|
||||
{
|
||||
}
|
||||
|
||||
@@ -11,15 +11,15 @@ namespace Basango\SharedKernel\Domain;
|
||||
*/
|
||||
final class Application
|
||||
{
|
||||
public string $name = 'DRC News Corpus';
|
||||
public string $name = 'Basango';
|
||||
|
||||
public string $website = 'https://research.devscast.org/drc-news-corpus';
|
||||
public string $website = 'https://basango.ngandu.dev';
|
||||
|
||||
public string $emailAddress = 'contact@devscast.tech';
|
||||
|
||||
public string $infoAddress = 'contact@devscast.tech';
|
||||
|
||||
public string $emailName = 'DRC News Corpus';
|
||||
public string $emailName = 'Basango';
|
||||
|
||||
public string $legalName = 'Devscast Software SàSu';
|
||||
|
||||
|
||||
+2
-3
@@ -48,14 +48,13 @@ trait PaginationQuery
|
||||
PaginatorKeyset $keyset,
|
||||
SortDirection $direction = SortDirection::DESC
|
||||
): QueryBuilder {
|
||||
$orderDirection = strtoupper($direction->value);
|
||||
$comparisonOperator = $direction === SortDirection::ASC ? '>' : '<';
|
||||
|
||||
if ($keyset->date !== null) {
|
||||
$qb->addOrderBy($keyset->date, $orderDirection);
|
||||
$qb->addOrderBy($keyset->date, $direction->value);
|
||||
}
|
||||
|
||||
$qb->addOrderBy($keyset->id, $orderDirection);
|
||||
$qb->addOrderBy($keyset->id, $direction->value);
|
||||
|
||||
$cursor = PaginationCursor::decode($page->cursor);
|
||||
if (! $cursor instanceof PaginationCursor) {
|
||||
|
||||
@@ -22,9 +22,9 @@ final class DefaultController extends AbstractController
|
||||
public function __invoke(): JsonResponse
|
||||
{
|
||||
return $this->json([
|
||||
'repository' => 'https://github.com/bernard-ng/drc-news-corpus',
|
||||
'title' => 'DRC News Corpus : Towards a scalable and intelligent system for Congolese News curation',
|
||||
'description' => 'The DRC News Corpus is a structured and scalable dataset of news articles sourced from major media outlets covering diverse aspects of the Democratic Republic of Congo (DRC). Designed for efficiency, this system enables the automated collection, processing, and organization of news stories spanning politics, economy, society, culture, environment, and international affairs.',
|
||||
'repository' => 'https://github.com/bernard-ng/basango',
|
||||
'title' => 'Basango : Towards a scalable and intelligent system for Congolese News curation',
|
||||
'description' => 'The Basango is a structured and scalable dataset of news articles sourced from major media outlets covering diverse aspects of the Democratic Republic of Congo (DRC). Designed for efficiency, this system enables the automated collection, processing, and organization of news stories spanning politics, economy, society, culture, environment, and international affairs.',
|
||||
'status' => 200,
|
||||
]);
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
|
||||
- Install the project in your virtualenv so the `basango` CLI is available:
|
||||
- With uv: `uv run --with . basango --help`
|
||||
- Or install locally: `pip install -e .` then `basango --help`
|
||||
- Or install locally: `uv sync` then `basango --help`
|
||||
|
||||
#### Sync crawl (in-process)
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ sources:
|
||||
replacement: "$3-$2-$1 $4"
|
||||
source_selectors:
|
||||
articles: ".view-content > .views-row.content-row"
|
||||
article_title: ".views-field-title a"
|
||||
article_title: "h1.page-header"
|
||||
article_link: ".views-field-title a"
|
||||
article_body: ".field-name-body"
|
||||
article_date: ".views-field-created"
|
||||
@@ -45,7 +45,7 @@ sources:
|
||||
pagination: "ul.pagination > li.pager-last > a"
|
||||
pagination_template: "actualite"
|
||||
supports_categories: false
|
||||
requires_details: false
|
||||
requires_details: true
|
||||
requires_rate_limit: false
|
||||
|
||||
- source_id: 7sur7.cd
|
||||
|
||||
@@ -17,6 +17,7 @@ dependencies = [
|
||||
"markdownify>=0.13.1",
|
||||
"readability-lxml>=0.8.1",
|
||||
"beautifulsoup4>=4.13.5",
|
||||
"tiktoken>=0.12.0",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
|
||||
@@ -2,6 +2,7 @@ from datetime import datetime
|
||||
from typing import Any, Optional
|
||||
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from .token_statistics import TokenStatistics
|
||||
|
||||
|
||||
class Article(BaseModel):
|
||||
@@ -12,6 +13,7 @@ class Article(BaseModel):
|
||||
source: str
|
||||
timestamp: datetime
|
||||
metadata: Optional[dict[str, Any]] = None
|
||||
token_statistics: Optional["TokenStatistics"] = None
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
@@ -22,4 +24,7 @@ class Article(BaseModel):
|
||||
"source": self.source,
|
||||
"timestamp": int(self.timestamp.timestamp()),
|
||||
"metadata": self.metadata,
|
||||
"tokenStatistics": self.token_statistics.to_dict()
|
||||
if self.token_statistics
|
||||
else "",
|
||||
}
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class TokenStatistics:
    """Counts of tokens for different article sections."""

    # Token count of each article section.
    title: int
    body: int
    excerpt: int
    categories: int

    @property
    def total(self) -> int:
        """Sum of the four section counts.

        Derived on access and deliberately left out of ``to_dict`` so the
        serialized payload keeps exactly the four section keys.
        """
        return self.title + self.body + self.excerpt + self.categories

    def to_dict(self) -> dict[str, int]:
        """Return the section counts as a plain dictionary keyed by section name."""
        return {
            "title": self.title,
            "body": self.body,
            "excerpt": self.excerpt,
            "categories": self.categories,
        }
|
||||
@@ -3,6 +3,7 @@ from .http_client import BaseHttpClient, SyncHttpClient, AsyncHttpClient
|
||||
from .open_graph import OpenGraphProvider
|
||||
from .persistence import BasePersistor, CsvPersistor, JsonPersistor
|
||||
from .user_agents import UserAgentProvider
|
||||
from .tokenizer import Tokenizer
|
||||
|
||||
HttpClient = SyncHttpClient
|
||||
|
||||
@@ -17,4 +18,5 @@ __all__ = [
|
||||
"BasePersistor",
|
||||
"CsvPersistor",
|
||||
"JsonPersistor",
|
||||
"Tokenizer",
|
||||
]
|
||||
|
||||
@@ -1,15 +1,23 @@
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import asdict, is_dataclass
|
||||
from datetime import datetime
|
||||
from typing import Optional, Any, Dict, List, Sequence
|
||||
|
||||
from basango.domain.article import Article
|
||||
from bs4 import BeautifulSoup
|
||||
from pydantic import HttpUrl
|
||||
|
||||
from basango.core.config import CrawlerConfig, ClientConfig
|
||||
from basango.domain import DateRange, SourceKind, PageRange
|
||||
from basango.domain.exception import ArticleOutOfRange
|
||||
from basango.services import HttpClient, DateParser, OpenGraphProvider, BasePersistor
|
||||
from basango.services import (
|
||||
HttpClient,
|
||||
DateParser,
|
||||
OpenGraphProvider,
|
||||
BasePersistor,
|
||||
Tokenizer,
|
||||
)
|
||||
|
||||
|
||||
class BaseCrawler(ABC):
|
||||
@@ -35,6 +43,7 @@ class BaseCrawler(ABC):
|
||||
self.persistors: list[BasePersistor] = list(persistors) if persistors else []
|
||||
self.date_parser = DateParser()
|
||||
self.open_graph = OpenGraphProvider()
|
||||
self.tokenizer = Tokenizer()
|
||||
|
||||
@abstractmethod
|
||||
def fetch(self) -> None:
|
||||
@@ -61,23 +70,35 @@ class BaseCrawler(ABC):
|
||||
metadata_value = None
|
||||
elif is_dataclass(metadata) and not isinstance(metadata, type):
|
||||
metadata_value = asdict(metadata)
|
||||
else:
|
||||
elif isinstance(metadata, dict):
|
||||
metadata_value = metadata
|
||||
else:
|
||||
metadata_value = None
|
||||
|
||||
article = {
|
||||
"title": title,
|
||||
"link": link,
|
||||
"body": body,
|
||||
"categories": categories,
|
||||
"source": getattr(self.source, "source_id", None),
|
||||
"timestamp": timestamp,
|
||||
"metadata": metadata_value,
|
||||
}
|
||||
# Get source_id and ensure it's a string
|
||||
source_id = getattr(self.source, "source_id", None)
|
||||
if source_id is None:
|
||||
source_id = "unknown"
|
||||
|
||||
self._persist(article)
|
||||
logging.info(f"> {article['title']} [saved]")
|
||||
article = Article(
|
||||
title=title,
|
||||
link=HttpUrl(link), # Convert str to HttpUrl
|
||||
body=body,
|
||||
categories=categories,
|
||||
source=source_id, # Ensure it's a string, not None
|
||||
timestamp=datetime.fromtimestamp(
|
||||
timestamp
|
||||
), # Convert int timestamp to datetime
|
||||
metadata=metadata_value,
|
||||
)
|
||||
article.token_statistics = self.tokenizer.count_tokens(
|
||||
article.title, article.body, article.categories
|
||||
)
|
||||
|
||||
return Article(**article)
|
||||
self._persist(article.to_dict())
|
||||
logging.info("> %s [saved]", article.title)
|
||||
|
||||
return article
|
||||
|
||||
@abstractmethod
|
||||
def fetch_one(
|
||||
|
||||
@@ -6,6 +6,7 @@ from urllib.parse import parse_qs, urljoin, urlparse
|
||||
|
||||
from basango.domain.article import Article
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from markdownify import markdownify
|
||||
|
||||
from basango.core.config import CrawlerConfig, ClientConfig
|
||||
from basango.core.config.source_config import HtmlSourceConfig
|
||||
@@ -283,15 +284,15 @@ class HtmlCrawler(BaseCrawler):
|
||||
matches = node.select(selector)
|
||||
if matches:
|
||||
parts = [
|
||||
item.get_text(" ", strip=True)
|
||||
markdownify(item.get_text(" ", strip=False), heading_style="ATX")
|
||||
for item in matches
|
||||
if item.get_text(strip=True)
|
||||
]
|
||||
if parts:
|
||||
# Join without separators: callers can post-process if
|
||||
# needed, but this preserves maximum fidelity.
|
||||
return "".join(parts)
|
||||
return node.get_text(" ", strip=True)
|
||||
return "\n".join(parts)
|
||||
return markdownify(node.get_text(" ", strip=False), heading_style="ATX")
|
||||
|
||||
@staticmethod
|
||||
def _extract_categories(
|
||||
|
||||
@@ -3,6 +3,8 @@ import logging
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional, override, cast, Final, Any, Sequence
|
||||
|
||||
from markdownify import markdownify
|
||||
|
||||
from basango.domain.article import Article
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
@@ -104,7 +106,10 @@ class WordpressCrawler(BaseCrawler):
|
||||
body_html = data.get("content", {}).get("rendered", "")
|
||||
|
||||
title = BeautifulSoup(title_html, "html.parser").get_text(" ", strip=True)
|
||||
body = BeautifulSoup(body_html, "html.parser").get_text(" ", strip=True)
|
||||
body = markdownify(
|
||||
BeautifulSoup(body_html, "html.parser").get_text(" ", strip=False),
|
||||
heading_style="ATX",
|
||||
)
|
||||
timestamp = self._compute_timestamp(data.get("date"))
|
||||
|
||||
categories_value = self._map_categories(data.get("categories", []))
|
||||
|
||||
@@ -0,0 +1,56 @@
|
||||
"""
|
||||
Tokenizer utilities for counting and encoding article text.
|
||||
|
||||
This module wraps the `tiktoken` encoder to provide simple helpers for:
|
||||
- encoding/decoding text to token ids
|
||||
- counting tokens for different parts of an Article
|
||||
|
||||
The `Tokenizer` can be constructed with either a specific `model` (preferred)
|
||||
or an `encoding` name fallback.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
import tiktoken
|
||||
from typing import Optional
|
||||
|
||||
from basango.domain.token_statistics import TokenStatistics
|
||||
|
||||
|
||||
class Tokenizer:
    """Thin wrapper around tiktoken encoder for token operations."""

    def __init__(
        self, encoding: str = "cl100k_base", model: Optional[str] = None
    ) -> None:
        """Create the underlying tiktoken encoder.

        Args:
            encoding: Name of the tiktoken encoding used when no model is given.
            model: Optional model name; when truthy the encoder is looked up
                with ``tiktoken.encoding_for_model`` instead of ``encoding``.
        """
        self.encoding = encoding
        # Prefer model-based encoding lookup if a model is provided.
        self.tokenizer = (
            tiktoken.encoding_for_model(model)
            if model
            else tiktoken.get_encoding(encoding)
        )

    def encode(self, text: str) -> list[int]:
        """Encode text into a list of token ids.

        Crawled text is untrusted and may contain literals that look like
        special tokens (e.g. ``<|endoftext|>``). With tiktoken's default
        ``disallowed_special="all"`` such text raises ``ValueError``; passing
        an empty tuple makes those literals encode as ordinary text.
        """
        return self.tokenizer.encode(text, disallowed_special=())

    def decode(self, tokens: list[int]) -> str:
        """Decode a list of token ids back into a string."""
        return self.tokenizer.decode(tokens)

    def count_tokens(
        self, title: str, body: str, categories: list[str]
    ) -> "TokenStatistics":
        """Return token counts for the provided article parts.

        The excerpt count is computed on the first 200 characters of the body
        to give a quick estimate of a short preview's token length. Categories
        are joined with ", " before counting; an empty or missing list counts
        as an empty string.
        """
        # Lazy %-formatting: the message is only built if the record is emitted.
        logging.info("[Tokenizer] tokenizing %s...", title)
        return TokenStatistics(
            title=len(self.encode(title)),
            body=len(self.encode(body)),
            excerpt=len(self.encode(body[:200])),
            categories=len(self.encode(", ".join(categories or []))),
        )
|
||||
Generated
+57
@@ -62,6 +62,7 @@ dependencies = [
|
||||
{ name = "readability-lxml" },
|
||||
{ name = "rq" },
|
||||
{ name = "selectolax" },
|
||||
{ name = "tiktoken" },
|
||||
{ name = "trafilatura" },
|
||||
{ name = "typer" },
|
||||
{ name = "uv-build" },
|
||||
@@ -86,6 +87,7 @@ requires-dist = [
|
||||
{ name = "readability-lxml", specifier = ">=0.8.1" },
|
||||
{ name = "rq", specifier = ">=2.5.0" },
|
||||
{ name = "selectolax", specifier = ">=0.3.20" },
|
||||
{ name = "tiktoken", specifier = ">=0.12.0" },
|
||||
{ name = "trafilatura", specifier = ">=1.7.0" },
|
||||
{ name = "typer", specifier = ">=0.16.1" },
|
||||
{ name = "uv-build", specifier = ">=0.8.12,<0.9.0" },
|
||||
@@ -632,6 +634,21 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/cf/3e/7d7ac6fd085023312421e0d69dfabdfb28e116e513fadbe9afe710c01893/regex-2025.9.1-cp314-cp314-win_arm64.whl", hash = "sha256:f46d525934871ea772930e997d577d48c6983e50f206ff7b66d4ac5f8941e993", size = 271860, upload-time = "2025-09-01T22:09:42.413Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "requests"
|
||||
version = "2.32.5"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "certifi" },
|
||||
{ name = "charset-normalizer" },
|
||||
{ name = "idna" },
|
||||
{ name = "urllib3" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rich"
|
||||
version = "14.1.0"
|
||||
@@ -771,6 +788,46 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f7/45/8c4ebc0c460e6ec38e62ab245ad3c7fc10b210116cea7c16d61602aa9558/stevedore-5.4.1-py3-none-any.whl", hash = "sha256:d10a31c7b86cba16c1f6e8d15416955fc797052351a56af15e608ad20811fcfe", size = 49533, upload-time = "2025-02-20T14:03:55.849Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tiktoken"
|
||||
version = "0.12.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "regex" },
|
||||
{ name = "requests" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/00/61/441588ee21e6b5cdf59d6870f86beb9789e532ee9718c251b391b70c68d6/tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3", size = 1050802, upload-time = "2025-10-06T20:22:00.96Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1f/05/dcf94486d5c5c8d34496abe271ac76c5b785507c8eae71b3708f1ad9b45a/tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160", size = 993995, upload-time = "2025-10-06T20:22:02.788Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a0/70/5163fe5359b943f8db9946b62f19be2305de8c3d78a16f629d4165e2f40e/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa", size = 1128948, upload-time = "2025-10-06T20:22:03.814Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0c/da/c028aa0babf77315e1cef357d4d768800c5f8a6de04d0eac0f377cb619fa/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be", size = 1151986, upload-time = "2025-10-06T20:22:05.173Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a0/5a/886b108b766aa53e295f7216b509be95eb7d60b166049ce2c58416b25f2a/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a", size = 1194222, upload-time = "2025-10-06T20:22:06.265Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f4/f8/4db272048397636ac7a078d22773dd2795b1becee7bc4922fe6207288d57/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3", size = 1255097, upload-time = "2025-10-06T20:22:07.403Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8e/32/45d02e2e0ea2be3a9ed22afc47d93741247e75018aac967b713b2941f8ea/tiktoken-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697", size = 879117, upload-time = "2025-10-06T20:22:08.418Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ce/76/994fc868f88e016e6d05b0da5ac24582a14c47893f4474c3e9744283f1d5/tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16", size = 1050309, upload-time = "2025-10-06T20:22:10.939Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f6/b8/57ef1456504c43a849821920d582a738a461b76a047f352f18c0b26c6516/tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a", size = 993712, upload-time = "2025-10-06T20:22:12.115Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/72/90/13da56f664286ffbae9dbcfadcc625439142675845baa62715e49b87b68b/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27", size = 1128725, upload-time = "2025-10-06T20:22:13.541Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/05/df/4f80030d44682235bdaecd7346c90f67ae87ec8f3df4a3442cb53834f7e4/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb", size = 1151875, upload-time = "2025-10-06T20:22:14.559Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/22/1f/ae535223a8c4ef4c0c1192e3f9b82da660be9eb66b9279e95c99288e9dab/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e", size = 1194451, upload-time = "2025-10-06T20:22:15.545Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/78/a7/f8ead382fce0243cb625c4f266e66c27f65ae65ee9e77f59ea1653b6d730/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25", size = 1253794, upload-time = "2025-10-06T20:22:16.624Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/93/e0/6cc82a562bc6365785a3ff0af27a2a092d57c47d7a81d9e2295d8c36f011/tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f", size = 878777, upload-time = "2025-10-06T20:22:18.036Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/72/05/3abc1db5d2c9aadc4d2c76fa5640134e475e58d9fbb82b5c535dc0de9b01/tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646", size = 1050188, upload-time = "2025-10-06T20:22:19.563Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e3/7b/50c2f060412202d6c95f32b20755c7a6273543b125c0985d6fa9465105af/tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88", size = 993978, upload-time = "2025-10-06T20:22:20.702Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/14/27/bf795595a2b897e271771cd31cb847d479073497344c637966bdf2853da1/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff", size = 1129271, upload-time = "2025-10-06T20:22:22.06Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f5/de/9341a6d7a8f1b448573bbf3425fa57669ac58258a667eb48a25dfe916d70/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830", size = 1151216, upload-time = "2025-10-06T20:22:23.085Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/75/0d/881866647b8d1be4d67cb24e50d0c26f9f807f994aa1510cb9ba2fe5f612/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b", size = 1194860, upload-time = "2025-10-06T20:22:24.602Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b3/1e/b651ec3059474dab649b8d5b69f5c65cd8fcd8918568c1935bd4136c9392/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b", size = 1254567, upload-time = "2025-10-06T20:22:25.671Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/80/57/ce64fd16ac390fafde001268c364d559447ba09b509181b2808622420eec/tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3", size = 921067, upload-time = "2025-10-06T20:22:26.753Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ac/a4/72eed53e8976a099539cdd5eb36f241987212c29629d0a52c305173e0a68/tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365", size = 1050473, upload-time = "2025-10-06T20:22:27.775Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e6/d7/0110b8f54c008466b19672c615f2168896b83706a6611ba6e47313dbc6e9/tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e", size = 993855, upload-time = "2025-10-06T20:22:28.799Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5f/77/4f268c41a3957c418b084dd576ea2fad2e95da0d8e1ab705372892c2ca22/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63", size = 1129022, upload-time = "2025-10-06T20:22:29.981Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4e/2b/fc46c90fe5028bd094cd6ee25a7db321cb91d45dc87531e2bdbb26b4867a/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0", size = 1150736, upload-time = "2025-10-06T20:22:30.996Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/28/c0/3c7a39ff68022ddfd7d93f3337ad90389a342f761c4d71de99a3ccc57857/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a", size = 1194908, upload-time = "2025-10-06T20:22:32.073Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706, upload-time = "2025-10-06T20:22:33.385Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tld"
|
||||
version = "0.13.1"
|
||||
|
||||
Reference in New Issue
Block a user