fix: async queue

commit e105ff233f
parent 497633fb61
Date: 2025-10-06 20:00:54 +02:00
16 changed files with 88 additions and 60 deletions
+1
@@ -9,6 +9,7 @@ var/
 # Python-generated files
 __pycache__/
+.pytest_cache/
 *.py[oc]
 build/
 dist/
+33 -8
@@ -1,13 +1,12 @@
-from __future__ import annotations
-
 from typing import List, Optional
+from enum import Enum

 import typer

 from basango.core.config import CrawlerConfig
 from basango.core.config_manager import ConfigManager
 from basango.domain import DateRange, PageRange, UpdateDirection
-from basango.services import CsvPersistor
+from basango.services import CsvPersistor, JsonPersistor
 from basango.services.crawler.async_api import (
     QueueSettings,
     schedule_async_crawl,
@@ -19,6 +18,12 @@ from basango.services.crawler.wordpress_crawler import WordpressCrawler
 app = typer.Typer(no_args_is_help=True, add_completion=False)

+
+class QueueName(str, Enum):
+    listing = "listing"
+    articles = "articles"
+    processed = "processed"
+

 @app.command("crawl")
 def crawl_cmd(
     source_id: str = typer.Option(
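Typing the option with this enum makes Typer validate values at parse time instead of letting bad queue names reach the worker. A minimal standalone sketch of the mechanism (hypothetical command, not part of this commit):

    from enum import Enum
    from typing import List, Optional

    import typer

    class QueueName(str, Enum):
        listing = "listing"
        articles = "articles"
        processed = "processed"

    app = typer.Typer()

    @app.command()
    def worker(
        queue: Optional[List[QueueName]] = typer.Option(None, "--queue", "-q"),
    ) -> None:
        # Typer rejects values outside the enum with a usage error before
        # this body runs; accepted values arrive as QueueName members.
        typer.echo([q.value for q in queue] if queue else None)

    if __name__ == "__main__":
        app()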
@@ -79,7 +84,11 @@ def crawl_cmd(
         CsvPersistor(
             data_dir=pipeline.paths.data,
             source_id=str(source_identifier),
-        )
+        ),
+        JsonPersistor(
+            data_dir=pipeline.paths.data,
+            source_id=str(source_identifier),
+        ),
     ]

     for crawler in crawlers:
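The crawl command now fans each article out to both persistors. For context, the shared interface they implement, reconstructed from the signatures visible later in this diff (a sketch, not the exact source):

    from abc import ABC, abstractmethod
    from typing import Any, Mapping

    class BasePersistor(ABC):
        """Contract implied by the diff: one parsed article per call."""

        @abstractmethod
        def persist(self, article: Mapping[str, Any]) -> None: ...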
@@ -95,11 +104,22 @@ def crawl_cmd(

 @app.command("worker")
 def worker_cmd(
-    queue: Optional[List[str]] = typer.Option(
+    queue: Optional[List[QueueName]] = typer.Option(
         None,
         "--queue",
         "-q",
-        help="Queue name(s) (without prefix). Provide multiple times to listen to more than one queue.",
+        help=(
+            "Queue name(s) (without prefix). Choices: listing, articles, processed. "
+            "Provide multiple times to listen to more than one queue."
+        ),
+    ),
+    simple: bool = typer.Option(
+        False,
+        "--simple/--no-simple",
+        help=(
+            "Run jobs in-process using RQ SimpleWorker (no forking). "
+            "Recommended on macOS to avoid fork-related crashes."
+        ),
     ),
     burst: bool = typer.Option(
         False,
@@ -125,5 +145,10 @@ def worker_cmd(
     manager.setup_logging(pipeline)
     settings = QueueSettings(redis_url=redis_url) if redis_url else QueueSettings()
-    queue_names = list(queue) if queue else None
-    start_worker(queue_names=queue_names, settings=settings, burst=burst)
+    queue_names = [q.value for q in queue] if queue else None
+    start_worker(
+        queue_names=queue_names,
+        settings=settings,
+        burst=burst,
+        simple=simple,
+    )
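A hedged usage sketch for the new worker options; the Typer app's import path is an assumption, and the flag spellings follow the diff above plus Typer's defaults for boolean options:

    from typer.testing import CliRunner

    from basango.cli import app  # import path assumed, not shown in this diff

    runner = CliRunner()
    # Listen on two queues, run jobs in-process (SimpleWorker), drain and exit.
    result = runner.invoke(
        app,
        ["worker", "-q", "listing", "-q", "articles", "--simple", "--burst"],
    )
    print(result.output)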
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import os
 from dataclasses import dataclass, field
 from typing import Iterable
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 from dataclasses import asdict, dataclass, fields
 from typing import Any, Mapping
@@ -1,12 +1,10 @@
-from __future__ import annotations
-
 import logging
 from typing import Any

 from basango.core.config import CrawlerConfig
 from basango.core.config_manager import ConfigManager
 from basango.domain import DateRange, PageRange, SourceKind, UpdateDirection
-from basango.services import CsvPersistor
+from basango.services import CsvPersistor, JsonPersistor, ApiPersistor
 from basango.services.crawler.html_crawler import HtmlCrawler
 from basango.services.crawler.wordpress_crawler import WordpressCrawler
@@ -101,7 +99,11 @@ def collect_article(payload: dict[str, Any]) -> dict[str, Any] | None:
         CsvPersistor(
             data_dir=pipeline.paths.data,
             source_id=str(source_identifier),
-        )
+        ),
+        JsonPersistor(
+            data_dir=pipeline.paths.data,
+            source_id=str(source_identifier),
+        ),
     ]

     queue_manager = QueueManager()
@@ -124,7 +126,6 @@ def collect_article(payload: dict[str, Any]) -> dict[str, Any] | None:
         )

         article = None
-    if article:
-        queue_manager.enqueue_processed(
-            ProcessedTaskPayload(
-                source_id=data.source_id,
+    queue_manager.enqueue_processed(
+        ProcessedTaskPayload(
+            source_id=data.source_id,
@@ -132,16 +133,22 @@ def collect_article(payload: dict[str, Any]) -> dict[str, Any] | None:
-                article=article,
-            )
-        )
+            article=article,
+        )
+    )
+    if article:
         logger.info(
             "Persisted article %s and forwarded to processed queue",
             article.get("link"),
         )
+    else:
+        logger.info("Persisted article and forwarded to processed queue")
     return article


 def forward_for_processing(payload: dict[str, Any]) -> dict[str, Any] | None:
     data = ProcessedTaskPayload.from_dict(payload)
+    manager = ConfigManager()
+    pipeline = manager.get(data.env)
     article = dict(data.article) if data.article is not None else None
     if article is None:
         logger.info(
@@ -153,7 +160,13 @@ def forward_for_processing(payload: dict[str, Any]) -> dict[str, Any] | None:
         data.source_id,
         article.get("link"),
     )
-    return article
+
+    persistor = ApiPersistor(
+        endpoint="http://localhost:8000/api/articles",
+        client_config=pipeline.fetch.client,
+    )
+    persistor.persist(article)
+    logger.info("Forwarded article %s to API", article.get("link"))


 def _collect_html_listing(
@@ -1,9 +1,7 @@
-from __future__ import annotations
-
 import logging
 from typing import Sequence

-from rq import Queue, Worker
+from rq import Queue, Worker, SimpleWorker

 from .queue import QueueManager, QueueSettings
@@ -16,6 +14,7 @@ def start_worker(
     *,
     settings: QueueSettings | None = None,
     burst: bool = False,
+    simple: bool = False,
 ) -> None:
     manager = QueueManager(settings=settings)
     if queue_names is None or not list(queue_names):
@@ -24,6 +23,11 @@ def start_worker(
     resolved = [manager.queue_name(name) for name in queue_names]
     queues = [Queue(name, connection=manager.connection) for name in resolved]
-    logger.info("Starting RQ worker for queues %s", ", ".join(resolved))
-    worker = Worker(queues, connection=manager.connection)
+    worker_cls = SimpleWorker if simple else Worker
+    logger.info(
+        "Starting RQ %s for queues %s",
+        worker_cls.__name__,
+        ", ".join(resolved),
+    )
+    worker = worker_cls(queues, connection=manager.connection)
     worker.work(burst=burst)
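The new flag only switches between two stock RQ worker classes. A minimal standalone sketch of the difference (queue name and Redis URL are placeholders):

    from redis import Redis
    from rq import Queue, SimpleWorker, Worker

    connection = Redis.from_url("redis://localhost:6379/0")  # placeholder
    queues = [Queue("articles", connection=connection)]

    def run(simple: bool = False, burst: bool = True) -> None:
        # Worker forks a child process per job; SimpleWorker executes jobs in
        # the current process, avoiding fork()-related crashes on macOS.
        worker_cls = SimpleWorker if simple else Worker
        worker_cls(queues, connection=connection).work(burst=burst)

    run(simple=True)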
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 from importlib import import_module

 _async_queue = import_module("basango.services.crawler.async.queue")
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import asyncio
 from dataclasses import dataclass, field
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import random
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import time
 from dataclasses import dataclass, field
@@ -1,9 +1,8 @@
-from __future__ import annotations
-
 import logging
 from dataclasses import dataclass
 from typing import Any, Mapping

+from basango.core.config import ClientConfig
 from basango.services.http_client import SyncHttpClient

 from .base_persistor import BasePersistor
@@ -12,17 +11,15 @@ from .base_persistor import BasePersistor
 @dataclass
 class ApiPersistor(BasePersistor):
     endpoint: str
-    http_client: SyncHttpClient
-    headers: dict[str, str] | None = None
+    client_config: ClientConfig
     raise_for_status: bool = True

+    def __post_init__(self) -> None:
+        self.client = SyncHttpClient(client_config=self.client_config)
+
     def persist(self, article: Mapping[str, Any]) -> None:
         try:
-            response = self.http_client.post(
-                self.endpoint,
-                json=article,
-                headers=self.headers,
-            )
+            response = self.client.post(self.endpoint, json=article)
             if self.raise_for_status:
                 response.raise_for_status()
         except Exception as exc:  # noqa: BLE001
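A hedged sketch of the new call pattern; the diff confirms ApiPersistor is importable from basango.services, but constructing ClientConfig with defaults is an assumption:

    from basango.core.config import ClientConfig
    from basango.services import ApiPersistor

    persistor = ApiPersistor(
        endpoint="http://localhost:8000/api/articles",
        client_config=ClientConfig(),  # assumed constructible with defaults
    )
    # __post_init__ builds the SyncHttpClient, so callers no longer pass one in.
    persistor.persist({"link": "https://example.com/post", "title": "Example"})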
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 from abc import ABC, abstractmethod
 from typing import Mapping, Any
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import csv
 import json
 from dataclasses import dataclass, field
@@ -44,7 +42,12 @@ class CsvPersistor(BasePersistor):
         with self._file_path.open(
             "a", newline="", encoding=self.encoding
         ) as handle:
-            writer = csv.DictWriter(handle, fieldnames=self.fieldnames)
+            writer = csv.DictWriter(
+                handle,
+                fieldnames=self.fieldnames,
+                quoting=csv.QUOTE_ALL,
+                lineterminator="\n",
+            )
             if needs_header:
                 writer.writeheader()
                 self._header_written = True
@@ -61,7 +64,10 @@ class CsvPersistor(BasePersistor):
         if metadata is None or isinstance(metadata, str):
             serialised_metadata = metadata
         else:
-            serialised_metadata = json.dumps(metadata, ensure_ascii=False)
+            # JSON-encode metadata to a string that is CSV-safe; csv module will quote it
+            serialised_metadata = json.dumps(
+                metadata, ensure_ascii=True, separators=(",", ":"), sort_keys=True
+            )

         record = {field: article.get(field) for field in self.fieldnames}
         record["categories"] = serialised_categories
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import json
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -251,10 +251,9 @@ class TestHtmlCrawler:
         # Should extract the first number found (2024)
         assert result == 2024

-    def test_supports_html_source_kind(self, html_crawler):
+    def test_supports_html_source_kind(self):
         """Test that supports method returns True for HTML source kind."""
-        assert html_crawler.supports(SourceKind.HTML) is True
-        assert html_crawler.supports(SourceKind.WORDPRESS) is False
+        assert HtmlCrawler.supports() is SourceKind.HTML

     def test_get_pagination_integration(self, html_crawler):
         """Integration test for get_pagination calling get_last_page."""
@@ -210,10 +210,9 @@ class TestWordPressCrawler:
         # Should be sorted by ID: 1, 2, 3
         assert result == "alpha,beta,charlie"

-    def test_supports_wordpress_source_kind(self, wordpress_crawler):
+    def test_supports_wordpress_source_kind(self):
         """Test supports method returns True for WordPress source kind."""
-        assert wordpress_crawler.supports(SourceKind.WORDPRESS) is True
-        assert wordpress_crawler.supports(SourceKind.HTML) is False
+        assert WordpressCrawler.supports() is SourceKind.WORDPRESS

     @pytest.mark.parametrize(
         "pages,posts,expected_start,expected_end",