[crawler]: stabilize import

This commit is contained in:
2025-11-02 21:26:07 +02:00
parent 07bb3992ad
commit c53c0b576b
51 changed files with 441 additions and 685 deletions
@@ -2,7 +2,7 @@ from .date_parser import DateParser
from .http_client import BaseHttpClient, SyncHttpClient, AsyncHttpClient
from .open_graph import OpenGraphProvider
from .persistence import BasePersistor, CsvPersistor, JsonPersistor
from .user_agents import UserAgentProvider
from .user_agents import UserAgents
from .tokenizer import Tokenizer
HttpClient = SyncHttpClient
@@ -14,7 +14,7 @@ __all__ = [
"AsyncHttpClient",
"HttpClient",
"OpenGraphProvider",
"UserAgentProvider",
"UserAgents",
"BasePersistor",
"CsvPersistor",
"JsonPersistor",
@@ -8,7 +8,7 @@ from typing import Any, Optional, TypeAlias
import httpx
from basango.core.config import ClientConfig
from basango.services.user_agents import UserAgentProvider
from basango.services.user_agents import UserAgents
HttpHeaders: TypeAlias = dict[str, str] | None
HttpParams: TypeAlias = dict[str, Any] | None
@@ -20,13 +20,13 @@ TRANSIENT_STATUSES = (429, 500, 502, 503, 504)
@dataclass
class BaseHttpClient(ABC):
client_config: ClientConfig
user_agent_provider: UserAgentProvider | None = None
user_agent_provider: UserAgents | None = None
default_headers: HttpHeaders = None
_user_agent: str = field(init=False, repr=False)
_headers: dict[str, str] = field(init=False, repr=False)
def __post_init__(self) -> None:
provider = self.user_agent_provider or UserAgentProvider(
provider = self.user_agent_provider or UserAgents(
rotate=self.client_config.rotate,
fallback=self.client_config.user_agent,
)
@@ -6,7 +6,7 @@ import trafilatura
from basango.core.config import ClientConfig
from basango.services.http_client import SyncHttpClient
from basango.services.user_agents import UserAgentProvider
from basango.services.user_agents import UserAgents
@dataclass
@@ -19,7 +19,7 @@ class OpenGraphObject:
class OpenGraphProvider:
def __init__(
self, user_agent_provider: UserAgentProvider = UserAgentProvider(rotate=False)
self, user_agent_provider: UserAgents = UserAgents(rotate=False)
) -> None:
self._user_agent = user_agent_provider.og()
self._http_client = SyncHttpClient(
@@ -3,7 +3,7 @@ from dataclasses import dataclass
@dataclass
class UserAgentProvider:
class UserAgents:
USER_AGENTS = [
"Mozilla/5.0 (iPhone; CPU iPhone OS 10_4_8; like Mac OS X) AppleWebKit/603.39 (KHTML, like Gecko) Chrome/52.0.3638.271 Mobile Safari/537.5",
"Mozilla/5.0 (Linux; U; Linux x86_64; en-US) Gecko/20130401 Firefox/52.7",