[crawler]: stabilize import

This commit is contained in:
2025-11-02 21:26:07 +02:00
parent 07bb3992ad
commit c53c0b576b
51 changed files with 441 additions and 685 deletions
+5 -5
View File
@@ -1,4 +1,4 @@
# This file is the entry point to configure your own services.
# This file is the entry point to configure your own services.
# Files in the packages/ subdirectory configure your dependencies.
# Put parameters here that don't need to change on each machine where the app is deployed
@@ -7,10 +7,10 @@ parameters:
basango_notification_email: '%env(BASANGO_NOTIFICATION_EMAIL)%'
services:
# default configuration for services in *this* file
# default configuration for services in *this* file
_defaults:
autowire: true # Automatically injects dependencies in your services.
autoconfigure: true # Automatically registers your services as commands, event subscribers, etc.
autowire: true # Automatically injects dependencies in your services.
autoconfigure: true # Automatically registers your services as commands, event subscribers, etc.
bind:
$projectDir: '%kernel.project_dir%'
$crawlingNotificationEmail: '%basango_notification_email%'
@@ -42,7 +42,7 @@ services:
tags:
- { name: monolog.formatter }
# makes classes in src/ available to be used as services
# makes classes in src/ available to be used as services
# this creates a service per class whose id is the fully-qualified class name
Basango\:
resource: '../src/'
@@ -2,7 +2,7 @@ from .date_parser import DateParser
from .http_client import BaseHttpClient, SyncHttpClient, AsyncHttpClient
from .open_graph import OpenGraphProvider
from .persistence import BasePersistor, CsvPersistor, JsonPersistor
from .user_agents import UserAgentProvider
from .user_agents import UserAgents
from .tokenizer import Tokenizer
HttpClient = SyncHttpClient
@@ -14,7 +14,7 @@ __all__ = [
"AsyncHttpClient",
"HttpClient",
"OpenGraphProvider",
"UserAgentProvider",
"UserAgents",
"BasePersistor",
"CsvPersistor",
"JsonPersistor",
@@ -8,7 +8,7 @@ from typing import Any, Optional, TypeAlias
import httpx
from basango.core.config import ClientConfig
from basango.services.user_agents import UserAgentProvider
from basango.services.user_agents import UserAgents
HttpHeaders: TypeAlias = dict[str, str] | None
HttpParams: TypeAlias = dict[str, Any] | None
@@ -20,13 +20,13 @@ TRANSIENT_STATUSES = (429, 500, 502, 503, 504)
@dataclass
class BaseHttpClient(ABC):
client_config: ClientConfig
user_agent_provider: UserAgentProvider | None = None
user_agent_provider: UserAgents | None = None
default_headers: HttpHeaders = None
_user_agent: str = field(init=False, repr=False)
_headers: dict[str, str] = field(init=False, repr=False)
def __post_init__(self) -> None:
provider = self.user_agent_provider or UserAgentProvider(
provider = self.user_agent_provider or UserAgents(
rotate=self.client_config.rotate,
fallback=self.client_config.user_agent,
)
@@ -6,7 +6,7 @@ import trafilatura
from basango.core.config import ClientConfig
from basango.services.http_client import SyncHttpClient
from basango.services.user_agents import UserAgentProvider
from basango.services.user_agents import UserAgents
@dataclass
@@ -19,7 +19,7 @@ class OpenGraphObject:
class OpenGraphProvider:
def __init__(
self, user_agent_provider: UserAgentProvider = UserAgentProvider(rotate=False)
self, user_agent_provider: UserAgents = UserAgents(rotate=False)
) -> None:
self._user_agent = user_agent_provider.og()
self._http_client = SyncHttpClient(
@@ -3,7 +3,7 @@ from dataclasses import dataclass
@dataclass
class UserAgentProvider:
class UserAgents:
USER_AGENTS = [
"Mozilla/5.0 (iPhone; CPU iPhone OS 10_4_8; like Mac OS X) AppleWebKit/603.39 (KHTML, like Gecko) Chrome/52.0.3638.271 Mobile Safari/537.5",
"Mozilla/5.0 (Linux; U; Linux x86_64; en-US) Gecko/20130401 Firefox/52.7",