[crawler]: stabilize import
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
# This file is the entry point to configure your own services.
|
||||
# This file is the entry point to configure your own services.
|
||||
# Files in the packages/ subdirectory configure your dependencies.
|
||||
|
||||
# Put parameters here that don't need to change on each machine where the app is deployed
|
||||
@@ -7,10 +7,10 @@ parameters:
|
||||
basango_notification_email: '%env(BASANGO_NOTIFICATION_EMAIL)%'
|
||||
|
||||
services:
|
||||
# default configuration for services in *this* file
|
||||
# default configuration for services in *this* file
|
||||
_defaults:
|
||||
autowire: true # Automatically injects dependencies in your services.
|
||||
autoconfigure: true # Automatically registers your services as commands, event subscribers, etc.
|
||||
autowire: true # Automatically injects dependencies in your services.
|
||||
autoconfigure: true # Automatically registers your services as commands, event subscribers, etc.
|
||||
bind:
|
||||
$projectDir: '%kernel.project_dir%'
|
||||
$crawlingNotificationEmail: '%basango_notification_email%'
|
||||
@@ -42,7 +42,7 @@ services:
|
||||
tags:
|
||||
- { name: monolog.formatter }
|
||||
|
||||
# makes classes in src/ available to be used as services
|
||||
# makes classes in src/ available to be used as services
|
||||
# this creates a service per class whose id is the fully-qualified class name
|
||||
Basango\:
|
||||
resource: '../src/'
|
||||
|
||||
@@ -2,7 +2,7 @@ from .date_parser import DateParser
|
||||
from .http_client import BaseHttpClient, SyncHttpClient, AsyncHttpClient
|
||||
from .open_graph import OpenGraphProvider
|
||||
from .persistence import BasePersistor, CsvPersistor, JsonPersistor
|
||||
from .user_agents import UserAgentProvider
|
||||
from .user_agents import UserAgents
|
||||
from .tokenizer import Tokenizer
|
||||
|
||||
HttpClient = SyncHttpClient
|
||||
@@ -14,7 +14,7 @@ __all__ = [
|
||||
"AsyncHttpClient",
|
||||
"HttpClient",
|
||||
"OpenGraphProvider",
|
||||
"UserAgentProvider",
|
||||
"UserAgents",
|
||||
"BasePersistor",
|
||||
"CsvPersistor",
|
||||
"JsonPersistor",
|
||||
|
||||
@@ -8,7 +8,7 @@ from typing import Any, Optional, TypeAlias
|
||||
import httpx
|
||||
|
||||
from basango.core.config import ClientConfig
|
||||
from basango.services.user_agents import UserAgentProvider
|
||||
from basango.services.user_agents import UserAgents
|
||||
|
||||
HttpHeaders: TypeAlias = dict[str, str] | None
|
||||
HttpParams: TypeAlias = dict[str, Any] | None
|
||||
@@ -20,13 +20,13 @@ TRANSIENT_STATUSES = (429, 500, 502, 503, 504)
|
||||
@dataclass
|
||||
class BaseHttpClient(ABC):
|
||||
client_config: ClientConfig
|
||||
user_agent_provider: UserAgentProvider | None = None
|
||||
user_agent_provider: UserAgents | None = None
|
||||
default_headers: HttpHeaders = None
|
||||
_user_agent: str = field(init=False, repr=False)
|
||||
_headers: dict[str, str] = field(init=False, repr=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
provider = self.user_agent_provider or UserAgentProvider(
|
||||
provider = self.user_agent_provider or UserAgents(
|
||||
rotate=self.client_config.rotate,
|
||||
fallback=self.client_config.user_agent,
|
||||
)
|
||||
|
||||
@@ -6,7 +6,7 @@ import trafilatura
|
||||
|
||||
from basango.core.config import ClientConfig
|
||||
from basango.services.http_client import SyncHttpClient
|
||||
from basango.services.user_agents import UserAgentProvider
|
||||
from basango.services.user_agents import UserAgents
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -19,7 +19,7 @@ class OpenGraphObject:
|
||||
|
||||
class OpenGraphProvider:
|
||||
def __init__(
|
||||
self, user_agent_provider: UserAgentProvider = UserAgentProvider(rotate=False)
|
||||
self, user_agent_provider: UserAgents = UserAgents(rotate=False)
|
||||
) -> None:
|
||||
self._user_agent = user_agent_provider.og()
|
||||
self._http_client = SyncHttpClient(
|
||||
|
||||
@@ -3,7 +3,7 @@ from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class UserAgentProvider:
|
||||
class UserAgents:
|
||||
USER_AGENTS = [
|
||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 10_4_8; like Mac OS X) AppleWebKit/603.39 (KHTML, like Gecko) Chrome/52.0.3638.271 Mobile Safari/537.5",
|
||||
"Mozilla/5.0 (Linux; U; Linux x86_64; en-US) Gecko/20130401 Firefox/52.7",
|
||||
|
||||
Reference in New Issue
Block a user