feat: crawling html and wordpress sources
@@ -42,11 +42,12 @@ def crawl_cmd(
     )

     crawlers = [
-        HtmlCrawler(crawler_config, pipeline.fetch.client),
-        WordpressCrawler(crawler_config, pipeline.fetch.client),
+        HtmlCrawler,
+        WordpressCrawler,
     ]

     for crawler in crawlers:
-        if crawler.supports(source.source_kind):
+        if crawler.supports() == source.source_kind:
+            crawler = crawler(crawler_config, pipeline.fetch.client)
             crawler.fetch()
             break
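
The dispatch in crawl_cmd now iterates over crawler classes, asks each one which SourceKind it supports via the new static supports(), and only instantiates the matching class. A minimal self-contained sketch of that pattern (stand-in classes and values, not the project code):

    from enum import StrEnum

    class SourceKind(StrEnum):
        HTML = "html"
        WORDPRESS = "wordpress"

    class HtmlCrawler:
        def __init__(self, crawler_config, client):
            self.crawler_config = crawler_config
            self.client = client

        @staticmethod
        def supports() -> SourceKind:
            return SourceKind.HTML

        def fetch(self) -> None:
            print("fetching an HTML source")

    crawlers = [HtmlCrawler]
    source_kind = SourceKind.HTML          # stands in for source.source_kind
    for crawler in crawlers:
        if crawler.supports() == source_kind:
            crawler = crawler(None, None)  # real code passes crawler_config and pipeline.fetch.client
            crawler.fetch()
            break
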
@@ -1,4 +1,5 @@
 import logging
+import sys
 from pathlib import Path
 from typing import Optional, Union, Dict

@@ -8,6 +9,15 @@ from basango.core.config import PipelineConfig
 from basango.core.project_paths import ProjectPaths


+def _ensure_utf8_stream(stream):
+    try:
+        if hasattr(stream, "reconfigure"):
+            stream.reconfigure(encoding="utf-8", errors="replace")
+    except (AttributeError, ValueError):
+        return stream
+    return stream
+
+
 class ConfigManager:
     def __init__(self, config_path: Optional[Union[str, Path]] = None):
         self.config_path = Path(config_path) if config_path else self._find_config()
@@ -78,9 +88,13 @@ class ConfigManager:
         # Clear existing handlers
         root_logger.handlers.clear()

+        _ensure_utf8_stream(sys.stdout)
+        _ensure_utf8_stream(sys.stderr)
         # Console handler
         if cfg.logging.console_logging:
-            console_handler = logging.StreamHandler()
+            console_handler = logging.StreamHandler(
+                stream=_ensure_utf8_stream(sys.stderr)
+            )
             console_handler.setFormatter(formatter)
             root_logger.addHandler(console_handler)

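
For context, the console handler now writes to a stream that has been switched to UTF-8 with replacement errors; TextIOWrapper.reconfigure() has been available since Python 3.7 and the helper falls back silently when a stream does not support it. A minimal standalone sketch of the same wiring (formatter string and log message are invented):

    import logging
    import sys

    if hasattr(sys.stderr, "reconfigure"):
        sys.stderr.reconfigure(encoding="utf-8", errors="replace")

    formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
    console_handler = logging.StreamHandler(stream=sys.stderr)
    console_handler.setFormatter(formatter)

    root_logger = logging.getLogger()
    root_logger.addHandler(console_handler)
    root_logger.setLevel(logging.INFO)
    root_logger.info("café ☕")  # non-ASCII no longer breaks on consoles with a narrow default encoding
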
@@ -93,6 +107,7 @@ class ConfigManager:
                 log_file_path,
                 maxBytes=cfg.logging.max_log_size,
                 backupCount=cfg.logging.backup_count,
+                encoding="utf-8",
             )
             file_handler.setFormatter(formatter)
             root_logger.addHandler(file_handler)
@@ -10,7 +10,7 @@ class SourceKind(StrEnum):


 class SourceDate(BaseModel):
-    format: str = "%Y-%m-%dT%H:%M:%S"
+    format: str = "%Y-%m-%d %H:%M"
     pattern: Optional[str] = None
     replacement: Optional[str] = None

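
The default date format drops the ISO-8601 "T" separator and the seconds component. A quick standard-library check of what each default parses (example dates are invented):

    from datetime import datetime

    old_fmt = "%Y-%m-%dT%H:%M:%S"
    new_fmt = "%Y-%m-%d %H:%M"

    print(datetime.strptime("2024-05-01T08:30:00", old_fmt))  # 2024-05-01 08:30:00
    print(datetime.strptime("2024-05-01 08:30", new_fmt))     # 2024-05-01 08:30:00
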
@@ -1,13 +1,14 @@
 import logging
 from abc import ABC, abstractmethod
-from typing import Optional
+from dataclasses import asdict, is_dataclass
+from typing import Optional, Any, Dict, List

 from bs4 import BeautifulSoup

 from basango.core.config import CrawlerConfig, ClientConfig
 from basango.domain import DateRange, SourceKind, PageRange
 from basango.domain.exception import ArticleOutOfRange
-from basango.services import HttpClient
+from basango.services import HttpClient, DateParser, OpenGraphProvider


 class BaseCrawler(ABC):
@@ -17,6 +18,9 @@ class BaseCrawler(ABC):
         self.config = crawler_config
         self.source = crawler_config.source
         self.client = HttpClient(client_config=client_config)
+        self.results: List[Dict[str, Any]] = []
+        self.date_parser = DateParser()
+        self.open_graph = OpenGraphProvider()

     @abstractmethod
     def fetch(self) -> None:
@@ -29,6 +33,34 @@ class BaseCrawler(ABC):
         response = self.client.get(url).text
         return BeautifulSoup(response, "html.parser")

+    def record_article(
+        self,
+        *,
+        title: str,
+        link: str,
+        body: str,
+        categories: List[str],
+        timestamp: int,
+        metadata: Any = None,
+    ) -> None:
+        if metadata is None:
+            metadata_value = None
+        elif is_dataclass(metadata) and not isinstance(metadata, type):
+            metadata_value = asdict(metadata)
+        else:
+            metadata_value = metadata
+        article = {
+            "title": title,
+            "link": link,
+            "body": body,
+            "categories": categories,
+            "source": getattr(self.source, "source_id", None),
+            "timestamp": timestamp,
+            "metadata": metadata_value,
+        }
+        self.results.append(article)
+        logging.info(f"> {title} [saved]")
+
     @abstractmethod
     def fetch_one(self, html: str, date_range: Optional[DateRange] = None) -> None:
         pass
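
record_article converts dataclass metadata to a plain dict with asdict() and appends the result to self.results. A small standalone sketch of just that normalization step (OpenGraphMeta is an illustrative stand-in for whatever OpenGraphProvider returns, not a class defined in this commit):

    from dataclasses import asdict, dataclass, is_dataclass

    @dataclass
    class OpenGraphMeta:
        image: str
        description: str

    def normalize(metadata):
        if metadata is None:
            return None
        if is_dataclass(metadata) and not isinstance(metadata, type):
            return asdict(metadata)
        return metadata

    print(normalize(OpenGraphMeta(image="https://example.org/og.jpg", description="…")))
    # {'image': 'https://example.org/og.jpg', 'description': '…'}
    print(normalize({"already": "a dict"}))  # non-dataclass values pass through unchanged
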
@@ -40,8 +72,9 @@ class BaseCrawler(ABC):
     def get_last_page(self) -> int:
         return 1

+    @staticmethod
     @abstractmethod
-    def supports(self, source_kind: SourceKind) -> bool:
+    def supports() -> SourceKind:
         pass

     @classmethod
@@ -1,10 +1,15 @@
+import logging
 import re
+from datetime import datetime, timezone
 from typing import Optional, cast, override
-from urllib.parse import urlparse, parse_qs
+from urllib.parse import parse_qs, urljoin, urlparse

+from bs4 import BeautifulSoup, Tag
+
 from basango.core.config import CrawlerConfig, ClientConfig
 from basango.core.config.source_config import HtmlSourceConfig
-from basango.domain import PageRange, SourceKind, DateRange
+from basango.domain import DateRange, PageRange, SourceKind
+from basango.domain.exception import ArticleOutOfRange
 from basango.services.crawler.base_crawler import BaseCrawler


@@ -17,16 +22,114 @@ class HtmlCrawler(BaseCrawler):
             raise ValueError("HtmlCrawler requires a source of kind HTML")

         self.source = cast(HtmlSourceConfig, self.source)
+        self._current_article_url: Optional[str] = None

     @override
     def fetch(self) -> None:
         self.initialize()
-        page = self.config.page_range or self.get_pagination()
-        print(page)
+        page_range = self.config.page_range or self.get_pagination()
+        date_range = self.config.date_range

+        article_selector = self.source.source_selectors.articles
+        if not article_selector:
+            logging.error(
+                "No article selector configured for HTML source %s",
+                self.source.source_id,
+            )
+            return
+
+        stop = False
+        for page_number in range(page_range.start, page_range.end + 1):
+            page_url = self._build_page_url(page_number)
+            try:
+                soup = self.crawl(page_url, page_number)
+            except Exception as exc:  # noqa: BLE001
+                logging.error(
+                    "> page %s => %s [failed]",
+                    page_number,
+                    exc,
+                )
+                continue
+
+            articles = soup.select(article_selector)
+            if not articles:
+                logging.info("No articles found on page %s", page_number)
+                continue
+
+            for article in articles:
+                try:
+                    self._current_article_url = self._extract_link(article)
+                    target_html = str(article)
+
+                    if self.source.requires_details:
+                        if not self._current_article_url:
+                            logging.debug(
+                                "Skipping article without link for details on page %s",
+                                page_number,
+                            )
+                            continue
+                        try:
+                            detail_soup = self.crawl(self._current_article_url)
+                            target_html = str(detail_soup)
+                        except Exception as detail_exc:  # noqa: BLE001
+                            logging.error(
+                                "Failed to fetch detail page %s: %s",
+                                self._current_article_url,
+                                detail_exc,
+                            )
+                            continue
+
+                    self.fetch_one(target_html, date_range)
+                except ArticleOutOfRange:
+                    logging.info("No more articles to fetch in this range.")
+                    stop = True
+                    break
+                except Exception as exc:  # noqa: BLE001
+                    logging.error(
+                        "Failed to process article on %s: %s",
+                        page_url,
+                        exc,
+                    )
+                finally:
+                    self._current_article_url = None
+
+            if stop:
+                break
+
+        self.completed(self.config.notify)

     @override
     def fetch_one(self, html: str, date_range: Optional[DateRange] = None) -> None:
-        pass
+        soup = BeautifulSoup(html, "html.parser")
+        selectors = self.source.source_selectors
+
+        title = self._extract_text(soup, selectors.article_title) or "Untitled"
+        link = self._current_article_url or self._extract_link(soup)
+        if not link:
+            logging.warning("Skipping article '%s' without link", title)
+            return
+
+        body = self._extract_body(soup, selectors.article_body)
+        categories = self._extract_categories(soup, selectors.article_categories)
+        if not categories and self.config.category:
+            categories = [self.config.category]
+
+        raw_date = self._extract_text(soup, selectors.article_date)
+        timestamp = self._compute_timestamp(raw_date)
+
+        if date_range and not date_range.in_range(timestamp):
+            self.skip(date_range, str(timestamp), title, raw_date or "")
+
+        metadata = self.open_graph.consume_html(html)
+
+        self.record_article(
+            title=title,
+            link=link,
+            body=body,
+            categories=categories,
+            timestamp=timestamp,
+            metadata=metadata,
+        )

     @override
     def get_pagination(self) -> PageRange:
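
For reference, fetch() and fetch_one() read their CSS selectors from source_selectors. The attribute names below come from the code above; the selector values themselves are invented and would normally live in the source configuration:

    selectors = {
        "articles": "article.post",                 # one node per article on a listing page
        "article_link": "h2 a",                     # href/data-href/src is resolved to an absolute URL
        "article_title": "h2 a",
        "article_body": "div.entry-content p",
        "article_categories": "a[rel='category']",
        "article_date": "time.published",
    }
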
@@ -67,6 +170,128 @@ class HtmlCrawler(BaseCrawler):
             return 1
         return 1

+    @staticmethod
     @override
-    def supports(self, source_kind: SourceKind) -> bool:
-        return source_kind == SourceKind.HTML
+    def supports() -> SourceKind:
+        return SourceKind.HTML
+
+    def _build_page_url(self, page: int) -> str:
+        template = self._apply_category(self.source.pagination_template)
+        if "{page}" in template:
+            template = template.format(page=page)
+        elif page > 0:
+            separator = "&" if "?" in template else "?"
+            template = f"{template}{separator}page={page}"
+
+        base = str(self.source.source_url)
+        if not base.endswith("/"):
+            base = f"{base}/"
+        return urljoin(base, template.lstrip("/"))
+
+    def _apply_category(self, template: str) -> str:
+        if "{category}" in template:
+            replacement = self.config.category or ""
+            return template.replace("{category}", replacement)
+        return template
+
+    def _extract_link(self, node: BeautifulSoup | Tag) -> Optional[str]:
+        selector = self.source.source_selectors.article_link
+        if not selector:
+            return None
+
+        target = node.select_one(selector)
+        if not target:
+            return None
+
+        raw_href = target.get("href") or target.get("data-href") or target.get("src")
+        href: Optional[str]
+        if isinstance(raw_href, str):
+            href = raw_href.strip() or None
+        elif isinstance(raw_href, list):
+            href = next(
+                (
+                    item.strip()
+                    for item in raw_href
+                    if isinstance(item, str) and item.strip()
+                ),
+                None,
+            )
+        else:
+            href = None
+        if not href:
+            return None
+        return self._to_absolute_url(href)
+
+    def _to_absolute_url(self, href: str) -> str:
+        base = str(self.source.source_url)
+        if not base.endswith("/"):
+            base = f"{base}/"
+        return urljoin(base, href)
+
+    @staticmethod
+    def _extract_text(
+        node: BeautifulSoup | Tag, selector: Optional[str]
+    ) -> Optional[str]:
+        if not selector:
+            return None
+        target = node.select_one(selector)
+        if not target:
+            return None
+
+        if target.name == "img":
+            for attr in ("alt", "title"):
+                value = target.get(attr)
+                if isinstance(value, str):
+                    stripped = value.strip()
+                    if stripped:
+                        return stripped
+                elif isinstance(value, list):
+                    for item in value:
+                        if isinstance(item, str):
+                            stripped = item.strip()
+                            if stripped:
+                                return stripped
+
+        text = target.get_text(" ", strip=True)
+        return text or None
+
+    @staticmethod
+    def _extract_body(node: BeautifulSoup | Tag, selector: Optional[str]) -> str:
+        if selector:
+            matches = node.select(selector)
+            if matches:
+                parts = [
+                    item.get_text(" ", strip=True)
+                    for item in matches
+                    if item.get_text(strip=True)
+                ]
+                if parts:
+                    return "".join(parts)
+        return node.get_text(" ", strip=True)
+
+    @staticmethod
+    def _extract_categories(
+        node: BeautifulSoup | Tag, selector: Optional[str]
+    ) -> list[str]:
+        if not selector:
+            return []
+
+        values: list[str] = []
+        for item in node.select(selector):
+            text = item.get_text(" ", strip=True)
+            if text:
+                lower = text.lower()
+                if lower not in values:
+                    values.append(lower)
+        return values
+
+    def _compute_timestamp(self, raw_date: Optional[str]) -> int:
+        if not raw_date:
+            return int(datetime.now(timezone.utc).timestamp())
+
+        return self.date_parser.create_timestamp(
+            raw_date.strip(),
+            fmt=self.source.source_date.format,
+            pattern=self.source.source_date.pattern,
+            replacement=self.source.source_date.replacement,
+        )
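
_build_page_url either formats a {page} placeholder in the pagination template or appends a page= query parameter, then resolves the result against the source URL. A standalone sketch with invented URLs and templates:

    from urllib.parse import urljoin

    def build_page_url(source_url: str, template: str, page: int) -> str:
        if "{page}" in template:
            template = template.format(page=page)
        elif page > 0:
            separator = "&" if "?" in template else "?"
            template = f"{template}{separator}page={page}"
        base = source_url if source_url.endswith("/") else f"{source_url}/"
        return urljoin(base, template.lstrip("/"))

    print(build_page_url("https://example.org", "news/page/{page}", 3))  # https://example.org/news/page/3
    print(build_page_url("https://example.org", "news?cat=sport", 2))    # https://example.org/news?cat=sport&page=2
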
@@ -1,8 +1,13 @@
+import json
 import logging
-from typing import Optional, override, cast, Final
+from datetime import datetime, timezone
+from typing import Optional, override, cast, Final, Any

+from bs4 import BeautifulSoup
+
 from basango.core.config import WordPressSourceConfig, CrawlerConfig, ClientConfig
 from basango.domain import SourceKind, PageRange, DateRange
+from basango.domain.exception import ArticleOutOfRange
 from basango.services.crawler.base_crawler import BaseCrawler


@@ -15,6 +20,7 @@ class WordpressCrawler(BaseCrawler):
             raise ValueError("WordpressCrawler requires a source of kind WORDPRESS")

         self.source = cast(WordPressSourceConfig, self.source)
+        self.category_map: dict[int, str] = {}

     POST_QUERY: Final = "_fields=date,slug,link,title.rendered,content.rendered,categories&orderby=date&order=desc"
     CATEGORY_QUERY: Final = (
@@ -23,17 +29,86 @@ class WordpressCrawler(BaseCrawler):
     TOTAL_PAGES_HEADER: Final = "x-wp-totalpages"
     TOTAL_POSTS_HEADER: Final = "x-wp-total"

-    category_map: dict[int, str] = {}

     @override
     def fetch(self) -> None:
         self.initialize()
-        page = self.config.page_range or self.get_pagination()
-        print(page)
+        page_range = self.config.page_range or self.get_pagination()
+        date_range = self.config.date_range

+        stop = False
+        for page_number in range(page_range.start, page_range.end + 1):
+            endpoint = self._posts_endpoint(page_number)
+            try:
+                response = self.client.get(endpoint)
+                payload = response.text
+                articles = json.loads(payload)
+            except Exception as exc:  # noqa: BLE001
+                logging.error(
+                    "> page %s => %s [failed]",
+                    page_number,
+                    exc,
+                )
+                continue
+
+            for article in articles:
+                try:
+                    self.fetch_one(article, date_range)
+                except ArticleOutOfRange:
+                    logging.info("No more articles to fetch in this range.")
+                    stop = True
+                    break
+                except Exception as exc:  # noqa: BLE001
+                    logging.error(
+                        "Failed to process WordPress article on page %s: %s",
+                        page_number,
+                        exc,
+                    )
+            if stop:
+                break
+
+        self.completed(self.config.notify)

     @override
-    def fetch_one(self, html: str, date_range: Optional[DateRange] = None) -> None:
-        pass
+    def fetch_one(self, html: Any, date_range: Optional[DateRange] = None) -> None:
+        try:
+            data = json.loads(html) if isinstance(html, str) else html
+        except json.JSONDecodeError as exc:
+            logging.error("Failed to decode WordPress payload: %s", exc)
+            return
+
+        if not isinstance(data, dict):
+            logging.error("Skipping unexpected WordPress payload: %s", type(data))
+            return
+
+        link = data.get("link")
+        if not link:
+            logging.error("Skipping WordPress article without link")
+            return
+
+        title = BeautifulSoup(
+            data.get("title", {}).get("rendered", ""), "html.parser"
+        ).get_text(" ", strip=True)
+        body = BeautifulSoup(
+            data.get("content", {}).get("rendered", ""), "html.parser"
+        ).get_text(" ", strip=True)
+        timestamp = self._compute_timestamp(data.get("date"))
+
+        categories_value = self._map_categories(data.get("categories", []))
+        categories = [item for item in categories_value.split(",") if item]
+
+        if date_range and not date_range.in_range(timestamp):
+            self.skip(date_range, str(timestamp), title, data.get("date", ""))
+
+        metadata = self.open_graph.consume_url(link)
+
+        self.record_article(
+            title=title or data.get("slug", "Untitled"),
+            link=link,
+            body=body,
+            categories=categories,
+            timestamp=timestamp,
+            metadata=metadata,
+        )

     @override
     def get_pagination(self) -> PageRange:
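
fetch_one receives one post object from the WordPress REST API, already trimmed by the _fields list in POST_QUERY. An example of the shape it expects (all values invented); BeautifulSoup strips the rendered HTML and decodes entities:

    from bs4 import BeautifulSoup

    post = {
        "date": "2024-05-01T08:30:00",
        "slug": "budget-vote-postponed",
        "link": "https://example.org/2024/05/budget-vote-postponed/",
        "title": {"rendered": "Budget vote &#8211; postponed"},
        "content": {"rendered": "<p>Full article text…</p>"},
        "categories": [12, 7],
    }

    title = BeautifulSoup(post["title"]["rendered"], "html.parser").get_text(" ", strip=True)
    body = BeautifulSoup(post["content"]["rendered"], "html.parser").get_text(" ", strip=True)
    print(title)  # Budget vote – postponed
    print(body)   # Full article text…
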
@@ -43,7 +118,7 @@ class WordpressCrawler(BaseCrawler):
         pages = int(response.headers.get(self.TOTAL_PAGES_HEADER, "1"))
         posts = int(response.headers.get(self.TOTAL_POSTS_HEADER, "0"))

-        logging.info(f"WordPress Pagination {posts} posts in {pages} pages")
+        logging.info("WordPress Pagination %s posts in %s pages", posts, pages)
         return PageRange.create(f"1:{pages}")

     def _fetch_categories(self) -> None:
@@ -62,10 +137,34 @@ class WordpressCrawler(BaseCrawler):
             if category in self.category_map
         )

+    def _posts_endpoint(self, page: int) -> str:
+        base = str(self.source.source_url)
+        if not base.endswith("/"):
+            base = f"{base}/"
+        return f"{base}wp-json/wp/v2/posts?{self.POST_QUERY}&page={page}&per_page=100"
+
+    @staticmethod
+    def _compute_timestamp(raw: Optional[str]) -> int:
+        if not raw:
+            return int(datetime.now(timezone.utc).timestamp())
+
+        cleaned = raw.replace("Z", "+00:00")
+        try:
+            dt = datetime.fromisoformat(cleaned)
+        except ValueError:
+            return int(datetime.now(timezone.utc).timestamp())
+
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        return int(dt.timestamp())
+
     @override
     def get_last_page(self) -> int:
         return 1

+    @staticmethod
     @override
-    def supports(self, source_kind: SourceKind) -> bool:
-        return source_kind == SourceKind.WORDPRESS
+    def supports() -> SourceKind:
+        return SourceKind.WORDPRESS

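
WordPress dates come back as ISO-8601 strings, occasionally with a trailing Z; _compute_timestamp normalizes that suffix, parses with datetime.fromisoformat, and treats naive values as UTC. A standalone check of the conversion:

    from datetime import datetime, timezone

    def compute_timestamp(raw):
        if not raw:
            return int(datetime.now(timezone.utc).timestamp())
        cleaned = raw.replace("Z", "+00:00")
        try:
            dt = datetime.fromisoformat(cleaned)
        except ValueError:
            return int(datetime.now(timezone.utc).timestamp())
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return int(dt.timestamp())

    print(compute_timestamp("2024-05-01T08:30:00Z"))  # 1714552200
    print(compute_timestamp("2024-05-01T08:30:00"))   # 1714552200 (naive values default to UTC)
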