Initial commit

This commit is contained in:
2025-10-05 13:55:28 +02:00
commit 68d521677a
767 changed files with 46947 additions and 0 deletions
@@ -0,0 +1,57 @@
from datetime import datetime, timezone
import pytest
from basango.domain import DateRange
def ts(y: int, m: int, d: int, hh: int = 0, mm: int = 0, ss: int = 0) -> int:
    """Return the POSIX timestamp (seconds) for the given UTC wall-clock time."""
    moment = datetime(y, m, d, hh, mm, ss, tzinfo=timezone.utc)
    return int(moment.timestamp())
def test_from_parses_two_dates_with_default_format() -> None:
    """A 'YYYY-MM-DD:YYYY-MM-DD' spec parses to UTC-midnight epoch bounds."""
    parsed = DateRange.create("2024-10-01:2024-10-08")
    assert parsed.start == ts(2024, 10, 1)
    assert parsed.end == ts(2024, 10, 8)
def test_str_and_format_roundtrip() -> None:
    """str() renders raw timestamps; format() renders them back as dates."""
    rng = DateRange.create("2024-10-01:2024-10-02")
    raw_form = f"{ts(2024, 10, 1)}:{ts(2024, 10, 2)}"
    assert str(rng) == raw_form
    assert rng.format("%Y-%m-%d") == "2024-10-01:2024-10-02"
def test_in_range_out_range_inclusive_boundaries() -> None:
    """Both endpoints count as inside; one second beyond either is outside."""
    rng = DateRange.create("2024-10-01:2024-10-02")
    lo = ts(2024, 10, 1)
    hi = ts(2024, 10, 2)
    assert rng.in_range(lo) is True
    assert rng.in_range(hi) is True
    assert rng.out_range(lo - 1) is True
    # End is at 00:00 of end day; times later that day are outside
    assert rng.out_range(ts(2024, 10, 2, 12, 0, 0)) is True
    assert rng.out_range(hi + 1) is True
def test_backward_uses_days_and_next_day_end() -> None:
    """backward() starts `days` before the base date and ends the day after it."""
    anchor = datetime(2024, 10, 31, tzinfo=timezone.utc)
    window = DateRange.backward(date=anchor, days=10)
    assert window.start == ts(2024, 10, 21)
    assert window.end == ts(2024, 11, 1)
def test_from_raises_on_invalid_separator_or_spec() -> None:
    """An empty separator, or a spec lacking the separator, is rejected."""
    bad_cases = (
        ("2024-10-01:2024-10-08", ""),  # separator must be non-empty
        ("2024-10-01", ":"),  # spec must contain the separator
    )
    for spec, sep in bad_cases:
        with pytest.raises(AssertionError):
            DateRange.create(spec, separator=sep)
def test_from_accepts_python_format_string() -> None:
    """A custom strptime format plus a custom separator are both honored."""
    window = DateRange.create("2024/10/01|2024/10/02", fmt="%Y/%m/%d", separator="|")
    assert window.start == ts(2024, 10, 1)
    assert window.end == ts(2024, 10, 2)
@@ -0,0 +1,19 @@
import pytest
from basango.domain import PageRange
def test_it_should_create_page_range():
    """A 'start:end' spec parses into numeric page bounds."""
    parsed = PageRange.create("1:10")
    assert (parsed.start, parsed.end) == (1, 10)
def test_end_page_should_be_greater_than_start_page():
    """A descending spec (end < start) is rejected with an AssertionError."""
    with pytest.raises(AssertionError):
        PageRange.create("10:1")
def test_non_negative_pages():
    """Negative page numbers are rejected with an AssertionError."""
    with pytest.raises(AssertionError):
        PageRange.create("-1:-10")
@@ -0,0 +1,292 @@
from unittest.mock import patch
import pytest
from bs4 import BeautifulSoup
from pydantic import HttpUrl
from basango.core.config import WordPressSourceConfig
from basango.core.config.fetch_config import CrawlerConfig, ClientConfig
from basango.core.config.source_config import HtmlSourceConfig, SourceSelectors
from basango.domain import SourceKind, PageRange
from basango.services.crawler.html_crawler import HtmlCrawler
class TestHtmlCrawler:
    """Test suite for HtmlCrawler.

    All tests stub out network access: `crawl` is patched to return a
    pre-built BeautifulSoup document, so only URL construction, pagination
    parsing, and source-kind validation are exercised.
    """

    # ------------------------------------------------------------------
    # Fixtures
    # ------------------------------------------------------------------

    @pytest.fixture
    def mock_client_config(self):
        # Default client settings; no real HTTP is performed in these tests.
        return ClientConfig()

    @pytest.fixture
    def mock_html_source_config(self):
        # Pagination links are located via the CSS selector below.
        return HtmlSourceConfig(
            source_id="test_source",
            source_url=HttpUrl("https://example.com"),
            pagination_template="news",
            source_selectors=SourceSelectors(pagination="ul.pagination > li a"),
            supports_categories=True,
        )

    @pytest.fixture
    def mock_crawler_config(self, mock_html_source_config):
        return CrawlerConfig(source=mock_html_source_config, category="tech")

    @pytest.fixture
    def html_crawler(self, mock_crawler_config, mock_client_config):
        return HtmlCrawler(mock_crawler_config, mock_client_config)

    # ------------------------------------------------------------------
    # Construction / validation
    # ------------------------------------------------------------------

    def test_with_valid_html_source(self, html_crawler):
        """Test __init__ with valid HTML source config."""
        assert html_crawler.source.source_kind == SourceKind.HTML
        assert isinstance(html_crawler.source, HtmlSourceConfig)

    def test_with_invalid_source_kind_raises_error(self, mock_client_config):
        """Test __init__ raises ValueError when source kind is not HTML."""
        wordpress_source = WordPressSourceConfig(
            source_id="test_wordpress",
            source_url=HttpUrl("https://example.com"),
        )
        config = CrawlerConfig(source=wordpress_source)
        with pytest.raises(
            ValueError, match="HtmlCrawler requires a source of kind HTML"
        ):
            HtmlCrawler(config, mock_client_config)

    def test_with_no_source_raises_error(self, mock_client_config):
        """Test __init__ raises ValueError when no source is provided."""
        config = CrawlerConfig(source=None)
        with pytest.raises(
            ValueError, match="HtmlCrawler requires a source of kind HTML"
        ):
            HtmlCrawler(config, mock_client_config)

    # ------------------------------------------------------------------
    # Pagination
    # ------------------------------------------------------------------

    def test_get_pagination_returns_valid_page_range(self, html_crawler):
        """Test that get_pagination returns a valid PageRange."""
        with patch.object(html_crawler, "get_last_page", return_value=5):
            result = html_crawler.get_pagination()
            assert isinstance(result, PageRange)
            assert result.start == 0
            assert result.end == 5
            assert str(result) == "0:5"

    def test_get_last_page_with_valid_pagination_links(self, html_crawler):
        """Test get_last_page extracts page number from pagination links."""
        # Mock HTML with pagination links
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=1">1</a></li>
            <li><a href="/news?page=2">2</a></li>
            <li><a href="/news?page=3">3</a></li>
            <li><a href="/news?page=10">10</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")
        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            # The last matching link determines the last page.
            assert result == 10

    def test_get_last_page_with_no_pagination_links(self, html_crawler):
        """Test get_last_page returns 1 when no pagination links found."""
        mock_html = "<div>No pagination here</div>"
        mock_soup = BeautifulSoup(mock_html, "html.parser")
        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 1

    def test_get_last_page_with_empty_href(self, html_crawler):
        """Test get_last_page returns 1 when href is empty or None."""
        mock_html = """
        <ul class="pagination">
            <li><a>No href</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")
        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 1

    def test_get_last_page_with_regex_extraction(self, html_crawler):
        """Test get_last_page extracts page number using regex."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/articles/page/25/">Page 25</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")
        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 25

    def test_get_last_page_with_query_parameters(self, html_crawler):
        """Test get_last_page extracts page number from query parameters."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?category=tech&page=15&sort=date">Last</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")
        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 15

    def test_get_last_page_with_invalid_page_parameter(self, html_crawler):
        """Test get_last_page returns 1 when page parameter is invalid."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=invalid">Last</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")
        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 1

    def test_get_last_page_with_category_support(self, html_crawler):
        """Test get_last_page uses category in URL when supported."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?category=tech&page=8">8</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")
        with patch.object(html_crawler, "crawl") as mock_crawl:
            mock_crawl.return_value = mock_soup
            html_crawler.get_last_page()
            # The URL construction concatenates source_url with the path
            # Since the template doesn't contain {category}, it should remain unchanged
            expected_url = "https://example.com/news"
            mock_crawl.assert_called_once_with(expected_url)

    def test_get_last_page_with_category_template(self, mock_client_config):
        """Test get_last_page uses category replacement when template contains {category}."""
        source_config = HtmlSourceConfig(
            source_id="test_source",
            source_url=HttpUrl("https://example.com"),
            pagination_template="news/{category}",
            source_selectors=SourceSelectors(pagination="ul.pagination > li a"),
            supports_categories=True,
        )
        crawler_config = CrawlerConfig(source=source_config, category="tech")
        crawler = HtmlCrawler(crawler_config, mock_client_config)
        mock_html = """
        <ul class="pagination">
            <li><a href="/news/tech?page=5">5</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")
        with patch.object(crawler, "crawl") as mock_crawl:
            mock_crawl.return_value = mock_soup
            crawler.get_last_page()
            # {category} placeholder should be substituted with the configured category.
            expected_url = "https://example.com/news/tech"
            mock_crawl.assert_called_once_with(expected_url)

    def test_get_last_page_without_category_support(self, html_crawler):
        """Test get_last_page uses default template when categories not supported."""
        # Modify source to not support categories
        html_crawler.source.supports_categories = False
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=5">5</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")
        with patch.object(html_crawler, "crawl") as mock_crawl:
            mock_crawl.return_value = mock_soup
            html_crawler.get_last_page()
            # Verify the URL was constructed without category replacement
            expected_url = "https://example.com/news"
            mock_crawl.assert_called_once_with(expected_url)

    def test_get_last_page_without_category_in_config(
        self, mock_client_config, mock_html_source_config
    ):
        """Test get_last_page uses default template when no category in config."""
        config = CrawlerConfig(source=mock_html_source_config, category=None)
        crawler = HtmlCrawler(config, mock_client_config)
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=3">3</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")
        with patch.object(crawler, "crawl") as mock_crawl:
            mock_crawl.return_value = mock_soup
            crawler.get_last_page()
            # Verify the URL was constructed without category replacement
            expected_url = "https://example.com/news"
            mock_crawl.assert_called_once_with(expected_url)

    def test_get_last_page_with_multiple_numbers_in_href(self, html_crawler):
        """Test get_last_page extracts first number when multiple numbers present."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news/2024/page/42/comments/100">Last</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")
        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            # Should extract the first number found (2024)
            # NOTE(review): this pins current behavior; the first number in the
            # path is not necessarily the page number — confirm this is intended.
            assert result == 2024

    def test_supports_html_source_kind(self, html_crawler):
        """Test that supports method returns True for HTML source kind."""
        assert html_crawler.supports(SourceKind.HTML) is True
        assert html_crawler.supports(SourceKind.WORDPRESS) is False

    def test_get_pagination_integration(self, html_crawler):
        """Integration test for get_pagination calling get_last_page."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=7">7</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")
        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_pagination()
            assert isinstance(result, PageRange)
            assert result.start == 0
            assert result.end == 7

    def test_get_last_page_with_non_string_href(self, html_crawler):
        """Test get_last_page handles non-string href attributes."""
        # Create a mock element with href as a list (AttributeValueList)
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=5">5</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")
        # Modify the href to simulate a non-string type by removing it
        pagination_link = mock_soup.select("ul.pagination > li a")[-1]
        # Instead of setting href to a list, let's test with missing href
        del pagination_link.attrs["href"]
        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 1
@@ -0,0 +1,240 @@
from unittest.mock import Mock, patch
import pytest
from pydantic import HttpUrl
from basango.core.config.fetch_config import CrawlerConfig, ClientConfig
from basango.core.config.source_config import (
WordPressSourceConfig,
HtmlSourceConfig,
SourceSelectors,
)
from basango.domain import SourceKind, PageRange
from basango.services.crawler.wordpress_crawler import WordpressCrawler
class TestWordPressCrawler:
    """Test suite for WordPressCrawler.

    Network access is stubbed by patching `client.get`; tests cover source
    validation, pagination derived from WP REST API headers, and the
    category-id -> slug mapping.
    """

    # ------------------------------------------------------------------
    # Fixtures
    # ------------------------------------------------------------------

    @pytest.fixture
    def mock_client_config(self):
        # Default client settings; no real HTTP is performed in these tests.
        return ClientConfig()

    @pytest.fixture
    def mock_wordpress_source_config(self):
        return WordPressSourceConfig(
            source_id="test_wordpress_source",
            source_url=HttpUrl("https://example.com/"),
            supports_categories=True,
            categories=["tech", "news"],
        )

    @pytest.fixture
    def mock_crawler_config(self, mock_wordpress_source_config):
        return CrawlerConfig(source=mock_wordpress_source_config, category="tech")

    @pytest.fixture
    def wordpress_crawler(self, mock_crawler_config, mock_client_config):
        return WordpressCrawler(mock_crawler_config, mock_client_config)

    @pytest.fixture
    def mock_response_with_headers(self):
        # Simulates the WP REST API pagination headers (X-WP-TotalPages etc.).
        response = Mock()
        response.headers = {
            WordpressCrawler.TOTAL_PAGES_HEADER: "5",
            WordpressCrawler.TOTAL_POSTS_HEADER: "47",
        }
        return response

    # ------------------------------------------------------------------
    # Construction / validation
    # ------------------------------------------------------------------

    def test_with_valid_wordpress_source(self, wordpress_crawler):
        """Test __init__ with valid WordPress source config."""
        assert wordpress_crawler.source.source_kind == SourceKind.WORDPRESS
        assert isinstance(wordpress_crawler.source, WordPressSourceConfig)

    def test_with_invalid_source_kind_raises_error(self, mock_client_config):
        """Test __init__ raises ValueError when source kind is not WORDPRESS."""
        html_source = HtmlSourceConfig(
            source_id="test_html",
            source_url=HttpUrl("https://example.com"),
            pagination_template="news",
            source_selectors=SourceSelectors(),
        )
        config = CrawlerConfig(source=html_source)
        with pytest.raises(
            ValueError, match="WordpressCrawler requires a source of kind WORDPRESS"
        ):
            WordpressCrawler(config, mock_client_config)

    def test_with_no_source_raises_error(self, mock_client_config):
        """Test __init__ raises ValueError when source is None."""
        config = CrawlerConfig(source=None)
        with pytest.raises(
            ValueError, match="WordpressCrawler requires a source of kind WORDPRESS"
        ):
            WordpressCrawler(config, mock_client_config)

    # ------------------------------------------------------------------
    # Pagination via API headers
    # ------------------------------------------------------------------

    def test_get_pagination_returns_valid_page_range(
        self, wordpress_crawler, mock_response_with_headers
    ):
        """Test get_pagination returns correct PageRange from WordPress API headers."""
        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_response_with_headers
        ):
            result = wordpress_crawler.get_pagination()
            assert isinstance(result, PageRange)
            assert result.start == 1
            assert result.end == 5
            assert str(result) == "1:5"

    def test_get_pagination_with_default_headers(self, wordpress_crawler):
        """Test get_pagination with default headers when WordPress headers are missing."""
        mock_response = Mock()
        mock_response.headers = {}  # No WordPress headers
        with patch.object(wordpress_crawler.client, "get", return_value=mock_response):
            result = wordpress_crawler.get_pagination()
            assert isinstance(result, PageRange)
            assert result.start == 1
            assert result.end == 1  # Default when no headers

    def test_get_pagination_makes_correct_api_call(self, wordpress_crawler):
        """Test get_pagination makes the correct WordPress API call."""
        mock_response = Mock()
        mock_response.headers = {
            WordpressCrawler.TOTAL_PAGES_HEADER: "3",
            WordpressCrawler.TOTAL_POSTS_HEADER: "25",
        }
        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_response
        ) as mock_get:
            wordpress_crawler.get_pagination()
            # Minimal-payload probe: only ids, max page size.
            expected_url = f"{wordpress_crawler.source.source_url}wp-json/wp/v2/posts?_fields=id&per_page=100"
            mock_get.assert_called_once_with(expected_url)

    # ------------------------------------------------------------------
    # Category fetching / mapping
    # ------------------------------------------------------------------

    def test_fetch_categories_populates_category_map(self, wordpress_crawler):
        """Test _fetch_categories populates the category_map correctly."""
        mock_categories_response = Mock()
        mock_categories_response.json.return_value = [
            {"id": 1, "slug": "technology", "count": 15},
            {"id": 2, "slug": "business", "count": 10},
            {"id": 3, "slug": "sports", "count": 8},
        ]
        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_categories_response
        ):
            wordpress_crawler._fetch_categories()
            assert len(wordpress_crawler.category_map) == 3
            assert wordpress_crawler.category_map[1] == "technology"
            assert wordpress_crawler.category_map[2] == "business"
            assert wordpress_crawler.category_map[3] == "sports"

    def test_fetch_categories_makes_correct_api_call(self, wordpress_crawler):
        """Test _fetch_categories makes the correct WordPress API call."""
        mock_response = Mock()
        mock_response.json.return_value = []
        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_response
        ) as mock_get:
            wordpress_crawler._fetch_categories()
            expected_url = f"{wordpress_crawler.source.source_url}wp-json/wp/v2/categories?{WordpressCrawler.CATEGORY_QUERY}"
            mock_get.assert_called_once_with(expected_url)

    def test_map_categories_with_populated_category_map(self, wordpress_crawler):
        """Test _map_categories returns correct comma-separated string."""
        # Pre-populate category map
        wordpress_crawler.category_map = {
            1: "technology",
            2: "business",
            3: "sports",
            4: "lifestyle",
        }
        result = wordpress_crawler._map_categories([2, 1, 4])
        # Should be sorted by category ID
        assert result == "technology,business,lifestyle"

    def test_map_categories_with_empty_category_map_fetches_categories(
        self, wordpress_crawler
    ):
        """Test _map_categories fetches categories when category_map is empty."""
        mock_categories_response = Mock()
        mock_categories_response.json.return_value = [
            {"id": 1, "slug": "tech", "count": 15},
            {"id": 2, "slug": "news", "count": 10},
        ]
        wordpress_crawler.category_map = {}
        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_categories_response
        ):
            result = wordpress_crawler._map_categories([1, 2])
            assert result == "tech,news"
            assert len(wordpress_crawler.category_map) == 2

    def test_map_categories_filters_unknown_category_ids(self, wordpress_crawler):
        """Test _map_categories filters out unknown category IDs."""
        wordpress_crawler.category_map = {1: "technology", 2: "business"}
        result = wordpress_crawler._map_categories([1, 99, 2, 100])
        # Should only include known categories
        assert result == "technology,business"

    def test_map_categories_with_empty_category_list(self, wordpress_crawler):
        """Test _map_categories returns empty string for empty category list."""
        wordpress_crawler.category_map = {1: "tech", 2: "news"}
        result = wordpress_crawler._map_categories([])
        assert result == ""

    def test_map_categories_sorts_by_category_id(self, wordpress_crawler):
        """Test _map_categories sorts categories by ID."""
        wordpress_crawler.category_map = {3: "charlie", 1: "alpha", 2: "beta"}
        result = wordpress_crawler._map_categories([3, 1, 2])
        # Should be sorted by ID: 1, 2, 3
        assert result == "alpha,beta,charlie"

    def test_supports_wordpress_source_kind(self, wordpress_crawler):
        """Test supports method returns True for WordPress source kind."""
        assert wordpress_crawler.supports(SourceKind.WORDPRESS) is True
        assert wordpress_crawler.supports(SourceKind.HTML) is False

    @pytest.mark.parametrize(
        "pages,posts,expected_start,expected_end",
        [
            ("1", "10", 1, 1),
            ("5", "47", 1, 5),
            ("10", "100", 1, 10),
        ],
    )
    def test_get_pagination_with_various_header_values(
        self, wordpress_crawler, pages, posts, expected_start, expected_end
    ):
        """Test get_pagination with various header values."""
        mock_response = Mock()
        mock_response.headers = {
            WordpressCrawler.TOTAL_PAGES_HEADER: pages,
            WordpressCrawler.TOTAL_POSTS_HEADER: posts,
        }
        with patch.object(wordpress_crawler.client, "get", return_value=mock_response):
            result = wordpress_crawler.get_pagination()
            assert result.start == expected_start
            assert result.end == expected_end
@@ -0,0 +1,70 @@
from datetime import datetime, timezone
import pytest
from basango.services.date_parser import DateParser
# NOTE(review): the `pattern`/`replacement` pairs below use PHP-style
# slash-delimited regexes and `$N` backreferences rather than Python's
# raw patterns and `\N` groups — presumably DateParser normalizes these
# before applying re.sub; confirm against the DateParser implementation.
@pytest.mark.parametrize(
    "date_str, fmt, pattern, replacement, expected",
    [
        # Plain ISO-like string, parsed directly with the given format.
        (
            "2004-02-12T15:19:21",
            "%Y-%m-%dT%H:%M:%S",
            None,
            None,
            1076599161,  # 2004-02-12 15:19:21 UTC
        ),
        # Regex rewrite: "DD/MM/YYYY - HH:MM" -> "YYYY-MM-DD HH:MM" before parsing.
        (
            "08/10/2024 - 00:00",
            "%Y-%m-%d %H:%M",
            r"/(\d{2})\/(\d{2})\/(\d{4}) - (\d{2}:\d{2})/",
            r"$3-$2-$1 $4",
            1728345600,  # 2024-10-08 00:00:00 UTC
        ),
        # Same rewrite with a leading 3-letter weekday abbreviation.
        (
            "mar 08/10/2024 - 00:00",
            "%Y-%m-%d %H:%M",
            r"/\w{3} (\d{2})\/(\d{2})\/(\d{4}) - (\d{2}:\d{2})/",
            r"$3-$2-$1 $4",
            1728345600,  # 2024-10-08 00:00:00 UTC
        ),
        # French long-form date; NOTE(review): the pattern expects numeric
        # fields only and does not obviously match "octobre" — verify how
        # DateParser handles month names before relying on this case.
        (
            "Mardi 8 octobre 2024 - 00:00",
            "%Y-%m-%d %H:%M",
            r"/(\d{1}) (\d{1,2}) (\d{2}) (\d{4}) - (\d{2}:\d{2})/",
            r"$4-$3-$2 $5",
            1728345600,  # 2024-10-08 00:00:00 UTC
        ),
        # Dotted European format parsed directly, no rewrite needed.
        (
            "8.10.2024 00:00",
            "%d.%m.%Y %H:%M",
            None,
            None,
            1728345600,  # 2024-10-08 00:00:00 UTC
        ),
    ],
)
def test_create_timestamp_with_valid_dates(
    date_str: str,
    fmt: str | None,
    pattern: str | None,
    replacement: str | None,
    expected: int,
) -> None:
    """Each (input, format, optional rewrite) combination yields the expected UTC epoch."""
    dr = DateParser()
    result = dr.create_timestamp(date_str, fmt, pattern, replacement)
    assert result == expected
def test_create_timestamp_with_invalid_date_falls_back_to_midnight_today() -> None:
    """An unparseable string falls back to today's midnight in UTC."""
    parser = DateParser()
    # Compute expected midnight (UTC) before invoking the parser to avoid edge cases.
    today_start = datetime.now(timezone.utc).replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    result = parser.create_timestamp("invalid date string", None, None, None)
    assert result == int(today_start.timestamp())
+9
View File
@@ -0,0 +1,9 @@
import os
import sys
# Make the project's src/ directory importable so `import basango...`
# resolves when tests run from the repository root.
_TESTS_DIR = os.path.dirname(__file__)
ROOT = os.path.dirname(_TESTS_DIR)
SRC = os.path.join(ROOT, "src")
if SRC not in sys.path:
    sys.path.insert(0, SRC)