Initial commit
This commit is contained in:
@@ -0,0 +1,57 @@
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import pytest
|
||||
|
||||
from basango.domain import DateRange
|
||||
|
||||
|
||||
def ts(y: int, m: int, d: int, hh: int = 0, mm: int = 0, ss: int = 0) -> int:
    """Build a UTC Unix timestamp (whole seconds) for the given calendar fields."""
    moment = datetime(y, m, d, hh, mm, ss, tzinfo=timezone.utc)
    return int(moment.timestamp())
|
||||
|
||||
|
||||
def test_from_parses_two_dates_with_default_format() -> None:
    """A 'start:end' spec in the default format maps to the expected UTC timestamps."""
    parsed = DateRange.create("2024-10-01:2024-10-08")
    expected = (ts(2024, 10, 1), ts(2024, 10, 8))
    assert (parsed.start, parsed.end) == expected
|
||||
|
||||
|
||||
def test_str_and_format_roundtrip() -> None:
    """str() renders 'start:end' as timestamps; format() restores the date text."""
    rng = DateRange.create("2024-10-01:2024-10-02")
    start_ts, end_ts = ts(2024, 10, 1), ts(2024, 10, 2)
    assert str(rng) == f"{start_ts}:{end_ts}"
    assert rng.format("%Y-%m-%d") == "2024-10-01:2024-10-02"
|
||||
|
||||
|
||||
def test_in_range_out_range_inclusive_boundaries() -> None:
    """Both endpoints are inclusive; any instant past either edge is out of range."""
    rng = DateRange.create("2024-10-01:2024-10-02")
    lower = ts(2024, 10, 1)
    upper = ts(2024, 10, 2)

    # Inclusive endpoints.
    assert rng.in_range(lower) is True
    assert rng.in_range(upper) is True
    # One second outside either edge.
    assert rng.out_range(lower - 1) is True
    assert rng.out_range(upper + 1) is True
    # End is at 00:00 of end day; times later that day are outside
    assert rng.out_range(ts(2024, 10, 2, 12, 0, 0)) is True
|
||||
|
||||
|
||||
def test_backward_uses_days_and_next_day_end() -> None:
    """backward() spans `days` back from the base date and ends on the next day."""
    anchor = datetime(2024, 10, 31, tzinfo=timezone.utc)
    rng = DateRange.backward(date=anchor, days=10)
    assert (rng.start, rng.end) == (ts(2024, 10, 21), ts(2024, 11, 1))
|
||||
|
||||
|
||||
def test_from_raises_on_invalid_separator_or_spec() -> None:
    """Empty separators and one-sided specs are both rejected via AssertionError."""
    bad_inputs = [
        ("2024-10-01:2024-10-08", ""),  # separator must be non-empty
        ("2024-10-01", ":"),  # spec must contain two dates
    ]
    for spec, sep in bad_inputs:
        with pytest.raises(AssertionError):
            DateRange.create(spec, separator=sep)
|
||||
|
||||
|
||||
def test_from_accepts_python_format_string() -> None:
    """Custom strptime formats and separators are honored by create()."""
    rng = DateRange.create("2024/10/01|2024/10/02", fmt="%Y/%m/%d", separator="|")
    assert (rng.start, rng.end) == (ts(2024, 10, 1), ts(2024, 10, 2))
|
||||
@@ -0,0 +1,19 @@
|
||||
import pytest
|
||||
|
||||
from basango.domain import PageRange
|
||||
|
||||
|
||||
def test_it_should_create_page_range():
    """A 'start:end' spec yields the matching integer bounds."""
    rng = PageRange.create("1:10")
    assert (rng.start, rng.end) == (1, 10)
|
||||
|
||||
|
||||
def test_end_page_should_be_greater_than_start_page():
    """A range whose end precedes its start must be rejected."""
    with pytest.raises(AssertionError):
        PageRange.create("10:1")
|
||||
|
||||
|
||||
def test_non_negative_pages():
    """Negative page numbers must be rejected."""
    with pytest.raises(AssertionError):
        PageRange.create("-1:-10")
|
||||
@@ -0,0 +1,292 @@
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from bs4 import BeautifulSoup
|
||||
from pydantic import HttpUrl
|
||||
|
||||
from basango.core.config import WordPressSourceConfig
|
||||
from basango.core.config.fetch_config import CrawlerConfig, ClientConfig
|
||||
from basango.core.config.source_config import HtmlSourceConfig, SourceSelectors
|
||||
from basango.domain import SourceKind, PageRange
|
||||
from basango.services.crawler.html_crawler import HtmlCrawler
|
||||
|
||||
|
||||
class TestHtmlCrawler:
    """Test suite for HtmlCrawler.

    Covers construction validation (source kind must be HTML), pagination
    discovery from parsed HTML, and URL construction with/without category
    support. Network access is avoided by patching `crawl` / `get_last_page`.
    """

    @pytest.fixture
    def mock_client_config(self):
        # Default client settings are sufficient; no HTTP calls are made.
        return ClientConfig()

    @pytest.fixture
    def mock_html_source_config(self):
        # Minimal HTML source whose pagination selector matches the fixtures below.
        return HtmlSourceConfig(
            source_id="test_source",
            source_url=HttpUrl("https://example.com"),
            pagination_template="news",
            source_selectors=SourceSelectors(pagination="ul.pagination > li a"),
            supports_categories=True,
        )

    @pytest.fixture
    def mock_crawler_config(self, mock_html_source_config):
        return CrawlerConfig(source=mock_html_source_config, category="tech")

    @pytest.fixture
    def html_crawler(self, mock_crawler_config, mock_client_config):
        return HtmlCrawler(mock_crawler_config, mock_client_config)

    def test_with_valid_html_source(self, html_crawler):
        """Test __init__ with valid HTML source config."""
        assert html_crawler.source.source_kind == SourceKind.HTML
        assert isinstance(html_crawler.source, HtmlSourceConfig)

    def test_with_invalid_source_kind_raises_error(self, mock_client_config):
        """Test __init__ raises ValueError when source kind is not HTML."""
        wordpress_source = WordPressSourceConfig(
            source_id="test_wordpress",
            source_url=HttpUrl("https://example.com"),
        )
        config = CrawlerConfig(source=wordpress_source)

        with pytest.raises(
            ValueError, match="HtmlCrawler requires a source of kind HTML"
        ):
            HtmlCrawler(config, mock_client_config)

    def test_with_no_source_raises_error(self, mock_client_config):
        """Test __init__ raises ValueError when no source is provided."""
        config = CrawlerConfig(source=None)

        with pytest.raises(
            ValueError, match="HtmlCrawler requires a source of kind HTML"
        ):
            HtmlCrawler(config, mock_client_config)

    def test_get_pagination_returns_valid_page_range(self, html_crawler):
        """Test that get_pagination returns a valid PageRange."""
        with patch.object(html_crawler, "get_last_page", return_value=5):
            result = html_crawler.get_pagination()

        # HTML pagination is zero-based: start is 0, end is the last page.
        assert isinstance(result, PageRange)
        assert result.start == 0
        assert result.end == 5
        assert str(result) == "0:5"

    def test_get_last_page_with_valid_pagination_links(self, html_crawler):
        """Test get_last_page extracts page number from pagination links."""
        # Mock HTML with pagination links
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=1">1</a></li>
            <li><a href="/news?page=2">2</a></li>
            <li><a href="/news?page=3">3</a></li>
            <li><a href="/news?page=10">10</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            # The last matching link carries the highest page number.
            assert result == 10

    def test_get_last_page_with_no_pagination_links(self, html_crawler):
        """Test get_last_page returns 1 when no pagination links found."""
        mock_html = "<div>No pagination here</div>"
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 1

    def test_get_last_page_with_empty_href(self, html_crawler):
        """Test get_last_page returns 1 when href is empty or None."""
        mock_html = """
        <ul class="pagination">
            <li><a>No href</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 1

    def test_get_last_page_with_regex_extraction(self, html_crawler):
        """Test get_last_page extracts page number using regex."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/articles/page/25/">Page 25</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 25

    def test_get_last_page_with_query_parameters(self, html_crawler):
        """Test get_last_page extracts page number from query parameters."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?category=tech&page=15&sort=date">Last</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            # Only the `page` parameter is used, other query params are ignored.
            assert result == 15

    def test_get_last_page_with_invalid_page_parameter(self, html_crawler):
        """Test get_last_page returns 1 when page parameter is invalid."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=invalid">Last</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 1

    def test_get_last_page_with_category_support(self, html_crawler):
        """Test get_last_page uses category in URL when supported."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?category=tech&page=8">8</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl") as mock_crawl:
            mock_crawl.return_value = mock_soup
            html_crawler.get_last_page()

        # The URL construction concatenates source_url with the path
        # Since the template doesn't contain {category}, it should remain unchanged
        expected_url = "https://example.com/news"
        mock_crawl.assert_called_once_with(expected_url)

    def test_get_last_page_with_category_template(self, mock_client_config):
        """Test get_last_page uses category replacement when template contains {category}."""
        source_config = HtmlSourceConfig(
            source_id="test_source",
            source_url=HttpUrl("https://example.com"),
            pagination_template="news/{category}",
            source_selectors=SourceSelectors(pagination="ul.pagination > li a"),
            supports_categories=True,
        )
        crawler_config = CrawlerConfig(source=source_config, category="tech")
        crawler = HtmlCrawler(crawler_config, mock_client_config)

        mock_html = """
        <ul class="pagination">
            <li><a href="/news/tech?page=5">5</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(crawler, "crawl") as mock_crawl:
            mock_crawl.return_value = mock_soup
            crawler.get_last_page()

        # {category} in the template is substituted with the configured category.
        expected_url = "https://example.com/news/tech"
        mock_crawl.assert_called_once_with(expected_url)

    def test_get_last_page_without_category_support(self, html_crawler):
        """Test get_last_page uses default template when categories not supported."""
        # Modify source to not support categories
        html_crawler.source.supports_categories = False

        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=5">5</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl") as mock_crawl:
            mock_crawl.return_value = mock_soup
            html_crawler.get_last_page()

        # Verify the URL was constructed without category replacement
        expected_url = "https://example.com/news"
        mock_crawl.assert_called_once_with(expected_url)

    def test_get_last_page_without_category_in_config(
        self, mock_client_config, mock_html_source_config
    ):
        """Test get_last_page uses default template when no category in config."""
        config = CrawlerConfig(source=mock_html_source_config, category=None)
        crawler = HtmlCrawler(config, mock_client_config)

        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=3">3</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(crawler, "crawl") as mock_crawl:
            mock_crawl.return_value = mock_soup
            crawler.get_last_page()

        # Verify the URL was constructed without category replacement
        expected_url = "https://example.com/news"
        mock_crawl.assert_called_once_with(expected_url)

    def test_get_last_page_with_multiple_numbers_in_href(self, html_crawler):
        """Test get_last_page extracts first number when multiple numbers present."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news/2024/page/42/comments/100">Last</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            # Should extract the first number found (2024)
            # NOTE(review): first-number extraction can misread year-like path
            # segments as page numbers — confirm this is the intended behavior.
            assert result == 2024

    def test_supports_html_source_kind(self, html_crawler):
        """Test that supports method returns True for HTML source kind."""
        assert html_crawler.supports(SourceKind.HTML) is True
        assert html_crawler.supports(SourceKind.WORDPRESS) is False

    def test_get_pagination_integration(self, html_crawler):
        """Integration test for get_pagination calling get_last_page."""
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=7">7</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_pagination()

        assert isinstance(result, PageRange)
        assert result.start == 0
        assert result.end == 7

    def test_get_last_page_with_non_string_href(self, html_crawler):
        """Test get_last_page handles non-string href attributes."""
        # Create a mock element with href as a list (AttributeValueList)
        mock_html = """
        <ul class="pagination">
            <li><a href="/news?page=5">5</a></li>
        </ul>
        """
        mock_soup = BeautifulSoup(mock_html, "html.parser")

        # Modify the href to simulate a non-string type by removing it
        pagination_link = mock_soup.select("ul.pagination > li a")[-1]
        # Instead of setting href to a list, let's test with missing href
        del pagination_link.attrs["href"]

        with patch.object(html_crawler, "crawl", return_value=mock_soup):
            result = html_crawler.get_last_page()
            assert result == 1
|
||||
@@ -0,0 +1,240 @@
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
from pydantic import HttpUrl
|
||||
|
||||
from basango.core.config.fetch_config import CrawlerConfig, ClientConfig
|
||||
from basango.core.config.source_config import (
|
||||
WordPressSourceConfig,
|
||||
HtmlSourceConfig,
|
||||
SourceSelectors,
|
||||
)
|
||||
from basango.domain import SourceKind, PageRange
|
||||
from basango.services.crawler.wordpress_crawler import WordpressCrawler
|
||||
|
||||
|
||||
class TestWordPressCrawler:
    """Test suite for WordPressCrawler.

    Covers construction validation (source kind must be WORDPRESS), pagination
    derived from the WP REST API response headers, and category-id -> slug
    mapping. HTTP calls are avoided by patching the crawler's client.
    """

    @pytest.fixture
    def mock_client_config(self):
        # Default client settings are sufficient; no HTTP calls are made.
        return ClientConfig()

    @pytest.fixture
    def mock_wordpress_source_config(self):
        return WordPressSourceConfig(
            source_id="test_wordpress_source",
            source_url=HttpUrl("https://example.com/"),
            supports_categories=True,
            categories=["tech", "news"],
        )

    @pytest.fixture
    def mock_crawler_config(self, mock_wordpress_source_config):
        return CrawlerConfig(source=mock_wordpress_source_config, category="tech")

    @pytest.fixture
    def wordpress_crawler(self, mock_crawler_config, mock_client_config):
        return WordpressCrawler(mock_crawler_config, mock_client_config)

    @pytest.fixture
    def mock_response_with_headers(self):
        # Response stub carrying the WP pagination headers (X-WP-Total*-style).
        response = Mock()
        response.headers = {
            WordpressCrawler.TOTAL_PAGES_HEADER: "5",
            WordpressCrawler.TOTAL_POSTS_HEADER: "47",
        }
        return response

    def test_with_valid_wordpress_source(self, wordpress_crawler):
        """Test __init__ with valid WordPress source config."""
        assert wordpress_crawler.source.source_kind == SourceKind.WORDPRESS
        assert isinstance(wordpress_crawler.source, WordPressSourceConfig)

    def test_with_invalid_source_kind_raises_error(self, mock_client_config):
        """Test __init__ raises ValueError when source kind is not WORDPRESS."""
        html_source = HtmlSourceConfig(
            source_id="test_html",
            source_url=HttpUrl("https://example.com"),
            pagination_template="news",
            source_selectors=SourceSelectors(),
        )
        config = CrawlerConfig(source=html_source)

        with pytest.raises(
            ValueError, match="WordpressCrawler requires a source of kind WORDPRESS"
        ):
            WordpressCrawler(config, mock_client_config)

    def test_with_no_source_raises_error(self, mock_client_config):
        """Test __init__ raises ValueError when source is None."""
        config = CrawlerConfig(source=None)

        with pytest.raises(
            ValueError, match="WordpressCrawler requires a source of kind WORDPRESS"
        ):
            WordpressCrawler(config, mock_client_config)

    def test_get_pagination_returns_valid_page_range(
        self, wordpress_crawler, mock_response_with_headers
    ):
        """Test get_pagination returns correct PageRange from WordPress API headers."""
        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_response_with_headers
        ):
            result = wordpress_crawler.get_pagination()

        # WordPress pagination is one-based: start is 1, end from the header.
        assert isinstance(result, PageRange)
        assert result.start == 1
        assert result.end == 5
        assert str(result) == "1:5"

    def test_get_pagination_with_default_headers(self, wordpress_crawler):
        """Test get_pagination with default headers when WordPress headers are missing."""
        mock_response = Mock()
        mock_response.headers = {}  # No WordPress headers

        with patch.object(wordpress_crawler.client, "get", return_value=mock_response):
            result = wordpress_crawler.get_pagination()

        assert isinstance(result, PageRange)
        assert result.start == 1
        assert result.end == 1  # Default when no headers

    def test_get_pagination_makes_correct_api_call(self, wordpress_crawler):
        """Test get_pagination makes the correct WordPress API call."""
        mock_response = Mock()
        mock_response.headers = {
            WordpressCrawler.TOTAL_PAGES_HEADER: "3",
            WordpressCrawler.TOTAL_POSTS_HEADER: "25",
        }

        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_response
        ) as mock_get:
            wordpress_crawler.get_pagination()

        # `_fields=id&per_page=100` keeps the probe request cheap.
        expected_url = f"{wordpress_crawler.source.source_url}wp-json/wp/v2/posts?_fields=id&per_page=100"
        mock_get.assert_called_once_with(expected_url)

    def test_fetch_categories_populates_category_map(self, wordpress_crawler):
        """Test _fetch_categories populates the category_map correctly."""
        mock_categories_response = Mock()
        mock_categories_response.json.return_value = [
            {"id": 1, "slug": "technology", "count": 15},
            {"id": 2, "slug": "business", "count": 10},
            {"id": 3, "slug": "sports", "count": 8},
        ]

        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_categories_response
        ):
            wordpress_crawler._fetch_categories()

        assert len(wordpress_crawler.category_map) == 3
        assert wordpress_crawler.category_map[1] == "technology"
        assert wordpress_crawler.category_map[2] == "business"
        assert wordpress_crawler.category_map[3] == "sports"

    def test_fetch_categories_makes_correct_api_call(self, wordpress_crawler):
        """Test _fetch_categories makes the correct WordPress API call."""
        mock_response = Mock()
        mock_response.json.return_value = []

        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_response
        ) as mock_get:
            wordpress_crawler._fetch_categories()

        expected_url = f"{wordpress_crawler.source.source_url}wp-json/wp/v2/categories?{WordpressCrawler.CATEGORY_QUERY}"
        mock_get.assert_called_once_with(expected_url)

    def test_map_categories_with_populated_category_map(self, wordpress_crawler):
        """Test _map_categories returns correct comma-separated string."""

        # Pre-populate category map
        wordpress_crawler.category_map = {
            1: "technology",
            2: "business",
            3: "sports",
            4: "lifestyle",
        }

        result = wordpress_crawler._map_categories([2, 1, 4])

        # Should be sorted by category ID
        assert result == "technology,business,lifestyle"

    def test_map_categories_with_empty_category_map_fetches_categories(
        self, wordpress_crawler
    ):
        """Test _map_categories fetches categories when category_map is empty."""
        mock_categories_response = Mock()
        mock_categories_response.json.return_value = [
            {"id": 1, "slug": "tech", "count": 15},
            {"id": 2, "slug": "news", "count": 10},
        ]

        wordpress_crawler.category_map = {}
        with patch.object(
            wordpress_crawler.client, "get", return_value=mock_categories_response
        ):
            result = wordpress_crawler._map_categories([1, 2])

        # Lazy fetch: the map is populated on first use.
        assert result == "tech,news"
        assert len(wordpress_crawler.category_map) == 2

    def test_map_categories_filters_unknown_category_ids(self, wordpress_crawler):
        """Test _map_categories filters out unknown category IDs."""
        wordpress_crawler.category_map = {1: "technology", 2: "business"}

        result = wordpress_crawler._map_categories([1, 99, 2, 100])

        # Should only include known categories
        assert result == "technology,business"

    def test_map_categories_with_empty_category_list(self, wordpress_crawler):
        """Test _map_categories returns empty string for empty category list."""
        wordpress_crawler.category_map = {1: "tech", 2: "news"}

        result = wordpress_crawler._map_categories([])

        assert result == ""

    def test_map_categories_sorts_by_category_id(self, wordpress_crawler):
        """Test _map_categories sorts categories by ID."""
        wordpress_crawler.category_map = {3: "charlie", 1: "alpha", 2: "beta"}

        result = wordpress_crawler._map_categories([3, 1, 2])

        # Should be sorted by ID: 1, 2, 3
        assert result == "alpha,beta,charlie"

    def test_supports_wordpress_source_kind(self, wordpress_crawler):
        """Test supports method returns True for WordPress source kind."""
        assert wordpress_crawler.supports(SourceKind.WORDPRESS) is True
        assert wordpress_crawler.supports(SourceKind.HTML) is False

    @pytest.mark.parametrize(
        "pages,posts,expected_start,expected_end",
        [
            ("1", "10", 1, 1),
            ("5", "47", 1, 5),
            ("10", "100", 1, 10),
        ],
    )
    def test_get_pagination_with_various_header_values(
        self, wordpress_crawler, pages, posts, expected_start, expected_end
    ):
        """Test get_pagination with various header values."""
        mock_response = Mock()
        mock_response.headers = {
            WordpressCrawler.TOTAL_PAGES_HEADER: pages,
            WordpressCrawler.TOTAL_POSTS_HEADER: posts,
        }

        with patch.object(wordpress_crawler.client, "get", return_value=mock_response):
            result = wordpress_crawler.get_pagination()

        assert result.start == expected_start
        assert result.end == expected_end
|
||||
@@ -0,0 +1,70 @@
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import pytest
|
||||
|
||||
from basango.services.date_parser import DateParser
|
||||
|
||||
|
||||
# NOTE(review): the pattern/replacement pairs use PHP/Perl-style regex syntax
# (`/.../` delimiters, `$n` backreferences) — presumably DateParser translates
# these before applying Python's `re`; confirm against the implementation.
@pytest.mark.parametrize(
    "date_str, fmt, pattern, replacement, expected",
    [
        # ISO-like datetime, parsed directly with the given strptime format.
        (
            "2004-02-12T15:19:21",
            "%Y-%m-%dT%H:%M:%S",
            None,
            None,
            1076599161,  # 2004-02-12 15:19:21 UTC
        ),
        # dd/mm/yyyy input rewritten to yyyy-mm-dd before parsing.
        (
            "08/10/2024 - 00:00",
            "%Y-%m-%d %H:%M",
            r"/(\d{2})\/(\d{2})\/(\d{4}) - (\d{2}:\d{2})/",
            r"$3-$2-$1 $4",
            1728345600,  # 2024-10-08 00:00:00 UTC
        ),
        # Same rewrite with a leading three-letter weekday abbreviation.
        (
            "mar 08/10/2024 - 00:00",
            "%Y-%m-%d %H:%M",
            r"/\w{3} (\d{2})\/(\d{2})\/(\d{4}) - (\d{2}:\d{2})/",
            r"$3-$2-$1 $4",
            1728345600,  # 2024-10-08 00:00:00 UTC
        ),
        # French long-form date.
        # NOTE(review): this pattern only matches digit groups, yet the input
        # contains the words "Mardi" and "octobre" — it is unclear from here
        # how the rewrite yields a parseable date; verify against DateParser.
        (
            "Mardi 8 octobre 2024 - 00:00",
            "%Y-%m-%d %H:%M",
            r"/(\d{1}) (\d{1,2}) (\d{2}) (\d{4}) - (\d{2}:\d{2})/",
            r"$4-$3-$2 $5",
            1728345600,  # 2024-10-08 00:00:00 UTC
        ),
        # Dotted European format, parsed directly without a rewrite.
        (
            "8.10.2024 00:00",
            "%d.%m.%Y %H:%M",
            None,
            None,
            1728345600,  # 2024-10-08 00:00:00 UTC
        ),
    ],
)
def test_create_timestamp_with_valid_dates(
    date_str: str,
    fmt: str | None,
    pattern: str | None,
    replacement: str | None,
    expected: int,
) -> None:
    """Each (input, format, optional rewrite) combination yields the expected UTC timestamp."""
    dr = DateParser()
    result = dr.create_timestamp(date_str, fmt, pattern, replacement)
    assert result == expected
|
||||
|
||||
|
||||
def test_create_timestamp_with_invalid_date_falls_back_to_midnight_today() -> None:
    """Unparseable input degrades to today's midnight (UTC) instead of raising."""
    parser = DateParser()

    # Compute expected midnight (UTC) before invoking the parser to avoid edge cases.
    midnight = datetime.now(timezone.utc).replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    expected_midnight = int(midnight.timestamp())

    assert (
        parser.create_timestamp("invalid date string", None, None, None)
        == expected_midnight
    )
|
||||
@@ -0,0 +1,9 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
# Ensure 'src' is on sys.path so `import basango...` works in tests
# (conftest-style shim: runs at import time, before test collection).
ROOT = os.path.dirname(os.path.dirname(__file__))  # repository root (parent of this dir)
SRC = os.path.join(ROOT, "src")
# Guard against duplicate entries when this module is imported more than once.
if SRC not in sys.path:
    sys.path.insert(0, SRC)
|
||||
Reference in New Issue
Block a user