import type { HtmlSourceOptions, WordPressSourceOptions } from "@basango/domain/config";
import { logger } from "@basango/logger";
import { UnsupportedSourceKindError } from "#crawler/errors";
import { QueueManager, createQueueManager } from "#crawler/process/async/queue";
import { DetailsTaskPayload, ListingTaskPayload } from "#crawler/process/async/schemas";
import { createPersistors, resolveCrawlerConfig } from "#crawler/process/crawler";
import { HtmlCrawler } from "#crawler/process/parsers/html";
import { WordPressCrawler } from "#crawler/process/parsers/wordpress";
import {
  createTimestampRange,
  formatPageRange,
  formatTimestampRange,
  resolveSourceConfig,
  resolveSourceUpdateDates,
} from "#crawler/utils";

/**
 * Crawls the listing pages of an HTML source and enqueues one detail task per
 * discovered article link.
 *
 * If the resolved source is actually a WordPress source, the work is handed
 * off to {@link collectWordPressListing}, which rejects any other kind — this
 * breaks the previous mutual recursion between the two collectors for unknown
 * source kinds.
 *
 * @param payload - Listing task (source id, category, optional date/page range).
 * @param manager - Queue manager used to enqueue article tasks.
 * @returns The number of article tasks that were enqueued.
 * @throws UnsupportedSourceKindError (via the WordPress delegate) when the
 *   source kind is neither "html" nor "wordpress".
 */
export const collectHtmlListing = async (
  payload: ListingTaskPayload,
  manager: QueueManager = createQueueManager(),
): Promise<number> => {
  // Cast to the union (not a single concrete type) so the sourceKind check
  // below actually narrows.
  const source = resolveSourceConfig(payload.sourceId) as
    | HtmlSourceOptions
    | WordPressSourceOptions;
  if (source.sourceKind !== "html") {
    // Wrong entry point for this source — delegate; the WordPress collector
    // throws for anything that is not "html"/"wordpress".
    return await collectWordPressListing(payload, manager);
  }

  const settings = resolveCrawlerConfig(source, payload);
  await resolveSourceUpdateDates(settings);

  const crawler = new HtmlCrawler(settings);
  const pageRange = settings.pageRange ?? (await crawler.getPagination());

  let queued = 0;
  for (let page = pageRange.start; page <= pageRange.end; page += 1) {
    // Fall back to the bare source URL when no paginated endpoint exists.
    const target = crawler.buildEndpointUrl(page) ?? `${source.sourceUrl}`;
    try {
      const items = await crawler.fetchLinks(target, source.sourceSelectors.articles);
      for (const node of items) {
        const url = crawler.extractLink(node);
        if (!url) continue; // Node without a resolvable link — nothing to enqueue.
        await manager.enqueueArticle({
          category: payload.category,
          dateRange: createTimestampRange(payload.dateRange),
          sourceId: payload.sourceId,
          url,
        } as DetailsTaskPayload);
        queued += 1;
      }
    } catch (error) {
      // One broken page must not abort the remaining pages.
      logger.error({ error, target }, "Failed to crawl page");
    }
  }
  return queued;
};

/**
 * Crawls the paginated WordPress listing API of a source and enqueues one
 * detail task per entry that carries a permalink.
 *
 * HTML sources are delegated to {@link collectHtmlListing}; any other source
 * kind is rejected instead of bouncing back and forth between the two
 * collectors forever.
 *
 * @param payload - Listing task (source id, category, optional date/page range).
 * @param manager - Queue manager used to enqueue article tasks.
 * @returns The number of article tasks that were enqueued.
 * @throws UnsupportedSourceKindError when the source kind is neither "html"
 *   nor "wordpress".
 */
export const collectWordPressListing = async (
  payload: ListingTaskPayload,
  manager: QueueManager = createQueueManager(),
): Promise<number> => {
  const source = resolveSourceConfig(payload.sourceId) as
    | HtmlSourceOptions
    | WordPressSourceOptions;
  if (source.sourceKind !== "wordpress") {
    if (source.sourceKind === "html") {
      return await collectHtmlListing(payload, manager);
    }
    // Unknown kind: fail fast rather than recurse into collectHtmlListing,
    // which would delegate straight back here (infinite mutual recursion).
    throw new UnsupportedSourceKindError(`Unsupported source kind`);
  }

  const settings = resolveCrawlerConfig(source, payload);
  await resolveSourceUpdateDates(settings);

  const crawler = new WordPressCrawler(settings);
  const pageRange = settings.pageRange ?? (await crawler.getPagination());

  let queued = 0;
  for (let page = pageRange.start; page <= pageRange.end; page += 1) {
    // Named `endpoint` (was `url`) so it is not shadowed by each entry's link.
    const endpoint = crawler.buildEndpointUrl(page);
    try {
      const entries = await crawler.fetchLinks(endpoint);
      for (const data of entries) {
        const url = data.link;
        if (!url) continue; // Entry without a permalink — nothing to enqueue.
        await manager.enqueueArticle({
          category: payload.category,
          data,
          dateRange: createTimestampRange(payload.dateRange),
          sourceId: payload.sourceId,
          url,
        } as DetailsTaskPayload);
        queued += 1;
      }
    } catch (error) {
      // Keep going: a single failing page should not kill the whole run.
      logger.error({ error, page }, "Failed to fetch WordPress page");
    }
  }
  return queued;
};

/**
 * Fetches and persists a single article that was previously queued by one of
 * the listing collectors.
 *
 * The return type is inferred from the crawler's `fetchOne` (the article
 * entity lives in project code not visible here), so callers keep whatever
 * that method produces.
 *
 * @param payload - Detail task with the article URL and, for WordPress
 *   sources, the raw listing entry under `data`.
 * @throws UnsupportedSourceKindError when the source kind is neither "html"
 *   nor "wordpress".
 */
export const collectArticle = async (payload: DetailsTaskPayload) => {
  const source = resolveSourceConfig(payload.sourceId);
  const settings = resolveCrawlerConfig(source, {
    category: payload.category,
    dateRange: payload.dateRange ? formatTimestampRange(payload.dateRange) : undefined,
    pageRange: payload.pageRange ? formatPageRange(payload.pageRange) : undefined,
    sourceId: payload.sourceId,
  });
  const persistors = createPersistors(source);

  if (source.sourceKind === "html") {
    const crawler = new HtmlCrawler(settings, { persistors });
    const html = await crawler.crawl(payload.url);
    return await crawler.fetchOne(html, settings.dateRange);
  }
  if (source.sourceKind === "wordpress") {
    const crawler = new WordPressCrawler(settings, { persistors });
    // WordPress detail tasks carry the raw API entry captured at listing time.
    return await crawler.fetchOne(payload.data ?? {}, settings.dateRange);
  }
  throw new UnsupportedSourceKindError(`Unsupported source kind`);
};