feat(crawler): fix async crawling

This commit is contained in:
2025-11-09 01:01:07 +02:00
parent e8c0f0422b
commit 2b5482e9f5
58 changed files with 243 additions and 169 deletions
+2 -2
View File
@@ -9,8 +9,8 @@
"editor.defaultFormatter": "biomejs.biome" "editor.defaultFormatter": "biomejs.biome"
}, },
"editor.codeActionsOnSave": { "editor.codeActionsOnSave": {
"source.organizeImports.biome": "explicit", "source.fixAll.biome": "explicit",
"source.fixAll.biome": "explicit" "source.organizeImports.biome": "explicit"
}, },
"editor.defaultFormatter": "biomejs.biome", "editor.defaultFormatter": "biomejs.biome",
"editor.formatOnSave": true, "editor.formatOnSave": true,
+1
View File
@@ -2,6 +2,7 @@ import path from "node:path";
import { loadConfig as defineConfig } from "@devscast/config"; import { loadConfig as defineConfig } from "@devscast/config";
import { z } from "zod"; import { z } from "zod";
import { import {
DateRangeSchema, DateRangeSchema,
HtmlSourceConfigSchema, HtmlSourceConfigSchema,
+23
View File
@@ -1,6 +1,29 @@
/**
* Default date format used for parsing and formatting dates.
* Follows the "yyyy-LL-dd" pattern (e.g., "2024-06-15").
*/
export const DEFAULT_DATE_FORMAT = "yyyy-LL-dd"; export const DEFAULT_DATE_FORMAT = "yyyy-LL-dd";
/**
* Default User-Agent string for HTTP requests made by the crawler.
* Some websites may block requests with missing or generic User-Agent headers.
*/
export const DEFAULT_USER_AGENT = "Basango/0.1 (+https://github.com/bernard-ng/basango)"; export const DEFAULT_USER_AGENT = "Basango/0.1 (+https://github.com/bernard-ng/basango)";
/**
* User-Agent string used for Open Graph requests.
* Some services require a specific User-Agent to return Open Graph data.
*/
export const OPEN_GRAPH_USER_AGENT = "facebookexternalhit/1.1"; export const OPEN_GRAPH_USER_AGENT = "facebookexternalhit/1.1";
/**
* HTTP status codes considered transient errors.
* Used for retry logic in HTTP clients.
*/
export const TRANSIENT_HTTP_STATUSES = [429, 500, 502, 503, 504]; export const TRANSIENT_HTTP_STATUSES = [429, 500, 502, 503, 504];
/**
* Default header name for Retry-After responses.
* Used when handling rate limiting.
*/
export const DEFAULT_RETRY_AFTER_HEADER = "retry-after"; export const DEFAULT_RETRY_AFTER_HEADER = "retry-after";
+39
View File
@@ -0,0 +1,39 @@
/**
 * Signals that an article could not be accepted for processing,
 * for example because a required piece of data is missing or malformed.
 */
export class InvalidArticleError extends Error {
  /** Stable identifier exposed through `error.name`. */
  override name = "InvalidArticleError";

  /**
   * @param message - Human-readable description of what made the article invalid.
   */
  constructor(message: string) {
    super(message);
  }
}
/**
 * Signals that the crawler was asked to handle a source whose kind
 * it does not support.
 */
export class UnsupportedSourceKindError extends Error {
  /** Stable identifier exposed through `error.name`. */
  override name = "UnsupportedSourceKindError";

  /**
   * @param message - Human-readable description of the unsupported source kind.
   */
  constructor(message: string) {
    super(message);
  }
}
/**
 * Signals that the selectors configured for a source are missing
 * or otherwise unusable.
 */
export class InvalidSourceSelectorsError extends Error {
  /** Stable identifier exposed through `error.name`. */
  override name = "InvalidSourceSelectorsError";

  /**
   * @param message - Human-readable description of the selector problem.
   */
  constructor(message: string) {
    super(message);
  }
}
/**
 * Error thrown when an article's publication date is outside the specified date range.
 *
 * The context object passed by the thrower is kept on {@link ArticleOutOfDateRangeError.meta}
 * instead of being discarded, so a handler that catches this error can still
 * report which article was rejected.
 */
export class ArticleOutOfDateRangeError extends Error {
  /** Context describing the rejected article, exactly as supplied to the constructor. */
  readonly meta: Record<string, unknown>;

  /**
   * @param message - Human-readable description of the rejection.
   * @param meta - Arbitrary context about the rejected article (for example its link,
   *   timestamp and title); stored as-is on `meta`.
   */
  constructor(message: string, meta: Record<string, unknown>) {
    super(message);
    this.name = "ArticleOutOfDateRangeError";
    // Previously this argument was accepted and silently dropped.
    this.meta = meta;
  }
}
+1
View File
@@ -1,4 +1,5 @@
import { setTimeout as delay } from "node:timers/promises"; import { setTimeout as delay } from "node:timers/promises";
import { FetchClientConfig } from "@/config"; import { FetchClientConfig } from "@/config";
import { import {
DEFAULT_RETRY_AFTER_HEADER, DEFAULT_RETRY_AFTER_HEADER,
+1
View File
@@ -1,4 +1,5 @@
import { parse } from "node-html-parser"; import { parse } from "node-html-parser";
import { config } from "@/config"; import { config } from "@/config";
import { OPEN_GRAPH_USER_AGENT } from "@/constants"; import { OPEN_GRAPH_USER_AGENT } from "@/constants";
import { SyncHttpClient } from "@/http/http-client"; import { SyncHttpClient } from "@/http/http-client";
+23 -16
View File
@@ -1,17 +1,17 @@
import { logger } from "@basango/logger"; import { logger } from "@basango/logger";
import { config, env } from "@/config"; import { config, env } from "@/config";
import { UnsupportedSourceKindError } from "@/errors";
import { SyncHttpClient } from "@/http/http-client"; import { SyncHttpClient } from "@/http/http-client";
import { createQueueManager, QueueManager } from "@/process/async/queue"; import { QueueManager, createQueueManager } from "@/process/async/queue";
import { import {
DetailsTaskPayload, DetailsTaskPayload,
ListingTaskPayload, ListingTaskPayload,
ProcessingTaskPayload, ProcessingTaskPayload,
} from "@/process/async/schemas"; } from "@/process/async/schemas";
import { resolveCrawlerConfig } from "@/process/crawler"; import { createPersistors, resolveCrawlerConfig } from "@/process/crawler";
import { HtmlCrawler } from "@/process/parsers/html"; import { HtmlCrawler } from "@/process/parsers/html";
import { WordPressCrawler } from "@/process/parsers/wordpress"; import { WordPressCrawler } from "@/process/parsers/wordpress";
import { JsonlPersistor } from "@/process/persistence";
import { Article, HtmlSourceConfig, SourceKindSchema, WordPressSourceConfig } from "@/schema"; import { Article, HtmlSourceConfig, SourceKindSchema, WordPressSourceConfig } from "@/schema";
import { createDateRange, formatDateRange, formatPageRange, resolveSourceConfig } from "@/utils"; import { createDateRange, formatDateRange, formatPageRange, resolveSourceConfig } from "@/utils";
@@ -30,7 +30,7 @@ export const collectHtmlListing = async (
let queued = 0; let queued = 0;
for (let page = pageRange.start; page <= pageRange.end; page += 1) { for (let page = pageRange.start; page <= pageRange.end; page += 1) {
const target = crawler.buildPageUrl(page) ?? `${source.sourceUrl}`; const target = crawler.buildEndpointUrl(page) ?? `${source.sourceUrl}`;
try { try {
const items = await crawler.fetchLinks(target, source.sourceSelectors.articles); const items = await crawler.fetchLinks(target, source.sourceSelectors.articles);
@@ -69,7 +69,7 @@ export const collectWordPressListing = async (
let queued = 0; let queued = 0;
for (let page = pageRange.start; page <= pageRange.end; page += 1) { for (let page = pageRange.start; page <= pageRange.end; page += 1) {
const url = crawler.postsEndpoint(page); const url = crawler.buildEndpointUrl(page);
try { try {
const entries = await crawler.fetchLinks(url); const entries = await crawler.fetchLinks(url);
@@ -94,7 +94,10 @@ export const collectWordPressListing = async (
return queued; return queued;
}; };
export const collectArticle = async (payload: DetailsTaskPayload): Promise<unknown> => { export const collectArticle = async (
payload: DetailsTaskPayload,
manager: QueueManager = createQueueManager(),
): Promise<unknown> => {
const source = resolveSourceConfig(payload.sourceId); const source = resolveSourceConfig(payload.sourceId);
const settings = resolveCrawlerConfig(source, { const settings = resolveCrawlerConfig(source, {
category: payload.category, category: payload.category,
@@ -102,26 +105,30 @@ export const collectArticle = async (payload: DetailsTaskPayload): Promise<unkno
pageRange: payload.pageRange ? formatPageRange(payload.pageRange) : undefined, pageRange: payload.pageRange ? formatPageRange(payload.pageRange) : undefined,
sourceId: payload.sourceId, sourceId: payload.sourceId,
}); });
const persistors = [ const persistors = createPersistors(source);
new JsonlPersistor({
directory: config.paths.data,
sourceId: String(source.sourceId),
}),
];
if (source.sourceKind === SourceKindSchema.enum.html) { if (source.sourceKind === SourceKindSchema.enum.html) {
if (!payload.url) throw new Error("Missing article url");
const crawler = new HtmlCrawler(settings, { persistors }); const crawler = new HtmlCrawler(settings, { persistors });
const html = await crawler.crawl(payload.url); const html = await crawler.crawl(payload.url);
return await crawler.fetchOne(html, settings.dateRange);
const article = await crawler.fetchOne(html, settings.dateRange);
await manager.enqueueProcessed({
article,
sourceId: payload.sourceId,
} as ProcessingTaskPayload);
} }
if (source.sourceKind === SourceKindSchema.enum.wordpress) { if (source.sourceKind === SourceKindSchema.enum.wordpress) {
const crawler = new WordPressCrawler(settings, { persistors }); const crawler = new WordPressCrawler(settings, { persistors });
return await crawler.fetchOne(payload.data ?? {}, settings.dateRange);
const article = await crawler.fetchOne(payload.data ?? {}, settings.dateRange);
await manager.enqueueProcessed({
article,
sourceId: payload.sourceId,
} as ProcessingTaskPayload);
} }
throw new Error(`Unsupported source kind`); throw new UnsupportedSourceKindError(`Unsupported source kind`);
}; };
export const forwardForProcessing = async (payload: ProcessingTaskPayload): Promise<Article> => { export const forwardForProcessing = async (payload: ProcessingTaskPayload): Promise<Article> => {
+6 -4
View File
@@ -1,7 +1,9 @@
import { randomUUID } from "node:crypto"; import { randomUUID } from "node:crypto";
import { JobsOptions, Queue, QueueOptions } from "bullmq"; import { JobsOptions, Queue, QueueOptions } from "bullmq";
import IORedis from "ioredis"; import IORedis from "ioredis";
import { config, FetchAsyncConfig } from "@/config";
import { FetchAsyncConfig, config } from "@/config";
import { import {
DetailsTaskPayload, DetailsTaskPayload,
DetailsTaskPayloadSchema, DetailsTaskPayloadSchema,
@@ -97,9 +99,9 @@ export const createQueueManager = (options: CreateQueueManagerOptions = {}): Que
return queue.add("forward_for_processing", data); return queue.add("forward_for_processing", data);
}, },
iterQueueNames: () => [ iterQueueNames: () => [
`${settings.prefix}:${settings.queues.listing}`, settings.queues.listing,
`${settings.prefix}:${settings.queues.details}`, settings.queues.details,
`${settings.prefix}:${settings.queues.processing}`, settings.queues.processing,
], ],
queueName: (suffix: string) => `${settings.prefix}:${suffix}`, queueName: (suffix: string) => `${settings.prefix}:${suffix}`,
settings, settings,
@@ -1,4 +1,5 @@
import { z } from "zod"; import { z } from "zod";
import { ArticleSchema, DateRangeSchema, PageRangeSchema } from "@/schema"; import { ArticleSchema, DateRangeSchema, PageRangeSchema } from "@/schema";
export const ListingTaskPayloadSchema = z.object({ export const ListingTaskPayloadSchema = z.object({
+1
View File
@@ -1,4 +1,5 @@
import { logger } from "@basango/logger"; import { logger } from "@basango/logger";
import * as handlers from "@/process/async/handlers"; import * as handlers from "@/process/async/handlers";
import { createQueueManager } from "@/process/async/queue"; import { createQueueManager } from "@/process/async/queue";
import { import {
+5 -1
View File
@@ -45,6 +45,7 @@ export const startWorker = (options: WorkerOptions): WorkerHandle => {
{ {
concurrency: options.concurrency ?? 5, concurrency: options.concurrency ?? 5,
connection, connection,
prefix: manager.settings.prefix,
}, },
); );
@@ -53,7 +54,10 @@ export const startWorker = (options: WorkerOptions): WorkerHandle => {
worker.on("error", (err) => options.onError?.(err as Error)); worker.on("error", (err) => options.onError?.(err as Error));
} }
const queueEvents = new QueueEvents(queueName, { connection }); const queueEvents = new QueueEvents(queueName, {
connection,
prefix: manager.settings.prefix,
});
workers.push(worker); workers.push(worker);
events.push(queueEvents); events.push(queueEvents);
+2 -1
View File
@@ -1,5 +1,6 @@
import logger from "@basango/logger"; import logger from "@basango/logger";
import { config, FetchCrawlerConfig } from "@/config";
import { FetchCrawlerConfig, config } from "@/config";
import { JsonlPersistor, Persistor } from "@/process/persistence"; import { JsonlPersistor, Persistor } from "@/process/persistence";
import { AnySourceConfig } from "@/schema"; import { AnySourceConfig } from "@/schema";
import { createDateRange, createPageRange } from "@/utils"; import { createDateRange, createPageRange } from "@/utils";
+2 -1
View File
@@ -1,5 +1,6 @@
import { HTMLElement, parse as parseHtml } from "node-html-parser"; import { HTMLElement, parse as parseHtml } from "node-html-parser";
import { config, FetchCrawlerConfig } from "@/config";
import { FetchCrawlerConfig, config } from "@/config";
import { SyncHttpClient } from "@/http/http-client"; import { SyncHttpClient } from "@/http/http-client";
import { OpenGraph } from "@/http/open-graph"; import { OpenGraph } from "@/http/open-graph";
import type { Persistor } from "@/process/persistence"; import type { Persistor } from "@/process/persistence";
+56 -54
View File
@@ -2,10 +2,17 @@ import { logger } from "@basango/logger";
import { getUnixTime, isMatch as isDateMatch, parse as parseDateFns } from "date-fns"; import { getUnixTime, isMatch as isDateMatch, parse as parseDateFns } from "date-fns";
import { HTMLElement } from "node-html-parser"; import { HTMLElement } from "node-html-parser";
import TurndownService from "turndown"; import TurndownService from "turndown";
import { FetchCrawlerConfig } from "@/config"; import { FetchCrawlerConfig } from "@/config";
import {
ArticleOutOfDateRangeError,
InvalidArticleError,
InvalidSourceSelectorsError,
UnsupportedSourceKindError,
} from "@/errors";
import { BaseCrawler } from "@/process/parsers/base"; import { BaseCrawler } from "@/process/parsers/base";
import { Persistor, persist } from "@/process/persistence"; import { Persistor, persist } from "@/process/persistence";
import { DateRange, HtmlSourceConfig } from "@/schema"; import { Article, DateRange, HtmlSourceConfig } from "@/schema";
import { createAbsoluteUrl, isTimestampInRange } from "@/utils"; import { createAbsoluteUrl, isTimestampInRange } from "@/utils";
const md = new TurndownService({ const md = new TurndownService({
@@ -32,13 +39,13 @@ const safeRegExp = (pattern?: string | null): RegExp | null => {
*/ */
export class HtmlCrawler extends BaseCrawler { export class HtmlCrawler extends BaseCrawler {
readonly source: HtmlSourceConfig; readonly source: HtmlSourceConfig;
private currentArticleUrl: string | null = null; private currentNode: string | null = null;
constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) { constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
super(settings, options); super(settings, options);
if (!settings.source || settings.source.sourceKind !== "html") { if (!settings.source || settings.source.sourceKind !== "html") {
throw new Error("HtmlCrawler requires a source of kind 'html'"); throw new UnsupportedSourceKindError("HtmlCrawler requires a source of kind 'html'");
} }
this.source = this.settings.source as HtmlSourceConfig; this.source = this.settings.source as HtmlSourceConfig;
} }
@@ -46,69 +53,64 @@ export class HtmlCrawler extends BaseCrawler {
async fetch(): Promise<void> { async fetch(): Promise<void> {
const pageRange = this.settings.pageRange ?? (await this.getPagination()); const pageRange = this.settings.pageRange ?? (await this.getPagination());
const dateRange = this.settings.dateRange; const dateRange = this.settings.dateRange;
const selectors = this.source.sourceSelectors;
const articleSelector = this.source.sourceSelectors.articles; if (!selectors.articles) {
if (!articleSelector) { throw new InvalidSourceSelectorsError("No article selector configured for HTML source");
logger.error(
{ source: this.source.sourceId },
"No article selector configured for HTML source",
);
return;
} }
let stop = false;
for (let page = pageRange.start; page <= pageRange.end; page += 1) { for (let page = pageRange.start; page <= pageRange.end; page += 1) {
const pageUrl = this.buildPageUrl(page); const endpoint = this.buildEndpointUrl(page);
let html: string; let html: string;
try { try {
html = await this.crawl(pageUrl); html = await this.crawl(endpoint);
} catch (error) { } catch (error) {
logger.error({ error, page, pageUrl }, "> page %s => [failed]", page); logger.error({ endpoint, error, page }, `Failed to crawl page ${page}`);
continue; continue;
} }
const root = this.parseHtml(html); const root = this.parseHtml(html);
const articles = this.extractAll(root, articleSelector); const articles = this.extractAll(root, selectors.articles);
if (!articles.length) { if (!articles.length) {
logger.info({ page }, "No articles found on page"); logger.error({ page }, "No articles found on page");
continue; continue;
} }
for (const node of articles) { for (const node of articles) {
try { try {
this.currentArticleUrl = this.extractLink(node); this.currentNode = this.extractLink(node);
let targetHtml = node.toString(); let nodeHtml = node.toString();
if (this.source.requiresDetails) { if (this.source.requiresDetails) {
if (!this.currentArticleUrl) { if (!this.currentNode) {
logger.debug({ page }, "Skipping article without link for details"); logger.error({ page }, "Skipping article without link for details");
continue; continue;
} }
try { try {
targetHtml = await this.crawl(this.currentArticleUrl); nodeHtml = await this.crawl(this.currentNode);
} catch (err) { } catch (error) {
logger.error( logger.error({ error, url: this.currentNode }, "Failed to fetch detail page");
{ error: err, url: this.currentArticleUrl },
"Failed to fetch detail page",
);
continue; continue;
} }
} }
const saved = await this.fetchOne(targetHtml, dateRange); await this.fetchOne(nodeHtml, dateRange);
// stop early on first out-of-range if pages are sorted by date desc } catch (error: unknown) {
if (saved === null) { if (error instanceof ArticleOutOfDateRangeError) {
stop = true; logger.info(
{ url: this.currentNode },
"Article out of date range, stopping further processing",
);
break; break;
} }
} catch (error) {
logger.error({ error, pageUrl }, "Failed to process article on page"); logger.error({ error, url: this.currentNode }, "Failed to process HTML article");
} finally { } finally {
this.currentArticleUrl = null; this.currentNode = null;
} }
} }
if (stop) break;
} }
} }
@@ -117,43 +119,43 @@ export class HtmlCrawler extends BaseCrawler {
* @param html - The HTML content of the article * @param html - The HTML content of the article
* @param dateRange - Optional date range for filtering * @param dateRange - Optional date range for filtering
*/ */
async fetchOne(html: string, dateRange?: DateRange | null) { async fetchOne(html: string, dateRange?: DateRange | null): Promise<Article> {
const root = this.parseHtml(html); const root = this.parseHtml(html);
const sel = this.source.sourceSelectors; const selectors = this.source.sourceSelectors;
const titleText = this.extractText(root, sel.articleTitle) ?? "Untitled"; const title = this.extractText(root, selectors.articleTitle) ?? "Untitled";
const link = this.currentArticleUrl ?? this.extractLink(root); const link = this.currentNode ?? this.extractLink(root);
if (!link) { if (!link) {
logger.warn({ title: titleText }, "Skipping article without link"); throw new InvalidArticleError("Missing article link");
return null;
} }
const body = this.extractBody(root, sel.articleBody); const body = this.extractBody(root, selectors.articleBody);
const categories = this.extractCategories(root, sel.articleCategories); const categories = this.extractCategories(root, selectors.articleCategories);
const rawDate = this.extractText(root, sel.articleDate); const date = this.extractText(root, selectors.articleDate);
const timestamp = this.computeTimestamp(rawDate); const timestamp = this.computeTimestamp(date);
if (dateRange && !isTimestampInRange(dateRange, timestamp)) { if (dateRange && !isTimestampInRange(dateRange, timestamp)) {
logger.info( throw new ArticleOutOfDateRangeError("Article outside date range", {
{ date: rawDate, link, timestamp, title: titleText }, date,
"Skipping article outside date range", link,
); timestamp,
return null; title,
});
} }
const enriched = await this.enrichWithOpenGraph( const data = await this.enrichWithOpenGraph(
{ {
body, body,
categories, categories,
link, link,
source: this.source.sourceId, source: this.source.sourceId,
timestamp, timestamp,
title: titleText, title,
}, },
link, link,
); );
return await persist(enriched, this.persistors); return await persist(data, this.persistors);
} }
/** /**
@@ -211,7 +213,7 @@ export class HtmlCrawler extends BaseCrawler {
* Build the URL for a given page number. * Build the URL for a given page number.
* @param page - The page number * @param page - The page number
*/ */
buildPageUrl(page: number): string { buildEndpointUrl(page: number): string {
let template = this.applyCategory(this.source.paginationTemplate); let template = this.applyCategory(this.source.paginationTemplate);
if (template.includes("{page}")) { if (template.includes("{page}")) {
template = template.replace("{page}", String(page)); template = template.replace("{page}", String(page));
+43 -38
View File
@@ -1,9 +1,16 @@
import { logger } from "@basango/logger"; import { logger } from "@basango/logger";
import TurndownService from "turndown"; import TurndownService from "turndown";
import { FetchCrawlerConfig } from "@/config"; import { FetchCrawlerConfig } from "@/config";
import {
ArticleOutOfDateRangeError,
InvalidArticleError,
UnsupportedSourceKindError,
} from "@/errors";
import { BaseCrawler } from "@/process/parsers/base"; import { BaseCrawler } from "@/process/parsers/base";
import { Persistor, persist } from "@/process/persistence"; import { Persistor, persist } from "@/process/persistence";
import { DateRange, PageRange, WordPressSourceConfig } from "@/schema"; import { Article, DateRange, PageRange, WordPressSourceConfig } from "@/schema";
import { isTimestampInRange } from "@/utils";
const md = new TurndownService({ const md = new TurndownService({
bulletListMarker: "-", bulletListMarker: "-",
@@ -38,7 +45,9 @@ export class WordPressCrawler extends BaseCrawler {
super(settings, options); super(settings, options);
if (!settings.source || settings.source.sourceKind !== "wordpress") { if (!settings.source || settings.source.sourceKind !== "wordpress") {
throw new Error("HtmlCrawler requires a source of kind 'wordpress'"); throw new UnsupportedSourceKindError(
"WordPressCrawler requires a source of kind 'wordpress'",
);
} }
this.source = this.settings.source as WordPressSourceConfig; this.source = this.settings.source as WordPressSourceConfig;
} }
@@ -50,29 +59,31 @@ export class WordPressCrawler extends BaseCrawler {
const pageRange = this.settings.pageRange ?? (await this.getPagination()); const pageRange = this.settings.pageRange ?? (await this.getPagination());
const dateRange = this.settings.dateRange; const dateRange = this.settings.dateRange;
let stop = false;
for (let page = pageRange.start; page <= pageRange.end; page += 1) { for (let page = pageRange.start; page <= pageRange.end; page += 1) {
const endpoint = this.postsEndpoint(page); const endpoint = this.buildEndpointUrl(page);
try { try {
const response = await this.http.get(endpoint); const response = await this.http.get(endpoint);
const data = (await response.json()) as unknown; const articles = (await response.json()) as WordPressPost[];
const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
if (!Array.isArray(data)) {
logger.warn({ page, type: typeof data }, "Unexpected WordPress payload type");
}
for (const entry of articles) { for (const node of articles) {
const saved = await this.fetchOne(entry, dateRange); try {
if (saved === null) { await this.fetchOne(node, dateRange);
stop = true; } catch (error: unknown) {
break; if (error instanceof ArticleOutOfDateRangeError) {
logger.info(
{ url: node.link },
"Article out of date range, stopping further processing",
);
break;
}
logger.error({ error, url: node.link }, "Failed to process WordPress article");
} }
} }
} catch (error) { } catch (error) {
logger.error({ error, page }, "> page %s => [failed]", page); logger.error({ error, page }, `Failed to fetch WordPress page ${page}`);
continue;
} }
if (stop) break;
} }
} }
@@ -95,7 +106,7 @@ export class WordPressCrawler extends BaseCrawler {
* @param input - Decoded JSON object or raw JSON string * @param input - Decoded JSON object or raw JSON string
* @param dateRange - Optional date range for filtering * @param dateRange - Optional date range for filtering
*/ */
async fetchOne(input: unknown, dateRange?: DateRange | null) { async fetchOne(input: unknown, dateRange?: DateRange | null): Promise<Article> {
// input can be the decoded JSON object or a raw JSON string // input can be the decoded JSON object or a raw JSON string
let data: WordPressPost | null = null; let data: WordPressPost | null = null;
try { try {
@@ -110,35 +121,29 @@ export class WordPressCrawler extends BaseCrawler {
} }
if (!data || typeof data !== "object") { if (!data || typeof data !== "object") {
throw new Error("Unexpected WordPress payload type"); throw new InvalidArticleError("Unexpected WordPress payload type");
} }
const link = data.link; const link = data.link;
if (!link) { if (!link) {
logger.error("Skipping WordPress article without link"); throw new InvalidArticleError("Missing article link");
return null;
} }
const titleHtml = data.title?.rendered ?? ""; const title =
const bodyHtml = data.content?.rendered ?? ""; this.textContent(this.parseHtml(data.title?.rendered ?? "")) ?? data.slug ?? "Untitled";
const title = this.textContent(this.parseHtml(titleHtml)) ?? data.slug ?? "Untitled"; const body = md.turndown(data.content?.rendered ?? "");
const body = md.turndown(bodyHtml);
const timestamp = this.computeTimestamp(data.date); const timestamp = this.computeTimestamp(data.date);
const categories = await this.mapCategories(data.categories ?? []); const categories = await this.mapCategories(data.categories ?? []);
// date range skip as in HTML crawler if (dateRange && !isTimestampInRange(dateRange, timestamp)) {
if (dateRange) { throw new ArticleOutOfDateRangeError("Article outside date range", {
const { isTimestampInRange } = await import("@/utils"); link,
if (!isTimestampInRange(dateRange, timestamp)) { timestamp,
logger.info( title,
{ date: data.date, link, timestamp, title }, });
"Skipping article outside date range",
);
return null;
}
} }
const enriched = await this.enrichWithOpenGraph( const article = await this.enrichWithOpenGraph(
{ {
body, body,
categories, categories,
@@ -150,7 +155,7 @@ export class WordPressCrawler extends BaseCrawler {
link, link,
); );
return await persist(enriched, this.persistors); return await persist(article, this.persistors);
} }
/** /**
@@ -188,7 +193,7 @@ export class WordPressCrawler extends BaseCrawler {
* Construct posts endpoint URL for a given page. * Construct posts endpoint URL for a given page.
* @param page - Page number * @param page - Page number
*/ */
postsEndpoint(page: number): string { buildEndpointUrl(page: number): string {
return `${this.baseUrl()}wp-json/wp/v2/posts?${WordPressCrawler.POST_QUERY}&page=${page}&per_page=100`; return `${this.baseUrl()}wp-json/wp/v2/posts?${WordPressCrawler.POST_QUERY}&page=${page}&per_page=100`;
} }
+2
View File
@@ -1,6 +1,8 @@
import fs from "node:fs"; import fs from "node:fs";
import path from "node:path"; import path from "node:path";
import logger from "@basango/logger"; import logger from "@basango/logger";
import { Article } from "@/schema"; import { Article } from "@/schema";
import { countTokens } from "@/utils"; import { countTokens } from "@/utils";
+1
View File
@@ -1,4 +1,5 @@
import logger from "@basango/logger"; import logger from "@basango/logger";
import { import {
CrawlingOptions, CrawlingOptions,
closePersistors, closePersistors,
+1
View File
@@ -1,4 +1,5 @@
import { logger } from "@basango/logger"; import { logger } from "@basango/logger";
import { runSyncCrawl } from "@/process/sync/tasks"; import { runSyncCrawl } from "@/process/sync/tasks";
import { CRAWLING_USAGE, parseCrawlingCliArgs } from "@/scripts/utils"; import { CRAWLING_USAGE, parseCrawlingCliArgs } from "@/scripts/utils";
+1
View File
@@ -1,4 +1,5 @@
import { logger } from "@basango/logger"; import { logger } from "@basango/logger";
import { scheduleAsyncCrawl } from "@/process/async/tasks"; import { scheduleAsyncCrawl } from "@/process/async/tasks";
import { CRAWLING_USAGE, parseCrawlingCliArgs } from "@/scripts/utils"; import { CRAWLING_USAGE, parseCrawlingCliArgs } from "@/scripts/utils";
+2 -2
View File
@@ -1,4 +1,5 @@
import { parseArgs } from "node:util"; import { parseArgs } from "node:util";
import { CrawlingOptions } from "@/process/crawler"; import { CrawlingOptions } from "@/process/crawler";
interface WorkerCliOptions { interface WorkerCliOptions {
@@ -6,13 +7,12 @@ interface WorkerCliOptions {
} }
export const CRAWLING_USAGE = ` export const CRAWLING_USAGE = `
Usage: bun run crawl:[async|sync] -- --sourceId <id> [options] Usage: bun run crawler:[async|sync] -- --sourceId <id> [options]
Options: Options:
--pageRange <range> Optional page range filter (e.g. 1:5) --pageRange <range> Optional page range filter (e.g. 1:5)
--dateRange <range> Optional date range filter (e.g. 2024-01-01:2024-01-31) --dateRange <range> Optional date range filter (e.g. 2024-01-01:2024-01-31)
--category <slug> Optional category to crawl --category <slug> Optional category to crawl
-h, --help Show this message
`; `;
export const parseWorkerCliArgs = (): WorkerCliOptions => { export const parseWorkerCliArgs = (): WorkerCliOptions => {
+1 -3
View File
@@ -8,9 +8,7 @@ const main = async (): Promise<void> => {
const options = parseWorkerCliArgs(); const options = parseWorkerCliArgs();
const manager = createQueueManager(); const manager = createQueueManager();
const queues = options.queue?.length const queues = options.queue?.length ? options.queue : undefined;
? options.queue.map((name) => manager.queueName(name))
: undefined;
const handle = startWorker({ const handle = startWorker({
queueManager: manager, queueManager: manager,
+2 -1
View File
@@ -1,6 +1,7 @@
import { format, getUnixTime, isMatch, parse } from "date-fns"; import { format, getUnixTime, isMatch, parse } from "date-fns";
import type { RedisOptions } from "ioredis"; import type { RedisOptions } from "ioredis";
import { get_encoding, TiktokenEncoding } from "tiktoken"; import { TiktokenEncoding, get_encoding } from "tiktoken";
import { config } from "@/config"; import { config } from "@/config";
import { DEFAULT_DATE_FORMAT } from "@/constants"; import { DEFAULT_DATE_FORMAT } from "@/constants";
import { import {

Before

Width:  |  Height:  |  Size: 25 KiB

After

Width:  |  Height:  |  Size: 25 KiB

+3 -11
View File
@@ -1,8 +1,6 @@
{ {
"commitlint": { "commitlint": {
"extends": [ "extends": ["@commitlint/config-conventional"]
"@commitlint/config-conventional"
]
}, },
"config": { "config": {
"commitizen": { "commitizen": {
@@ -85,14 +83,8 @@
"preset": "jest-expo" "preset": "jest-expo"
}, },
"lint-staged": { "lint-staged": {
"*.ts": [ "*.ts": ["prettier --write", "eslint --fix"],
"prettier --write", "*.tsx": ["prettier --write", "eslint --fix"]
"eslint --fix"
],
"*.tsx": [
"prettier --write",
"eslint --fix"
]
}, },
"main": "expo-router/entry", "main": "expo-router/entry",
"name": "drc-news", "name": "drc-news",
@@ -1,8 +1,8 @@
import { endpoint } from "@/api/endpoint"; import { endpoint } from "@/api/endpoint";
import { import {
Bookmark, Bookmark,
BookmarkedArticle,
BookmarkPayload, BookmarkPayload,
BookmarkedArticle,
} from "@/api/schema/feed-management/bookmark"; } from "@/api/schema/feed-management/bookmark";
import { import {
ArticleFilters, ArticleFilters,
@@ -1,4 +1,4 @@
import { formatDistanceToNowStrict, Locale } from "date-fns"; import { Locale, formatDistanceToNowStrict } from "date-fns";
import { fr } from "date-fns/locale"; import { fr } from "date-fns/locale";
import { useEffect, useState } from "react"; import { useEffect, useState } from "react";
@@ -1,5 +1,4 @@
import type React from "react"; import type React from "react";
import { GestureHandlerRootView } from "react-native-gesture-handler"; import { GestureHandlerRootView } from "react-native-gesture-handler";
import { SafeAreaProvider } from "react-native-safe-area-context"; import { SafeAreaProvider } from "react-native-safe-area-context";
@@ -1,5 +1,4 @@
import type React from "react"; import type React from "react";
import { TamaguiProvider } from "tamagui"; import { TamaguiProvider } from "tamagui";
import { config } from "~/tamagui.config"; import { config } from "~/tamagui.config";
@@ -1,5 +1,4 @@
import React, { useCallback } from "react"; import React, { useCallback } from "react";
import { ActivityIndicator, Dimensions, FlatList, FlatListProps } from "react-native"; import { ActivityIndicator, Dimensions, FlatList, FlatListProps } from "react-native";
import { View, XStack, YStack } from "tamagui"; import { View, XStack, YStack } from "tamagui";
@@ -1,5 +1,4 @@
import { useCallback } from "react"; import { useCallback } from "react";
import ContentLoader, { Circle, Rect } from "react-content-loader/native"; import ContentLoader, { Circle, Rect } from "react-content-loader/native";
import { Dimensions, FlatList } from "react-native"; import { Dimensions, FlatList } from "react-native";
import { View } from "tamagui"; import { View } from "tamagui";
@@ -1,5 +1,4 @@
import React, { useCallback } from "react"; import React, { useCallback } from "react";
import { ActivityIndicator, FlatList, FlatListProps } from "react-native"; import { ActivityIndicator, FlatList, FlatListProps } from "react-native";
import { YStack } from "tamagui"; import { YStack } from "tamagui";
@@ -1,5 +1,4 @@
import { useCallback, useState } from "react"; import { useCallback, useState } from "react";
import { ActivityIndicator, Alert } from "react-native"; import { ActivityIndicator, Alert } from "react-native";
import { Button, GetProps } from "tamagui"; import { Button, GetProps } from "tamagui";
@@ -1,5 +1,4 @@
import React, { useCallback } from "react"; import React, { useCallback } from "react";
import { FlatList, FlatListProps } from "react-native"; import { FlatList, FlatListProps } from "react-native";
import { Paragraph, XStack, YStack } from "tamagui"; import { Paragraph, XStack, YStack } from "tamagui";
@@ -1,5 +1,5 @@
import { Link } from "expo-router"; import { Link } from "expo-router";
import { GetProps, styled, XStack, YStack } from "tamagui"; import { GetProps, XStack, YStack, styled } from "tamagui";
import { SourceOverview } from "@/api/schema/feed-management/source"; import { SourceOverview } from "@/api/schema/feed-management/source";
import { SourceFollowButton } from "@/ui/components/content/source/SourceFollowButton"; import { SourceFollowButton } from "@/ui/components/content/source/SourceFollowButton";
@@ -1,5 +1,4 @@
import { useCallback } from "react"; import { useCallback } from "react";
import ContentLoader, { Circle, Rect } from "react-content-loader/native"; import ContentLoader, { Circle, Rect } from "react-content-loader/native";
import { FlatList } from "react-native"; import { FlatList } from "react-native";
import { YStack } from "tamagui"; import { YStack } from "tamagui";
@@ -5,10 +5,10 @@ import {
GetProps, GetProps,
Label, Label,
SizeTokens, SizeTokens,
styled,
Input as TamaguiInput, Input as TamaguiInput,
XStack, XStack,
YStack, YStack,
styled,
} from "tamagui"; } from "tamagui";
import { Caption } from "@/ui/components/typography"; import { Caption } from "@/ui/components/typography";
@@ -1,4 +1,4 @@
import { GetProps, Label, styled, TextArea as TamaguiTextArea, XStack, YStack } from "tamagui"; import { GetProps, Label, TextArea as TamaguiTextArea, XStack, YStack, styled } from "tamagui";
import { withController } from "@/ui/components/controls/forms/withController"; import { withController } from "@/ui/components/controls/forms/withController";
import { Caption } from "@/ui/components/typography"; import { Caption } from "@/ui/components/typography";
@@ -1,5 +1,4 @@
import React from "react"; import React from "react";
import { Controller, ControllerProps } from "react-hook-form"; import { Controller, ControllerProps } from "react-hook-form";
type WithControllerProps = { type WithControllerProps = {
@@ -1,6 +1,5 @@
import React from "react"; import React from "react";
import { View, XStack, styled } from "tamagui";
import { styled, View, XStack } from "tamagui";
import { Text } from "@/ui/components/typography"; import { Text } from "@/ui/components/typography";
@@ -1,6 +1,6 @@
import { ArrowRight } from "@tamagui/lucide-icons"; import { ArrowRight } from "@tamagui/lucide-icons";
import { Href, Link } from "expo-router"; import { Href, Link } from "expo-router";
import { GetProps, Paragraph, styled, XStack } from "tamagui"; import { GetProps, Paragraph, XStack, styled } from "tamagui";
import { Text } from "@/ui/components/typography"; import { Text } from "@/ui/components/typography";
@@ -1,7 +1,7 @@
import { StatusBar } from "expo-status-bar"; import { StatusBar } from "expo-status-bar";
import React from "react"; import React from "react";
import { useSafeAreaInsets } from "react-native-safe-area-context"; import { useSafeAreaInsets } from "react-native-safe-area-context";
import { styled, YStack } from "tamagui"; import { YStack, styled } from "tamagui";
import { ScreenHeading } from "@/ui/components/layout/ScreenHeading"; import { ScreenHeading } from "@/ui/components/layout/ScreenHeading";
import { ScreenSection } from "@/ui/components/layout/ScreenSection"; import { ScreenSection } from "@/ui/components/layout/ScreenSection";
@@ -1,5 +1,4 @@
import type React from "react"; import type React from "react";
import { Paragraph, ParagraphProps } from "tamagui"; import { Paragraph, ParagraphProps } from "tamagui";
export const Caption = (props: React.PropsWithChildren<ParagraphProps>) => { export const Caption = (props: React.PropsWithChildren<ParagraphProps>) => {
@@ -1,5 +1,4 @@
import type React from "react"; import type React from "react";
import { H2, ParagraphProps } from "tamagui"; import { H2, ParagraphProps } from "tamagui";
export const Display = (props: React.PropsWithChildren<ParagraphProps>) => { export const Display = (props: React.PropsWithChildren<ParagraphProps>) => {
@@ -1,5 +1,4 @@
import type React from "react"; import type React from "react";
import { H4, ParagraphProps } from "tamagui"; import { H4, ParagraphProps } from "tamagui";
export const Heading = (props: React.PropsWithChildren<ParagraphProps>) => { export const Heading = (props: React.PropsWithChildren<ParagraphProps>) => {
@@ -1,5 +1,4 @@
import type React from "react"; import type React from "react";
import { Paragraph, ParagraphProps } from "tamagui"; import { Paragraph, ParagraphProps } from "tamagui";
export const Text = (props: React.PropsWithChildren<ParagraphProps>) => { export const Text = (props: React.PropsWithChildren<ParagraphProps>) => {
+1 -1
View File
@@ -39,7 +39,7 @@
}, },
"files": { "files": {
"ignoreUnknown": true, "ignoreUnknown": true,
"includes": ["**/apps", "**/packages", "!/apps/api-legacy", "!/apps/mobile-legacy"] "includes": ["**", "!apps/mobile-legacy", "!apps/api-legacy"]
}, },
"formatter": { "formatter": {
"enabled": true, "enabled": true,
+2 -1
View File
@@ -1,5 +1,6 @@
import { drizzle } from "drizzle-orm/node-postgres"; import { drizzle } from "drizzle-orm/node-postgres";
import { Pool } from "pg"; import { Pool } from "pg";
import * as schema from "@/schema"; import * as schema from "@/schema";
const isDevelopment = process.env.NODE_ENV === "development"; const isDevelopment = process.env.NODE_ENV === "development";
@@ -21,7 +22,7 @@ const pool = new Pool({
export const getConnectionPoolStats = () => { export const getConnectionPoolStats = () => {
const stats = { const stats = {
active: Math.max(0, (pool.totalCount ?? 0) - (pool.idleCount ?? 0)), active: Math.max(0, (pool.totalCount ?? 0) - (pool.idleCount ?? 0)),
ended: (pool as any).ended ?? false, ended: pool.ended ?? false,
idle: pool.idleCount ?? 0, idle: pool.idleCount ?? 0,
name: "primary", name: "primary",
total: pool.options.max ?? 0, total: pool.options.max ?? 0,
+3 -3
View File
@@ -4,13 +4,13 @@ import { and, asc, desc, eq, gt, lt, or, sql } from "drizzle-orm";
import type { Database } from "@/client"; import type { Database } from "@/client";
import { articles, bookmarkArticles, bookmarks, comments, sources, users } from "@/schema"; import { articles, bookmarkArticles, bookmarks, comments, sources, users } from "@/schema";
import { import {
buildPaginationResult,
createPageState,
decodeCursor,
type PageRequest, type PageRequest,
type PageState, type PageState,
type PaginationMeta, type PaginationMeta,
type SortDirection, type SortDirection,
buildPaginationResult,
createPageState,
decodeCursor,
} from "@/utils/pagination"; } from "@/utils/pagination";
export interface ArticleFilters { export interface ArticleFilters {
+2 -2
View File
@@ -4,11 +4,11 @@ import { and, desc, eq, lt, sql } from "drizzle-orm";
import type { Database } from "@/client"; import type { Database } from "@/client";
import { bookmarkArticles, bookmarks } from "@/schema"; import { bookmarkArticles, bookmarks } from "@/schema";
import { import {
type PageRequest,
type PaginationMeta,
buildPaginationResult, buildPaginationResult,
createPageState, createPageState,
decodeCursor, decodeCursor,
type PageRequest,
type PaginationMeta,
} from "@/utils/pagination"; } from "@/utils/pagination";
export interface BookmarkRow { export interface BookmarkRow {
+2 -2
View File
@@ -5,11 +5,11 @@ import type { Database } from "@/client";
import { PUBLICATION_GRAPH_DAYS, SOURCE_IMAGE_BASE } from "@/constant"; import { PUBLICATION_GRAPH_DAYS, SOURCE_IMAGE_BASE } from "@/constant";
import { articles, followedSources, sources } from "@/schema"; import { articles, followedSources, sources } from "@/schema";
import { import {
type PageRequest,
type PaginationMeta,
buildPaginationResult, buildPaginationResult,
createPageState, createPageState,
decodeCursor, decodeCursor,
type PageRequest,
type PaginationMeta,
} from "@/utils/pagination"; } from "@/utils/pagination";
export interface SourceOverviewRow { export interface SourceOverviewRow {
+1
View File
@@ -1,4 +1,5 @@
import { sql } from "drizzle-orm"; import { sql } from "drizzle-orm";
import { db } from "@/client"; import { db } from "@/client";
export async function checkHealth() { export async function checkHealth() {
+4 -3
View File
@@ -7,14 +7,15 @@
"incremental": false, "incremental": false,
"isolatedModules": true, "isolatedModules": true,
"lib": ["es2022", "DOM", "DOM.Iterable"], "lib": ["es2022", "DOM", "DOM.Iterable"],
"module": "NodeNext", "module": "ESNext",
"moduleDetection": "force", "moduleDetection": "force",
"moduleResolution": "NodeNext", "moduleResolution": "Bundler",
"noUncheckedIndexedAccess": true, "noUncheckedIndexedAccess": true,
"resolveJsonModule": true, "resolveJsonModule": true,
"skipLibCheck": true, "skipLibCheck": true,
"strict": true, "strict": true,
"target": "ES2022" "target": "ES2022",
"verbatimModuleSyntax": false
}, },
"display": "Default" "display": "Default"
} }
+1 -1
View File
@@ -1,6 +1,6 @@
import { cn } from "@basango/ui/lib/utils"; import { cn } from "@basango/ui/lib/utils";
import { Slot } from "@radix-ui/react-slot"; import { Slot } from "@radix-ui/react-slot";
import { cva, type VariantProps } from "class-variance-authority"; import { type VariantProps, cva } from "class-variance-authority";
import * as React from "react"; import * as React from "react";
const buttonVariants = cva( const buttonVariants = cva(