fix(crawler): async queue prefixing and crawl error handling
@@ -9,8 +9,8 @@
     "editor.defaultFormatter": "biomejs.biome"
   },
   "editor.codeActionsOnSave": {
-    "source.organizeImports.biome": "explicit",
-    "source.fixAll.biome": "explicit"
+    "source.fixAll.biome": "explicit",
+    "source.organizeImports.biome": "explicit"
   },
   "editor.defaultFormatter": "biomejs.biome",
   "editor.formatOnSave": true,
@@ -2,6 +2,7 @@ import path from "node:path";
 
 import { loadConfig as defineConfig } from "@devscast/config";
 import { z } from "zod";
+
 import {
   DateRangeSchema,
   HtmlSourceConfigSchema,
@@ -1,6 +1,29 @@
 /**
  * Default date format used for parsing and formatting dates.
  * Follows the "yyyy-LL-dd" pattern (e.g., "2024-06-15").
  */
 export const DEFAULT_DATE_FORMAT = "yyyy-LL-dd";
 
+/**
+ * Default User-Agent string for HTTP requests made by the crawler.
+ * Some websites may block requests with missing or generic User-Agent headers.
+ */
+export const DEFAULT_USER_AGENT = "Basango/0.1 (+https://github.com/bernard-ng/basango)";
+
+/**
+ * User-Agent string used for Open Graph requests.
+ * Some services require a specific User-Agent to return Open Graph data.
+ */
+export const OPEN_GRAPH_USER_AGENT = "facebookexternalhit/1.1";
+
+/**
+ * HTTP status codes considered transient errors.
+ * Used for retry logic in HTTP clients.
+ */
+export const TRANSIENT_HTTP_STATUSES = [429, 500, 502, 503, 504];
+
+/**
+ * Default header name for Retry-After responses.
+ * Used when handling rate limiting.
+ */
+export const DEFAULT_RETRY_AFTER_HEADER = "retry-after";
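A minimal sketch of how these two constants are meant to be consumed by a retry layer, assuming a plain fetch-based client (the helper name and the backoff policy are illustrative, not the crawler's actual implementation):

import { setTimeout as delay } from "node:timers/promises";

import { DEFAULT_RETRY_AFTER_HEADER, TRANSIENT_HTTP_STATUSES } from "@/constants";

// Hypothetical helper: retry while the server answers with a transient
// status, honouring Retry-After (in seconds) when the header is present,
// otherwise falling back to exponential backoff.
export const fetchWithRetry = async (url: string, attempts = 3): Promise<Response> => {
  let response = await fetch(url);
  for (let attempt = 1; attempt < attempts; attempt += 1) {
    if (!TRANSIENT_HTTP_STATUSES.includes(response.status)) break;
    const retryAfter = Number(response.headers.get(DEFAULT_RETRY_AFTER_HEADER) ?? "0");
    await delay(retryAfter > 0 ? retryAfter * 1000 : 2 ** attempt * 250);
    response = await fetch(url);
  }
  return response;
};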
@@ -0,0 +1,39 @@
+/**
+ * Error thrown when an article is invalid or cannot be processed.
+ */
+export class InvalidArticleError extends Error {
+  constructor(message: string) {
+    super(message);
+    this.name = "InvalidArticleError";
+  }
+}
+
+/**
+ * Error thrown when a source kind is not supported by the crawler.
+ */
+export class UnsupportedSourceKindError extends Error {
+  constructor(message: string) {
+    super(message);
+    this.name = "UnsupportedSourceKindError";
+  }
+}
+
+/**
+ * Error thrown when a source's selectors are invalid or missing.
+ */
+export class InvalidSourceSelectorsError extends Error {
+  constructor(message: string) {
+    super(message);
+    this.name = "InvalidSourceSelectorsError";
+  }
+}
+
+/**
+ * Error thrown when an article's publication date is outside the specified date range.
+ */
+export class ArticleOutOfDateRangeError extends Error {
+  constructor(message: string, _meta: Record<string, unknown>) {
+    super(message);
+    this.name = "ArticleOutOfDateRangeError";
+  }
+}
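What these dedicated classes buy over null returns is that callers can branch on the failure mode. A minimal sketch of the intended pattern (this caller is illustrative; the real call sites appear in the parser hunks further down):

import { ArticleOutOfDateRangeError, InvalidArticleError } from "@/errors";

// Hypothetical wrapper: stop paginating on the first out-of-range article
// (listings are assumed date-sorted), skip articles that are merely invalid.
const shouldContinue = async (fetchOne: () => Promise<unknown>): Promise<boolean> => {
  try {
    await fetchOne();
    return true;
  } catch (error) {
    if (error instanceof ArticleOutOfDateRangeError) return false; // stop the crawl
    if (error instanceof InvalidArticleError) return true; // skip this article only
    throw error; // unknown failure: surface it
  }
};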
@@ -1,4 +1,5 @@
 import { setTimeout as delay } from "node:timers/promises";
+
 import { FetchClientConfig } from "@/config";
 import {
   DEFAULT_RETRY_AFTER_HEADER,
@@ -1,4 +1,5 @@
 import { parse } from "node-html-parser";
+
 import { config } from "@/config";
 import { OPEN_GRAPH_USER_AGENT } from "@/constants";
 import { SyncHttpClient } from "@/http/http-client";
@@ -1,17 +1,17 @@
 import { logger } from "@basango/logger";
 
 import { config, env } from "@/config";
+import { UnsupportedSourceKindError } from "@/errors";
 import { SyncHttpClient } from "@/http/http-client";
-import { createQueueManager, QueueManager } from "@/process/async/queue";
+import { QueueManager, createQueueManager } from "@/process/async/queue";
 import {
   DetailsTaskPayload,
   ListingTaskPayload,
   ProcessingTaskPayload,
 } from "@/process/async/schemas";
-import { resolveCrawlerConfig } from "@/process/crawler";
+import { createPersistors, resolveCrawlerConfig } from "@/process/crawler";
 import { HtmlCrawler } from "@/process/parsers/html";
 import { WordPressCrawler } from "@/process/parsers/wordpress";
 import { JsonlPersistor } from "@/process/persistence";
 import { Article, HtmlSourceConfig, SourceKindSchema, WordPressSourceConfig } from "@/schema";
 import { createDateRange, formatDateRange, formatPageRange, resolveSourceConfig } from "@/utils";
@@ -30,7 +30,7 @@ export const collectHtmlListing = async (
 
   let queued = 0;
   for (let page = pageRange.start; page <= pageRange.end; page += 1) {
-    const target = crawler.buildPageUrl(page) ?? `${source.sourceUrl}`;
+    const target = crawler.buildEndpointUrl(page) ?? `${source.sourceUrl}`;
 
     try {
       const items = await crawler.fetchLinks(target, source.sourceSelectors.articles);
@@ -69,7 +69,7 @@ export const collectWordPressListing = async (
 
   let queued = 0;
   for (let page = pageRange.start; page <= pageRange.end; page += 1) {
-    const url = crawler.postsEndpoint(page);
+    const url = crawler.buildEndpointUrl(page);
 
     try {
       const entries = await crawler.fetchLinks(url);
@@ -94,7 +94,10 @@ export const collectWordPressListing = async (
   return queued;
 };
 
-export const collectArticle = async (payload: DetailsTaskPayload): Promise<unknown> => {
+export const collectArticle = async (
+  payload: DetailsTaskPayload,
+  manager: QueueManager = createQueueManager(),
+): Promise<unknown> => {
   const source = resolveSourceConfig(payload.sourceId);
   const settings = resolveCrawlerConfig(source, {
     category: payload.category,

@@ -102,26 +105,32 @@ export const collectArticle = async (payload: DetailsTaskPayload): Promise<unknown> => {
     pageRange: payload.pageRange ? formatPageRange(payload.pageRange) : undefined,
     sourceId: payload.sourceId,
   });
-  const persistors = [
-    new JsonlPersistor({
-      directory: config.paths.data,
-      sourceId: String(source.sourceId),
-    }),
-  ];
+  const persistors = createPersistors(source);
 
   if (source.sourceKind === SourceKindSchema.enum.html) {
     if (!payload.url) throw new Error("Missing article url");
     const crawler = new HtmlCrawler(settings, { persistors });
     const html = await crawler.crawl(payload.url);
-    return await crawler.fetchOne(html, settings.dateRange);
+
+    const article = await crawler.fetchOne(html, settings.dateRange);
+    await manager.enqueueProcessed({
+      article,
+      sourceId: payload.sourceId,
+    } as ProcessingTaskPayload);
+    return article;
   }
 
   if (source.sourceKind === SourceKindSchema.enum.wordpress) {
     const crawler = new WordPressCrawler(settings, { persistors });
-    return await crawler.fetchOne(payload.data ?? {}, settings.dateRange);
+
+    const article = await crawler.fetchOne(payload.data ?? {}, settings.dateRange);
+    await manager.enqueueProcessed({
+      article,
+      sourceId: payload.sourceId,
+    } as ProcessingTaskPayload);
+    return article;
   }
 
-  throw new Error(`Unsupported source kind`);
+  throw new UnsupportedSourceKindError(`Unsupported source kind: ${source.sourceKind}`);
 };
 
 export const forwardForProcessing = async (payload: ProcessingTaskPayload): Promise<Article> => {
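Threading the queue manager through as a defaulted parameter keeps production call sites unchanged while letting tests observe what gets forwarded. A sketch of a test double under that assumption (payload values are placeholders):

import { collectArticle } from "@/process/async/handlers";
import type { QueueManager } from "@/process/async/queue";
import type { DetailsTaskPayload } from "@/process/async/schemas";

// Hypothetical stub: capture enqueued payloads instead of talking to Redis.
const enqueued: unknown[] = [];
const fakeManager = {
  enqueueProcessed: async (payload: unknown) => {
    enqueued.push(payload);
  },
} as unknown as QueueManager;

await collectArticle(
  { sourceId: "example-source", url: "https://example.com/article" } as DetailsTaskPayload,
  fakeManager,
);
console.log(enqueued.length); // 1 if the article was crawled and forwarded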
@@ -1,7 +1,9 @@
 import { randomUUID } from "node:crypto";
+
 import { JobsOptions, Queue, QueueOptions } from "bullmq";
 import IORedis from "ioredis";
-import { config, FetchAsyncConfig } from "@/config";
+
+import { FetchAsyncConfig, config } from "@/config";
 import {
   DetailsTaskPayload,
   DetailsTaskPayloadSchema,

@@ -97,9 +99,9 @@ export const createQueueManager = (options: CreateQueueManagerOptions = {}): QueueManager => {
     return queue.add("forward_for_processing", data);
   },
   iterQueueNames: () => [
-    `${settings.prefix}:${settings.queues.listing}`,
-    `${settings.prefix}:${settings.queues.details}`,
-    `${settings.prefix}:${settings.queues.processing}`,
+    settings.queues.listing,
+    settings.queues.details,
+    settings.queues.processing,
   ],
   queueName: (suffix: string) => `${settings.prefix}:${suffix}`,
   settings,
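The iterQueueNames change appears to be the substance of the async fix: BullMQ namespaces Redis keys through the prefix option on Queue, Worker, and QueueEvents, so baking the prefix into the queue name presumably produced doubly-prefixed keys and workers listening on queues no job was ever added to. A sketch of the convention, assuming a prefix of "basango":

import { Queue } from "bullmq";

// The queue is named plainly; the prefix is passed as an option, giving
// Redis keys of the form "basango:listing:*".
const listing = new Queue("listing", {
  connection: { host: "127.0.0.1", port: 6379 },
  prefix: "basango",
});

// The old code named the queue "basango:listing" instead, which BullMQ then
// prefixed again (e.g. "bull:basango:listing:*"), so producer and consumer
// key names never matched.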
@@ -1,4 +1,5 @@
 import { z } from "zod";
+
 import { ArticleSchema, DateRangeSchema, PageRangeSchema } from "@/schema";
 
 export const ListingTaskPayloadSchema = z.object({
@@ -1,4 +1,5 @@
 import { logger } from "@basango/logger";
+
 import * as handlers from "@/process/async/handlers";
 import { createQueueManager } from "@/process/async/queue";
 import {
@@ -45,6 +45,7 @@ export const startWorker = (options: WorkerOptions): WorkerHandle => {
     {
       concurrency: options.concurrency ?? 5,
       connection,
+      prefix: manager.settings.prefix,
     },
   );
 
@@ -53,7 +54,10 @@ export const startWorker = (options: WorkerOptions): WorkerHandle => {
     worker.on("error", (err) => options.onError?.(err as Error));
   }
 
-  const queueEvents = new QueueEvents(queueName, { connection });
+  const queueEvents = new QueueEvents(queueName, {
+    connection,
+    prefix: manager.settings.prefix,
+  });
 
   workers.push(worker);
   events.push(queueEvents);
@@ -1,5 +1,6 @@
 import logger from "@basango/logger";
-import { config, FetchCrawlerConfig } from "@/config";
+
+import { FetchCrawlerConfig, config } from "@/config";
 import { JsonlPersistor, Persistor } from "@/process/persistence";
 import { AnySourceConfig } from "@/schema";
 import { createDateRange, createPageRange } from "@/utils";
@@ -1,5 +1,6 @@
 import { HTMLElement, parse as parseHtml } from "node-html-parser";
-import { config, FetchCrawlerConfig } from "@/config";
+
+import { FetchCrawlerConfig, config } from "@/config";
 import { SyncHttpClient } from "@/http/http-client";
 import { OpenGraph } from "@/http/open-graph";
 import type { Persistor } from "@/process/persistence";
@@ -2,10 +2,17 @@ import { logger } from "@basango/logger";
 import { getUnixTime, isMatch as isDateMatch, parse as parseDateFns } from "date-fns";
 import { HTMLElement } from "node-html-parser";
 import TurndownService from "turndown";
+
 import { FetchCrawlerConfig } from "@/config";
+import {
+  ArticleOutOfDateRangeError,
+  InvalidArticleError,
+  InvalidSourceSelectorsError,
+  UnsupportedSourceKindError,
+} from "@/errors";
 import { BaseCrawler } from "@/process/parsers/base";
 import { Persistor, persist } from "@/process/persistence";
-import { DateRange, HtmlSourceConfig } from "@/schema";
+import { Article, DateRange, HtmlSourceConfig } from "@/schema";
 import { createAbsoluteUrl, isTimestampInRange } from "@/utils";
 
 const md = new TurndownService({

@@ -32,13 +39,13 @@ const safeRegExp = (pattern?: string | null): RegExp | null => {
  */
 export class HtmlCrawler extends BaseCrawler {
   readonly source: HtmlSourceConfig;
-  private currentArticleUrl: string | null = null;
+  private currentNode: string | null = null;
 
   constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
     super(settings, options);
 
     if (!settings.source || settings.source.sourceKind !== "html") {
-      throw new Error("HtmlCrawler requires a source of kind 'html'");
+      throw new UnsupportedSourceKindError("HtmlCrawler requires a source of kind 'html'");
     }
     this.source = this.settings.source as HtmlSourceConfig;
   }
@@ -46,69 +53,64 @@ export class HtmlCrawler extends BaseCrawler {
   async fetch(): Promise<void> {
     const pageRange = this.settings.pageRange ?? (await this.getPagination());
     const dateRange = this.settings.dateRange;
+    const selectors = this.source.sourceSelectors;
 
-    const articleSelector = this.source.sourceSelectors.articles;
-    if (!articleSelector) {
-      logger.error(
-        { source: this.source.sourceId },
-        "No article selector configured for HTML source",
-      );
-      return;
+    if (!selectors.articles) {
+      throw new InvalidSourceSelectorsError("No article selector configured for HTML source");
     }
 
-    let stop = false;
     for (let page = pageRange.start; page <= pageRange.end; page += 1) {
-      const pageUrl = this.buildPageUrl(page);
+      const endpoint = this.buildEndpointUrl(page);
       let html: string;
 
       try {
-        html = await this.crawl(pageUrl);
+        html = await this.crawl(endpoint);
       } catch (error) {
-        logger.error({ error, page, pageUrl }, "> page %s => [failed]", page);
+        logger.error({ endpoint, error, page }, `Failed to crawl page ${page}`);
         continue;
       }
 
       const root = this.parseHtml(html);
-      const articles = this.extractAll(root, articleSelector);
+      const articles = this.extractAll(root, selectors.articles);
       if (!articles.length) {
-        logger.info({ page }, "No articles found on page");
+        logger.error({ page }, "No articles found on page");
        continue;
       }
 
       for (const node of articles) {
         try {
-          this.currentArticleUrl = this.extractLink(node);
-          let targetHtml = node.toString();
+          this.currentNode = this.extractLink(node);
+          let nodeHtml = node.toString();
 
           if (this.source.requiresDetails) {
-            if (!this.currentArticleUrl) {
-              logger.debug({ page }, "Skipping article without link for details");
+            if (!this.currentNode) {
+              logger.error({ page }, "Skipping article without link for details");
               continue;
             }
 
             try {
-              targetHtml = await this.crawl(this.currentArticleUrl);
-            } catch (err) {
-              logger.error(
-                { error: err, url: this.currentArticleUrl },
-                "Failed to fetch detail page",
-              );
+              nodeHtml = await this.crawl(this.currentNode);
+            } catch (error) {
+              logger.error({ error, url: this.currentNode }, "Failed to fetch detail page");
               continue;
             }
           }
 
-          const saved = await this.fetchOne(targetHtml, dateRange);
-          // stop early on first out-of-range if pages are sorted by date desc
-          if (saved === null) {
-            stop = true;
-            break;
-          }
-        } catch (error) {
-          logger.error({ error, pageUrl }, "Failed to process article on page");
+          await this.fetchOne(nodeHtml, dateRange);
+        } catch (error: unknown) {
+          if (error instanceof ArticleOutOfDateRangeError) {
+            logger.info(
+              { url: this.currentNode },
+              "Article out of date range, stopping further processing",
+            );
+            break;
+          }
+
+          logger.error({ error, url: this.currentNode }, "Failed to process HTML article");
         } finally {
-          this.currentArticleUrl = null;
+          this.currentNode = null;
         }
       }
-
-      if (stop) break;
     }
   }
@@ -117,43 +119,43 @@ export class HtmlCrawler extends BaseCrawler {
    * @param html - The HTML content of the article
    * @param dateRange - Optional date range for filtering
    */
-  async fetchOne(html: string, dateRange?: DateRange | null) {
+  async fetchOne(html: string, dateRange?: DateRange | null): Promise<Article> {
     const root = this.parseHtml(html);
-    const sel = this.source.sourceSelectors;
+    const selectors = this.source.sourceSelectors;
 
-    const titleText = this.extractText(root, sel.articleTitle) ?? "Untitled";
-    const link = this.currentArticleUrl ?? this.extractLink(root);
+    const title = this.extractText(root, selectors.articleTitle) ?? "Untitled";
+    const link = this.currentNode ?? this.extractLink(root);
     if (!link) {
-      logger.warn({ title: titleText }, "Skipping article without link");
-      return null;
+      throw new InvalidArticleError("Missing article link");
     }
 
-    const body = this.extractBody(root, sel.articleBody);
-    const categories = this.extractCategories(root, sel.articleCategories);
-    const rawDate = this.extractText(root, sel.articleDate);
-    const timestamp = this.computeTimestamp(rawDate);
+    const body = this.extractBody(root, selectors.articleBody);
+    const categories = this.extractCategories(root, selectors.articleCategories);
+    const date = this.extractText(root, selectors.articleDate);
+    const timestamp = this.computeTimestamp(date);
 
     if (dateRange && !isTimestampInRange(dateRange, timestamp)) {
-      logger.info(
-        { date: rawDate, link, timestamp, title: titleText },
-        "Skipping article outside date range",
-      );
-      return null;
+      throw new ArticleOutOfDateRangeError("Article outside date range", {
+        date,
+        link,
+        timestamp,
+        title,
+      });
     }
 
-    const enriched = await this.enrichWithOpenGraph(
+    const data = await this.enrichWithOpenGraph(
       {
         body,
         categories,
         link,
         source: this.source.sourceId,
         timestamp,
-        title: titleText,
+        title,
       },
       link,
     );
 
-    return await persist(enriched, this.persistors);
+    return await persist(data, this.persistors);
   }
 
   /**
@@ -211,7 +213,7 @@ export class HtmlCrawler extends BaseCrawler {
    * Build the URL for a given page number.
    * @param page - The page number
    */
-  buildPageUrl(page: number): string {
+  buildEndpointUrl(page: number): string {
     let template = this.applyCategory(this.source.paginationTemplate);
     if (template.includes("{page}")) {
       template = template.replace("{page}", String(page));
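The renamed buildEndpointUrl simply substitutes the page number into the source's pagination template. A standalone illustration of the substitution (the template value here is made up; real values come from the source configuration):

// Hypothetical template for an HTML source.
let template = "https://example.com/actualites/page/{page}";
const page = 3;
if (template.includes("{page}")) {
  template = template.replace("{page}", String(page));
}
console.log(template); // https://example.com/actualites/page/3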
@@ -1,9 +1,16 @@
 import { logger } from "@basango/logger";
 import TurndownService from "turndown";
+
 import { FetchCrawlerConfig } from "@/config";
+import {
+  ArticleOutOfDateRangeError,
+  InvalidArticleError,
+  UnsupportedSourceKindError,
+} from "@/errors";
 import { BaseCrawler } from "@/process/parsers/base";
 import { Persistor, persist } from "@/process/persistence";
-import { DateRange, PageRange, WordPressSourceConfig } from "@/schema";
+import { Article, DateRange, PageRange, WordPressSourceConfig } from "@/schema";
 import { isTimestampInRange } from "@/utils";
 
 const md = new TurndownService({
   bulletListMarker: "-",
@@ -38,7 +45,9 @@ export class WordPressCrawler extends BaseCrawler {
     super(settings, options);
 
     if (!settings.source || settings.source.sourceKind !== "wordpress") {
-      throw new Error("HtmlCrawler requires a source of kind 'wordpress'");
+      throw new UnsupportedSourceKindError(
+        "WordPressCrawler requires a source of kind 'wordpress'",
+      );
     }
     this.source = this.settings.source as WordPressSourceConfig;
   }
@@ -50,29 +59,31 @@ export class WordPressCrawler extends BaseCrawler {
     const pageRange = this.settings.pageRange ?? (await this.getPagination());
     const dateRange = this.settings.dateRange;
 
-    let stop = false;
     for (let page = pageRange.start; page <= pageRange.end; page += 1) {
-      const endpoint = this.postsEndpoint(page);
+      const endpoint = this.buildEndpointUrl(page);
 
       try {
         const response = await this.http.get(endpoint);
-        const data = (await response.json()) as unknown;
-        const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
-        if (!Array.isArray(data)) {
-          logger.warn({ page, type: typeof data }, "Unexpected WordPress payload type");
-        }
+        const articles = (await response.json()) as WordPressPost[];
 
-        for (const entry of articles) {
-          const saved = await this.fetchOne(entry, dateRange);
-          if (saved === null) {
-            stop = true;
-            break;
+        for (const node of articles) {
+          try {
+            await this.fetchOne(node, dateRange);
+          } catch (error: unknown) {
+            if (error instanceof ArticleOutOfDateRangeError) {
+              logger.info(
+                { url: node.link },
+                "Article out of date range, stopping further processing",
+              );
+              break;
+            }
+
+            logger.error({ error, url: node.link }, "Failed to process WordPress article");
           }
         }
       } catch (error) {
-        logger.error({ error, page }, "> page %s => [failed]", page);
-        continue;
+        logger.error({ error, page }, `Failed to fetch WordPress page ${page}`);
       }
-      if (stop) break;
     }
   }
@@ -95,7 +106,7 @@ export class WordPressCrawler extends BaseCrawler {
    * @param input - Decoded JSON object or raw JSON string
    * @param dateRange - Optional date range for filtering
    */
-  async fetchOne(input: unknown, dateRange?: DateRange | null) {
+  async fetchOne(input: unknown, dateRange?: DateRange | null): Promise<Article> {
     // input can be the decoded JSON object or a raw JSON string
     let data: WordPressPost | null = null;
     try {

@@ -110,35 +121,29 @@ export class WordPressCrawler extends BaseCrawler {
     }
 
     if (!data || typeof data !== "object") {
-      throw new Error("Unexpected WordPress payload type");
+      throw new InvalidArticleError("Unexpected WordPress payload type");
     }
 
     const link = data.link;
     if (!link) {
-      logger.error("Skipping WordPress article without link");
-      return null;
+      throw new InvalidArticleError("Missing article link");
     }
 
-    const titleHtml = data.title?.rendered ?? "";
-    const bodyHtml = data.content?.rendered ?? "";
-    const title = this.textContent(this.parseHtml(titleHtml)) ?? data.slug ?? "Untitled";
-    const body = md.turndown(bodyHtml);
+    const title =
+      this.textContent(this.parseHtml(data.title?.rendered ?? "")) ?? data.slug ?? "Untitled";
+    const body = md.turndown(data.content?.rendered ?? "");
     const timestamp = this.computeTimestamp(data.date);
     const categories = await this.mapCategories(data.categories ?? []);
 
-    // date range skip as in HTML crawler
-    if (dateRange) {
-      const { isTimestampInRange } = await import("@/utils");
-      if (!isTimestampInRange(dateRange, timestamp)) {
-        logger.info(
-          { date: data.date, link, timestamp, title },
-          "Skipping article outside date range",
-        );
-        return null;
-      }
-    }
+    if (dateRange && !isTimestampInRange(dateRange, timestamp)) {
+      throw new ArticleOutOfDateRangeError("Article outside date range", {
+        link,
+        timestamp,
+        title,
+      });
+    }
 
-    const enriched = await this.enrichWithOpenGraph(
+    const article = await this.enrichWithOpenGraph(
       {
         body,
         categories,

@@ -150,7 +155,7 @@ export class WordPressCrawler extends BaseCrawler {
       link,
     );
 
-    return await persist(enriched, this.persistors);
+    return await persist(article, this.persistors);
   }
 
   /**
@@ -188,7 +193,7 @@ export class WordPressCrawler extends BaseCrawler {
    * Construct posts endpoint URL for a given page.
    * @param page - Page number
    */
-  postsEndpoint(page: number): string {
+  buildEndpointUrl(page: number): string {
     return `${this.baseUrl()}wp-json/wp/v2/posts?${WordPressCrawler.POST_QUERY}&page=${page}&per_page=100`;
   }
 
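For the WordPress variant the endpoint is the standard REST posts route. An illustrative expansion follows; WordPressCrawler.POST_QUERY is not shown in this diff, so the field list below is an assumption:

const baseUrl = "https://example.com/"; // assumed return value of this.baseUrl()
const POST_QUERY = "_fields=id,date,link,slug,title,content,categories"; // assumption
const page = 2;
const url = `${baseUrl}wp-json/wp/v2/posts?${POST_QUERY}&page=${page}&per_page=100`;
// => https://example.com/wp-json/wp/v2/posts?_fields=...&page=2&per_page=100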
@@ -1,6 +1,8 @@
 import fs from "node:fs";
 import path from "node:path";
+
 import logger from "@basango/logger";
+
 import { Article } from "@/schema";
 import { countTokens } from "@/utils";
 
@@ -1,4 +1,5 @@
 import logger from "@basango/logger";
+
 import {
   CrawlingOptions,
   closePersistors,
@@ -1,4 +1,5 @@
 import { logger } from "@basango/logger";
+
 import { runSyncCrawl } from "@/process/sync/tasks";
 import { CRAWLING_USAGE, parseCrawlingCliArgs } from "@/scripts/utils";
 
@@ -1,4 +1,5 @@
 import { logger } from "@basango/logger";
+
 import { scheduleAsyncCrawl } from "@/process/async/tasks";
 import { CRAWLING_USAGE, parseCrawlingCliArgs } from "@/scripts/utils";
 
@@ -1,4 +1,5 @@
 import { parseArgs } from "node:util";
+
 import { CrawlingOptions } from "@/process/crawler";
 
 interface WorkerCliOptions {
@@ -6,13 +7,12 @@ interface WorkerCliOptions {
 }
 
 export const CRAWLING_USAGE = `
-Usage: bun run crawl:[async|sync] -- --sourceId <id> [options]
-
+Usage: bun run crawler:[async|sync] -- --sourceId <id> [options]
 Options:
   --pageRange <range>   Optional page range filter (e.g. 1:5)
   --dateRange <range>   Optional date range filter (e.g. 2024-01-01:2024-01-31)
   --category <slug>     Optional category to crawl
   -h, --help            Show this message
 `;
 
 export const parseWorkerCliArgs = (): WorkerCliOptions => {
@@ -8,9 +8,7 @@ const main = async (): Promise<void> => {
   const options = parseWorkerCliArgs();
 
   const manager = createQueueManager();
-  const queues = options.queue?.length
-    ? options.queue.map((name) => manager.queueName(name))
-    : undefined;
+  const queues = options.queue?.length ? options.queue : undefined;
 
   const handle = startWorker({
     queueManager: manager,
@@ -1,6 +1,7 @@
 import { format, getUnixTime, isMatch, parse } from "date-fns";
 import type { RedisOptions } from "ioredis";
-import { get_encoding, TiktokenEncoding } from "tiktoken";
+import { TiktokenEncoding, get_encoding } from "tiktoken";
+
 import { config } from "@/config";
 import { DEFAULT_DATE_FORMAT } from "@/constants";
 import {
(binary image modified: 25 KiB before and after)
@@ -1,8 +1,6 @@
 {
   "commitlint": {
-    "extends": [
-      "@commitlint/config-conventional"
-    ]
+    "extends": ["@commitlint/config-conventional"]
   },
   "config": {
     "commitizen": {

@@ -85,14 +83,8 @@
     "preset": "jest-expo"
   },
   "lint-staged": {
-    "*.ts": [
-      "prettier --write",
-      "eslint --fix"
-    ],
-    "*.tsx": [
-      "prettier --write",
-      "eslint --fix"
-    ]
+    "*.ts": ["prettier --write", "eslint --fix"],
+    "*.tsx": ["prettier --write", "eslint --fix"]
   },
   "main": "expo-router/entry",
   "name": "drc-news",
@@ -1,8 +1,8 @@
 import { endpoint } from "@/api/endpoint";
 import {
   Bookmark,
-  BookmarkedArticle,
   BookmarkPayload,
+  BookmarkedArticle,
 } from "@/api/schema/feed-management/bookmark";
 import {
   ArticleFilters,
@@ -1,4 +1,4 @@
-import { formatDistanceToNowStrict, Locale } from "date-fns";
+import { Locale, formatDistanceToNowStrict } from "date-fns";
 import { fr } from "date-fns/locale";
 import { useEffect, useState } from "react";
 
@@ -1,5 +1,4 @@
 import type React from "react";
-
 import { GestureHandlerRootView } from "react-native-gesture-handler";
 import { SafeAreaProvider } from "react-native-safe-area-context";
 

@@ -1,5 +1,4 @@
 import type React from "react";
-
 import { TamaguiProvider } from "tamagui";
 
 import { config } from "~/tamagui.config";

@@ -1,5 +1,4 @@
 import React, { useCallback } from "react";
-
 import { ActivityIndicator, Dimensions, FlatList, FlatListProps } from "react-native";
 import { View, XStack, YStack } from "tamagui";
 

@@ -1,5 +1,4 @@
 import { useCallback } from "react";
-
 import ContentLoader, { Circle, Rect } from "react-content-loader/native";
 import { Dimensions, FlatList } from "react-native";
 import { View } from "tamagui";

@@ -1,5 +1,4 @@
 import React, { useCallback } from "react";
-
 import { ActivityIndicator, FlatList, FlatListProps } from "react-native";
 import { YStack } from "tamagui";
 

@@ -1,5 +1,4 @@
 import { useCallback, useState } from "react";
-
 import { ActivityIndicator, Alert } from "react-native";
 import { Button, GetProps } from "tamagui";
 

@@ -1,5 +1,4 @@
 import React, { useCallback } from "react";
-
 import { FlatList, FlatListProps } from "react-native";
 import { Paragraph, XStack, YStack } from "tamagui";
 
@@ -1,5 +1,5 @@
 import { Link } from "expo-router";
-import { GetProps, styled, XStack, YStack } from "tamagui";
+import { GetProps, XStack, YStack, styled } from "tamagui";
 
 import { SourceOverview } from "@/api/schema/feed-management/source";
 import { SourceFollowButton } from "@/ui/components/content/source/SourceFollowButton";

@@ -1,5 +1,4 @@
 import { useCallback } from "react";
-
 import ContentLoader, { Circle, Rect } from "react-content-loader/native";
 import { FlatList } from "react-native";
 import { YStack } from "tamagui";

@@ -5,10 +5,10 @@ import {
   GetProps,
   Label,
   SizeTokens,
-  styled,
   Input as TamaguiInput,
   XStack,
   YStack,
+  styled,
 } from "tamagui";
 
 import { Caption } from "@/ui/components/typography";

@@ -1,4 +1,4 @@
-import { GetProps, Label, styled, TextArea as TamaguiTextArea, XStack, YStack } from "tamagui";
+import { GetProps, Label, TextArea as TamaguiTextArea, XStack, YStack, styled } from "tamagui";
 
 import { withController } from "@/ui/components/controls/forms/withController";
 import { Caption } from "@/ui/components/typography";

@@ -1,5 +1,4 @@
 import React from "react";
-
 import { Controller, ControllerProps } from "react-hook-form";
 
 type WithControllerProps = {

@@ -1,6 +1,5 @@
 import React from "react";
-
-import { styled, View, XStack } from "tamagui";
+import { View, XStack, styled } from "tamagui";
 
 import { Text } from "@/ui/components/typography";
 

@@ -1,6 +1,6 @@
 import { ArrowRight } from "@tamagui/lucide-icons";
 import { Href, Link } from "expo-router";
-import { GetProps, Paragraph, styled, XStack } from "tamagui";
+import { GetProps, Paragraph, XStack, styled } from "tamagui";
 
 import { Text } from "@/ui/components/typography";
 

@@ -1,7 +1,7 @@
 import { StatusBar } from "expo-status-bar";
 import React from "react";
 import { useSafeAreaInsets } from "react-native-safe-area-context";
-import { styled, YStack } from "tamagui";
+import { YStack, styled } from "tamagui";
 
 import { ScreenHeading } from "@/ui/components/layout/ScreenHeading";
 import { ScreenSection } from "@/ui/components/layout/ScreenSection";

@@ -1,5 +1,4 @@
 import type React from "react";
-
 import { Paragraph, ParagraphProps } from "tamagui";
 
 export const Caption = (props: React.PropsWithChildren<ParagraphProps>) => {

@@ -1,5 +1,4 @@
 import type React from "react";
-
 import { H2, ParagraphProps } from "tamagui";
 
 export const Display = (props: React.PropsWithChildren<ParagraphProps>) => {

@@ -1,5 +1,4 @@
 import type React from "react";
-
 import { H4, ParagraphProps } from "tamagui";
 
 export const Heading = (props: React.PropsWithChildren<ParagraphProps>) => {

@@ -1,5 +1,4 @@
 import type React from "react";
-
 import { Paragraph, ParagraphProps } from "tamagui";
 
 export const Text = (props: React.PropsWithChildren<ParagraphProps>) => {
@@ -39,7 +39,7 @@
   },
   "files": {
     "ignoreUnknown": true,
-    "includes": ["**/apps", "**/packages", "!/apps/api-legacy", "!/apps/mobile-legacy"]
+    "includes": ["**", "!apps/mobile-legacy", "!apps/api-legacy"]
   },
   "formatter": {
     "enabled": true,
@@ -1,5 +1,6 @@
 import { drizzle } from "drizzle-orm/node-postgres";
 import { Pool } from "pg";
+
 import * as schema from "@/schema";
 
 const isDevelopment = process.env.NODE_ENV === "development";

@@ -21,7 +22,7 @@ const pool = new Pool({
 export const getConnectionPoolStats = () => {
   const stats = {
     active: Math.max(0, (pool.totalCount ?? 0) - (pool.idleCount ?? 0)),
-    ended: (pool as any).ended ?? false,
+    ended: pool.ended ?? false,
     idle: pool.idleCount ?? 0,
     name: "primary",
     total: pool.options.max ?? 0,
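Dropping the (pool as any) cast works presumably because the project's pg typings now expose ended on Pool directly. A sketch of how the stats might feed the health check touched later in this commit (the payload shape is illustrative, not the API's actual response):

import { getConnectionPoolStats } from "@/client";

// Hypothetical health payload built from the pool snapshot.
export const healthSnapshot = () => {
  const stats = getConnectionPoolStats();
  return {
    db: stats.ended ? "down" : "up",
    connections: { active: stats.active, idle: stats.idle, total: stats.total },
  };
};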
@@ -4,13 +4,13 @@ import { and, asc, desc, eq, gt, lt, or, sql } from "drizzle-orm";
 import type { Database } from "@/client";
 import { articles, bookmarkArticles, bookmarks, comments, sources, users } from "@/schema";
 import {
-  buildPaginationResult,
-  createPageState,
-  decodeCursor,
   type PageRequest,
   type PageState,
   type PaginationMeta,
   type SortDirection,
+  buildPaginationResult,
+  createPageState,
+  decodeCursor,
 } from "@/utils/pagination";
 
 export interface ArticleFilters {
@@ -4,11 +4,11 @@ import { and, desc, eq, lt, sql } from "drizzle-orm";
 import type { Database } from "@/client";
 import { bookmarkArticles, bookmarks } from "@/schema";
 import {
+  type PageRequest,
+  type PaginationMeta,
   buildPaginationResult,
   createPageState,
   decodeCursor,
-  type PageRequest,
-  type PaginationMeta,
 } from "@/utils/pagination";
 
 export interface BookmarkRow {
@@ -5,11 +5,11 @@ import type { Database } from "@/client";
 import { PUBLICATION_GRAPH_DAYS, SOURCE_IMAGE_BASE } from "@/constant";
 import { articles, followedSources, sources } from "@/schema";
 import {
+  type PageRequest,
+  type PaginationMeta,
   buildPaginationResult,
   createPageState,
   decodeCursor,
-  type PageRequest,
-  type PaginationMeta,
 } from "@/utils/pagination";
 
 export interface SourceOverviewRow {
@@ -1,4 +1,5 @@
 import { sql } from "drizzle-orm";
+
 import { db } from "@/client";
 
 export async function checkHealth() {
@@ -7,14 +7,15 @@
     "incremental": false,
     "isolatedModules": true,
     "lib": ["es2022", "DOM", "DOM.Iterable"],
-    "module": "NodeNext",
+    "module": "ESNext",
     "moduleDetection": "force",
-    "moduleResolution": "NodeNext",
+    "moduleResolution": "Bundler",
     "noUncheckedIndexedAccess": true,
     "resolveJsonModule": true,
     "skipLibCheck": true,
     "strict": true,
-    "target": "ES2022"
+    "target": "ES2022",
+    "verbatimModuleSyntax": false
   },
   "display": "Default"
 }
@@ -1,6 +1,6 @@
 import { cn } from "@basango/ui/lib/utils";
 import { Slot } from "@radix-ui/react-slot";
-import { cva, type VariantProps } from "class-variance-authority";
+import { type VariantProps, cva } from "class-variance-authority";
 import * as React from "react";
 
 const buttonVariants = cva(