feat(monorepo): migrate to typescript monorepo
This commit is contained in:
@@ -0,0 +1,81 @@
|
||||
import path from "node:path";
|
||||
|
||||
import { loadConfig as defineConfig } from "@devscast/config";
|
||||
import { z } from "zod";
|
||||
import {
|
||||
DateRangeSchema,
|
||||
HtmlSourceConfigSchema,
|
||||
PageRangeSchema,
|
||||
UpdateDirectionSchema,
|
||||
WordPressSourceConfigSchema,
|
||||
} from "@/schema";
|
||||
|
||||
// Repository root, resolved relative to this file's emitted directory.
// NOTE(review): __dirname assumes CommonJS output — confirm the build target.
export const PROJECT_DIR = path.resolve(__dirname, "../");
|
||||
|
||||
/**
 * Root configuration schema for the crawling pipeline.
 * Every leaf carries a default, so an empty config file still validates.
 */
export const PipelineConfigSchema = z.object({
  fetch: z.object({
    // Async (queue-based) crawling over Redis.
    async: z.object({
      prefix: z.string().default("basango:crawler:queue"),
      queues: z.object({
        details: z.string().default("details"),
        listing: z.string().default("listing"),
        processing: z.string().default("processing"),
      }),
      redisUrl: z.string().default("redis://localhost:6379/0"),
      // TTLs in seconds; elsewhere a value of 0 is treated as "do not keep".
      ttl: z.object({
        default: z.number().int().positive().default(600),
        failure: z.number().int().nonnegative().default(3600),
        result: z.number().int().nonnegative().default(3600),
      }),
    }),
    // HTTP client behavior; timeout and backoff values are in seconds
    // (multiplied by 1000 where they are consumed).
    client: z.object({
      backoffInitial: z.number().nonnegative().default(1),
      backoffMax: z.number().nonnegative().default(30),
      backoffMultiplier: z.number().positive().default(2),
      followRedirects: z.boolean().default(true),
      maxRetries: z.number().int().nonnegative().default(3),
      respectRetryAfter: z.boolean().default(true),
      // When true, requests rotate through a pool of browser user agents.
      rotate: z.boolean().default(true),
      timeout: z.number().positive().default(20),
      userAgent: z.string().default("Basango/0.1 (+https://github.com/bernard-ng/basango)"),
      verifySsl: z.boolean().default(true),
    }),
    // Crawl-run parameters: ranges, concurrency and the active source.
    crawler: z.object({
      category: z.string().optional(),
      dateRange: DateRangeSchema.optional(),
      direction: UpdateDirectionSchema.default("forward"),
      isUpdate: z.boolean().default(false),
      maxWorkers: z.number().int().positive().default(5),
      notify: z.boolean().default(false),
      pageRange: PageRangeSchema.optional(),
      source: z.union([HtmlSourceConfigSchema, WordPressSourceConfigSchema]).optional(),
      useMultiThreading: z.boolean().default(false),
    }),
  }),
  // Filesystem layout, anchored at the repository root.
  paths: z.object({
    config: z.string().default(path.join(PROJECT_DIR, "config")),
    data: z.string().default(path.join(PROJECT_DIR, "data", "datasets")),
    root: z.string().default(PROJECT_DIR),
  }),
  // Declared crawl sources, grouped by kind.
  sources: z.object({
    html: z.array(HtmlSourceConfigSchema).default([]),
    wordpress: z.array(WordPressSourceConfigSchema).default([]),
  }),
});
|
||||
|
||||
/**
 * Loads and validates the pipeline configuration at module load time.
 * Values come from .env plus the two JSON config files, checked against
 * PipelineConfigSchema. `env` reads environment variables at call time.
 * NOTE(review): loadConfig is imported under the alias defineConfig —
 * consider keeping the original name for clarity.
 */
export const { config, env } = defineConfig({
  cwd: process.cwd(),
  env: {
    path: path.join(PROJECT_DIR, ".env"),
  },
  schema: PipelineConfigSchema,
  sources: [
    path.join(PROJECT_DIR, "config", "pipeline.json"),
    path.join(PROJECT_DIR, "config", "sources.json"),
  ],
});
|
||||
|
||||
/** Fully-validated pipeline configuration. */
export type PipelineConfig = z.infer<typeof PipelineConfigSchema>;
/** HTTP client settings (timeouts, retries, backoff, user agent). */
export type FetchClientConfig = PipelineConfig["fetch"]["client"];
/** Crawl-run settings (ranges, active source, concurrency). */
export type FetchCrawlerConfig = PipelineConfig["fetch"]["crawler"];
/** Async queue settings (Redis URL, queue names, TTLs). */
export type FetchAsyncConfig = PipelineConfig["fetch"]["async"];
|
||||
@@ -0,0 +1,6 @@
|
||||
// Date format tokens used across the pipeline (presumably Luxon-style,
// where "LL" is the zero-padded month — confirm against the date library used).
export const DEFAULT_DATE_FORMAT = "yyyy-LL-dd";
// NOTE(review): this string is duplicated as the zod default for
// fetch.client.userAgent in the config schema — keep the two in sync.
export const DEFAULT_USER_AGENT = "Basango/0.1 (+https://github.com/bernard-ng/basango)";
// Facebook's crawler user agent, used when fetching Open Graph metadata.
export const OPEN_GRAPH_USER_AGENT = "facebookexternalhit/1.1";

// HTTP statuses treated as retryable (rate limiting and transient 5xx).
export const TRANSIENT_HTTP_STATUSES = [429, 500, 502, 503, 504];
// Header consulted for a server-advised retry delay (lowercase; fetch
// Headers.get is case-insensitive anyway).
export const DEFAULT_RETRY_AFTER_HEADER = "retry-after";
|
||||
@@ -0,0 +1,241 @@
|
||||
import { setTimeout as delay } from "node:timers/promises";
|
||||
import { FetchClientConfig } from "@/config";
|
||||
import {
|
||||
DEFAULT_RETRY_AFTER_HEADER,
|
||||
DEFAULT_USER_AGENT,
|
||||
TRANSIENT_HTTP_STATUSES,
|
||||
} from "@/constants";
|
||||
import { UserAgents } from "@/http/user-agent";
|
||||
|
||||
/** Plain header map used for both client defaults and per-request headers. */
export type HttpHeaders = Record<string, string>;
/** Query parameters; null/undefined entries are dropped when building URLs. */
export type HttpParams = Record<string, string | number | boolean | null | undefined>;
/** Opaque request body payload. */
export type HttpData = unknown;

/** Constructor options for BaseHttpClient; all are injectable for testing. */
export interface HttpClientOptions {
  // Supplies the User-Agent header value.
  userAgentProvider?: UserAgents;
  // Extra default headers; these override the computed User-Agent.
  defaultHeaders?: HttpHeaders;
  // Alternative fetch implementation (defaults to the global fetch).
  fetchImpl?: typeof fetch;
  // Alternative sleep used between retries (defaults to a real timer).
  sleep?: (ms: number) => Promise<void>;
}

/** Per-request options. */
export interface HttpRequestOptions {
  headers?: HttpHeaders;
  params?: HttpParams;
  // Raw request body; ignored when `json` is also provided.
  data?: HttpData;
  // JSON body; serialized and sent with Content-Type: application/json.
  json?: HttpData;
  // Override for the Retry-After header name (defaults to "retry-after").
  retryAfterHeader?: string;
}
|
||||
|
||||
export class HttpError extends Error {
|
||||
readonly status: number;
|
||||
readonly response: Response;
|
||||
|
||||
constructor(message: string, response: Response) {
|
||||
super(message);
|
||||
this.status = response.status;
|
||||
this.response = response;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Default sleep function using setTimeout.
|
||||
* @param ms - Milliseconds to sleep
|
||||
*/
|
||||
const defaultSleep = (ms: number): Promise<void> => {
|
||||
return delay(ms).then(() => undefined);
|
||||
};
|
||||
|
||||
/**
|
||||
* Builds a URL with query parameters.
|
||||
* @param url - The base URL
|
||||
* @param params - The query parameters to append
|
||||
*/
|
||||
const buildUrl = (url: string, params?: HttpParams): string => {
|
||||
if (!params || Object.keys(params).length === 0) {
|
||||
return url;
|
||||
}
|
||||
|
||||
const target = new URL(url);
|
||||
for (const [key, value] of Object.entries(params)) {
|
||||
if (value === undefined || value === null) continue;
|
||||
target.searchParams.set(key, String(value));
|
||||
}
|
||||
|
||||
return target.toString();
|
||||
};
|
||||
|
||||
/**
|
||||
* Computes the backoff time in milliseconds based on the configuration and attempt number.
|
||||
* @param config - Fetch client configuration
|
||||
* @param attempt - Current attempt number
|
||||
*/
|
||||
const computeBackoff = (config: FetchClientConfig, attempt: number): number => {
|
||||
const base = Math.min(
|
||||
config.backoffInitial * config.backoffMultiplier ** attempt,
|
||||
config.backoffMax,
|
||||
);
|
||||
const jitter = Math.random() * base * 0.25;
|
||||
return (base + jitter) * 1000;
|
||||
};
|
||||
|
||||
const parseRetryAfter = (header: string): number => {
|
||||
const numeric = Number.parseInt(header, 10);
|
||||
if (!Number.isNaN(numeric)) {
|
||||
return Math.max(0, numeric * 1000);
|
||||
}
|
||||
|
||||
const parsed = Date.parse(header);
|
||||
if (Number.isNaN(parsed)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const delta = parsed - Date.now();
|
||||
return delta > 0 ? delta : 0;
|
||||
};
|
||||
|
||||
/**
 * Base HTTP client providing common functionality.
 *
 * Resolves the fetch implementation, default headers (including a User-Agent
 * chosen by the provider) and the retry-delay helper shared by subclasses.
 *
 * @author Bernard Ngandu <bernard@devscast.tech>
 */
export class BaseHttpClient {
  // Retry/backoff/timeout settings shared by all requests.
  protected readonly config: FetchClientConfig;
  // Injectable fetch for testing; defaults to the global fetch.
  protected readonly fetchImpl: typeof fetch;
  // Injectable sleep for testing; defaults to a real timer.
  protected readonly sleep: (ms: number) => Promise<void>;
  // Default headers applied to every request (overridable per call).
  protected readonly headers: HttpHeaders;

  constructor(config: FetchClientConfig, options: HttpClientOptions = {}) {
    this.config = config;
    // User-Agent resolution order: explicit provider, then config, then default.
    const provider =
      options.userAgentProvider ??
      new UserAgents(config.rotate, config.userAgent ?? DEFAULT_USER_AGENT);
    const userAgent = provider.get() ?? config.userAgent ?? DEFAULT_USER_AGENT;

    const baseHeaders: HttpHeaders = { "User-Agent": userAgent };
    if (options.defaultHeaders) {
      // Caller-supplied defaults win over the computed User-Agent.
      Object.assign(baseHeaders, options.defaultHeaders);
    }

    this.headers = baseHeaders;
    this.fetchImpl = options.fetchImpl ?? fetch;
    this.sleep = options.sleep ?? defaultSleep;
  }

  /** Merges per-request headers over the client defaults (request wins). */
  protected buildHeaders(headers?: HttpHeaders): HeadersInit {
    return { ...this.headers, ...(headers ?? {}) };
  }

  /**
   * Sleeps before the next retry attempt.
   * Honors the response's Retry-After header when respectRetryAfter is set;
   * otherwise (or when the header yields 0) falls back to jittered backoff.
   * @param attempt - Zero-based attempt number
   * @param response - The failed response, if one was received
   * @param retryAfterHeader - Header name carrying the server-advised delay
   */
  protected async maybeDelay(
    attempt: number,
    response?: Response,
    retryAfterHeader: string = DEFAULT_RETRY_AFTER_HEADER,
  ): Promise<void> {
    let waitMs = 0;

    if (response) {
      const retryAfter = response.headers.get(retryAfterHeader);
      if (retryAfter && this.config.respectRetryAfter) {
        waitMs = parseRetryAfter(retryAfter);
      }
    }

    if (waitMs === 0) {
      waitMs = computeBackoff(this.config, attempt);
    }

    if (waitMs > 0) {
      await this.sleep(waitMs);
    }
  }
}
|
||||
|
||||
/**
|
||||
* Synchronous HTTP client with retry and timeout capabilities.
|
||||
*
|
||||
* @author Bernard Ngandu <bernard@devscast.tech>
|
||||
*/
|
||||
export class SyncHttpClient extends BaseHttpClient {
|
||||
async request(method: string, url: string, options: HttpRequestOptions = {}): Promise<Response> {
|
||||
const retryAfterHeader = options.retryAfterHeader ?? DEFAULT_RETRY_AFTER_HEADER;
|
||||
const target = buildUrl(url, options.params);
|
||||
|
||||
const maxAttempts = this.config.maxRetries + 1;
|
||||
let attempt = 0;
|
||||
let lastError: unknown;
|
||||
|
||||
while (attempt < maxAttempts) {
|
||||
const controller = new AbortController();
|
||||
let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
|
||||
try {
|
||||
timeoutHandle = setTimeout(() => controller.abort(), this.config.timeout * 1000);
|
||||
|
||||
const headers = this.buildHeaders(options.headers);
|
||||
const init: RequestInit = {
|
||||
body: options.data as BodyInit | undefined,
|
||||
headers,
|
||||
method,
|
||||
redirect: this.config.followRedirects ? "follow" : "manual",
|
||||
signal: controller.signal,
|
||||
};
|
||||
|
||||
if (options.json !== undefined) {
|
||||
init.body = JSON.stringify(options.json);
|
||||
(init.headers as Record<string, string>)["Content-Type"] ??= "application/json";
|
||||
}
|
||||
|
||||
const response = await this.fetchImpl(target, init);
|
||||
|
||||
if (
|
||||
TRANSIENT_HTTP_STATUSES.includes(response.status as number) &&
|
||||
attempt < this.config.maxRetries
|
||||
) {
|
||||
await this.maybeDelay(attempt, response, retryAfterHeader);
|
||||
attempt += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!response.ok) {
|
||||
throw new HttpError(`HTTP ${response.status} ${response.statusText}`, response);
|
||||
}
|
||||
|
||||
return response;
|
||||
} catch (error) {
|
||||
if (error instanceof HttpError) {
|
||||
lastError = error;
|
||||
throw error;
|
||||
}
|
||||
|
||||
if (error instanceof DOMException && error.name === "AbortError") {
|
||||
lastError = error;
|
||||
if (attempt >= this.config.maxRetries) {
|
||||
throw error;
|
||||
}
|
||||
} else {
|
||||
lastError = error;
|
||||
if (attempt >= this.config.maxRetries) {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
await this.maybeDelay(attempt);
|
||||
attempt += 1;
|
||||
} finally {
|
||||
if (timeoutHandle) {
|
||||
clearTimeout(timeoutHandle);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
throw lastError instanceof Error ? lastError : new Error("HTTP request failed after retries");
|
||||
}
|
||||
|
||||
get(url: string, options?: Omit<HttpRequestOptions, "data" | "json">): Promise<Response> {
|
||||
return this.request("GET", url, options);
|
||||
}
|
||||
|
||||
post(url: string, options: HttpRequestOptions = {}): Promise<Response> {
|
||||
return this.request("POST", url, options);
|
||||
}
|
||||
}
|
||||
|
||||
/** Convenience alias for the concrete client implementation used by callers. */
export type HttpClient = SyncHttpClient;
|
||||
@@ -0,0 +1,102 @@
|
||||
import { parse } from "node-html-parser";
|
||||
import { config } from "@/config";
|
||||
import { OPEN_GRAPH_USER_AGENT } from "@/constants";
|
||||
import { SyncHttpClient } from "@/http/http-client";
|
||||
import { UserAgents } from "@/http/user-agent";
|
||||
import { ArticleMetadata } from "@/schema";
|
||||
|
||||
/**
|
||||
* Picks the first non-empty value from the provided array.
|
||||
* @param values - An array of string values
|
||||
*/
|
||||
const pick = (values: Array<string | null | undefined>): string | undefined => {
|
||||
for (const value of values) {
|
||||
if (value && value.trim().length > 0) {
|
||||
return value.trim();
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
};
|
||||
|
||||
/**
|
||||
* Extracts the content of a meta tag given its property or name.
|
||||
* @param root - The root HTML element
|
||||
* @param property - The property or name of the meta tag to extract
|
||||
*/
|
||||
const extract = (root: ReturnType<typeof parse>, property: string): string | null => {
|
||||
const selector = `meta[property='${property}'], meta[name='${property}']`;
|
||||
const node = root.querySelector(selector);
|
||||
if (!node) {
|
||||
return null;
|
||||
}
|
||||
return node.getAttribute("content") ?? null;
|
||||
};
|
||||
|
||||
/**
 * OpenGraph consumer for extracting Open Graph metadata from HTML pages.
 * Uses a synchronous HTTP client to fetch the HTML content.
 *
 * @author Bernard Ngandu <bernard@devscast.tech>
 */
export class OpenGraph {
  // Only `get` is required; Pick keeps the dependency narrow and stubbable.
  private readonly client: Pick<SyncHttpClient, "get">;

  constructor() {
    const settings = config.fetch.client;
    // Present Facebook's crawler UA so sites serve their Open Graph markup.
    const provider = new UserAgents(true, OPEN_GRAPH_USER_AGENT);

    this.client = new SyncHttpClient(settings, {
      defaultHeaders: { "User-Agent": provider.og() },
      userAgentProvider: provider,
    });
  }

  /**
   * Consume a URL and extract Open Graph metadata.
   * Returns undefined on any fetch or parse failure — extraction is
   * deliberately best-effort.
   * @param url - The URL to fetch and parse
   */
  async consumeUrl(url: string): Promise<ArticleMetadata | undefined> {
    try {
      const response = await this.client.get(url);
      const html = await response.text();
      return OpenGraph.consumeHtml(html, url);
    } catch {
      // Swallow errors deliberately: missing metadata is not fatal.
      return undefined;
    }
  }

  /**
   * Consume HTML content and extract Open Graph metadata.
   * Falls back from og:* tags to conventional elements (<title>, first <img>,
   * canonical <link>) and finally to the supplied url.
   * @param html - HTML content as a string
   * @param url - Optional URL of the page, used as last-resort canonical
   */
  static consumeHtml(html: string, url?: string): ArticleMetadata | undefined {
    if (!html) {
      return undefined;
    }

    const root = parse(html);
    const title = pick([extract(root, "og:title"), root.querySelector("title")?.text]);
    const description = pick([extract(root, "og:description"), extract(root, "description")]);
    const image = pick([
      extract(root, "og:image"),
      root.querySelector("img")?.getAttribute("src") ?? null,
    ]);
    const canonical = pick([
      extract(root, "og:url"),
      root.querySelector("link[rel='canonical']")?.getAttribute("href") ?? null,
      url ?? null,
    ]);

    // All-empty metadata is treated as "nothing found".
    if (!title && !description && !image && !canonical) {
      return undefined;
    }

    return {
      description,
      image,
      title,
      url: canonical,
    };
  }
}
|
||||
@@ -0,0 +1,41 @@
|
||||
import { DEFAULT_USER_AGENT, OPEN_GRAPH_USER_AGENT } from "@/constants";
|
||||
|
||||
/**
|
||||
* User agent provider with optional rotation.
|
||||
* Allows fetching a random user agent from a predefined list
|
||||
* or using a fallback user agent.
|
||||
*
|
||||
* @author Bernard Ngandu <bernard@devscast.tech>
|
||||
*/
|
||||
export class UserAgents {
|
||||
private static readonly USER_AGENTS: string[] = [
|
||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 10_4_8; like Mac OS X) AppleWebKit/603.39 (KHTML, like Gecko) Chrome/52.0.3638.271 Mobile Safari/537.5",
|
||||
"Mozilla/50.0 (Linux; U; Linux x86_64; en-US) Gecko/20130401 Firefox/52.7",
|
||||
"Mozilla/5.0 (Linux; U; Android 5.0; SM-P815 Build/LRX22G) AppleWebKit/600.4 (KHTML, like Gecko) Chrome/48.0.1562.260 Mobile Safari/600.0",
|
||||
"Mozilla/5.0 (Windows; U; Windows NT 6.3;) AppleWebKit/533.34 (KHTML, like Gecko) Chrome/51.0.1883.215 Safari/533",
|
||||
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.3; x64; en-US Trident/4.0)",
|
||||
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_10_3) Gecko/20100101 Firefox/63.4",
|
||||
"Mozilla/5.0 (Linux; Linux x86_64; en-US) AppleWebKit/603.50 (KHTML, like Gecko) Chrome/55.0.2226.116 Safari/601",
|
||||
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 7_8_3; en-US) Gecko/20100101 Firefox/68.9",
|
||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 8_9_8; like Mac OS X) AppleWebKit/603.34 (KHTML, like Gecko) Chrome/47.0.1126.107 Mobile Safari/602.7",
|
||||
"Mozilla/5.0 (iPod; CPU iPod OS 8_2_0; like Mac OS X) AppleWebKit/601.40 (KHTML, like Gecko) Chrome/47.0.1590.178 Mobile Safari/535.2",
|
||||
];
|
||||
|
||||
private readonly rotate: boolean;
|
||||
private readonly fallback: string;
|
||||
|
||||
constructor(rotate: boolean = true, fallback: string = DEFAULT_USER_AGENT) {
|
||||
this.rotate = rotate;
|
||||
this.fallback = fallback;
|
||||
}
|
||||
|
||||
og(): string {
|
||||
return OPEN_GRAPH_USER_AGENT;
|
||||
}
|
||||
|
||||
get(): string {
|
||||
if (!this.rotate) return this.fallback;
|
||||
const idx = Math.floor(Math.random() * UserAgents.USER_AGENTS.length);
|
||||
return UserAgents.USER_AGENTS[idx]!;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,137 @@
|
||||
import { logger } from "@basango/logger";
|
||||
|
||||
import { config, env } from "@/config";
|
||||
import { SyncHttpClient } from "@/http/http-client";
|
||||
import { createQueueManager, QueueManager } from "@/process/async/queue";
|
||||
import {
|
||||
DetailsTaskPayload,
|
||||
ListingTaskPayload,
|
||||
ProcessingTaskPayload,
|
||||
} from "@/process/async/schemas";
|
||||
import { resolveCrawlerConfig } from "@/process/crawler";
|
||||
import { HtmlCrawler } from "@/process/parsers/html";
|
||||
import { WordPressCrawler } from "@/process/parsers/wordpress";
|
||||
import { JsonlPersistor } from "@/process/persistence";
|
||||
import { Article, HtmlSourceConfig, SourceKindSchema, WordPressSourceConfig } from "@/schema";
|
||||
import { createDateRange, formatDateRange, formatPageRange, resolveSourceConfig } from "@/utils";
|
||||
|
||||
export const collectHtmlListing = async (
|
||||
payload: ListingTaskPayload,
|
||||
manager: QueueManager = createQueueManager(),
|
||||
): Promise<number> => {
|
||||
const source = resolveSourceConfig(payload.sourceId) as HtmlSourceConfig;
|
||||
if (source.sourceKind !== "html") {
|
||||
return await collectWordPressListing(payload, manager);
|
||||
}
|
||||
|
||||
const settings = resolveCrawlerConfig(source, payload);
|
||||
const crawler = new HtmlCrawler(settings);
|
||||
const pageRange = settings.pageRange ?? (await crawler.getPagination());
|
||||
|
||||
let queued = 0;
|
||||
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
|
||||
const target = crawler.buildPageUrl(page) ?? `${source.sourceUrl}`;
|
||||
|
||||
try {
|
||||
const items = await crawler.fetchLinks(target, source.sourceSelectors.articles);
|
||||
for (const node of items) {
|
||||
const url = crawler.extractLink(node);
|
||||
if (!url) continue;
|
||||
|
||||
await manager.enqueueArticle({
|
||||
category: payload.category,
|
||||
dateRange: createDateRange(payload.dateRange),
|
||||
sourceId: payload.sourceId,
|
||||
url,
|
||||
} as DetailsTaskPayload);
|
||||
queued += 1;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error({ error, target }, "Failed to crawl page");
|
||||
}
|
||||
}
|
||||
|
||||
return queued;
|
||||
};
|
||||
|
||||
export const collectWordPressListing = async (
|
||||
payload: ListingTaskPayload,
|
||||
manager: QueueManager = createQueueManager(),
|
||||
): Promise<number> => {
|
||||
const source = resolveSourceConfig(payload.sourceId) as WordPressSourceConfig;
|
||||
if (source.sourceKind !== "wordpress") {
|
||||
return await collectHtmlListing(payload, manager);
|
||||
}
|
||||
|
||||
const settings = resolveCrawlerConfig(source, payload);
|
||||
const crawler = new WordPressCrawler(settings);
|
||||
const pageRange = settings.pageRange ?? (await crawler.getPagination());
|
||||
|
||||
let queued = 0;
|
||||
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
|
||||
const url = crawler.postsEndpoint(page);
|
||||
|
||||
try {
|
||||
const entries = await crawler.fetchLinks(url);
|
||||
for (const data of entries) {
|
||||
const url = data.link;
|
||||
if (!url) continue;
|
||||
|
||||
await manager.enqueueArticle({
|
||||
category: payload.category,
|
||||
data,
|
||||
dateRange: createDateRange(payload.dateRange),
|
||||
sourceId: payload.sourceId,
|
||||
url,
|
||||
} as DetailsTaskPayload);
|
||||
queued += 1;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error({ error, page }, "Failed to fetch WordPress page");
|
||||
}
|
||||
}
|
||||
|
||||
return queued;
|
||||
};
|
||||
|
||||
/**
 * Fetches and persists a single article for the given details payload,
 * dispatching to the HTML or WordPress crawler based on the source kind.
 * NOTE(review): the persistors created here are never explicitly closed —
 * confirm JsonlPersistor flushes per write, or route through closePersistors.
 */
export const collectArticle = async (payload: DetailsTaskPayload): Promise<unknown> => {
  const source = resolveSourceConfig(payload.sourceId);
  // Ranges are re-serialized to the string form CrawlingOptions expects.
  const settings = resolveCrawlerConfig(source, {
    category: payload.category,
    dateRange: payload.dateRange ? formatDateRange(payload.dateRange) : undefined,
    pageRange: payload.pageRange ? formatPageRange(payload.pageRange) : undefined,
    sourceId: payload.sourceId,
  });
  const persistors = [
    new JsonlPersistor({
      directory: config.paths.data,
      sourceId: String(source.sourceId),
    }),
  ];

  if (source.sourceKind === SourceKindSchema.enum.html) {
    // HTML sources need the article page itself before parsing.
    if (!payload.url) throw new Error("Missing article url");
    const crawler = new HtmlCrawler(settings, { persistors });
    const html = await crawler.crawl(payload.url);
    return await crawler.fetchOne(html, settings.dateRange);
  }

  if (source.sourceKind === SourceKindSchema.enum.wordpress) {
    // WordPress sources carry the raw post object in payload.data.
    const crawler = new WordPressCrawler(settings, { persistors });
    return await crawler.fetchOne(payload.data ?? {}, settings.dateRange);
  }

  throw new Error(`Unsupported source kind`);
};
|
||||
|
||||
export const forwardForProcessing = async (payload: ProcessingTaskPayload): Promise<Article> => {
|
||||
logger.info({ article: payload.article.title }, "Ready for downstream processing");
|
||||
|
||||
const client = new SyncHttpClient(config.fetch.client);
|
||||
const endpoint = env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT");
|
||||
|
||||
await client.post(endpoint, { json: payload.article });
|
||||
logger.info({ article: payload.article.title }, "Forwarded article to API");
|
||||
|
||||
return payload.article;
|
||||
};
|
||||
@@ -0,0 +1,107 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
import { JobsOptions, Queue, QueueOptions } from "bullmq";
|
||||
import IORedis from "ioredis";
|
||||
import { config, FetchAsyncConfig } from "@/config";
|
||||
import {
|
||||
DetailsTaskPayload,
|
||||
DetailsTaskPayloadSchema,
|
||||
ListingTaskPayload,
|
||||
ListingTaskPayloadSchema,
|
||||
ProcessingTaskPayload,
|
||||
ProcessingTaskPayloadSchema,
|
||||
} from "@/process/async/schemas";
|
||||
import { parseRedisUrl } from "@/utils";
|
||||
|
||||
/** Minimal queue surface: enqueue a named job and receive its id. */
export interface QueueBackend<T = unknown> {
  add: (name: string, data: T, opts?: JobsOptions) => Promise<{ id: string }>;
}

/**
 * Factory producing a QueueBackend for a queue name.
 * A shared Redis connection may be supplied; implementations otherwise
 * create their own.
 */
export type QueueFactory = (
  queueName: string,
  settings: FetchAsyncConfig,
  connection?: IORedis,
) => QueueBackend;
|
||||
|
||||
/**
 * Default queue factory backed by BullMQ, exposing only a minimal add().
 * NOTE(review): when no connection is supplied, a new IORedis connection is
 * created per queue and never closed here — confirm callers always pass one
 * (createQueueManager does).
 */
const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => {
  const redisConnection =
    connection ??
    new IORedis(settings.redisUrl, {
      ...parseRedisUrl(settings.redisUrl),
      // BullMQ requires maxRetriesPerRequest: null for its blocking commands.
      maxRetriesPerRequest: null,
    });
  const options: QueueOptions = {
    connection: redisConnection,
    prefix: settings.prefix,
  };

  const queue = new Queue(queueName, options);
  return {
    add: async (name, data, opts) => {
      const job = await queue.add(name, data, {
        // A TTL of 0 means "do not keep" — mapped to BullMQ's remove flags.
        removeOnComplete: settings.ttl.result === 0 ? true : undefined,
        removeOnFail: settings.ttl.failure === 0 ? true : undefined,
        ...opts,
      });
      // BullMQ job ids may be undefined; fall back to a generated UUID.
      return { id: job.id ?? randomUUID() };
    },
  };
};
|
||||
|
||||
/** Injection points for tests: custom factory and/or shared Redis connection. */
export interface CreateQueueManagerOptions {
  queueFactory?: QueueFactory;
  connection?: IORedis;
}

/** Typed facade over the listing/details/processing queues. */
export interface QueueManager {
  readonly settings: FetchAsyncConfig;
  // Shared Redis connection owned by the manager; released by close().
  readonly connection: IORedis;
  enqueueListing: (payload: ListingTaskPayload) => Promise<{ id: string }>;
  enqueueArticle: (payload: DetailsTaskPayload) => Promise<{ id: string }>;
  enqueueProcessed: (payload: ProcessingTaskPayload) => Promise<{ id: string }>;
  // Fully-qualified (prefix-included) names of all managed queues.
  iterQueueNames: () => string[];
  // Builds a fully-qualified queue name from a suffix.
  queueName: (suffix: string) => string;
  close: () => Promise<void>;
}
|
||||
|
||||
/**
 * Creates a QueueManager bound to the configured Redis instance, exposing
 * typed enqueue helpers for the listing, details and processing queues.
 * Each payload is validated with its zod schema before being enqueued.
 */
export const createQueueManager = (options: CreateQueueManagerOptions = {}): QueueManager => {
  const settings = config.fetch.async;

  const connection =
    options.connection ??
    new IORedis(settings.redisUrl, {
      ...parseRedisUrl(settings.redisUrl),
      // Required by BullMQ for blocking Redis commands.
      maxRetriesPerRequest: null,
    });
  const factory = options.queueFactory ?? defaultQueueFactory;

  // All queues share the single manager-owned connection.
  const ensureQueue = (queueName: string) => factory(queueName, settings, connection);

  return {
    close: async () => {
      await connection.quit();
    },
    connection,
    enqueueArticle: (payload) => {
      const data = DetailsTaskPayloadSchema.parse(payload);
      const queue = ensureQueue(settings.queues.details);
      return queue.add("collect_article", data);
    },
    enqueueListing: (payload) => {
      const data = ListingTaskPayloadSchema.parse(payload);
      const queue = ensureQueue(settings.queues.listing);
      return queue.add("collect_listing", data);
    },
    enqueueProcessed: (payload) => {
      const data = ProcessingTaskPayloadSchema.parse(payload);
      const queue = ensureQueue(settings.queues.processing);
      return queue.add("forward_for_processing", data);
    },
    // Fully-qualified names used by workers to subscribe to every queue.
    iterQueueNames: () => [
      `${settings.prefix}:${settings.queues.listing}`,
      `${settings.prefix}:${settings.queues.details}`,
      `${settings.prefix}:${settings.queues.processing}`,
    ],
    queueName: (suffix: string) => `${settings.prefix}:${suffix}`,
    settings,
  };
};
|
||||
@@ -0,0 +1,28 @@
|
||||
import { z } from "zod";
|
||||
import { ArticleSchema, DateRangeSchema, PageRangeSchema } from "@/schema";
|
||||
|
||||
/** Payload for a listing-collection job; ranges travel in string form. */
export const ListingTaskPayloadSchema = z.object({
  category: z.string().optional(),
  dateRange: z.string().optional(),
  pageRange: z.string().optional(),
  sourceId: z.string(),
});

/** Payload for a single-article collection job. */
export const DetailsTaskPayloadSchema = z.object({
  category: z.string().optional(),
  // Raw source entry (e.g. a WordPress post object).
  // NOTE(review): z.any() disables validation here — consider z.unknown().
  data: z.any().optional(),
  dateRange: DateRangeSchema.optional(),
  page: z.number().int().nonnegative().optional(),
  pageRange: PageRangeSchema.optional(),
  sourceId: z.string(),
  // NOTE(review): z.url() is zod v4 API; on zod v3 this must be z.string().url().
  url: z.url(),
});

/** Payload handing a fully-parsed article to downstream processing. */
export const ProcessingTaskPayloadSchema = z.object({
  article: ArticleSchema,
  sourceId: z.string(),
});

export type ListingTaskPayload = z.infer<typeof ListingTaskPayloadSchema>;
export type DetailsTaskPayload = z.infer<typeof DetailsTaskPayloadSchema>;
export type ProcessingTaskPayload = z.infer<typeof ProcessingTaskPayloadSchema>;
|
||||
@@ -0,0 +1,60 @@
|
||||
import { logger } from "@basango/logger";
|
||||
import * as handlers from "@/process/async/handlers";
|
||||
import { createQueueManager } from "@/process/async/queue";
|
||||
import {
|
||||
DetailsTaskPayloadSchema,
|
||||
ListingTaskPayloadSchema,
|
||||
ProcessingTaskPayloadSchema,
|
||||
} from "@/process/async/schemas";
|
||||
import { CrawlingOptions } from "@/process/crawler";
|
||||
|
||||
export const collectListing = async (payload: unknown): Promise<number> => {
|
||||
const data = ListingTaskPayloadSchema.parse(payload);
|
||||
logger.debug({ data }, "Collecting listing");
|
||||
|
||||
const count = await handlers.collectHtmlListing(data);
|
||||
logger.info({ count }, "Listing collection completed");
|
||||
|
||||
return count;
|
||||
};
|
||||
|
||||
export const collectArticle = async (payload: unknown): Promise<unknown> => {
|
||||
const data = DetailsTaskPayloadSchema.parse(payload);
|
||||
logger.info({ data }, "Collecting article");
|
||||
|
||||
const result = await handlers.collectArticle(data);
|
||||
logger.info({ url: data.url }, "Article collection completed");
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
export const forwardForProcessing = async (payload: unknown): Promise<unknown> => {
|
||||
const data = ProcessingTaskPayloadSchema.parse(payload);
|
||||
logger.debug({ sourceId: data.sourceId }, "Forwarding article for processing");
|
||||
|
||||
const result = await handlers.forwardForProcessing(data);
|
||||
logger.info({ result }, "Article forwarded for processing");
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
export const scheduleAsyncCrawl = async (options: CrawlingOptions): Promise<string> => {
|
||||
const payload = ListingTaskPayloadSchema.parse({
|
||||
category: options.category,
|
||||
dateRange: options.dateRange,
|
||||
pageRange: options.pageRange,
|
||||
sourceId: options.sourceId,
|
||||
});
|
||||
|
||||
const manager = createQueueManager();
|
||||
logger.info({ payload }, "Scheduling listing collection job");
|
||||
|
||||
try {
|
||||
const job = await manager.enqueueListing(payload);
|
||||
logger.info({ job }, "Scheduled listing collection job");
|
||||
|
||||
return job.id;
|
||||
} finally {
|
||||
await manager.close();
|
||||
}
|
||||
};
|
||||
@@ -0,0 +1,74 @@
|
||||
import { QueueEvents, Worker } from "bullmq";
|
||||
import IORedis from "ioredis";
|
||||
|
||||
import { QueueFactory, QueueManager } from "@/process/async/queue";
|
||||
import { collectArticle, collectListing, forwardForProcessing } from "@/process/async/tasks";
|
||||
|
||||
/**
 * Options for startWorker. queueManager is required and owns the Redis
 * connection used by all workers.
 * NOTE(review): `connection` and `queueFactory` appear unused by
 * startWorker — confirm whether they can be removed.
 */
export interface WorkerOptions {
  // Defaults to every queue name reported by the manager.
  queueNames?: string[];
  connection?: IORedis;
  queueFactory?: QueueFactory;
  // Jobs processed in parallel per worker (default 5).
  concurrency?: number;
  // Invoked on job failures and worker-level errors.
  onError?: (error: Error) => void;
  queueManager: QueueManager;
}

/** Handle to running workers; close() stops workers and event streams. */
export interface WorkerHandle {
  readonly workers: Worker[];
  readonly events: QueueEvents[];
  close: () => Promise<void>;
}
|
||||
|
||||
export const startWorker = (options: WorkerOptions): WorkerHandle => {
|
||||
const manager = options.queueManager;
|
||||
const queueNames = options.queueNames ?? manager.iterQueueNames();
|
||||
const workers: Worker[] = [];
|
||||
const events: QueueEvents[] = [];
|
||||
|
||||
const connection = manager.connection;
|
||||
|
||||
for (const queueName of queueNames) {
|
||||
const worker = new Worker(
|
||||
queueName,
|
||||
async (job) => {
|
||||
switch (job.name) {
|
||||
case "collect_listing":
|
||||
return collectListing(job.data);
|
||||
case "collect_article":
|
||||
return collectArticle(job.data);
|
||||
case "forward_for_processing":
|
||||
return forwardForProcessing(job.data);
|
||||
default:
|
||||
throw new Error(`Unknown job name: ${job.name}`);
|
||||
}
|
||||
},
|
||||
{
|
||||
concurrency: options.concurrency ?? 5,
|
||||
connection,
|
||||
},
|
||||
);
|
||||
|
||||
if (options.onError) {
|
||||
worker.on("failed", (_, err) => options.onError?.(err as Error));
|
||||
worker.on("error", (err) => options.onError?.(err as Error));
|
||||
}
|
||||
|
||||
const queueEvents = new QueueEvents(queueName, { connection });
|
||||
|
||||
workers.push(worker);
|
||||
events.push(queueEvents);
|
||||
}
|
||||
|
||||
return {
|
||||
close: async () => {
|
||||
await Promise.all(workers.map((worker) => worker.close()));
|
||||
await Promise.all(events.map((event) => event.close()));
|
||||
|
||||
if (!options.queueManager) {
|
||||
await manager.close();
|
||||
}
|
||||
},
|
||||
events,
|
||||
workers,
|
||||
};
|
||||
};
|
||||
@@ -0,0 +1,44 @@
|
||||
import logger from "@basango/logger";
|
||||
import { config, FetchCrawlerConfig } from "@/config";
|
||||
import { JsonlPersistor, Persistor } from "@/process/persistence";
|
||||
import { AnySourceConfig } from "@/schema";
|
||||
import { createDateRange, createPageRange } from "@/utils";
|
||||
|
||||
/** CLI-level options describing what to crawl. */
export interface CrawlingOptions {
  /** Identifier of the configured source to crawl. */
  sourceId: string;
  /** Optional "start:end" page range spec (parsed by createPageRange). */
  pageRange?: string | undefined;
  /** Optional "start:end" date range spec (parsed by createDateRange). */
  dateRange?: string | undefined;
  /** Optional category substituted into {category} pagination templates. */
  category?: string | undefined;
}
|
||||
|
||||
export const resolveCrawlerConfig = (
|
||||
source: AnySourceConfig,
|
||||
options: CrawlingOptions,
|
||||
): FetchCrawlerConfig => {
|
||||
return {
|
||||
...config.fetch.crawler,
|
||||
category: options.category,
|
||||
dateRange: createDateRange(options.dateRange),
|
||||
pageRange: createPageRange(options.pageRange),
|
||||
source,
|
||||
};
|
||||
};
|
||||
|
||||
export const createPersistors = (source: AnySourceConfig): Persistor[] => {
|
||||
return [
|
||||
new JsonlPersistor({
|
||||
directory: config.paths.data,
|
||||
sourceId: source.sourceId,
|
||||
}),
|
||||
];
|
||||
};
|
||||
|
||||
export const closePersistors = async (persistors: Persistor[]): Promise<void> => {
|
||||
for (const persistor of persistors) {
|
||||
try {
|
||||
await persistor.close();
|
||||
} catch (error) {
|
||||
logger.warn({ error }, "Failed to close persistor");
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -0,0 +1,107 @@
|
||||
import { HTMLElement, parse as parseHtml } from "node-html-parser";
|
||||
import { config, FetchCrawlerConfig } from "@/config";
|
||||
import { SyncHttpClient } from "@/http/http-client";
|
||||
import { OpenGraph } from "@/http/open-graph";
|
||||
import type { Persistor } from "@/process/persistence";
|
||||
import { AnySourceConfig, Article } from "@/schema";
|
||||
|
||||
/** Construction options shared by all crawlers. */
export interface CrawlerOptions {
  /** Sinks that receive every persisted article; defaults to none. */
  persistors?: Persistor[];
}
|
||||
|
||||
/**
 * Common behaviour shared by all crawlers: HTTP access, HTML parsing helpers
 * and Open Graph enrichment. Subclasses implement {@link fetch}.
 */
export abstract class BaseCrawler {
  /** Resolved crawler settings (page/date ranges, category, bound source). */
  protected readonly settings: FetchCrawlerConfig;
  /** The source configuration this crawler is bound to. */
  protected readonly source: AnySourceConfig;
  /** HTTP client configured from the global fetch settings. */
  protected readonly http: SyncHttpClient;
  /** Sinks that receive every persisted article. */
  protected readonly persistors: Persistor[];
  /** Helper used to enrich records with Open Graph metadata. */
  protected readonly openGraph: OpenGraph;

  /**
   * @param settings - Crawler settings; must carry a bound source
   * @param options - Optional persistors
   * @throws Error when no source is bound to the settings
   */
  protected constructor(settings: FetchCrawlerConfig, options: CrawlerOptions = {}) {
    if (!settings.source) {
      throw new Error("Crawler requires a bound source");
    }

    this.http = new SyncHttpClient(config.fetch.client);
    this.persistors = options.persistors ?? [];
    this.openGraph = new OpenGraph();

    this.settings = settings;
    this.source = settings.source as AnySourceConfig;
  }

  /**
   * Fetch and process articles from the source.
   */
  abstract fetch(): Promise<void> | void;

  /**
   * Crawl the given URL and return the HTML content as a string.
   * @param url - The URL to crawl
   */
  async crawl(url: string): Promise<string> {
    const response = await this.http.get(url);
    return await response.text();
  }

  /**
   * Extract trimmed text content from an HTML node.
   * Returns null for a missing node or when the text trims to empty.
   * @param node - The HTML node
   */
  protected textContent(node: HTMLElement | null | undefined): string | null {
    if (!node) return null;
    // innerText keeps spacing similar to browser rendering
    const value = node.innerText ?? node.text;
    const text = value.trim();
    return text.length ? text : null;
  }

  /**
   * Extract the first matching element from the root using the selector.
   * Invalid selectors are treated as "no match" rather than raising.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   */
  protected extractFirst(root: HTMLElement, selector?: string | null): HTMLElement | null {
    if (!selector) return null;
    try {
      return root.querySelector(selector) ?? null;
    } catch {
      return null;
    }
  }

  /**
   * Extract all matching elements from the root using the selector.
   * Invalid selectors yield an empty list rather than raising.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   */
  protected extractAll(root: HTMLElement, selector?: string | null): HTMLElement[] {
    if (!selector) return [];
    try {
      return root.querySelectorAll(selector);
    } catch {
      return [];
    }
  }

  /**
   * Parse HTML string into an HTMLElement.
   * @param html - The HTML string
   */
  protected parseHtml(html: string): HTMLElement {
    // NOTE(review): double cast bridges node-html-parser's root type to
    // HTMLElement — confirm against the installed library version.
    return parseHtml(html) as unknown as HTMLElement;
  }

  /**
   * Enrich the record with Open Graph metadata from the given URL.
   * Any failure degrades to an undefined metadata field (best-effort).
   * @param record - The article record
   * @param url - The URL to fetch Open Graph data from
   */
  protected async enrichWithOpenGraph(record: Article, url?: string): Promise<Article> {
    try {
      const metadata = url ? await this.openGraph.consumeUrl(url) : undefined;
      return { ...record, metadata };
    } catch {
      return { ...record, metadata: undefined };
    }
  }
}
|
||||
@@ -0,0 +1,335 @@
|
||||
import { logger } from "@basango/logger";
|
||||
import { getUnixTime, isMatch as isDateMatch, parse as parseDateFns } from "date-fns";
|
||||
import { HTMLElement } from "node-html-parser";
|
||||
import TurndownService from "turndown";
|
||||
import { FetchCrawlerConfig } from "@/config";
|
||||
import { BaseCrawler } from "@/process/parsers/base";
|
||||
import { Persistor, persist } from "@/process/persistence";
|
||||
import { DateRange, HtmlSourceConfig } from "@/schema";
|
||||
import { createAbsoluteUrl, isTimestampInRange } from "@/utils";
|
||||
|
||||
// Shared HTML -> Markdown converter used for article bodies.
const md = new TurndownService({
  bulletListMarker: "-",
  headingStyle: "atx",
  hr: "---",
});
|
||||
|
||||
/**
|
||||
* Create a safe RegExp from the given pattern.
|
||||
* @param pattern
|
||||
*/
|
||||
const safeRegExp = (pattern?: string | null): RegExp | null => {
|
||||
if (!pattern) return null;
|
||||
try {
|
||||
return new RegExp(pattern, "g");
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Crawler for generic HTML pages.
|
||||
*/
|
||||
/**
 * Crawler for generic HTML pages. Walks listing pages via a pagination
 * template, extracts article fields with the source's CSS selectors, and
 * converts bodies to Markdown before persisting.
 */
export class HtmlCrawler extends BaseCrawler {
  readonly source: HtmlSourceConfig;
  // Link of the article currently being processed; used by fetchOne when the
  // detail page itself carries no link selector match. Reset after each article.
  private currentArticleUrl: string | null = null;

  /**
   * @param settings - Crawler settings bound to an HTML source
   * @param options - Optional persistors
   * @throws Error when the bound source is not of kind "html"
   */
  constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
    super(settings, options);

    if (!settings.source || settings.source.sourceKind !== "html") {
      throw new Error("HtmlCrawler requires a source of kind 'html'");
    }
    this.source = this.settings.source as HtmlSourceConfig;
  }

  /**
   * Fetch and process all pages in the configured (or discovered) page range.
   * Stops the whole crawl as soon as fetchOne() returns null.
   */
  async fetch(): Promise<void> {
    const pageRange = this.settings.pageRange ?? (await this.getPagination());
    const dateRange = this.settings.dateRange;

    const articleSelector = this.source.sourceSelectors.articles;
    if (!articleSelector) {
      logger.error(
        { source: this.source.sourceId },
        "No article selector configured for HTML source",
      );
      return;
    }

    let stop = false;
    for (let page = pageRange.start; page <= pageRange.end; page += 1) {
      const pageUrl = this.buildPageUrl(page);
      let html: string;
      try {
        html = await this.crawl(pageUrl);
      } catch (error) {
        // A failed page is skipped; the crawl continues with the next page.
        logger.error({ error, page, pageUrl }, "> page %s => [failed]", page);
        continue;
      }

      const root = this.parseHtml(html);
      const articles = this.extractAll(root, articleSelector);
      if (!articles.length) {
        logger.info({ page }, "No articles found on page");
        continue;
      }

      for (const node of articles) {
        try {
          this.currentArticleUrl = this.extractLink(node);
          let targetHtml = node.toString();

          // For listing-only sources, fetch the full detail page instead of
          // parsing the listing snippet.
          if (this.source.requiresDetails) {
            if (!this.currentArticleUrl) {
              logger.debug({ page }, "Skipping article without link for details");
              continue;
            }
            try {
              targetHtml = await this.crawl(this.currentArticleUrl);
            } catch (err) {
              logger.error(
                { error: err, url: this.currentArticleUrl },
                "Failed to fetch detail page",
              );
              continue;
            }
          }

          const saved = await this.fetchOne(targetHtml, dateRange);
          // stop early on first out-of-range if pages are sorted by date desc
          // NOTE(review): fetchOne also returns null for an article without a
          // link, which halts the entire crawl here — confirm that is intended.
          if (saved === null) {
            stop = true;
            break;
          }
        } catch (error) {
          logger.error({ error, pageUrl }, "Failed to process article on page");
        } finally {
          this.currentArticleUrl = null;
        }
      }

      if (stop) break;
    }
  }

  /**
   * Fetch and process a single HTML article.
   * @param html - The HTML content of the article
   * @param dateRange - Optional date range for filtering
   * @returns The persisted article, or null when skipped (no link, or
   *   outside the date range)
   */
  async fetchOne(html: string, dateRange?: DateRange | null) {
    const root = this.parseHtml(html);
    const sel = this.source.sourceSelectors;

    const titleText = this.extractText(root, sel.articleTitle) ?? "Untitled";
    // Prefer the link captured on the listing page; fall back to the detail page.
    const link = this.currentArticleUrl ?? this.extractLink(root);
    if (!link) {
      logger.warn({ title: titleText }, "Skipping article without link");
      return null;
    }

    const body = this.extractBody(root, sel.articleBody);
    const categories = this.extractCategories(root, sel.articleCategories);
    const rawDate = this.extractText(root, sel.articleDate);
    const timestamp = this.computeTimestamp(rawDate);

    if (dateRange && !isTimestampInRange(dateRange, timestamp)) {
      logger.info(
        { date: rawDate, link, timestamp, title: titleText },
        "Skipping article outside date range",
      );
      return null;
    }

    const enriched = await this.enrichWithOpenGraph(
      {
        body,
        categories,
        link,
        source: this.source.sourceId,
        timestamp,
        title: titleText,
      },
      link,
    );

    return await persist(enriched, this.persistors);
  }

  /**
   * Fetch links from the target URL using the given selector.
   * @param target - The target URL to crawl
   * @param selector - The CSS selector to extract links
   */
  async fetchLinks(target: string, selector: string) {
    const html = await this.crawl(target);
    const root = this.parseHtml(html);
    return this.extractAll(root, selector);
  }

  /**
   * Get the pagination range (start and end page numbers).
   * NOTE(review): HTML pagination starts at page 0 here, while the WordPress
   * crawler starts at 1 — confirm the asymmetry is intended.
   */
  async getPagination(): Promise<{ start: number; end: number }> {
    return { end: await this.getLastPage(), start: 0 };
  }

  /**
   * Determine the last page number from pagination links on page 0.
   * Falls back to 1 on any failure or when no usable number is found.
   */
  private async getLastPage(): Promise<number> {
    const template = this.applyCategory(this.source.paginationTemplate);
    const url = `${this.source.sourceUrl}${template}`;
    try {
      const html = await this.crawl(url);
      const root = this.parseHtml(html);
      const links = this.extractAll(root, this.source.sourceSelectors.pagination);
      if (!links.length) return 1;
      const last = links[links.length - 1]!;
      const href = last.getAttribute("href") as string | null;
      if (!href) return 1;

      // Heuristic: prefer a number in the href, else "page" query param
      const numberMatch = href.match(/(\d+)/);
      if (numberMatch) {
        const page = Number.parseInt(numberMatch[1]!, 10);
        return Number.isFinite(page) && page > 0 ? page : 1;
      }
      const urlObj = new URL(createAbsoluteUrl(this.source.sourceUrl, href));
      const pageParam = urlObj.searchParams.get("page");
      if (pageParam) {
        const page = Number.parseInt(pageParam, 10);
        return Number.isFinite(page) && page > 0 ? page : 1;
      }
      return 1;
    } catch {
      return 1;
    }
  }

  /**
   * Build the URL for a given page number.
   * Uses a {page} placeholder when present, otherwise appends a "page"
   * query parameter for pages > 0.
   * @param page - The page number
   */
  buildPageUrl(page: number): string {
    let template = this.applyCategory(this.source.paginationTemplate);
    if (template.includes("{page}")) {
      template = template.replace("{page}", String(page));
    } else if (page > 0) {
      const sep = template.includes("?") ? "&" : "?";
      template = `${template}${sep}page=${page}`;
    }
    return createAbsoluteUrl(this.source.sourceUrl, template);
  }

  /**
   * Apply category replacement in the template if needed.
   * An unset category replaces the placeholder with an empty string.
   * @param template - The URL template
   */
  private applyCategory(template: string): string {
    if (template.includes("{category}")) {
      const replacement = this.settings.category ?? "";
      return template.replace("{category}", replacement);
    }
    return template;
  }

  /**
   * Extract an absolute link URL from the given node using the source's
   * articleLink selector. Checks href, then data-href, then src.
   * @param node - The HTML element
   */
  extractLink(node: HTMLElement): string | null {
    const selector = this.source.sourceSelectors.articleLink;
    if (!selector) return null;
    const target = this.extractFirst(node, selector);
    if (!target) return null;

    const href =
      target.getAttribute("href") ?? target.getAttribute("data-href") ?? target.getAttribute("src");

    if (!href) return null;
    return createAbsoluteUrl(this.source.sourceUrl, href);
  }

  /**
   * Extract text content from the root using the selector.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   */
  private extractText(root: HTMLElement, selector?: string | null): string | null {
    if (!selector) return null;
    const target = this.extractFirst(root, selector);
    if (!target) return null;

    // If it's an image, prefer alt/title
    const tag = target.tagName.toLowerCase();
    if (tag === "img") {
      const alt = target.getAttribute("alt");
      const title = target.getAttribute("title");
      const pick = (alt ?? title ?? "").trim();
      if (pick.length > 0) return pick;
    }
    return this.textContent(target);
  }

  /**
   * Extract body content as Markdown from the root using the selector.
   * Falls back to converting the whole root when the selector matches nothing.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   */
  private extractBody(root: HTMLElement, selector?: string | null): string {
    if (selector) {
      const nodes = this.extractAll(root, selector);
      if (nodes.length) {
        const parts = nodes.map((n) => md.turndown(n.toString())).filter(Boolean);
        if (parts.length) return parts.join("\n");
      }
    }
    return md.turndown(root.toString());
  }

  /**
   * Extract lowercase, de-duplicated categories from the root.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   */
  private extractCategories(root: HTMLElement, selector?: string | null): string[] {
    if (!selector) return [];
    const values: string[] = [];
    for (const node of this.extractAll(root, selector)) {
      const text = this.textContent(node);
      if (!text) continue;
      const lower = text.toLowerCase();
      if (!values.includes(lower)) values.push(lower);
    }
    return values;
  }

  /**
   * Compute a UNIX timestamp (seconds) from a raw date string, applying the
   * source's optional cleanup pattern and date-fns format. Falls back to
   * Date.parse, then to "now", when parsing fails.
   * @param raw - Raw date string
   * @private
   */
  private computeTimestamp(raw?: string | null): number {
    if (!raw) return Math.floor(Date.now() / 1000);
    let value = raw.trim();
    const pattern = safeRegExp(this.source.sourceDate?.pattern);
    const replacement = this.source.sourceDate?.replacement ?? "";
    if (pattern) {
      try {
        value = value.replace(pattern, replacement);
      } catch {
        // ignore pattern failures
      }
    }
    const format = this.source.sourceDate?.format ?? "yyyy-LL-dd HH:mm";
    if (!isDateMatch(value, format)) {
      // fallback: try native Date.parse as last resort
      const parsed = Date.parse(value);
      return Number.isNaN(parsed) ? Math.floor(Date.now() / 1000) : Math.floor(parsed / 1000);
    }
    const date = parseDateFns(value, format, new Date());
    const ts = getUnixTime(date);
    return Number.isFinite(ts) ? ts : Math.floor(Date.now() / 1000);
  }
}
|
||||
@@ -0,0 +1,239 @@
|
||||
import { logger } from "@basango/logger";
|
||||
import TurndownService from "turndown";
|
||||
import { FetchCrawlerConfig } from "@/config";
|
||||
import { BaseCrawler } from "@/process/parsers/base";
|
||||
import { Persistor, persist } from "@/process/persistence";
|
||||
import { DateRange, PageRange, WordPressSourceConfig } from "@/schema";
|
||||
|
||||
// Shared HTML -> Markdown converter used for WordPress post bodies.
const md = new TurndownService({
  bulletListMarker: "-",
  headingStyle: "atx",
  hr: "---",
});
|
||||
|
||||
/**
 * Minimal shape of a post returned by the WordPress REST API
 * (`wp-json/wp/v2/posts`) — only the fields requested via `_fields`.
 */
interface WordPressPost {
  /** Canonical public URL of the post. */
  link?: string;
  /** URL slug; used as a title fallback. */
  slug?: string;
  /** Rendered HTML title. */
  title?: { rendered?: string };
  /** Rendered HTML body. */
  content?: { rendered?: string };
  /** Publication date string as returned by the API. */
  date?: string;
  /** Numeric category IDs, resolved to slugs via the categories endpoint. */
  categories?: number[];
}
|
||||
|
||||
/**
|
||||
* Crawler for WordPress sites using the REST API.
|
||||
*/
|
||||
export class WordPressCrawler extends BaseCrawler {
|
||||
readonly source: WordPressSourceConfig;
|
||||
private categoryMap: Map<number, string> = new Map();
|
||||
|
||||
private static readonly POST_QUERY =
|
||||
"_fields=date,slug,link,title.rendered,content.rendered,categories&orderby=date&order=desc";
|
||||
private static readonly CATEGORY_QUERY =
|
||||
"_fields=id,slug,count&orderby=count&order=desc&per_page=100";
|
||||
private static readonly TOTAL_PAGES_HEADER = "x-wp-totalpages";
|
||||
private static readonly TOTAL_POSTS_HEADER = "x-wp-total";
|
||||
|
||||
constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
|
||||
super(settings, options);
|
||||
|
||||
if (!settings.source || settings.source.sourceKind !== "wordpress") {
|
||||
throw new Error("HtmlCrawler requires a source of kind 'wordpress'");
|
||||
}
|
||||
this.source = this.settings.source as WordPressSourceConfig;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch and process WordPress posts.
|
||||
*/
|
||||
async fetch(): Promise<void> {
|
||||
const pageRange = this.settings.pageRange ?? (await this.getPagination());
|
||||
const dateRange = this.settings.dateRange;
|
||||
|
||||
let stop = false;
|
||||
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
|
||||
const endpoint = this.postsEndpoint(page);
|
||||
try {
|
||||
const response = await this.http.get(endpoint);
|
||||
const data = (await response.json()) as unknown;
|
||||
const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
|
||||
if (!Array.isArray(data)) {
|
||||
logger.warn({ page, type: typeof data }, "Unexpected WordPress payload type");
|
||||
}
|
||||
|
||||
for (const entry of articles) {
|
||||
const saved = await this.fetchOne(entry, dateRange);
|
||||
if (saved === null) {
|
||||
stop = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error({ error, page }, "> page %s => [failed]", page);
|
||||
continue;
|
||||
}
|
||||
if (stop) break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch links from a WordPress posts endpoint.
|
||||
* @param url - The posts endpoint URL
|
||||
*/
|
||||
async fetchLinks(url: string) {
|
||||
const response = await this.http.get(url);
|
||||
const data = (await response.json()) as unknown;
|
||||
const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
|
||||
if (!Array.isArray(data)) {
|
||||
logger.warn({ type: typeof data }, "Unexpected WordPress payload type");
|
||||
}
|
||||
return articles;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch and process a single WordPress post.
|
||||
* @param input - Decoded JSON object or raw JSON string
|
||||
* @param dateRange - Optional date range for filtering
|
||||
*/
|
||||
async fetchOne(input: unknown, dateRange?: DateRange | null) {
|
||||
// input can be the decoded JSON object or a raw JSON string
|
||||
let data: WordPressPost | null = null;
|
||||
try {
|
||||
if (typeof input === "string") {
|
||||
data = JSON.parse(input) as WordPressPost;
|
||||
} else if (input && typeof input === "object") {
|
||||
data = input as WordPressPost;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error({ error }, "Failed to decode WordPress payload");
|
||||
throw error;
|
||||
}
|
||||
|
||||
if (!data || typeof data !== "object") {
|
||||
throw new Error("Unexpected WordPress payload type");
|
||||
}
|
||||
|
||||
const link = data.link;
|
||||
if (!link) {
|
||||
logger.error("Skipping WordPress article without link");
|
||||
return null;
|
||||
}
|
||||
|
||||
const titleHtml = data.title?.rendered ?? "";
|
||||
const bodyHtml = data.content?.rendered ?? "";
|
||||
const title = this.textContent(this.parseHtml(titleHtml)) ?? data.slug ?? "Untitled";
|
||||
const body = md.turndown(bodyHtml);
|
||||
const timestamp = this.computeTimestamp(data.date);
|
||||
const categories = await this.mapCategories(data.categories ?? []);
|
||||
|
||||
// date range skip as in HTML crawler
|
||||
if (dateRange) {
|
||||
const { isTimestampInRange } = await import("@/utils");
|
||||
if (!isTimestampInRange(dateRange, timestamp)) {
|
||||
logger.info(
|
||||
{ date: data.date, link, timestamp, title },
|
||||
"Skipping article outside date range",
|
||||
);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
const enriched = await this.enrichWithOpenGraph(
|
||||
{
|
||||
body,
|
||||
categories,
|
||||
link,
|
||||
source: this.source.sourceId,
|
||||
timestamp,
|
||||
title,
|
||||
},
|
||||
link,
|
||||
);
|
||||
|
||||
return await persist(enriched, this.persistors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get pagination info from WordPress API.
|
||||
*/
|
||||
async getPagination(): Promise<PageRange> {
|
||||
try {
|
||||
const url = `${this.baseUrl()}wp-json/wp/v2/posts?_fields=id&per_page=100`;
|
||||
const response = await this.http.get(url);
|
||||
const pages = Number.parseInt(
|
||||
response.headers.get(WordPressCrawler.TOTAL_PAGES_HEADER) ?? "1",
|
||||
10,
|
||||
);
|
||||
const posts = Number.parseInt(
|
||||
response.headers.get(WordPressCrawler.TOTAL_POSTS_HEADER) ?? "0",
|
||||
10,
|
||||
);
|
||||
logger.info({ pages, posts }, "WordPress pagination");
|
||||
const end = Number.isFinite(pages) && pages > 0 ? pages : 1;
|
||||
return { end, start: 1 };
|
||||
} catch {
|
||||
return { end: 1, start: 1 };
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get base URL for WordPress REST API.
|
||||
*/
|
||||
private baseUrl(): string {
|
||||
const base = String(this.source.sourceUrl);
|
||||
return base.endsWith("/") ? base : `${base}/`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct posts endpoint URL for a given page.
|
||||
* @param page - Page number
|
||||
*/
|
||||
postsEndpoint(page: number): string {
|
||||
return `${this.baseUrl()}wp-json/wp/v2/posts?${WordPressCrawler.POST_QUERY}&page=${page}&per_page=100`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch and cache WordPress categories.
|
||||
*/
|
||||
private async fetchCategories(): Promise<void> {
|
||||
const url = `${this.baseUrl()}wp-json/wp/v2/categories?${WordPressCrawler.CATEGORY_QUERY}`;
|
||||
const response = await this.http.get(url);
|
||||
const list = (await response.json()) as Array<{ id: number; slug: string }>;
|
||||
for (const c of list) {
|
||||
this.categoryMap.set(c.id, c.slug);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Map category IDs to slugs.
|
||||
* @param ids - Category IDs
|
||||
*/
|
||||
private async mapCategories(ids: number[]): Promise<string[]> {
|
||||
if (this.categoryMap.size === 0) {
|
||||
try {
|
||||
await this.fetchCategories();
|
||||
} catch (error) {
|
||||
logger.warn({ error }, "Failed to fetch WordPress categories");
|
||||
}
|
||||
}
|
||||
const values: string[] = [];
|
||||
for (const id of [...ids].sort((a, b) => a - b)) {
|
||||
const slug = this.categoryMap.get(id);
|
||||
if (slug && !values.includes(slug)) values.push(slug);
|
||||
}
|
||||
return values;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute UNIX timestamp from WordPress date string.
|
||||
* @param raw - Raw date string
|
||||
*/
|
||||
private computeTimestamp(raw?: string | null): number {
|
||||
if (!raw) return Math.floor(Date.now() / 1000);
|
||||
// Normalize WordPress Z into +00:00 for Date parsing robustness
|
||||
const cleaned = raw.replace("Z", "+00:00");
|
||||
const parsed = Date.parse(cleaned);
|
||||
if (!Number.isNaN(parsed)) return Math.floor(parsed / 1000);
|
||||
return Math.floor(Date.now() / 1000);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,102 @@
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import logger from "@basango/logger";
|
||||
import { Article } from "@/schema";
|
||||
import { countTokens } from "@/utils";
|
||||
|
||||
/** Sink that receives sanitized article records. */
export interface Persistor {
  /** Persist one article; may be synchronous or asynchronous. */
  persist(record: Article): Promise<void> | void;
  /** Flush pending work and release underlying resources. */
  close: () => Promise<void> | void;
}

/** Options for file-based persistors. */
export interface PersistorOptions {
  /** Target directory; created recursively if missing. */
  directory: string;
  /** Source identifier used as the output file's base name. */
  sourceId: string;
  /** File extension including the dot (default ".jsonl"). */
  suffix?: string;
  /** Text encoding for writes (default "utf-8"). */
  encoding?: BufferEncoding;
}
|
||||
|
||||
const sanitize = (text: string): string => {
|
||||
if (!text) return text;
|
||||
|
||||
let s = text.replace(/\u00A0/g, " "); // remove NBSP
|
||||
s = s.replace(" ", " "); // remove other NBSP
|
||||
s = s.replace(" ", " "); // remove NARROW NO-BREAK SPACE
|
||||
s = s.replace(/\u200B/g, ""); // remove ZERO WIDTH SPACE
|
||||
s = s.replace(/\u200C/g, ""); // remove ZERO WIDTH NON-JOINER
|
||||
s = s.replace(/\u200D/g, ""); // remove ZERO WIDTH JOINER
|
||||
s = s.replace(/\uFEFF/g, ""); // remove ZERO WIDTH NO-BREAK SPACE
|
||||
s = s.replace(/\r\n/g, "\n"); // normalize CRLF to LF
|
||||
s = s.replace(/\n{2,}/g, "\n"); // collapse multiple newlines to one
|
||||
// s = s.replace(/[ \t]{2,}/g, " "); // collapse multiple spaces/tabs
|
||||
|
||||
return s.trim();
|
||||
};
|
||||
|
||||
export const persist = async (payload: Article, persistors: Persistor[]): Promise<Article> => {
|
||||
const data = {
|
||||
...payload,
|
||||
body: sanitize(payload.body),
|
||||
categories: payload.categories.map(sanitize),
|
||||
title: sanitize(payload.title),
|
||||
};
|
||||
|
||||
const article = {
|
||||
...data,
|
||||
tokenStatistics: {
|
||||
body: countTokens(payload.body),
|
||||
categories: countTokens(payload.categories.join(",")),
|
||||
excerpt: countTokens(payload.body.substring(0, 200)),
|
||||
title: countTokens(payload.title),
|
||||
},
|
||||
} as Article;
|
||||
|
||||
for (const persistor of persistors) {
|
||||
try {
|
||||
await persistor.persist(article);
|
||||
} catch (error) {
|
||||
logger.error({ error }, "Failed to persist article record");
|
||||
}
|
||||
}
|
||||
|
||||
logger.info({ url: article.link }, "article successfully persisted");
|
||||
return article;
|
||||
};
|
||||
|
||||
export class JsonlPersistor implements Persistor {
|
||||
private readonly filePath: string;
|
||||
private readonly encoding: BufferEncoding;
|
||||
private pending: Promise<void> = Promise.resolve();
|
||||
private closed = false;
|
||||
|
||||
constructor(options: PersistorOptions) {
|
||||
const suffix = options.suffix ?? ".jsonl";
|
||||
this.encoding = options.encoding ?? "utf-8";
|
||||
|
||||
fs.mkdirSync(options.directory, { recursive: true });
|
||||
this.filePath = path.join(options.directory, `${options.sourceId}${suffix}`);
|
||||
|
||||
if (!fs.existsSync(this.filePath)) {
|
||||
fs.writeFileSync(this.filePath, "", { encoding: this.encoding });
|
||||
}
|
||||
}
|
||||
|
||||
persist(record: Article): Promise<void> {
|
||||
if (this.closed) {
|
||||
return Promise.reject(new Error("Persistor has been closed"));
|
||||
}
|
||||
|
||||
const payload = `${JSON.stringify(record)}\n`;
|
||||
|
||||
this.pending = this.pending.then(async () => {
|
||||
fs.appendFileSync(this.filePath, payload, { encoding: this.encoding });
|
||||
});
|
||||
|
||||
return this.pending;
|
||||
}
|
||||
|
||||
async close(): Promise<void> {
|
||||
this.closed = true;
|
||||
await this.pending;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
import logger from "@basango/logger";
|
||||
import {
|
||||
CrawlingOptions,
|
||||
closePersistors,
|
||||
createPersistors,
|
||||
resolveCrawlerConfig,
|
||||
} from "@/process/crawler";
|
||||
import { HtmlCrawler } from "@/process/parsers/html";
|
||||
import { WordPressCrawler } from "@/process/parsers/wordpress";
|
||||
import { resolveSourceConfig } from "@/utils";
|
||||
|
||||
export const runSyncCrawl = async (options: CrawlingOptions): Promise<void> => {
|
||||
const source = resolveSourceConfig(options.sourceId);
|
||||
const settings = resolveCrawlerConfig(source, options);
|
||||
const persistors = createPersistors(source);
|
||||
|
||||
const crawler =
|
||||
source.sourceKind === "wordpress"
|
||||
? new WordPressCrawler(settings, { persistors })
|
||||
: new HtmlCrawler(settings, { persistors });
|
||||
|
||||
try {
|
||||
await crawler.fetch();
|
||||
} finally {
|
||||
await closePersistors(persistors);
|
||||
}
|
||||
|
||||
logger.info({ ...options }, "Synchronous crawl completed");
|
||||
};
|
||||
@@ -0,0 +1,131 @@
|
||||
import { z } from "zod";
|
||||
|
||||
// Direction in which an incremental update walks the source's pages.
export const UpdateDirectionSchema = z.enum(["forward", "backward"]);
// Supported source backends: WordPress REST API or generic HTML scraping.
export const SourceKindSchema = z.enum(["wordpress", "html"]);

// Inclusive UNIX-timestamp window used to filter articles by date.
export const DateRangeSchema = z
  .object({
    end: z.number().int(),
    start: z.number().int(),
  })
  .superRefine((value, ctx) => {
    // Zero is rejected: it almost certainly means "unset" rather than epoch.
    if (value.start === 0 || value.end === 0) {
      ctx.addIssue({
        code: "custom",
        message: "Timestamp cannot be zero",
      });
    }
    if (value.end < value.start) {
      ctx.addIssue({
        code: "custom",
        message: "End timestamp must be greater than or equal to start",
      });
    }
  });
|
||||
|
||||
// Inclusive page-number window consumed by the crawlers.
export const PageRangeSchema = z
  .object({
    end: z.number().int().min(0),
    start: z.number().int().min(0),
  })
  .superRefine((value, ctx) => {
    if (value.end < value.start) {
      ctx.addIssue({
        code: "custom",
        message: "End page must be greater than or equal to start page",
      });
    }
  });

// Parses a "start:end" page spec (e.g. "1:10") into a numeric range.
export const PageRangeSpecSchema = z
  .string()
  .regex(/^[0-9]+:[0-9]+$/, "Invalid page range format. Use start:end")
  .transform((spec) => {
    const [startText, endText] = spec.split(":");
    return {
      // The regex guarantees both halves are digit-only, so parseInt is safe.
      end: Number.parseInt(String(endText), 10),
      start: Number.parseInt(String(startText), 10),
    };
  });

// Splits a "start:end" date spec into its raw halves; actual date parsing
// happens downstream.
// NOTE(review): split(":") splits at the FIRST colon, so date values that
// themselves contain a colon (e.g. ISO timestamps with a time component)
// would be truncated — confirm the expected input format.
export const DateRangeSpecSchema = z
  .string()
  .regex(/.+:.+/, "Expected start:end format")
  .transform((spec) => {
    const [startRaw, endRaw] = spec.split(":");
    return { endRaw: String(endRaw), startRaw: String(startRaw) };
  });
|
||||
|
||||
// How to interpret a source's raw date strings: an optional regex cleanup
// (pattern -> replacement) followed by a date-fns format string.
export const SourceDateSchema = z.object({
  format: z.string().default("yyyy-LL-dd HH:mm"),
  pattern: z.string().nullable().optional(),
  replacement: z.string().nullable().optional(),
});

// Fields common to every source kind.
const BaseSourceSchema = z.object({
  categories: z.array(z.string()).default([]),
  // When true, listing pages only link to articles and the crawler performs
  // a second request per article for the full content.
  requiresDetails: z.boolean().default(false),
  requiresRateLimit: z.boolean().default(false),
  sourceDate: SourceDateSchema,
  sourceId: z.string(),
  sourceKind: SourceKindSchema,
  sourceUrl: z.url(),
  supportsCategories: z.boolean().default(false),
});
|
||||
|
||||
/**
 * Configuration for an HTML (scraped) source: CSS selectors describing
 * where listings, article fields and pagination live in the markup.
 */
export const HtmlSourceConfigSchema = BaseSourceSchema.extend({
  // Template for building listing-page URLs — presumably contains a page
  // placeholder; confirm against the crawler implementation.
  paginationTemplate: z.string(),
  sourceKind: z.literal("html"),
  // CSS selectors used to extract article data from fetched pages.
  sourceSelectors: z.object({
    articleBody: z.string(),
    articleCategories: z.string().optional(),
    articleDate: z.string(),
    articleLink: z.string(),
    articles: z.string(),
    articleTitle: z.string(),
    pagination: z.string().default("ul.pagination > li a"),
  }),
});
|
||||
|
||||
export const WordPressSourceConfigSchema = BaseSourceSchema.extend({
|
||||
sourceDate: SourceDateSchema.default(SourceDateSchema.parse({ format: "yyyy-LL-dd'T'HH:mm:ss" })),
|
||||
sourceKind: z.literal("wordpress"),
|
||||
});
|
||||
|
||||
/**
 * Page-level metadata (title/description/image/url) extracted alongside
 * an article; every field is optional.
 */
export const ArticleMetadataSchema = z.object({
  description: z.string().optional(),
  image: z.string().optional(),
  title: z.string().optional(),
  url: z.url().optional(),
});
|
||||
|
||||
/**
 * Per-field token counts for an article; every count defaults to 0.
 */
export const ArticleTokenStatisticsSchema = z.object({
  body: z.number().int().nonnegative().default(0),
  categories: z.number().int().nonnegative().default(0),
  excerpt: z.number().int().nonnegative().default(0),
  title: z.number().int().nonnegative().default(0),
});
|
||||
|
||||
/**
 * A crawled article as produced by the pipeline.
 */
export const ArticleSchema = z.object({
  body: z.string(),
  categories: z.array(z.string()).default([]),
  link: z.url(),
  metadata: ArticleMetadataSchema.optional(),
  // Identifier of the originating source (matches a configured sourceId).
  source: z.string(),
  // Publication time as a unix timestamp — presumably seconds, matching
  // the DateRange helpers; confirm at write sites.
  timestamp: z.number().int(),
  title: z.string(),
  tokenStatistics: ArticleTokenStatisticsSchema.optional(),
});
|
||||
|
||||
/** Inferred TypeScript types for the schemas above. */
export type ArticleMetadata = z.infer<typeof ArticleMetadataSchema>;
export type Article = z.infer<typeof ArticleSchema>;
export type DateRange = z.infer<typeof DateRangeSchema>;
export type PageRange = z.infer<typeof PageRangeSchema>;
export type HtmlSourceConfig = z.infer<typeof HtmlSourceConfigSchema>;
export type WordPressSourceConfig = z.infer<typeof WordPressSourceConfigSchema>;
/** Union of every supported source configuration variant. */
export type AnySourceConfig = HtmlSourceConfig | WordPressSourceConfig;

/** Options controlling how a date-range spec string is parsed. */
export interface CreateDateRangeOptions {
  /** Date format for both halves of the spec; falls back to the project default. */
  format?: string;
  /** Character separating the two halves; defaults to ":". */
  separator?: string;
}
|
||||
@@ -0,0 +1,22 @@
|
||||
import { logger } from "@basango/logger";
|
||||
import { runSyncCrawl } from "@/process/sync/tasks";
|
||||
import { CRAWLING_USAGE, parseCrawlingCliArgs } from "@/scripts/utils";
|
||||
|
||||
const main = async (): Promise<void> => {
|
||||
const options = parseCrawlingCliArgs();
|
||||
|
||||
if (options.sourceId === undefined) {
|
||||
console.log(CRAWLING_USAGE);
|
||||
process.exitCode = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
await runSyncCrawl({ ...options });
|
||||
} catch (error) {
|
||||
logger.error({ error }, "Synchronous crawl failed");
|
||||
process.exitCode = 1;
|
||||
}
|
||||
};
|
||||
|
||||
void main();
|
||||
@@ -0,0 +1,24 @@
|
||||
import { logger } from "@basango/logger";
|
||||
import { scheduleAsyncCrawl } from "@/process/async/tasks";
|
||||
import { CRAWLING_USAGE, parseCrawlingCliArgs } from "@/scripts/utils";
|
||||
|
||||
const main = async (): Promise<void> => {
|
||||
const options = parseCrawlingCliArgs();
|
||||
|
||||
if (options.sourceId === undefined) {
|
||||
console.log(CRAWLING_USAGE);
|
||||
process.exitCode = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const id = await scheduleAsyncCrawl({ ...options });
|
||||
|
||||
logger.info({ id, options }, "Scheduled asynchronous crawl job");
|
||||
} catch (error) {
|
||||
logger.error({ error }, "Failed to schedule crawl job");
|
||||
process.exitCode = 1;
|
||||
}
|
||||
};
|
||||
|
||||
void main();
|
||||
@@ -0,0 +1,39 @@
|
||||
import { parseArgs } from "node:util";
|
||||
import { CrawlingOptions } from "@/process/crawler";
|
||||
|
||||
/** Flags recognised by the worker entrypoint CLI. */
interface WorkerCliOptions {
  // Queue short-names to listen on; when omitted the worker falls back to
  // its own default set — confirm in startWorker.
  queue?: string[];
}
|
||||
|
||||
// CLI usage text shared by the sync and async crawl entrypoints.
export const CRAWLING_USAGE = `
Usage: bun run crawl:[async|sync] -- --sourceId <id> [options]

Options:
  --pageRange <range> Optional page range filter (e.g. 1:5)
  --dateRange <range> Optional date range filter (e.g. 2024-01-01:2024-01-31)
  --category <slug> Optional category to crawl
  -h, --help Show this message
`;
|
||||
|
||||
export const parseWorkerCliArgs = (): WorkerCliOptions => {
|
||||
const { values } = parseArgs({
|
||||
options: {
|
||||
queue: { multiple: true, short: "q", type: "string" },
|
||||
},
|
||||
});
|
||||
|
||||
return values as WorkerCliOptions;
|
||||
};
|
||||
|
||||
export const parseCrawlingCliArgs = (): CrawlingOptions => {
|
||||
const { values } = parseArgs({
|
||||
options: {
|
||||
category: { type: "string" },
|
||||
dateRange: { type: "string" },
|
||||
pageRange: { type: "string" },
|
||||
sourceId: { type: "string" },
|
||||
},
|
||||
});
|
||||
|
||||
return values as CrawlingOptions;
|
||||
};
|
||||
@@ -0,0 +1,35 @@
|
||||
import { logger } from "@basango/logger";
|
||||
|
||||
import { createQueueManager } from "@/process/async/queue";
|
||||
import { startWorker } from "@/process/async/worker";
|
||||
import { parseWorkerCliArgs } from "@/scripts/utils";
|
||||
|
||||
const main = async (): Promise<void> => {
|
||||
const options = parseWorkerCliArgs();
|
||||
|
||||
const manager = createQueueManager();
|
||||
const queues = options.queue?.length
|
||||
? options.queue.map((name) => manager.queueName(name))
|
||||
: undefined;
|
||||
|
||||
const handle = startWorker({
|
||||
queueManager: manager,
|
||||
queueNames: queues,
|
||||
});
|
||||
|
||||
const shutdown = async (signal: NodeJS.Signals) => {
|
||||
logger.info({ signal }, "Received shutdown signal, draining workers");
|
||||
try {
|
||||
await handle.close();
|
||||
} finally {
|
||||
await manager.close();
|
||||
process.exit(0);
|
||||
}
|
||||
};
|
||||
|
||||
process.once("SIGINT", (signal) => void shutdown(signal));
|
||||
process.once("SIGTERM", (signal) => void shutdown(signal));
|
||||
logger.info({ queueNames: queues }, "Crawler workers started");
|
||||
};
|
||||
|
||||
void main();
|
||||
@@ -0,0 +1,163 @@
|
||||
import { format, getUnixTime, isMatch, parse } from "date-fns";
|
||||
import type { RedisOptions } from "ioredis";
|
||||
import { get_encoding, TiktokenEncoding } from "tiktoken";
|
||||
import { config } from "@/config";
|
||||
import { DEFAULT_DATE_FORMAT } from "@/constants";
|
||||
import {
|
||||
AnySourceConfig,
|
||||
CreateDateRangeOptions,
|
||||
DateRange,
|
||||
DateRangeSchema,
|
||||
DateRangeSpecSchema,
|
||||
HtmlSourceConfig,
|
||||
PageRange,
|
||||
PageRangeSchema,
|
||||
PageRangeSpecSchema,
|
||||
WordPressSourceConfig,
|
||||
} from "@/schema";
|
||||
|
||||
/**
|
||||
* Resolve a source configuration by its ID.
|
||||
* @param id - The source ID
|
||||
*/
|
||||
export const resolveSourceConfig = (id: string): AnySourceConfig => {
|
||||
const source =
|
||||
config.sources.html.find((s: HtmlSourceConfig) => s.sourceId === id) ||
|
||||
config.sources.wordpress.find((s: WordPressSourceConfig) => s.sourceId === id);
|
||||
|
||||
if (source === undefined) {
|
||||
throw new Error(`Source '${id}' not found in configuration`);
|
||||
}
|
||||
|
||||
return source;
|
||||
};
|
||||
|
||||
/**
|
||||
* Parse a Redis URL into RedisOptions.
|
||||
* @param url - The Redis URL (e.g., "redis://:password@localhost:6379/0")
|
||||
*/
|
||||
export const parseRedisUrl = (url: string): RedisOptions => {
|
||||
if (!url.startsWith("redis://")) {
|
||||
return {};
|
||||
}
|
||||
const parsed = new URL(url);
|
||||
return {
|
||||
db: Number(parsed.pathname?.replace("/", "") || 0),
|
||||
host: parsed.hostname,
|
||||
password: parsed.password || undefined,
|
||||
port: Number(parsed.port || 6379),
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* Parse a date string using the specified format.
|
||||
* @param value - The date string to parse
|
||||
* @param format - The date format
|
||||
*/
|
||||
const parseDate = (value: string, format: string): Date => {
|
||||
if (!isMatch(value, format)) {
|
||||
throw new Error(`Invalid date '${value}' for format '${format}'`);
|
||||
}
|
||||
const parsed = parse(value, format, new Date());
|
||||
if (Number.isNaN(parsed.getTime())) {
|
||||
throw new Error(`Invalid date '${value}' for format '${format}'`);
|
||||
}
|
||||
return parsed;
|
||||
};
|
||||
|
||||
/**
|
||||
* Count the number of tokens in the given text using the specified encoding.
|
||||
* @param text - The input text
|
||||
* @param encoding - The token encoding (default: "cl100k_base")
|
||||
*/
|
||||
export const countTokens = (text: string, encoding: TiktokenEncoding = "cl100k_base"): number => {
|
||||
try {
|
||||
const encoder = get_encoding(encoding);
|
||||
const tokens = encoder.encode(text);
|
||||
encoder.free();
|
||||
return tokens.length;
|
||||
} catch {
|
||||
return text.length;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Create a page range from a string specification.
|
||||
* @param spec - The page range specification (e.g., "1:10")
|
||||
*/
|
||||
export const createPageRange = (spec: string | undefined): PageRange | undefined => {
|
||||
if (!spec) return undefined;
|
||||
const parsed = PageRangeSpecSchema.parse(spec);
|
||||
return PageRangeSchema.parse(parsed);
|
||||
};
|
||||
|
||||
/**
|
||||
* Create a date range from a string specification.
|
||||
* @param spec - The date range specification (e.g., "2023-01-01:2023-12-31")
|
||||
* @param options - Options for date range creation
|
||||
*/
|
||||
export const createDateRange = (
|
||||
spec: string | undefined,
|
||||
options: CreateDateRangeOptions = {},
|
||||
): DateRange | undefined => {
|
||||
if (!spec) return undefined;
|
||||
const { format = DEFAULT_DATE_FORMAT, separator = ":" } = options;
|
||||
if (!separator) {
|
||||
throw new Error("Separator cannot be empty");
|
||||
}
|
||||
|
||||
const normalized = spec.replace(separator, ":");
|
||||
const parsedSpec = DateRangeSpecSchema.parse(normalized);
|
||||
|
||||
const startDate = parseDate(parsedSpec.startRaw, format);
|
||||
const endDate = parseDate(parsedSpec.endRaw, format);
|
||||
|
||||
const range = {
|
||||
end: getUnixTime(endDate),
|
||||
start: getUnixTime(startDate),
|
||||
};
|
||||
|
||||
return DateRangeSchema.parse(range);
|
||||
};
|
||||
|
||||
/**
|
||||
* Format a date range into a string representation.
|
||||
* @param range - The date range
|
||||
* @param fmt - The date format (default: DEFAULT_DATE_FORMAT)
|
||||
*/
|
||||
export const formatDateRange = (range: DateRange, fmt = DEFAULT_DATE_FORMAT): string => {
|
||||
const start = format(new Date(range.start * 1000), fmt);
|
||||
const end = format(new Date(range.end * 1000), fmt);
|
||||
return `${start}:${end}`;
|
||||
};
|
||||
|
||||
/**
|
||||
* Format a page range into a string representation.
|
||||
* @param range - The page range
|
||||
*/
|
||||
export const formatPageRange = (range: PageRange): string => {
|
||||
return `${range.start}:${range.end}`;
|
||||
};
|
||||
|
||||
/**
|
||||
* Check if a timestamp is within a given date range.
|
||||
* @param range - The date range
|
||||
* @param timestamp - The timestamp to check
|
||||
*/
|
||||
export const isTimestampInRange = (range: DateRange, timestamp: number): boolean => {
|
||||
return range.start <= timestamp && timestamp <= range.end;
|
||||
};
|
||||
|
||||
/**
|
||||
* Convert a relative URL to an absolute URL based on the base URL.
|
||||
* @param base - The base URL
|
||||
* @param href - The relative or absolute URL
|
||||
*/
|
||||
export const createAbsoluteUrl = (base: string, href: string): string => {
|
||||
try {
|
||||
// new URL handles relative paths with base
|
||||
return new URL(href, base.endsWith("/") ? base : `${base}/`).toString();
|
||||
} catch {
|
||||
return href;
|
||||
}
|
||||
};
|
||||
Reference in New Issue
Block a user