feat(monorepo): migrate to typescript monorepo

commit 075a388ccb (parent 3e09956f05)
Committed by: BernardNganduDev
Date: 2025-11-07 17:09:29 +02:00
745 changed files with 2341 additions and 5082 deletions
+137 @/process/async/handlers
@@ -0,0 +1,137 @@
import { logger } from "@basango/logger";
import { config, env } from "@/config";
import { SyncHttpClient } from "@/http/http-client";
import { createQueueManager, QueueManager } from "@/process/async/queue";
import {
DetailsTaskPayload,
ListingTaskPayload,
ProcessingTaskPayload,
} from "@/process/async/schemas";
import { resolveCrawlerConfig } from "@/process/crawler";
import { HtmlCrawler } from "@/process/parsers/html";
import { WordPressCrawler } from "@/process/parsers/wordpress";
import { JsonlPersistor } from "@/process/persistence";
import { Article, HtmlSourceConfig, SourceKindSchema, WordPressSourceConfig } from "@/schema";
import { createDateRange, formatDateRange, formatPageRange, resolveSourceConfig } from "@/utils";
export const collectHtmlListing = async (
payload: ListingTaskPayload,
manager: QueueManager = createQueueManager(),
): Promise<number> => {
const source = resolveSourceConfig(payload.sourceId);
if (source.sourceKind !== "html") {
return await collectWordPressListing(payload, manager);
}
const settings = resolveCrawlerConfig(source, payload);
const crawler = new HtmlCrawler(settings);
const pageRange = settings.pageRange ?? (await crawler.getPagination());
let queued = 0;
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
const target = crawler.buildPageUrl(page);
try {
const items = await crawler.fetchLinks(target, source.sourceSelectors.articles);
for (const node of items) {
const url = crawler.extractLink(node);
if (!url) continue;
await manager.enqueueArticle({
category: payload.category,
dateRange: createDateRange(payload.dateRange),
sourceId: payload.sourceId,
url,
} as DetailsTaskPayload);
queued += 1;
}
} catch (error) {
logger.error({ error, target }, "Failed to crawl page");
}
}
return queued;
};
export const collectWordPressListing = async (
payload: ListingTaskPayload,
manager: QueueManager = createQueueManager(),
): Promise<number> => {
const source = resolveSourceConfig(payload.sourceId);
if (source.sourceKind !== "wordpress") {
return await collectHtmlListing(payload, manager);
}
const settings = resolveCrawlerConfig(source, payload);
const crawler = new WordPressCrawler(settings);
const pageRange = settings.pageRange ?? (await crawler.getPagination());
let queued = 0;
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
const endpoint = crawler.postsEndpoint(page);
try {
const entries = await crawler.fetchLinks(endpoint);
for (const data of entries) {
const url = data.link;
if (!url) continue;
await manager.enqueueArticle({
category: payload.category,
data,
dateRange: createDateRange(payload.dateRange),
sourceId: payload.sourceId,
url,
} as DetailsTaskPayload);
queued += 1;
}
} catch (error) {
logger.error({ error, page }, "Failed to fetch WordPress page");
}
}
return queued;
};
export const collectArticle = async (payload: DetailsTaskPayload): Promise<unknown> => {
const source = resolveSourceConfig(payload.sourceId);
const settings = resolveCrawlerConfig(source, {
category: payload.category,
dateRange: payload.dateRange ? formatDateRange(payload.dateRange) : undefined,
pageRange: payload.pageRange ? formatPageRange(payload.pageRange) : undefined,
sourceId: payload.sourceId,
});
const persistors = [
new JsonlPersistor({
directory: config.paths.data,
sourceId: String(source.sourceId),
}),
];
if (source.sourceKind === SourceKindSchema.enum.html) {
if (!payload.url) throw new Error("Missing article url");
const crawler = new HtmlCrawler(settings, { persistors });
const html = await crawler.crawl(payload.url);
return await crawler.fetchOne(html, settings.dateRange);
}
if (source.sourceKind === SourceKindSchema.enum.wordpress) {
const crawler = new WordPressCrawler(settings, { persistors });
return await crawler.fetchOne(payload.data ?? {}, settings.dateRange);
}
throw new Error(`Unsupported source kind: ${source.sourceKind}`);
};
export const forwardForProcessing = async (payload: ProcessingTaskPayload): Promise<Article> => {
logger.info({ article: payload.article.title }, "Ready for downstream processing");
const client = new SyncHttpClient(config.fetch.client);
const endpoint = env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT");
await client.post(endpoint, { json: payload.article });
logger.info({ article: payload.article.title }, "Forwarded article to API");
return payload.article;
};
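A quick usage sketch for these handlers, assuming a running Redis instance for the default queue manager; the source id below is hypothetical and must exist in the sources configuration:

import { collectHtmlListing } from "@/process/async/handlers";
import { createQueueManager } from "@/process/async/queue";

// Enqueue one detail job per article link found on the listing pages.
const manager = createQueueManager();
try {
  const queued = await collectHtmlListing({ sourceId: "example-news" }, manager);
  console.log(`queued ${queued} article jobs`);
} finally {
  await manager.close();
}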
+107 @/process/async/queue
@@ -0,0 +1,107 @@
import { randomUUID } from "node:crypto";
import { JobsOptions, Queue, QueueOptions } from "bullmq";
import IORedis from "ioredis";
import { config, FetchAsyncConfig } from "@/config";
import {
DetailsTaskPayload,
DetailsTaskPayloadSchema,
ListingTaskPayload,
ListingTaskPayloadSchema,
ProcessingTaskPayload,
ProcessingTaskPayloadSchema,
} from "@/process/async/schemas";
import { parseRedisUrl } from "@/utils";
export interface QueueBackend<T = unknown> {
add: (name: string, data: T, opts?: JobsOptions) => Promise<{ id: string }>;
}
export type QueueFactory = (
queueName: string,
settings: FetchAsyncConfig,
connection?: IORedis,
) => QueueBackend;
const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => {
const redisConnection =
connection ??
new IORedis(settings.redisUrl, {
...parseRedisUrl(settings.redisUrl),
maxRetriesPerRequest: null,
});
const options: QueueOptions = {
connection: redisConnection,
prefix: settings.prefix,
};
const queue = new Queue(queueName, options);
return {
add: async (name, data, opts) => {
const job = await queue.add(name, data, {
removeOnComplete: settings.ttl.result === 0 ? true : undefined,
removeOnFail: settings.ttl.failure === 0 ? true : undefined,
...opts,
});
return { id: job.id ?? randomUUID() };
},
};
};
export interface CreateQueueManagerOptions {
queueFactory?: QueueFactory;
connection?: IORedis;
}
export interface QueueManager {
readonly settings: FetchAsyncConfig;
readonly connection: IORedis;
enqueueListing: (payload: ListingTaskPayload) => Promise<{ id: string }>;
enqueueArticle: (payload: DetailsTaskPayload) => Promise<{ id: string }>;
enqueueProcessed: (payload: ProcessingTaskPayload) => Promise<{ id: string }>;
iterQueueNames: () => string[];
queueName: (suffix: string) => string;
close: () => Promise<void>;
}
export const createQueueManager = (options: CreateQueueManagerOptions = {}): QueueManager => {
const settings = config.fetch.async;
const connection =
options.connection ??
new IORedis(settings.redisUrl, {
...parseRedisUrl(settings.redisUrl),
maxRetriesPerRequest: null,
});
const factory = options.queueFactory ?? defaultQueueFactory;
const ensureQueue = (queueName: string) => factory(queueName, settings, connection);
return {
close: async () => {
await connection.quit();
},
connection,
enqueueArticle: (payload) => {
const data = DetailsTaskPayloadSchema.parse(payload);
const queue = ensureQueue(settings.queues.details);
return queue.add("collect_article", data);
},
enqueueListing: (payload) => {
const data = ListingTaskPayloadSchema.parse(payload);
const queue = ensureQueue(settings.queues.listing);
return queue.add("collect_listing", data);
},
enqueueProcessed: (payload) => {
const data = ProcessingTaskPayloadSchema.parse(payload);
const queue = ensureQueue(settings.queues.processing);
return queue.add("forward_for_processing", data);
},
iterQueueNames: () => [
`${settings.prefix}:${settings.queues.listing}`,
`${settings.prefix}:${settings.queues.details}`,
`${settings.prefix}:${settings.queues.processing}`,
],
queueName: (suffix: string) => `${settings.prefix}:${suffix}`,
settings,
};
};
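Because the factory is injectable, the manager can be exercised without BullMQ in tests. A minimal sketch — the in-memory factory and payload values are illustrative, and a stub would still be needed for `options.connection` to avoid opening a real Redis connection:

import { createQueueManager, QueueFactory } from "@/process/async/queue";

// Record jobs in memory instead of adding them to BullMQ queues.
const jobs: Array<{ queue: string; name: string; data: unknown }> = [];
const fakeFactory: QueueFactory = (queueName) => ({
  add: async (name, data) => {
    jobs.push({ data, name, queue: queueName });
    return { id: String(jobs.length) };
  },
});

const manager = createQueueManager({ queueFactory: fakeFactory });
await manager.enqueueListing({ sourceId: "example-news" });
console.log(jobs[0]?.name); // "collect_listing"
await manager.close();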
+28 @/process/async/schemas
@@ -0,0 +1,28 @@
import { z } from "zod";
import { ArticleSchema, DateRangeSchema, PageRangeSchema } from "@/schema";
export const ListingTaskPayloadSchema = z.object({
category: z.string().optional(),
dateRange: z.string().optional(),
pageRange: z.string().optional(),
sourceId: z.string(),
});
export const DetailsTaskPayloadSchema = z.object({
category: z.string().optional(),
data: z.any().optional(),
dateRange: DateRangeSchema.optional(),
page: z.number().int().nonnegative().optional(),
pageRange: PageRangeSchema.optional(),
sourceId: z.string(),
url: z.url(),
});
export const ProcessingTaskPayloadSchema = z.object({
article: ArticleSchema,
sourceId: z.string(),
});
export type ListingTaskPayload = z.infer<typeof ListingTaskPayloadSchema>;
export type DetailsTaskPayload = z.infer<typeof DetailsTaskPayloadSchema>;
export type ProcessingTaskPayload = z.infer<typeof ProcessingTaskPayloadSchema>;
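These schemas are the validation boundary for everything that crosses a queue (`z.url()` here is the Zod 4 top-level API). A sketch of both outcomes, with hypothetical values:

import { DetailsTaskPayloadSchema } from "@/process/async/schemas";

// Valid payload: parse returns the typed object.
DetailsTaskPayloadSchema.parse({
  sourceId: "example-news",
  url: "https://example.com/articles/some-story",
});

// Invalid payload: nothing is enqueued.
DetailsTaskPayloadSchema.safeParse({ sourceId: "example-news", url: "not-a-url" });
// -> { success: false, error: ZodError }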
+60 @/process/async/tasks
@@ -0,0 +1,60 @@
import { logger } from "@basango/logger";
import * as handlers from "@/process/async/handlers";
import { createQueueManager } from "@/process/async/queue";
import {
DetailsTaskPayloadSchema,
ListingTaskPayloadSchema,
ProcessingTaskPayloadSchema,
} from "@/process/async/schemas";
import { CrawlingOptions } from "@/process/crawler";
export const collectListing = async (payload: unknown): Promise<number> => {
const data = ListingTaskPayloadSchema.parse(payload);
logger.debug({ data }, "Collecting listing");
const count = await handlers.collectHtmlListing(data);
logger.info({ count }, "Listing collection completed");
return count;
};
export const collectArticle = async (payload: unknown): Promise<unknown> => {
const data = DetailsTaskPayloadSchema.parse(payload);
logger.info({ data }, "Collecting article");
const result = await handlers.collectArticle(data);
logger.info({ url: data.url }, "Article collection completed");
return result;
};
export const forwardForProcessing = async (payload: unknown): Promise<unknown> => {
const data = ProcessingTaskPayloadSchema.parse(payload);
logger.debug({ sourceId: data.sourceId }, "Forwarding article for processing");
const result = await handlers.forwardForProcessing(data);
logger.info({ result }, "Article forwarded for processing");
return result;
};
export const scheduleAsyncCrawl = async (options: CrawlingOptions): Promise<string> => {
const payload = ListingTaskPayloadSchema.parse({
category: options.category,
dateRange: options.dateRange,
pageRange: options.pageRange,
sourceId: options.sourceId,
});
const manager = createQueueManager();
logger.info({ payload }, "Scheduling listing collection job");
try {
const job = await manager.enqueueListing(payload);
logger.info({ job }, "Scheduled listing collection job");
return job.id;
} finally {
await manager.close();
}
};
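A sketch of scheduling a crawl from application code; the dateRange/pageRange string formats are whatever `createDateRange`/`createPageRange` accept, so the values below are assumptions:

import { scheduleAsyncCrawl } from "@/process/async/tasks";

// Hypothetical source id and range strings.
const jobId = await scheduleAsyncCrawl({
  category: "politics",
  dateRange: "2025-11-01,2025-11-07",
  pageRange: "1,5",
  sourceId: "example-news",
});
console.log(`listing job ${jobId} scheduled`);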
+74
@@ -0,0 +1,74 @@
import { QueueEvents, Worker } from "bullmq";
import IORedis from "ioredis";
import { QueueFactory, QueueManager } from "@/process/async/queue";
import { collectArticle, collectListing, forwardForProcessing } from "@/process/async/tasks";
export interface WorkerOptions {
queueNames?: string[];
connection?: IORedis;
queueFactory?: QueueFactory;
concurrency?: number;
onError?: (error: Error) => void;
queueManager: QueueManager;
}
export interface WorkerHandle {
readonly workers: Worker[];
readonly events: QueueEvents[];
close: () => Promise<void>;
}
export const startWorker = (options: WorkerOptions): WorkerHandle => {
const manager = options.queueManager;
// Use the raw queue names here; the BullMQ `prefix` option passed below makes
// the workers read the same Redis keys that the prefixed queues write.
const queueNames = options.queueNames ?? [
manager.settings.queues.listing,
manager.settings.queues.details,
manager.settings.queues.processing,
];
const workers: Worker[] = [];
const events: QueueEvents[] = [];
const connection = manager.connection;
for (const queueName of queueNames) {
const worker = new Worker(
queueName,
async (job) => {
switch (job.name) {
case "collect_listing":
return collectListing(job.data);
case "collect_article":
return collectArticle(job.data);
case "forward_for_processing":
return forwardForProcessing(job.data);
default:
throw new Error(`Unknown job name: ${job.name}`);
}
},
{
concurrency: options.concurrency ?? 5,
connection,
prefix: manager.settings.prefix,
},
);
if (options.onError) {
worker.on("failed", (_, err) => options.onError?.(err as Error));
worker.on("error", (err) => options.onError?.(err as Error));
}
const queueEvents = new QueueEvents(queueName, { connection, prefix: manager.settings.prefix });
workers.push(worker);
events.push(queueEvents);
}
return {
close: async () => {
await Promise.all(workers.map((worker) => worker.close()));
await Promise.all(events.map((event) => event.close()));
// The caller supplies the queue manager and owns its Redis connection,
// so it is not closed here.
},
events,
workers,
};
};
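A wiring sketch for a worker process, including the graceful shutdown the handle supports (the signal handling is an assumption about the host process):

import { createQueueManager } from "@/process/async/queue";
import { startWorker } from "@/process/async/worker";

const manager = createQueueManager();
const handle = startWorker({
  concurrency: 10,
  onError: (error) => console.error("worker failure", error),
  queueManager: manager,
});

// Drain in-flight jobs, then release the Redis connection the manager owns.
process.on("SIGTERM", async () => {
  await handle.close();
  await manager.close();
  process.exit(0);
});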
+44 @/process/crawler
@@ -0,0 +1,44 @@
import { logger } from "@basango/logger";
import { config, FetchCrawlerConfig } from "@/config";
import { JsonlPersistor, Persistor } from "@/process/persistence";
import { AnySourceConfig } from "@/schema";
import { createDateRange, createPageRange } from "@/utils";
export interface CrawlingOptions {
sourceId: string;
pageRange?: string | undefined;
dateRange?: string | undefined;
category?: string | undefined;
}
export const resolveCrawlerConfig = (
source: AnySourceConfig,
options: CrawlingOptions,
): FetchCrawlerConfig => {
return {
...config.fetch.crawler,
category: options.category,
dateRange: createDateRange(options.dateRange),
pageRange: createPageRange(options.pageRange),
source,
};
};
export const createPersistors = (source: AnySourceConfig): Persistor[] => {
return [
new JsonlPersistor({
directory: config.paths.data,
sourceId: source.sourceId,
}),
];
};
export const closePersistors = async (persistors: Persistor[]): Promise<void> => {
for (const persistor of persistors) {
try {
await persistor.close();
} catch (error) {
logger.warn({ error }, "Failed to close persistor");
}
}
};
+107 @/process/parsers/base
@@ -0,0 +1,107 @@
import { HTMLElement, parse as parseHtml } from "node-html-parser";
import { config, FetchCrawlerConfig } from "@/config";
import { SyncHttpClient } from "@/http/http-client";
import { OpenGraph } from "@/http/open-graph";
import type { Persistor } from "@/process/persistence";
import { AnySourceConfig, Article } from "@/schema";
export interface CrawlerOptions {
persistors?: Persistor[];
}
export abstract class BaseCrawler {
protected readonly settings: FetchCrawlerConfig;
protected readonly source: AnySourceConfig;
protected readonly http: SyncHttpClient;
protected readonly persistors: Persistor[];
protected readonly openGraph: OpenGraph;
protected constructor(settings: FetchCrawlerConfig, options: CrawlerOptions = {}) {
if (!settings.source) {
throw new Error("Crawler requires a bound source");
}
this.http = new SyncHttpClient(config.fetch.client);
this.persistors = options.persistors ?? [];
this.openGraph = new OpenGraph();
this.settings = settings;
this.source = settings.source as AnySourceConfig;
}
/**
* Fetch and process articles from the source.
*/
abstract fetch(): Promise<void> | void;
/**
* Crawl the given URL and return the HTML content as a string.
* @param url - The URL to crawl
*/
async crawl(url: string): Promise<string> {
const response = await this.http.get(url);
return await response.text();
}
/**
* Extract text content from an HTML node.
* @param node - The HTML node
*/
protected textContent(node: HTMLElement | null | undefined): string | null {
if (!node) return null;
// innerText keeps spacing similar to browser rendering
const value = node.innerText ?? node.text;
const text = value.trim();
return text.length ? text : null;
}
/**
* Extract the first matching element from the root using the selector.
* @param root - The root HTML element
* @param selector - The CSS selector
*/
protected extractFirst(root: HTMLElement, selector?: string | null): HTMLElement | null {
if (!selector) return null;
try {
return root.querySelector(selector) ?? null;
} catch {
return null;
}
}
/**
* Extract all matching elements from the root using the selector.
* @param root - The root HTML element
* @param selector - The CSS selector
*/
protected extractAll(root: HTMLElement, selector?: string | null): HTMLElement[] {
if (!selector) return [];
try {
return root.querySelectorAll(selector);
} catch {
return [];
}
}
/**
* Parse HTML string into an HTMLElement.
* @param html - The HTML string
*/
protected parseHtml(html: string): HTMLElement {
return parseHtml(html) as unknown as HTMLElement;
}
/**
* Enrich the record with Open Graph metadata from the given URL.
* @param record - The article record
* @param url - The URL to fetch Open Graph data from
*/
protected async enrichWithOpenGraph(record: Article, url?: string): Promise<Article> {
try {
const metadata = url ? await this.openGraph.consumeUrl(url) : undefined;
return { ...record, metadata };
} catch {
return { ...record, metadata: undefined };
}
}
}
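The abstract class leaves only `fetch` to subclasses; a minimal subclass sketch (the URL and selector are hypothetical, and the settings must carry a bound source):

import { FetchCrawlerConfig } from "@/config";
import { BaseCrawler } from "@/process/parsers/base";

class HeadlineCrawler extends BaseCrawler {
  constructor(settings: FetchCrawlerConfig) {
    super(settings);
  }

  async fetch(): Promise<void> {
    // Crawl one page and print the first headline it finds.
    const html = await this.crawl("https://example.com/news");
    const root = this.parseHtml(html);
    console.log(this.textContent(this.extractFirst(root, "h2.headline")));
  }
}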
+335 @/process/parsers/html
@@ -0,0 +1,335 @@
import { logger } from "@basango/logger";
import { getUnixTime, isMatch as isDateMatch, parse as parseDateFns } from "date-fns";
import { HTMLElement } from "node-html-parser";
import TurndownService from "turndown";
import { FetchCrawlerConfig } from "@/config";
import { BaseCrawler } from "@/process/parsers/base";
import { Persistor, persist } from "@/process/persistence";
import { DateRange, HtmlSourceConfig } from "@/schema";
import { createAbsoluteUrl, isTimestampInRange } from "@/utils";
const md = new TurndownService({
bulletListMarker: "-",
headingStyle: "atx",
hr: "---",
});
/**
* Create a safe RegExp from the given pattern.
* @param pattern
*/
const safeRegExp = (pattern?: string | null): RegExp | null => {
if (!pattern) return null;
try {
return new RegExp(pattern, "g");
} catch {
return null;
}
};
/**
* Crawler for generic HTML pages.
*/
export class HtmlCrawler extends BaseCrawler {
readonly source: HtmlSourceConfig;
private currentArticleUrl: string | null = null;
constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
super(settings, options);
if (!settings.source || settings.source.sourceKind !== "html") {
throw new Error("HtmlCrawler requires a source of kind 'html'");
}
this.source = this.settings.source as HtmlSourceConfig;
}
async fetch(): Promise<void> {
const pageRange = this.settings.pageRange ?? (await this.getPagination());
const dateRange = this.settings.dateRange;
const articleSelector = this.source.sourceSelectors.articles;
if (!articleSelector) {
logger.error(
{ source: this.source.sourceId },
"No article selector configured for HTML source",
);
return;
}
let stop = false;
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
const pageUrl = this.buildPageUrl(page);
let html: string;
try {
html = await this.crawl(pageUrl);
} catch (error) {
logger.error({ error, page, pageUrl }, "> page %s => [failed]", page);
continue;
}
const root = this.parseHtml(html);
const articles = this.extractAll(root, articleSelector);
if (!articles.length) {
logger.info({ page }, "No articles found on page");
continue;
}
for (const node of articles) {
try {
this.currentArticleUrl = this.extractLink(node);
let targetHtml = node.toString();
if (this.source.requiresDetails) {
if (!this.currentArticleUrl) {
logger.debug({ page }, "Skipping article without link for details");
continue;
}
try {
targetHtml = await this.crawl(this.currentArticleUrl);
} catch (err) {
logger.error(
{ error: err, url: this.currentArticleUrl },
"Failed to fetch detail page",
);
continue;
}
}
const saved = await this.fetchOne(targetHtml, dateRange);
// fetchOne returns null for out-of-range (and link-less) articles; stop
// early, assuming pages are sorted by date descending
if (saved === null) {
stop = true;
break;
}
} catch (error) {
logger.error({ error, pageUrl }, "Failed to process article on page");
} finally {
this.currentArticleUrl = null;
}
}
if (stop) break;
}
}
/**
* Fetch and process a single HTML article.
* @param html - The HTML content of the article
* @param dateRange - Optional date range for filtering
*/
async fetchOne(html: string, dateRange?: DateRange | null) {
const root = this.parseHtml(html);
const sel = this.source.sourceSelectors;
const titleText = this.extractText(root, sel.articleTitle) ?? "Untitled";
const link = this.currentArticleUrl ?? this.extractLink(root);
if (!link) {
logger.warn({ title: titleText }, "Skipping article without link");
return null;
}
const body = this.extractBody(root, sel.articleBody);
const categories = this.extractCategories(root, sel.articleCategories);
const rawDate = this.extractText(root, sel.articleDate);
const timestamp = this.computeTimestamp(rawDate);
if (dateRange && !isTimestampInRange(dateRange, timestamp)) {
logger.info(
{ date: rawDate, link, timestamp, title: titleText },
"Skipping article outside date range",
);
return null;
}
const enriched = await this.enrichWithOpenGraph(
{
body,
categories,
link,
source: this.source.sourceId,
timestamp,
title: titleText,
},
link,
);
return await persist(enriched, this.persistors);
}
/**
* Fetch links from the target URL using the given selector.
* @param target - The target URL to crawl
* @param selector - The CSS selector to extract links
*/
async fetchLinks(target: string, selector: string) {
const html = await this.crawl(target);
const root = this.parseHtml(html);
return this.extractAll(root, selector);
}
/**
* Get the pagination range (start and end page numbers).
*/
async getPagination(): Promise<{ start: number; end: number }> {
return { end: await this.getLastPage(), start: 0 };
}
/**
* Determine the last page number from pagination links.
*/
private async getLastPage(): Promise<number> {
const template = this.applyCategory(this.source.paginationTemplate);
const url = `${this.source.sourceUrl}${template}`;
try {
const html = await this.crawl(url);
const root = this.parseHtml(html);
const links = this.extractAll(root, this.source.sourceSelectors.pagination);
if (!links.length) return 1;
const last = links[links.length - 1]!;
const href = last.getAttribute("href") ?? null;
if (!href) return 1;
// Heuristic: prefer a number in the href, else "page" query param
const numberMatch = href.match(/(\d+)/);
if (numberMatch) {
const page = Number.parseInt(numberMatch[1]!, 10);
return Number.isFinite(page) && page > 0 ? page : 1;
}
const urlObj = new URL(createAbsoluteUrl(this.source.sourceUrl, href));
const pageParam = urlObj.searchParams.get("page");
if (pageParam) {
const page = Number.parseInt(pageParam, 10);
return Number.isFinite(page) && page > 0 ? page : 1;
}
return 1;
} catch {
return 1;
}
}
/**
* Build the URL for a given page number.
* @param page - The page number
*/
buildPageUrl(page: number): string {
let template = this.applyCategory(this.source.paginationTemplate);
if (template.includes("{page}")) {
template = template.replace("{page}", String(page));
} else if (page > 0) {
const sep = template.includes("?") ? "&" : "?";
template = `${template}${sep}page=${page}`;
}
return createAbsoluteUrl(this.source.sourceUrl, template);
}
/**
* Apply category replacement in the template if needed.
* @param template - The URL template
*/
private applyCategory(template: string): string {
if (template.includes("{category}")) {
const replacement = this.settings.category ?? "";
return template.replace("{category}", replacement);
}
return template;
}
/**
* Extract link URL from the given node using the selector.
* @param node - The HTML element
*/
extractLink(node: HTMLElement): string | null {
const selector = this.source.sourceSelectors.articleLink;
if (!selector) return null;
const target = this.extractFirst(node, selector);
if (!target) return null;
const href =
target.getAttribute("href") ?? target.getAttribute("data-href") ?? target.getAttribute("src");
if (!href) return null;
return createAbsoluteUrl(this.source.sourceUrl, href);
}
/**
* Extract text content from the root using the selector.
* @param root - The root HTML element
* @param selector - The CSS selector
*/
private extractText(root: HTMLElement, selector?: string | null): string | null {
if (!selector) return null;
const target = this.extractFirst(root, selector);
if (!target) return null;
// If it's an image, prefer alt/title
const tag = target.tagName.toLowerCase();
if (tag === "img") {
const alt = target.getAttribute("alt");
const title = target.getAttribute("title");
const pick = (alt ?? title ?? "").trim();
if (pick.length > 0) return pick;
}
return this.textContent(target);
}
/**
* Extract body content from the root using the selector.
* @param root - The root HTML element
* @param selector - The CSS selector
*/
private extractBody(root: HTMLElement, selector?: string | null): string {
if (selector) {
const nodes = this.extractAll(root, selector);
if (nodes.length) {
const parts = nodes.map((n) => md.turndown(n.toString())).filter(Boolean);
if (parts.length) return parts.join("\n");
}
}
return md.turndown(root.toString());
}
/**
* Extract categories from the root using the selector.
* @param root - The root HTML element
* @param selector - The CSS selector
*/
private extractCategories(root: HTMLElement, selector?: string | null): string[] {
if (!selector) return [];
const values: string[] = [];
for (const node of this.extractAll(root, selector)) {
const text = this.textContent(node);
if (!text) continue;
const lower = text.toLowerCase();
if (!values.includes(lower)) values.push(lower);
}
return values;
}
/**
* Compute Unix timestamp from raw date string.
* @param raw - Raw date string
* @private
*/
private computeTimestamp(raw?: string | null): number {
if (!raw) return Math.floor(Date.now() / 1000);
let value = raw.trim();
const pattern = safeRegExp(this.source.sourceDate?.pattern);
const replacement = this.source.sourceDate?.replacement ?? "";
if (pattern) {
try {
value = value.replace(pattern, replacement);
} catch {
// ignore pattern failures
}
}
const format = this.source.sourceDate?.format ?? "yyyy-LL-dd HH:mm";
if (!isDateMatch(value, format)) {
// fallback: try native Date.parse as last resort
const parsed = Date.parse(value);
return Number.isNaN(parsed) ? Math.floor(Date.now() / 1000) : Math.floor(parsed / 1000);
}
const date = parseDateFns(value, format, new Date());
const ts = getUnixTime(date);
return Number.isFinite(ts) ? ts : Math.floor(Date.now() / 1000);
}
}
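A sketch of driving the HTML crawler directly; `buildPageUrl` substitutes `{page}` (and `{category}`) in the pagination template and appends a `page` query parameter when no placeholder exists. Source id and category are hypothetical:

import { resolveCrawlerConfig } from "@/process/crawler";
import { HtmlCrawler } from "@/process/parsers/html";
import { resolveSourceConfig } from "@/utils";

// The source must be configured with sourceKind "html".
const source = resolveSourceConfig("example-news");
const crawler = new HtmlCrawler(
  resolveCrawlerConfig(source, { category: "politics", sourceId: "example-news" }),
);
const { start, end } = await crawler.getPagination();
console.log(crawler.buildPageUrl(start), "->", crawler.buildPageUrl(end));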
+239 @/process/parsers/wordpress
@@ -0,0 +1,239 @@
import { logger } from "@basango/logger";
import TurndownService from "turndown";
import { FetchCrawlerConfig } from "@/config";
import { BaseCrawler } from "@/process/parsers/base";
import { Persistor, persist } from "@/process/persistence";
import { DateRange, PageRange, WordPressSourceConfig } from "@/schema";
import { isTimestampInRange } from "@/utils";
const md = new TurndownService({
bulletListMarker: "-",
headingStyle: "atx",
hr: "---",
});
interface WordPressPost {
link?: string;
slug?: string;
title?: { rendered?: string };
content?: { rendered?: string };
date?: string;
categories?: number[];
}
/**
* Crawler for WordPress sites using the REST API.
*/
export class WordPressCrawler extends BaseCrawler {
readonly source: WordPressSourceConfig;
private categoryMap: Map<number, string> = new Map();
private static readonly POST_QUERY =
"_fields=date,slug,link,title.rendered,content.rendered,categories&orderby=date&order=desc";
private static readonly CATEGORY_QUERY =
"_fields=id,slug,count&orderby=count&order=desc&per_page=100";
private static readonly TOTAL_PAGES_HEADER = "x-wp-totalpages";
private static readonly TOTAL_POSTS_HEADER = "x-wp-total";
constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
super(settings, options);
if (!settings.source || settings.source.sourceKind !== "wordpress") {
throw new Error("HtmlCrawler requires a source of kind 'wordpress'");
}
this.source = this.settings.source as WordPressSourceConfig;
}
/**
* Fetch and process WordPress posts.
*/
async fetch(): Promise<void> {
const pageRange = this.settings.pageRange ?? (await this.getPagination());
const dateRange = this.settings.dateRange;
let stop = false;
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
const endpoint = this.postsEndpoint(page);
try {
const response = await this.http.get(endpoint);
const data = (await response.json()) as unknown;
const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
if (!Array.isArray(data)) {
logger.warn({ page, type: typeof data }, "Unexpected WordPress payload type");
}
for (const entry of articles) {
const saved = await this.fetchOne(entry, dateRange);
if (saved === null) {
stop = true;
break;
}
}
} catch (error) {
logger.error({ error, page }, "> page %s => [failed]", page);
continue;
}
if (stop) break;
}
}
/**
* Fetch links from a WordPress posts endpoint.
* @param url - The posts endpoint URL
*/
async fetchLinks(url: string) {
const response = await this.http.get(url);
const data = (await response.json()) as unknown;
const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
if (!Array.isArray(data)) {
logger.warn({ type: typeof data }, "Unexpected WordPress payload type");
}
return articles;
}
/**
* Fetch and process a single WordPress post.
* @param input - Decoded JSON object or raw JSON string
* @param dateRange - Optional date range for filtering
*/
async fetchOne(input: unknown, dateRange?: DateRange | null) {
// input can be the decoded JSON object or a raw JSON string
let data: WordPressPost | null = null;
try {
if (typeof input === "string") {
data = JSON.parse(input) as WordPressPost;
} else if (input && typeof input === "object") {
data = input as WordPressPost;
}
} catch (error) {
logger.error({ error }, "Failed to decode WordPress payload");
throw error;
}
if (!data || typeof data !== "object") {
throw new Error("Unexpected WordPress payload type");
}
const link = data.link;
if (!link) {
logger.error("Skipping WordPress article without link");
return null;
}
const titleHtml = data.title?.rendered ?? "";
const bodyHtml = data.content?.rendered ?? "";
const title = this.textContent(this.parseHtml(titleHtml)) ?? data.slug ?? "Untitled";
const body = md.turndown(bodyHtml);
const timestamp = this.computeTimestamp(data.date);
const categories = await this.mapCategories(data.categories ?? []);
// same date-range skip as the HTML crawler
if (dateRange && !isTimestampInRange(dateRange, timestamp)) {
logger.info(
{ date: data.date, link, timestamp, title },
"Skipping article outside date range",
);
return null;
}
const enriched = await this.enrichWithOpenGraph(
{
body,
categories,
link,
source: this.source.sourceId,
timestamp,
title,
},
link,
);
return await persist(enriched, this.persistors);
}
/**
* Get pagination info from WordPress API.
*/
async getPagination(): Promise<PageRange> {
try {
const url = `${this.baseUrl()}wp-json/wp/v2/posts?_fields=id&per_page=100`;
const response = await this.http.get(url);
const pages = Number.parseInt(
response.headers.get(WordPressCrawler.TOTAL_PAGES_HEADER) ?? "1",
10,
);
const posts = Number.parseInt(
response.headers.get(WordPressCrawler.TOTAL_POSTS_HEADER) ?? "0",
10,
);
logger.info({ pages, posts }, "WordPress pagination");
const end = Number.isFinite(pages) && pages > 0 ? pages : 1;
return { end, start: 1 };
} catch {
return { end: 1, start: 1 };
}
}
/**
* Get base URL for WordPress REST API.
*/
private baseUrl(): string {
const base = String(this.source.sourceUrl);
return base.endsWith("/") ? base : `${base}/`;
}
/**
* Construct posts endpoint URL for a given page.
* @param page - Page number
*/
postsEndpoint(page: number): string {
return `${this.baseUrl()}wp-json/wp/v2/posts?${WordPressCrawler.POST_QUERY}&page=${page}&per_page=100`;
}
/**
* Fetch and cache WordPress categories.
*/
private async fetchCategories(): Promise<void> {
const url = `${this.baseUrl()}wp-json/wp/v2/categories?${WordPressCrawler.CATEGORY_QUERY}`;
const response = await this.http.get(url);
const list = (await response.json()) as Array<{ id: number; slug: string }>;
for (const c of list) {
this.categoryMap.set(c.id, c.slug);
}
}
/**
* Map category IDs to slugs.
* @param ids - Category IDs
*/
private async mapCategories(ids: number[]): Promise<string[]> {
if (this.categoryMap.size === 0) {
try {
await this.fetchCategories();
} catch (error) {
logger.warn({ error }, "Failed to fetch WordPress categories");
}
}
const values: string[] = [];
for (const id of [...ids].sort((a, b) => a - b)) {
const slug = this.categoryMap.get(id);
if (slug && !values.includes(slug)) values.push(slug);
}
return values;
}
/**
* Compute UNIX timestamp from WordPress date string.
* @param raw - Raw date string
*/
private computeTimestamp(raw?: string | null): number {
if (!raw) return Math.floor(Date.now() / 1000);
// Normalize WordPress Z into +00:00 for Date parsing robustness
const cleaned = raw.replace("Z", "+00:00");
const parsed = Date.parse(cleaned);
if (!Number.isNaN(parsed)) return Math.floor(parsed / 1000);
return Math.floor(Date.now() / 1000);
}
}
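The WordPress crawler never parses listing HTML; it pages through the REST API and reads totals from response headers. A sketch with a hypothetical source id:

import { resolveCrawlerConfig } from "@/process/crawler";
import { WordPressCrawler } from "@/process/parsers/wordpress";
import { resolveSourceConfig } from "@/utils";

// The source must be configured with sourceKind "wordpress".
const source = resolveSourceConfig("example-wp");
const crawler = new WordPressCrawler(
  resolveCrawlerConfig(source, { sourceId: "example-wp" }),
);
const range = await crawler.getPagination(); // end comes from x-wp-totalpages
const posts = await crawler.fetchLinks(crawler.postsEndpoint(range.start));
console.log(`page ${range.start}: ${posts.length} posts`);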
+102 @/process/persistence
@@ -0,0 +1,102 @@
import fs from "node:fs";
import path from "node:path";
import { logger } from "@basango/logger";
import { Article } from "@/schema";
import { countTokens } from "@/utils";
export interface Persistor {
persist(record: Article): Promise<void> | void;
close: () => Promise<void> | void;
}
export interface PersistorOptions {
directory: string;
sourceId: string;
suffix?: string;
encoding?: BufferEncoding;
}
const sanitize = (text: string): string => {
if (!text) return text;
let s = text.replace(/\u00A0/g, " "); // replace NO-BREAK SPACE with a regular space
s = s.replace(/\u202F/g, " "); // replace NARROW NO-BREAK SPACE
s = s.replace(/\u200B/g, ""); // remove ZERO WIDTH SPACE
s = s.replace(/\u200C/g, ""); // remove ZERO WIDTH NON-JOINER
s = s.replace(/\u200D/g, ""); // remove ZERO WIDTH JOINER
s = s.replace(/\uFEFF/g, ""); // remove ZERO WIDTH NO-BREAK SPACE
s = s.replace(/\r\n/g, "\n"); // normalize CRLF to LF
s = s.replace(/\n{2,}/g, "\n"); // collapse multiple newlines to one
// s = s.replace(/[ \t]{2,}/g, " "); // collapse multiple spaces/tabs
return s.trim();
};
export const persist = async (payload: Article, persistors: Persistor[]): Promise<Article> => {
const data = {
...payload,
body: sanitize(payload.body),
categories: payload.categories.map(sanitize),
title: sanitize(payload.title),
};
const article = {
...data,
tokenStatistics: {
body: countTokens(payload.body),
categories: countTokens(payload.categories.join(",")),
excerpt: countTokens(payload.body.substring(0, 200)),
title: countTokens(payload.title),
},
} as Article;
for (const persistor of persistors) {
try {
await persistor.persist(article);
} catch (error) {
logger.error({ error }, "Failed to persist article record");
}
}
logger.info({ url: article.link }, "article successfully persisted");
return article;
};
export class JsonlPersistor implements Persistor {
private readonly filePath: string;
private readonly encoding: BufferEncoding;
private pending: Promise<void> = Promise.resolve();
private closed = false;
constructor(options: PersistorOptions) {
const suffix = options.suffix ?? ".jsonl";
this.encoding = options.encoding ?? "utf-8";
fs.mkdirSync(options.directory, { recursive: true });
this.filePath = path.join(options.directory, `${options.sourceId}${suffix}`);
if (!fs.existsSync(this.filePath)) {
fs.writeFileSync(this.filePath, "", { encoding: this.encoding });
}
}
persist(record: Article): Promise<void> {
if (this.closed) {
return Promise.reject(new Error("Persistor has been closed"));
}
const payload = `${JSON.stringify(record)}\n`;
const write = this.pending.then(async () => {
fs.appendFileSync(this.filePath, payload, { encoding: this.encoding });
});
// Keep the internal chain alive even if one write fails, so a single error
// does not reject every subsequent persist call.
this.pending = write.catch(() => undefined);
return write;
}
async close(): Promise<void> {
this.closed = true;
await this.pending;
}
}
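A usage sketch for the JSONL persistor; the article literal mirrors the shape the crawlers build, but the full Article schema may require more fields, so treat it as illustrative:

import { JsonlPersistor, persist } from "@/process/persistence";
import { Article } from "@/schema";

// Appends one JSON line per record to ./data/example-news.jsonl (paths hypothetical).
const persistor = new JsonlPersistor({ directory: "./data", sourceId: "example-news" });
const article = {
  body: "Body text…",
  categories: ["politics"],
  link: "https://example.com/articles/some-story",
  source: "example-news",
  timestamp: Math.floor(Date.now() / 1000),
  title: "Some story",
} as Article;
await persist(article, [persistor]);
await persistor.close();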
+29
@@ -0,0 +1,29 @@
import { logger } from "@basango/logger";
import {
CrawlingOptions,
closePersistors,
createPersistors,
resolveCrawlerConfig,
} from "@/process/crawler";
import { HtmlCrawler } from "@/process/parsers/html";
import { WordPressCrawler } from "@/process/parsers/wordpress";
import { resolveSourceConfig } from "@/utils";
export const runSyncCrawl = async (options: CrawlingOptions): Promise<void> => {
const source = resolveSourceConfig(options.sourceId);
const settings = resolveCrawlerConfig(source, options);
const persistors = createPersistors(source);
const crawler =
source.sourceKind === "wordpress"
? new WordPressCrawler(settings, { persistors })
: new HtmlCrawler(settings, { persistors });
try {
await crawler.fetch();
} finally {
await closePersistors(persistors);
}
logger.info({ ...options }, "Synchronous crawl completed");
};
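For completeness, the synchronous path is a one-call affair; the module path and range string below are assumptions:

import { runSyncCrawl } from "@/process/sync";

// Crawl a single configured source end to end, persisting as it goes.
await runSyncCrawl({ pageRange: "1,3", sourceId: "example-news" });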