feat(monorepo): migrate to typescript monorepo
This commit is contained in:
@@ -0,0 +1,81 @@
|
||||
import path from "node:path";
|
||||
|
||||
import { loadConfig as defineConfig } from "@devscast/config";
|
||||
import { z } from "zod";
|
||||
import {
|
||||
DateRangeSchema,
|
||||
HtmlSourceConfigSchema,
|
||||
PageRangeSchema,
|
||||
UpdateDirectionSchema,
|
||||
WordPressSourceConfigSchema,
|
||||
} from "@/schema";
|
||||
|
||||
// Repository root, resolved relative to this file's emitted directory.
// NOTE(review): __dirname assumes CommonJS output — confirm the build target.
export const PROJECT_DIR = path.resolve(__dirname, "../");
|
||||
|
||||
/**
 * Root configuration schema for the crawling pipeline.
 * Every leaf carries a default, so an empty config file still validates.
 */
export const PipelineConfigSchema = z.object({
  fetch: z.object({
    // Async (queue-based) crawling over Redis.
    async: z.object({
      prefix: z.string().default("basango:crawler:queue"),
      queues: z.object({
        details: z.string().default("details"),
        listing: z.string().default("listing"),
        processing: z.string().default("processing"),
      }),
      redisUrl: z.string().default("redis://localhost:6379/0"),
      // TTLs in seconds; elsewhere a value of 0 is treated as "do not keep".
      ttl: z.object({
        default: z.number().int().positive().default(600),
        failure: z.number().int().nonnegative().default(3600),
        result: z.number().int().nonnegative().default(3600),
      }),
    }),
    // HTTP client behavior; timeout and backoff values are in seconds
    // (multiplied by 1000 where they are consumed).
    client: z.object({
      backoffInitial: z.number().nonnegative().default(1),
      backoffMax: z.number().nonnegative().default(30),
      backoffMultiplier: z.number().positive().default(2),
      followRedirects: z.boolean().default(true),
      maxRetries: z.number().int().nonnegative().default(3),
      respectRetryAfter: z.boolean().default(true),
      // When true, requests rotate through a pool of browser user agents.
      rotate: z.boolean().default(true),
      timeout: z.number().positive().default(20),
      userAgent: z.string().default("Basango/0.1 (+https://github.com/bernard-ng/basango)"),
      verifySsl: z.boolean().default(true),
    }),
    // Crawl-run parameters: ranges, concurrency and the active source.
    crawler: z.object({
      category: z.string().optional(),
      dateRange: DateRangeSchema.optional(),
      direction: UpdateDirectionSchema.default("forward"),
      isUpdate: z.boolean().default(false),
      maxWorkers: z.number().int().positive().default(5),
      notify: z.boolean().default(false),
      pageRange: PageRangeSchema.optional(),
      source: z.union([HtmlSourceConfigSchema, WordPressSourceConfigSchema]).optional(),
      useMultiThreading: z.boolean().default(false),
    }),
  }),
  // Filesystem layout, anchored at the repository root.
  paths: z.object({
    config: z.string().default(path.join(PROJECT_DIR, "config")),
    data: z.string().default(path.join(PROJECT_DIR, "data", "datasets")),
    root: z.string().default(PROJECT_DIR),
  }),
  // Declared crawl sources, grouped by kind.
  sources: z.object({
    html: z.array(HtmlSourceConfigSchema).default([]),
    wordpress: z.array(WordPressSourceConfigSchema).default([]),
  }),
});
|
||||
|
||||
/**
 * Loads and validates the pipeline configuration at module load time.
 * Values come from .env plus the two JSON config files, checked against
 * PipelineConfigSchema. `env` reads environment variables at call time.
 * NOTE(review): loadConfig is imported under the alias defineConfig —
 * consider keeping the original name for clarity.
 */
export const { config, env } = defineConfig({
  cwd: process.cwd(),
  env: {
    path: path.join(PROJECT_DIR, ".env"),
  },
  schema: PipelineConfigSchema,
  sources: [
    path.join(PROJECT_DIR, "config", "pipeline.json"),
    path.join(PROJECT_DIR, "config", "sources.json"),
  ],
});
|
||||
|
||||
/** Fully-validated pipeline configuration. */
export type PipelineConfig = z.infer<typeof PipelineConfigSchema>;
/** HTTP client settings (timeouts, retries, backoff, user agent). */
export type FetchClientConfig = PipelineConfig["fetch"]["client"];
/** Crawl-run settings (ranges, active source, concurrency). */
export type FetchCrawlerConfig = PipelineConfig["fetch"]["crawler"];
/** Async queue settings (Redis URL, queue names, TTLs). */
export type FetchAsyncConfig = PipelineConfig["fetch"]["async"];
|
||||
@@ -0,0 +1,6 @@
|
||||
// Date format tokens used across the pipeline (presumably Luxon-style,
// where "LL" is the zero-padded month — confirm against the date library used).
export const DEFAULT_DATE_FORMAT = "yyyy-LL-dd";
// NOTE(review): this string is duplicated as the zod default for
// fetch.client.userAgent in the config schema — keep the two in sync.
export const DEFAULT_USER_AGENT = "Basango/0.1 (+https://github.com/bernard-ng/basango)";
// Facebook's crawler user agent, used when fetching Open Graph metadata.
export const OPEN_GRAPH_USER_AGENT = "facebookexternalhit/1.1";

// HTTP statuses treated as retryable (rate limiting and transient 5xx).
export const TRANSIENT_HTTP_STATUSES = [429, 500, 502, 503, 504];
// Header consulted for a server-advised retry delay (lowercase; fetch
// Headers.get is case-insensitive anyway).
export const DEFAULT_RETRY_AFTER_HEADER = "retry-after";
|
||||
@@ -0,0 +1,241 @@
|
||||
import { setTimeout as delay } from "node:timers/promises";
|
||||
import { FetchClientConfig } from "@/config";
|
||||
import {
|
||||
DEFAULT_RETRY_AFTER_HEADER,
|
||||
DEFAULT_USER_AGENT,
|
||||
TRANSIENT_HTTP_STATUSES,
|
||||
} from "@/constants";
|
||||
import { UserAgents } from "@/http/user-agent";
|
||||
|
||||
/** Plain header map used for both client defaults and per-request headers. */
export type HttpHeaders = Record<string, string>;
/** Query parameters; null/undefined entries are dropped when building URLs. */
export type HttpParams = Record<string, string | number | boolean | null | undefined>;
/** Opaque request body payload. */
export type HttpData = unknown;

/** Constructor options for BaseHttpClient; all are injectable for testing. */
export interface HttpClientOptions {
  // Supplies the User-Agent header value.
  userAgentProvider?: UserAgents;
  // Extra default headers; these override the computed User-Agent.
  defaultHeaders?: HttpHeaders;
  // Alternative fetch implementation (defaults to the global fetch).
  fetchImpl?: typeof fetch;
  // Alternative sleep used between retries (defaults to a real timer).
  sleep?: (ms: number) => Promise<void>;
}

/** Per-request options. */
export interface HttpRequestOptions {
  headers?: HttpHeaders;
  params?: HttpParams;
  // Raw request body; ignored when `json` is also provided.
  data?: HttpData;
  // JSON body; serialized and sent with Content-Type: application/json.
  json?: HttpData;
  // Override for the Retry-After header name (defaults to "retry-after").
  retryAfterHeader?: string;
}
|
||||
|
||||
export class HttpError extends Error {
|
||||
readonly status: number;
|
||||
readonly response: Response;
|
||||
|
||||
constructor(message: string, response: Response) {
|
||||
super(message);
|
||||
this.status = response.status;
|
||||
this.response = response;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Default sleep function using setTimeout.
|
||||
* @param ms - Milliseconds to sleep
|
||||
*/
|
||||
const defaultSleep = (ms: number): Promise<void> => {
|
||||
return delay(ms).then(() => undefined);
|
||||
};
|
||||
|
||||
/**
|
||||
* Builds a URL with query parameters.
|
||||
* @param url - The base URL
|
||||
* @param params - The query parameters to append
|
||||
*/
|
||||
const buildUrl = (url: string, params?: HttpParams): string => {
|
||||
if (!params || Object.keys(params).length === 0) {
|
||||
return url;
|
||||
}
|
||||
|
||||
const target = new URL(url);
|
||||
for (const [key, value] of Object.entries(params)) {
|
||||
if (value === undefined || value === null) continue;
|
||||
target.searchParams.set(key, String(value));
|
||||
}
|
||||
|
||||
return target.toString();
|
||||
};
|
||||
|
||||
/**
|
||||
* Computes the backoff time in milliseconds based on the configuration and attempt number.
|
||||
* @param config - Fetch client configuration
|
||||
* @param attempt - Current attempt number
|
||||
*/
|
||||
const computeBackoff = (config: FetchClientConfig, attempt: number): number => {
|
||||
const base = Math.min(
|
||||
config.backoffInitial * config.backoffMultiplier ** attempt,
|
||||
config.backoffMax,
|
||||
);
|
||||
const jitter = Math.random() * base * 0.25;
|
||||
return (base + jitter) * 1000;
|
||||
};
|
||||
|
||||
const parseRetryAfter = (header: string): number => {
|
||||
const numeric = Number.parseInt(header, 10);
|
||||
if (!Number.isNaN(numeric)) {
|
||||
return Math.max(0, numeric * 1000);
|
||||
}
|
||||
|
||||
const parsed = Date.parse(header);
|
||||
if (Number.isNaN(parsed)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const delta = parsed - Date.now();
|
||||
return delta > 0 ? delta : 0;
|
||||
};
|
||||
|
||||
/**
 * Base HTTP client providing common functionality.
 *
 * Resolves the fetch implementation, default headers (including a User-Agent
 * chosen by the provider) and the retry-delay helper shared by subclasses.
 *
 * @author Bernard Ngandu <bernard@devscast.tech>
 */
export class BaseHttpClient {
  // Retry/backoff/timeout settings shared by all requests.
  protected readonly config: FetchClientConfig;
  // Injectable fetch for testing; defaults to the global fetch.
  protected readonly fetchImpl: typeof fetch;
  // Injectable sleep for testing; defaults to a real timer.
  protected readonly sleep: (ms: number) => Promise<void>;
  // Default headers applied to every request (overridable per call).
  protected readonly headers: HttpHeaders;

  constructor(config: FetchClientConfig, options: HttpClientOptions = {}) {
    this.config = config;
    // User-Agent resolution order: explicit provider, then config, then default.
    const provider =
      options.userAgentProvider ??
      new UserAgents(config.rotate, config.userAgent ?? DEFAULT_USER_AGENT);
    const userAgent = provider.get() ?? config.userAgent ?? DEFAULT_USER_AGENT;

    const baseHeaders: HttpHeaders = { "User-Agent": userAgent };
    if (options.defaultHeaders) {
      // Caller-supplied defaults win over the computed User-Agent.
      Object.assign(baseHeaders, options.defaultHeaders);
    }

    this.headers = baseHeaders;
    this.fetchImpl = options.fetchImpl ?? fetch;
    this.sleep = options.sleep ?? defaultSleep;
  }

  /** Merges per-request headers over the client defaults (request wins). */
  protected buildHeaders(headers?: HttpHeaders): HeadersInit {
    return { ...this.headers, ...(headers ?? {}) };
  }

  /**
   * Sleeps before the next retry attempt.
   * Honors the response's Retry-After header when respectRetryAfter is set;
   * otherwise (or when the header yields 0) falls back to jittered backoff.
   * @param attempt - Zero-based attempt number
   * @param response - The failed response, if one was received
   * @param retryAfterHeader - Header name carrying the server-advised delay
   */
  protected async maybeDelay(
    attempt: number,
    response?: Response,
    retryAfterHeader: string = DEFAULT_RETRY_AFTER_HEADER,
  ): Promise<void> {
    let waitMs = 0;

    if (response) {
      const retryAfter = response.headers.get(retryAfterHeader);
      if (retryAfter && this.config.respectRetryAfter) {
        waitMs = parseRetryAfter(retryAfter);
      }
    }

    if (waitMs === 0) {
      waitMs = computeBackoff(this.config, attempt);
    }

    if (waitMs > 0) {
      await this.sleep(waitMs);
    }
  }
}
|
||||
|
||||
/**
|
||||
* Synchronous HTTP client with retry and timeout capabilities.
|
||||
*
|
||||
* @author Bernard Ngandu <bernard@devscast.tech>
|
||||
*/
|
||||
export class SyncHttpClient extends BaseHttpClient {
|
||||
async request(method: string, url: string, options: HttpRequestOptions = {}): Promise<Response> {
|
||||
const retryAfterHeader = options.retryAfterHeader ?? DEFAULT_RETRY_AFTER_HEADER;
|
||||
const target = buildUrl(url, options.params);
|
||||
|
||||
const maxAttempts = this.config.maxRetries + 1;
|
||||
let attempt = 0;
|
||||
let lastError: unknown;
|
||||
|
||||
while (attempt < maxAttempts) {
|
||||
const controller = new AbortController();
|
||||
let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
|
||||
try {
|
||||
timeoutHandle = setTimeout(() => controller.abort(), this.config.timeout * 1000);
|
||||
|
||||
const headers = this.buildHeaders(options.headers);
|
||||
const init: RequestInit = {
|
||||
body: options.data as BodyInit | undefined,
|
||||
headers,
|
||||
method,
|
||||
redirect: this.config.followRedirects ? "follow" : "manual",
|
||||
signal: controller.signal,
|
||||
};
|
||||
|
||||
if (options.json !== undefined) {
|
||||
init.body = JSON.stringify(options.json);
|
||||
(init.headers as Record<string, string>)["Content-Type"] ??= "application/json";
|
||||
}
|
||||
|
||||
const response = await this.fetchImpl(target, init);
|
||||
|
||||
if (
|
||||
TRANSIENT_HTTP_STATUSES.includes(response.status as number) &&
|
||||
attempt < this.config.maxRetries
|
||||
) {
|
||||
await this.maybeDelay(attempt, response, retryAfterHeader);
|
||||
attempt += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!response.ok) {
|
||||
throw new HttpError(`HTTP ${response.status} ${response.statusText}`, response);
|
||||
}
|
||||
|
||||
return response;
|
||||
} catch (error) {
|
||||
if (error instanceof HttpError) {
|
||||
lastError = error;
|
||||
throw error;
|
||||
}
|
||||
|
||||
if (error instanceof DOMException && error.name === "AbortError") {
|
||||
lastError = error;
|
||||
if (attempt >= this.config.maxRetries) {
|
||||
throw error;
|
||||
}
|
||||
} else {
|
||||
lastError = error;
|
||||
if (attempt >= this.config.maxRetries) {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
await this.maybeDelay(attempt);
|
||||
attempt += 1;
|
||||
} finally {
|
||||
if (timeoutHandle) {
|
||||
clearTimeout(timeoutHandle);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
throw lastError instanceof Error ? lastError : new Error("HTTP request failed after retries");
|
||||
}
|
||||
|
||||
get(url: string, options?: Omit<HttpRequestOptions, "data" | "json">): Promise<Response> {
|
||||
return this.request("GET", url, options);
|
||||
}
|
||||
|
||||
post(url: string, options: HttpRequestOptions = {}): Promise<Response> {
|
||||
return this.request("POST", url, options);
|
||||
}
|
||||
}
|
||||
|
||||
/** Convenience alias for the concrete client implementation used by callers. */
export type HttpClient = SyncHttpClient;
|
||||
@@ -0,0 +1,102 @@
|
||||
import { parse } from "node-html-parser";
|
||||
import { config } from "@/config";
|
||||
import { OPEN_GRAPH_USER_AGENT } from "@/constants";
|
||||
import { SyncHttpClient } from "@/http/http-client";
|
||||
import { UserAgents } from "@/http/user-agent";
|
||||
import { ArticleMetadata } from "@/schema";
|
||||
|
||||
/**
|
||||
* Picks the first non-empty value from the provided array.
|
||||
* @param values - An array of string values
|
||||
*/
|
||||
const pick = (values: Array<string | null | undefined>): string | undefined => {
|
||||
for (const value of values) {
|
||||
if (value && value.trim().length > 0) {
|
||||
return value.trim();
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
};
|
||||
|
||||
/**
|
||||
* Extracts the content of a meta tag given its property or name.
|
||||
* @param root - The root HTML element
|
||||
* @param property - The property or name of the meta tag to extract
|
||||
*/
|
||||
const extract = (root: ReturnType<typeof parse>, property: string): string | null => {
|
||||
const selector = `meta[property='${property}'], meta[name='${property}']`;
|
||||
const node = root.querySelector(selector);
|
||||
if (!node) {
|
||||
return null;
|
||||
}
|
||||
return node.getAttribute("content") ?? null;
|
||||
};
|
||||
|
||||
/**
 * OpenGraph consumer for extracting Open Graph metadata from HTML pages.
 * Uses a synchronous HTTP client to fetch the HTML content.
 *
 * @author Bernard Ngandu <bernard@devscast.tech>
 */
export class OpenGraph {
  // Only `get` is required; Pick keeps the dependency narrow and stubbable.
  private readonly client: Pick<SyncHttpClient, "get">;

  constructor() {
    const settings = config.fetch.client;
    // Present Facebook's crawler UA so sites serve their Open Graph markup.
    const provider = new UserAgents(true, OPEN_GRAPH_USER_AGENT);

    this.client = new SyncHttpClient(settings, {
      defaultHeaders: { "User-Agent": provider.og() },
      userAgentProvider: provider,
    });
  }

  /**
   * Consume a URL and extract Open Graph metadata.
   * Returns undefined on any fetch or parse failure — extraction is
   * deliberately best-effort.
   * @param url - The URL to fetch and parse
   */
  async consumeUrl(url: string): Promise<ArticleMetadata | undefined> {
    try {
      const response = await this.client.get(url);
      const html = await response.text();
      return OpenGraph.consumeHtml(html, url);
    } catch {
      // Swallow errors deliberately: missing metadata is not fatal.
      return undefined;
    }
  }

  /**
   * Consume HTML content and extract Open Graph metadata.
   * Falls back from og:* tags to conventional elements (<title>, first <img>,
   * canonical <link>) and finally to the supplied url.
   * @param html - HTML content as a string
   * @param url - Optional URL of the page, used as last-resort canonical
   */
  static consumeHtml(html: string, url?: string): ArticleMetadata | undefined {
    if (!html) {
      return undefined;
    }

    const root = parse(html);
    const title = pick([extract(root, "og:title"), root.querySelector("title")?.text]);
    const description = pick([extract(root, "og:description"), extract(root, "description")]);
    const image = pick([
      extract(root, "og:image"),
      root.querySelector("img")?.getAttribute("src") ?? null,
    ]);
    const canonical = pick([
      extract(root, "og:url"),
      root.querySelector("link[rel='canonical']")?.getAttribute("href") ?? null,
      url ?? null,
    ]);

    // All-empty metadata is treated as "nothing found".
    if (!title && !description && !image && !canonical) {
      return undefined;
    }

    return {
      description,
      image,
      title,
      url: canonical,
    };
  }
}
|
||||
@@ -0,0 +1,41 @@
|
||||
import { DEFAULT_USER_AGENT, OPEN_GRAPH_USER_AGENT } from "@/constants";
|
||||
|
||||
/**
|
||||
* User agent provider with optional rotation.
|
||||
* Allows fetching a random user agent from a predefined list
|
||||
* or using a fallback user agent.
|
||||
*
|
||||
* @author Bernard Ngandu <bernard@devscast.tech>
|
||||
*/
|
||||
export class UserAgents {
|
||||
private static readonly USER_AGENTS: string[] = [
|
||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 10_4_8; like Mac OS X) AppleWebKit/603.39 (KHTML, like Gecko) Chrome/52.0.3638.271 Mobile Safari/537.5",
|
||||
"Mozilla/50.0 (Linux; U; Linux x86_64; en-US) Gecko/20130401 Firefox/52.7",
|
||||
"Mozilla/5.0 (Linux; U; Android 5.0; SM-P815 Build/LRX22G) AppleWebKit/600.4 (KHTML, like Gecko) Chrome/48.0.1562.260 Mobile Safari/600.0",
|
||||
"Mozilla/5.0 (Windows; U; Windows NT 6.3;) AppleWebKit/533.34 (KHTML, like Gecko) Chrome/51.0.1883.215 Safari/533",
|
||||
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.3; x64; en-US Trident/4.0)",
|
||||
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_10_3) Gecko/20100101 Firefox/63.4",
|
||||
"Mozilla/5.0 (Linux; Linux x86_64; en-US) AppleWebKit/603.50 (KHTML, like Gecko) Chrome/55.0.2226.116 Safari/601",
|
||||
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 7_8_3; en-US) Gecko/20100101 Firefox/68.9",
|
||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 8_9_8; like Mac OS X) AppleWebKit/603.34 (KHTML, like Gecko) Chrome/47.0.1126.107 Mobile Safari/602.7",
|
||||
"Mozilla/5.0 (iPod; CPU iPod OS 8_2_0; like Mac OS X) AppleWebKit/601.40 (KHTML, like Gecko) Chrome/47.0.1590.178 Mobile Safari/535.2",
|
||||
];
|
||||
|
||||
private readonly rotate: boolean;
|
||||
private readonly fallback: string;
|
||||
|
||||
constructor(rotate: boolean = true, fallback: string = DEFAULT_USER_AGENT) {
|
||||
this.rotate = rotate;
|
||||
this.fallback = fallback;
|
||||
}
|
||||
|
||||
og(): string {
|
||||
return OPEN_GRAPH_USER_AGENT;
|
||||
}
|
||||
|
||||
get(): string {
|
||||
if (!this.rotate) return this.fallback;
|
||||
const idx = Math.floor(Math.random() * UserAgents.USER_AGENTS.length);
|
||||
return UserAgents.USER_AGENTS[idx]!;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,137 @@
|
||||
import { logger } from "@basango/logger";
|
||||
|
||||
import { config, env } from "@/config";
|
||||
import { SyncHttpClient } from "@/http/http-client";
|
||||
import { createQueueManager, QueueManager } from "@/process/async/queue";
|
||||
import {
|
||||
DetailsTaskPayload,
|
||||
ListingTaskPayload,
|
||||
ProcessingTaskPayload,
|
||||
} from "@/process/async/schemas";
|
||||
import { resolveCrawlerConfig } from "@/process/crawler";
|
||||
import { HtmlCrawler } from "@/process/parsers/html";
|
||||
import { WordPressCrawler } from "@/process/parsers/wordpress";
|
||||
import { JsonlPersistor } from "@/process/persistence";
|
||||
import { Article, HtmlSourceConfig, SourceKindSchema, WordPressSourceConfig } from "@/schema";
|
||||
import { createDateRange, formatDateRange, formatPageRange, resolveSourceConfig } from "@/utils";
|
||||
|
||||
export const collectHtmlListing = async (
|
||||
payload: ListingTaskPayload,
|
||||
manager: QueueManager = createQueueManager(),
|
||||
): Promise<number> => {
|
||||
const source = resolveSourceConfig(payload.sourceId) as HtmlSourceConfig;
|
||||
if (source.sourceKind !== "html") {
|
||||
return await collectWordPressListing(payload, manager);
|
||||
}
|
||||
|
||||
const settings = resolveCrawlerConfig(source, payload);
|
||||
const crawler = new HtmlCrawler(settings);
|
||||
const pageRange = settings.pageRange ?? (await crawler.getPagination());
|
||||
|
||||
let queued = 0;
|
||||
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
|
||||
const target = crawler.buildPageUrl(page) ?? `${source.sourceUrl}`;
|
||||
|
||||
try {
|
||||
const items = await crawler.fetchLinks(target, source.sourceSelectors.articles);
|
||||
for (const node of items) {
|
||||
const url = crawler.extractLink(node);
|
||||
if (!url) continue;
|
||||
|
||||
await manager.enqueueArticle({
|
||||
category: payload.category,
|
||||
dateRange: createDateRange(payload.dateRange),
|
||||
sourceId: payload.sourceId,
|
||||
url,
|
||||
} as DetailsTaskPayload);
|
||||
queued += 1;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error({ error, target }, "Failed to crawl page");
|
||||
}
|
||||
}
|
||||
|
||||
return queued;
|
||||
};
|
||||
|
||||
export const collectWordPressListing = async (
|
||||
payload: ListingTaskPayload,
|
||||
manager: QueueManager = createQueueManager(),
|
||||
): Promise<number> => {
|
||||
const source = resolveSourceConfig(payload.sourceId) as WordPressSourceConfig;
|
||||
if (source.sourceKind !== "wordpress") {
|
||||
return await collectHtmlListing(payload, manager);
|
||||
}
|
||||
|
||||
const settings = resolveCrawlerConfig(source, payload);
|
||||
const crawler = new WordPressCrawler(settings);
|
||||
const pageRange = settings.pageRange ?? (await crawler.getPagination());
|
||||
|
||||
let queued = 0;
|
||||
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
|
||||
const url = crawler.postsEndpoint(page);
|
||||
|
||||
try {
|
||||
const entries = await crawler.fetchLinks(url);
|
||||
for (const data of entries) {
|
||||
const url = data.link;
|
||||
if (!url) continue;
|
||||
|
||||
await manager.enqueueArticle({
|
||||
category: payload.category,
|
||||
data,
|
||||
dateRange: createDateRange(payload.dateRange),
|
||||
sourceId: payload.sourceId,
|
||||
url,
|
||||
} as DetailsTaskPayload);
|
||||
queued += 1;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error({ error, page }, "Failed to fetch WordPress page");
|
||||
}
|
||||
}
|
||||
|
||||
return queued;
|
||||
};
|
||||
|
||||
/**
 * Fetches and persists a single article for the given details payload,
 * dispatching to the HTML or WordPress crawler based on the source kind.
 * NOTE(review): the persistors created here are never explicitly closed —
 * confirm JsonlPersistor flushes per write, or route through closePersistors.
 */
export const collectArticle = async (payload: DetailsTaskPayload): Promise<unknown> => {
  const source = resolveSourceConfig(payload.sourceId);
  // Ranges are re-serialized to the string form CrawlingOptions expects.
  const settings = resolveCrawlerConfig(source, {
    category: payload.category,
    dateRange: payload.dateRange ? formatDateRange(payload.dateRange) : undefined,
    pageRange: payload.pageRange ? formatPageRange(payload.pageRange) : undefined,
    sourceId: payload.sourceId,
  });
  const persistors = [
    new JsonlPersistor({
      directory: config.paths.data,
      sourceId: String(source.sourceId),
    }),
  ];

  if (source.sourceKind === SourceKindSchema.enum.html) {
    // HTML sources need the article page itself before parsing.
    if (!payload.url) throw new Error("Missing article url");
    const crawler = new HtmlCrawler(settings, { persistors });
    const html = await crawler.crawl(payload.url);
    return await crawler.fetchOne(html, settings.dateRange);
  }

  if (source.sourceKind === SourceKindSchema.enum.wordpress) {
    // WordPress sources carry the raw post object in payload.data.
    const crawler = new WordPressCrawler(settings, { persistors });
    return await crawler.fetchOne(payload.data ?? {}, settings.dateRange);
  }

  throw new Error(`Unsupported source kind`);
};
|
||||
|
||||
export const forwardForProcessing = async (payload: ProcessingTaskPayload): Promise<Article> => {
|
||||
logger.info({ article: payload.article.title }, "Ready for downstream processing");
|
||||
|
||||
const client = new SyncHttpClient(config.fetch.client);
|
||||
const endpoint = env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT");
|
||||
|
||||
await client.post(endpoint, { json: payload.article });
|
||||
logger.info({ article: payload.article.title }, "Forwarded article to API");
|
||||
|
||||
return payload.article;
|
||||
};
|
||||
@@ -0,0 +1,107 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
import { JobsOptions, Queue, QueueOptions } from "bullmq";
|
||||
import IORedis from "ioredis";
|
||||
import { config, FetchAsyncConfig } from "@/config";
|
||||
import {
|
||||
DetailsTaskPayload,
|
||||
DetailsTaskPayloadSchema,
|
||||
ListingTaskPayload,
|
||||
ListingTaskPayloadSchema,
|
||||
ProcessingTaskPayload,
|
||||
ProcessingTaskPayloadSchema,
|
||||
} from "@/process/async/schemas";
|
||||
import { parseRedisUrl } from "@/utils";
|
||||
|
||||
/** Minimal queue surface: enqueue a named job and receive its id. */
export interface QueueBackend<T = unknown> {
  add: (name: string, data: T, opts?: JobsOptions) => Promise<{ id: string }>;
}

/**
 * Factory producing a QueueBackend for a queue name.
 * A shared Redis connection may be supplied; implementations otherwise
 * create their own.
 */
export type QueueFactory = (
  queueName: string,
  settings: FetchAsyncConfig,
  connection?: IORedis,
) => QueueBackend;
|
||||
|
||||
/**
 * Default queue factory backed by BullMQ, exposing only a minimal add().
 * NOTE(review): when no connection is supplied, a new IORedis connection is
 * created per queue and never closed here — confirm callers always pass one
 * (createQueueManager does).
 */
const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => {
  const redisConnection =
    connection ??
    new IORedis(settings.redisUrl, {
      ...parseRedisUrl(settings.redisUrl),
      // BullMQ requires maxRetriesPerRequest: null for its blocking commands.
      maxRetriesPerRequest: null,
    });
  const options: QueueOptions = {
    connection: redisConnection,
    prefix: settings.prefix,
  };

  const queue = new Queue(queueName, options);
  return {
    add: async (name, data, opts) => {
      const job = await queue.add(name, data, {
        // A TTL of 0 means "do not keep" — mapped to BullMQ's remove flags.
        removeOnComplete: settings.ttl.result === 0 ? true : undefined,
        removeOnFail: settings.ttl.failure === 0 ? true : undefined,
        ...opts,
      });
      // BullMQ job ids may be undefined; fall back to a generated UUID.
      return { id: job.id ?? randomUUID() };
    },
  };
};
|
||||
|
||||
/** Injection points for tests: custom factory and/or shared Redis connection. */
export interface CreateQueueManagerOptions {
  queueFactory?: QueueFactory;
  connection?: IORedis;
}

/** Typed facade over the listing/details/processing queues. */
export interface QueueManager {
  readonly settings: FetchAsyncConfig;
  // Shared Redis connection owned by the manager; released by close().
  readonly connection: IORedis;
  enqueueListing: (payload: ListingTaskPayload) => Promise<{ id: string }>;
  enqueueArticle: (payload: DetailsTaskPayload) => Promise<{ id: string }>;
  enqueueProcessed: (payload: ProcessingTaskPayload) => Promise<{ id: string }>;
  // Fully-qualified (prefix-included) names of all managed queues.
  iterQueueNames: () => string[];
  // Builds a fully-qualified queue name from a suffix.
  queueName: (suffix: string) => string;
  close: () => Promise<void>;
}
|
||||
|
||||
/**
 * Creates a QueueManager bound to the configured Redis instance, exposing
 * typed enqueue helpers for the listing, details and processing queues.
 * Each payload is validated with its zod schema before being enqueued.
 */
export const createQueueManager = (options: CreateQueueManagerOptions = {}): QueueManager => {
  const settings = config.fetch.async;

  const connection =
    options.connection ??
    new IORedis(settings.redisUrl, {
      ...parseRedisUrl(settings.redisUrl),
      // Required by BullMQ for blocking Redis commands.
      maxRetriesPerRequest: null,
    });
  const factory = options.queueFactory ?? defaultQueueFactory;

  // All queues share the single manager-owned connection.
  const ensureQueue = (queueName: string) => factory(queueName, settings, connection);

  return {
    close: async () => {
      await connection.quit();
    },
    connection,
    enqueueArticle: (payload) => {
      const data = DetailsTaskPayloadSchema.parse(payload);
      const queue = ensureQueue(settings.queues.details);
      return queue.add("collect_article", data);
    },
    enqueueListing: (payload) => {
      const data = ListingTaskPayloadSchema.parse(payload);
      const queue = ensureQueue(settings.queues.listing);
      return queue.add("collect_listing", data);
    },
    enqueueProcessed: (payload) => {
      const data = ProcessingTaskPayloadSchema.parse(payload);
      const queue = ensureQueue(settings.queues.processing);
      return queue.add("forward_for_processing", data);
    },
    // Fully-qualified names used by workers to subscribe to every queue.
    iterQueueNames: () => [
      `${settings.prefix}:${settings.queues.listing}`,
      `${settings.prefix}:${settings.queues.details}`,
      `${settings.prefix}:${settings.queues.processing}`,
    ],
    queueName: (suffix: string) => `${settings.prefix}:${suffix}`,
    settings,
  };
};
|
||||
@@ -0,0 +1,28 @@
|
||||
import { z } from "zod";
|
||||
import { ArticleSchema, DateRangeSchema, PageRangeSchema } from "@/schema";
|
||||
|
||||
/** Payload for a listing-collection job; ranges travel in string form. */
export const ListingTaskPayloadSchema = z.object({
  category: z.string().optional(),
  dateRange: z.string().optional(),
  pageRange: z.string().optional(),
  sourceId: z.string(),
});

/** Payload for a single-article collection job. */
export const DetailsTaskPayloadSchema = z.object({
  category: z.string().optional(),
  // Raw source entry (e.g. a WordPress post object).
  // NOTE(review): z.any() disables validation here — consider z.unknown().
  data: z.any().optional(),
  dateRange: DateRangeSchema.optional(),
  page: z.number().int().nonnegative().optional(),
  pageRange: PageRangeSchema.optional(),
  sourceId: z.string(),
  // NOTE(review): z.url() is zod v4 API; on zod v3 this must be z.string().url().
  url: z.url(),
});

/** Payload handing a fully-parsed article to downstream processing. */
export const ProcessingTaskPayloadSchema = z.object({
  article: ArticleSchema,
  sourceId: z.string(),
});

export type ListingTaskPayload = z.infer<typeof ListingTaskPayloadSchema>;
export type DetailsTaskPayload = z.infer<typeof DetailsTaskPayloadSchema>;
export type ProcessingTaskPayload = z.infer<typeof ProcessingTaskPayloadSchema>;
|
||||
@@ -0,0 +1,60 @@
|
||||
import { logger } from "@basango/logger";
|
||||
import * as handlers from "@/process/async/handlers";
|
||||
import { createQueueManager } from "@/process/async/queue";
|
||||
import {
|
||||
DetailsTaskPayloadSchema,
|
||||
ListingTaskPayloadSchema,
|
||||
ProcessingTaskPayloadSchema,
|
||||
} from "@/process/async/schemas";
|
||||
import { CrawlingOptions } from "@/process/crawler";
|
||||
|
||||
export const collectListing = async (payload: unknown): Promise<number> => {
|
||||
const data = ListingTaskPayloadSchema.parse(payload);
|
||||
logger.debug({ data }, "Collecting listing");
|
||||
|
||||
const count = await handlers.collectHtmlListing(data);
|
||||
logger.info({ count }, "Listing collection completed");
|
||||
|
||||
return count;
|
||||
};
|
||||
|
||||
export const collectArticle = async (payload: unknown): Promise<unknown> => {
|
||||
const data = DetailsTaskPayloadSchema.parse(payload);
|
||||
logger.info({ data }, "Collecting article");
|
||||
|
||||
const result = await handlers.collectArticle(data);
|
||||
logger.info({ url: data.url }, "Article collection completed");
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
export const forwardForProcessing = async (payload: unknown): Promise<unknown> => {
|
||||
const data = ProcessingTaskPayloadSchema.parse(payload);
|
||||
logger.debug({ sourceId: data.sourceId }, "Forwarding article for processing");
|
||||
|
||||
const result = await handlers.forwardForProcessing(data);
|
||||
logger.info({ result }, "Article forwarded for processing");
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
export const scheduleAsyncCrawl = async (options: CrawlingOptions): Promise<string> => {
|
||||
const payload = ListingTaskPayloadSchema.parse({
|
||||
category: options.category,
|
||||
dateRange: options.dateRange,
|
||||
pageRange: options.pageRange,
|
||||
sourceId: options.sourceId,
|
||||
});
|
||||
|
||||
const manager = createQueueManager();
|
||||
logger.info({ payload }, "Scheduling listing collection job");
|
||||
|
||||
try {
|
||||
const job = await manager.enqueueListing(payload);
|
||||
logger.info({ job }, "Scheduled listing collection job");
|
||||
|
||||
return job.id;
|
||||
} finally {
|
||||
await manager.close();
|
||||
}
|
||||
};
|
||||
@@ -0,0 +1,74 @@
|
||||
import { QueueEvents, Worker } from "bullmq";
|
||||
import IORedis from "ioredis";
|
||||
|
||||
import { QueueFactory, QueueManager } from "@/process/async/queue";
|
||||
import { collectArticle, collectListing, forwardForProcessing } from "@/process/async/tasks";
|
||||
|
||||
/**
 * Options for startWorker. queueManager is required and owns the Redis
 * connection used by all workers.
 * NOTE(review): `connection` and `queueFactory` appear unused by
 * startWorker — confirm whether they can be removed.
 */
export interface WorkerOptions {
  // Defaults to every queue name reported by the manager.
  queueNames?: string[];
  connection?: IORedis;
  queueFactory?: QueueFactory;
  // Jobs processed in parallel per worker (default 5).
  concurrency?: number;
  // Invoked on job failures and worker-level errors.
  onError?: (error: Error) => void;
  queueManager: QueueManager;
}

/** Handle to running workers; close() stops workers and event streams. */
export interface WorkerHandle {
  readonly workers: Worker[];
  readonly events: QueueEvents[];
  close: () => Promise<void>;
}
|
||||
|
||||
export const startWorker = (options: WorkerOptions): WorkerHandle => {
|
||||
const manager = options.queueManager;
|
||||
const queueNames = options.queueNames ?? manager.iterQueueNames();
|
||||
const workers: Worker[] = [];
|
||||
const events: QueueEvents[] = [];
|
||||
|
||||
const connection = manager.connection;
|
||||
|
||||
for (const queueName of queueNames) {
|
||||
const worker = new Worker(
|
||||
queueName,
|
||||
async (job) => {
|
||||
switch (job.name) {
|
||||
case "collect_listing":
|
||||
return collectListing(job.data);
|
||||
case "collect_article":
|
||||
return collectArticle(job.data);
|
||||
case "forward_for_processing":
|
||||
return forwardForProcessing(job.data);
|
||||
default:
|
||||
throw new Error(`Unknown job name: ${job.name}`);
|
||||
}
|
||||
},
|
||||
{
|
||||
concurrency: options.concurrency ?? 5,
|
||||
connection,
|
||||
},
|
||||
);
|
||||
|
||||
if (options.onError) {
|
||||
worker.on("failed", (_, err) => options.onError?.(err as Error));
|
||||
worker.on("error", (err) => options.onError?.(err as Error));
|
||||
}
|
||||
|
||||
const queueEvents = new QueueEvents(queueName, { connection });
|
||||
|
||||
workers.push(worker);
|
||||
events.push(queueEvents);
|
||||
}
|
||||
|
||||
return {
|
||||
close: async () => {
|
||||
await Promise.all(workers.map((worker) => worker.close()));
|
||||
await Promise.all(events.map((event) => event.close()));
|
||||
|
||||
if (!options.queueManager) {
|
||||
await manager.close();
|
||||
}
|
||||
},
|
||||
events,
|
||||
workers,
|
||||
};
|
||||
};
|
||||
@@ -0,0 +1,44 @@
|
||||
import logger from "@basango/logger";
|
||||
import { config, FetchCrawlerConfig } from "@/config";
|
||||
import { JsonlPersistor, Persistor } from "@/process/persistence";
|
||||
import { AnySourceConfig } from "@/schema";
|
||||
import { createDateRange, createPageRange } from "@/utils";
|
||||
|
||||
/** CLI-level options describing what to crawl. */
export interface CrawlingOptions {
  /** Identifier of the configured source to crawl. */
  sourceId: string;
  /** Optional "start:end" page range spec (parsed by createPageRange). */
  pageRange?: string | undefined;
  /** Optional "start:end" date range spec (parsed by createDateRange). */
  dateRange?: string | undefined;
  /** Optional category substituted into {category} pagination templates. */
  category?: string | undefined;
}
|
||||
|
||||
export const resolveCrawlerConfig = (
|
||||
source: AnySourceConfig,
|
||||
options: CrawlingOptions,
|
||||
): FetchCrawlerConfig => {
|
||||
return {
|
||||
...config.fetch.crawler,
|
||||
category: options.category,
|
||||
dateRange: createDateRange(options.dateRange),
|
||||
pageRange: createPageRange(options.pageRange),
|
||||
source,
|
||||
};
|
||||
};
|
||||
|
||||
export const createPersistors = (source: AnySourceConfig): Persistor[] => {
|
||||
return [
|
||||
new JsonlPersistor({
|
||||
directory: config.paths.data,
|
||||
sourceId: source.sourceId,
|
||||
}),
|
||||
];
|
||||
};
|
||||
|
||||
export const closePersistors = async (persistors: Persistor[]): Promise<void> => {
|
||||
for (const persistor of persistors) {
|
||||
try {
|
||||
await persistor.close();
|
||||
} catch (error) {
|
||||
logger.warn({ error }, "Failed to close persistor");
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -0,0 +1,107 @@
|
||||
import { HTMLElement, parse as parseHtml } from "node-html-parser";
|
||||
import { config, FetchCrawlerConfig } from "@/config";
|
||||
import { SyncHttpClient } from "@/http/http-client";
|
||||
import { OpenGraph } from "@/http/open-graph";
|
||||
import type { Persistor } from "@/process/persistence";
|
||||
import { AnySourceConfig, Article } from "@/schema";
|
||||
|
||||
/** Construction options shared by all crawlers. */
export interface CrawlerOptions {
  /** Sinks that receive every persisted article; defaults to none. */
  persistors?: Persistor[];
}
|
||||
|
||||
/**
 * Common behaviour shared by all crawlers: HTTP access, HTML parsing helpers
 * and Open Graph enrichment. Subclasses implement {@link fetch}.
 */
export abstract class BaseCrawler {
  /** Resolved crawler settings (page/date ranges, category, bound source). */
  protected readonly settings: FetchCrawlerConfig;
  /** The source configuration this crawler is bound to. */
  protected readonly source: AnySourceConfig;
  /** HTTP client configured from the global fetch settings. */
  protected readonly http: SyncHttpClient;
  /** Sinks that receive every persisted article. */
  protected readonly persistors: Persistor[];
  /** Helper used to enrich records with Open Graph metadata. */
  protected readonly openGraph: OpenGraph;

  /**
   * @param settings - Crawler settings; must carry a bound source
   * @param options - Optional persistors
   * @throws Error when no source is bound to the settings
   */
  protected constructor(settings: FetchCrawlerConfig, options: CrawlerOptions = {}) {
    if (!settings.source) {
      throw new Error("Crawler requires a bound source");
    }

    this.http = new SyncHttpClient(config.fetch.client);
    this.persistors = options.persistors ?? [];
    this.openGraph = new OpenGraph();

    this.settings = settings;
    this.source = settings.source as AnySourceConfig;
  }

  /**
   * Fetch and process articles from the source.
   */
  abstract fetch(): Promise<void> | void;

  /**
   * Crawl the given URL and return the HTML content as a string.
   * @param url - The URL to crawl
   */
  async crawl(url: string): Promise<string> {
    const response = await this.http.get(url);
    return await response.text();
  }

  /**
   * Extract trimmed text content from an HTML node.
   * Returns null for a missing node or when the text trims to empty.
   * @param node - The HTML node
   */
  protected textContent(node: HTMLElement | null | undefined): string | null {
    if (!node) return null;
    // innerText keeps spacing similar to browser rendering
    const value = node.innerText ?? node.text;
    const text = value.trim();
    return text.length ? text : null;
  }

  /**
   * Extract the first matching element from the root using the selector.
   * Invalid selectors are treated as "no match" rather than raising.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   */
  protected extractFirst(root: HTMLElement, selector?: string | null): HTMLElement | null {
    if (!selector) return null;
    try {
      return root.querySelector(selector) ?? null;
    } catch {
      return null;
    }
  }

  /**
   * Extract all matching elements from the root using the selector.
   * Invalid selectors yield an empty list rather than raising.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   */
  protected extractAll(root: HTMLElement, selector?: string | null): HTMLElement[] {
    if (!selector) return [];
    try {
      return root.querySelectorAll(selector);
    } catch {
      return [];
    }
  }

  /**
   * Parse HTML string into an HTMLElement.
   * @param html - The HTML string
   */
  protected parseHtml(html: string): HTMLElement {
    // NOTE(review): double cast bridges node-html-parser's root type to
    // HTMLElement — confirm against the installed library version.
    return parseHtml(html) as unknown as HTMLElement;
  }

  /**
   * Enrich the record with Open Graph metadata from the given URL.
   * Any failure degrades to an undefined metadata field (best-effort).
   * @param record - The article record
   * @param url - The URL to fetch Open Graph data from
   */
  protected async enrichWithOpenGraph(record: Article, url?: string): Promise<Article> {
    try {
      const metadata = url ? await this.openGraph.consumeUrl(url) : undefined;
      return { ...record, metadata };
    } catch {
      return { ...record, metadata: undefined };
    }
  }
}
|
||||
@@ -0,0 +1,335 @@
|
||||
import { logger } from "@basango/logger";
|
||||
import { getUnixTime, isMatch as isDateMatch, parse as parseDateFns } from "date-fns";
|
||||
import { HTMLElement } from "node-html-parser";
|
||||
import TurndownService from "turndown";
|
||||
import { FetchCrawlerConfig } from "@/config";
|
||||
import { BaseCrawler } from "@/process/parsers/base";
|
||||
import { Persistor, persist } from "@/process/persistence";
|
||||
import { DateRange, HtmlSourceConfig } from "@/schema";
|
||||
import { createAbsoluteUrl, isTimestampInRange } from "@/utils";
|
||||
|
||||
// Shared HTML -> Markdown converter used for article bodies.
const md = new TurndownService({
  bulletListMarker: "-",
  headingStyle: "atx",
  hr: "---",
});
|
||||
|
||||
/**
|
||||
* Create a safe RegExp from the given pattern.
|
||||
* @param pattern
|
||||
*/
|
||||
const safeRegExp = (pattern?: string | null): RegExp | null => {
|
||||
if (!pattern) return null;
|
||||
try {
|
||||
return new RegExp(pattern, "g");
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Crawler for generic HTML pages.
|
||||
*/
|
||||
/**
 * Crawler for generic HTML pages. Walks listing pages via a pagination
 * template, extracts article fields with the source's CSS selectors, and
 * converts bodies to Markdown before persisting.
 */
export class HtmlCrawler extends BaseCrawler {
  readonly source: HtmlSourceConfig;
  // Link of the article currently being processed; used by fetchOne when the
  // detail page itself carries no link selector match. Reset after each article.
  private currentArticleUrl: string | null = null;

  /**
   * @param settings - Crawler settings bound to an HTML source
   * @param options - Optional persistors
   * @throws Error when the bound source is not of kind "html"
   */
  constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
    super(settings, options);

    if (!settings.source || settings.source.sourceKind !== "html") {
      throw new Error("HtmlCrawler requires a source of kind 'html'");
    }
    this.source = this.settings.source as HtmlSourceConfig;
  }

  /**
   * Fetch and process all pages in the configured (or discovered) page range.
   * Stops the whole crawl as soon as fetchOne() returns null.
   */
  async fetch(): Promise<void> {
    const pageRange = this.settings.pageRange ?? (await this.getPagination());
    const dateRange = this.settings.dateRange;

    const articleSelector = this.source.sourceSelectors.articles;
    if (!articleSelector) {
      logger.error(
        { source: this.source.sourceId },
        "No article selector configured for HTML source",
      );
      return;
    }

    let stop = false;
    for (let page = pageRange.start; page <= pageRange.end; page += 1) {
      const pageUrl = this.buildPageUrl(page);
      let html: string;
      try {
        html = await this.crawl(pageUrl);
      } catch (error) {
        // A failed page is skipped; the crawl continues with the next page.
        logger.error({ error, page, pageUrl }, "> page %s => [failed]", page);
        continue;
      }

      const root = this.parseHtml(html);
      const articles = this.extractAll(root, articleSelector);
      if (!articles.length) {
        logger.info({ page }, "No articles found on page");
        continue;
      }

      for (const node of articles) {
        try {
          this.currentArticleUrl = this.extractLink(node);
          let targetHtml = node.toString();

          // For listing-only sources, fetch the full detail page instead of
          // parsing the listing snippet.
          if (this.source.requiresDetails) {
            if (!this.currentArticleUrl) {
              logger.debug({ page }, "Skipping article without link for details");
              continue;
            }
            try {
              targetHtml = await this.crawl(this.currentArticleUrl);
            } catch (err) {
              logger.error(
                { error: err, url: this.currentArticleUrl },
                "Failed to fetch detail page",
              );
              continue;
            }
          }

          const saved = await this.fetchOne(targetHtml, dateRange);
          // stop early on first out-of-range if pages are sorted by date desc
          // NOTE(review): fetchOne also returns null for an article without a
          // link, which halts the entire crawl here — confirm that is intended.
          if (saved === null) {
            stop = true;
            break;
          }
        } catch (error) {
          logger.error({ error, pageUrl }, "Failed to process article on page");
        } finally {
          this.currentArticleUrl = null;
        }
      }

      if (stop) break;
    }
  }

  /**
   * Fetch and process a single HTML article.
   * @param html - The HTML content of the article
   * @param dateRange - Optional date range for filtering
   * @returns The persisted article, or null when skipped (no link, or
   *   outside the date range)
   */
  async fetchOne(html: string, dateRange?: DateRange | null) {
    const root = this.parseHtml(html);
    const sel = this.source.sourceSelectors;

    const titleText = this.extractText(root, sel.articleTitle) ?? "Untitled";
    // Prefer the link captured on the listing page; fall back to the detail page.
    const link = this.currentArticleUrl ?? this.extractLink(root);
    if (!link) {
      logger.warn({ title: titleText }, "Skipping article without link");
      return null;
    }

    const body = this.extractBody(root, sel.articleBody);
    const categories = this.extractCategories(root, sel.articleCategories);
    const rawDate = this.extractText(root, sel.articleDate);
    const timestamp = this.computeTimestamp(rawDate);

    if (dateRange && !isTimestampInRange(dateRange, timestamp)) {
      logger.info(
        { date: rawDate, link, timestamp, title: titleText },
        "Skipping article outside date range",
      );
      return null;
    }

    const enriched = await this.enrichWithOpenGraph(
      {
        body,
        categories,
        link,
        source: this.source.sourceId,
        timestamp,
        title: titleText,
      },
      link,
    );

    return await persist(enriched, this.persistors);
  }

  /**
   * Fetch links from the target URL using the given selector.
   * @param target - The target URL to crawl
   * @param selector - The CSS selector to extract links
   */
  async fetchLinks(target: string, selector: string) {
    const html = await this.crawl(target);
    const root = this.parseHtml(html);
    return this.extractAll(root, selector);
  }

  /**
   * Get the pagination range (start and end page numbers).
   * NOTE(review): HTML pagination starts at page 0 here, while the WordPress
   * crawler starts at 1 — confirm the asymmetry is intended.
   */
  async getPagination(): Promise<{ start: number; end: number }> {
    return { end: await this.getLastPage(), start: 0 };
  }

  /**
   * Determine the last page number from pagination links on page 0.
   * Falls back to 1 on any failure or when no usable number is found.
   */
  private async getLastPage(): Promise<number> {
    const template = this.applyCategory(this.source.paginationTemplate);
    const url = `${this.source.sourceUrl}${template}`;
    try {
      const html = await this.crawl(url);
      const root = this.parseHtml(html);
      const links = this.extractAll(root, this.source.sourceSelectors.pagination);
      if (!links.length) return 1;
      const last = links[links.length - 1]!;
      const href = last.getAttribute("href") as string | null;
      if (!href) return 1;

      // Heuristic: prefer a number in the href, else "page" query param
      const numberMatch = href.match(/(\d+)/);
      if (numberMatch) {
        const page = Number.parseInt(numberMatch[1]!, 10);
        return Number.isFinite(page) && page > 0 ? page : 1;
      }
      const urlObj = new URL(createAbsoluteUrl(this.source.sourceUrl, href));
      const pageParam = urlObj.searchParams.get("page");
      if (pageParam) {
        const page = Number.parseInt(pageParam, 10);
        return Number.isFinite(page) && page > 0 ? page : 1;
      }
      return 1;
    } catch {
      return 1;
    }
  }

  /**
   * Build the URL for a given page number.
   * Uses a {page} placeholder when present, otherwise appends a "page"
   * query parameter for pages > 0.
   * @param page - The page number
   */
  buildPageUrl(page: number): string {
    let template = this.applyCategory(this.source.paginationTemplate);
    if (template.includes("{page}")) {
      template = template.replace("{page}", String(page));
    } else if (page > 0) {
      const sep = template.includes("?") ? "&" : "?";
      template = `${template}${sep}page=${page}`;
    }
    return createAbsoluteUrl(this.source.sourceUrl, template);
  }

  /**
   * Apply category replacement in the template if needed.
   * An unset category replaces the placeholder with an empty string.
   * @param template - The URL template
   */
  private applyCategory(template: string): string {
    if (template.includes("{category}")) {
      const replacement = this.settings.category ?? "";
      return template.replace("{category}", replacement);
    }
    return template;
  }

  /**
   * Extract an absolute link URL from the given node using the source's
   * articleLink selector. Checks href, then data-href, then src.
   * @param node - The HTML element
   */
  extractLink(node: HTMLElement): string | null {
    const selector = this.source.sourceSelectors.articleLink;
    if (!selector) return null;
    const target = this.extractFirst(node, selector);
    if (!target) return null;

    const href =
      target.getAttribute("href") ?? target.getAttribute("data-href") ?? target.getAttribute("src");

    if (!href) return null;
    return createAbsoluteUrl(this.source.sourceUrl, href);
  }

  /**
   * Extract text content from the root using the selector.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   */
  private extractText(root: HTMLElement, selector?: string | null): string | null {
    if (!selector) return null;
    const target = this.extractFirst(root, selector);
    if (!target) return null;

    // If it's an image, prefer alt/title
    const tag = target.tagName.toLowerCase();
    if (tag === "img") {
      const alt = target.getAttribute("alt");
      const title = target.getAttribute("title");
      const pick = (alt ?? title ?? "").trim();
      if (pick.length > 0) return pick;
    }
    return this.textContent(target);
  }

  /**
   * Extract body content as Markdown from the root using the selector.
   * Falls back to converting the whole root when the selector matches nothing.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   */
  private extractBody(root: HTMLElement, selector?: string | null): string {
    if (selector) {
      const nodes = this.extractAll(root, selector);
      if (nodes.length) {
        const parts = nodes.map((n) => md.turndown(n.toString())).filter(Boolean);
        if (parts.length) return parts.join("\n");
      }
    }
    return md.turndown(root.toString());
  }

  /**
   * Extract lowercase, de-duplicated categories from the root.
   * @param root - The root HTML element
   * @param selector - The CSS selector
   */
  private extractCategories(root: HTMLElement, selector?: string | null): string[] {
    if (!selector) return [];
    const values: string[] = [];
    for (const node of this.extractAll(root, selector)) {
      const text = this.textContent(node);
      if (!text) continue;
      const lower = text.toLowerCase();
      if (!values.includes(lower)) values.push(lower);
    }
    return values;
  }

  /**
   * Compute a UNIX timestamp (seconds) from a raw date string, applying the
   * source's optional cleanup pattern and date-fns format. Falls back to
   * Date.parse, then to "now", when parsing fails.
   * @param raw - Raw date string
   * @private
   */
  private computeTimestamp(raw?: string | null): number {
    if (!raw) return Math.floor(Date.now() / 1000);
    let value = raw.trim();
    const pattern = safeRegExp(this.source.sourceDate?.pattern);
    const replacement = this.source.sourceDate?.replacement ?? "";
    if (pattern) {
      try {
        value = value.replace(pattern, replacement);
      } catch {
        // ignore pattern failures
      }
    }
    const format = this.source.sourceDate?.format ?? "yyyy-LL-dd HH:mm";
    if (!isDateMatch(value, format)) {
      // fallback: try native Date.parse as last resort
      const parsed = Date.parse(value);
      return Number.isNaN(parsed) ? Math.floor(Date.now() / 1000) : Math.floor(parsed / 1000);
    }
    const date = parseDateFns(value, format, new Date());
    const ts = getUnixTime(date);
    return Number.isFinite(ts) ? ts : Math.floor(Date.now() / 1000);
  }
}
|
||||
@@ -0,0 +1,239 @@
|
||||
import { logger } from "@basango/logger";
|
||||
import TurndownService from "turndown";
|
||||
import { FetchCrawlerConfig } from "@/config";
|
||||
import { BaseCrawler } from "@/process/parsers/base";
|
||||
import { Persistor, persist } from "@/process/persistence";
|
||||
import { DateRange, PageRange, WordPressSourceConfig } from "@/schema";
|
||||
|
||||
// Shared HTML -> Markdown converter used for WordPress post bodies.
const md = new TurndownService({
  bulletListMarker: "-",
  headingStyle: "atx",
  hr: "---",
});
|
||||
|
||||
/**
 * Minimal shape of a post returned by the WordPress REST API
 * (`wp-json/wp/v2/posts`) — only the fields requested via `_fields`.
 */
interface WordPressPost {
  /** Canonical public URL of the post. */
  link?: string;
  /** URL slug; used as a title fallback. */
  slug?: string;
  /** Rendered HTML title. */
  title?: { rendered?: string };
  /** Rendered HTML body. */
  content?: { rendered?: string };
  /** Publication date string as returned by the API. */
  date?: string;
  /** Numeric category IDs, resolved to slugs via the categories endpoint. */
  categories?: number[];
}
|
||||
|
||||
/**
|
||||
* Crawler for WordPress sites using the REST API.
|
||||
*/
|
||||
export class WordPressCrawler extends BaseCrawler {
|
||||
readonly source: WordPressSourceConfig;
|
||||
private categoryMap: Map<number, string> = new Map();
|
||||
|
||||
private static readonly POST_QUERY =
|
||||
"_fields=date,slug,link,title.rendered,content.rendered,categories&orderby=date&order=desc";
|
||||
private static readonly CATEGORY_QUERY =
|
||||
"_fields=id,slug,count&orderby=count&order=desc&per_page=100";
|
||||
private static readonly TOTAL_PAGES_HEADER = "x-wp-totalpages";
|
||||
private static readonly TOTAL_POSTS_HEADER = "x-wp-total";
|
||||
|
||||
constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
|
||||
super(settings, options);
|
||||
|
||||
if (!settings.source || settings.source.sourceKind !== "wordpress") {
|
||||
throw new Error("HtmlCrawler requires a source of kind 'wordpress'");
|
||||
}
|
||||
this.source = this.settings.source as WordPressSourceConfig;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch and process WordPress posts.
|
||||
*/
|
||||
async fetch(): Promise<void> {
|
||||
const pageRange = this.settings.pageRange ?? (await this.getPagination());
|
||||
const dateRange = this.settings.dateRange;
|
||||
|
||||
let stop = false;
|
||||
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
|
||||
const endpoint = this.postsEndpoint(page);
|
||||
try {
|
||||
const response = await this.http.get(endpoint);
|
||||
const data = (await response.json()) as unknown;
|
||||
const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
|
||||
if (!Array.isArray(data)) {
|
||||
logger.warn({ page, type: typeof data }, "Unexpected WordPress payload type");
|
||||
}
|
||||
|
||||
for (const entry of articles) {
|
||||
const saved = await this.fetchOne(entry, dateRange);
|
||||
if (saved === null) {
|
||||
stop = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error({ error, page }, "> page %s => [failed]", page);
|
||||
continue;
|
||||
}
|
||||
if (stop) break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch links from a WordPress posts endpoint.
|
||||
* @param url - The posts endpoint URL
|
||||
*/
|
||||
async fetchLinks(url: string) {
|
||||
const response = await this.http.get(url);
|
||||
const data = (await response.json()) as unknown;
|
||||
const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
|
||||
if (!Array.isArray(data)) {
|
||||
logger.warn({ type: typeof data }, "Unexpected WordPress payload type");
|
||||
}
|
||||
return articles;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch and process a single WordPress post.
|
||||
* @param input - Decoded JSON object or raw JSON string
|
||||
* @param dateRange - Optional date range for filtering
|
||||
*/
|
||||
async fetchOne(input: unknown, dateRange?: DateRange | null) {
|
||||
// input can be the decoded JSON object or a raw JSON string
|
||||
let data: WordPressPost | null = null;
|
||||
try {
|
||||
if (typeof input === "string") {
|
||||
data = JSON.parse(input) as WordPressPost;
|
||||
} else if (input && typeof input === "object") {
|
||||
data = input as WordPressPost;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error({ error }, "Failed to decode WordPress payload");
|
||||
throw error;
|
||||
}
|
||||
|
||||
if (!data || typeof data !== "object") {
|
||||
throw new Error("Unexpected WordPress payload type");
|
||||
}
|
||||
|
||||
const link = data.link;
|
||||
if (!link) {
|
||||
logger.error("Skipping WordPress article without link");
|
||||
return null;
|
||||
}
|
||||
|
||||
const titleHtml = data.title?.rendered ?? "";
|
||||
const bodyHtml = data.content?.rendered ?? "";
|
||||
const title = this.textContent(this.parseHtml(titleHtml)) ?? data.slug ?? "Untitled";
|
||||
const body = md.turndown(bodyHtml);
|
||||
const timestamp = this.computeTimestamp(data.date);
|
||||
const categories = await this.mapCategories(data.categories ?? []);
|
||||
|
||||
// date range skip as in HTML crawler
|
||||
if (dateRange) {
|
||||
const { isTimestampInRange } = await import("@/utils");
|
||||
if (!isTimestampInRange(dateRange, timestamp)) {
|
||||
logger.info(
|
||||
{ date: data.date, link, timestamp, title },
|
||||
"Skipping article outside date range",
|
||||
);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
const enriched = await this.enrichWithOpenGraph(
|
||||
{
|
||||
body,
|
||||
categories,
|
||||
link,
|
||||
source: this.source.sourceId,
|
||||
timestamp,
|
||||
title,
|
||||
},
|
||||
link,
|
||||
);
|
||||
|
||||
return await persist(enriched, this.persistors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get pagination info from WordPress API.
|
||||
*/
|
||||
async getPagination(): Promise<PageRange> {
|
||||
try {
|
||||
const url = `${this.baseUrl()}wp-json/wp/v2/posts?_fields=id&per_page=100`;
|
||||
const response = await this.http.get(url);
|
||||
const pages = Number.parseInt(
|
||||
response.headers.get(WordPressCrawler.TOTAL_PAGES_HEADER) ?? "1",
|
||||
10,
|
||||
);
|
||||
const posts = Number.parseInt(
|
||||
response.headers.get(WordPressCrawler.TOTAL_POSTS_HEADER) ?? "0",
|
||||
10,
|
||||
);
|
||||
logger.info({ pages, posts }, "WordPress pagination");
|
||||
const end = Number.isFinite(pages) && pages > 0 ? pages : 1;
|
||||
return { end, start: 1 };
|
||||
} catch {
|
||||
return { end: 1, start: 1 };
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get base URL for WordPress REST API.
|
||||
*/
|
||||
private baseUrl(): string {
|
||||
const base = String(this.source.sourceUrl);
|
||||
return base.endsWith("/") ? base : `${base}/`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct posts endpoint URL for a given page.
|
||||
* @param page - Page number
|
||||
*/
|
||||
postsEndpoint(page: number): string {
|
||||
return `${this.baseUrl()}wp-json/wp/v2/posts?${WordPressCrawler.POST_QUERY}&page=${page}&per_page=100`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch and cache WordPress categories.
|
||||
*/
|
||||
private async fetchCategories(): Promise<void> {
|
||||
const url = `${this.baseUrl()}wp-json/wp/v2/categories?${WordPressCrawler.CATEGORY_QUERY}`;
|
||||
const response = await this.http.get(url);
|
||||
const list = (await response.json()) as Array<{ id: number; slug: string }>;
|
||||
for (const c of list) {
|
||||
this.categoryMap.set(c.id, c.slug);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Map category IDs to slugs.
|
||||
* @param ids - Category IDs
|
||||
*/
|
||||
private async mapCategories(ids: number[]): Promise<string[]> {
|
||||
if (this.categoryMap.size === 0) {
|
||||
try {
|
||||
await this.fetchCategories();
|
||||
} catch (error) {
|
||||
logger.warn({ error }, "Failed to fetch WordPress categories");
|
||||
}
|
||||
}
|
||||
const values: string[] = [];
|
||||
for (const id of [...ids].sort((a, b) => a - b)) {
|
||||
const slug = this.categoryMap.get(id);
|
||||
if (slug && !values.includes(slug)) values.push(slug);
|
||||
}
|
||||
return values;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute UNIX timestamp from WordPress date string.
|
||||
* @param raw - Raw date string
|
||||
*/
|
||||
private computeTimestamp(raw?: string | null): number {
|
||||
if (!raw) return Math.floor(Date.now() / 1000);
|
||||
// Normalize WordPress Z into +00:00 for Date parsing robustness
|
||||
const cleaned = raw.replace("Z", "+00:00");
|
||||
const parsed = Date.parse(cleaned);
|
||||
if (!Number.isNaN(parsed)) return Math.floor(parsed / 1000);
|
||||
return Math.floor(Date.now() / 1000);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,102 @@
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import logger from "@basango/logger";
|
||||
import { Article } from "@/schema";
|
||||
import { countTokens } from "@/utils";
|
||||
|
||||
/** Sink that receives sanitized article records. */
export interface Persistor {
  /** Persist one article; may be synchronous or asynchronous. */
  persist(record: Article): Promise<void> | void;
  /** Flush pending work and release underlying resources. */
  close: () => Promise<void> | void;
}

/** Options for file-based persistors. */
export interface PersistorOptions {
  /** Target directory; created recursively if missing. */
  directory: string;
  /** Source identifier used as the output file's base name. */
  sourceId: string;
  /** File extension including the dot (default ".jsonl"). */
  suffix?: string;
  /** Text encoding for writes (default "utf-8"). */
  encoding?: BufferEncoding;
}
|
||||
|
||||
const sanitize = (text: string): string => {
|
||||
if (!text) return text;
|
||||
|
||||
let s = text.replace(/\u00A0/g, " "); // remove NBSP
|
||||
s = s.replace(" ", " "); // remove other NBSP
|
||||
s = s.replace(" ", " "); // remove NARROW NO-BREAK SPACE
|
||||
s = s.replace(/\u200B/g, ""); // remove ZERO WIDTH SPACE
|
||||
s = s.replace(/\u200C/g, ""); // remove ZERO WIDTH NON-JOINER
|
||||
s = s.replace(/\u200D/g, ""); // remove ZERO WIDTH JOINER
|
||||
s = s.replace(/\uFEFF/g, ""); // remove ZERO WIDTH NO-BREAK SPACE
|
||||
s = s.replace(/\r\n/g, "\n"); // normalize CRLF to LF
|
||||
s = s.replace(/\n{2,}/g, "\n"); // collapse multiple newlines to one
|
||||
// s = s.replace(/[ \t]{2,}/g, " "); // collapse multiple spaces/tabs
|
||||
|
||||
return s.trim();
|
||||
};
|
||||
|
||||
export const persist = async (payload: Article, persistors: Persistor[]): Promise<Article> => {
|
||||
const data = {
|
||||
...payload,
|
||||
body: sanitize(payload.body),
|
||||
categories: payload.categories.map(sanitize),
|
||||
title: sanitize(payload.title),
|
||||
};
|
||||
|
||||
const article = {
|
||||
...data,
|
||||
tokenStatistics: {
|
||||
body: countTokens(payload.body),
|
||||
categories: countTokens(payload.categories.join(",")),
|
||||
excerpt: countTokens(payload.body.substring(0, 200)),
|
||||
title: countTokens(payload.title),
|
||||
},
|
||||
} as Article;
|
||||
|
||||
for (const persistor of persistors) {
|
||||
try {
|
||||
await persistor.persist(article);
|
||||
} catch (error) {
|
||||
logger.error({ error }, "Failed to persist article record");
|
||||
}
|
||||
}
|
||||
|
||||
logger.info({ url: article.link }, "article successfully persisted");
|
||||
return article;
|
||||
};
|
||||
|
||||
export class JsonlPersistor implements Persistor {
|
||||
private readonly filePath: string;
|
||||
private readonly encoding: BufferEncoding;
|
||||
private pending: Promise<void> = Promise.resolve();
|
||||
private closed = false;
|
||||
|
||||
constructor(options: PersistorOptions) {
|
||||
const suffix = options.suffix ?? ".jsonl";
|
||||
this.encoding = options.encoding ?? "utf-8";
|
||||
|
||||
fs.mkdirSync(options.directory, { recursive: true });
|
||||
this.filePath = path.join(options.directory, `${options.sourceId}${suffix}`);
|
||||
|
||||
if (!fs.existsSync(this.filePath)) {
|
||||
fs.writeFileSync(this.filePath, "", { encoding: this.encoding });
|
||||
}
|
||||
}
|
||||
|
||||
persist(record: Article): Promise<void> {
|
||||
if (this.closed) {
|
||||
return Promise.reject(new Error("Persistor has been closed"));
|
||||
}
|
||||
|
||||
const payload = `${JSON.stringify(record)}\n`;
|
||||
|
||||
this.pending = this.pending.then(async () => {
|
||||
fs.appendFileSync(this.filePath, payload, { encoding: this.encoding });
|
||||
});
|
||||
|
||||
return this.pending;
|
||||
}
|
||||
|
||||
async close(): Promise<void> {
|
||||
this.closed = true;
|
||||
await this.pending;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
import logger from "@basango/logger";
|
||||
import {
|
||||
CrawlingOptions,
|
||||
closePersistors,
|
||||
createPersistors,
|
||||
resolveCrawlerConfig,
|
||||
} from "@/process/crawler";
|
||||
import { HtmlCrawler } from "@/process/parsers/html";
|
||||
import { WordPressCrawler } from "@/process/parsers/wordpress";
|
||||
import { resolveSourceConfig } from "@/utils";
|
||||
|
||||
export const runSyncCrawl = async (options: CrawlingOptions): Promise<void> => {
|
||||
const source = resolveSourceConfig(options.sourceId);
|
||||
const settings = resolveCrawlerConfig(source, options);
|
||||
const persistors = createPersistors(source);
|
||||
|
||||
const crawler =
|
||||
source.sourceKind === "wordpress"
|
||||
? new WordPressCrawler(settings, { persistors })
|
||||
: new HtmlCrawler(settings, { persistors });
|
||||
|
||||
try {
|
||||
await crawler.fetch();
|
||||
} finally {
|
||||
await closePersistors(persistors);
|
||||
}
|
||||
|
||||
logger.info({ ...options }, "Synchronous crawl completed");
|
||||
};
|
||||
@@ -0,0 +1,131 @@
|
||||
import { z } from "zod";
|
||||
|
||||
// Direction in which an incremental update walks the source's pages.
export const UpdateDirectionSchema = z.enum(["forward", "backward"]);
// Supported source backends: WordPress REST API or generic HTML scraping.
export const SourceKindSchema = z.enum(["wordpress", "html"]);

// Inclusive UNIX-timestamp window used to filter articles by date.
export const DateRangeSchema = z
  .object({
    end: z.number().int(),
    start: z.number().int(),
  })
  .superRefine((value, ctx) => {
    // Zero is rejected: it almost certainly means "unset" rather than epoch.
    if (value.start === 0 || value.end === 0) {
      ctx.addIssue({
        code: "custom",
        message: "Timestamp cannot be zero",
      });
    }
    if (value.end < value.start) {
      ctx.addIssue({
        code: "custom",
        message: "End timestamp must be greater than or equal to start",
      });
    }
  });
|
||||
|
||||
// Inclusive page-number window consumed by the crawlers.
export const PageRangeSchema = z
  .object({
    end: z.number().int().min(0),
    start: z.number().int().min(0),
  })
  .superRefine((value, ctx) => {
    if (value.end < value.start) {
      ctx.addIssue({
        code: "custom",
        message: "End page must be greater than or equal to start page",
      });
    }
  });

// Parses a "start:end" page spec (e.g. "1:10") into a numeric range.
export const PageRangeSpecSchema = z
  .string()
  .regex(/^[0-9]+:[0-9]+$/, "Invalid page range format. Use start:end")
  .transform((spec) => {
    const [startText, endText] = spec.split(":");
    return {
      // The regex guarantees both halves are digit-only, so parseInt is safe.
      end: Number.parseInt(String(endText), 10),
      start: Number.parseInt(String(startText), 10),
    };
  });

// Splits a "start:end" date spec into its raw halves; actual date parsing
// happens downstream.
// NOTE(review): split(":") splits at the FIRST colon, so date values that
// themselves contain a colon (e.g. ISO timestamps with a time component)
// would be truncated — confirm the expected input format.
export const DateRangeSpecSchema = z
  .string()
  .regex(/.+:.+/, "Expected start:end format")
  .transform((spec) => {
    const [startRaw, endRaw] = spec.split(":");
    return { endRaw: String(endRaw), startRaw: String(startRaw) };
  });
|
||||
|
||||
// How to interpret a source's raw date strings: an optional regex cleanup
// (pattern -> replacement) followed by a date-fns format string.
export const SourceDateSchema = z.object({
  format: z.string().default("yyyy-LL-dd HH:mm"),
  pattern: z.string().nullable().optional(),
  replacement: z.string().nullable().optional(),
});

// Fields common to every source kind.
const BaseSourceSchema = z.object({
  categories: z.array(z.string()).default([]),
  // When true, listing pages only link to articles and the crawler performs
  // a second request per article for the full content.
  requiresDetails: z.boolean().default(false),
  requiresRateLimit: z.boolean().default(false),
  sourceDate: SourceDateSchema,
  sourceId: z.string(),
  sourceKind: SourceKindSchema,
  sourceUrl: z.url(),
  supportsCategories: z.boolean().default(false),
});
|
||||
|
||||
/**
 * Configuration for an HTML (scraped) source: CSS selectors describing
 * where listings, article fields and pagination live in the markup.
 */
export const HtmlSourceConfigSchema = BaseSourceSchema.extend({
  // Template for building listing-page URLs — presumably contains a page
  // placeholder; confirm against the crawler implementation.
  paginationTemplate: z.string(),
  sourceKind: z.literal("html"),
  // CSS selectors used to extract article data from fetched pages.
  sourceSelectors: z.object({
    articleBody: z.string(),
    articleCategories: z.string().optional(),
    articleDate: z.string(),
    articleLink: z.string(),
    articles: z.string(),
    articleTitle: z.string(),
    pagination: z.string().default("ul.pagination > li a"),
  }),
});
|
||||
|
||||
export const WordPressSourceConfigSchema = BaseSourceSchema.extend({
|
||||
sourceDate: SourceDateSchema.default(SourceDateSchema.parse({ format: "yyyy-LL-dd'T'HH:mm:ss" })),
|
||||
sourceKind: z.literal("wordpress"),
|
||||
});
|
||||
|
||||
/**
 * Page-level metadata (title/description/image/url) extracted alongside
 * an article; every field is optional.
 */
export const ArticleMetadataSchema = z.object({
  description: z.string().optional(),
  image: z.string().optional(),
  title: z.string().optional(),
  url: z.url().optional(),
});
|
||||
|
||||
/**
 * Per-field token counts for an article; every count defaults to 0.
 */
export const ArticleTokenStatisticsSchema = z.object({
  body: z.number().int().nonnegative().default(0),
  categories: z.number().int().nonnegative().default(0),
  excerpt: z.number().int().nonnegative().default(0),
  title: z.number().int().nonnegative().default(0),
});
|
||||
|
||||
/**
 * A crawled article as produced by the pipeline.
 */
export const ArticleSchema = z.object({
  body: z.string(),
  categories: z.array(z.string()).default([]),
  link: z.url(),
  metadata: ArticleMetadataSchema.optional(),
  // Identifier of the originating source (matches a configured sourceId).
  source: z.string(),
  // Publication time as a unix timestamp — presumably seconds, matching
  // the DateRange helpers; confirm at write sites.
  timestamp: z.number().int(),
  title: z.string(),
  tokenStatistics: ArticleTokenStatisticsSchema.optional(),
});
|
||||
|
||||
/** Inferred TypeScript types for the schemas above. */
export type ArticleMetadata = z.infer<typeof ArticleMetadataSchema>;
export type Article = z.infer<typeof ArticleSchema>;
export type DateRange = z.infer<typeof DateRangeSchema>;
export type PageRange = z.infer<typeof PageRangeSchema>;
export type HtmlSourceConfig = z.infer<typeof HtmlSourceConfigSchema>;
export type WordPressSourceConfig = z.infer<typeof WordPressSourceConfigSchema>;
/** Union of every supported source configuration variant. */
export type AnySourceConfig = HtmlSourceConfig | WordPressSourceConfig;

/** Options controlling how a date-range spec string is parsed. */
export interface CreateDateRangeOptions {
  /** Date format for both halves of the spec; falls back to the project default. */
  format?: string;
  /** Character separating the two halves; defaults to ":". */
  separator?: string;
}
|
||||
@@ -0,0 +1,22 @@
|
||||
import { logger } from "@basango/logger";
|
||||
import { runSyncCrawl } from "@/process/sync/tasks";
|
||||
import { CRAWLING_USAGE, parseCrawlingCliArgs } from "@/scripts/utils";
|
||||
|
||||
const main = async (): Promise<void> => {
|
||||
const options = parseCrawlingCliArgs();
|
||||
|
||||
if (options.sourceId === undefined) {
|
||||
console.log(CRAWLING_USAGE);
|
||||
process.exitCode = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
await runSyncCrawl({ ...options });
|
||||
} catch (error) {
|
||||
logger.error({ error }, "Synchronous crawl failed");
|
||||
process.exitCode = 1;
|
||||
}
|
||||
};
|
||||
|
||||
void main();
|
||||
@@ -0,0 +1,24 @@
|
||||
import { logger } from "@basango/logger";
|
||||
import { scheduleAsyncCrawl } from "@/process/async/tasks";
|
||||
import { CRAWLING_USAGE, parseCrawlingCliArgs } from "@/scripts/utils";
|
||||
|
||||
const main = async (): Promise<void> => {
|
||||
const options = parseCrawlingCliArgs();
|
||||
|
||||
if (options.sourceId === undefined) {
|
||||
console.log(CRAWLING_USAGE);
|
||||
process.exitCode = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const id = await scheduleAsyncCrawl({ ...options });
|
||||
|
||||
logger.info({ id, options }, "Scheduled asynchronous crawl job");
|
||||
} catch (error) {
|
||||
logger.error({ error }, "Failed to schedule crawl job");
|
||||
process.exitCode = 1;
|
||||
}
|
||||
};
|
||||
|
||||
void main();
|
||||
@@ -0,0 +1,39 @@
|
||||
import { parseArgs } from "node:util";
|
||||
import { CrawlingOptions } from "@/process/crawler";
|
||||
|
||||
/** Flags recognised by the worker entrypoint CLI. */
interface WorkerCliOptions {
  // Queue short-names to listen on; when omitted the worker falls back to
  // its own default set — confirm in startWorker.
  queue?: string[];
}
|
||||
|
||||
// CLI usage text shared by the sync and async crawl entrypoints.
export const CRAWLING_USAGE = `
Usage: bun run crawl:[async|sync] -- --sourceId <id> [options]

Options:
  --pageRange <range> Optional page range filter (e.g. 1:5)
  --dateRange <range> Optional date range filter (e.g. 2024-01-01:2024-01-31)
  --category <slug> Optional category to crawl
  -h, --help Show this message
`;
|
||||
|
||||
export const parseWorkerCliArgs = (): WorkerCliOptions => {
|
||||
const { values } = parseArgs({
|
||||
options: {
|
||||
queue: { multiple: true, short: "q", type: "string" },
|
||||
},
|
||||
});
|
||||
|
||||
return values as WorkerCliOptions;
|
||||
};
|
||||
|
||||
export const parseCrawlingCliArgs = (): CrawlingOptions => {
|
||||
const { values } = parseArgs({
|
||||
options: {
|
||||
category: { type: "string" },
|
||||
dateRange: { type: "string" },
|
||||
pageRange: { type: "string" },
|
||||
sourceId: { type: "string" },
|
||||
},
|
||||
});
|
||||
|
||||
return values as CrawlingOptions;
|
||||
};
|
||||
@@ -0,0 +1,35 @@
|
||||
import { logger } from "@basango/logger";
|
||||
|
||||
import { createQueueManager } from "@/process/async/queue";
|
||||
import { startWorker } from "@/process/async/worker";
|
||||
import { parseWorkerCliArgs } from "@/scripts/utils";
|
||||
|
||||
const main = async (): Promise<void> => {
|
||||
const options = parseWorkerCliArgs();
|
||||
|
||||
const manager = createQueueManager();
|
||||
const queues = options.queue?.length
|
||||
? options.queue.map((name) => manager.queueName(name))
|
||||
: undefined;
|
||||
|
||||
const handle = startWorker({
|
||||
queueManager: manager,
|
||||
queueNames: queues,
|
||||
});
|
||||
|
||||
const shutdown = async (signal: NodeJS.Signals) => {
|
||||
logger.info({ signal }, "Received shutdown signal, draining workers");
|
||||
try {
|
||||
await handle.close();
|
||||
} finally {
|
||||
await manager.close();
|
||||
process.exit(0);
|
||||
}
|
||||
};
|
||||
|
||||
process.once("SIGINT", (signal) => void shutdown(signal));
|
||||
process.once("SIGTERM", (signal) => void shutdown(signal));
|
||||
logger.info({ queueNames: queues }, "Crawler workers started");
|
||||
};
|
||||
|
||||
void main();
|
||||
@@ -0,0 +1,163 @@
|
||||
import { format, getUnixTime, isMatch, parse } from "date-fns";
|
||||
import type { RedisOptions } from "ioredis";
|
||||
import { get_encoding, TiktokenEncoding } from "tiktoken";
|
||||
import { config } from "@/config";
|
||||
import { DEFAULT_DATE_FORMAT } from "@/constants";
|
||||
import {
|
||||
AnySourceConfig,
|
||||
CreateDateRangeOptions,
|
||||
DateRange,
|
||||
DateRangeSchema,
|
||||
DateRangeSpecSchema,
|
||||
HtmlSourceConfig,
|
||||
PageRange,
|
||||
PageRangeSchema,
|
||||
PageRangeSpecSchema,
|
||||
WordPressSourceConfig,
|
||||
} from "@/schema";
|
||||
|
||||
/**
|
||||
* Resolve a source configuration by its ID.
|
||||
* @param id - The source ID
|
||||
*/
|
||||
export const resolveSourceConfig = (id: string): AnySourceConfig => {
|
||||
const source =
|
||||
config.sources.html.find((s: HtmlSourceConfig) => s.sourceId === id) ||
|
||||
config.sources.wordpress.find((s: WordPressSourceConfig) => s.sourceId === id);
|
||||
|
||||
if (source === undefined) {
|
||||
throw new Error(`Source '${id}' not found in configuration`);
|
||||
}
|
||||
|
||||
return source;
|
||||
};
|
||||
|
||||
/**
|
||||
* Parse a Redis URL into RedisOptions.
|
||||
* @param url - The Redis URL (e.g., "redis://:password@localhost:6379/0")
|
||||
*/
|
||||
export const parseRedisUrl = (url: string): RedisOptions => {
|
||||
if (!url.startsWith("redis://")) {
|
||||
return {};
|
||||
}
|
||||
const parsed = new URL(url);
|
||||
return {
|
||||
db: Number(parsed.pathname?.replace("/", "") || 0),
|
||||
host: parsed.hostname,
|
||||
password: parsed.password || undefined,
|
||||
port: Number(parsed.port || 6379),
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* Parse a date string using the specified format.
|
||||
* @param value - The date string to parse
|
||||
* @param format - The date format
|
||||
*/
|
||||
const parseDate = (value: string, format: string): Date => {
|
||||
if (!isMatch(value, format)) {
|
||||
throw new Error(`Invalid date '${value}' for format '${format}'`);
|
||||
}
|
||||
const parsed = parse(value, format, new Date());
|
||||
if (Number.isNaN(parsed.getTime())) {
|
||||
throw new Error(`Invalid date '${value}' for format '${format}'`);
|
||||
}
|
||||
return parsed;
|
||||
};
|
||||
|
||||
/**
|
||||
* Count the number of tokens in the given text using the specified encoding.
|
||||
* @param text - The input text
|
||||
* @param encoding - The token encoding (default: "cl100k_base")
|
||||
*/
|
||||
export const countTokens = (text: string, encoding: TiktokenEncoding = "cl100k_base"): number => {
|
||||
try {
|
||||
const encoder = get_encoding(encoding);
|
||||
const tokens = encoder.encode(text);
|
||||
encoder.free();
|
||||
return tokens.length;
|
||||
} catch {
|
||||
return text.length;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Create a page range from a string specification.
|
||||
* @param spec - The page range specification (e.g., "1:10")
|
||||
*/
|
||||
export const createPageRange = (spec: string | undefined): PageRange | undefined => {
|
||||
if (!spec) return undefined;
|
||||
const parsed = PageRangeSpecSchema.parse(spec);
|
||||
return PageRangeSchema.parse(parsed);
|
||||
};
|
||||
|
||||
/**
|
||||
* Create a date range from a string specification.
|
||||
* @param spec - The date range specification (e.g., "2023-01-01:2023-12-31")
|
||||
* @param options - Options for date range creation
|
||||
*/
|
||||
export const createDateRange = (
|
||||
spec: string | undefined,
|
||||
options: CreateDateRangeOptions = {},
|
||||
): DateRange | undefined => {
|
||||
if (!spec) return undefined;
|
||||
const { format = DEFAULT_DATE_FORMAT, separator = ":" } = options;
|
||||
if (!separator) {
|
||||
throw new Error("Separator cannot be empty");
|
||||
}
|
||||
|
||||
const normalized = spec.replace(separator, ":");
|
||||
const parsedSpec = DateRangeSpecSchema.parse(normalized);
|
||||
|
||||
const startDate = parseDate(parsedSpec.startRaw, format);
|
||||
const endDate = parseDate(parsedSpec.endRaw, format);
|
||||
|
||||
const range = {
|
||||
end: getUnixTime(endDate),
|
||||
start: getUnixTime(startDate),
|
||||
};
|
||||
|
||||
return DateRangeSchema.parse(range);
|
||||
};
|
||||
|
||||
/**
|
||||
* Format a date range into a string representation.
|
||||
* @param range - The date range
|
||||
* @param fmt - The date format (default: DEFAULT_DATE_FORMAT)
|
||||
*/
|
||||
export const formatDateRange = (range: DateRange, fmt = DEFAULT_DATE_FORMAT): string => {
|
||||
const start = format(new Date(range.start * 1000), fmt);
|
||||
const end = format(new Date(range.end * 1000), fmt);
|
||||
return `${start}:${end}`;
|
||||
};
|
||||
|
||||
/**
|
||||
* Format a page range into a string representation.
|
||||
* @param range - The page range
|
||||
*/
|
||||
export const formatPageRange = (range: PageRange): string => {
|
||||
return `${range.start}:${range.end}`;
|
||||
};
|
||||
|
||||
/**
|
||||
* Check if a timestamp is within a given date range.
|
||||
* @param range - The date range
|
||||
* @param timestamp - The timestamp to check
|
||||
*/
|
||||
export const isTimestampInRange = (range: DateRange, timestamp: number): boolean => {
|
||||
return range.start <= timestamp && timestamp <= range.end;
|
||||
};
|
||||
|
||||
/**
|
||||
* Convert a relative URL to an absolute URL based on the base URL.
|
||||
* @param base - The base URL
|
||||
* @param href - The relative or absolute URL
|
||||
*/
|
||||
export const createAbsoluteUrl = (base: string, href: string): string => {
|
||||
try {
|
||||
// new URL handles relative paths with base
|
||||
return new URL(href, base.endsWith("/") ? base : `${base}/`).toString();
|
||||
} catch {
|
||||
return href;
|
||||
}
|
||||
};
|
||||
Reference in New Issue
Block a user