Files
basango/apps/crawler/src/process/parsers/base.ts
T

109 lines
3.2 KiB
TypeScript

import { HTMLElement, parse as parseHtml } from "node-html-parser";
import { FetchCrawlerConfig, config } from "#crawler/config";
import { SyncHttpClient } from "#crawler/http/http-client";
import { OpenGraph } from "#crawler/http/open-graph";
import type { Persistor } from "#crawler/process/persistence";
import { AnySourceConfig, Article } from "#crawler/schema";
/**
 * Optional collaborators supplied when constructing a crawler.
 */
export interface CrawlerOptions {
  /**
   * Persistors handed to the crawler. When omitted, `BaseCrawler` stores an
   * empty list instead.
   */
  persistors?: Array<Persistor>;
}
/**
 * Common foundation for crawlers that are bound to a single source.
 *
 * Holds the crawler settings, the bound source, an HTTP client, an Open Graph
 * reader and the list of persistors, and offers small HTML helpers that
 * subclasses can reuse while implementing {@link BaseCrawler.fetch}.
 */
export abstract class BaseCrawler {
  protected readonly settings: FetchCrawlerConfig;
  protected readonly source: AnySourceConfig;
  protected readonly http: SyncHttpClient;
  protected readonly persistors: Persistor[];
  protected readonly openGraph: OpenGraph;

  /**
   * @param settings - Crawler settings; must carry a bound `source`
   * @param options - Optional collaborators (persistors)
   * @throws Error when `settings.source` is missing
   */
  protected constructor(settings: FetchCrawlerConfig, options: CrawlerOptions = {}) {
    const boundSource = settings.source;
    if (!boundSource) {
      throw new Error("Crawler requires a bound source");
    }

    const { persistors } = options;

    // The HTTP client is configured from the global fetch configuration,
    // not from the per-crawler settings.
    this.http = new SyncHttpClient(config.fetch.client);
    this.persistors = persistors ?? [];
    this.openGraph = new OpenGraph();
    this.settings = settings;
    this.source = boundSource as AnySourceConfig;
  }

  /**
   * Entry point implemented by concrete crawlers: fetch and process the
   * articles of the bound source.
   */
  abstract fetch(): Promise<void> | void;

  /**
   * Download the document behind `url` and resolve with its body as text.
   * @param url - The URL to crawl
   * @returns The response body as a string
   */
  async crawl(url: string): Promise<string> {
    const page = await this.http.get(url);
    const body = await page.text();
    return body;
  }

  /**
   * Read the trimmed text of an HTML node.
   * @param node - The HTML node, possibly absent
   * @returns The trimmed text, or `null` when the node is absent or its text
   * is empty after trimming
   */
  protected textContent(node: HTMLElement | null | undefined): string | null {
    if (!node) {
      return null;
    }

    // Prefer innerText; fall back to text only when innerText is nullish.
    const raw = node.innerText ?? node.text;
    const trimmed = raw.trim();
    return trimmed.length > 0 ? trimmed : null;
  }

  /**
   * Look up the first element under `root` that matches `selector`.
   * @param root - The root HTML element
   * @param selector - The CSS selector; an empty or missing selector yields `null`
   * @returns The first match, or `null` when nothing matches or the lookup throws
   */
  protected extractFirst(root: HTMLElement, selector?: string | null): HTMLElement | null {
    if (selector) {
      try {
        return root.querySelector(selector) ?? null;
      } catch {
        // Any error raised by the lookup is treated as "no match".
      }
    }
    return null;
  }

  /**
   * Look up every element under `root` that matches `selector`.
   * @param root - The root HTML element
   * @param selector - The CSS selector; an empty or missing selector yields `[]`
   * @returns The matches, or an empty array when the lookup throws
   */
  protected extractAll(root: HTMLElement, selector?: string | null): HTMLElement[] {
    if (selector) {
      try {
        return root.querySelectorAll(selector);
      } catch {
        // Any error raised by the lookup is treated as "no matches".
      }
    }
    return [];
  }

  /**
   * Turn an HTML string into a parsed element tree.
   * @param html - The HTML string
   * @returns The root element produced by the parser
   */
  protected parseHtml(html: string): HTMLElement {
    const root = parseHtml(html);
    return root as unknown as HTMLElement;
  }

  /**
   * Return a copy of `record` whose `metadata` holds the Open Graph data read
   * from `url`. The `metadata` field is set to `undefined` when no URL is
   * given or when reading the Open Graph data fails; the input record itself
   * is not modified.
   * @param record - The article record
   * @param url - The URL to fetch Open Graph data from
   * @returns A new record with the `metadata` field overwritten
   */
  protected async enrichWithOpenGraph(record: Article, url?: string): Promise<Article> {
    try {
      if (!url) {
        return { ...record, metadata: undefined };
      }
      const metadata = await this.openGraph.consumeUrl(url);
      return { ...record, metadata };
    } catch {
      // Best effort: a failed Open Graph lookup must not fail the crawl.
      return { ...record, metadata: undefined };
    }
  }
}