109 lines
3.2 KiB
TypeScript
109 lines
3.2 KiB
TypeScript
import { HTMLElement, parse as parseHtml } from "node-html-parser";
|
|
|
|
import { FetchCrawlerConfig, config } from "#crawler/config";
|
|
import { SyncHttpClient } from "#crawler/http/http-client";
|
|
import { OpenGraph } from "#crawler/http/open-graph";
|
|
import type { Persistor } from "#crawler/process/persistence";
|
|
import { AnySourceConfig, Article } from "#crawler/schema";
|
|
|
|
/**
 * Optional collaborators injected into a {@link BaseCrawler}.
 */
export interface CrawlerOptions {
  /** Persistence sinks the crawler writes results to; defaults to an empty list. */
  persistors?: Persistor[];
}
|
|
|
|
export abstract class BaseCrawler {
|
|
protected readonly settings: FetchCrawlerConfig;
|
|
protected readonly source: AnySourceConfig;
|
|
protected readonly http: SyncHttpClient;
|
|
protected readonly persistors: Persistor[];
|
|
protected readonly openGraph: OpenGraph;
|
|
|
|
protected constructor(settings: FetchCrawlerConfig, options: CrawlerOptions = {}) {
|
|
if (!settings.source) {
|
|
throw new Error("Crawler requires a bound source");
|
|
}
|
|
|
|
this.http = new SyncHttpClient(config.fetch.client);
|
|
this.persistors = options.persistors ?? [];
|
|
this.openGraph = new OpenGraph();
|
|
|
|
this.settings = settings;
|
|
this.source = settings.source as AnySourceConfig;
|
|
}
|
|
|
|
/**
|
|
* Fetch and process articles from the source.
|
|
*/
|
|
abstract fetch(): Promise<void> | void;
|
|
|
|
/**
|
|
* Crawl the given URL and return the HTML content as a string.
|
|
* @param url - The URL to crawl
|
|
*/
|
|
async crawl(url: string): Promise<string> {
|
|
const response = await this.http.get(url);
|
|
return await response.text();
|
|
}
|
|
|
|
/**
|
|
* Extract text content from an HTML node.
|
|
* @param node - The HTML node
|
|
*/
|
|
protected textContent(node: HTMLElement | null | undefined): string | null {
|
|
if (!node) return null;
|
|
// innerText keeps spacing similar to browser rendering
|
|
const value = node.innerText ?? node.text;
|
|
const text = value.trim();
|
|
return text.length ? text : null;
|
|
}
|
|
|
|
/**
|
|
* Extract the first matching element from the root using the selector.
|
|
* @param root - The root HTML element
|
|
* @param selector - The CSS selector
|
|
*/
|
|
protected extractFirst(root: HTMLElement, selector?: string | null): HTMLElement | null {
|
|
if (!selector) return null;
|
|
try {
|
|
return root.querySelector(selector) ?? null;
|
|
} catch {
|
|
return null;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Extract all matching elements from the root using the selector.
|
|
* @param root - The root HTML element
|
|
* @param selector - The CSS selector
|
|
*/
|
|
protected extractAll(root: HTMLElement, selector?: string | null): HTMLElement[] {
|
|
if (!selector) return [];
|
|
try {
|
|
return root.querySelectorAll(selector);
|
|
} catch {
|
|
return [];
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Parse HTML string into an HTMLElement.
|
|
* @param html - The HTML string
|
|
*/
|
|
protected parseHtml(html: string): HTMLElement {
|
|
return parseHtml(html) as unknown as HTMLElement;
|
|
}
|
|
|
|
/**
|
|
* Enrich the record with Open Graph metadata from the given URL.
|
|
* @param record - The article record
|
|
* @param url - The URL to fetch Open Graph data from
|
|
*/
|
|
protected async enrichWithOpenGraph(record: Article, url?: string): Promise<Article> {
|
|
try {
|
|
const metadata = url ? await this.openGraph.consumeUrl(url) : undefined;
|
|
return { ...record, metadata };
|
|
} catch {
|
|
return { ...record, metadata: undefined };
|
|
}
|
|
}
|
|
}
|