import { parse } from "node-html-parser";

import { config } from "#crawler/config";
import { OPEN_GRAPH_USER_AGENT } from "#crawler/constants";
import { SyncHttpClient } from "#crawler/http/http-client";
import { UserAgents } from "#crawler/http/user-agent";
import { ArticleMetadata } from "#crawler/schema";
import { createAbsoluteUrl } from "#crawler/utils";

/**
 * Picks the first non-blank value from the provided candidates.
 *
 * Candidates are checked in order; `null`, `undefined`, empty and
 * whitespace-only strings are skipped.
 *
 * @param values - Candidate strings, in priority order
 * @returns The first non-blank candidate with surrounding whitespace trimmed,
 * or `undefined` when every candidate is blank
 */
const pick = (values: ReadonlyArray<string | null | undefined>): string | undefined => {
  for (const value of values) {
    const trimmed = value?.trim();
    if (trimmed) {
      return trimmed;
    }
  }
  return undefined;
};

/**
 * Extracts the `content` attribute of the first `<meta>` tag whose `property`
 * or `name` attribute equals the given key.
 *
 * @param root - The parsed HTML document root
 * @param property - The `property`/`name` value of the meta tag to look up
 * @returns The `content` attribute value, or `null` when no such tag exists or
 * the tag has no `content` attribute
 */
const extract = (root: ReturnType<typeof parse>, property: string): string | null => {
  // `property` is interpolated verbatim into the selector; callers in this
  // module only pass fixed literal keys, so no escaping is performed.
  const selector = `meta[property='${property}'], meta[name='${property}']`;
  return root.querySelector(selector)?.getAttribute("content") ?? null;
};

/**
 * OpenGraph consumer for extracting Open Graph metadata from HTML pages.
 * Pages are fetched through a `SyncHttpClient` configured from
 * `config.fetch.client`.
 *
 * @author Bernard Ngandu
 */
export class OpenGraph {
  /** HTTP client used to fetch pages; only its `get` method is needed. */
  private readonly client: Pick<SyncHttpClient, "get">;

  /**
   * Builds the HTTP client, sending the value of `provider.og()` as the
   * default `User-Agent` header and registering the same provider on the
   * client.
   */
  constructor() {
    const settings = config.fetch.client;
    const provider = new UserAgents(true, OPEN_GRAPH_USER_AGENT);

    this.client = new SyncHttpClient(settings, {
      defaultHeaders: { "User-Agent": provider.og() },
      userAgentProvider: provider,
    });
  }

  /**
   * Consume a URL and extract Open Graph metadata.
   *
   * This is a best-effort operation: any error raised while fetching, reading
   * or parsing the page results in `undefined` instead of a rejection.
   *
   * @param url - The URL to fetch and parse
   * @returns The extracted metadata, or `undefined` when the page could not be
   * fetched or yielded no metadata
   */
  async consumeUrl(url: string): Promise<ArticleMetadata | undefined> {
    try {
      const response = await this.client.get(url);
      const html = await response.text();
      return OpenGraph.consumeHtml(html, url);
    } catch {
      // Deliberately swallowed: callers treat `undefined` as "no metadata".
      return undefined;
    }
  }

  /**
   * Consume HTML content and extract Open Graph metadata.
   *
   * Each field prefers its Open Graph tag and falls back to a plain HTML
   * equivalent:
   * - title: `og:title`, then the `<title>` element
   * - description: `og:description`, then the `description` meta tag
   * - image: `og:image`, then the `src` of the first `<img>` element
   * - url: `og:url`, then `<link rel="canonical">`, then the `url` argument
   *
   * @param html - HTML content as a string
   * @param url - URL of the page; used as the last-resort canonical URL and as
   * the base when resolving image and canonical references
   * @returns The extracted metadata, or `undefined` when `html` is empty or no
   * field could be determined
   */
  static consumeHtml(html: string, url: string): ArticleMetadata | undefined {
    if (!html) {
      return undefined;
    }

    const root = parse(html);

    const title = pick([extract(root, "og:title"), root.querySelector("title")?.text]);
    const description = pick([extract(root, "og:description"), extract(root, "description")]);
    const image = pick([
      extract(root, "og:image"),
      root.querySelector("img")?.getAttribute("src"),
    ]);
    const canonical = pick([
      extract(root, "og:url"),
      root.querySelector("link[rel='canonical']")?.getAttribute("href"),
      url,
    ]);

    if (!title && !description && !image && !canonical) {
      return undefined;
    }

    // Only resolve references that actually exist. Under standard URL
    // resolution an empty reference resolves to the base itself, so passing ""
    // for a missing image could report the page URL as the image.
    // NOTE(review): createAbsoluteUrl is defined elsewhere — confirm how it
    // treats an empty reference.
    const toAbsolute = (reference: string | undefined): string | undefined =>
      reference ? createAbsoluteUrl(url, reference) || undefined : undefined;

    return {
      description,
      image: toAbsolute(image),
      title,
      url: toAbsolute(canonical),
    };
  }
}