// basango/apps/crawler/src/http/open-graph.ts

import { parse } from "node-html-parser";

import { config } from "#crawler/config";
import { OPEN_GRAPH_USER_AGENT } from "#crawler/constants";
import { SyncHttpClient } from "#crawler/http/http-client";
import { UserAgents } from "#crawler/http/user-agent";
import { ArticleMetadata } from "#crawler/schema";
import { createAbsoluteUrl } from "#crawler/utils";

/**
 * Returns the first candidate that still has content after trimming.
 *
 * `null`, `undefined`, empty and whitespace-only entries are skipped.
 *
 * @param values - Candidate strings, in priority order
 * @returns The trimmed first usable candidate, or `undefined` when none qualifies
 */
const pick = (values: Array<string | null | undefined>): string | undefined => {
  // `find` stops at the first match, so later candidates are never inspected.
  const winner = values.find((candidate) => Boolean(candidate?.trim()));
  return winner?.trim();
};

/**
 * Reads the `content` attribute of the first `<meta>` tag whose `property`
 * or `name` attribute equals the given key.
 *
 * @param root - Parsed HTML document to search
 * @param property - Value to match against the `property` / `name` attribute
 * @returns The `content` attribute value, or `null` when no tag matches or
 *          the matching tag has no `content` attribute
 */
const extract = (root: ReturnType<typeof parse>, property: string): string | null => {
  const alternatives = [`meta[property='${property}']`, `meta[name='${property}']`];
  const meta = root.querySelector(alternatives.join(", "));
  return meta?.getAttribute("content") ?? null;
};

/**
 * OpenGraph consumer for extracting Open Graph metadata from HTML pages.
 * Pages are fetched through a `SyncHttpClient` built from
 * `config.fetch.client` and sent with the Open Graph user agent.
 *
 * @author Bernard Ngandu <bernard@devscast.tech>
 */
export class OpenGraph {
  private readonly client: Pick<SyncHttpClient, "get">;

  constructor() {
    const settings = config.fetch.client;
    const provider = new UserAgents(true, OPEN_GRAPH_USER_AGENT);

    this.client = new SyncHttpClient(settings, {
      defaultHeaders: { "User-Agent": provider.og() },
      userAgentProvider: provider,
    });
  }

  /**
   * Fetches a URL and extracts Open Graph metadata from the response body.
   *
   * @param url - The URL to fetch and parse
   * @returns The extracted metadata, or `undefined` when the request, the
   *          body read, or the parsing fails, or when nothing could be extracted
   */
  async consumeUrl(url: string): Promise<ArticleMetadata | undefined> {
    try {
      const response = await this.client.get(url);
      const html = await response.text();
      return OpenGraph.consumeHtml(html, url);
    } catch {
      // Deliberately best-effort: any failure is reported to the caller
      // as "no metadata" rather than as an exception.
      return undefined;
    }
  }

  /**
   * Extracts Open Graph metadata from an HTML document.
   *
   * Each field falls back to a plain-HTML equivalent when the Open Graph tag
   * is missing: `<title>` for the title, `meta[name=description]` for the
   * description, the first `<img>` for the image, and
   * `link[rel=canonical]` then `url` itself for the canonical URL.
   *
   * @param html - HTML content as a string
   * @param url - URL of the page; used as the base for resolving relative
   *              links and as the last-resort canonical URL
   * @returns The extracted metadata, or `undefined` when `html` is empty or
   *          no field could be determined
   */
  static consumeHtml(html: string, url: string): ArticleMetadata | undefined {
    if (!html) {
      return undefined;
    }

    const root = parse(html);
    const title = pick([extract(root, "og:title"), root.querySelector("title")?.text]);
    const description = pick([extract(root, "og:description"), extract(root, "description")]);
    const image = pick([
      extract(root, "og:image"),
      root.querySelector("img")?.getAttribute("src") ?? null,
    ]);
    const canonical = pick([
      extract(root, "og:url"),
      root.querySelector("link[rel='canonical']")?.getAttribute("href") ?? null,
      url,
    ]);

    // `canonical` falls back to `url`, so this only triggers when the page
    // yields nothing AND `url` is blank.
    if (!title && !description && !image && !canonical) {
      return undefined;
    }

    return {
      description,
      image: OpenGraph.resolveUrl(url, image),
      title,
      url: OpenGraph.resolveUrl(url, canonical),
    };
  }

  /**
   * Resolves an extracted link against the page URL.
   *
   * A missing link is never passed to `createAbsoluteUrl`: resolving an empty
   * reference against a base presumably yields the base itself (standard URL
   * resolution does), which would report the page URL as e.g. the image.
   *
   * @param base - URL of the page the link was found on
   * @param href - Extracted link, possibly relative; `undefined` when absent
   * @returns The value produced by `createAbsoluteUrl`, or `undefined` when
   *          `href` is absent or the resolved value is empty
   */
  private static resolveUrl(base: string, href: string | undefined): string | undefined {
    if (!href) {
      return undefined;
    }
    return createAbsoluteUrl(base, href) || undefined;
  }
}