105 lines
3.1 KiB
TypeScript
105 lines
3.1 KiB
TypeScript
import { parse } from "node-html-parser";
|
|
|
|
import { config } from "#crawler/config";
|
|
import { OPEN_GRAPH_USER_AGENT } from "#crawler/constants";
|
|
import { SyncHttpClient } from "#crawler/http/http-client";
|
|
import { UserAgents } from "#crawler/http/user-agent";
|
|
import { ArticleMetadata } from "#crawler/schema";
|
|
import { createAbsoluteUrl } from "#crawler/utils";
|
|
|
|
/**
 * Returns the first candidate that still has content after trimming.
 *
 * `null`, `undefined`, empty and whitespace-only entries are skipped.
 *
 * @param values - Candidate strings, in priority order
 * @returns The trimmed first usable candidate, or `undefined` when none qualifies
 */
const pick = (values: Array<string | null | undefined>): string | undefined => {
  for (const candidate of values) {
    // Nullish candidates collapse to "" so one check covers every "empty" case.
    const trimmed = candidate?.trim() ?? "";
    if (trimmed !== "") {
      return trimmed;
    }
  }

  return undefined;
};
|
|
|
|
/**
 * Reads the `content` attribute of the first `<meta>` tag whose `property`
 * or `name` attribute equals the given key.
 *
 * @param root - Parsed HTML document to search
 * @param property - Value to match against the tag's `property` or `name` attribute
 * @returns The `content` attribute value, or `null` when no tag matches or the
 *          matching tag has no `content` attribute
 */
const extract = (root: ReturnType<typeof parse>, property: string): string | null => {
  const byProperty = `meta[property='${property}']`;
  const byName = `meta[name='${property}']`;

  // A single combined selector: the first match for either attribute is used.
  const match = root.querySelector(`${byProperty}, ${byName}`);

  return match?.getAttribute("content") ?? null;
};
|
|
|
|
/**
 * OpenGraph consumer for extracting Open Graph metadata from HTML pages.
 *
 * Pages are fetched through a `SyncHttpClient` built from `config.fetch.client`
 * and sent with the user agent returned by `UserAgents#og()`.
 *
 * @author Bernard Ngandu <bernard@devscast.tech>
 */
export class OpenGraph {
  /** HTTP client used to download pages; only its `get` method is relied upon. */
  private readonly client: Pick<SyncHttpClient, "get">;

  constructor() {
    const settings = config.fetch.client;
    const provider = new UserAgents(true, OPEN_GRAPH_USER_AGENT);

    this.client = new SyncHttpClient(settings, {
      defaultHeaders: { "User-Agent": provider.og() },
      userAgentProvider: provider,
    });
  }

  /**
   * Fetches a URL and extracts Open Graph metadata from the response body.
   *
   * This is a best-effort operation: any failure while fetching, reading or
   * parsing the page is swallowed and reported as `undefined`.
   *
   * @param url - The URL to fetch and parse
   * @returns The extracted metadata, or `undefined` when the page could not be
   *          fetched or yielded no metadata
   */
  async consumeUrl(url: string): Promise<ArticleMetadata | undefined> {
    try {
      const response = await this.client.get(url);
      const html = await response.text();

      return OpenGraph.consumeHtml(html, url);
    } catch {
      // Deliberate: callers only need to know that no metadata is available.
      return undefined;
    }
  }

  /**
   * Extracts Open Graph metadata from an HTML document.
   *
   * Each field falls back to a plain-HTML equivalent when the Open Graph tag is
   * missing: `<title>` for the title, `<meta name="description">` for the
   * description, the first `<img>` for the image, and `<link rel="canonical">`
   * then `url` for the canonical URL.
   *
   * @param html - HTML content as a string
   * @param url - URL of the page; used as the base for resolving the image and
   *              canonical URL, and as the last-resort canonical URL
   * @returns The extracted metadata, or `undefined` when `html` is empty or no
   *          field could be determined
   */
  static consumeHtml(html: string, url: string): ArticleMetadata | undefined {
    if (!html) {
      return undefined;
    }

    const root = parse(html);

    const title = pick([extract(root, "og:title"), root.querySelector("title")?.text]);
    const description = pick([extract(root, "og:description"), extract(root, "description")]);
    const image = pick([
      extract(root, "og:image"),
      root.querySelector("img")?.getAttribute("src") ?? null,
    ]);
    const canonical = pick([
      extract(root, "og:url"),
      root.querySelector("link[rel='canonical']")?.getAttribute("href") ?? null,
      url,
    ]);

    if (!title && !description && !image && !canonical) {
      return undefined;
    }

    // Only resolve values that were actually found. Passing an empty string to
    // the resolver risks it resolving to the base URL itself (standard URL
    // resolution treats an empty reference that way), which would report the
    // page URL as the image. NOTE(review): createAbsoluteUrl is defined
    // elsewhere — confirm how it treats empty input.
    return {
      description,
      image: image ? createAbsoluteUrl(url, image) || undefined : undefined,
      title,
      url: canonical ? createAbsoluteUrl(url, canonical) || undefined : undefined,
    };
  }
}
|