244 lines
6.9 KiB
TypeScript
244 lines
6.9 KiB
TypeScript
import { setTimeout as delay } from "node:timers/promises";
|
|
|
|
import type { CrawlerHttpOptions } from "@basango/domain/config";
|
|
import {
|
|
DEFAULT_RETRY_AFTER_HEADER,
|
|
DEFAULT_TRANSIENT_HTTP_STATUSES,
|
|
DEFAULT_USER_AGENT,
|
|
} from "@basango/domain/constants";
|
|
|
|
import { UserAgents } from "#crawler/http/user-agent";
|
|
|
|
export type HttpHeaders = Record<string, string>;
|
|
export type HttpParams = Record<string, string | number | boolean | null | undefined>;
|
|
export type HttpData = unknown;
|
|
|
|
export interface HttpClientOptions {
|
|
userAgentProvider?: UserAgents;
|
|
defaultHeaders?: HttpHeaders;
|
|
fetchImpl?: typeof fetch;
|
|
sleep?: (ms: number) => Promise<void>;
|
|
}
|
|
|
|
export interface HttpRequestOptions {
|
|
headers?: HttpHeaders;
|
|
params?: HttpParams;
|
|
data?: HttpData;
|
|
json?: HttpData;
|
|
retryAfterHeader?: string;
|
|
}
|
|
|
|
export class HttpError extends Error {
|
|
readonly status: number;
|
|
readonly response: Response;
|
|
|
|
constructor(message: string, response: Response) {
|
|
super(message);
|
|
this.status = response.status;
|
|
this.response = response;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Default sleep function using setTimeout.
|
|
* @param ms - Milliseconds to sleep
|
|
*/
|
|
const defaultSleep = (ms: number): Promise<void> => {
|
|
return delay(ms).then(() => undefined);
|
|
};
|
|
|
|
/**
|
|
* Builds a URL with query parameters.
|
|
* @param url - The base URL
|
|
* @param params - The query parameters to append
|
|
*/
|
|
const buildUrl = (url: string, params?: HttpParams): string => {
|
|
if (!params || Object.keys(params).length === 0) {
|
|
return url;
|
|
}
|
|
|
|
const target = new URL(url);
|
|
for (const [key, value] of Object.entries(params)) {
|
|
if (value === undefined || value === null) continue;
|
|
target.searchParams.set(key, String(value));
|
|
}
|
|
|
|
return target.toString();
|
|
};
|
|
|
|
/**
|
|
* Computes the backoff time in milliseconds based on the configuration and attempt number.
|
|
* @param config - Fetch client configuration
|
|
* @param attempt - Current attempt number
|
|
*/
|
|
const computeBackoff = (config: CrawlerHttpOptions, attempt: number): number => {
|
|
const base = Math.min(
|
|
config.backoffInitial * config.backoffMultiplier ** attempt,
|
|
config.backoffMax,
|
|
);
|
|
const jitter = Math.random() * base * 0.25;
|
|
return (base + jitter) * 1000;
|
|
};
|
|
|
|
const parseRetryAfter = (header: string): number => {
|
|
const numeric = Number.parseInt(header, 10);
|
|
if (!Number.isNaN(numeric)) {
|
|
return Math.max(0, numeric * 1000);
|
|
}
|
|
|
|
const parsed = Date.parse(header);
|
|
if (Number.isNaN(parsed)) {
|
|
return 0;
|
|
}
|
|
|
|
const delta = parsed - Date.now();
|
|
return delta > 0 ? delta : 0;
|
|
};
|
|
|
|
/**
|
|
* Base HTTP client providing common functionality.
|
|
*
|
|
* @author Bernard Ngandu <bernard@devscast.tech>
|
|
*/
|
|
export class BaseHttpClient {
|
|
protected readonly options: CrawlerHttpOptions;
|
|
protected readonly fetchImpl: typeof fetch;
|
|
protected readonly sleep: (ms: number) => Promise<void>;
|
|
protected readonly headers: HttpHeaders;
|
|
|
|
constructor(options: CrawlerHttpOptions, clientOptions: HttpClientOptions = {}) {
|
|
this.options = options;
|
|
const provider =
|
|
clientOptions.userAgentProvider ??
|
|
new UserAgents(options.rotate, options.userAgent ?? DEFAULT_USER_AGENT);
|
|
const userAgent = provider.get() ?? options.userAgent ?? DEFAULT_USER_AGENT;
|
|
|
|
const baseHeaders: HttpHeaders = { "User-Agent": userAgent };
|
|
if (clientOptions.defaultHeaders) {
|
|
Object.assign(baseHeaders, clientOptions.defaultHeaders);
|
|
}
|
|
|
|
this.headers = baseHeaders;
|
|
this.fetchImpl = clientOptions.fetchImpl ?? fetch;
|
|
this.sleep = clientOptions.sleep ?? defaultSleep;
|
|
}
|
|
|
|
protected buildHeaders(headers?: HttpHeaders): HeadersInit {
|
|
return { ...this.headers, ...(headers ?? {}) };
|
|
}
|
|
|
|
protected async maybeDelay(
|
|
attempt: number,
|
|
response?: Response,
|
|
retryAfterHeader: string = DEFAULT_RETRY_AFTER_HEADER,
|
|
): Promise<void> {
|
|
let waitMs = 0;
|
|
|
|
if (response) {
|
|
const retryAfter = response.headers.get(retryAfterHeader);
|
|
if (retryAfter && this.options.respectRetryAfter) {
|
|
waitMs = parseRetryAfter(retryAfter);
|
|
}
|
|
}
|
|
|
|
if (waitMs === 0) {
|
|
waitMs = computeBackoff(this.options, attempt);
|
|
}
|
|
|
|
if (waitMs > 0) {
|
|
await this.sleep(waitMs);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Synchronous HTTP client with retry and timeout capabilities.
|
|
*
|
|
* @author Bernard Ngandu <bernard@devscast.tech>
|
|
*/
|
|
export class SyncHttpClient extends BaseHttpClient {
|
|
async request(method: string, url: string, options: HttpRequestOptions = {}): Promise<Response> {
|
|
const retryAfterHeader = options.retryAfterHeader ?? DEFAULT_RETRY_AFTER_HEADER;
|
|
const target = buildUrl(url, options.params);
|
|
|
|
const maxAttempts = this.options.maxRetries + 1;
|
|
let attempt = 0;
|
|
let lastError: unknown;
|
|
|
|
while (attempt < maxAttempts) {
|
|
const controller = new AbortController();
|
|
let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
|
|
try {
|
|
timeoutHandle = setTimeout(() => controller.abort(), this.options.timeout * 1000);
|
|
|
|
const headers = this.buildHeaders(options.headers);
|
|
const init: RequestInit = {
|
|
body: options.data as BodyInit | undefined,
|
|
headers,
|
|
method,
|
|
redirect: this.options.followRedirects ? "follow" : "manual",
|
|
signal: controller.signal,
|
|
};
|
|
|
|
if (options.json !== undefined) {
|
|
init.body = JSON.stringify(options.json);
|
|
(init.headers as Record<string, string>)["Content-Type"] ??= "application/json";
|
|
}
|
|
|
|
const response = await this.fetchImpl(target, init);
|
|
|
|
if (
|
|
DEFAULT_TRANSIENT_HTTP_STATUSES.includes(response.status as number) &&
|
|
attempt < this.options.maxRetries
|
|
) {
|
|
await this.maybeDelay(attempt, response, retryAfterHeader);
|
|
attempt += 1;
|
|
continue;
|
|
}
|
|
|
|
if (!response.ok) {
|
|
throw new HttpError(`HTTP ${response.status} ${response.statusText}`, response);
|
|
}
|
|
|
|
return response;
|
|
} catch (error) {
|
|
if (error instanceof HttpError) {
|
|
lastError = error;
|
|
throw error;
|
|
}
|
|
|
|
if (error instanceof DOMException && error.name === "AbortError") {
|
|
lastError = error;
|
|
if (attempt >= this.options.maxRetries) {
|
|
throw error;
|
|
}
|
|
} else {
|
|
lastError = error;
|
|
if (attempt >= this.options.maxRetries) {
|
|
throw error;
|
|
}
|
|
}
|
|
|
|
await this.maybeDelay(attempt);
|
|
attempt += 1;
|
|
} finally {
|
|
if (timeoutHandle) {
|
|
clearTimeout(timeoutHandle);
|
|
}
|
|
}
|
|
}
|
|
|
|
throw lastError instanceof Error ? lastError : new Error("HTTP request failed after retries");
|
|
}
|
|
|
|
get(url: string, options?: Omit<HttpRequestOptions, "data" | "json">): Promise<Response> {
|
|
return this.request("GET", url, options);
|
|
}
|
|
|
|
post(url: string, options: HttpRequestOptions = {}): Promise<Response> {
|
|
return this.request("POST", url, options);
|
|
}
|
|
}
|
|
|
|
export type HttpClient = SyncHttpClient;
|