refactor: centralize configuration
@@ -1,21 +0,0 @@
-# paths
-BASANGO_CRAWLER_ROOT_PATH=
-BASANGO_CRAWLER_DATA_PATH=
-BASANGO_CRAWLER_LOGS_PATH=
-BASANGO_CRAWLER_CONFIG_PATH=
-
-# crawler settings
-BASANGO_CRAWLER_UPDATE_DIRECTION=forward
-BASANGO_CRAWLER_FETCH_USER_AGENT="Basango/0.1 (+https://github.com/bernard-ng/basango)"
-BASANGO_CRAWLER_FETCH_MAX_RETRIES=3
-BASANGO_CRAWLER_FETCH_RESPECT_RETRY_AFTER=true
-
-BASANGO_CRAWLER_ASYNC_REDIS_URL="redis://localhost:6379/0"
-BASANGO_CRAWLER_ASYNC_TTL_RESULT=3600
-BASANGO_CRAWLER_ASYNC_TTL_FAILURE=3600
-BASANGO_CRAWLER_ASYNC_QUEUE_LISTING="listing"
-BASANGO_CRAWLER_ASYNC_QUEUE_DETAILS="details"
-BASANGO_CRAWLER_ASYNC_QUEUE_PROCESSING="processing"
-
-BASANGO_CRAWLER_TOKEN="dev"
-BASANGO_CRAWLER_BACKEND_API_ENDPOINT="http://localhost:3080/articles"
@@ -1,41 +0,0 @@
-{
-  "fetch": {
-    "async": {
-      "prefix": "basango:crawler",
-      "queues": {
-        "details": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_DETAILS)%",
-        "listing": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_LISTING)%",
-        "processing": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_PROCESSING)%"
-      },
-      "redisUrl": "%env(BASANGO_CRAWLER_ASYNC_REDIS_URL)%",
-      "ttl": {
-        "default": 600,
-        "failure": "%env(number:BASANGO_CRAWLER_ASYNC_TTL_FAILURE)%",
-        "result": "%env(number:BASANGO_CRAWLER_ASYNC_TTL_RESULT)%"
-      }
-    },
-    "client": {
-      "backoffInitial": 1,
-      "backoffMax": 30,
-      "backoffMultiplier": 2,
-      "followRedirects": true,
-      "maxRetries": "%env(number:BASANGO_CRAWLER_FETCH_MAX_RETRIES)%",
-      "respectRetryAfter": "%env(boolean:BASANGO_CRAWLER_FETCH_RESPECT_RETRY_AFTER)%",
-      "rotate": true,
-      "timeout": 20,
-      "userAgent": "%env(BASANGO_CRAWLER_FETCH_USER_AGENT)%",
-      "verifySsl": true
-    },
-    "crawler": {
-      "direction": "%env(BASANGO_CRAWLER_UPDATE_DIRECTION)%",
-      "maxWorkers": 5,
-      "notify": false,
-      "useMultiThreading": false
-    }
-  },
-  "paths": {
-    "config": "%env(BASANGO_CRAWLER_CONFIG_PATH)%",
-    "data": "%env(BASANGO_CRAWLER_DATA_PATH)%",
-    "root": "%env(BASANGO_CRAWLER_ROOT_PATH)%"
-  }
-}
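The %env(...)% placeholders above tie this JSON to the .env file from the previous hunk, with number: and boolean: prefixes requesting a cast. A minimal sketch of how such a placeholder could be resolved against process.env, assuming @devscast/config follows this convention (its actual implementation may differ):

// Hypothetical resolver for "%env(cast:NAME)%" placeholders.
const resolvePlaceholder = (value: string): string | number | boolean => {
  const match = /^%env\((?:(number|boolean):)?([A-Z0-9_]+)\)%$/.exec(value);
  if (!match) return value; // not a placeholder, keep the literal
  const [, cast, name] = match;
  const raw = process.env[name] ?? "";
  if (cast === "number") return Number(raw); // e.g. "3" -> 3
  if (cast === "boolean") return raw === "true" || raw === "1";
  return raw;
};

// resolvePlaceholder("%env(number:BASANGO_CRAWLER_FETCH_MAX_RETRIES)%") -> 3 with the .env above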
@@ -1,210 +0,0 @@
-{
-  "sources": {
-    "html": [
-      {
-        "paginationTemplate": "actualite",
-        "requiresDetails": true,
-        "requiresRateLimit": false,
-        "sourceDate": {},
-        "sourceId": "radiookapi.net",
-        "sourceKind": "html",
-        "sourceSelectors": {
-          "articleBody": ".field-name-body",
-          "articleCategories": ".views-field-field-cat-gorie a",
-          "articleDate": "head > meta[property=\"article:published_time\"]",
-          "articleLink": ".views-field-title a",
-          "articles": ".view-content > .views-row.content-row",
-          "articleTitle": "h1.page-header",
-          "pagination": "ul.pagination > li.pager-last > a"
-        },
-        "sourceUrl": "https://www.radiookapi.net",
-        "supportsCategories": false
-      },
-      {
-        "categories": ["politique", "economie", "culture", "sport", "societe"],
-        "paginationTemplate": "index.php/category/{category}",
-        "requiresDetails": true,
-        "requiresRateLimit": false,
-        "sourceDate": {},
-        "sourceId": "7sur7.cd",
-        "sourceKind": "html",
-        "sourceSelectors": {
-          "articleBody": "div[property=\"schema:text\"].field.field--name-body",
-          "articleDate": "head > meta[property=\"article:published_time\"]",
-          "articleLink": ".views-field-title a",
-          "articles": ".view-content > .row.views-row",
-          "articleTitle": ".views-field-title a",
-          "pagination": "ul.pagination > li.pager__item.pager__item--last > a"
-        },
-        "sourceUrl": "https://7sur7.cd",
-        "supportsCategories": true
-      },
-      {
-        "paginationTemplate": "articles.html",
-        "requiresDetails": true,
-        "requiresRateLimit": false,
-        "sourceDate": {
-          "format": "dd.MM.yyyy"
-        },
-        "sourceId": "mediacongo.net",
-        "sourceKind": "html",
-        "sourceSelectors": {
-          "articleBody": ".article_ttext",
-          "articleCategories": "a.color_link",
-          "articleDate": ".article_other_about",
-          "articleLink": "a:first-child",
-          "articles": ".for_aitems > .article_other_item",
-          "articleTitle": "h1",
-          "pagination": "div.pagination > div > a:last-child"
-        },
-        "sourceUrl": "https://www.mediacongo.net",
-        "supportsCategories": false
-      },
-      {
-        "paginationTemplate": "actualite",
-        "requiresDetails": true,
-        "requiresRateLimit": false,
-        "sourceDate": {},
-        "sourceId": "actualite.cd",
-        "sourceKind": "html",
-        "sourceSelectors": {
-          "articleBody": ".views-field.views-field-body .field-content",
-          "articleCategories": "#actu-cat",
-          "articleDate": "head > meta[property=\"article:published_time\"]",
-          "articleLink": "#actu-titre a",
-          "articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
-          "articleTitle": "h1.page-title"
-        },
-        "sourceUrl": "https://actualite.cd",
-        "supportsCategories": false
-      }
-    ],
-    "wordpress": [
-      {
-        "requiresRateLimit": true,
-        "sourceId": "beto.cd",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://beto.cd"
-      },
-      { "sourceId": "newscd.net", "sourceKind": "wordpress", "sourceUrl": "https://newscd.net" },
-      {
-        "sourceId": "africanewsrdc.net",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://www.africanewsrdc.net"
-      },
-      {
-        "sourceId": "angazainstitute.ac.cd",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://angazainstitute.ac.cd"
-      },
-      { "sourceId": "b-onetv.cd", "sourceKind": "wordpress", "sourceUrl": "https://b-onetv.cd" },
-      {
-        "sourceId": "bukavufm.com",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://bukavufm.com"
-      },
-      {
-        "sourceId": "changement7.net",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://changement7.net"
-      },
-      {
-        "sourceId": "congoactu.net",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://congoactu.net"
-      },
-      {
-        "sourceId": "congoindependant.com",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://www.congoindependant.com"
-      },
-      {
-        "sourceId": "congoquotidien.com",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://www.congoquotidien.com"
-      },
-      {
-        "sourceId": "cumulard.cd",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://www.cumulard.cd"
-      },
-      {
-        "sourceId": "environews-rdc.net",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://environews-rdc.net"
-      },
-      {
-        "sourceId": "freemediardc.info",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://www.freemediardc.info"
-      },
-      {
-        "sourceId": "geopolismagazine.org",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://geopolismagazine.org"
-      },
-      {
-        "sourceId": "habarirdc.net",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://habarirdc.net"
-      },
-      { "sourceId": "infordc.com", "sourceKind": "wordpress", "sourceUrl": "https://infordc.com" },
-      {
-        "sourceId": "kilalopress.net",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://kilalopress.net"
-      },
-      {
-        "sourceId": "laprosperiteonline.net",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://laprosperiteonline.net"
-      },
-      {
-        "sourceId": "laprunellerdc.cd",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://laprunellerdc.cd"
-      },
-      {
-        "sourceId": "lesmedias.net",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://lesmedias.net"
-      },
-      {
-        "sourceId": "lesvolcansnews.net",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://lesvolcansnews.net"
-      },
-      {
-        "sourceId": "netic-news.net",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://www.netic-news.net"
-      },
-      {
-        "sourceId": "objectif-infos.cd",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://objectif-infos.cd"
-      },
-      {
-        "sourceId": "scooprdc.net",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://scooprdc.net"
-      },
-      {
-        "sourceId": "journaldekinshasa.com",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://www.journaldekinshasa.com"
-      },
-      {
-        "sourceId": "lepotentiel.cd",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://lepotentiel.cd"
-      },
-      { "sourceId": "acturdc.com", "sourceKind": "wordpress", "sourceUrl": "https://acturdc.com" },
-      {
-        "sourceId": "matininfos.net",
-        "sourceKind": "wordpress",
-        "sourceUrl": "https://matininfos.net"
-      }
-    ]
-  }
-}
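Each sourceSelectors entry above is a plain CSS selector. A rough sketch of how a listing page is scraped with them, using node-html-parser as the crawler code later in this commit does (listingHtml and the loop body are illustrative only):

import { parse } from "node-html-parser";

declare const listingHtml: string; // fetched listing page body (assumed)

// Selectors copied from the radiookapi.net entry above.
const selectors = {
  articles: ".view-content > .views-row.content-row",
  articleLink: ".views-field-title a",
};

const root = parse(listingHtml);
for (const row of root.querySelectorAll(selectors.articles)) {
  // each href would then be resolved against sourceUrl and queued for a details fetch
  const href = row.querySelector(selectors.articleLink)?.getAttribute("href");
}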
@@ -1,81 +0,0 @@
-import path from "node:path";
-
-import {
-  HtmlSourceConfigSchema,
-  PageRangeSchema,
-  TimestampRangeSchema,
-  UpdateDirectionSchema,
-  WordPressSourceConfigSchema,
-} from "@basango/domain/crawler";
-import { loadConfig as defineConfig } from "@devscast/config";
-import { z } from "zod";
-
-export const PROJECT_DIR = path.resolve(__dirname, "../");
-
-export const PipelineConfigSchema = z.object({
-  fetch: z.object({
-    async: z.object({
-      prefix: z.string().default("basango:crawler:queue"),
-      queues: z.object({
-        details: z.string().default("details"),
-        listing: z.string().default("listing"),
-        processing: z.string().default("processing"),
-      }),
-      redisUrl: z.string().default("redis://localhost:6379/0"),
-      ttl: z.object({
-        default: z.number().int().positive().default(600),
-        failure: z.number().int().nonnegative().default(3600),
-        result: z.number().int().nonnegative().default(3600),
-      }),
-    }),
-    client: z.object({
-      backoffInitial: z.number().nonnegative().default(1),
-      backoffMax: z.number().nonnegative().default(30),
-      backoffMultiplier: z.number().positive().default(2),
-      followRedirects: z.boolean().default(true),
-      maxRetries: z.number().int().nonnegative().default(3),
-      respectRetryAfter: z.boolean().default(true),
-      rotate: z.boolean().default(true),
-      timeout: z.number().positive().default(20),
-      userAgent: z.string().default("Basango/0.1 (+https://github.com/bernard-ng/basango)"),
-      verifySsl: z.boolean().default(true),
-    }),
-    crawler: z.object({
-      category: z.string().optional(),
-      dateRange: TimestampRangeSchema.optional(),
-      direction: UpdateDirectionSchema.default("forward"),
-      isUpdate: z.boolean().default(false),
-      maxWorkers: z.number().int().positive().default(5),
-      notify: z.boolean().default(false),
-      pageRange: PageRangeSchema.optional(),
-      source: z.union([HtmlSourceConfigSchema, WordPressSourceConfigSchema]).optional(),
-      useMultiThreading: z.boolean().default(false),
-    }),
-  }),
-  paths: z.object({
-    config: z.string().default(path.join(PROJECT_DIR, "config")),
-    data: z.string().default(path.join(PROJECT_DIR, "data", "datasets")),
-    root: z.string().default(PROJECT_DIR),
-  }),
-  sources: z.object({
-    html: z.array(HtmlSourceConfigSchema).default([]),
-    wordpress: z.array(WordPressSourceConfigSchema).default([]),
-  }),
-});
-
-export const { config, env } = defineConfig({
-  cwd: process.cwd(),
-  env: {
-    path: path.join(PROJECT_DIR, ".env"),
-  },
-  schema: PipelineConfigSchema,
-  sources: [
-    path.join(PROJECT_DIR, "config", "pipeline.json"),
-    path.join(PROJECT_DIR, "config", "sources.json"),
-  ],
-});
-
-export type PipelineConfig = z.infer<typeof PipelineConfigSchema>;
-export type FetchClientConfig = PipelineConfig["fetch"]["client"];
-export type FetchCrawlerConfig = PipelineConfig["fetch"]["crawler"];
-export type FetchAsyncConfig = PipelineConfig["fetch"]["async"];
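This file is deleted outright: the Zod schema and the loadConfig call move out of the crawler package. Judging from the import rewrites in the remaining hunks of this commit, the same data is now read from @basango/domain/config, nested one level deeper under crawler:

// Before (this file): the parsed config lived in the crawler package.
// import { config } from "#crawler/config";
// const client = config.fetch.client;

// After: the domain package owns configuration.
import { config } from "@basango/domain/config";

const client = config.crawler.fetch.client; // maxRetries, timeout, userAgent, ...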
@@ -1,12 +1,12 @@
 import { setTimeout as delay } from "node:timers/promises";
 
+import type { CrawlerHttpOptions } from "@basango/domain/config";
 import {
   DEFAULT_RETRY_AFTER_HEADER,
   DEFAULT_TRANSIENT_HTTP_STATUSES,
   DEFAULT_USER_AGENT,
 } from "@basango/domain/constants";
 
-import { FetchClientConfig } from "#crawler/config";
 import { UserAgents } from "#crawler/http/user-agent";
 
 export type HttpHeaders = Record<string, string>;
@@ -71,7 +71,7 @@ const buildUrl = (url: string, params?: HttpParams): string => {
  * @param config - Fetch client configuration
  * @param attempt - Current attempt number
  */
-const computeBackoff = (config: FetchClientConfig, attempt: number): number => {
+const computeBackoff = (config: CrawlerHttpOptions, attempt: number): number => {
   const base = Math.min(
     config.backoffInitial * config.backoffMultiplier ** attempt,
     config.backoffMax,
@@ -101,26 +101,26 @@ const parseRetryAfter = (header: string): number => {
  * @author Bernard Ngandu <bernard@devscast.tech>
  */
 export class BaseHttpClient {
-  protected readonly config: FetchClientConfig;
+  protected readonly options: CrawlerHttpOptions;
   protected readonly fetchImpl: typeof fetch;
   protected readonly sleep: (ms: number) => Promise<void>;
   protected readonly headers: HttpHeaders;
 
-  constructor(config: FetchClientConfig, options: HttpClientOptions = {}) {
-    this.config = config;
+  constructor(options: CrawlerHttpOptions, clientOptions: HttpClientOptions = {}) {
+    this.options = options;
     const provider =
-      options.userAgentProvider ??
-      new UserAgents(config.rotate, config.userAgent ?? DEFAULT_USER_AGENT);
-    const userAgent = provider.get() ?? config.userAgent ?? DEFAULT_USER_AGENT;
+      clientOptions.userAgentProvider ??
+      new UserAgents(options.rotate, options.userAgent ?? DEFAULT_USER_AGENT);
+    const userAgent = provider.get() ?? options.userAgent ?? DEFAULT_USER_AGENT;
 
     const baseHeaders: HttpHeaders = { "User-Agent": userAgent };
-    if (options.defaultHeaders) {
-      Object.assign(baseHeaders, options.defaultHeaders);
+    if (clientOptions.defaultHeaders) {
+      Object.assign(baseHeaders, clientOptions.defaultHeaders);
     }
 
     this.headers = baseHeaders;
-    this.fetchImpl = options.fetchImpl ?? fetch;
-    this.sleep = options.sleep ?? defaultSleep;
+    this.fetchImpl = clientOptions.fetchImpl ?? fetch;
+    this.sleep = clientOptions.sleep ?? defaultSleep;
   }
 
   protected buildHeaders(headers?: HttpHeaders): HeadersInit {
@@ -136,13 +136,13 @@ export class BaseHttpClient {
 
     if (response) {
       const retryAfter = response.headers.get(retryAfterHeader);
-      if (retryAfter && this.config.respectRetryAfter) {
+      if (retryAfter && this.options.respectRetryAfter) {
         waitMs = parseRetryAfter(retryAfter);
       }
     }
 
     if (waitMs === 0) {
-      waitMs = computeBackoff(this.config, attempt);
+      waitMs = computeBackoff(this.options, attempt);
     }
 
     if (waitMs > 0) {
@@ -161,7 +161,7 @@ export class SyncHttpClient extends BaseHttpClient {
     const retryAfterHeader = options.retryAfterHeader ?? DEFAULT_RETRY_AFTER_HEADER;
     const target = buildUrl(url, options.params);
 
-    const maxAttempts = this.config.maxRetries + 1;
+    const maxAttempts = this.options.maxRetries + 1;
     let attempt = 0;
     let lastError: unknown;
 
@@ -169,14 +169,14 @@ export class SyncHttpClient extends BaseHttpClient {
       const controller = new AbortController();
       let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
       try {
-        timeoutHandle = setTimeout(() => controller.abort(), this.config.timeout * 1000);
+        timeoutHandle = setTimeout(() => controller.abort(), this.options.timeout * 1000);
 
         const headers = this.buildHeaders(options.headers);
         const init: RequestInit = {
           body: options.data as BodyInit | undefined,
           headers,
           method,
-          redirect: this.config.followRedirects ? "follow" : "manual",
+          redirect: this.options.followRedirects ? "follow" : "manual",
           signal: controller.signal,
         };
 
@@ -189,7 +189,7 @@ export class SyncHttpClient extends BaseHttpClient {
 
         if (
           DEFAULT_TRANSIENT_HTTP_STATUSES.includes(response.status as number) &&
-          attempt < this.config.maxRetries
+          attempt < this.options.maxRetries
         ) {
           await this.maybeDelay(attempt, response, retryAfterHeader);
           attempt += 1;
@@ -209,12 +209,12 @@ export class SyncHttpClient extends BaseHttpClient {
 
         if (error instanceof DOMException && error.name === "AbortError") {
           lastError = error;
-          if (attempt >= this.config.maxRetries) {
+          if (attempt >= this.options.maxRetries) {
             throw error;
           }
         } else {
           lastError = error;
-          if (attempt >= this.config.maxRetries) {
+          if (attempt >= this.options.maxRetries) {
             throw error;
           }
         }
 
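With the defaults from the deleted schema (backoffInitial 1, backoffMultiplier 2, backoffMax 30), computeBackoff above grows the base delay as min(1 * 2^attempt, 30). The hunk is truncated right after the Math.min call, so anything applied on top of the base (jitter, for instance) is not shown here:

// Base delay per attempt, in seconds: [1, 2, 4, 8, 16, 30]
const baseDelays = [0, 1, 2, 3, 4, 5].map((attempt) => Math.min(1 * 2 ** attempt, 30));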
@@ -1,8 +1,8 @@
+import { config } from "@basango/domain/config";
 import { DEFAULT_OPEN_GRAPH_USER_AGENT } from "@basango/domain/constants";
 import { ArticleMetadata } from "@basango/domain/models";
 import { parse } from "node-html-parser";
 
-import { config } from "#crawler/config";
 import { SyncHttpClient } from "#crawler/http/http-client";
 import { UserAgents } from "#crawler/http/user-agent";
 import { createAbsoluteUrl } from "#crawler/utils";
@@ -44,7 +44,7 @@ export class OpenGraph {
   private readonly client: Pick<SyncHttpClient, "get">;
 
   constructor() {
-    const settings = config.fetch.client;
+    const settings = config.crawler.fetch.client;
     const provider = new UserAgents(true, DEFAULT_OPEN_GRAPH_USER_AGENT);
 
     this.client = new SyncHttpClient(settings, {
@@ -89,16 +89,28 @@ export class OpenGraph {
       root.querySelector("link[rel='canonical']")?.getAttribute("href") ?? null,
       url ?? null,
     ]);
+    const author = pick([extract(root, "article:author"), extract(root, "og:article:author")]);
+    const publishedAt = pick([
+      extract(root, "article:published_time"),
+      extract(root, "og:article:published_time"),
+    ]);
+    const updatedAt = pick([
+      extract(root, "article:modified_time"),
+      extract(root, "og:article:modified_time"),
+    ]);
 
     if (!title && !description && !image && !canonical) {
       return undefined;
     }
 
     return {
+      author,
       description,
       image: createAbsoluteUrl(url, image ?? "") || undefined,
+      publishedAt,
       title,
+      updatedAt,
       url: createAbsoluteUrl(url, canonical ?? "") || undefined,
-    };
+    } as ArticleMetadata;
   }
 }
@@ -1,4 +1,4 @@
-import type { HtmlSourceConfig, WordPressSourceConfig } from "@basango/domain/crawler";
+import type { HtmlSourceOptions, WordPressSourceOptions } from "@basango/domain/config";
 import { Article } from "@basango/domain/models";
 import { logger } from "@basango/logger";
 
@@ -24,7 +24,7 @@ export const collectHtmlListing = async (
   payload: ListingTaskPayload,
   manager: QueueManager = createQueueManager(),
 ): Promise<number> => {
-  const source = resolveSourceConfig(payload.sourceId) as HtmlSourceConfig;
+  const source = resolveSourceConfig(payload.sourceId) as HtmlSourceOptions;
   if (source.sourceKind !== "html") {
     return await collectWordPressListing(payload, manager);
   }
@@ -63,7 +63,7 @@ export const collectWordPressListing = async (
   payload: ListingTaskPayload,
   manager: QueueManager = createQueueManager(),
 ): Promise<number> => {
-  const source = resolveSourceConfig(payload.sourceId) as WordPressSourceConfig;
+  const source = resolveSourceConfig(payload.sourceId) as WordPressSourceOptions;
   if (source.sourceKind !== "wordpress") {
     return await collectHtmlListing(payload, manager);
   }
@@ -1,9 +1,9 @@
 import { randomUUID } from "node:crypto";
 
-import { JobsOptions, Queue, QueueOptions } from "bullmq";
+import { type CrawlerAsyncOptions, config } from "@basango/domain/config";
+import { JobsOptions, Queue } from "bullmq";
 import IORedis from "ioredis";
 
-import { FetchAsyncConfig, config } from "#crawler/config";
 import {
   DetailsTaskPayload,
   DetailsTaskPayloadSchema,
@@ -20,28 +20,27 @@ export interface QueueBackend<T = unknown> {
 
 export type QueueFactory = (
   queueName: string,
-  settings: FetchAsyncConfig,
+  options: CrawlerAsyncOptions,
   connection?: IORedis,
 ) => QueueBackend;
 
-const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => {
+const defaultQueueFactory: QueueFactory = (queueName, options, connection) => {
   const redisConnection =
     connection ??
-    new IORedis(settings.redisUrl, {
-      ...parseRedisUrl(settings.redisUrl),
+    new IORedis(options.redisUrl, {
+      ...parseRedisUrl(options.redisUrl),
       maxRetriesPerRequest: null,
     });
-  const options: QueueOptions = {
-    connection: redisConnection,
-    prefix: settings.prefix,
-  };
 
-  const queue = new Queue(queueName, options);
+  const queue = new Queue(queueName, {
+    connection: redisConnection,
+    prefix: options.prefix,
+  });
   return {
     add: async (name, data, opts) => {
       const job = await queue.add(name, data, {
-        removeOnComplete: settings.ttl.result === 0 ? true : undefined,
-        removeOnFail: settings.ttl.failure === 0 ? true : undefined,
+        removeOnComplete: options.ttl.result === 0 ? true : undefined,
+        removeOnFail: options.ttl.failure === 0 ? true : undefined,
         ...opts,
       });
       return { id: job.id ?? randomUUID() };
@@ -55,7 +54,7 @@ export interface CreateQueueManagerOptions {
 }
 
 export interface QueueManager {
-  readonly settings: FetchAsyncConfig;
+  readonly options: CrawlerAsyncOptions;
   readonly connection: IORedis;
   enqueueListing: (payload: ListingTaskPayload) => Promise<{ id: string }>;
   enqueueArticle: (payload: DetailsTaskPayload) => Promise<{ id: string }>;
@@ -66,17 +65,17 @@ export interface QueueManager {
 }
 
 export const createQueueManager = (options: CreateQueueManagerOptions = {}): QueueManager => {
-  const settings = config.fetch.async;
+  const asyncOptions = config.crawler.fetch.async;
 
   const connection =
     options.connection ??
-    new IORedis(settings.redisUrl, {
-      ...parseRedisUrl(settings.redisUrl),
+    new IORedis(asyncOptions.redisUrl, {
+      ...parseRedisUrl(asyncOptions.redisUrl),
       maxRetriesPerRequest: null,
     });
   const factory = options.queueFactory ?? defaultQueueFactory;
 
-  const ensureQueue = (queueName: string) => factory(queueName, settings, connection);
+  const ensureQueue = (queueName: string) => factory(queueName, asyncOptions, connection);
 
   return {
     close: async () => {
@@ -85,25 +84,25 @@ export const createQueueManager = (options: CreateQueueManagerOptions = {}): Que
     connection,
     enqueueArticle: (payload) => {
       const data = DetailsTaskPayloadSchema.parse(payload);
-      const queue = ensureQueue(settings.queues.details);
+      const queue = ensureQueue(asyncOptions.queues.details);
       return queue.add("collect_article", data);
     },
     enqueueListing: (payload) => {
       const data = ListingTaskPayloadSchema.parse(payload);
-      const queue = ensureQueue(settings.queues.listing);
+      const queue = ensureQueue(asyncOptions.queues.listing);
       return queue.add("collect_listing", data);
     },
     enqueueProcessed: (payload) => {
       const data = ProcessingTaskPayloadSchema.parse(payload);
-      const queue = ensureQueue(settings.queues.processing);
+      const queue = ensureQueue(asyncOptions.queues.processing);
       return queue.add("forward_for_processing", data);
     },
     iterQueueNames: () => [
-      settings.queues.listing,
-      settings.queues.details,
-      settings.queues.processing,
+      asyncOptions.queues.listing,
+      asyncOptions.queues.details,
+      asyncOptions.queues.processing,
     ],
-    queueName: (suffix: string) => `${settings.prefix}:${suffix}`,
-    settings,
+    options: asyncOptions,
+    queueName: (suffix: string) => `${asyncOptions.prefix}:${suffix}`,
   };
 };
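A sketch of driving the refactored manager. It assumes sourceId alone satisfies ListingTaskPayload (the schema, defined elsewhere, may require more fields) and that the "basango:crawler" prefix from pipeline.json is in effect:

const manager = createQueueManager();
// payload is validated by ListingTaskPayloadSchema before being enqueued
await manager.enqueueListing({ sourceId: "7sur7.cd" });
console.log(manager.queueName("listing")); // "basango:crawler:listing"
await manager.close();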
@@ -1,5 +1,4 @@
-import { PageRangeSchema, TimestampRangeSchema } from "@basango/domain/crawler";
-import { articleSchema } from "@basango/domain/models";
+import { PageRangeSchema, TimestampRangeSchema, articleSchema } from "@basango/domain/models";
 import { z } from "zod";
 
 export const ListingTaskPayloadSchema = z.object({
@@ -45,7 +45,7 @@ export const startWorker = (options: WorkerOptions): WorkerHandle => {
     {
       concurrency: options.concurrency ?? 5,
       connection,
-      prefix: manager.settings.prefix,
+      prefix: manager.options.prefix,
     },
   );
 
@@ -56,7 +56,7 @@ export const startWorker = (options: WorkerOptions): WorkerHandle => {
 
   const queueEvents = new QueueEvents(queueName, {
     connection,
-    prefix: manager.settings.prefix,
+    prefix: manager.options.prefix,
   });
 
   workers.push(worker);
@@ -1,7 +1,6 @@
-import type { AnySourceConfig } from "@basango/domain/crawler";
+import { AnySourceOptions, CrawlerFetchingOptions, config } from "@basango/domain/config";
 import logger from "@basango/logger";
 
-import { FetchCrawlerConfig, config } from "#crawler/config";
 import { JsonlPersistor, Persistor } from "#crawler/process/persistence";
 import { createPageRange, createTimestampRange } from "#crawler/utils";
 
@@ -13,11 +12,11 @@ export interface CrawlingOptions {
 }
 
 export const resolveCrawlerConfig = (
-  source: AnySourceConfig,
+  source: AnySourceOptions,
   options: CrawlingOptions,
-): FetchCrawlerConfig => {
+): CrawlerFetchingOptions => {
   return {
-    ...config.fetch.crawler,
+    ...config.crawler.fetch.crawler,
     category: options.category,
     dateRange: createTimestampRange(options.dateRange),
     pageRange: createPageRange(options.pageRange),
@@ -25,10 +24,10 @@ export const resolveCrawlerConfig = (
   };
 };
 
-export const createPersistors = (source: AnySourceConfig): Persistor[] => {
+export const createPersistors = (source: AnySourceOptions): Persistor[] => {
   return [
     new JsonlPersistor({
-      directory: config.paths.data,
+      directory: config.crawler.paths.data,
       sourceId: source.sourceId,
     }),
   ];
@@ -1,8 +1,7 @@
-import type { AnySourceConfig } from "@basango/domain/crawler";
+import { AnySourceOptions, CrawlerFetchingOptions, config } from "@basango/domain/config";
 import { Article } from "@basango/domain/models";
 import { HTMLElement, parse as parseHtml } from "node-html-parser";
 
-import { FetchCrawlerConfig, config } from "#crawler/config";
 import { SyncHttpClient } from "#crawler/http/http-client";
 import { OpenGraph } from "#crawler/http/open-graph";
 import type { Persistor } from "#crawler/process/persistence";
@@ -12,23 +11,23 @@ export interface CrawlerOptions {
 }
 
 export abstract class BaseCrawler {
-  protected readonly settings: FetchCrawlerConfig;
-  protected readonly source: AnySourceConfig;
+  protected readonly options: CrawlerFetchingOptions;
+  protected readonly source: AnySourceOptions;
   protected readonly http: SyncHttpClient;
   protected readonly persistors: Persistor[];
   protected readonly openGraph: OpenGraph;
 
-  protected constructor(settings: FetchCrawlerConfig, options: CrawlerOptions = {}) {
-    if (!settings.source) {
+  protected constructor(options: CrawlerFetchingOptions, crawlerOptions: CrawlerOptions = {}) {
+    if (!options.source) {
       throw new Error("Crawler requires a bound source");
     }
 
-    this.http = new SyncHttpClient(config.fetch.client);
-    this.persistors = options.persistors ?? [];
+    this.http = new SyncHttpClient(config.crawler.fetch.client);
+    this.persistors = crawlerOptions.persistors ?? [];
    this.openGraph = new OpenGraph();
 
-    this.settings = settings;
-    this.source = settings.source as AnySourceConfig;
+    this.options = options;
+    this.source = options.source as AnySourceOptions;
   }
 
   /**
@@ -1,11 +1,10 @@
-import type { HtmlSourceConfig, TimestampRange } from "@basango/domain/crawler";
-import { Article } from "@basango/domain/models";
+import { CrawlerFetchingOptions, HtmlSourceOptions } from "@basango/domain/config";
+import { Article, TimestampRange } from "@basango/domain/models";
 import { logger } from "@basango/logger";
 import { fromUnixTime, getUnixTime, isMatch as isDateMatch, parse } from "date-fns";
 import { HTMLElement } from "node-html-parser";
 import TurndownService from "turndown";
 
-import { FetchCrawlerConfig } from "#crawler/config";
 import {
   ArticleOutOfDateRangeError,
   InvalidArticleError,
@@ -26,21 +25,21 @@ const md = new TurndownService({
  * Crawler for generic HTML pages.
  */
 export class HtmlCrawler extends BaseCrawler {
-  readonly source: HtmlSourceConfig;
+  readonly source: HtmlSourceOptions;
   private currentNode: string | null = null;
 
-  constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
+  constructor(settings: CrawlerFetchingOptions, options: { persistors?: Persistor[] } = {}) {
     super(settings, options);
 
     if (!settings.source || settings.source.sourceKind !== "html") {
       throw new UnsupportedSourceKindError("HtmlCrawler requires a source of kind 'html'");
     }
-    this.source = this.settings.source as HtmlSourceConfig;
+    this.source = this.options.source as HtmlSourceOptions;
   }
 
   async fetch(): Promise<void> {
-    const pageRange = this.settings.pageRange ?? (await this.getPagination());
-    const dateRange = this.settings.dateRange;
+    const pageRange = this.options.pageRange ?? (await this.getPagination());
+    const dateRange = this.options.dateRange;
     const selectors = this.source.sourceSelectors;
 
     if (!selectors.articles) {
@@ -218,7 +217,7 @@ export class HtmlCrawler extends BaseCrawler {
    */
   private applyCategory(template: string): string {
     if (template.includes("{category}")) {
-      const replacement = this.settings.category ?? "";
+      const replacement = this.options.category ?? "";
       return template.replace("{category}", replacement);
     }
     return template;
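For the 7sur7.cd template from sources.json, this substitution yields, for example:

const template = "index.php/category/{category}";
const url = template.replace("{category}", "politique");
// -> "index.php/category/politique"; with no bound category the token
// becomes "" (applyCategory substitutes this.options.category ?? "")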
@@ -297,7 +296,7 @@ export class HtmlCrawler extends BaseCrawler {
    * @param selector - The CSS selector
    */
   private extractCategories(root: HTMLElement, selector?: string | null): string[] {
-    if (!selector && this.settings.category) return [this.settings.category.toLowerCase()];
+    if (!selector && this.options.category) return [this.options.category.toLowerCase()];
     if (!selector) return [];
 
     const values: string[] = [];
@@ -1,10 +1,9 @@
-import type { PageRange, TimestampRange, WordPressSourceConfig } from "@basango/domain/crawler";
-import { Article } from "@basango/domain/models";
+import { CrawlerFetchingOptions, WordPressSourceOptions } from "@basango/domain/config";
+import { Article, PageRange, TimestampRange } from "@basango/domain/models";
 import { logger } from "@basango/logger";
 import { fromUnixTime } from "date-fns";
 import TurndownService from "turndown";
 
-import { FetchCrawlerConfig } from "#crawler/config";
 import {
   ArticleOutOfDateRangeError,
   InvalidArticleError,
@@ -33,7 +32,7 @@ interface WordPressPost {
  * Crawler for WordPress sites using the REST API.
  */
 export class WordPressCrawler extends BaseCrawler {
-  readonly source: WordPressSourceConfig;
+  readonly source: WordPressSourceOptions;
   private categoryMap: Map<number, string> = new Map();
 
   public static readonly POST_QUERY =
@@ -43,7 +42,7 @@ export class WordPressCrawler extends BaseCrawler {
   public static readonly TOTAL_PAGES_HEADER = "x-wp-totalpages";
   public static readonly TOTAL_POSTS_HEADER = "x-wp-total";
 
-  constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
+  constructor(settings: CrawlerFetchingOptions, options: { persistors?: Persistor[] } = {}) {
     super(settings, options);
 
     if (!settings.source || settings.source.sourceKind !== "wordpress") {
@@ -51,15 +50,15 @@ export class WordPressCrawler extends BaseCrawler {
         "WordPressCrawler requires a source of kind 'wordpress'",
       );
     }
-    this.source = this.settings.source as WordPressSourceConfig;
+    this.source = this.options.source as WordPressSourceOptions;
   }
 
   /**
    * Fetch and process WordPress posts.
    */
   async fetch(): Promise<void> {
-    const pageRange = this.settings.pageRange ?? (await this.getPagination());
-    const dateRange = this.settings.dateRange;
+    const pageRange = this.options.pageRange ?? (await this.getPagination());
+    const dateRange = this.options.dateRange;
 
     for (let page = pageRange.start; page <= pageRange.end; page += 1) {
       const endpoint = this.buildEndpointUrl(page);
@@ -1,11 +1,11 @@
 import fs from "node:fs";
 import path from "node:path";
 
+import { config } from "@basango/domain/config";
 import type { Article } from "@basango/domain/models";
 import { md5 } from "@basango/encryption";
 import logger from "@basango/logger";
 
-import { config, env } from "#crawler/config";
 import { HttpError, SyncHttpClient } from "#crawler/http/http-client";
 
 export interface Persistor {
@@ -66,9 +66,9 @@ export const persist = async (
 };
 
 export const forward = async (payload: Partial<Article>): Promise<void> => {
-  const client = new SyncHttpClient(config.fetch.client);
-  const endpoint = env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT");
-  const token = env("BASANGO_CRAWLER_TOKEN");
+  const client = new SyncHttpClient(config.crawler.fetch.client);
+  const endpoint = config.crawler.backend.endpoint;
+  const token = config.crawler.backend.token;
 
   try {
     const response = await client.post(endpoint, {
@@ -1,13 +1,13 @@
-#! /usr/bin/env bun
+#!/usr/bin/env bun
 import fs from "node:fs";
 import path from "node:path";
 import { createInterface } from "node:readline";
 import { parseArgs } from "node:util";
 
+import { config } from "@basango/domain/config";
 import type { Article } from "@basango/domain/models";
 import { logger } from "@basango/logger";
 
-import { config } from "#crawler/config";
 import { forward } from "#crawler/process/persistence";
 
 const USAGE = `
@@ -31,7 +31,7 @@ const main = async (): Promise<void> => {
     return;
   }
 
-  const filePath = path.join(config.paths.data, `${sourceId}.jsonl`);
+  const filePath = path.join(config.crawler.paths.data, `${sourceId}.jsonl`);
 
   if (!fs.existsSync(filePath)) {
     logger.error({ filePath, sourceId }, "Source must be crawled first; JSONL not found");
@@ -1,4 +1,4 @@
-#! /usr/bin/env bun
+#!/usr/bin/env bun
 
 import { logger } from "@basango/logger";
 
@@ -1,4 +1,4 @@
-#! /usr/bin/env bun
+#!/usr/bin/env bun
 
 import { logger } from "@basango/logger";
 
@@ -1,28 +1,29 @@
+import {
+  AnySourceOptions,
+  HtmlSourceOptions,
+  WordPressSourceOptions,
+  config,
+} from "@basango/domain/config";
 import { DEFAULT_DATE_FORMAT } from "@basango/domain/constants";
 import {
-  AnySourceConfig,
   DateSpecSchema,
-  HtmlSourceConfig,
   PageRange,
   PageRangeSchema,
   PageSpecSchema,
   TimestampRange,
   TimestampRangeSchema,
-  WordPressSourceConfig,
-} from "@basango/domain/crawler";
+} from "@basango/domain/models";
 import { format, fromUnixTime, getUnixTime, isMatch, parse } from "date-fns";
 import type { RedisOptions } from "ioredis";
 
-import { config } from "#crawler/config";
-
 /**
  * Resolve a source configuration by its ID.
  * @param id - The source ID
  */
-export const resolveSourceConfig = (id: string): AnySourceConfig => {
+export const resolveSourceConfig = (id: string): AnySourceOptions => {
   const source =
-    config.sources.html.find((s: HtmlSourceConfig) => s.sourceId === id) ||
-    config.sources.wordpress.find((s: WordPressSourceConfig) => s.sourceId === id);
+    config.crawler.sources.html.find((s: HtmlSourceOptions) => s.sourceId === id) ||
+    config.crawler.sources.wordpress.find((s: WordPressSourceOptions) => s.sourceId === id);
 
   if (source === undefined) {
     throw new Error(`Source '${id}' not found in configuration`);