refactor: centralize configuration

This commit is contained in:
2025-11-23 19:54:32 +02:00
parent 57a8501c88
commit 72dfa53f80
78 changed files with 2252 additions and 1385 deletions
-7
View File
@@ -1,7 +0,0 @@
NODE_ENV=development
BASANGO_API_HOST=localhost
BASANGO_API_PORT=3080
BASANGO_API_ALLOWED_ORIGINS=http://localhost:3000,http://127.0.0.1:3000
BASANGO_API_KEY=your_api_key_here
BASANGO_CRAWLER_TOKEN=dev
BASANGO_JWT_SECRET=your_jwt_secret_here
-16
View File
@@ -1,16 +0,0 @@
{
"cors": {
"allowedHeaders": [
"Authorization",
"Content-Type",
"accept-language",
"x-trpc-source",
"x-user-locale",
"x-user-timezone",
"x-user-country"
],
"allowMethods": ["GET", "POST", "PUT", "DELETE", "OPTIONS", "PATCH"],
"exposeHeaders": ["Content-Length"],
"maxAge": 86400
}
}
-7
View File
@@ -1,7 +0,0 @@
{
"server": {
"host": "%env(BASANGO_API_HOST)%",
"port": "%env(number:BASANGO_API_PORT)%",
"version": "1.0.0"
}
}
-3
View File
@@ -4,13 +4,10 @@
"@basango/domain": "workspace:*",
"@basango/encryption": "workspace:*",
"@basango/logger": "workspace:*",
"@devscast/config": "catalog:",
"@hono/node-server": "^1.19.6",
"@hono/trpc-server": "^0.4.0",
"@hono/zod-openapi": "^1.1.4",
"@scalar/hono-api-reference": "^0.9.24",
"@trpc/server": "^11.7.1",
"ai": "^5.0.89",
"camelcase-keys": "^10.0.1",
"date-fns": "catalog:",
"hono-rate-limiter": "^0.4.2",
-45
View File
@@ -1,45 +0,0 @@
// Centralized runtime configuration for the API server.
// NOTE(review): this whole file is deleted in this commit (hunk @@ -1,45 +0,0);
// its role is presumably taken over by @basango/domain/config — confirm.
import path from "node:path";
import { loadConfig as defineConfig } from "@devscast/config";
import { z } from "zod";
// Absolute path to the package root (one directory above this file).
export const PROJECT_DIR = path.resolve(__dirname, "../");
// Validates the merged JSON sources (config/server.json + config/cors.json below).
const ServerConfigurationSchema = z.object({
cors: z.object({
allowedHeaders: z.array(z.string()).optional(),
allowMethods: z.array(z.string()).optional(),
exposeHeaders: z.array(z.string()).optional(),
// Preflight cache lifetime in seconds; must be a non-negative integer.
maxAge: z.number().int().min(0).optional(),
// Allowed CORS origins; defaults cover local dev and the production dashboard.
origin: z
.array(z.string())
.optional()
.default(["http://localhost:3000", "http://127.0.0.1:3000", "https://dashboard.basango.io"]),
}),
server: z.object({
host: z.string().default("localhost"),
// Valid TCP port range enforced by the schema; dev default is 4000.
port: z.number().int().min(1).max(65535).default(4000),
version: z.string().default("1.0.0"),
}),
});
// `env` resolves the listed keys from the package-root .env file;
// `config` is the schema-validated merge of the JSON sources below.
export const { env, config } = defineConfig({
env: {
// Only these environment variables are recognized by env().
knownKeys: [
"BASANGO_API_HOST",
"BASANGO_API_PORT",
"BASANGO_API_ALLOWED_ORIGINS",
"BASANGO_API_KEY",
"BASANGO_CRAWLER_TOKEN",
"BASANGO_JWT_SECRET",
],
path: path.join(PROJECT_DIR, ".env"),
},
schema: ServerConfigurationSchema,
// Later sources are merged over earlier ones per @devscast/config loading order
// — TODO confirm against that package's documentation.
sources: [
path.join(PROJECT_DIR, "config", "server.json"),
path.join(PROJECT_DIR, "config", "cors.json"),
],
});
// Static type derived from the schema for use by consumers of `config`.
export type ServerConfiguration = z.infer<typeof ServerConfigurationSchema>;
+8 -48
View File
@@ -1,11 +1,10 @@
import { config } from "@basango/domain/config";
import { trpcServer } from "@hono/trpc-server";
import { OpenAPIHono } from "@hono/zod-openapi";
import { Scalar } from "@scalar/hono-api-reference";
import { cors } from "hono/cors";
import { logger } from "hono/logger";
import { secureHeaders } from "hono/secure-headers";
import { config, env } from "#api/config";
import { routers } from "#api/rest/routers";
import { createTRPCContext } from "#api/trpc/init";
import { appRouter } from "#api/trpc/routers/_app";
@@ -18,11 +17,11 @@ app.use(secureHeaders());
app.use(
"*",
cors({
allowHeaders: config.cors.allowedHeaders,
allowMethods: config.cors.allowMethods,
exposeHeaders: config.cors.exposeHeaders,
maxAge: config.cors.maxAge,
origin: ["http://localhost:3000", "http://127.0.0.1:3000", "https://dashboard.basango.io"],
allowHeaders: config.api.cors.allowedHeaders,
allowMethods: config.api.cors.allowMethods,
exposeHeaders: config.api.cors.exposeHeaders,
maxAge: config.api.cors.maxAge,
origin: config.api.cors.origin,
}),
);
@@ -34,49 +33,10 @@ app.use(
}),
);
app.doc("/openapi", {
info: {
contact: {
email: "engineering@basango.io",
name: "Basango",
url: "https://basango.io",
},
description: "Basango is a platform that leverages AI to revolutionize news curation.",
license: {
name: "AGPL-3.0 license",
url: "https://github.com/bernard-ng/basango/blob/main/LICENSE",
},
title: "Basango API",
version: "0.0.1",
},
openapi: "3.1.0",
security: [
{
oauth2: [],
},
{ token: [] },
],
servers: [
{
description: "Production API",
url: "https://api.basango.io",
},
],
});
// Register security scheme
app.openAPIRegistry.registerComponent("securitySchemes", "token", {
description: "Default authentication mechanism",
scheme: "bearer",
type: "http",
"x-speakeasy-example": env("BASANGO_API_KEY"),
});
app.get("/", Scalar({ pageTitle: "Basango API", theme: "saturn", url: "/openapi" }));
app.route("/", routers);
export default {
fetch: app.fetch,
hostname: config.server.host,
port: config.server.port,
hostname: config.api.server.host,
port: config.api.server.port,
};
+2 -3
View File
@@ -1,8 +1,7 @@
import { config } from "@basango/domain/config";
import type { MiddlewareHandler } from "hono";
import { HTTPException } from "hono/http-exception";
import { env } from "#api/config";
export const withCrawlerAuth: MiddlewareHandler = async (c, next) => {
const token = c.req.header("Authorization");
@@ -10,7 +9,7 @@ export const withCrawlerAuth: MiddlewareHandler = async (c, next) => {
throw new HTTPException(401, { message: "Authorization header required" });
}
if (token !== env("BASANGO_CRAWLER_TOKEN")) {
if (token !== config.api.security.crawlerToken) {
throw new HTTPException(403, { message: "Invalid token" });
}
+1 -1
View File
@@ -13,7 +13,7 @@ export const authRouter = createTRPCRouter({
if (!user || user.isLocked) {
throw new TRPCError({
code: "UNAUTHORIZED",
message: "Invalid credentials.",
message: "Account is locked",
});
}
+10 -17
View File
@@ -1,15 +1,8 @@
import { Database } from "@basango/db/client";
import { getUserById } from "@basango/db/queries";
import {
DEFAULT_ACCESS_TOKEN_TTL,
DEFAULT_REFRESH_TOKEN_TTL,
DEFAULT_TOKEN_AUDIENCE,
DEFAULT_TOKEN_ISSUER,
} from "@basango/domain/constants";
import { config } from "@basango/domain/config";
import { type JWTPayload, SignJWT, jwtVerify } from "jose";
import { env } from "#api/config";
export type Session = {
user: {
id: string;
@@ -39,7 +32,7 @@ export type SessionTokens = {
const encoder = new TextEncoder();
function getSecretKey() {
return encoder.encode(env("BASANGO_JWT_SECRET"));
return encoder.encode(config.api.security.jwtSecret);
}
export async function getSession(db: Database, accessToken?: string): Promise<Session | null> {
@@ -74,24 +67,24 @@ async function createToken(session: Session, tokenType: TokenType, expiresIn: st
})
.setProtectedHeader({ alg: "HS256" })
.setIssuedAt()
.setAudience(DEFAULT_TOKEN_AUDIENCE)
.setIssuer(DEFAULT_TOKEN_ISSUER)
.setAudience(config.api.security.audience)
.setIssuer(config.api.security.issuer)
.setExpirationTime(expiresIn)
.sign(getSecretKey());
}
export async function createSessionTokens(session: Session): Promise<SessionTokens> {
const [accessToken, refreshToken] = await Promise.all([
createToken(session, "access", DEFAULT_ACCESS_TOKEN_TTL),
createToken(session, "refresh", DEFAULT_REFRESH_TOKEN_TTL),
createToken(session, "access", config.api.security.accessTokenTtl),
createToken(session, "refresh", config.api.security.refreshTokenTtl),
]);
const issuedAt = Date.now();
const accessTokenExpiresAt = new Date(
issuedAt + formatTTL(DEFAULT_ACCESS_TOKEN_TTL),
issuedAt + formatTTL(config.api.security.accessTokenTtl),
).toISOString();
const refreshTokenExpiresAt = new Date(
issuedAt + formatTTL(DEFAULT_REFRESH_TOKEN_TTL),
issuedAt + formatTTL(config.api.security.refreshTokenTtl),
).toISOString();
return {
@@ -118,8 +111,8 @@ async function verifyToken(
try {
const { payload } = await jwtVerify<VerifiedJWTPayload>(token, getSecretKey(), {
audience: DEFAULT_TOKEN_AUDIENCE,
issuer: DEFAULT_TOKEN_ISSUER,
audience: config.api.security.audience,
issuer: config.api.security.issuer,
});
if (payload.tokenType !== expectedType) {
-21
View File
@@ -1,21 +0,0 @@
# paths
BASANGO_CRAWLER_ROOT_PATH=
BASANGO_CRAWLER_DATA_PATH=
BASANGO_CRAWLER_LOGS_PATH=
BASANGO_CRAWLER_CONFIG_PATH=
# crawler settings
BASANGO_CRAWLER_UPDATE_DIRECTION=forward
BASANGO_CRAWLER_FETCH_USER_AGENT="Basango/0.1 (+https://github.com/bernard-ng/basango)"
BASANGO_CRAWLER_FETCH_MAX_RETRIES=3
BASANGO_CRAWLER_FETCH_RESPECT_RETRY_AFTER=true
BASANGO_CRAWLER_ASYNC_REDIS_URL="redis://localhost:6379/0"
BASANGO_CRAWLER_ASYNC_TTL_RESULT=3600
BASANGO_CRAWLER_ASYNC_TTL_FAILURE=3600
BASANGO_CRAWLER_ASYNC_QUEUE_LISTING="listing"
BASANGO_CRAWLER_ASYNC_QUEUE_DETAILS="details"
BASANGO_CRAWLER_ASYNC_QUEUE_PROCESSING="processing"
BASANGO_CRAWLER_TOKEN="dev"
BASANGO_CRAWLER_BACKEND_API_ENDPOINT="http://localhost:3080/articles"
-41
View File
@@ -1,41 +0,0 @@
{
"fetch": {
"async": {
"prefix": "basango:crawler",
"queues": {
"details": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_DETAILS)%",
"listing": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_LISTING)%",
"processing": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_PROCESSING)%"
},
"redisUrl": "%env(BASANGO_CRAWLER_ASYNC_REDIS_URL)%",
"ttl": {
"default": 600,
"failure": "%env(number:BASANGO_CRAWLER_ASYNC_TTL_FAILURE)%",
"result": "%env(number:BASANGO_CRAWLER_ASYNC_TTL_RESULT)%"
}
},
"client": {
"backoffInitial": 1,
"backoffMax": 30,
"backoffMultiplier": 2,
"followRedirects": true,
"maxRetries": "%env(number:BASANGO_CRAWLER_FETCH_MAX_RETRIES)%",
"respectRetryAfter": "%env(boolean:BASANGO_CRAWLER_FETCH_RESPECT_RETRY_AFTER)%",
"rotate": true,
"timeout": 20,
"userAgent": "%env(BASANGO_CRAWLER_FETCH_USER_AGENT)%",
"verifySsl": true
},
"crawler": {
"direction": "%env(BASANGO_CRAWLER_UPDATE_DIRECTION)%",
"maxWorkers": 5,
"notify": false,
"useMultiThreading": false
}
},
"paths": {
"config": "%env(BASANGO_CRAWLER_CONFIG_PATH)%",
"data": "%env(BASANGO_CRAWLER_DATA_PATH)%",
"root": "%env(BASANGO_CRAWLER_ROOT_PATH)%"
}
}
-210
View File
@@ -1,210 +0,0 @@
{
"sources": {
"html": [
{
"paginationTemplate": "actualite",
"requiresDetails": true,
"requiresRateLimit": false,
"sourceDate": {},
"sourceId": "radiookapi.net",
"sourceKind": "html",
"sourceSelectors": {
"articleBody": ".field-name-body",
"articleCategories": ".views-field-field-cat-gorie a",
"articleDate": "head > meta[property=\"article:published_time\"]",
"articleLink": ".views-field-title a",
"articles": ".view-content > .views-row.content-row",
"articleTitle": "h1.page-header",
"pagination": "ul.pagination > li.pager-last > a"
},
"sourceUrl": "https://www.radiookapi.net",
"supportsCategories": false
},
{
"categories": ["politique", "economie", "culture", "sport", "societe"],
"paginationTemplate": "index.php/category/{category}",
"requiresDetails": true,
"requiresRateLimit": false,
"sourceDate": {},
"sourceId": "7sur7.cd",
"sourceKind": "html",
"sourceSelectors": {
"articleBody": "div[property=\"schema:text\"].field.field--name-body",
"articleDate": "head > meta[property=\"article:published_time\"]",
"articleLink": ".views-field-title a",
"articles": ".view-content > .row.views-row",
"articleTitle": ".views-field-title a",
"pagination": "ul.pagination > li.pager__item.pager__item--last > a"
},
"sourceUrl": "https://7sur7.cd",
"supportsCategories": true
},
{
"paginationTemplate": "articles.html",
"requiresDetails": true,
"requiresRateLimit": false,
"sourceDate": {
"format": "dd.MM.yyyy"
},
"sourceId": "mediacongo.net",
"sourceKind": "html",
"sourceSelectors": {
"articleBody": ".article_ttext",
"articleCategories": "a.color_link",
"articleDate": ".article_other_about",
"articleLink": "a:first-child",
"articles": ".for_aitems > .article_other_item",
"articleTitle": "h1",
"pagination": "div.pagination > div > a:last-child"
},
"sourceUrl": "https://www.mediacongo.net",
"supportsCategories": false
},
{
"paginationTemplate": "actualite",
"requiresDetails": true,
"requiresRateLimit": false,
"sourceDate": {},
"sourceId": "actualite.cd",
"sourceKind": "html",
"sourceSelectors": {
"articleBody": ".views-field.views-field-body .field-content",
"articleCategories": "#actu-cat",
"articleDate": "head > meta[property=\"article:published_time\"]",
"articleLink": "#actu-titre a",
"articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
"articleTitle": "h1.page-title"
},
"sourceUrl": "https://actualite.cd",
"supportsCategories": false
}
],
"wordpress": [
{
"requiresRateLimit": true,
"sourceId": "beto.cd",
"sourceKind": "wordpress",
"sourceUrl": "https://beto.cd"
},
{ "sourceId": "newscd.net", "sourceKind": "wordpress", "sourceUrl": "https://newscd.net" },
{
"sourceId": "africanewsrdc.net",
"sourceKind": "wordpress",
"sourceUrl": "https://www.africanewsrdc.net"
},
{
"sourceId": "angazainstitute.ac.cd",
"sourceKind": "wordpress",
"sourceUrl": "https://angazainstitute.ac.cd"
},
{ "sourceId": "b-onetv.cd", "sourceKind": "wordpress", "sourceUrl": "https://b-onetv.cd" },
{
"sourceId": "bukavufm.com",
"sourceKind": "wordpress",
"sourceUrl": "https://bukavufm.com"
},
{
"sourceId": "changement7.net",
"sourceKind": "wordpress",
"sourceUrl": "https://changement7.net"
},
{
"sourceId": "congoactu.net",
"sourceKind": "wordpress",
"sourceUrl": "https://congoactu.net"
},
{
"sourceId": "congoindependant.com",
"sourceKind": "wordpress",
"sourceUrl": "https://www.congoindependant.com"
},
{
"sourceId": "congoquotidien.com",
"sourceKind": "wordpress",
"sourceUrl": "https://www.congoquotidien.com"
},
{
"sourceId": "cumulard.cd",
"sourceKind": "wordpress",
"sourceUrl": "https://www.cumulard.cd"
},
{
"sourceId": "environews-rdc.net",
"sourceKind": "wordpress",
"sourceUrl": "https://environews-rdc.net"
},
{
"sourceId": "freemediardc.info",
"sourceKind": "wordpress",
"sourceUrl": "https://www.freemediardc.info"
},
{
"sourceId": "geopolismagazine.org",
"sourceKind": "wordpress",
"sourceUrl": "https://geopolismagazine.org"
},
{
"sourceId": "habarirdc.net",
"sourceKind": "wordpress",
"sourceUrl": "https://habarirdc.net"
},
{ "sourceId": "infordc.com", "sourceKind": "wordpress", "sourceUrl": "https://infordc.com" },
{
"sourceId": "kilalopress.net",
"sourceKind": "wordpress",
"sourceUrl": "https://kilalopress.net"
},
{
"sourceId": "laprosperiteonline.net",
"sourceKind": "wordpress",
"sourceUrl": "https://laprosperiteonline.net"
},
{
"sourceId": "laprunellerdc.cd",
"sourceKind": "wordpress",
"sourceUrl": "https://laprunellerdc.cd"
},
{
"sourceId": "lesmedias.net",
"sourceKind": "wordpress",
"sourceUrl": "https://lesmedias.net"
},
{
"sourceId": "lesvolcansnews.net",
"sourceKind": "wordpress",
"sourceUrl": "https://lesvolcansnews.net"
},
{
"sourceId": "netic-news.net",
"sourceKind": "wordpress",
"sourceUrl": "https://www.netic-news.net"
},
{
"sourceId": "objectif-infos.cd",
"sourceKind": "wordpress",
"sourceUrl": "https://objectif-infos.cd"
},
{
"sourceId": "scooprdc.net",
"sourceKind": "wordpress",
"sourceUrl": "https://scooprdc.net"
},
{
"sourceId": "journaldekinshasa.com",
"sourceKind": "wordpress",
"sourceUrl": "https://www.journaldekinshasa.com"
},
{
"sourceId": "lepotentiel.cd",
"sourceKind": "wordpress",
"sourceUrl": "https://lepotentiel.cd"
},
{ "sourceId": "acturdc.com", "sourceKind": "wordpress", "sourceUrl": "https://acturdc.com" },
{
"sourceId": "matininfos.net",
"sourceKind": "wordpress",
"sourceUrl": "https://matininfos.net"
}
]
}
}
-81
View File
@@ -1,81 +0,0 @@
// Centralized configuration for the crawler pipeline (fetching, queuing, persistence).
// NOTE(review): deleted in this commit (hunk @@ -1,81 -0,0); schemas appear to move
// to @basango/domain/config per the other hunks — confirm.
import path from "node:path";
import {
HtmlSourceConfigSchema,
PageRangeSchema,
TimestampRangeSchema,
UpdateDirectionSchema,
WordPressSourceConfigSchema,
} from "@basango/domain/crawler";
import { loadConfig as defineConfig } from "@devscast/config";
import { z } from "zod";
// Absolute path to the package root (one directory above this file).
export const PROJECT_DIR = path.resolve(__dirname, "../");
// Validates the merged JSON sources (config/pipeline.json + config/sources.json below).
export const PipelineConfigSchema = z.object({
fetch: z.object({
// Settings for the async (Redis/BullMQ-style) task queues.
async: z.object({
// Key prefix applied to queue names.
prefix: z.string().default("basango:crawler:queue"),
queues: z.object({
details: z.string().default("details"),
listing: z.string().default("listing"),
processing: z.string().default("processing"),
}),
redisUrl: z.string().default("redis://localhost:6379/0"),
// TTLs in seconds; 0 is allowed for `failure`/`result` (nonnegative).
ttl: z.object({
default: z.number().int().positive().default(600),
failure: z.number().int().nonnegative().default(3600),
result: z.number().int().nonnegative().default(3600),
}),
}),
// HTTP client behavior: exponential backoff (initial * multiplier^attempt,
// capped at backoffMax), retries, redirects, UA rotation, TLS verification.
client: z.object({
backoffInitial: z.number().nonnegative().default(1),
backoffMax: z.number().nonnegative().default(30),
backoffMultiplier: z.number().positive().default(2),
followRedirects: z.boolean().default(true),
maxRetries: z.number().int().nonnegative().default(3),
respectRetryAfter: z.boolean().default(true),
rotate: z.boolean().default(true),
// Request timeout; consumed as seconds by the HTTP client (multiplied by 1000).
timeout: z.number().positive().default(20),
userAgent: z.string().default("Basango/0.1 (+https://github.com/bernard-ng/basango)"),
verifySsl: z.boolean().default(true),
}),
// Per-run crawl parameters; `source` binds the run to one HTML or WordPress source.
crawler: z.object({
category: z.string().optional(),
dateRange: TimestampRangeSchema.optional(),
direction: UpdateDirectionSchema.default("forward"),
isUpdate: z.boolean().default(false),
maxWorkers: z.number().int().positive().default(5),
notify: z.boolean().default(false),
pageRange: PageRangeSchema.optional(),
source: z.union([HtmlSourceConfigSchema, WordPressSourceConfigSchema]).optional(),
useMultiThreading: z.boolean().default(false),
}),
}),
// Filesystem layout; defaults are rooted at the package directory.
paths: z.object({
config: z.string().default(path.join(PROJECT_DIR, "config")),
data: z.string().default(path.join(PROJECT_DIR, "data", "datasets")),
root: z.string().default(PROJECT_DIR),
}),
// Registry of crawlable sources, grouped by kind.
sources: z.object({
html: z.array(HtmlSourceConfigSchema).default([]),
wordpress: z.array(WordPressSourceConfigSchema).default([]),
}),
});
// `config` is the schema-validated merge of the JSON sources below;
// `env` reads from the package-root .env file.
export const { config, env } = defineConfig({
cwd: process.cwd(),
env: {
path: path.join(PROJECT_DIR, ".env"),
},
schema: PipelineConfigSchema,
sources: [
path.join(PROJECT_DIR, "config", "pipeline.json"),
path.join(PROJECT_DIR, "config", "sources.json"),
],
});
// Convenience aliases for the nested config sections consumed elsewhere.
export type PipelineConfig = z.infer<typeof PipelineConfigSchema>;
export type FetchClientConfig = PipelineConfig["fetch"]["client"];
export type FetchCrawlerConfig = PipelineConfig["fetch"]["crawler"];
export type FetchAsyncConfig = PipelineConfig["fetch"]["async"];
+20 -20
View File
@@ -1,12 +1,12 @@
import { setTimeout as delay } from "node:timers/promises";
import type { CrawlerHttpOptions } from "@basango/domain/config";
import {
DEFAULT_RETRY_AFTER_HEADER,
DEFAULT_TRANSIENT_HTTP_STATUSES,
DEFAULT_USER_AGENT,
} from "@basango/domain/constants";
import { FetchClientConfig } from "#crawler/config";
import { UserAgents } from "#crawler/http/user-agent";
export type HttpHeaders = Record<string, string>;
@@ -71,7 +71,7 @@ const buildUrl = (url: string, params?: HttpParams): string => {
* @param config - Fetch client configuration
* @param attempt - Current attempt number
*/
const computeBackoff = (config: FetchClientConfig, attempt: number): number => {
const computeBackoff = (config: CrawlerHttpOptions, attempt: number): number => {
const base = Math.min(
config.backoffInitial * config.backoffMultiplier ** attempt,
config.backoffMax,
@@ -101,26 +101,26 @@ const parseRetryAfter = (header: string): number => {
* @author Bernard Ngandu <bernard@devscast.tech>
*/
export class BaseHttpClient {
protected readonly config: FetchClientConfig;
protected readonly options: CrawlerHttpOptions;
protected readonly fetchImpl: typeof fetch;
protected readonly sleep: (ms: number) => Promise<void>;
protected readonly headers: HttpHeaders;
constructor(config: FetchClientConfig, options: HttpClientOptions = {}) {
this.config = config;
constructor(options: CrawlerHttpOptions, clientOptions: HttpClientOptions = {}) {
this.options = options;
const provider =
options.userAgentProvider ??
new UserAgents(config.rotate, config.userAgent ?? DEFAULT_USER_AGENT);
const userAgent = provider.get() ?? config.userAgent ?? DEFAULT_USER_AGENT;
clientOptions.userAgentProvider ??
new UserAgents(options.rotate, options.userAgent ?? DEFAULT_USER_AGENT);
const userAgent = provider.get() ?? options.userAgent ?? DEFAULT_USER_AGENT;
const baseHeaders: HttpHeaders = { "User-Agent": userAgent };
if (options.defaultHeaders) {
Object.assign(baseHeaders, options.defaultHeaders);
if (clientOptions.defaultHeaders) {
Object.assign(baseHeaders, clientOptions.defaultHeaders);
}
this.headers = baseHeaders;
this.fetchImpl = options.fetchImpl ?? fetch;
this.sleep = options.sleep ?? defaultSleep;
this.fetchImpl = clientOptions.fetchImpl ?? fetch;
this.sleep = clientOptions.sleep ?? defaultSleep;
}
protected buildHeaders(headers?: HttpHeaders): HeadersInit {
@@ -136,13 +136,13 @@ export class BaseHttpClient {
if (response) {
const retryAfter = response.headers.get(retryAfterHeader);
if (retryAfter && this.config.respectRetryAfter) {
if (retryAfter && this.options.respectRetryAfter) {
waitMs = parseRetryAfter(retryAfter);
}
}
if (waitMs === 0) {
waitMs = computeBackoff(this.config, attempt);
waitMs = computeBackoff(this.options, attempt);
}
if (waitMs > 0) {
@@ -161,7 +161,7 @@ export class SyncHttpClient extends BaseHttpClient {
const retryAfterHeader = options.retryAfterHeader ?? DEFAULT_RETRY_AFTER_HEADER;
const target = buildUrl(url, options.params);
const maxAttempts = this.config.maxRetries + 1;
const maxAttempts = this.options.maxRetries + 1;
let attempt = 0;
let lastError: unknown;
@@ -169,14 +169,14 @@ export class SyncHttpClient extends BaseHttpClient {
const controller = new AbortController();
let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
try {
timeoutHandle = setTimeout(() => controller.abort(), this.config.timeout * 1000);
timeoutHandle = setTimeout(() => controller.abort(), this.options.timeout * 1000);
const headers = this.buildHeaders(options.headers);
const init: RequestInit = {
body: options.data as BodyInit | undefined,
headers,
method,
redirect: this.config.followRedirects ? "follow" : "manual",
redirect: this.options.followRedirects ? "follow" : "manual",
signal: controller.signal,
};
@@ -189,7 +189,7 @@ export class SyncHttpClient extends BaseHttpClient {
if (
DEFAULT_TRANSIENT_HTTP_STATUSES.includes(response.status as number) &&
attempt < this.config.maxRetries
attempt < this.options.maxRetries
) {
await this.maybeDelay(attempt, response, retryAfterHeader);
attempt += 1;
@@ -209,12 +209,12 @@ export class SyncHttpClient extends BaseHttpClient {
if (error instanceof DOMException && error.name === "AbortError") {
lastError = error;
if (attempt >= this.config.maxRetries) {
if (attempt >= this.options.maxRetries) {
throw error;
}
} else {
lastError = error;
if (attempt >= this.config.maxRetries) {
if (attempt >= this.options.maxRetries) {
throw error;
}
}
+15 -3
View File
@@ -1,8 +1,8 @@
import { config } from "@basango/domain/config";
import { DEFAULT_OPEN_GRAPH_USER_AGENT } from "@basango/domain/constants";
import { ArticleMetadata } from "@basango/domain/models";
import { parse } from "node-html-parser";
import { config } from "#crawler/config";
import { SyncHttpClient } from "#crawler/http/http-client";
import { UserAgents } from "#crawler/http/user-agent";
import { createAbsoluteUrl } from "#crawler/utils";
@@ -44,7 +44,7 @@ export class OpenGraph {
private readonly client: Pick<SyncHttpClient, "get">;
constructor() {
const settings = config.fetch.client;
const settings = config.crawler.fetch.client;
const provider = new UserAgents(true, DEFAULT_OPEN_GRAPH_USER_AGENT);
this.client = new SyncHttpClient(settings, {
@@ -89,16 +89,28 @@ export class OpenGraph {
root.querySelector("link[rel='canonical']")?.getAttribute("href") ?? null,
url ?? null,
]);
const author = pick([extract(root, "article:author"), extract(root, "og:article:author")]);
const publishedAt = pick([
extract(root, "article:published_time"),
extract(root, "og:article:published_time"),
]);
const updatedAt = pick([
extract(root, "article:modified_time"),
extract(root, "og:article:modified_time"),
]);
if (!title && !description && !image && !canonical) {
return undefined;
}
return {
author,
description,
image: createAbsoluteUrl(url, image ?? "") || undefined,
publishedAt,
title,
updatedAt,
url: createAbsoluteUrl(url, canonical ?? "") || undefined,
};
} as ArticleMetadata;
}
}
+3 -3
View File
@@ -1,4 +1,4 @@
import type { HtmlSourceConfig, WordPressSourceConfig } from "@basango/domain/crawler";
import type { HtmlSourceOptions, WordPressSourceOptions } from "@basango/domain/config";
import { Article } from "@basango/domain/models";
import { logger } from "@basango/logger";
@@ -24,7 +24,7 @@ export const collectHtmlListing = async (
payload: ListingTaskPayload,
manager: QueueManager = createQueueManager(),
): Promise<number> => {
const source = resolveSourceConfig(payload.sourceId) as HtmlSourceConfig;
const source = resolveSourceConfig(payload.sourceId) as HtmlSourceOptions;
if (source.sourceKind !== "html") {
return await collectWordPressListing(payload, manager);
}
@@ -63,7 +63,7 @@ export const collectWordPressListing = async (
payload: ListingTaskPayload,
manager: QueueManager = createQueueManager(),
): Promise<number> => {
const source = resolveSourceConfig(payload.sourceId) as WordPressSourceConfig;
const source = resolveSourceConfig(payload.sourceId) as WordPressSourceOptions;
if (source.sourceKind !== "wordpress") {
return await collectHtmlListing(payload, manager);
}
+25 -26
View File
@@ -1,9 +1,9 @@
import { randomUUID } from "node:crypto";
import { JobsOptions, Queue, QueueOptions } from "bullmq";
import { type CrawlerAsyncOptions, config } from "@basango/domain/config";
import { JobsOptions, Queue } from "bullmq";
import IORedis from "ioredis";
import { FetchAsyncConfig, config } from "#crawler/config";
import {
DetailsTaskPayload,
DetailsTaskPayloadSchema,
@@ -20,28 +20,27 @@ export interface QueueBackend<T = unknown> {
export type QueueFactory = (
queueName: string,
settings: FetchAsyncConfig,
options: CrawlerAsyncOptions,
connection?: IORedis,
) => QueueBackend;
const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => {
const defaultQueueFactory: QueueFactory = (queueName, options, connection) => {
const redisConnection =
connection ??
new IORedis(settings.redisUrl, {
...parseRedisUrl(settings.redisUrl),
new IORedis(options.redisUrl, {
...parseRedisUrl(options.redisUrl),
maxRetriesPerRequest: null,
});
const options: QueueOptions = {
connection: redisConnection,
prefix: settings.prefix,
};
const queue = new Queue(queueName, options);
const queue = new Queue(queueName, {
connection: redisConnection,
prefix: options.prefix,
});
return {
add: async (name, data, opts) => {
const job = await queue.add(name, data, {
removeOnComplete: settings.ttl.result === 0 ? true : undefined,
removeOnFail: settings.ttl.failure === 0 ? true : undefined,
removeOnComplete: options.ttl.result === 0 ? true : undefined,
removeOnFail: options.ttl.failure === 0 ? true : undefined,
...opts,
});
return { id: job.id ?? randomUUID() };
@@ -55,7 +54,7 @@ export interface CreateQueueManagerOptions {
}
export interface QueueManager {
readonly settings: FetchAsyncConfig;
readonly options: CrawlerAsyncOptions;
readonly connection: IORedis;
enqueueListing: (payload: ListingTaskPayload) => Promise<{ id: string }>;
enqueueArticle: (payload: DetailsTaskPayload) => Promise<{ id: string }>;
@@ -66,17 +65,17 @@ export interface QueueManager {
}
export const createQueueManager = (options: CreateQueueManagerOptions = {}): QueueManager => {
const settings = config.fetch.async;
const asyncOptions = config.crawler.fetch.async;
const connection =
options.connection ??
new IORedis(settings.redisUrl, {
...parseRedisUrl(settings.redisUrl),
new IORedis(asyncOptions.redisUrl, {
...parseRedisUrl(asyncOptions.redisUrl),
maxRetriesPerRequest: null,
});
const factory = options.queueFactory ?? defaultQueueFactory;
const ensureQueue = (queueName: string) => factory(queueName, settings, connection);
const ensureQueue = (queueName: string) => factory(queueName, asyncOptions, connection);
return {
close: async () => {
@@ -85,25 +84,25 @@ export const createQueueManager = (options: CreateQueueManagerOptions = {}): Que
connection,
enqueueArticle: (payload) => {
const data = DetailsTaskPayloadSchema.parse(payload);
const queue = ensureQueue(settings.queues.details);
const queue = ensureQueue(asyncOptions.queues.details);
return queue.add("collect_article", data);
},
enqueueListing: (payload) => {
const data = ListingTaskPayloadSchema.parse(payload);
const queue = ensureQueue(settings.queues.listing);
const queue = ensureQueue(asyncOptions.queues.listing);
return queue.add("collect_listing", data);
},
enqueueProcessed: (payload) => {
const data = ProcessingTaskPayloadSchema.parse(payload);
const queue = ensureQueue(settings.queues.processing);
const queue = ensureQueue(asyncOptions.queues.processing);
return queue.add("forward_for_processing", data);
},
iterQueueNames: () => [
settings.queues.listing,
settings.queues.details,
settings.queues.processing,
asyncOptions.queues.listing,
asyncOptions.queues.details,
asyncOptions.queues.processing,
],
queueName: (suffix: string) => `${settings.prefix}:${suffix}`,
settings,
options: asyncOptions,
queueName: (suffix: string) => `${asyncOptions.prefix}:${suffix}`,
};
};
+1 -2
View File
@@ -1,5 +1,4 @@
import { PageRangeSchema, TimestampRangeSchema } from "@basango/domain/crawler";
import { articleSchema } from "@basango/domain/models";
import { PageRangeSchema, TimestampRangeSchema, articleSchema } from "@basango/domain/models";
import { z } from "zod";
export const ListingTaskPayloadSchema = z.object({
+2 -2
View File
@@ -45,7 +45,7 @@ export const startWorker = (options: WorkerOptions): WorkerHandle => {
{
concurrency: options.concurrency ?? 5,
connection,
prefix: manager.settings.prefix,
prefix: manager.options.prefix,
},
);
@@ -56,7 +56,7 @@ export const startWorker = (options: WorkerOptions): WorkerHandle => {
const queueEvents = new QueueEvents(queueName, {
connection,
prefix: manager.settings.prefix,
prefix: manager.options.prefix,
});
workers.push(worker);
+6 -7
View File
@@ -1,7 +1,6 @@
import type { AnySourceConfig } from "@basango/domain/crawler";
import { AnySourceOptions, CrawlerFetchingOptions, config } from "@basango/domain/config";
import logger from "@basango/logger";
import { FetchCrawlerConfig, config } from "#crawler/config";
import { JsonlPersistor, Persistor } from "#crawler/process/persistence";
import { createPageRange, createTimestampRange } from "#crawler/utils";
@@ -13,11 +12,11 @@ export interface CrawlingOptions {
}
export const resolveCrawlerConfig = (
source: AnySourceConfig,
source: AnySourceOptions,
options: CrawlingOptions,
): FetchCrawlerConfig => {
): CrawlerFetchingOptions => {
return {
...config.fetch.crawler,
...config.crawler.fetch.crawler,
category: options.category,
dateRange: createTimestampRange(options.dateRange),
pageRange: createPageRange(options.pageRange),
@@ -25,10 +24,10 @@ export const resolveCrawlerConfig = (
};
};
export const createPersistors = (source: AnySourceConfig): Persistor[] => {
export const createPersistors = (source: AnySourceOptions): Persistor[] => {
return [
new JsonlPersistor({
directory: config.paths.data,
directory: config.crawler.paths.data,
sourceId: source.sourceId,
}),
];
+9 -10
View File
@@ -1,8 +1,7 @@
import type { AnySourceConfig } from "@basango/domain/crawler";
import { AnySourceOptions, CrawlerFetchingOptions, config } from "@basango/domain/config";
import { Article } from "@basango/domain/models";
import { HTMLElement, parse as parseHtml } from "node-html-parser";
import { FetchCrawlerConfig, config } from "#crawler/config";
import { SyncHttpClient } from "#crawler/http/http-client";
import { OpenGraph } from "#crawler/http/open-graph";
import type { Persistor } from "#crawler/process/persistence";
@@ -12,23 +11,23 @@ export interface CrawlerOptions {
}
export abstract class BaseCrawler {
protected readonly settings: FetchCrawlerConfig;
protected readonly source: AnySourceConfig;
protected readonly options: CrawlerFetchingOptions;
protected readonly source: AnySourceOptions;
protected readonly http: SyncHttpClient;
protected readonly persistors: Persistor[];
protected readonly openGraph: OpenGraph;
protected constructor(settings: FetchCrawlerConfig, options: CrawlerOptions = {}) {
if (!settings.source) {
protected constructor(options: CrawlerFetchingOptions, crawlerOptions: CrawlerOptions = {}) {
if (!options.source) {
throw new Error("Crawler requires a bound source");
}
this.http = new SyncHttpClient(config.fetch.client);
this.persistors = options.persistors ?? [];
this.http = new SyncHttpClient(config.crawler.fetch.client);
this.persistors = crawlerOptions.persistors ?? [];
this.openGraph = new OpenGraph();
this.settings = settings;
this.source = settings.source as AnySourceConfig;
this.options = options;
this.source = options.source as AnySourceOptions;
}
/**
+9 -10
View File
@@ -1,11 +1,10 @@
import type { HtmlSourceConfig, TimestampRange } from "@basango/domain/crawler";
import { Article } from "@basango/domain/models";
import { CrawlerFetchingOptions, HtmlSourceOptions } from "@basango/domain/config";
import { Article, TimestampRange } from "@basango/domain/models";
import { logger } from "@basango/logger";
import { fromUnixTime, getUnixTime, isMatch as isDateMatch, parse } from "date-fns";
import { HTMLElement } from "node-html-parser";
import TurndownService from "turndown";
import { FetchCrawlerConfig } from "#crawler/config";
import {
ArticleOutOfDateRangeError,
InvalidArticleError,
@@ -26,21 +25,21 @@ const md = new TurndownService({
* Crawler for generic HTML pages.
*/
export class HtmlCrawler extends BaseCrawler {
readonly source: HtmlSourceConfig;
readonly source: HtmlSourceOptions;
private currentNode: string | null = null;
constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
constructor(settings: CrawlerFetchingOptions, options: { persistors?: Persistor[] } = {}) {
super(settings, options);
if (!settings.source || settings.source.sourceKind !== "html") {
throw new UnsupportedSourceKindError("HtmlCrawler requires a source of kind 'html'");
}
this.source = this.settings.source as HtmlSourceConfig;
this.source = this.options.source as HtmlSourceOptions;
}
async fetch(): Promise<void> {
const pageRange = this.settings.pageRange ?? (await this.getPagination());
const dateRange = this.settings.dateRange;
const pageRange = this.options.pageRange ?? (await this.getPagination());
const dateRange = this.options.dateRange;
const selectors = this.source.sourceSelectors;
if (!selectors.articles) {
@@ -218,7 +217,7 @@ export class HtmlCrawler extends BaseCrawler {
*/
private applyCategory(template: string): string {
if (template.includes("{category}")) {
const replacement = this.settings.category ?? "";
const replacement = this.options.category ?? "";
return template.replace("{category}", replacement);
}
return template;
@@ -297,7 +296,7 @@ export class HtmlCrawler extends BaseCrawler {
* @param selector - The CSS selector
*/
private extractCategories(root: HTMLElement, selector?: string | null): string[] {
if (!selector && this.settings.category) return [this.settings.category.toLowerCase()];
if (!selector && this.options.category) return [this.options.category.toLowerCase()];
if (!selector) return [];
const values: string[] = [];
@@ -1,10 +1,9 @@
import type { PageRange, TimestampRange, WordPressSourceConfig } from "@basango/domain/crawler";
import { Article } from "@basango/domain/models";
import { CrawlerFetchingOptions, WordPressSourceOptions } from "@basango/domain/config";
import { Article, PageRange, TimestampRange } from "@basango/domain/models";
import { logger } from "@basango/logger";
import { fromUnixTime } from "date-fns";
import TurndownService from "turndown";
import { FetchCrawlerConfig } from "#crawler/config";
import {
ArticleOutOfDateRangeError,
InvalidArticleError,
@@ -33,7 +32,7 @@ interface WordPressPost {
* Crawler for WordPress sites using the REST API.
*/
export class WordPressCrawler extends BaseCrawler {
readonly source: WordPressSourceConfig;
readonly source: WordPressSourceOptions;
private categoryMap: Map<number, string> = new Map();
public static readonly POST_QUERY =
@@ -43,7 +42,7 @@ export class WordPressCrawler extends BaseCrawler {
public static readonly TOTAL_PAGES_HEADER = "x-wp-totalpages";
public static readonly TOTAL_POSTS_HEADER = "x-wp-total";
constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
constructor(settings: CrawlerFetchingOptions, options: { persistors?: Persistor[] } = {}) {
super(settings, options);
if (!settings.source || settings.source.sourceKind !== "wordpress") {
@@ -51,15 +50,15 @@ export class WordPressCrawler extends BaseCrawler {
"WordPressCrawler requires a source of kind 'wordpress'",
);
}
this.source = this.settings.source as WordPressSourceConfig;
this.source = this.options.source as WordPressSourceOptions;
}
/**
* Fetch and process WordPress posts.
*/
async fetch(): Promise<void> {
const pageRange = this.settings.pageRange ?? (await this.getPagination());
const dateRange = this.settings.dateRange;
const pageRange = this.options.pageRange ?? (await this.getPagination());
const dateRange = this.options.dateRange;
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
const endpoint = this.buildEndpointUrl(page);
+4 -4
View File
@@ -1,11 +1,11 @@
import fs from "node:fs";
import path from "node:path";
import { config } from "@basango/domain/config";
import type { Article } from "@basango/domain/models";
import { md5 } from "@basango/encryption";
import logger from "@basango/logger";
import { config, env } from "#crawler/config";
import { HttpError, SyncHttpClient } from "#crawler/http/http-client";
export interface Persistor {
@@ -66,9 +66,9 @@ export const persist = async (
};
export const forward = async (payload: Partial<Article>): Promise<void> => {
const client = new SyncHttpClient(config.fetch.client);
const endpoint = env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT");
const token = env("BASANGO_CRAWLER_TOKEN");
const client = new SyncHttpClient(config.crawler.fetch.client);
const endpoint = config.crawler.backend.endpoint;
const token = config.crawler.backend.token;
try {
const response = await client.post(endpoint, {
+3 -3
View File
@@ -1,13 +1,13 @@
#! /usr/bin/env bun
#!/usr/bin/env bun
import fs from "node:fs";
import path from "node:path";
import { createInterface } from "node:readline";
import { parseArgs } from "node:util";
import { config } from "@basango/domain/config";
import type { Article } from "@basango/domain/models";
import { logger } from "@basango/logger";
import { config } from "#crawler/config";
import { forward } from "#crawler/process/persistence";
const USAGE = `
@@ -31,7 +31,7 @@ const main = async (): Promise<void> => {
return;
}
const filePath = path.join(config.paths.data, `${sourceId}.jsonl`);
const filePath = path.join(config.crawler.paths.data, `${sourceId}.jsonl`);
if (!fs.existsSync(filePath)) {
logger.error({ filePath, sourceId }, "Source must be crawled first; JSONL not found");
+1 -1
View File
@@ -1,4 +1,4 @@
#! /usr/bin/env bun
#!/usr/bin/env bun
import { logger } from "@basango/logger";
+1 -1
View File
@@ -1,4 +1,4 @@
#! /usr/bin/env bun
#!/usr/bin/env bun
import { logger } from "@basango/logger";
+10 -9
View File
@@ -1,28 +1,29 @@
import {
AnySourceOptions,
HtmlSourceOptions,
WordPressSourceOptions,
config,
} from "@basango/domain/config";
import { DEFAULT_DATE_FORMAT } from "@basango/domain/constants";
import {
AnySourceConfig,
DateSpecSchema,
HtmlSourceConfig,
PageRange,
PageRangeSchema,
PageSpecSchema,
TimestampRange,
TimestampRangeSchema,
WordPressSourceConfig,
} from "@basango/domain/crawler";
} from "@basango/domain/models";
import { format, fromUnixTime, getUnixTime, isMatch, parse } from "date-fns";
import type { RedisOptions } from "ioredis";
import { config } from "#crawler/config";
/**
* Resolve a source configuration by its ID.
* @param id - The source ID
*/
export const resolveSourceConfig = (id: string): AnySourceConfig => {
export const resolveSourceConfig = (id: string): AnySourceOptions => {
const source =
config.sources.html.find((s: HtmlSourceConfig) => s.sourceId === id) ||
config.sources.wordpress.find((s: WordPressSourceConfig) => s.sourceId === id);
config.crawler.sources.html.find((s: HtmlSourceOptions) => s.sourceId === id) ||
config.crawler.sources.wordpress.find((s: WordPressSourceOptions) => s.sourceId === id);
if (source === undefined) {
throw new Error(`Source '${id}' not found in configuration`);