refactor: centralize configuration
This commit is contained in:
@@ -1,7 +0,0 @@
|
||||
NODE_ENV=development
|
||||
BASANGO_API_HOST=localhost
|
||||
BASANGO_API_PORT=3080
|
||||
BASANGO_API_ALLOWED_ORIGINS=http://localhost:3000,http://127.0.0.1:3000
|
||||
BASANGO_API_KEY=your_api_key_here
|
||||
BASANGO_CRAWLER_TOKEN=dev
|
||||
BASANGO_JWT_SECRET=your_jwt_secret_here
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"cors": {
|
||||
"allowedHeaders": [
|
||||
"Authorization",
|
||||
"Content-Type",
|
||||
"accept-language",
|
||||
"x-trpc-source",
|
||||
"x-user-locale",
|
||||
"x-user-timezone",
|
||||
"x-user-country"
|
||||
],
|
||||
"allowMethods": ["GET", "POST", "PUT", "DELETE", "OPTIONS", "PATCH"],
|
||||
"exposeHeaders": ["Content-Length"],
|
||||
"maxAge": 86400
|
||||
}
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
{
|
||||
"server": {
|
||||
"host": "%env(BASANGO_API_HOST)%",
|
||||
"port": "%env(number:BASANGO_API_PORT)%",
|
||||
"version": "1.0.0"
|
||||
}
|
||||
}
|
||||
@@ -4,13 +4,10 @@
|
||||
"@basango/domain": "workspace:*",
|
||||
"@basango/encryption": "workspace:*",
|
||||
"@basango/logger": "workspace:*",
|
||||
"@devscast/config": "catalog:",
|
||||
"@hono/node-server": "^1.19.6",
|
||||
"@hono/trpc-server": "^0.4.0",
|
||||
"@hono/zod-openapi": "^1.1.4",
|
||||
"@scalar/hono-api-reference": "^0.9.24",
|
||||
"@trpc/server": "^11.7.1",
|
||||
"ai": "^5.0.89",
|
||||
"camelcase-keys": "^10.0.1",
|
||||
"date-fns": "catalog:",
|
||||
"hono-rate-limiter": "^0.4.2",
|
||||
|
||||
@@ -1,45 +0,0 @@
|
||||
import path from "node:path";
|
||||
|
||||
import { loadConfig as defineConfig } from "@devscast/config";
|
||||
import { z } from "zod";
|
||||
|
||||
export const PROJECT_DIR = path.resolve(__dirname, "../");
|
||||
|
||||
const ServerConfigurationSchema = z.object({
|
||||
cors: z.object({
|
||||
allowedHeaders: z.array(z.string()).optional(),
|
||||
allowMethods: z.array(z.string()).optional(),
|
||||
exposeHeaders: z.array(z.string()).optional(),
|
||||
maxAge: z.number().int().min(0).optional(),
|
||||
origin: z
|
||||
.array(z.string())
|
||||
.optional()
|
||||
.default(["http://localhost:3000", "http://127.0.0.1:3000", "https://dashboard.basango.io"]),
|
||||
}),
|
||||
server: z.object({
|
||||
host: z.string().default("localhost"),
|
||||
port: z.number().int().min(1).max(65535).default(4000),
|
||||
version: z.string().default("1.0.0"),
|
||||
}),
|
||||
});
|
||||
|
||||
export const { env, config } = defineConfig({
|
||||
env: {
|
||||
knownKeys: [
|
||||
"BASANGO_API_HOST",
|
||||
"BASANGO_API_PORT",
|
||||
"BASANGO_API_ALLOWED_ORIGINS",
|
||||
"BASANGO_API_KEY",
|
||||
"BASANGO_CRAWLER_TOKEN",
|
||||
"BASANGO_JWT_SECRET",
|
||||
],
|
||||
path: path.join(PROJECT_DIR, ".env"),
|
||||
},
|
||||
schema: ServerConfigurationSchema,
|
||||
sources: [
|
||||
path.join(PROJECT_DIR, "config", "server.json"),
|
||||
path.join(PROJECT_DIR, "config", "cors.json"),
|
||||
],
|
||||
});
|
||||
|
||||
export type ServerConfiguration = z.infer<typeof ServerConfigurationSchema>;
|
||||
+8
-48
@@ -1,11 +1,10 @@
|
||||
import { config } from "@basango/domain/config";
|
||||
import { trpcServer } from "@hono/trpc-server";
|
||||
import { OpenAPIHono } from "@hono/zod-openapi";
|
||||
import { Scalar } from "@scalar/hono-api-reference";
|
||||
import { cors } from "hono/cors";
|
||||
import { logger } from "hono/logger";
|
||||
import { secureHeaders } from "hono/secure-headers";
|
||||
|
||||
import { config, env } from "#api/config";
|
||||
import { routers } from "#api/rest/routers";
|
||||
import { createTRPCContext } from "#api/trpc/init";
|
||||
import { appRouter } from "#api/trpc/routers/_app";
|
||||
@@ -18,11 +17,11 @@ app.use(secureHeaders());
|
||||
app.use(
|
||||
"*",
|
||||
cors({
|
||||
allowHeaders: config.cors.allowedHeaders,
|
||||
allowMethods: config.cors.allowMethods,
|
||||
exposeHeaders: config.cors.exposeHeaders,
|
||||
maxAge: config.cors.maxAge,
|
||||
origin: ["http://localhost:3000", "http://127.0.0.1:3000", "https://dashboard.basango.io"],
|
||||
allowHeaders: config.api.cors.allowedHeaders,
|
||||
allowMethods: config.api.cors.allowMethods,
|
||||
exposeHeaders: config.api.cors.exposeHeaders,
|
||||
maxAge: config.api.cors.maxAge,
|
||||
origin: config.api.cors.origin,
|
||||
}),
|
||||
);
|
||||
|
||||
@@ -34,49 +33,10 @@ app.use(
|
||||
}),
|
||||
);
|
||||
|
||||
app.doc("/openapi", {
|
||||
info: {
|
||||
contact: {
|
||||
email: "engineering@basango.io",
|
||||
name: "Basango",
|
||||
url: "https://basango.io",
|
||||
},
|
||||
description: "Basango is a platform that leverages AI to revolutionize news curation.",
|
||||
license: {
|
||||
name: "AGPL-3.0 license",
|
||||
url: "https://github.com/bernard-ng/basango/blob/main/LICENSE",
|
||||
},
|
||||
title: "Basango API",
|
||||
version: "0.0.1",
|
||||
},
|
||||
openapi: "3.1.0",
|
||||
security: [
|
||||
{
|
||||
oauth2: [],
|
||||
},
|
||||
{ token: [] },
|
||||
],
|
||||
servers: [
|
||||
{
|
||||
description: "Production API",
|
||||
url: "https://api.basango.io",
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
// Register security scheme
|
||||
app.openAPIRegistry.registerComponent("securitySchemes", "token", {
|
||||
description: "Default authentication mechanism",
|
||||
scheme: "bearer",
|
||||
type: "http",
|
||||
"x-speakeasy-example": env("BASANGO_API_KEY"),
|
||||
});
|
||||
|
||||
app.get("/", Scalar({ pageTitle: "Basango API", theme: "saturn", url: "/openapi" }));
|
||||
app.route("/", routers);
|
||||
|
||||
export default {
|
||||
fetch: app.fetch,
|
||||
hostname: config.server.host,
|
||||
port: config.server.port,
|
||||
hostname: config.api.server.host,
|
||||
port: config.api.server.port,
|
||||
};
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
import { config } from "@basango/domain/config";
|
||||
import type { MiddlewareHandler } from "hono";
|
||||
import { HTTPException } from "hono/http-exception";
|
||||
|
||||
import { env } from "#api/config";
|
||||
|
||||
export const withCrawlerAuth: MiddlewareHandler = async (c, next) => {
|
||||
const token = c.req.header("Authorization");
|
||||
|
||||
@@ -10,7 +9,7 @@ export const withCrawlerAuth: MiddlewareHandler = async (c, next) => {
|
||||
throw new HTTPException(401, { message: "Authorization header required" });
|
||||
}
|
||||
|
||||
if (token !== env("BASANGO_CRAWLER_TOKEN")) {
|
||||
if (token !== config.api.security.crawlerToken) {
|
||||
throw new HTTPException(403, { message: "Invalid token" });
|
||||
}
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ export const authRouter = createTRPCRouter({
|
||||
if (!user || user.isLocked) {
|
||||
throw new TRPCError({
|
||||
code: "UNAUTHORIZED",
|
||||
message: "Invalid credentials.",
|
||||
message: "Account is locked",
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
+10
-17
@@ -1,15 +1,8 @@
|
||||
import { Database } from "@basango/db/client";
|
||||
import { getUserById } from "@basango/db/queries";
|
||||
import {
|
||||
DEFAULT_ACCESS_TOKEN_TTL,
|
||||
DEFAULT_REFRESH_TOKEN_TTL,
|
||||
DEFAULT_TOKEN_AUDIENCE,
|
||||
DEFAULT_TOKEN_ISSUER,
|
||||
} from "@basango/domain/constants";
|
||||
import { config } from "@basango/domain/config";
|
||||
import { type JWTPayload, SignJWT, jwtVerify } from "jose";
|
||||
|
||||
import { env } from "#api/config";
|
||||
|
||||
export type Session = {
|
||||
user: {
|
||||
id: string;
|
||||
@@ -39,7 +32,7 @@ export type SessionTokens = {
|
||||
const encoder = new TextEncoder();
|
||||
|
||||
function getSecretKey() {
|
||||
return encoder.encode(env("BASANGO_JWT_SECRET"));
|
||||
return encoder.encode(config.api.security.jwtSecret);
|
||||
}
|
||||
|
||||
export async function getSession(db: Database, accessToken?: string): Promise<Session | null> {
|
||||
@@ -74,24 +67,24 @@ async function createToken(session: Session, tokenType: TokenType, expiresIn: st
|
||||
})
|
||||
.setProtectedHeader({ alg: "HS256" })
|
||||
.setIssuedAt()
|
||||
.setAudience(DEFAULT_TOKEN_AUDIENCE)
|
||||
.setIssuer(DEFAULT_TOKEN_ISSUER)
|
||||
.setAudience(config.api.security.audience)
|
||||
.setIssuer(config.api.security.issuer)
|
||||
.setExpirationTime(expiresIn)
|
||||
.sign(getSecretKey());
|
||||
}
|
||||
|
||||
export async function createSessionTokens(session: Session): Promise<SessionTokens> {
|
||||
const [accessToken, refreshToken] = await Promise.all([
|
||||
createToken(session, "access", DEFAULT_ACCESS_TOKEN_TTL),
|
||||
createToken(session, "refresh", DEFAULT_REFRESH_TOKEN_TTL),
|
||||
createToken(session, "access", config.api.security.accessTokenTtl),
|
||||
createToken(session, "refresh", config.api.security.refreshTokenTtl),
|
||||
]);
|
||||
|
||||
const issuedAt = Date.now();
|
||||
const accessTokenExpiresAt = new Date(
|
||||
issuedAt + formatTTL(DEFAULT_ACCESS_TOKEN_TTL),
|
||||
issuedAt + formatTTL(config.api.security.accessTokenTtl),
|
||||
).toISOString();
|
||||
const refreshTokenExpiresAt = new Date(
|
||||
issuedAt + formatTTL(DEFAULT_REFRESH_TOKEN_TTL),
|
||||
issuedAt + formatTTL(config.api.security.refreshTokenTtl),
|
||||
).toISOString();
|
||||
|
||||
return {
|
||||
@@ -118,8 +111,8 @@ async function verifyToken(
|
||||
|
||||
try {
|
||||
const { payload } = await jwtVerify<VerifiedJWTPayload>(token, getSecretKey(), {
|
||||
audience: DEFAULT_TOKEN_AUDIENCE,
|
||||
issuer: DEFAULT_TOKEN_ISSUER,
|
||||
audience: config.api.security.audience,
|
||||
issuer: config.api.security.issuer,
|
||||
});
|
||||
|
||||
if (payload.tokenType !== expectedType) {
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
# paths
|
||||
BASANGO_CRAWLER_ROOT_PATH=
|
||||
BASANGO_CRAWLER_DATA_PATH=
|
||||
BASANGO_CRAWLER_LOGS_PATH=
|
||||
BASANGO_CRAWLER_CONFIG_PATH=
|
||||
|
||||
# crawler settings
|
||||
BASANGO_CRAWLER_UPDATE_DIRECTION=forward
|
||||
BASANGO_CRAWLER_FETCH_USER_AGENT="Basango/0.1 (+https://github.com/bernard-ng/basango)"
|
||||
BASANGO_CRAWLER_FETCH_MAX_RETRIES=3
|
||||
BASANGO_CRAWLER_FETCH_RESPECT_RETRY_AFTER=true
|
||||
|
||||
BASANGO_CRAWLER_ASYNC_REDIS_URL="redis://localhost:6379/0"
|
||||
BASANGO_CRAWLER_ASYNC_TTL_RESULT=3600
|
||||
BASANGO_CRAWLER_ASYNC_TTL_FAILURE=3600
|
||||
BASANGO_CRAWLER_ASYNC_QUEUE_LISTING="listing"
|
||||
BASANGO_CRAWLER_ASYNC_QUEUE_DETAILS="details"
|
||||
BASANGO_CRAWLER_ASYNC_QUEUE_PROCESSING="processing"
|
||||
|
||||
BASANGO_CRAWLER_TOKEN="dev"
|
||||
BASANGO_CRAWLER_BACKEND_API_ENDPOINT="http://localhost:3080/articles"
|
||||
@@ -1,41 +0,0 @@
|
||||
{
|
||||
"fetch": {
|
||||
"async": {
|
||||
"prefix": "basango:crawler",
|
||||
"queues": {
|
||||
"details": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_DETAILS)%",
|
||||
"listing": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_LISTING)%",
|
||||
"processing": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_PROCESSING)%"
|
||||
},
|
||||
"redisUrl": "%env(BASANGO_CRAWLER_ASYNC_REDIS_URL)%",
|
||||
"ttl": {
|
||||
"default": 600,
|
||||
"failure": "%env(number:BASANGO_CRAWLER_ASYNC_TTL_FAILURE)%",
|
||||
"result": "%env(number:BASANGO_CRAWLER_ASYNC_TTL_RESULT)%"
|
||||
}
|
||||
},
|
||||
"client": {
|
||||
"backoffInitial": 1,
|
||||
"backoffMax": 30,
|
||||
"backoffMultiplier": 2,
|
||||
"followRedirects": true,
|
||||
"maxRetries": "%env(number:BASANGO_CRAWLER_FETCH_MAX_RETRIES)%",
|
||||
"respectRetryAfter": "%env(boolean:BASANGO_CRAWLER_FETCH_RESPECT_RETRY_AFTER)%",
|
||||
"rotate": true,
|
||||
"timeout": 20,
|
||||
"userAgent": "%env(BASANGO_CRAWLER_FETCH_USER_AGENT)%",
|
||||
"verifySsl": true
|
||||
},
|
||||
"crawler": {
|
||||
"direction": "%env(BASANGO_CRAWLER_UPDATE_DIRECTION)%",
|
||||
"maxWorkers": 5,
|
||||
"notify": false,
|
||||
"useMultiThreading": false
|
||||
}
|
||||
},
|
||||
"paths": {
|
||||
"config": "%env(BASANGO_CRAWLER_CONFIG_PATH)%",
|
||||
"data": "%env(BASANGO_CRAWLER_DATA_PATH)%",
|
||||
"root": "%env(BASANGO_CRAWLER_ROOT_PATH)%"
|
||||
}
|
||||
}
|
||||
@@ -1,210 +0,0 @@
|
||||
{
|
||||
"sources": {
|
||||
"html": [
|
||||
{
|
||||
"paginationTemplate": "actualite",
|
||||
"requiresDetails": true,
|
||||
"requiresRateLimit": false,
|
||||
"sourceDate": {},
|
||||
"sourceId": "radiookapi.net",
|
||||
"sourceKind": "html",
|
||||
"sourceSelectors": {
|
||||
"articleBody": ".field-name-body",
|
||||
"articleCategories": ".views-field-field-cat-gorie a",
|
||||
"articleDate": "head > meta[property=\"article:published_time\"]",
|
||||
"articleLink": ".views-field-title a",
|
||||
"articles": ".view-content > .views-row.content-row",
|
||||
"articleTitle": "h1.page-header",
|
||||
"pagination": "ul.pagination > li.pager-last > a"
|
||||
},
|
||||
"sourceUrl": "https://www.radiookapi.net",
|
||||
"supportsCategories": false
|
||||
},
|
||||
{
|
||||
"categories": ["politique", "economie", "culture", "sport", "societe"],
|
||||
"paginationTemplate": "index.php/category/{category}",
|
||||
"requiresDetails": true,
|
||||
"requiresRateLimit": false,
|
||||
"sourceDate": {},
|
||||
"sourceId": "7sur7.cd",
|
||||
"sourceKind": "html",
|
||||
"sourceSelectors": {
|
||||
"articleBody": "div[property=\"schema:text\"].field.field--name-body",
|
||||
"articleDate": "head > meta[property=\"article:published_time\"]",
|
||||
"articleLink": ".views-field-title a",
|
||||
"articles": ".view-content > .row.views-row",
|
||||
"articleTitle": ".views-field-title a",
|
||||
"pagination": "ul.pagination > li.pager__item.pager__item--last > a"
|
||||
},
|
||||
"sourceUrl": "https://7sur7.cd",
|
||||
"supportsCategories": true
|
||||
},
|
||||
{
|
||||
"paginationTemplate": "articles.html",
|
||||
"requiresDetails": true,
|
||||
"requiresRateLimit": false,
|
||||
"sourceDate": {
|
||||
"format": "dd.MM.yyyy"
|
||||
},
|
||||
"sourceId": "mediacongo.net",
|
||||
"sourceKind": "html",
|
||||
"sourceSelectors": {
|
||||
"articleBody": ".article_ttext",
|
||||
"articleCategories": "a.color_link",
|
||||
"articleDate": ".article_other_about",
|
||||
"articleLink": "a:first-child",
|
||||
"articles": ".for_aitems > .article_other_item",
|
||||
"articleTitle": "h1",
|
||||
"pagination": "div.pagination > div > a:last-child"
|
||||
},
|
||||
"sourceUrl": "https://www.mediacongo.net",
|
||||
"supportsCategories": false
|
||||
},
|
||||
{
|
||||
"paginationTemplate": "actualite",
|
||||
"requiresDetails": true,
|
||||
"requiresRateLimit": false,
|
||||
"sourceDate": {},
|
||||
"sourceId": "actualite.cd",
|
||||
"sourceKind": "html",
|
||||
"sourceSelectors": {
|
||||
"articleBody": ".views-field.views-field-body .field-content",
|
||||
"articleCategories": "#actu-cat",
|
||||
"articleDate": "head > meta[property=\"article:published_time\"]",
|
||||
"articleLink": "#actu-titre a",
|
||||
"articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
|
||||
"articleTitle": "h1.page-title"
|
||||
},
|
||||
"sourceUrl": "https://actualite.cd",
|
||||
"supportsCategories": false
|
||||
}
|
||||
],
|
||||
"wordpress": [
|
||||
{
|
||||
"requiresRateLimit": true,
|
||||
"sourceId": "beto.cd",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://beto.cd"
|
||||
},
|
||||
{ "sourceId": "newscd.net", "sourceKind": "wordpress", "sourceUrl": "https://newscd.net" },
|
||||
{
|
||||
"sourceId": "africanewsrdc.net",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://www.africanewsrdc.net"
|
||||
},
|
||||
{
|
||||
"sourceId": "angazainstitute.ac.cd",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://angazainstitute.ac.cd"
|
||||
},
|
||||
{ "sourceId": "b-onetv.cd", "sourceKind": "wordpress", "sourceUrl": "https://b-onetv.cd" },
|
||||
{
|
||||
"sourceId": "bukavufm.com",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://bukavufm.com"
|
||||
},
|
||||
{
|
||||
"sourceId": "changement7.net",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://changement7.net"
|
||||
},
|
||||
{
|
||||
"sourceId": "congoactu.net",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://congoactu.net"
|
||||
},
|
||||
{
|
||||
"sourceId": "congoindependant.com",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://www.congoindependant.com"
|
||||
},
|
||||
{
|
||||
"sourceId": "congoquotidien.com",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://www.congoquotidien.com"
|
||||
},
|
||||
{
|
||||
"sourceId": "cumulard.cd",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://www.cumulard.cd"
|
||||
},
|
||||
{
|
||||
"sourceId": "environews-rdc.net",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://environews-rdc.net"
|
||||
},
|
||||
{
|
||||
"sourceId": "freemediardc.info",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://www.freemediardc.info"
|
||||
},
|
||||
{
|
||||
"sourceId": "geopolismagazine.org",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://geopolismagazine.org"
|
||||
},
|
||||
{
|
||||
"sourceId": "habarirdc.net",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://habarirdc.net"
|
||||
},
|
||||
{ "sourceId": "infordc.com", "sourceKind": "wordpress", "sourceUrl": "https://infordc.com" },
|
||||
{
|
||||
"sourceId": "kilalopress.net",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://kilalopress.net"
|
||||
},
|
||||
{
|
||||
"sourceId": "laprosperiteonline.net",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://laprosperiteonline.net"
|
||||
},
|
||||
{
|
||||
"sourceId": "laprunellerdc.cd",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://laprunellerdc.cd"
|
||||
},
|
||||
{
|
||||
"sourceId": "lesmedias.net",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://lesmedias.net"
|
||||
},
|
||||
{
|
||||
"sourceId": "lesvolcansnews.net",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://lesvolcansnews.net"
|
||||
},
|
||||
{
|
||||
"sourceId": "netic-news.net",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://www.netic-news.net"
|
||||
},
|
||||
{
|
||||
"sourceId": "objectif-infos.cd",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://objectif-infos.cd"
|
||||
},
|
||||
{
|
||||
"sourceId": "scooprdc.net",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://scooprdc.net"
|
||||
},
|
||||
{
|
||||
"sourceId": "journaldekinshasa.com",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://www.journaldekinshasa.com"
|
||||
},
|
||||
{
|
||||
"sourceId": "lepotentiel.cd",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://lepotentiel.cd"
|
||||
},
|
||||
{ "sourceId": "acturdc.com", "sourceKind": "wordpress", "sourceUrl": "https://acturdc.com" },
|
||||
{
|
||||
"sourceId": "matininfos.net",
|
||||
"sourceKind": "wordpress",
|
||||
"sourceUrl": "https://matininfos.net"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -1,81 +0,0 @@
|
||||
import path from "node:path";
|
||||
|
||||
import {
|
||||
HtmlSourceConfigSchema,
|
||||
PageRangeSchema,
|
||||
TimestampRangeSchema,
|
||||
UpdateDirectionSchema,
|
||||
WordPressSourceConfigSchema,
|
||||
} from "@basango/domain/crawler";
|
||||
import { loadConfig as defineConfig } from "@devscast/config";
|
||||
import { z } from "zod";
|
||||
|
||||
export const PROJECT_DIR = path.resolve(__dirname, "../");
|
||||
|
||||
export const PipelineConfigSchema = z.object({
|
||||
fetch: z.object({
|
||||
async: z.object({
|
||||
prefix: z.string().default("basango:crawler:queue"),
|
||||
queues: z.object({
|
||||
details: z.string().default("details"),
|
||||
listing: z.string().default("listing"),
|
||||
processing: z.string().default("processing"),
|
||||
}),
|
||||
redisUrl: z.string().default("redis://localhost:6379/0"),
|
||||
ttl: z.object({
|
||||
default: z.number().int().positive().default(600),
|
||||
failure: z.number().int().nonnegative().default(3600),
|
||||
result: z.number().int().nonnegative().default(3600),
|
||||
}),
|
||||
}),
|
||||
client: z.object({
|
||||
backoffInitial: z.number().nonnegative().default(1),
|
||||
backoffMax: z.number().nonnegative().default(30),
|
||||
backoffMultiplier: z.number().positive().default(2),
|
||||
followRedirects: z.boolean().default(true),
|
||||
maxRetries: z.number().int().nonnegative().default(3),
|
||||
respectRetryAfter: z.boolean().default(true),
|
||||
rotate: z.boolean().default(true),
|
||||
timeout: z.number().positive().default(20),
|
||||
userAgent: z.string().default("Basango/0.1 (+https://github.com/bernard-ng/basango)"),
|
||||
verifySsl: z.boolean().default(true),
|
||||
}),
|
||||
crawler: z.object({
|
||||
category: z.string().optional(),
|
||||
dateRange: TimestampRangeSchema.optional(),
|
||||
direction: UpdateDirectionSchema.default("forward"),
|
||||
isUpdate: z.boolean().default(false),
|
||||
maxWorkers: z.number().int().positive().default(5),
|
||||
notify: z.boolean().default(false),
|
||||
pageRange: PageRangeSchema.optional(),
|
||||
source: z.union([HtmlSourceConfigSchema, WordPressSourceConfigSchema]).optional(),
|
||||
useMultiThreading: z.boolean().default(false),
|
||||
}),
|
||||
}),
|
||||
paths: z.object({
|
||||
config: z.string().default(path.join(PROJECT_DIR, "config")),
|
||||
data: z.string().default(path.join(PROJECT_DIR, "data", "datasets")),
|
||||
root: z.string().default(PROJECT_DIR),
|
||||
}),
|
||||
sources: z.object({
|
||||
html: z.array(HtmlSourceConfigSchema).default([]),
|
||||
wordpress: z.array(WordPressSourceConfigSchema).default([]),
|
||||
}),
|
||||
});
|
||||
|
||||
export const { config, env } = defineConfig({
|
||||
cwd: process.cwd(),
|
||||
env: {
|
||||
path: path.join(PROJECT_DIR, ".env"),
|
||||
},
|
||||
schema: PipelineConfigSchema,
|
||||
sources: [
|
||||
path.join(PROJECT_DIR, "config", "pipeline.json"),
|
||||
path.join(PROJECT_DIR, "config", "sources.json"),
|
||||
],
|
||||
});
|
||||
|
||||
export type PipelineConfig = z.infer<typeof PipelineConfigSchema>;
|
||||
export type FetchClientConfig = PipelineConfig["fetch"]["client"];
|
||||
export type FetchCrawlerConfig = PipelineConfig["fetch"]["crawler"];
|
||||
export type FetchAsyncConfig = PipelineConfig["fetch"]["async"];
|
||||
@@ -1,12 +1,12 @@
|
||||
import { setTimeout as delay } from "node:timers/promises";
|
||||
|
||||
import type { CrawlerHttpOptions } from "@basango/domain/config";
|
||||
import {
|
||||
DEFAULT_RETRY_AFTER_HEADER,
|
||||
DEFAULT_TRANSIENT_HTTP_STATUSES,
|
||||
DEFAULT_USER_AGENT,
|
||||
} from "@basango/domain/constants";
|
||||
|
||||
import { FetchClientConfig } from "#crawler/config";
|
||||
import { UserAgents } from "#crawler/http/user-agent";
|
||||
|
||||
export type HttpHeaders = Record<string, string>;
|
||||
@@ -71,7 +71,7 @@ const buildUrl = (url: string, params?: HttpParams): string => {
|
||||
* @param config - Fetch client configuration
|
||||
* @param attempt - Current attempt number
|
||||
*/
|
||||
const computeBackoff = (config: FetchClientConfig, attempt: number): number => {
|
||||
const computeBackoff = (config: CrawlerHttpOptions, attempt: number): number => {
|
||||
const base = Math.min(
|
||||
config.backoffInitial * config.backoffMultiplier ** attempt,
|
||||
config.backoffMax,
|
||||
@@ -101,26 +101,26 @@ const parseRetryAfter = (header: string): number => {
|
||||
* @author Bernard Ngandu <bernard@devscast.tech>
|
||||
*/
|
||||
export class BaseHttpClient {
|
||||
protected readonly config: FetchClientConfig;
|
||||
protected readonly options: CrawlerHttpOptions;
|
||||
protected readonly fetchImpl: typeof fetch;
|
||||
protected readonly sleep: (ms: number) => Promise<void>;
|
||||
protected readonly headers: HttpHeaders;
|
||||
|
||||
constructor(config: FetchClientConfig, options: HttpClientOptions = {}) {
|
||||
this.config = config;
|
||||
constructor(options: CrawlerHttpOptions, clientOptions: HttpClientOptions = {}) {
|
||||
this.options = options;
|
||||
const provider =
|
||||
options.userAgentProvider ??
|
||||
new UserAgents(config.rotate, config.userAgent ?? DEFAULT_USER_AGENT);
|
||||
const userAgent = provider.get() ?? config.userAgent ?? DEFAULT_USER_AGENT;
|
||||
clientOptions.userAgentProvider ??
|
||||
new UserAgents(options.rotate, options.userAgent ?? DEFAULT_USER_AGENT);
|
||||
const userAgent = provider.get() ?? options.userAgent ?? DEFAULT_USER_AGENT;
|
||||
|
||||
const baseHeaders: HttpHeaders = { "User-Agent": userAgent };
|
||||
if (options.defaultHeaders) {
|
||||
Object.assign(baseHeaders, options.defaultHeaders);
|
||||
if (clientOptions.defaultHeaders) {
|
||||
Object.assign(baseHeaders, clientOptions.defaultHeaders);
|
||||
}
|
||||
|
||||
this.headers = baseHeaders;
|
||||
this.fetchImpl = options.fetchImpl ?? fetch;
|
||||
this.sleep = options.sleep ?? defaultSleep;
|
||||
this.fetchImpl = clientOptions.fetchImpl ?? fetch;
|
||||
this.sleep = clientOptions.sleep ?? defaultSleep;
|
||||
}
|
||||
|
||||
protected buildHeaders(headers?: HttpHeaders): HeadersInit {
|
||||
@@ -136,13 +136,13 @@ export class BaseHttpClient {
|
||||
|
||||
if (response) {
|
||||
const retryAfter = response.headers.get(retryAfterHeader);
|
||||
if (retryAfter && this.config.respectRetryAfter) {
|
||||
if (retryAfter && this.options.respectRetryAfter) {
|
||||
waitMs = parseRetryAfter(retryAfter);
|
||||
}
|
||||
}
|
||||
|
||||
if (waitMs === 0) {
|
||||
waitMs = computeBackoff(this.config, attempt);
|
||||
waitMs = computeBackoff(this.options, attempt);
|
||||
}
|
||||
|
||||
if (waitMs > 0) {
|
||||
@@ -161,7 +161,7 @@ export class SyncHttpClient extends BaseHttpClient {
|
||||
const retryAfterHeader = options.retryAfterHeader ?? DEFAULT_RETRY_AFTER_HEADER;
|
||||
const target = buildUrl(url, options.params);
|
||||
|
||||
const maxAttempts = this.config.maxRetries + 1;
|
||||
const maxAttempts = this.options.maxRetries + 1;
|
||||
let attempt = 0;
|
||||
let lastError: unknown;
|
||||
|
||||
@@ -169,14 +169,14 @@ export class SyncHttpClient extends BaseHttpClient {
|
||||
const controller = new AbortController();
|
||||
let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
|
||||
try {
|
||||
timeoutHandle = setTimeout(() => controller.abort(), this.config.timeout * 1000);
|
||||
timeoutHandle = setTimeout(() => controller.abort(), this.options.timeout * 1000);
|
||||
|
||||
const headers = this.buildHeaders(options.headers);
|
||||
const init: RequestInit = {
|
||||
body: options.data as BodyInit | undefined,
|
||||
headers,
|
||||
method,
|
||||
redirect: this.config.followRedirects ? "follow" : "manual",
|
||||
redirect: this.options.followRedirects ? "follow" : "manual",
|
||||
signal: controller.signal,
|
||||
};
|
||||
|
||||
@@ -189,7 +189,7 @@ export class SyncHttpClient extends BaseHttpClient {
|
||||
|
||||
if (
|
||||
DEFAULT_TRANSIENT_HTTP_STATUSES.includes(response.status as number) &&
|
||||
attempt < this.config.maxRetries
|
||||
attempt < this.options.maxRetries
|
||||
) {
|
||||
await this.maybeDelay(attempt, response, retryAfterHeader);
|
||||
attempt += 1;
|
||||
@@ -209,12 +209,12 @@ export class SyncHttpClient extends BaseHttpClient {
|
||||
|
||||
if (error instanceof DOMException && error.name === "AbortError") {
|
||||
lastError = error;
|
||||
if (attempt >= this.config.maxRetries) {
|
||||
if (attempt >= this.options.maxRetries) {
|
||||
throw error;
|
||||
}
|
||||
} else {
|
||||
lastError = error;
|
||||
if (attempt >= this.config.maxRetries) {
|
||||
if (attempt >= this.options.maxRetries) {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import { config } from "@basango/domain/config";
|
||||
import { DEFAULT_OPEN_GRAPH_USER_AGENT } from "@basango/domain/constants";
|
||||
import { ArticleMetadata } from "@basango/domain/models";
|
||||
import { parse } from "node-html-parser";
|
||||
|
||||
import { config } from "#crawler/config";
|
||||
import { SyncHttpClient } from "#crawler/http/http-client";
|
||||
import { UserAgents } from "#crawler/http/user-agent";
|
||||
import { createAbsoluteUrl } from "#crawler/utils";
|
||||
@@ -44,7 +44,7 @@ export class OpenGraph {
|
||||
private readonly client: Pick<SyncHttpClient, "get">;
|
||||
|
||||
constructor() {
|
||||
const settings = config.fetch.client;
|
||||
const settings = config.crawler.fetch.client;
|
||||
const provider = new UserAgents(true, DEFAULT_OPEN_GRAPH_USER_AGENT);
|
||||
|
||||
this.client = new SyncHttpClient(settings, {
|
||||
@@ -89,16 +89,28 @@ export class OpenGraph {
|
||||
root.querySelector("link[rel='canonical']")?.getAttribute("href") ?? null,
|
||||
url ?? null,
|
||||
]);
|
||||
const author = pick([extract(root, "article:author"), extract(root, "og:article:author")]);
|
||||
const publishedAt = pick([
|
||||
extract(root, "article:published_time"),
|
||||
extract(root, "og:article:published_time"),
|
||||
]);
|
||||
const updatedAt = pick([
|
||||
extract(root, "article:modified_time"),
|
||||
extract(root, "og:article:modified_time"),
|
||||
]);
|
||||
|
||||
if (!title && !description && !image && !canonical) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
return {
|
||||
author,
|
||||
description,
|
||||
image: createAbsoluteUrl(url, image ?? "") || undefined,
|
||||
publishedAt,
|
||||
title,
|
||||
updatedAt,
|
||||
url: createAbsoluteUrl(url, canonical ?? "") || undefined,
|
||||
};
|
||||
} as ArticleMetadata;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import type { HtmlSourceConfig, WordPressSourceConfig } from "@basango/domain/crawler";
|
||||
import type { HtmlSourceOptions, WordPressSourceOptions } from "@basango/domain/config";
|
||||
import { Article } from "@basango/domain/models";
|
||||
import { logger } from "@basango/logger";
|
||||
|
||||
@@ -24,7 +24,7 @@ export const collectHtmlListing = async (
|
||||
payload: ListingTaskPayload,
|
||||
manager: QueueManager = createQueueManager(),
|
||||
): Promise<number> => {
|
||||
const source = resolveSourceConfig(payload.sourceId) as HtmlSourceConfig;
|
||||
const source = resolveSourceConfig(payload.sourceId) as HtmlSourceOptions;
|
||||
if (source.sourceKind !== "html") {
|
||||
return await collectWordPressListing(payload, manager);
|
||||
}
|
||||
@@ -63,7 +63,7 @@ export const collectWordPressListing = async (
|
||||
payload: ListingTaskPayload,
|
||||
manager: QueueManager = createQueueManager(),
|
||||
): Promise<number> => {
|
||||
const source = resolveSourceConfig(payload.sourceId) as WordPressSourceConfig;
|
||||
const source = resolveSourceConfig(payload.sourceId) as WordPressSourceOptions;
|
||||
if (source.sourceKind !== "wordpress") {
|
||||
return await collectHtmlListing(payload, manager);
|
||||
}
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
|
||||
import { JobsOptions, Queue, QueueOptions } from "bullmq";
|
||||
import { type CrawlerAsyncOptions, config } from "@basango/domain/config";
|
||||
import { JobsOptions, Queue } from "bullmq";
|
||||
import IORedis from "ioredis";
|
||||
|
||||
import { FetchAsyncConfig, config } from "#crawler/config";
|
||||
import {
|
||||
DetailsTaskPayload,
|
||||
DetailsTaskPayloadSchema,
|
||||
@@ -20,28 +20,27 @@ export interface QueueBackend<T = unknown> {
|
||||
|
||||
export type QueueFactory = (
|
||||
queueName: string,
|
||||
settings: FetchAsyncConfig,
|
||||
options: CrawlerAsyncOptions,
|
||||
connection?: IORedis,
|
||||
) => QueueBackend;
|
||||
|
||||
const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => {
|
||||
const defaultQueueFactory: QueueFactory = (queueName, options, connection) => {
|
||||
const redisConnection =
|
||||
connection ??
|
||||
new IORedis(settings.redisUrl, {
|
||||
...parseRedisUrl(settings.redisUrl),
|
||||
new IORedis(options.redisUrl, {
|
||||
...parseRedisUrl(options.redisUrl),
|
||||
maxRetriesPerRequest: null,
|
||||
});
|
||||
const options: QueueOptions = {
|
||||
connection: redisConnection,
|
||||
prefix: settings.prefix,
|
||||
};
|
||||
|
||||
const queue = new Queue(queueName, options);
|
||||
const queue = new Queue(queueName, {
|
||||
connection: redisConnection,
|
||||
prefix: options.prefix,
|
||||
});
|
||||
return {
|
||||
add: async (name, data, opts) => {
|
||||
const job = await queue.add(name, data, {
|
||||
removeOnComplete: settings.ttl.result === 0 ? true : undefined,
|
||||
removeOnFail: settings.ttl.failure === 0 ? true : undefined,
|
||||
removeOnComplete: options.ttl.result === 0 ? true : undefined,
|
||||
removeOnFail: options.ttl.failure === 0 ? true : undefined,
|
||||
...opts,
|
||||
});
|
||||
return { id: job.id ?? randomUUID() };
|
||||
@@ -55,7 +54,7 @@ export interface CreateQueueManagerOptions {
|
||||
}
|
||||
|
||||
export interface QueueManager {
|
||||
readonly settings: FetchAsyncConfig;
|
||||
readonly options: CrawlerAsyncOptions;
|
||||
readonly connection: IORedis;
|
||||
enqueueListing: (payload: ListingTaskPayload) => Promise<{ id: string }>;
|
||||
enqueueArticle: (payload: DetailsTaskPayload) => Promise<{ id: string }>;
|
||||
@@ -66,17 +65,17 @@ export interface QueueManager {
|
||||
}
|
||||
|
||||
export const createQueueManager = (options: CreateQueueManagerOptions = {}): QueueManager => {
|
||||
const settings = config.fetch.async;
|
||||
const asyncOptions = config.crawler.fetch.async;
|
||||
|
||||
const connection =
|
||||
options.connection ??
|
||||
new IORedis(settings.redisUrl, {
|
||||
...parseRedisUrl(settings.redisUrl),
|
||||
new IORedis(asyncOptions.redisUrl, {
|
||||
...parseRedisUrl(asyncOptions.redisUrl),
|
||||
maxRetriesPerRequest: null,
|
||||
});
|
||||
const factory = options.queueFactory ?? defaultQueueFactory;
|
||||
|
||||
const ensureQueue = (queueName: string) => factory(queueName, settings, connection);
|
||||
const ensureQueue = (queueName: string) => factory(queueName, asyncOptions, connection);
|
||||
|
||||
return {
|
||||
close: async () => {
|
||||
@@ -85,25 +84,25 @@ export const createQueueManager = (options: CreateQueueManagerOptions = {}): Que
|
||||
connection,
|
||||
enqueueArticle: (payload) => {
|
||||
const data = DetailsTaskPayloadSchema.parse(payload);
|
||||
const queue = ensureQueue(settings.queues.details);
|
||||
const queue = ensureQueue(asyncOptions.queues.details);
|
||||
return queue.add("collect_article", data);
|
||||
},
|
||||
enqueueListing: (payload) => {
|
||||
const data = ListingTaskPayloadSchema.parse(payload);
|
||||
const queue = ensureQueue(settings.queues.listing);
|
||||
const queue = ensureQueue(asyncOptions.queues.listing);
|
||||
return queue.add("collect_listing", data);
|
||||
},
|
||||
enqueueProcessed: (payload) => {
|
||||
const data = ProcessingTaskPayloadSchema.parse(payload);
|
||||
const queue = ensureQueue(settings.queues.processing);
|
||||
const queue = ensureQueue(asyncOptions.queues.processing);
|
||||
return queue.add("forward_for_processing", data);
|
||||
},
|
||||
iterQueueNames: () => [
|
||||
settings.queues.listing,
|
||||
settings.queues.details,
|
||||
settings.queues.processing,
|
||||
asyncOptions.queues.listing,
|
||||
asyncOptions.queues.details,
|
||||
asyncOptions.queues.processing,
|
||||
],
|
||||
queueName: (suffix: string) => `${settings.prefix}:${suffix}`,
|
||||
settings,
|
||||
options: asyncOptions,
|
||||
queueName: (suffix: string) => `${asyncOptions.prefix}:${suffix}`,
|
||||
};
|
||||
};
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import { PageRangeSchema, TimestampRangeSchema } from "@basango/domain/crawler";
|
||||
import { articleSchema } from "@basango/domain/models";
|
||||
import { PageRangeSchema, TimestampRangeSchema, articleSchema } from "@basango/domain/models";
|
||||
import { z } from "zod";
|
||||
|
||||
export const ListingTaskPayloadSchema = z.object({
|
||||
|
||||
@@ -45,7 +45,7 @@ export const startWorker = (options: WorkerOptions): WorkerHandle => {
|
||||
{
|
||||
concurrency: options.concurrency ?? 5,
|
||||
connection,
|
||||
prefix: manager.settings.prefix,
|
||||
prefix: manager.options.prefix,
|
||||
},
|
||||
);
|
||||
|
||||
@@ -56,7 +56,7 @@ export const startWorker = (options: WorkerOptions): WorkerHandle => {
|
||||
|
||||
const queueEvents = new QueueEvents(queueName, {
|
||||
connection,
|
||||
prefix: manager.settings.prefix,
|
||||
prefix: manager.options.prefix,
|
||||
});
|
||||
|
||||
workers.push(worker);
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import type { AnySourceConfig } from "@basango/domain/crawler";
|
||||
import { AnySourceOptions, CrawlerFetchingOptions, config } from "@basango/domain/config";
|
||||
import logger from "@basango/logger";
|
||||
|
||||
import { FetchCrawlerConfig, config } from "#crawler/config";
|
||||
import { JsonlPersistor, Persistor } from "#crawler/process/persistence";
|
||||
import { createPageRange, createTimestampRange } from "#crawler/utils";
|
||||
|
||||
@@ -13,11 +12,11 @@ export interface CrawlingOptions {
|
||||
}
|
||||
|
||||
export const resolveCrawlerConfig = (
|
||||
source: AnySourceConfig,
|
||||
source: AnySourceOptions,
|
||||
options: CrawlingOptions,
|
||||
): FetchCrawlerConfig => {
|
||||
): CrawlerFetchingOptions => {
|
||||
return {
|
||||
...config.fetch.crawler,
|
||||
...config.crawler.fetch.crawler,
|
||||
category: options.category,
|
||||
dateRange: createTimestampRange(options.dateRange),
|
||||
pageRange: createPageRange(options.pageRange),
|
||||
@@ -25,10 +24,10 @@ export const resolveCrawlerConfig = (
|
||||
};
|
||||
};
|
||||
|
||||
export const createPersistors = (source: AnySourceConfig): Persistor[] => {
|
||||
export const createPersistors = (source: AnySourceOptions): Persistor[] => {
|
||||
return [
|
||||
new JsonlPersistor({
|
||||
directory: config.paths.data,
|
||||
directory: config.crawler.paths.data,
|
||||
sourceId: source.sourceId,
|
||||
}),
|
||||
];
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
import type { AnySourceConfig } from "@basango/domain/crawler";
|
||||
import { AnySourceOptions, CrawlerFetchingOptions, config } from "@basango/domain/config";
|
||||
import { Article } from "@basango/domain/models";
|
||||
import { HTMLElement, parse as parseHtml } from "node-html-parser";
|
||||
|
||||
import { FetchCrawlerConfig, config } from "#crawler/config";
|
||||
import { SyncHttpClient } from "#crawler/http/http-client";
|
||||
import { OpenGraph } from "#crawler/http/open-graph";
|
||||
import type { Persistor } from "#crawler/process/persistence";
|
||||
@@ -12,23 +11,23 @@ export interface CrawlerOptions {
|
||||
}
|
||||
|
||||
export abstract class BaseCrawler {
|
||||
protected readonly settings: FetchCrawlerConfig;
|
||||
protected readonly source: AnySourceConfig;
|
||||
protected readonly options: CrawlerFetchingOptions;
|
||||
protected readonly source: AnySourceOptions;
|
||||
protected readonly http: SyncHttpClient;
|
||||
protected readonly persistors: Persistor[];
|
||||
protected readonly openGraph: OpenGraph;
|
||||
|
||||
protected constructor(settings: FetchCrawlerConfig, options: CrawlerOptions = {}) {
|
||||
if (!settings.source) {
|
||||
protected constructor(options: CrawlerFetchingOptions, crawlerOptions: CrawlerOptions = {}) {
|
||||
if (!options.source) {
|
||||
throw new Error("Crawler requires a bound source");
|
||||
}
|
||||
|
||||
this.http = new SyncHttpClient(config.fetch.client);
|
||||
this.persistors = options.persistors ?? [];
|
||||
this.http = new SyncHttpClient(config.crawler.fetch.client);
|
||||
this.persistors = crawlerOptions.persistors ?? [];
|
||||
this.openGraph = new OpenGraph();
|
||||
|
||||
this.settings = settings;
|
||||
this.source = settings.source as AnySourceConfig;
|
||||
this.options = options;
|
||||
this.source = options.source as AnySourceOptions;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
import type { HtmlSourceConfig, TimestampRange } from "@basango/domain/crawler";
|
||||
import { Article } from "@basango/domain/models";
|
||||
import { CrawlerFetchingOptions, HtmlSourceOptions } from "@basango/domain/config";
|
||||
import { Article, TimestampRange } from "@basango/domain/models";
|
||||
import { logger } from "@basango/logger";
|
||||
import { fromUnixTime, getUnixTime, isMatch as isDateMatch, parse } from "date-fns";
|
||||
import { HTMLElement } from "node-html-parser";
|
||||
import TurndownService from "turndown";
|
||||
|
||||
import { FetchCrawlerConfig } from "#crawler/config";
|
||||
import {
|
||||
ArticleOutOfDateRangeError,
|
||||
InvalidArticleError,
|
||||
@@ -26,21 +25,21 @@ const md = new TurndownService({
|
||||
* Crawler for generic HTML pages.
|
||||
*/
|
||||
export class HtmlCrawler extends BaseCrawler {
|
||||
readonly source: HtmlSourceConfig;
|
||||
readonly source: HtmlSourceOptions;
|
||||
private currentNode: string | null = null;
|
||||
|
||||
constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
|
||||
constructor(settings: CrawlerFetchingOptions, options: { persistors?: Persistor[] } = {}) {
|
||||
super(settings, options);
|
||||
|
||||
if (!settings.source || settings.source.sourceKind !== "html") {
|
||||
throw new UnsupportedSourceKindError("HtmlCrawler requires a source of kind 'html'");
|
||||
}
|
||||
this.source = this.settings.source as HtmlSourceConfig;
|
||||
this.source = this.options.source as HtmlSourceOptions;
|
||||
}
|
||||
|
||||
async fetch(): Promise<void> {
|
||||
const pageRange = this.settings.pageRange ?? (await this.getPagination());
|
||||
const dateRange = this.settings.dateRange;
|
||||
const pageRange = this.options.pageRange ?? (await this.getPagination());
|
||||
const dateRange = this.options.dateRange;
|
||||
const selectors = this.source.sourceSelectors;
|
||||
|
||||
if (!selectors.articles) {
|
||||
@@ -218,7 +217,7 @@ export class HtmlCrawler extends BaseCrawler {
|
||||
*/
|
||||
private applyCategory(template: string): string {
|
||||
if (template.includes("{category}")) {
|
||||
const replacement = this.settings.category ?? "";
|
||||
const replacement = this.options.category ?? "";
|
||||
return template.replace("{category}", replacement);
|
||||
}
|
||||
return template;
|
||||
@@ -297,7 +296,7 @@ export class HtmlCrawler extends BaseCrawler {
|
||||
* @param selector - The CSS selector
|
||||
*/
|
||||
private extractCategories(root: HTMLElement, selector?: string | null): string[] {
|
||||
if (!selector && this.settings.category) return [this.settings.category.toLowerCase()];
|
||||
if (!selector && this.options.category) return [this.options.category.toLowerCase()];
|
||||
if (!selector) return [];
|
||||
|
||||
const values: string[] = [];
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
import type { PageRange, TimestampRange, WordPressSourceConfig } from "@basango/domain/crawler";
|
||||
import { Article } from "@basango/domain/models";
|
||||
import { CrawlerFetchingOptions, WordPressSourceOptions } from "@basango/domain/config";
|
||||
import { Article, PageRange, TimestampRange } from "@basango/domain/models";
|
||||
import { logger } from "@basango/logger";
|
||||
import { fromUnixTime } from "date-fns";
|
||||
import TurndownService from "turndown";
|
||||
|
||||
import { FetchCrawlerConfig } from "#crawler/config";
|
||||
import {
|
||||
ArticleOutOfDateRangeError,
|
||||
InvalidArticleError,
|
||||
@@ -33,7 +32,7 @@ interface WordPressPost {
|
||||
* Crawler for WordPress sites using the REST API.
|
||||
*/
|
||||
export class WordPressCrawler extends BaseCrawler {
|
||||
readonly source: WordPressSourceConfig;
|
||||
readonly source: WordPressSourceOptions;
|
||||
private categoryMap: Map<number, string> = new Map();
|
||||
|
||||
public static readonly POST_QUERY =
|
||||
@@ -43,7 +42,7 @@ export class WordPressCrawler extends BaseCrawler {
|
||||
public static readonly TOTAL_PAGES_HEADER = "x-wp-totalpages";
|
||||
public static readonly TOTAL_POSTS_HEADER = "x-wp-total";
|
||||
|
||||
constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
|
||||
constructor(settings: CrawlerFetchingOptions, options: { persistors?: Persistor[] } = {}) {
|
||||
super(settings, options);
|
||||
|
||||
if (!settings.source || settings.source.sourceKind !== "wordpress") {
|
||||
@@ -51,15 +50,15 @@ export class WordPressCrawler extends BaseCrawler {
|
||||
"WordPressCrawler requires a source of kind 'wordpress'",
|
||||
);
|
||||
}
|
||||
this.source = this.settings.source as WordPressSourceConfig;
|
||||
this.source = this.options.source as WordPressSourceOptions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch and process WordPress posts.
|
||||
*/
|
||||
async fetch(): Promise<void> {
|
||||
const pageRange = this.settings.pageRange ?? (await this.getPagination());
|
||||
const dateRange = this.settings.dateRange;
|
||||
const pageRange = this.options.pageRange ?? (await this.getPagination());
|
||||
const dateRange = this.options.dateRange;
|
||||
|
||||
for (let page = pageRange.start; page <= pageRange.end; page += 1) {
|
||||
const endpoint = this.buildEndpointUrl(page);
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
|
||||
import { config } from "@basango/domain/config";
|
||||
import type { Article } from "@basango/domain/models";
|
||||
import { md5 } from "@basango/encryption";
|
||||
import logger from "@basango/logger";
|
||||
|
||||
import { config, env } from "#crawler/config";
|
||||
import { HttpError, SyncHttpClient } from "#crawler/http/http-client";
|
||||
|
||||
export interface Persistor {
|
||||
@@ -66,9 +66,9 @@ export const persist = async (
|
||||
};
|
||||
|
||||
export const forward = async (payload: Partial<Article>): Promise<void> => {
|
||||
const client = new SyncHttpClient(config.fetch.client);
|
||||
const endpoint = env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT");
|
||||
const token = env("BASANGO_CRAWLER_TOKEN");
|
||||
const client = new SyncHttpClient(config.crawler.fetch.client);
|
||||
const endpoint = config.crawler.backend.endpoint;
|
||||
const token = config.crawler.backend.token;
|
||||
|
||||
try {
|
||||
const response = await client.post(endpoint, {
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
#! /usr/bin/env bun
|
||||
#!/usr/bin/env bun
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import { createInterface } from "node:readline";
|
||||
import { parseArgs } from "node:util";
|
||||
|
||||
import { config } from "@basango/domain/config";
|
||||
import type { Article } from "@basango/domain/models";
|
||||
import { logger } from "@basango/logger";
|
||||
|
||||
import { config } from "#crawler/config";
|
||||
import { forward } from "#crawler/process/persistence";
|
||||
|
||||
const USAGE = `
|
||||
@@ -31,7 +31,7 @@ const main = async (): Promise<void> => {
|
||||
return;
|
||||
}
|
||||
|
||||
const filePath = path.join(config.paths.data, `${sourceId}.jsonl`);
|
||||
const filePath = path.join(config.crawler.paths.data, `${sourceId}.jsonl`);
|
||||
|
||||
if (!fs.existsSync(filePath)) {
|
||||
logger.error({ filePath, sourceId }, "Source must be crawled first; JSONL not found");
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#! /usr/bin/env bun
|
||||
#!/usr/bin/env bun
|
||||
|
||||
import { logger } from "@basango/logger";
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#! /usr/bin/env bun
|
||||
#!/usr/bin/env bun
|
||||
|
||||
import { logger } from "@basango/logger";
|
||||
|
||||
|
||||
@@ -1,28 +1,29 @@
|
||||
import {
|
||||
AnySourceOptions,
|
||||
HtmlSourceOptions,
|
||||
WordPressSourceOptions,
|
||||
config,
|
||||
} from "@basango/domain/config";
|
||||
import { DEFAULT_DATE_FORMAT } from "@basango/domain/constants";
|
||||
import {
|
||||
AnySourceConfig,
|
||||
DateSpecSchema,
|
||||
HtmlSourceConfig,
|
||||
PageRange,
|
||||
PageRangeSchema,
|
||||
PageSpecSchema,
|
||||
TimestampRange,
|
||||
TimestampRangeSchema,
|
||||
WordPressSourceConfig,
|
||||
} from "@basango/domain/crawler";
|
||||
} from "@basango/domain/models";
|
||||
import { format, fromUnixTime, getUnixTime, isMatch, parse } from "date-fns";
|
||||
import type { RedisOptions } from "ioredis";
|
||||
|
||||
import { config } from "#crawler/config";
|
||||
|
||||
/**
|
||||
* Resolve a source configuration by its ID.
|
||||
* @param id - The source ID
|
||||
*/
|
||||
export const resolveSourceConfig = (id: string): AnySourceConfig => {
|
||||
export const resolveSourceConfig = (id: string): AnySourceOptions => {
|
||||
const source =
|
||||
config.sources.html.find((s: HtmlSourceConfig) => s.sourceId === id) ||
|
||||
config.sources.wordpress.find((s: WordPressSourceConfig) => s.sourceId === id);
|
||||
config.crawler.sources.html.find((s: HtmlSourceOptions) => s.sourceId === id) ||
|
||||
config.crawler.sources.wordpress.find((s: WordPressSourceOptions) => s.sourceId === id);
|
||||
|
||||
if (source === undefined) {
|
||||
throw new Error(`Source '${id}' not found in configuration`);
|
||||
|
||||
Reference in New Issue
Block a user