Files
basango/apps/crawler/src/config.ts
T

82 lines
2.9 KiB
TypeScript

import path from "node:path";
import { loadConfig as defineConfig } from "@devscast/config";
import { z } from "zod";
import {
DateRangeSchema,
HtmlSourceConfigSchema,
PageRangeSchema,
UpdateDirectionSchema,
WordPressSourceConfigSchema,
} from "@/schema";
/**
 * Absolute path of the directory one level above the folder that contains
 * this module. Used below as the base for default paths and config files.
 */
export const PROJECT_DIR = path.resolve(__dirname, "..");
/** `fetch.async`: queue naming, Redis connection URL and TTL values. */
const FetchAsyncSchema = z.object({
  prefix: z.string().default("basango:crawler:queue"),
  queues: z.object({
    details: z.string().default("details"),
    listing: z.string().default("listing"),
    processing: z.string().default("processing"),
  }),
  redisUrl: z.string().default("redis://localhost:6379/0"),
  // NOTE(review): TTL units are not stated here (presumably seconds) — confirm with the consumer.
  ttl: z.object({
    // `default` must be strictly positive; the other two may be zero.
    default: z.number().int().positive().default(600),
    failure: z.number().int().nonnegative().default(3600),
    result: z.number().int().nonnegative().default(3600),
  }),
});

/** `fetch.client`: HTTP client behaviour (retries, backoff, redirects, TLS, user agent). */
const FetchClientSchema = z.object({
  // NOTE(review): backoff/timeout units are not visible here (presumably seconds) — confirm.
  backoffInitial: z.number().nonnegative().default(1),
  backoffMax: z.number().nonnegative().default(30),
  backoffMultiplier: z.number().positive().default(2),
  followRedirects: z.boolean().default(true),
  maxRetries: z.number().int().nonnegative().default(3),
  respectRetryAfter: z.boolean().default(true),
  rotate: z.boolean().default(true),
  timeout: z.number().positive().default(20),
  userAgent: z.string().default("Basango/0.1 (+https://github.com/bernard-ng/basango)"),
  verifySsl: z.boolean().default(true),
});

/** `fetch.crawler`: per-run crawl options; the source, category and ranges are optional. */
const FetchCrawlerSchema = z.object({
  category: z.string().optional(),
  dateRange: DateRangeSchema.optional(),
  direction: UpdateDirectionSchema.default("forward"),
  isUpdate: z.boolean().default(false),
  maxWorkers: z.number().int().positive().default(5),
  notify: z.boolean().default(false),
  pageRange: PageRangeSchema.optional(),
  // A single source may be described by either the HTML or the WordPress schema.
  source: z.union([HtmlSourceConfigSchema, WordPressSourceConfigSchema]).optional(),
  useMultiThreading: z.boolean().default(false),
});

/** `paths`: filesystem locations, each defaulting to a location under PROJECT_DIR. */
const PathsSchema = z.object({
  config: z.string().default(path.join(PROJECT_DIR, "config")),
  data: z.string().default(path.join(PROJECT_DIR, "data", "datasets")),
  root: z.string().default(PROJECT_DIR),
});

/** `sources`: known sources grouped by kind; both lists default to empty. */
const SourcesSchema = z.object({
  html: z.array(HtmlSourceConfigSchema).default([]),
  wordpress: z.array(WordPressSourceConfigSchema).default([]),
});

/**
 * Zod schema for the whole crawler pipeline configuration.
 *
 * Top-level sections are `fetch` (with `async`, `client` and `crawler`),
 * `paths` and `sources`. Defaults are declared on the leaf fields.
 *
 * NOTE(review): the nested objects themselves carry no `.default(...)`, so
 * presumably the parsed input has to provide every section (even as an empty
 * object) for the leaf defaults to apply — confirm against the config files.
 */
export const PipelineConfigSchema = z.object({
  fetch: z.object({
    async: FetchAsyncSchema,
    client: FetchClientSchema,
    crawler: FetchCrawlerSchema,
  }),
  paths: PathsSchema,
  sources: SourcesSchema,
});
// Directory that holds the JSON configuration files read below.
const CONFIG_DIR = path.join(PROJECT_DIR, "config");

/**
 * Loaded configuration and environment, produced by `@devscast/config`'s
 * loader from the two JSON files under the config directory and the `.env`
 * file in PROJECT_DIR, using PipelineConfigSchema as the schema.
 *
 * This runs at module import time.
 */
export const { config, env } = defineConfig({
  cwd: process.cwd(),
  env: { path: path.join(PROJECT_DIR, ".env") },
  schema: PipelineConfigSchema,
  // Order is preserved: pipeline.json first, then sources.json.
  sources: ["pipeline.json", "sources.json"].map((fileName) => path.join(CONFIG_DIR, fileName)),
});
/** Type inferred from PipelineConfigSchema. */
export type PipelineConfig = z.infer<typeof PipelineConfigSchema>;

// Shorthand for the `fetch` section so the aliases below stay short.
type FetchSection = PipelineConfig["fetch"];

/** Type of the `fetch.client` section. */
export type FetchClientConfig = FetchSection["client"];
/** Type of the `fetch.crawler` section. */
export type FetchCrawlerConfig = FetchSection["crawler"];
/** Type of the `fetch.async` section. */
export type FetchAsyncConfig = FetchSection["async"];