feat(domain): centralize data definition

This commit is contained in:
2025-11-17 00:04:27 +02:00
parent e7585aa76c
commit f39635e04f
96 changed files with 3474 additions and 1167 deletions
+9 -21
View File
@@ -1,8 +1,8 @@
import type { HtmlSourceConfig, WordPressSourceConfig } from "@basango/domain/crawler";
import { Article } from "@basango/domain/models";
import { logger } from "@basango/logger";
import { config, env } from "#crawler/config";
import { UnsupportedSourceKindError } from "#crawler/errors";
import { SyncHttpClient } from "#crawler/http/http-client";
import { QueueManager, createQueueManager } from "#crawler/process/async/queue";
import {
DetailsTaskPayload,
@@ -12,11 +12,11 @@ import {
import { createPersistors, resolveCrawlerConfig } from "#crawler/process/crawler";
import { HtmlCrawler } from "#crawler/process/parsers/html";
import { WordPressCrawler } from "#crawler/process/parsers/wordpress";
import { Article, HtmlSourceConfig, WordPressSourceConfig } from "#crawler/schema";
import { forward } from "#crawler/process/persistence";
import {
createDateRange,
formatDateRange,
createTimestampRange,
formatPageRange,
formatTimestampRange,
resolveSourceConfig,
} from "#crawler/utils";
@@ -45,7 +45,7 @@ export const collectHtmlListing = async (
await manager.enqueueArticle({
category: payload.category,
dateRange: createDateRange(payload.dateRange),
dateRange: createTimestampRange(payload.dateRange),
sourceId: payload.sourceId,
url,
} as DetailsTaskPayload);
@@ -85,7 +85,7 @@ export const collectWordPressListing = async (
await manager.enqueueArticle({
category: payload.category,
data,
dateRange: createDateRange(payload.dateRange),
dateRange: createTimestampRange(payload.dateRange),
sourceId: payload.sourceId,
url,
} as DetailsTaskPayload);
@@ -106,7 +106,7 @@ export const collectArticle = async (
const source = resolveSourceConfig(payload.sourceId);
const settings = resolveCrawlerConfig(source, {
category: payload.category,
dateRange: payload.dateRange ? formatDateRange(payload.dateRange) : undefined,
dateRange: payload.dateRange ? formatTimestampRange(payload.dateRange) : undefined,
pageRange: payload.pageRange ? formatPageRange(payload.pageRange) : undefined,
sourceId: payload.sourceId,
});
@@ -141,19 +141,7 @@ export const forwardForProcessing = async (payload: ProcessingTaskPayload): Prom
try {
logger.info({ article: payload.article.title }, "Forwarding article to API");
const client = new SyncHttpClient(config.fetch.client);
const response = await client.post(env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT"), {
headers: {
Authorization: `${env("BASANGO_CRAWLER_TOKEN")}`,
},
json: payload.article,
});
if (response.ok) {
const data = await response.json();
logger.info({ ...data }, "Article successfully forwarded to API");
}
await forward(payload.article);
} catch (error) {
logger.error({ error }, "Failed to forward article to API");
}
+4 -4
View File
@@ -1,7 +1,7 @@
import { PageRangeSchema, TimestampRangeSchema } from "@basango/domain/crawler";
import { articleSchema } from "@basango/domain/models";
import { z } from "zod";
import { ArticleSchema, DateRangeSchema, PageRangeSchema } from "#crawler/schema";
export const ListingTaskPayloadSchema = z.object({
category: z.string().optional(),
dateRange: z.string().optional(),
@@ -12,7 +12,7 @@ export const ListingTaskPayloadSchema = z.object({
export const DetailsTaskPayloadSchema = z.object({
category: z.string().optional(),
data: z.any().optional(),
dateRange: DateRangeSchema.optional(),
dateRange: TimestampRangeSchema.optional(),
page: z.number().int().nonnegative().optional(),
pageRange: PageRangeSchema.optional(),
sourceId: z.string(),
@@ -20,7 +20,7 @@ export const DetailsTaskPayloadSchema = z.object({
});
export const ProcessingTaskPayloadSchema = z.object({
article: ArticleSchema,
article: articleSchema,
sourceId: z.string(),
});
+3 -3
View File
@@ -1,9 +1,9 @@
import type { AnySourceConfig } from "@basango/domain/crawler";
import logger from "@basango/logger";
import { FetchCrawlerConfig, config } from "#crawler/config";
import { JsonlPersistor, Persistor } from "#crawler/process/persistence";
import { AnySourceConfig } from "#crawler/schema";
import { createDateRange, createPageRange } from "#crawler/utils";
import { createPageRange, createTimestampRange } from "#crawler/utils";
export interface CrawlingOptions {
sourceId: string;
@@ -19,7 +19,7 @@ export const resolveCrawlerConfig = (
return {
...config.fetch.crawler,
category: options.category,
dateRange: createDateRange(options.dateRange),
dateRange: createTimestampRange(options.dateRange),
pageRange: createPageRange(options.pageRange),
source,
};
+6 -2
View File
@@ -1,10 +1,11 @@
import type { AnySourceConfig } from "@basango/domain/crawler";
import { Article } from "@basango/domain/models";
import { HTMLElement, parse as parseHtml } from "node-html-parser";
import { FetchCrawlerConfig, config } from "#crawler/config";
import { SyncHttpClient } from "#crawler/http/http-client";
import { OpenGraph } from "#crawler/http/open-graph";
import type { Persistor } from "#crawler/process/persistence";
import { AnySourceConfig, Article } from "#crawler/schema";
export interface CrawlerOptions {
persistors?: Persistor[];
@@ -97,7 +98,10 @@ export abstract class BaseCrawler {
* @param record - The article record
* @param url - The URL to fetch Open Graph data from
*/
protected async enrichWithOpenGraph(record: Article, url?: string): Promise<Article> {
protected async enrichWithOpenGraph(
record: Partial<Article>,
url?: string,
): Promise<Partial<Article>> {
try {
const metadata = url ? await this.openGraph.consumeUrl(url) : undefined;
return { ...record, metadata };
+3 -2
View File
@@ -1,3 +1,5 @@
import type { HtmlSourceConfig, TimestampRange } from "@basango/domain/crawler";
import { Article } from "@basango/domain/models";
import { logger } from "@basango/logger";
import { fromUnixTime, getUnixTime, isMatch as isDateMatch, parse } from "date-fns";
import { HTMLElement } from "node-html-parser";
@@ -12,7 +14,6 @@ import {
} from "#crawler/errors";
import { BaseCrawler } from "#crawler/process/parsers/base";
import { Persistor, persist } from "#crawler/process/persistence";
import { Article, DateRange, HtmlSourceConfig } from "#crawler/schema";
import { createAbsoluteUrl, isTimestampInRange } from "#crawler/utils";
const md = new TurndownService({
@@ -106,7 +107,7 @@ export class HtmlCrawler extends BaseCrawler {
* @param html - The HTML content of the article
* @param dateRange - Optional date range for filtering
*/
async fetchOne(html: string, dateRange?: DateRange | null): Promise<Article> {
async fetchOne(html: string, dateRange?: TimestampRange | null): Promise<Partial<Article>> {
const root = this.parseHtml(html);
const selectors = this.source.sourceSelectors;
@@ -1,3 +1,5 @@
import type { PageRange, TimestampRange, WordPressSourceConfig } from "@basango/domain/crawler";
import { Article } from "@basango/domain/models";
import { logger } from "@basango/logger";
import { fromUnixTime } from "date-fns";
import TurndownService from "turndown";
@@ -10,7 +12,6 @@ import {
} from "#crawler/errors";
import { BaseCrawler } from "#crawler/process/parsers/base";
import { Persistor, persist } from "#crawler/process/persistence";
import { Article, DateRange, PageRange, WordPressSourceConfig } from "#crawler/schema";
import { isTimestampInRange } from "#crawler/utils";
const md = new TurndownService({
@@ -107,7 +108,7 @@ export class WordPressCrawler extends BaseCrawler {
* @param input - Decoded JSON object or raw JSON string
* @param dateRange - Optional date range for filtering
*/
async fetchOne(input: unknown, dateRange?: DateRange | null): Promise<Article> {
async fetchOne(input: unknown, dateRange?: TimestampRange | null): Promise<Article> {
// input can be the decoded JSON object or a raw JSON string
let data: WordPressPost | null = null;
try {
+46 -10
View File
@@ -1,13 +1,15 @@
import fs from "node:fs";
import path from "node:path";
import type { Article } from "@basango/domain/models";
import { md5 } from "@basango/encryption";
import logger from "@basango/logger";
import { Article } from "#crawler/schema";
import { config, env } from "#crawler/config";
import { HttpError, SyncHttpClient } from "#crawler/http/http-client";
export interface Persistor {
persist(record: Article): Promise<void> | void;
persist(record: Partial<Article>): Promise<void> | void;
close: () => Promise<void> | void;
}
@@ -35,17 +37,20 @@ const sanitize = (text: string): string => {
return s.trim();
};
export const persist = async (payload: Article, persistors: Persistor[]): Promise<Article> => {
export const persist = async (
payload: Partial<Article>,
persistors: Persistor[],
): Promise<Article> => {
const data = {
...payload,
body: sanitize(payload.body),
categories: payload.categories.map(sanitize),
title: sanitize(payload.title),
body: sanitize(payload.body!),
categories: payload.categories!.map(sanitize),
title: sanitize(payload.title!),
};
const article = {
...data,
hash: md5(data.link),
hash: md5(data.link!),
} as Article;
for (const persistor of persistors) {
@@ -60,6 +65,37 @@ export const persist = async (payload: Article, persistors: Persistor[]): Promis
return article;
};
/**
 * Forward a crawled article to the backend API.
 *
 * Best-effort by design: every failure path is logged and swallowed so that a
 * bad forward never aborts the surrounding crawl pipeline.
 *
 * @param payload - The (possibly partial) article record to submit
 */
export const forward = async (payload: Partial<Article>): Promise<void> => {
  const client = new SyncHttpClient(config.fetch.client);
  const endpoint = env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT");
  const token = env("BASANGO_CRAWLER_TOKEN");

  try {
    const response = await client.post(endpoint, {
      headers: {
        // NOTE(review): token is sent raw — confirm the backend does not
        // expect a "Bearer " scheme prefix.
        Authorization: `${token}`,
      },
      json: payload,
    });

    if (response.ok) {
      const data = await response.json();
      logger.info({ ...data }, "Article forwarded");
      return;
    }

    logger.error({ status: response.status, url: payload.link }, "Forwarding failed");
  } catch (error) {
    if (error instanceof HttpError) {
      // The error response body may not be valid JSON; guard the parse so a
      // malformed body cannot reject out of this best-effort handler.
      try {
        const data = await error.response.json();
        logger.error({ ...data, url: payload.link }, "Error forwarding article");
      } catch {
        logger.error({ error, url: payload.link }, "Error forwarding article");
      }
      return;
    }
    logger.error({ error, url: payload.link }, "Error forwarding article");
  }
};
export class JsonlPersistor implements Persistor {
private readonly filePath: string;
private readonly encoding: BufferEncoding;
@@ -78,15 +114,15 @@ export class JsonlPersistor implements Persistor {
}
}
persist(record: Article): Promise<void> {
persist(payload: Partial<Article>): Promise<void> {
if (this.closed) {
return Promise.reject(new Error("Persistor has been closed"));
}
const payload = `${JSON.stringify(record)}\n`;
const record = `${JSON.stringify(payload)}\n`;
this.pending = this.pending.then(async () => {
fs.appendFileSync(this.filePath, payload, { encoding: this.encoding });
fs.appendFileSync(this.filePath, record, { encoding: this.encoding });
});
return this.pending;