feat(domain): centralize data definition
This commit is contained in:
@@ -1,8 +1,8 @@
|
||||
import type { HtmlSourceConfig, WordPressSourceConfig } from "@basango/domain/crawler";
|
||||
import { Article } from "@basango/domain/models";
|
||||
import { logger } from "@basango/logger";
|
||||
|
||||
import { config, env } from "#crawler/config";
|
||||
import { UnsupportedSourceKindError } from "#crawler/errors";
|
||||
import { SyncHttpClient } from "#crawler/http/http-client";
|
||||
import { QueueManager, createQueueManager } from "#crawler/process/async/queue";
|
||||
import {
|
||||
DetailsTaskPayload,
|
||||
@@ -12,11 +12,11 @@ import {
|
||||
import { createPersistors, resolveCrawlerConfig } from "#crawler/process/crawler";
|
||||
import { HtmlCrawler } from "#crawler/process/parsers/html";
|
||||
import { WordPressCrawler } from "#crawler/process/parsers/wordpress";
|
||||
import { Article, HtmlSourceConfig, WordPressSourceConfig } from "#crawler/schema";
|
||||
import { forward } from "#crawler/process/persistence";
|
||||
import {
|
||||
createDateRange,
|
||||
formatDateRange,
|
||||
createTimestampRange,
|
||||
formatPageRange,
|
||||
formatTimestampRange,
|
||||
resolveSourceConfig,
|
||||
} from "#crawler/utils";
|
||||
|
||||
@@ -45,7 +45,7 @@ export const collectHtmlListing = async (
|
||||
|
||||
await manager.enqueueArticle({
|
||||
category: payload.category,
|
||||
dateRange: createDateRange(payload.dateRange),
|
||||
dateRange: createTimestampRange(payload.dateRange),
|
||||
sourceId: payload.sourceId,
|
||||
url,
|
||||
} as DetailsTaskPayload);
|
||||
@@ -85,7 +85,7 @@ export const collectWordPressListing = async (
|
||||
await manager.enqueueArticle({
|
||||
category: payload.category,
|
||||
data,
|
||||
dateRange: createDateRange(payload.dateRange),
|
||||
dateRange: createTimestampRange(payload.dateRange),
|
||||
sourceId: payload.sourceId,
|
||||
url,
|
||||
} as DetailsTaskPayload);
|
||||
@@ -106,7 +106,7 @@ export const collectArticle = async (
|
||||
const source = resolveSourceConfig(payload.sourceId);
|
||||
const settings = resolveCrawlerConfig(source, {
|
||||
category: payload.category,
|
||||
dateRange: payload.dateRange ? formatDateRange(payload.dateRange) : undefined,
|
||||
dateRange: payload.dateRange ? formatTimestampRange(payload.dateRange) : undefined,
|
||||
pageRange: payload.pageRange ? formatPageRange(payload.pageRange) : undefined,
|
||||
sourceId: payload.sourceId,
|
||||
});
|
||||
@@ -141,19 +141,7 @@ export const forwardForProcessing = async (payload: ProcessingTaskPayload): Prom
|
||||
|
||||
try {
|
||||
logger.info({ article: payload.article.title }, "Forwarding article to API");
|
||||
|
||||
const client = new SyncHttpClient(config.fetch.client);
|
||||
const response = await client.post(env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT"), {
|
||||
headers: {
|
||||
Authorization: `${env("BASANGO_CRAWLER_TOKEN")}`,
|
||||
},
|
||||
json: payload.article,
|
||||
});
|
||||
|
||||
if (response.ok) {
|
||||
const data = await response.json();
|
||||
logger.info({ ...data }, "Article successfully forwarded to API");
|
||||
}
|
||||
await forward(payload.article);
|
||||
} catch (error) {
|
||||
logger.error({ error }, "Failed to forward article to API");
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import { PageRangeSchema, TimestampRangeSchema } from "@basango/domain/crawler";
|
||||
import { articleSchema } from "@basango/domain/models";
|
||||
import { z } from "zod";
|
||||
|
||||
import { ArticleSchema, DateRangeSchema, PageRangeSchema } from "#crawler/schema";
|
||||
|
||||
export const ListingTaskPayloadSchema = z.object({
|
||||
category: z.string().optional(),
|
||||
dateRange: z.string().optional(),
|
||||
@@ -12,7 +12,7 @@ export const ListingTaskPayloadSchema = z.object({
|
||||
export const DetailsTaskPayloadSchema = z.object({
|
||||
category: z.string().optional(),
|
||||
data: z.any().optional(),
|
||||
dateRange: DateRangeSchema.optional(),
|
||||
dateRange: TimestampRangeSchema.optional(),
|
||||
page: z.number().int().nonnegative().optional(),
|
||||
pageRange: PageRangeSchema.optional(),
|
||||
sourceId: z.string(),
|
||||
@@ -20,7 +20,7 @@ export const DetailsTaskPayloadSchema = z.object({
|
||||
});
|
||||
|
||||
export const ProcessingTaskPayloadSchema = z.object({
|
||||
article: ArticleSchema,
|
||||
article: articleSchema,
|
||||
sourceId: z.string(),
|
||||
});
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
import type { AnySourceConfig } from "@basango/domain/crawler";
|
||||
import logger from "@basango/logger";
|
||||
|
||||
import { FetchCrawlerConfig, config } from "#crawler/config";
|
||||
import { JsonlPersistor, Persistor } from "#crawler/process/persistence";
|
||||
import { AnySourceConfig } from "#crawler/schema";
|
||||
import { createDateRange, createPageRange } from "#crawler/utils";
|
||||
import { createPageRange, createTimestampRange } from "#crawler/utils";
|
||||
|
||||
export interface CrawlingOptions {
|
||||
sourceId: string;
|
||||
@@ -19,7 +19,7 @@ export const resolveCrawlerConfig = (
|
||||
return {
|
||||
...config.fetch.crawler,
|
||||
category: options.category,
|
||||
dateRange: createDateRange(options.dateRange),
|
||||
dateRange: createTimestampRange(options.dateRange),
|
||||
pageRange: createPageRange(options.pageRange),
|
||||
source,
|
||||
};
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import type { AnySourceConfig } from "@basango/domain/crawler";
|
||||
import { Article } from "@basango/domain/models";
|
||||
import { HTMLElement, parse as parseHtml } from "node-html-parser";
|
||||
|
||||
import { FetchCrawlerConfig, config } from "#crawler/config";
|
||||
import { SyncHttpClient } from "#crawler/http/http-client";
|
||||
import { OpenGraph } from "#crawler/http/open-graph";
|
||||
import type { Persistor } from "#crawler/process/persistence";
|
||||
import { AnySourceConfig, Article } from "#crawler/schema";
|
||||
|
||||
export interface CrawlerOptions {
|
||||
persistors?: Persistor[];
|
||||
@@ -97,7 +98,10 @@ export abstract class BaseCrawler {
|
||||
* @param record - The article record
|
||||
* @param url - The URL to fetch Open Graph data from
|
||||
*/
|
||||
protected async enrichWithOpenGraph(record: Article, url?: string): Promise<Article> {
|
||||
protected async enrichWithOpenGraph(
|
||||
record: Partial<Article>,
|
||||
url?: string,
|
||||
): Promise<Partial<Article>> {
|
||||
try {
|
||||
const metadata = url ? await this.openGraph.consumeUrl(url) : undefined;
|
||||
return { ...record, metadata };
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import type { HtmlSourceConfig, TimestampRange } from "@basango/domain/crawler";
|
||||
import { Article } from "@basango/domain/models";
|
||||
import { logger } from "@basango/logger";
|
||||
import { fromUnixTime, getUnixTime, isMatch as isDateMatch, parse } from "date-fns";
|
||||
import { HTMLElement } from "node-html-parser";
|
||||
@@ -12,7 +14,6 @@ import {
|
||||
} from "#crawler/errors";
|
||||
import { BaseCrawler } from "#crawler/process/parsers/base";
|
||||
import { Persistor, persist } from "#crawler/process/persistence";
|
||||
import { Article, DateRange, HtmlSourceConfig } from "#crawler/schema";
|
||||
import { createAbsoluteUrl, isTimestampInRange } from "#crawler/utils";
|
||||
|
||||
const md = new TurndownService({
|
||||
@@ -106,7 +107,7 @@ export class HtmlCrawler extends BaseCrawler {
|
||||
* @param html - The HTML content of the article
|
||||
* @param dateRange - Optional date range for filtering
|
||||
*/
|
||||
async fetchOne(html: string, dateRange?: DateRange | null): Promise<Article> {
|
||||
async fetchOne(html: string, dateRange?: TimestampRange | null): Promise<Partial<Article>> {
|
||||
const root = this.parseHtml(html);
|
||||
const selectors = this.source.sourceSelectors;
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import type { PageRange, TimestampRange, WordPressSourceConfig } from "@basango/domain/crawler";
|
||||
import { Article } from "@basango/domain/models";
|
||||
import { logger } from "@basango/logger";
|
||||
import { fromUnixTime } from "date-fns";
|
||||
import TurndownService from "turndown";
|
||||
@@ -10,7 +12,6 @@ import {
|
||||
} from "#crawler/errors";
|
||||
import { BaseCrawler } from "#crawler/process/parsers/base";
|
||||
import { Persistor, persist } from "#crawler/process/persistence";
|
||||
import { Article, DateRange, PageRange, WordPressSourceConfig } from "#crawler/schema";
|
||||
import { isTimestampInRange } from "#crawler/utils";
|
||||
|
||||
const md = new TurndownService({
|
||||
@@ -107,7 +108,7 @@ export class WordPressCrawler extends BaseCrawler {
|
||||
* @param input - Decoded JSON object or raw JSON string
|
||||
* @param dateRange - Optional date range for filtering
|
||||
*/
|
||||
async fetchOne(input: unknown, dateRange?: DateRange | null): Promise<Article> {
|
||||
async fetchOne(input: unknown, dateRange?: TimestampRange | null): Promise<Article> {
|
||||
// input can be the decoded JSON object or a raw JSON string
|
||||
let data: WordPressPost | null = null;
|
||||
try {
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
|
||||
import type { Article } from "@basango/domain/models";
|
||||
import { md5 } from "@basango/encryption";
|
||||
import logger from "@basango/logger";
|
||||
|
||||
import { Article } from "#crawler/schema";
|
||||
import { config, env } from "#crawler/config";
|
||||
import { HttpError, SyncHttpClient } from "#crawler/http/http-client";
|
||||
|
||||
export interface Persistor {
|
||||
persist(record: Article): Promise<void> | void;
|
||||
persist(record: Partial<Article>): Promise<void> | void;
|
||||
close: () => Promise<void> | void;
|
||||
}
|
||||
|
||||
@@ -35,17 +37,20 @@ const sanitize = (text: string): string => {
|
||||
return s.trim();
|
||||
};
|
||||
|
||||
export const persist = async (payload: Article, persistors: Persistor[]): Promise<Article> => {
|
||||
export const persist = async (
|
||||
payload: Partial<Article>,
|
||||
persistors: Persistor[],
|
||||
): Promise<Article> => {
|
||||
const data = {
|
||||
...payload,
|
||||
body: sanitize(payload.body),
|
||||
categories: payload.categories.map(sanitize),
|
||||
title: sanitize(payload.title),
|
||||
body: sanitize(payload.body!),
|
||||
categories: payload.categories!.map(sanitize),
|
||||
title: sanitize(payload.title!),
|
||||
};
|
||||
|
||||
const article = {
|
||||
...data,
|
||||
hash: md5(data.link),
|
||||
hash: md5(data.link!),
|
||||
} as Article;
|
||||
|
||||
for (const persistor of persistors) {
|
||||
@@ -60,6 +65,37 @@ export const persist = async (payload: Article, persistors: Persistor[]): Promis
|
||||
return article;
|
||||
};
|
||||
|
||||
/**
 * Forwards an article to the backend API with an HTTP POST.
 *
 * The endpoint and token are read from the `BASANGO_CRAWLER_BACKEND_API_ENDPOINT`
 * and `BASANGO_CRAWLER_TOKEN` environment entries. The token is sent verbatim
 * as the `Authorization` header value (no scheme prefix is added here).
 *
 * Failures of the request itself are logged, not rethrown: a non-ok response
 * or an error raised by the POST results in an error log carrying the
 * article link. NOTE(review): client construction and the `env` lookups sit
 * outside the `try`, so anything they throw still propagates to the caller.
 *
 * @param payload - The (possibly partial) article to send as the JSON body
 * @returns A promise that resolves once the outcome has been logged
 */
export const forward = async (payload: Partial<Article>): Promise<void> => {
  const client = new SyncHttpClient(config.fetch.client);
  const endpoint = env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT");
  const token = env("BASANGO_CRAWLER_TOKEN");

  try {
    const response = await client.post(endpoint, {
      headers: {
        Authorization: `${token}`,
      },
      json: payload,
    });

    if (response.ok) {
      const data = await response.json();
      logger.info({ ...data }, "Article forwarded");
      return;
    }

    logger.error({ status: response.status, url: payload.link }, "Forwarding failed");
  } catch (error) {
    if (error instanceof HttpError) {
      try {
        // Surface the structured error body returned with the failed request.
        const data = await error.response.json();
        logger.error({ ...data, url: payload.link }, "Error forwarding article");
        return;
      } catch {
        // The error body was not valid JSON (e.g. an HTML error page or an
        // empty body). Parsing must not throw out of this handler, otherwise
        // the failure would escape unlogged; fall through to the generic log.
      }
    }

    logger.error({ error, url: payload.link }, "Error forwarding article");
  }
};
|
||||
|
||||
export class JsonlPersistor implements Persistor {
|
||||
private readonly filePath: string;
|
||||
private readonly encoding: BufferEncoding;
|
||||
@@ -78,15 +114,15 @@ export class JsonlPersistor implements Persistor {
|
||||
}
|
||||
}
|
||||
|
||||
persist(record: Article): Promise<void> {
|
||||
persist(payload: Partial<Article>): Promise<void> {
|
||||
if (this.closed) {
|
||||
return Promise.reject(new Error("Persistor has been closed"));
|
||||
}
|
||||
|
||||
const payload = `${JSON.stringify(record)}\n`;
|
||||
const record = `${JSON.stringify(payload)}\n`;
|
||||
|
||||
this.pending = this.pending.then(async () => {
|
||||
fs.appendFileSync(this.filePath, payload, { encoding: this.encoding });
|
||||
fs.appendFileSync(this.filePath, record, { encoding: this.encoding });
|
||||
});
|
||||
|
||||
return this.pending;
|
||||
|
||||
Reference in New Issue
Block a user