feat(crawler): improve async tasks integration
This commit is contained in:
@@ -5,15 +5,16 @@
|
|||||||
"type": "module",
|
"type": "module",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"types": "dist/index.d.ts",
|
"types": "dist/index.d.ts",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"build": "tsc -b",
|
"build": "tsc -b",
|
||||||
"test": "vitest --run"
|
"test": "vitest --run"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"bullmq": "^4.17.0",
|
"@basango/logger": "workspace:*",
|
||||||
"date-fns": "^3.6.0",
|
"bullmq": "^4.17.0",
|
||||||
"ioredis": "^5.3.2",
|
"date-fns": "^3.6.0",
|
||||||
"tiktoken": "^1.0.14",
|
"ioredis": "^5.3.2",
|
||||||
"zod": "^4.0.0"
|
"tiktoken": "^1.0.14",
|
||||||
|
"zod": "^4.0.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
export * from "../config";
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
export * from "../services/crawler/async/queue";
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
export * from "../schema";
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
export * from "../services/crawler/async/tasks";
|
||||||
@@ -310,5 +310,27 @@ export const resolveConfigPath = (basePath: string, env?: string): string => {
|
|||||||
return `${withoutExt}.${env}${ext}`;
|
return `${withoutExt}.${env}${ext}`;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const schemaToJSON = <T extends z.ZodTypeAny>(schema: T) =>
|
export const schemaToJSON = <T extends z.ZodTypeAny>(schema: T): unknown => {
|
||||||
schema.toJSON();
|
const candidate = schema as unknown as { toJSON?: () => unknown };
|
||||||
|
if (typeof candidate.toJSON === "function") {
|
||||||
|
return candidate.toJSON();
|
||||||
|
}
|
||||||
|
|
||||||
|
const typeName = (schema as { _def?: { typeName?: z.ZodFirstPartyTypeKind } })._def
|
||||||
|
?.typeName;
|
||||||
|
|
||||||
|
switch (typeName) {
|
||||||
|
case z.ZodFirstPartyTypeKind.ZodObject:
|
||||||
|
return { type: "object" };
|
||||||
|
case z.ZodFirstPartyTypeKind.ZodArray:
|
||||||
|
return { type: "array" };
|
||||||
|
case z.ZodFirstPartyTypeKind.ZodString:
|
||||||
|
return { type: "string" };
|
||||||
|
case z.ZodFirstPartyTypeKind.ZodNumber:
|
||||||
|
return { type: "number" };
|
||||||
|
case z.ZodFirstPartyTypeKind.ZodBoolean:
|
||||||
|
return { type: "boolean" };
|
||||||
|
default:
|
||||||
|
return { type: "unknown" };
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
|
import { logger } from "@basango/logger";
|
||||||
|
|
||||||
import {
|
import {
|
||||||
ListingTaskPayloadSchema,
|
ListingTaskPayloadSchema,
|
||||||
ArticleTaskPayloadSchema,
|
ArticleTaskPayloadSchema,
|
||||||
ProcessedTaskPayloadSchema,
|
ProcessedTaskPayloadSchema,
|
||||||
ListingTaskPayload,
|
ListingTaskPayload,
|
||||||
ArticleTaskPayload,
|
ArticleTaskPayload,
|
||||||
ProcessedTaskPayload,
|
ProcessedTaskPayload,
|
||||||
} from "./schemas";
|
} from "./schemas";
|
||||||
import {
|
import {
|
||||||
createQueueManager,
|
createQueueManager,
|
||||||
@@ -48,47 +50,122 @@ export interface ScheduleAsyncCrawlOptions {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export const scheduleAsyncCrawl = async ({
|
export const scheduleAsyncCrawl = async ({
|
||||||
sourceId,
|
sourceId,
|
||||||
env = "development",
|
env = "development",
|
||||||
pageRange,
|
pageRange,
|
||||||
dateRange,
|
dateRange,
|
||||||
category,
|
category,
|
||||||
settings,
|
settings,
|
||||||
queueManager,
|
queueManager,
|
||||||
}: ScheduleAsyncCrawlOptions): Promise<string> => {
|
}: ScheduleAsyncCrawlOptions): Promise<string> => {
|
||||||
const payload = ListingTaskPayloadSchema.parse({
|
const payload = ListingTaskPayloadSchema.parse({
|
||||||
source_id: sourceId,
|
source_id: sourceId,
|
||||||
env,
|
env,
|
||||||
page_range: pageRange ?? undefined,
|
page_range: pageRange ?? undefined,
|
||||||
date_range: dateRange ?? undefined,
|
date_range: dateRange ?? undefined,
|
||||||
category: category ?? undefined,
|
category: category ?? undefined,
|
||||||
});
|
});
|
||||||
|
|
||||||
const manager = queueManager ?? createQueueManager({ settings });
|
const manager = queueManager ?? createQueueManager({ settings });
|
||||||
try {
|
logger.debug(
|
||||||
const job = await manager.enqueueListing(payload);
|
{
|
||||||
return job.id;
|
sourceId,
|
||||||
} finally {
|
env: payload.env,
|
||||||
if (!queueManager) {
|
pageRange: payload.page_range,
|
||||||
await manager.close();
|
dateRange: payload.date_range,
|
||||||
}
|
category: payload.category,
|
||||||
|
},
|
||||||
|
"Scheduling listing collection job",
|
||||||
|
);
|
||||||
|
try {
|
||||||
|
const job = await manager.enqueueListing(payload);
|
||||||
|
logger.info(
|
||||||
|
{ jobId: job.id, sourceId, env: payload.env },
|
||||||
|
"Scheduled listing collection job",
|
||||||
|
);
|
||||||
|
return job.id;
|
||||||
|
} finally {
|
||||||
|
if (!queueManager) {
|
||||||
|
await manager.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
export const collectListing = async (payload: unknown): Promise<number> => {
|
export const collectListing = async (payload: unknown): Promise<number> => {
|
||||||
const data = ListingTaskPayloadSchema.parse(payload);
|
const data = ListingTaskPayloadSchema.parse(payload);
|
||||||
const result = await handlers.collectListing(data);
|
logger.debug(
|
||||||
return typeof result === "number" ? result : 0;
|
{
|
||||||
|
sourceId: data.source_id,
|
||||||
|
env: data.env,
|
||||||
|
pageRange: data.page_range,
|
||||||
|
dateRange: data.date_range,
|
||||||
|
category: data.category,
|
||||||
|
},
|
||||||
|
"Collecting listing",
|
||||||
|
);
|
||||||
|
|
||||||
|
const result = await handlers.collectListing(data);
|
||||||
|
const count = typeof result === "number" ? result : 0;
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
{
|
||||||
|
sourceId: data.source_id,
|
||||||
|
env: data.env,
|
||||||
|
queuedArticles: count,
|
||||||
|
},
|
||||||
|
"Listing collection completed",
|
||||||
|
);
|
||||||
|
|
||||||
|
return count;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const collectArticle = async (payload: unknown): Promise<unknown> => {
|
export const collectArticle = async (payload: unknown): Promise<unknown> => {
|
||||||
const data = ArticleTaskPayloadSchema.parse(payload);
|
const data = ArticleTaskPayloadSchema.parse(payload);
|
||||||
return handlers.collectArticle(data);
|
logger.debug(
|
||||||
|
{
|
||||||
|
sourceId: data.source_id,
|
||||||
|
env: data.env,
|
||||||
|
url: data.url,
|
||||||
|
page: data.page,
|
||||||
|
},
|
||||||
|
"Collecting article",
|
||||||
|
);
|
||||||
|
|
||||||
|
const result = await handlers.collectArticle(data);
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
{
|
||||||
|
sourceId: data.source_id,
|
||||||
|
env: data.env,
|
||||||
|
url: data.url,
|
||||||
|
},
|
||||||
|
"Article collection completed",
|
||||||
|
);
|
||||||
|
|
||||||
|
return result;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const forwardForProcessing = async (
|
export const forwardForProcessing = async (
|
||||||
payload: unknown,
|
payload: unknown,
|
||||||
): Promise<unknown> => {
|
): Promise<unknown> => {
|
||||||
const data = ProcessedTaskPayloadSchema.parse(payload);
|
const data = ProcessedTaskPayloadSchema.parse(payload);
|
||||||
return handlers.forwardForProcessing(data);
|
logger.debug(
|
||||||
|
{
|
||||||
|
sourceId: data.source_id,
|
||||||
|
env: data.env,
|
||||||
|
},
|
||||||
|
"Forwarding article for processing",
|
||||||
|
);
|
||||||
|
|
||||||
|
const result = await handlers.forwardForProcessing(data);
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
{
|
||||||
|
sourceId: data.source_id,
|
||||||
|
env: data.env,
|
||||||
|
},
|
||||||
|
"Article forwarded for processing",
|
||||||
|
);
|
||||||
|
|
||||||
|
return result;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,12 +1,13 @@
|
|||||||
{
|
{
|
||||||
"extends": "@basango/tsconfig/base.json",
|
"extends": "@basango/tsconfig/base.json",
|
||||||
"compilerOptions": {
|
"compilerOptions": {
|
||||||
"rootDir": "src",
|
"rootDir": "src",
|
||||||
"outDir": "dist"
|
"outDir": "dist",
|
||||||
},
|
"paths": {
|
||||||
"include": ["src"],
|
"@basango/crawler": ["./src/index.ts"],
|
||||||
"references": [],
|
"@basango/crawler/*": ["./src/*"]
|
||||||
"paths": {
|
}
|
||||||
"@basango/crawler": ["src/**"]
|
},
|
||||||
}
|
"include": ["src"],
|
||||||
|
"references": []
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user