feat(crawler): compute source updates dates
This commit is contained in:
@@ -4,7 +4,7 @@ BASANGO_API_PORT=3080
|
|||||||
BASANGO_API_ALLOWED_ORIGINS=http://localhost:3000,http://127.0.0.1:3000
|
BASANGO_API_ALLOWED_ORIGINS=http://localhost:3000,http://127.0.0.1:3000
|
||||||
BASANGO_API_KEY=your_api_key_here
|
BASANGO_API_KEY=your_api_key_here
|
||||||
BASANGO_API_CRAWLER_TOKEN=dev
|
BASANGO_API_CRAWLER_TOKEN=dev
|
||||||
BASANGO_API_CRAWLER_ENDPOINT="http://localhost:3080/articles"
|
BASANGO_API_CRAWLER_ENDPOINT="http://localhost:3080"
|
||||||
BASANGO_API_JWT_SECRET=your_jwt_secret_here
|
BASANGO_API_JWT_SECRET=your_jwt_secret_here
|
||||||
|
|
||||||
# db
|
# db
|
||||||
|
|||||||
@@ -1,9 +1,11 @@
|
|||||||
import { OpenAPIHono } from "@hono/zod-openapi";
|
import { OpenAPIHono } from "@hono/zod-openapi";
|
||||||
|
|
||||||
import { articlesRouter } from "#api/rest/routers/articles";
|
import { articlesRouter } from "#api/rest/routers/articles";
|
||||||
|
import { sourcesRouter } from "#api/rest/routers/sources";
|
||||||
|
|
||||||
const routers: OpenAPIHono = new OpenAPIHono();
|
const routers: OpenAPIHono = new OpenAPIHono();
|
||||||
|
|
||||||
routers.route("/articles", articlesRouter);
|
routers.route("/articles", articlesRouter);
|
||||||
|
routers.route("/sources", sourcesRouter);
|
||||||
|
|
||||||
export { routers };
|
export { routers };
|
||||||
|
|||||||
@@ -0,0 +1,58 @@
|
|||||||
|
import { getEarliestPublished, getLatestPublished } from "@basango/db/queries";
|
||||||
|
import {
|
||||||
|
getSourceUpdateDatesResponseSchema,
|
||||||
|
getSourceUpdateDatesSchema,
|
||||||
|
} from "@basango/domain/models";
|
||||||
|
import { OpenAPIHono, createRoute } from "@hono/zod-openapi";
|
||||||
|
|
||||||
|
import type { Context } from "#api/rest/init";
|
||||||
|
import { withCrawlerAuth } from "#api/rest/middlewares/crawler";
|
||||||
|
import { withDatabase } from "#api/rest/middlewares/db";
|
||||||
|
import { validateResponse } from "#api/utils/response";
|
||||||
|
|
||||||
|
const app = new OpenAPIHono<Context>();
|
||||||
|
|
||||||
|
app.openapi(
|
||||||
|
createRoute({
|
||||||
|
description: "Get the latest and earliest published dates for articles from a specific source.",
|
||||||
|
method: "post",
|
||||||
|
middleware: [withCrawlerAuth, withDatabase],
|
||||||
|
operationId: "GetSourceUpdateDates",
|
||||||
|
path: "/update-dates",
|
||||||
|
request: {
|
||||||
|
body: {
|
||||||
|
content: {
|
||||||
|
"application/json": {
|
||||||
|
schema: getSourceUpdateDatesSchema,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
responses: {
|
||||||
|
200: {
|
||||||
|
content: {
|
||||||
|
"application/json": {
|
||||||
|
schema: getSourceUpdateDatesResponseSchema,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
description: "Source update dates retrieved",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
summary: "Get Source Update Dates",
|
||||||
|
tags: ["Sources"],
|
||||||
|
"x-speakeasy-name-override": "getSourceUpdateDates",
|
||||||
|
}),
|
||||||
|
async (c) => {
|
||||||
|
const db = c.get("db");
|
||||||
|
const input = c.req.valid("json");
|
||||||
|
|
||||||
|
const [latest, earliest] = await Promise.all([
|
||||||
|
getLatestPublished(db, input.name),
|
||||||
|
getEarliestPublished(db, input.name),
|
||||||
|
]);
|
||||||
|
|
||||||
|
return c.json(validateResponse({ earliest, latest }, getSourceUpdateDatesResponseSchema), 200);
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
export const sourcesRouter = app;
|
||||||
@@ -1,23 +1,18 @@
|
|||||||
import type { HtmlSourceOptions, WordPressSourceOptions } from "@basango/domain/config";
|
import type { HtmlSourceOptions, WordPressSourceOptions } from "@basango/domain/config";
|
||||||
import { Article } from "@basango/domain/models";
|
|
||||||
import { logger } from "@basango/logger";
|
import { logger } from "@basango/logger";
|
||||||
|
|
||||||
import { UnsupportedSourceKindError } from "#crawler/errors";
|
import { UnsupportedSourceKindError } from "#crawler/errors";
|
||||||
import { QueueManager, createQueueManager } from "#crawler/process/async/queue";
|
import { QueueManager, createQueueManager } from "#crawler/process/async/queue";
|
||||||
import {
|
import { DetailsTaskPayload, ListingTaskPayload } from "#crawler/process/async/schemas";
|
||||||
DetailsTaskPayload,
|
|
||||||
ListingTaskPayload,
|
|
||||||
ProcessingTaskPayload,
|
|
||||||
} from "#crawler/process/async/schemas";
|
|
||||||
import { createPersistors, resolveCrawlerConfig } from "#crawler/process/crawler";
|
import { createPersistors, resolveCrawlerConfig } from "#crawler/process/crawler";
|
||||||
import { HtmlCrawler } from "#crawler/process/parsers/html";
|
import { HtmlCrawler } from "#crawler/process/parsers/html";
|
||||||
import { WordPressCrawler } from "#crawler/process/parsers/wordpress";
|
import { WordPressCrawler } from "#crawler/process/parsers/wordpress";
|
||||||
import { forward } from "#crawler/process/persistence";
|
|
||||||
import {
|
import {
|
||||||
createTimestampRange,
|
createTimestampRange,
|
||||||
formatPageRange,
|
formatPageRange,
|
||||||
formatTimestampRange,
|
formatTimestampRange,
|
||||||
resolveSourceConfig,
|
resolveSourceConfig,
|
||||||
|
resolveSourceUpdateDates,
|
||||||
} from "#crawler/utils";
|
} from "#crawler/utils";
|
||||||
|
|
||||||
export const collectHtmlListing = async (
|
export const collectHtmlListing = async (
|
||||||
@@ -30,6 +25,8 @@ export const collectHtmlListing = async (
|
|||||||
}
|
}
|
||||||
|
|
||||||
const settings = resolveCrawlerConfig(source, payload);
|
const settings = resolveCrawlerConfig(source, payload);
|
||||||
|
await resolveSourceUpdateDates(settings);
|
||||||
|
|
||||||
const crawler = new HtmlCrawler(settings);
|
const crawler = new HtmlCrawler(settings);
|
||||||
const pageRange = settings.pageRange ?? (await crawler.getPagination());
|
const pageRange = settings.pageRange ?? (await crawler.getPagination());
|
||||||
|
|
||||||
@@ -69,6 +66,8 @@ export const collectWordPressListing = async (
|
|||||||
}
|
}
|
||||||
|
|
||||||
const settings = resolveCrawlerConfig(source, payload);
|
const settings = resolveCrawlerConfig(source, payload);
|
||||||
|
await resolveSourceUpdateDates(settings);
|
||||||
|
|
||||||
const crawler = new WordPressCrawler(settings);
|
const crawler = new WordPressCrawler(settings);
|
||||||
const pageRange = settings.pageRange ?? (await crawler.getPagination());
|
const pageRange = settings.pageRange ?? (await crawler.getPagination());
|
||||||
|
|
||||||
@@ -99,10 +98,7 @@ export const collectWordPressListing = async (
|
|||||||
return queued;
|
return queued;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const collectArticle = async (
|
export const collectArticle = async (payload: DetailsTaskPayload): Promise<unknown> => {
|
||||||
payload: DetailsTaskPayload,
|
|
||||||
manager: QueueManager = createQueueManager(),
|
|
||||||
): Promise<unknown> => {
|
|
||||||
const source = resolveSourceConfig(payload.sourceId);
|
const source = resolveSourceConfig(payload.sourceId);
|
||||||
const settings = resolveCrawlerConfig(source, {
|
const settings = resolveCrawlerConfig(source, {
|
||||||
category: payload.category,
|
category: payload.category,
|
||||||
@@ -116,35 +112,14 @@ export const collectArticle = async (
|
|||||||
const crawler = new HtmlCrawler(settings, { persistors });
|
const crawler = new HtmlCrawler(settings, { persistors });
|
||||||
const html = await crawler.crawl(payload.url);
|
const html = await crawler.crawl(payload.url);
|
||||||
|
|
||||||
const article = await crawler.fetchOne(html, settings.dateRange);
|
return await crawler.fetchOne(html, settings.dateRange);
|
||||||
await manager.enqueueProcessed({
|
|
||||||
article,
|
|
||||||
sourceId: payload.sourceId,
|
|
||||||
} as ProcessingTaskPayload);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (source.sourceKind === "wordpress") {
|
if (source.sourceKind === "wordpress") {
|
||||||
const crawler = new WordPressCrawler(settings, { persistors });
|
const crawler = new WordPressCrawler(settings, { persistors });
|
||||||
|
|
||||||
const article = await crawler.fetchOne(payload.data ?? {}, settings.dateRange);
|
return await crawler.fetchOne(payload.data ?? {}, settings.dateRange);
|
||||||
await manager.enqueueProcessed({
|
|
||||||
article,
|
|
||||||
sourceId: payload.sourceId,
|
|
||||||
} as ProcessingTaskPayload);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
throw new UnsupportedSourceKindError(`Unsupported source kind`);
|
throw new UnsupportedSourceKindError(`Unsupported source kind`);
|
||||||
};
|
};
|
||||||
|
|
||||||
export const forwardForProcessing = async (payload: ProcessingTaskPayload): Promise<Article> => {
|
|
||||||
logger.info({ article: payload.article.title }, "Ready for downstream processing");
|
|
||||||
|
|
||||||
try {
|
|
||||||
logger.info({ article: payload.article.title }, "Forwarding article to API");
|
|
||||||
await forward(payload.article);
|
|
||||||
} catch (error) {
|
|
||||||
logger.error({ error }, "Failed to forward article to API");
|
|
||||||
}
|
|
||||||
|
|
||||||
return payload.article;
|
|
||||||
};
|
|
||||||
|
|||||||
@@ -9,8 +9,6 @@ import {
|
|||||||
DetailsTaskPayloadSchema,
|
DetailsTaskPayloadSchema,
|
||||||
ListingTaskPayload,
|
ListingTaskPayload,
|
||||||
ListingTaskPayloadSchema,
|
ListingTaskPayloadSchema,
|
||||||
ProcessingTaskPayload,
|
|
||||||
ProcessingTaskPayloadSchema,
|
|
||||||
} from "#crawler/process/async/schemas";
|
} from "#crawler/process/async/schemas";
|
||||||
import { parseRedisUrl } from "#crawler/utils";
|
import { parseRedisUrl } from "#crawler/utils";
|
||||||
|
|
||||||
@@ -58,7 +56,6 @@ export interface QueueManager {
|
|||||||
readonly connection: IORedis;
|
readonly connection: IORedis;
|
||||||
enqueueListing: (payload: ListingTaskPayload) => Promise<{ id: string }>;
|
enqueueListing: (payload: ListingTaskPayload) => Promise<{ id: string }>;
|
||||||
enqueueArticle: (payload: DetailsTaskPayload) => Promise<{ id: string }>;
|
enqueueArticle: (payload: DetailsTaskPayload) => Promise<{ id: string }>;
|
||||||
enqueueProcessed: (payload: ProcessingTaskPayload) => Promise<{ id: string }>;
|
|
||||||
iterQueueNames: () => string[];
|
iterQueueNames: () => string[];
|
||||||
queueName: (suffix: string) => string;
|
queueName: (suffix: string) => string;
|
||||||
close: () => Promise<void>;
|
close: () => Promise<void>;
|
||||||
@@ -92,16 +89,7 @@ export const createQueueManager = (options: CreateQueueManagerOptions = {}): Que
|
|||||||
const queue = ensureQueue(asyncOptions.queues.listing);
|
const queue = ensureQueue(asyncOptions.queues.listing);
|
||||||
return queue.add("collect_listing", data);
|
return queue.add("collect_listing", data);
|
||||||
},
|
},
|
||||||
enqueueProcessed: (payload) => {
|
iterQueueNames: () => [asyncOptions.queues.listing, asyncOptions.queues.details],
|
||||||
const data = ProcessingTaskPayloadSchema.parse(payload);
|
|
||||||
const queue = ensureQueue(asyncOptions.queues.processing);
|
|
||||||
return queue.add("forward_for_processing", data);
|
|
||||||
},
|
|
||||||
iterQueueNames: () => [
|
|
||||||
asyncOptions.queues.listing,
|
|
||||||
asyncOptions.queues.details,
|
|
||||||
asyncOptions.queues.processing,
|
|
||||||
],
|
|
||||||
options: asyncOptions,
|
options: asyncOptions,
|
||||||
queueName: (suffix: string) => `${asyncOptions.prefix}:${suffix}`,
|
queueName: (suffix: string) => `${asyncOptions.prefix}:${suffix}`,
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
import { PageRangeSchema, TimestampRangeSchema, articleSchema } from "@basango/domain/models";
|
import { PageRangeSchema, TimestampRangeSchema } from "@basango/domain/models";
|
||||||
import { z } from "zod";
|
import { z } from "zod";
|
||||||
|
|
||||||
export const ListingTaskPayloadSchema = z.object({
|
export const ListingTaskPayloadSchema = z.object({
|
||||||
@@ -18,11 +18,5 @@ export const DetailsTaskPayloadSchema = z.object({
|
|||||||
url: z.url(),
|
url: z.url(),
|
||||||
});
|
});
|
||||||
|
|
||||||
export const ProcessingTaskPayloadSchema = z.object({
|
|
||||||
article: articleSchema,
|
|
||||||
sourceId: z.string(),
|
|
||||||
});
|
|
||||||
|
|
||||||
export type ListingTaskPayload = z.infer<typeof ListingTaskPayloadSchema>;
|
export type ListingTaskPayload = z.infer<typeof ListingTaskPayloadSchema>;
|
||||||
export type DetailsTaskPayload = z.infer<typeof DetailsTaskPayloadSchema>;
|
export type DetailsTaskPayload = z.infer<typeof DetailsTaskPayloadSchema>;
|
||||||
export type ProcessingTaskPayload = z.infer<typeof ProcessingTaskPayloadSchema>;
|
|
||||||
|
|||||||
@@ -2,11 +2,7 @@ import { logger } from "@basango/logger";
|
|||||||
|
|
||||||
import * as handlers from "#crawler/process/async/handlers";
|
import * as handlers from "#crawler/process/async/handlers";
|
||||||
import { createQueueManager } from "#crawler/process/async/queue";
|
import { createQueueManager } from "#crawler/process/async/queue";
|
||||||
import {
|
import { DetailsTaskPayloadSchema, ListingTaskPayloadSchema } from "#crawler/process/async/schemas";
|
||||||
DetailsTaskPayloadSchema,
|
|
||||||
ListingTaskPayloadSchema,
|
|
||||||
ProcessingTaskPayloadSchema,
|
|
||||||
} from "#crawler/process/async/schemas";
|
|
||||||
import { CrawlingOptions } from "#crawler/process/crawler";
|
import { CrawlingOptions } from "#crawler/process/crawler";
|
||||||
|
|
||||||
export const collectListing = async (payload: unknown): Promise<number> => {
|
export const collectListing = async (payload: unknown): Promise<number> => {
|
||||||
@@ -29,16 +25,6 @@ export const collectArticle = async (payload: unknown): Promise<unknown> => {
|
|||||||
return result;
|
return result;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const forwardForProcessing = async (payload: unknown): Promise<unknown> => {
|
|
||||||
const data = ProcessingTaskPayloadSchema.parse(payload);
|
|
||||||
logger.debug({ sourceId: data.sourceId }, "Forwarding article for processing");
|
|
||||||
|
|
||||||
const result = await handlers.forwardForProcessing(data);
|
|
||||||
logger.info({ result }, "Article forwarded for processing");
|
|
||||||
|
|
||||||
return result;
|
|
||||||
};
|
|
||||||
|
|
||||||
export const scheduleAsyncCrawl = async (options: CrawlingOptions): Promise<string> => {
|
export const scheduleAsyncCrawl = async (options: CrawlingOptions): Promise<string> => {
|
||||||
const payload = ListingTaskPayloadSchema.parse({
|
const payload = ListingTaskPayloadSchema.parse({
|
||||||
category: options.category,
|
category: options.category,
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import { QueueEvents, Worker } from "bullmq";
|
|||||||
import IORedis from "ioredis";
|
import IORedis from "ioredis";
|
||||||
|
|
||||||
import { QueueFactory, QueueManager } from "#crawler/process/async/queue";
|
import { QueueFactory, QueueManager } from "#crawler/process/async/queue";
|
||||||
import { collectArticle, collectListing, forwardForProcessing } from "#crawler/process/async/tasks";
|
import { collectArticle, collectListing } from "#crawler/process/async/tasks";
|
||||||
|
|
||||||
export interface WorkerOptions {
|
export interface WorkerOptions {
|
||||||
queueNames?: string[];
|
queueNames?: string[];
|
||||||
@@ -36,8 +36,6 @@ export const startWorker = (options: WorkerOptions): WorkerHandle => {
|
|||||||
return collectListing(job.data);
|
return collectListing(job.data);
|
||||||
case "collect_article":
|
case "collect_article":
|
||||||
return collectArticle(job.data);
|
return collectArticle(job.data);
|
||||||
case "forward_for_processing":
|
|
||||||
return forwardForProcessing(job.data);
|
|
||||||
default:
|
default:
|
||||||
throw new Error(`Unknown job name: ${job.name}`);
|
throw new Error(`Unknown job name: ${job.name}`);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ export class HtmlCrawler extends BaseCrawler {
|
|||||||
{ url: this.currentNode },
|
{ url: this.currentNode },
|
||||||
"Article out of date range, stopping further processing",
|
"Article out of date range, stopping further processing",
|
||||||
);
|
);
|
||||||
break;
|
process.exit(0); // stop further processing
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.error({ error, url: this.currentNode }, "Failed to process HTML article");
|
logger.error({ error, url: this.currentNode }, "Failed to process HTML article");
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ export class WordPressCrawler extends BaseCrawler {
|
|||||||
{ url: node.link },
|
{ url: node.link },
|
||||||
"Article out of date range, stopping further processing",
|
"Article out of date range, stopping further processing",
|
||||||
);
|
);
|
||||||
break;
|
process.exit(0); // stop further processing
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.error({ error, url: node.link }, "Failed to process WordPress article");
|
logger.error({ error, url: node.link }, "Failed to process WordPress article");
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import fs from "node:fs";
|
|||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
|
|
||||||
import { config } from "@basango/domain/config";
|
import { config } from "@basango/domain/config";
|
||||||
import type { Article } from "@basango/domain/models";
|
import type { Article, SourceUpdateDates } from "@basango/domain/models";
|
||||||
import { md5 } from "@basango/encryption";
|
import { md5 } from "@basango/encryption";
|
||||||
import logger from "@basango/logger";
|
import logger from "@basango/logger";
|
||||||
|
|
||||||
@@ -61,19 +61,46 @@ export const persist = async (
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
forward(article).catch((error) => {
|
||||||
|
logger.error({ error }, "Failed to forward article");
|
||||||
|
});
|
||||||
|
|
||||||
logger.info({ url: article.link }, "article successfully persisted");
|
logger.info({ url: article.link }, "article successfully persisted");
|
||||||
return article;
|
return article;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export const getSourceUpdateDates = async (sourceId: string): Promise<SourceUpdateDates> => {
|
||||||
|
const client = new SyncHttpClient(config.crawler.fetch.client);
|
||||||
|
const endpoint = config.crawler.backend.endpoint;
|
||||||
|
|
||||||
|
logger.info({ sourceId }, "Fetching source update dates");
|
||||||
|
const response = await client.post(`${endpoint}/sources/update-dates`, {
|
||||||
|
headers: {
|
||||||
|
Authorization: config.crawler.backend.token,
|
||||||
|
},
|
||||||
|
json: {
|
||||||
|
name: sourceId,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
if (response.ok) {
|
||||||
|
const data = await response.json();
|
||||||
|
logger.info({ ...data }, "Retrieved source update dates");
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.error({ sourceId, status: response.status }, "Failed to retrieve source update dates");
|
||||||
|
return { earliest: new Date(), latest: new Date() };
|
||||||
|
};
|
||||||
|
|
||||||
export const forward = async (payload: Partial<Article>): Promise<void> => {
|
export const forward = async (payload: Partial<Article>): Promise<void> => {
|
||||||
const client = new SyncHttpClient(config.crawler.fetch.client);
|
const client = new SyncHttpClient(config.crawler.fetch.client);
|
||||||
const endpoint = config.crawler.backend.endpoint;
|
const endpoint = config.crawler.backend.endpoint;
|
||||||
const token = config.crawler.backend.token;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const response = await client.post(endpoint, {
|
const response = await client.post(`${endpoint}/articles`, {
|
||||||
headers: {
|
headers: {
|
||||||
Authorization: `${token}`,
|
Authorization: config.crawler.backend.token,
|
||||||
},
|
},
|
||||||
json: payload,
|
json: payload,
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -8,12 +8,13 @@ import {
|
|||||||
} from "#crawler/process/crawler";
|
} from "#crawler/process/crawler";
|
||||||
import { HtmlCrawler } from "#crawler/process/parsers/html";
|
import { HtmlCrawler } from "#crawler/process/parsers/html";
|
||||||
import { WordPressCrawler } from "#crawler/process/parsers/wordpress";
|
import { WordPressCrawler } from "#crawler/process/parsers/wordpress";
|
||||||
import { resolveSourceConfig } from "#crawler/utils";
|
import { resolveSourceConfig, resolveSourceUpdateDates } from "#crawler/utils";
|
||||||
|
|
||||||
export const runSyncCrawl = async (options: CrawlingOptions): Promise<void> => {
|
export const runSyncCrawl = async (options: CrawlingOptions): Promise<void> => {
|
||||||
const source = resolveSourceConfig(options.sourceId);
|
const source = resolveSourceConfig(options.sourceId);
|
||||||
const settings = resolveCrawlerConfig(source, options);
|
const settings = resolveCrawlerConfig(source, options);
|
||||||
const persistors = createPersistors(source);
|
const persistors = createPersistors(source);
|
||||||
|
await resolveSourceUpdateDates(settings);
|
||||||
|
|
||||||
const crawler =
|
const crawler =
|
||||||
source.sourceKind === "wordpress"
|
source.sourceKind === "wordpress"
|
||||||
|
|||||||
@@ -13,9 +13,12 @@ import {
|
|||||||
TimestampRange,
|
TimestampRange,
|
||||||
TimestampRangeSchema,
|
TimestampRangeSchema,
|
||||||
} from "@basango/domain/models";
|
} from "@basango/domain/models";
|
||||||
|
import logger from "@basango/logger";
|
||||||
import { format, fromUnixTime, getUnixTime, isMatch, parse } from "date-fns";
|
import { format, fromUnixTime, getUnixTime, isMatch, parse } from "date-fns";
|
||||||
import type { RedisOptions } from "ioredis";
|
import type { RedisOptions } from "ioredis";
|
||||||
|
|
||||||
|
import { getSourceUpdateDates } from "./process/persistence";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Resolve a source configuration by its ID.
|
* Resolve a source configuration by its ID.
|
||||||
* @param id - The source ID
|
* @param id - The source ID
|
||||||
@@ -32,6 +35,41 @@ export const resolveSourceConfig = (id: string): AnySourceOptions => {
|
|||||||
return source;
|
return source;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export const resolveSourceUpdateDates = async (settings: {
|
||||||
|
dateRange?: TimestampRange;
|
||||||
|
direction: "forward" | "backward";
|
||||||
|
source?: AnySourceOptions;
|
||||||
|
}) => {
|
||||||
|
if (settings.dateRange === undefined && settings.source) {
|
||||||
|
const dates = await getSourceUpdateDates(settings.source.sourceId);
|
||||||
|
|
||||||
|
switch (settings.direction) {
|
||||||
|
case "backward":
|
||||||
|
settings.dateRange = {
|
||||||
|
end: getUnixTime(dates.earliest),
|
||||||
|
start: getUnixTime(new Date()),
|
||||||
|
};
|
||||||
|
logger.info(
|
||||||
|
{ dateRange: settings.dateRange, sourceId: settings.source.sourceId },
|
||||||
|
"Set date range start from earliest published date",
|
||||||
|
);
|
||||||
|
break;
|
||||||
|
case "forward":
|
||||||
|
if (dates.latest) {
|
||||||
|
settings.dateRange = {
|
||||||
|
end: getUnixTime(new Date()),
|
||||||
|
start: getUnixTime(dates.latest),
|
||||||
|
};
|
||||||
|
logger.info(
|
||||||
|
{ dateRange: settings.dateRange, sourceId: settings.source.sourceId },
|
||||||
|
"Set date range start from latest published date",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse a Redis URL into RedisOptions.
|
* Parse a Redis URL into RedisOptions.
|
||||||
* @param url - The Redis URL (e.g., "redis://:password@localhost:6379/0")
|
* @param url - The Redis URL (e.g., "redis://:password@localhost:6379/0")
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { DEFAULT_CATEGORY_SHARES_LIMIT, DEFAULT_TIMEZONE } from "@basango/domain/constants";
|
import { DEFAULT_CATEGORY_SHARES_LIMIT, DEFAULT_TIMEZONE } from "@basango/domain/constants";
|
||||||
import { ID, Publication, Publications } from "@basango/domain/models";
|
import { ID, Publication, Publications } from "@basango/domain/models";
|
||||||
import { eq, sql } from "drizzle-orm";
|
import { eq, max, min, sql } from "drizzle-orm";
|
||||||
import * as uuid from "uuid";
|
import * as uuid from "uuid";
|
||||||
|
|
||||||
import { Database } from "#db/client";
|
import { Database } from "#db/client";
|
||||||
@@ -161,3 +161,27 @@ export async function getSourceCategoryShares(
|
|||||||
|
|
||||||
return { items: data.rows, total: data.rowCount ?? 0 };
|
return { items: data.rows, total: data.rowCount ?? 0 };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export async function getLatestPublished(db: Database, source: string): Promise<Date> {
|
||||||
|
const result = await db
|
||||||
|
.select({
|
||||||
|
publishedAt: max(articles.publishedAt),
|
||||||
|
})
|
||||||
|
.from(articles)
|
||||||
|
.innerJoin(sources, eq(articles.sourceId, sources.id))
|
||||||
|
.where(eq(sources.name, source));
|
||||||
|
|
||||||
|
return result[0]?.publishedAt ?? new Date();
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getEarliestPublished(db: Database, source: string): Promise<Date> {
|
||||||
|
const result = await db
|
||||||
|
.select({
|
||||||
|
publishedAt: min(articles.publishedAt),
|
||||||
|
})
|
||||||
|
.from(articles)
|
||||||
|
.innerJoin(sources, eq(articles.sourceId, sources.id))
|
||||||
|
.where(eq(sources.name, source));
|
||||||
|
|
||||||
|
return result[0]?.publishedAt ?? new Date();
|
||||||
|
}
|
||||||
|
|||||||
@@ -8,9 +8,9 @@ export const articleMetadataSchema = z.object({
|
|||||||
author: z.string().optional(),
|
author: z.string().optional(),
|
||||||
description: z.string().optional(),
|
description: z.string().optional(),
|
||||||
image: z.url().optional(),
|
image: z.url().optional(),
|
||||||
publishedAt: z.date().optional(),
|
publishedAt: z.string().optional(),
|
||||||
title: z.string().optional(),
|
title: z.string().optional(),
|
||||||
updatedAt: z.date().optional(),
|
updatedAt: z.string().optional(),
|
||||||
url: z.url().optional(),
|
url: z.url().optional(),
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -25,7 +25,7 @@ export const tokenStatisticsSchema = z.object({
|
|||||||
export const articleSchema = z.object({
|
export const articleSchema = z.object({
|
||||||
body: z.string().min(1),
|
body: z.string().min(1),
|
||||||
categories: z.array(z.string()),
|
categories: z.array(z.string()),
|
||||||
createdAt: z.date(),
|
createdAt: z.coerce.date(),
|
||||||
excerpt: z.string().optional(),
|
excerpt: z.string().optional(),
|
||||||
hash: z.string().min(1),
|
hash: z.string().min(1),
|
||||||
id: idSchema,
|
id: idSchema,
|
||||||
@@ -38,7 +38,7 @@ export const articleSchema = z.object({
|
|||||||
sourceId: z.union([z.uuid(), z.string().min(1)]),
|
sourceId: z.union([z.uuid(), z.string().min(1)]),
|
||||||
title: z.string().min(1),
|
title: z.string().min(1),
|
||||||
tokenStatistics: tokenStatisticsSchema.optional(),
|
tokenStatistics: tokenStatisticsSchema.optional(),
|
||||||
updatedAt: z.date().optional(),
|
updatedAt: z.coerce.date().optional(),
|
||||||
});
|
});
|
||||||
|
|
||||||
// API
|
// API
|
||||||
|
|||||||
@@ -39,5 +39,15 @@ export const updateSourceSchema = sourceSchema.pick({
|
|||||||
url: true,
|
url: true,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
export const getSourceUpdateDatesSchema = z.object({
|
||||||
|
name: z.string().min(1).max(255),
|
||||||
|
});
|
||||||
|
|
||||||
|
export const getSourceUpdateDatesResponseSchema = z.object({
|
||||||
|
earliest: z.coerce.date(),
|
||||||
|
latest: z.coerce.date(),
|
||||||
|
});
|
||||||
|
|
||||||
// types
|
// types
|
||||||
export type Source = z.infer<typeof sourceSchema>;
|
export type Source = z.infer<typeof sourceSchema>;
|
||||||
|
export type SourceUpdateDates = z.infer<typeof getSourceUpdateDatesResponseSchema>;
|
||||||
|
|||||||
Reference in New Issue
Block a user