feat(monorepo): migrate to typescript monorepo

Committed by BernardNganduDev on 2025-11-07 17:09:29 +02:00
parent 3e09956f05
commit 075a388ccb
745 changed files with 2341 additions and 5082 deletions
@@ -0,0 +1,137 @@
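// Task handlers for the crawler's async queue: collect source listings,
// fetch individual articles, and forward processed articles to the backend API.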
import { logger } from "@basango/logger";
import { config, env } from "@/config";
import { SyncHttpClient } from "@/http/http-client";
import { createQueueManager, QueueManager } from "@/process/async/queue";
import {
  DetailsTaskPayload,
  ListingTaskPayload,
  ProcessingTaskPayload,
} from "@/process/async/schemas";
import { resolveCrawlerConfig } from "@/process/crawler";
import { HtmlCrawler } from "@/process/parsers/html";
import { WordPressCrawler } from "@/process/parsers/wordpress";
import { JsonlPersistor } from "@/process/persistence";
import { Article, HtmlSourceConfig, SourceKindSchema, WordPressSourceConfig } from "@/schema";
import { createDateRange, formatDateRange, formatPageRange, resolveSourceConfig } from "@/utils";
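
// Walk an HTML source's listing pages, extract article links with the configured
// selectors, and enqueue one details task per link. Delegates to the WordPress
// handler when the resolved source is not actually an HTML source.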
export const collectHtmlListing = async (
  payload: ListingTaskPayload,
  manager: QueueManager = createQueueManager(),
): Promise<number> => {
  const source = resolveSourceConfig(payload.sourceId) as HtmlSourceConfig;
  if (source.sourceKind !== "html") {
    return await collectWordPressListing(payload, manager);
  }
  const settings = resolveCrawlerConfig(source, payload);
  const crawler = new HtmlCrawler(settings);
  const pageRange = settings.pageRange ?? (await crawler.getPagination());
  let queued = 0;
  for (let page = pageRange.start; page <= pageRange.end; page += 1) {
    const target = crawler.buildPageUrl(page) ?? `${source.sourceUrl}`;
    try {
      const items = await crawler.fetchLinks(target, source.sourceSelectors.articles);
      for (const node of items) {
        const url = crawler.extractLink(node);
        if (!url) continue;
        await manager.enqueueArticle({
          category: payload.category,
          dateRange: createDateRange(payload.dateRange),
          sourceId: payload.sourceId,
          url,
        } as DetailsTaskPayload);
        queued += 1;
      }
    } catch (error) {
      logger.error({ error, target }, "Failed to crawl page");
    }
  }
  return queued;
};
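
// Page through a WordPress source's posts endpoint and enqueue one details task
// per post, carrying the raw post data alongside its link. Delegates to the HTML
// handler when the resolved source is not actually a WordPress source.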
export const collectWordPressListing = async (
  payload: ListingTaskPayload,
  manager: QueueManager = createQueueManager(),
): Promise<number> => {
  const source = resolveSourceConfig(payload.sourceId) as WordPressSourceConfig;
  if (source.sourceKind !== "wordpress") {
    return await collectHtmlListing(payload, manager);
  }
  const settings = resolveCrawlerConfig(source, payload);
  const crawler = new WordPressCrawler(settings);
  const pageRange = settings.pageRange ?? (await crawler.getPagination());
  let queued = 0;
  for (let page = pageRange.start; page <= pageRange.end; page += 1) {
    const endpoint = crawler.postsEndpoint(page);
    try {
      const entries = await crawler.fetchLinks(endpoint);
      for (const data of entries) {
        const url = data.link;
        if (!url) continue;
        await manager.enqueueArticle({
          category: payload.category,
          data,
          dateRange: createDateRange(payload.dateRange),
          sourceId: payload.sourceId,
          url,
        } as DetailsTaskPayload);
        queued += 1;
      }
    } catch (error) {
      logger.error({ error, page }, "Failed to fetch WordPress page");
    }
  }
  return queued;
};
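
// Fetch a single article for a details task, persisting the result as JSONL and
// dispatching to the HTML or WordPress crawler based on the source kind.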
export const collectArticle = async (payload: DetailsTaskPayload): Promise<unknown> => {
  const source = resolveSourceConfig(payload.sourceId);
  const settings = resolveCrawlerConfig(source, {
    category: payload.category,
    dateRange: payload.dateRange ? formatDateRange(payload.dateRange) : undefined,
    pageRange: payload.pageRange ? formatPageRange(payload.pageRange) : undefined,
    sourceId: payload.sourceId,
  });
  const persistors = [
    new JsonlPersistor({
      directory: config.paths.data,
      sourceId: String(source.sourceId),
    }),
  ];
  if (source.sourceKind === SourceKindSchema.enum.html) {
    if (!payload.url) throw new Error("Missing article url");
    const crawler = new HtmlCrawler(settings, { persistors });
    const html = await crawler.crawl(payload.url);
    return await crawler.fetchOne(html, settings.dateRange);
  }
  if (source.sourceKind === SourceKindSchema.enum.wordpress) {
    const crawler = new WordPressCrawler(settings, { persistors });
    return await crawler.fetchOne(payload.data ?? {}, settings.dateRange);
  }
  throw new Error(`Unsupported source kind: ${source.sourceKind}`);
};
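
// Forward a processed article to the backend API configured via
// BASANGO_CRAWLER_BACKEND_API_ENDPOINT.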
export const forwardForProcessing = async (payload: ProcessingTaskPayload): Promise<Article> => {
  logger.info({ article: payload.article.title }, "Ready for downstream processing");
  const client = new SyncHttpClient(config.fetch.client);
  const endpoint = env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT");
  await client.post(endpoint, { json: payload.article });
  logger.info({ article: payload.article.title }, "Forwarded article to API");
  return payload.article;
};