From 6eb2ed08e584b2444e2d834694a26496e9498656 Mon Sep 17 00:00:00 2001
From: bernard-ng
Date: Tue, 4 Nov 2025 16:11:14 +0200
Subject: [PATCH] [crawler] configuration based api

---
 basango/.gitignore | 5 +-
 basango/apps/crawler/.env | 20 ++
 basango/apps/crawler/config/pipeline.json | 211 ++---
 ...{pipeline.production.json => sources.json} | 90 +++--
 basango/apps/crawler/package.json | 11 +-
 .../apps/crawler/src/__tests__/config.test.ts | 81 -----
 .../crawler/src/__tests__/crawler.test.ts | 91 -----
 .../crawler/src/__tests__/http-client.test.ts | 83 -----
 .../crawler/src/__tests__/open-graph.test.ts | 49 ---
 .../crawler/src/__tests__/persistence.test.ts | 27 --
 .../apps/crawler/src/__tests__/queue.test.ts | 57 ---
 .../apps/crawler/src/__tests__/schema.test.ts | 37 --
 .../apps/crawler/src/__tests__/tasks.test.ts | 49 ---
 basango/apps/crawler/src/config.ts | 219 ++++--------
 basango/apps/crawler/src/constants.ts | 13 +-
 basango/apps/crawler/src/http/http-client.ts | 74 ++--
 basango/apps/crawler/src/http/open-graph.ts | 103 +++---
 basango/apps/crawler/src/http/user-agent.ts | 9 +-
 basango/apps/crawler/src/persistence.ts | 59 ---
 .../crawler/src/process/async/handlers.ts | 138 +++++++
 .../apps/crawler/src/process/async/queue.ts | 89 ++---
 .../apps/crawler/src/process/async/schemas.ts | 52 ++-
 .../apps/crawler/src/process/async/tasks.ts | 182 ++--------
 .../apps/crawler/src/process/async/worker.ts | 28 +-
 basango/apps/crawler/src/process/crawler.ts | 168 ++-------
 .../apps/crawler/src/process/parsers/base.ts | 108 ++++++
 .../apps/crawler/src/process/parsers/html.ts | 338 ++++++++++++++++++
 .../crawler/src/process/parsers/wordpress.ts | 240 +++++++++++++
 .../apps/crawler/src/process/persistence.ts | 81 +++++
 .../apps/crawler/src/process/sync/tasks.ts | 29 ++
 basango/apps/crawler/src/schema.ts | 172 +++------
 basango/apps/crawler/src/scripts/crawl.ts | 22 ++
 basango/apps/crawler/src/scripts/queue.ts | 72 +---
 basango/apps/crawler/src/scripts/utils.ts | 39 ++
 basango/apps/crawler/src/scripts/worker.ts | 109 +-----
 basango/apps/crawler/src/utils.ts | 204 +++++------
 basango/biome.json | 19 +-
 basango/bun.lock | 37 +-
 basango/packages/db/src/queries/articles.ts | 54 +--
 basango/packages/db/src/queries/sources.ts | 20 +-
 basango/packages/db/src/schema.ts | 20 +-
 basango/packages/tsconfig/base.json | 2 +-
 package-lock.json | 6 +
 package.json | 1 +
 44 files changed, 1658 insertions(+), 1860 deletions(-)
 create mode 100644 basango/apps/crawler/.env
 rename basango/apps/crawler/config/{pipeline.production.json => sources.json} (71%)
 delete mode 100644 basango/apps/crawler/src/__tests__/config.test.ts
 delete mode 100644 basango/apps/crawler/src/__tests__/crawler.test.ts
 delete mode 100644 basango/apps/crawler/src/__tests__/http-client.test.ts
 delete mode 100644 basango/apps/crawler/src/__tests__/open-graph.test.ts
 delete mode 100644 basango/apps/crawler/src/__tests__/persistence.test.ts
 delete mode 100644 basango/apps/crawler/src/__tests__/queue.test.ts
 delete mode 100644 basango/apps/crawler/src/__tests__/schema.test.ts
 delete mode 100644 basango/apps/crawler/src/__tests__/tasks.test.ts
 delete mode 100644 basango/apps/crawler/src/persistence.ts
 create mode 100644 basango/apps/crawler/src/process/async/handlers.ts
 create mode 100644 basango/apps/crawler/src/process/parsers/base.ts
 create mode 100644 basango/apps/crawler/src/process/parsers/html.ts
 create mode 100644 basango/apps/crawler/src/process/parsers/wordpress.ts
 create mode 100644 basango/apps/crawler/src/process/persistence.ts
 create mode
100644 basango/apps/crawler/src/process/sync/tasks.ts create mode 100644 basango/apps/crawler/src/scripts/crawl.ts create mode 100644 basango/apps/crawler/src/scripts/utils.ts create mode 100644 package-lock.json create mode 100644 package.json diff --git a/basango/.gitignore b/basango/.gitignore index 4d8e1d0..a6b2b18 100644 --- a/basango/.gitignore +++ b/basango/.gitignore @@ -31,8 +31,9 @@ yarn-error.log* .pnpm-debug.log* # local env files -.env -.env*.local +.env.local +.env.*.local +.env.*.local.* # vercel .vercel diff --git a/basango/apps/crawler/.env b/basango/apps/crawler/.env new file mode 100644 index 0000000..b13fcc1 --- /dev/null +++ b/basango/apps/crawler/.env @@ -0,0 +1,20 @@ +# paths +BASANGO_CRAWLER_ROOT_PATH= +BASANGO_CRAWLER_DATA_PATH= +BASANGO_CRAWLER_LOGS_PATH= +BASANGO_CRAWLER_CONFIG_PATH= + +# crawler settings +BASANGO_CRAWLER_UPDATE_DIRECTION=forward +BASANGO_CRAWLER_FETCH_USER_AGENT="Basango/0.1 (+https://github.com/bernard-ng/basango)" +BASANGO_CRAWLER_FETCH_MAX_RETRIES=3 +BASANGO_CRAWLER_FETCH_RESPECT_RETRY_AFTER=true + +BASANGO_CRAWLER_ASYNC_REDIS_URL="redis://localhost:6379/0" +BASANGO_CRAWLER_ASYNC_TTL_RESULT=3600 +BASANGO_CRAWLER_ASYNC_TTL_FAILURE=3600 +BASANGO_CRAWLER_ASYNC_QUEUE_LISTING="listing" +BASANGO_CRAWLER_ASYNC_QUEUE_DETAILS="details" +BASANGO_CRAWLER_ASYNC_QUEUE_PROCESSING="processing" + +BASANGO_CRAWLER_BACKEND_API_ENDPOINT="http://localhost:3000/api/aggregator/articles?token=dev" diff --git a/basango/apps/crawler/config/pipeline.json b/basango/apps/crawler/config/pipeline.json index fb64dbf..c173617 100644 --- a/basango/apps/crawler/config/pipeline.json +++ b/basango/apps/crawler/config/pipeline.json @@ -1,195 +1,42 @@ { + "paths": { + "root": "%env(BASANGO_CRAWLER_ROOT_PATH)%", + "data": "%env(BASANGO_CRAWLER_DATA_PATH)%", + "logs": "%env(BASANGO_CRAWLER_LOGS_PATH)%", + "config": "%env(BASANGO_CRAWLER_CONFIG_PATH)%" + }, "fetch": { "client": { "timeout": 20, - "user_agent": "Basango/0.1 (+https://github.com/bernard-ng/basango)", - "follow_redirects": true, - "verify_ssl": true, + "userAgent": "%env(BASANGO_CRAWLER_FETCH_USER_AGENT)%", + "followRedirects": true, + "verifySsl": true, "rotate": true, - "max_retries": 3, - "backoff_initial": 1, - "backoff_multiplier": 2, - "backoff_max": 30, - "respect_retry_after": true + "maxRetries": "%env(BASANGO_CRAWLER_FETCH_MAX_RETRIES)%", + "backoffInitial": 1, + "backoffMultiplier": 2, + "backoffMax": 30, + "respectRetryAfter": "%env(BASANGO_CRAWLER_FETCH_RESPECT_RETRY_AFTER)%" }, "crawler": { "notify": false, "use_multi_threading": false, - "max_workers": 5 - } - }, - "logging": { - "level": "INFO", - "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s", - "file_logging": true, - "console_logging": true, - "log_file": "pipeline.log", - "max_log_size": 10485760, - "backup_count": 5 - }, - "sources": { - "html": [ - { - "source_id": "radiookapi.net", - "source_url": "https://www.radiookapi.net", - "source_date": { - "pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/", - "replacement": "$3-$2-$1 $4" - }, - "source_selectors": { - "articles": ".view-content > .views-row.content-row", - "article_title": "h1.page-header", - "article_link": ".views-field-title a", - "article_body": ".field-name-body", - "article_date": ".views-field-created", - "article_categories": ".views-field-field-cat-gorie a", - "pagination": "ul.pagination > li.pager-last > a" - }, - "pagination_template": "actualite", - "supports_categories": false, - "requires_details": true, - "requires_rate_limit": false + 
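The %env(NAME)% placeholders introduced above are resolved against the crawler's .env by @devscast/config. A rough sketch of the substitution semantics (resolvePlaceholders is a hypothetical helper, not the library's API, and it assumes a placeholder always occupies a whole string value):

// Hypothetical sketch, not the @devscast/config implementation.
const PLACEHOLDER = /^%env\(([A-Z0-9_]+)\)%$/;

export function resolvePlaceholders(value: unknown, env = process.env): unknown {
  if (typeof value === "string") {
    const name = PLACEHOLDER.exec(value)?.[1];
    return name ? (env[name] ?? value) : value;
  }
  if (Array.isArray(value)) {
    return value.map((item) => resolvePlaceholders(item, env));
  }
  if (value !== null && typeof value === "object") {
    return Object.fromEntries(
      Object.entries(value as Record<string, unknown>).map(([key, item]) => [
        key,
        resolvePlaceholders(item, env),
      ]),
    );
  }
  return value;
}

Note that values such as BASANGO_CRAWLER_FETCH_MAX_RETRIES arrive as strings, so the Zod schema (or the library) still has to coerce them into the numbers and booleans the config expects.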
"maxWorkers": 5, + "direction": "%env(BASANGO_CRAWLER_DEFAULT_DIRECTION)%" + }, + "async": { + "redisUrl": "%env(BASANGO_CRAWLER_ASYNC_REDIS_URL)%", + "prefix": "basango:crawler", + "ttl": { + "default": 600, + "result": "%env(BASANGO_CRAWLER_ASYNC_TTL_RESULT)%", + "failure": "%env(BASANGO_CRAWLER_ASYNC_TTL_FAILURE)%" }, - { - "source_id": "7sur7.cd", - "source_url": "https://7sur7.cd", - "source_date": { - "pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/", - "replacement": "$3-$2-$1 $4" - }, - "categories": ["politique", "economie", "culture", "sport", "societe"], - "source_selectors": { - "articles": ".view-content > .row.views-row", - "article_title": ".views-field-title a", - "article_link": ".views-field-title a", - "article_body": ".field.field--name-body", - "article_date": ".views-field-created", - "pagination": "ul.pagination > li.pager__item.pager__item--last > a" - }, - "pagination_template": "index.php/category/{category}", - "supports_categories": true, - "requires_details": false, - "requires_rate_limit": false - }, - { - "source_id": "mediacongo.net", - "source_url": "https://www.mediacongo.net", - "source_date": { - "format": "%d.%m.%Y %H:%M" - }, - "source_selectors": { - "articles": ".for_aitems > .article_other_item", - "article_title": "img", - "article_link": "a:first-child", - "article_categories": "a.color_link", - "article_body": ".article_ttext", - "article_date": ".article_other_about", - "pagination": "div.pagination > div > a:last-child" - }, - "pagination_template": "articles.html", - "supports_categories": false, - "requires_details": true, - "requires_rate_limit": false - }, - { - "source_id": "actualite.cd", - "source_url": "https://actualite.cd", - "source_date": { - "pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/", - "replacement": "$4-$3-$2 $5" - }, - "source_selectors": { - "articles": "#views-bootstrap-taxonomy-term-page-2 > div > div", - "article_title": "#actu-titre a", - "article_link": "#actu-titre a", - "article_categories": "#actu-cat a", - "article_body": ".views-field.views-field-body", - "article_date": "#p-date" - }, - "pagination_template": "actualite", - "supports_categories": false, - "requires_details": true, - "requires_rate_limit": false + "queues": { + "listing": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_LISTING)%", + "details": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_DETAILS)%", + "processing": "%env(BASANGO_CRAWLER_ASYNC_QUEUE_PROCESSING)%" } - ], - "wordpress": [ - { - "source_id": "beto.cd", - "source_url": "https://beto.cd", - "requires_rate_limit": true - }, - { "source_id": "newscd.net", "source_url": "https://newscd.net" }, - { - "source_id": "africanewsrdc.net", - "source_url": "https://www.africanewsrdc.net" - }, - { - "source_id": "angazainstitute.ac.cd", - "source_url": "https://angazainstitute.ac.cd" - }, - { "source_id": "b-onetv.cd", "source_url": "https://b-onetv.cd" }, - { "source_id": "bukavufm.com", "source_url": "https://bukavufm.com" }, - { - "source_id": "changement7.net", - "source_url": "https://changement7.net" - }, - { "source_id": "congoactu.net", "source_url": "https://congoactu.net" }, - { - "source_id": "congoindependant.com", - "source_url": "https://www.congoindependant.com" - }, - { - "source_id": "congoquotidien.com", - "source_url": "https://www.congoquotidien.com" - }, - { "source_id": "cumulard.cd", "source_url": "https://www.cumulard.cd" }, - { - "source_id": "environews-rdc.net", - "source_url": "https://environews-rdc.net" - }, - { - "source_id": "freemediardc.info", - 
"source_url": "https://www.freemediardc.info" - }, - { - "source_id": "geopolismagazine.org", - "source_url": "https://geopolismagazine.org" - }, - { "source_id": "habarirdc.net", "source_url": "https://habarirdc.net" }, - { "source_id": "infordc.com", "source_url": "https://infordc.com" }, - { - "source_id": "kilalopress.net", - "source_url": "https://kilalopress.net" - }, - { - "source_id": "laprosperiteonline.net", - "source_url": "https://laprosperiteonline.net" - }, - { - "source_id": "laprunellerdc.cd", - "source_url": "https://laprunellerdc.cd" - }, - { "source_id": "lesmedias.net", "source_url": "https://lesmedias.net" }, - { - "source_id": "lesvolcansnews.net", - "source_url": "https://lesvolcansnews.net" - }, - { - "source_id": "netic-news.net", - "source_url": "https://www.netic-news.net" - }, - { - "source_id": "objectif-infos.cd", - "source_url": "https://objectif-infos.cd" - }, - { "source_id": "scooprdc.net", "source_url": "https://scooprdc.net" }, - { - "source_id": "journaldekinshasa.com", - "source_url": "https://www.journaldekinshasa.com" - }, - { "source_id": "lepotentiel.cd", "source_url": "https://lepotentiel.cd" }, - { "source_id": "acturdc.com", "source_url": "https://acturdc.com" }, - { "source_id": "matininfos.net", "source_url": "https://matininfos.net" } - ] + } } } diff --git a/basango/apps/crawler/config/pipeline.production.json b/basango/apps/crawler/config/sources.json similarity index 71% rename from basango/apps/crawler/config/pipeline.production.json rename to basango/apps/crawler/config/sources.json index 83e9681..806322c 100644 --- a/basango/apps/crawler/config/pipeline.production.json +++ b/basango/apps/crawler/config/sources.json @@ -1,32 +1,4 @@ { - "fetch": { - "client": { - "timeout": 20, - "user_agent": "Basango/0.1 (+https://github.com/bernard-ng/basango)", - "follow_redirects": true, - "verify_ssl": true, - "rotate": true, - "max_retries": 3, - "backoff_initial": 1, - "backoff_multiplier": 2, - "backoff_max": 30, - "respect_retry_after": true - }, - "crawler": { - "notify": false, - "use_multi_threading": false, - "max_workers": 5 - } - }, - "logging": { - "level": "ERROR", - "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s", - "file_logging": true, - "console_logging": true, - "log_file": "pipeline.log", - "max_log_size": 10485760, - "backup_count": 5 - }, "sources": { "html": [ { @@ -38,16 +10,16 @@ }, "source_selectors": { "articles": ".view-content > .views-row.content-row", - "article_title": ".views-field-title a", + "article_title": "h1.page-header", "article_link": ".views-field-title a", "article_body": ".field-name-body", "article_date": ".views-field-created", "article_categories": ".views-field-field-cat-gorie a", - "pagination": "ul.pagination > li a(:last-child)" + "pagination": "ul.pagination > li.pager-last > a" }, - "pagination_template": "/actualite?page={page}", + "pagination_template": "actualite", "supports_categories": false, - "requires_details": false, + "requires_details": true, "requires_rate_limit": false }, { @@ -64,29 +36,29 @@ "article_link": ".views-field-title a", "article_body": ".field.field--name-body", "article_date": ".views-field-created", - "pagination": "ul.pagination > li a(:last-child)" + "pagination": "ul.pagination > li.pager__item.pager__item--last > a" }, - "pagination_template": "/index.php/category/{category}?page={page}", + "pagination_template": "index.php/category/{category}", "supports_categories": true, "requires_details": false, "requires_rate_limit": false }, { "source_id": 
"mediacongo.net", - "source_url": "https://mediacongo.net", + "source_url": "https://www.mediacongo.net", "source_date": { "format": "%d.%m.%Y %H:%M" }, "source_selectors": { "articles": ".for_aitems > .article_other_item", "article_title": "img", - "article_link": "a(:first-child)", + "article_link": "a:first-child", "article_categories": "a.color_link", "article_body": ".article_ttext", "article_date": ".article_other_about", - "pagination": ".nav > a(:last-child)" + "pagination": "div.pagination > div > a:last-child" }, - "pagination_template": "/articles.html?page={page}", + "pagination_template": "articles.html", "supports_categories": false, "requires_details": true, "requires_rate_limit": false @@ -106,7 +78,7 @@ "article_body": ".views-field.views-field-body", "article_date": "#p-date" }, - "pagination_template": "/actualite?page={page}", + "pagination_template": "actualite", "supports_categories": false, "requires_details": true, "requires_rate_limit": false @@ -128,12 +100,18 @@ "source_url": "https://angazainstitute.ac.cd" }, { "source_id": "b-onetv.cd", "source_url": "https://b-onetv.cd" }, - { "source_id": "bukavufm.com", "source_url": "https://bukavufm.com" }, + { + "source_id": "bukavufm.com", + "source_url": "https://bukavufm.com" + }, { "source_id": "changement7.net", "source_url": "https://changement7.net" }, - { "source_id": "congoactu.net", "source_url": "https://congoactu.net" }, + { + "source_id": "congoactu.net", + "source_url": "https://congoactu.net" + }, { "source_id": "congoindependant.com", "source_url": "https://www.congoindependant.com" @@ -142,7 +120,10 @@ "source_id": "congoquotidien.com", "source_url": "https://www.congoquotidien.com" }, - { "source_id": "cumulard.cd", "source_url": "https://www.cumulard.cd" }, + { + "source_id": "cumulard.cd", + "source_url": "https://www.cumulard.cd" + }, { "source_id": "environews-rdc.net", "source_url": "https://environews-rdc.net" @@ -155,7 +136,10 @@ "source_id": "geopolismagazine.org", "source_url": "https://geopolismagazine.org" }, - { "source_id": "habarirdc.net", "source_url": "https://habarirdc.net" }, + { + "source_id": "habarirdc.net", + "source_url": "https://habarirdc.net" + }, { "source_id": "infordc.com", "source_url": "https://infordc.com" }, { "source_id": "kilalopress.net", @@ -169,7 +153,10 @@ "source_id": "laprunellerdc.cd", "source_url": "https://laprunellerdc.cd" }, - { "source_id": "lesmedias.net", "source_url": "https://lesmedias.net" }, + { + "source_id": "lesmedias.net", + "source_url": "https://lesmedias.net" + }, { "source_id": "lesvolcansnews.net", "source_url": "https://lesvolcansnews.net" @@ -182,14 +169,23 @@ "source_id": "objectif-infos.cd", "source_url": "https://objectif-infos.cd" }, - { "source_id": "scooprdc.net", "source_url": "https://scooprdc.net" }, + { + "source_id": "scooprdc.net", + "source_url": "https://scooprdc.net" + }, { "source_id": "journaldekinshasa.com", "source_url": "https://www.journaldekinshasa.com" }, - { "source_id": "lepotentiel.cd", "source_url": "https://lepotentiel.cd" }, + { + "source_id": "lepotentiel.cd", + "source_url": "https://lepotentiel.cd" + }, { "source_id": "acturdc.com", "source_url": "https://acturdc.com" }, - { "source_id": "matininfos.net", "source_url": "https://matininfos.net" } + { + "source_id": "matininfos.net", + "source_url": "https://matininfos.net" + } ] } } diff --git a/basango/apps/crawler/package.json b/basango/apps/crawler/package.json index a92c5ef..d4f5fc3 100644 --- a/basango/apps/crawler/package.json +++ 
b/basango/apps/crawler/package.json @@ -3,20 +3,25 @@ "private": true, "type": "module", "scripts": { + "=========== CODE STYLE ============": "", "test": "vitest --run", "lint": "biome check .", "lint:fix": "biome check --write .", "format": "biome format --write .", - "queue": "bun run src/scripts/queue.ts", - "worker": "bun run src/scripts/worker.ts" + "============= CLI =============": "", + "crawl:sync": "bun run src/scripts/crawl.ts", + "crawl:async": "bun run src/scripts/queue.ts", + "crawl:worker": "bun run src/scripts/worker.ts" }, "dependencies": { "@basango/logger": "workspace:*", + "@devscast/config": "^1.0.2", "bullmq": "^4.17.0", "date-fns": "catalog:", "ioredis": "^5.3.2", - "node-html-parser": "^6.1.10", + "node-html-parser": "^7.0.1", "tiktoken": "^1.0.14", + "turndown": "^7.2.2", "zod": "catalog:" } } diff --git a/basango/apps/crawler/src/__tests__/config.test.ts b/basango/apps/crawler/src/__tests__/config.test.ts deleted file mode 100644 index 249ecf0..0000000 --- a/basango/apps/crawler/src/__tests__/config.test.ts +++ /dev/null @@ -1,81 +0,0 @@ -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; - -import { describe, expect, it } from "vitest"; -import { loadConfig } from "@/config"; - -import { resolveConfigPath } from "@/utils"; - -describe("loadConfig", () => { - it("parses json configuration and ensures directories", () => { - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "crawler-config-")); - const paths = { - root: tempDir, - data: path.join(tempDir, "data"), - logs: path.join(tempDir, "logs"), - configs: path.join(tempDir, "configs"), - }; - - const configPath = path.join(tempDir, "pipeline.json"); - fs.writeFileSync( - configPath, - JSON.stringify( - { - paths, - fetch: { - client: { timeout: 10 }, - }, - }, - null, - 2, - ), - ); - - const config = loadConfig({ path: configPath }); - - expect(config.fetch.client.timeout).toBe(10); - expect(fs.existsSync(paths.data)).toBe(true); - expect(fs.existsSync(paths.logs)).toBe(true); - expect(fs.existsSync(paths.configs)).toBe(true); - }); - - it("merges environment override if available", () => { - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "crawler-config-")); - const paths = { - root: tempDir, - data: path.join(tempDir, "data"), - logs: path.join(tempDir, "logs"), - configs: path.join(tempDir, "configs"), - }; - - const basePath = path.join(tempDir, "pipeline.json"); - fs.writeFileSync( - basePath, - JSON.stringify( - { - paths, - logging: { level: "INFO" }, - }, - null, - 2, - ), - ); - - const overridePath = resolveConfigPath(basePath, "production"); - fs.writeFileSync( - overridePath, - JSON.stringify( - { - logging: { level: "DEBUG" }, - }, - null, - 2, - ), - ); - - const config = loadConfig({ path: basePath, env: "production" }); - - expect(config.logging.level).toBe("DEBUG"); - }); -}); diff --git a/basango/apps/crawler/src/__tests__/crawler.test.ts b/basango/apps/crawler/src/__tests__/crawler.test.ts deleted file mode 100644 index 039370f..0000000 --- a/basango/apps/crawler/src/__tests__/crawler.test.ts +++ /dev/null @@ -1,91 +0,0 @@ -import { describe, expect, it, beforeEach, vi } from "vitest"; - -import { PipelineConfigManager } from "@/config"; -import { registerCrawler, clearCrawlerRegistry, runSyncCrawl } from "@/process/crawler"; -import { PipelineConfigSchema, SourceKindSchema } from "@/schema"; - -const createPipeline = () => - PipelineConfigSchema.parse({ - paths: { - root: ".", - data: ".", - logs: ".", - configs: ".", - }, - sources: { - html: [ 
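Among the dependency changes, turndown is new; presumably it converts scraped article HTML to Markdown somewhere in the new parsers. Its standard API looks like this (headingStyle is set explicitly because the library defaults to setext headings):

import TurndownService from "turndown";

// Convert article HTML to Markdown; "atx" yields "#"-style headings.
const turndown = new TurndownService({ headingStyle: "atx" });
const markdown = turndown.turndown("<h1>Titre</h1><p>Corps de l'article.</p>");
// markdown === "# Titre\n\nCorps de l'article."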
- { - source_id: "demo", - source_url: "https://example.com", - source_kind: SourceKindSchema.enum.html, - pagination_template: "/page/{page}", - }, - ], - wordpress: [], - }, - }); - -describe("runSyncCrawl", () => { - beforeEach(() => { - clearCrawlerRegistry(); - }); - - it("invokes registered crawler factory", async () => { - const pipeline = createPipeline(); - const fetch = vi.fn().mockResolvedValue(undefined); - const close = vi.fn(); - - registerCrawler(SourceKindSchema.enum.html, () => ({ fetch, close })); - - const manager = { - get: vi.fn().mockReturnValue(pipeline), - setupLogging: vi.fn(), - } as unknown as PipelineConfigManager; - - const persistClose = vi.fn(); - const persistFactory = vi.fn().mockReturnValue([ - { persist: vi.fn(), close: persistClose }, - ]); - - await runSyncCrawl({ - sourceId: "demo", - env: "test", - manager, - persistFactory, - }); - - expect(fetch).toHaveBeenCalledTimes(1); - expect(close).toHaveBeenCalledTimes(1); - expect(persistFactory).toHaveBeenCalledWith({ - pipeline, - source: pipeline.sources.html[0], - resolvedSourceId: "demo", - }); - expect(persistClose).toHaveBeenCalledTimes(1); - }); - - it("throws when source is missing", async () => { - const pipeline = createPipeline(); - registerCrawler(SourceKindSchema.enum.html, () => ({ fetch: vi.fn() })); - const manager = { - get: vi.fn().mockReturnValue(pipeline), - setupLogging: vi.fn(), - } as unknown as PipelineConfigManager; - - await expect( - runSyncCrawl({ sourceId: "unknown", manager }), - ).rejects.toThrow("Source 'unknown' not found"); - }); - - it("throws when no crawler registered", async () => { - const pipeline = createPipeline(); - const manager = { - get: vi.fn().mockReturnValue(pipeline), - setupLogging: vi.fn(), - } as unknown as PipelineConfigManager; - - await expect( - runSyncCrawl({ sourceId: "demo", manager }), - ).rejects.toThrow("No crawler registered"); - }); -}); diff --git a/basango/apps/crawler/src/__tests__/http-client.test.ts b/basango/apps/crawler/src/__tests__/http-client.test.ts deleted file mode 100644 index 2ad0048..0000000 --- a/basango/apps/crawler/src/__tests__/http-client.test.ts +++ /dev/null @@ -1,83 +0,0 @@ -import { describe, expect, it, vi } from "vitest"; - -import { ClientConfigSchema } from "@/schema"; -import { HttpError, SyncHttpClient } from "@/http/http-client"; - -const createConfig = () => - ClientConfigSchema.parse({ - timeout: 1, - max_retries: 2, - backoff_initial: 0.001, - backoff_multiplier: 2, - backoff_max: 0.01, - }); - -describe("SyncHttpClient", () => { - it("retries transient statuses", async () => { - const config = createConfig(); - const sleep = vi.fn().mockResolvedValue(undefined); - const fetchMock = vi - .fn() - .mockResolvedValueOnce(new Response("retry", { status: 503 })) - .mockResolvedValueOnce(new Response("ok", { status: 200, body: "done" })); - - const client = new SyncHttpClient(config, { fetchImpl: fetchMock, sleep }); - const response = await client.get("https://example.com"); - - expect(await response.text()).toBe("done"); - expect(fetchMock).toHaveBeenCalledTimes(2); - expect(sleep).toHaveBeenCalled(); - }); - - it("respects retry-after header", async () => { - const config = createConfig(); - const sleep = vi.fn().mockResolvedValue(undefined); - const fetchMock = vi - .fn() - .mockResolvedValueOnce( - new Response("retry", { status: 503, headers: { "Retry-After": "3" } }), - ) - .mockResolvedValueOnce(new Response("ok", { status: 200 })); - - const client = new SyncHttpClient(config, { fetchImpl: fetchMock, sleep 
}); - await client.get("https://example.com"); - - expect(sleep).toHaveBeenCalledWith(3000); - }); - - it("throws http error on non transient failure", async () => { - const config = createConfig(); - const fetchMock = vi - .fn() - .mockResolvedValueOnce(new Response("bad", { status: 404, statusText: "Not Found" })); - - const client = new SyncHttpClient(config, { fetchImpl: fetchMock }); - - await expect(client.get("https://example.com")) - .rejects.toBeInstanceOf(HttpError); - }); - - it("sends json payload and query params", async () => { - const config = createConfig(); - const fetchMock = vi - .fn() - .mockResolvedValue(new Response("ok", { status: 200 })); - - const client = new SyncHttpClient(config, { fetchImpl: fetchMock }); - await client.post("https://example.com/api", { - params: { page: 1, q: "news" }, - json: { hello: "world" }, - headers: { Authorization: "token" }, - }); - - expect(fetchMock).toHaveBeenCalledTimes(1); - const [url, init] = fetchMock.mock.calls[0]!; - expect(url).toBe("https://example.com/api?page=1&q=news"); - expect(init?.method).toBe("POST"); - expect(init?.body).toBe(JSON.stringify({ hello: "world" })); - expect((init?.headers as Record)["Authorization"]).toBe("token"); - expect((init?.headers as Record)["Content-Type"]).toBe( - "application/json", - ); - }); -}); diff --git a/basango/apps/crawler/src/__tests__/open-graph.test.ts b/basango/apps/crawler/src/__tests__/open-graph.test.ts deleted file mode 100644 index d3e51e7..0000000 --- a/basango/apps/crawler/src/__tests__/open-graph.test.ts +++ /dev/null @@ -1,49 +0,0 @@ -import { describe, expect, it, vi } from "vitest"; - -import { OpenGraphProvider } from "@/http/open-graph"; - -const sampleHtml = ` - - - - Example Article - - - - - - - - - - -`; - -describe("OpenGraphProvider", () => { - it("extracts metadata from html", () => { - const metadata = OpenGraphProvider.consumeHtml(sampleHtml, "https://example.com"); - - expect(metadata).toEqual({ - title: "Open Graph Title", - description: "Summary", - image: "https://cdn.example.com/image.jpg", - url: "https://example.com/article", - }); - }); - - it("falls back to null when no metadata present", () => { - const empty = OpenGraphProvider.consumeHtml(""); - expect(empty).toBeNull(); - }); - - it("fetches metadata from url", async () => { - const response = new Response(sampleHtml, { status: 200 }); - const get = vi.fn().mockResolvedValue(response); - - const provider = new OpenGraphProvider({ client: { get } }); - const metadata = await provider.consumeUrl("https://example.com/article"); - - expect(get).toHaveBeenCalledWith("https://example.com/article"); - expect(metadata?.title).toBe("Open Graph Title"); - }); -}); diff --git a/basango/apps/crawler/src/__tests__/persistence.test.ts b/basango/apps/crawler/src/__tests__/persistence.test.ts deleted file mode 100644 index 05d9e44..0000000 --- a/basango/apps/crawler/src/__tests__/persistence.test.ts +++ /dev/null @@ -1,27 +0,0 @@ -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; - -import { describe, expect, it } from "vitest"; - -import { JsonlPersistor } from "@/persistence"; - -describe("JsonlPersistor", () => { - it("writes json lines sequentially", async () => { - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "jsonl-test-")); - const persistor = new JsonlPersistor({ directory: tempDir, sourceId: "demo" }); - - await Promise.all([ - persistor.persist({ id: 1, title: "first" }), - persistor.persist({ id: 2, title: "second" }), - ]); - - await persistor.close(); 
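The deleted test above pinned down the one guarantee JsonlPersistor offered: concurrent persist() calls append whole JSON lines in order. A minimal sketch of the promise-chaining pattern behind it, which the replacement in process/persistence.ts presumably keeps:

import fs from "node:fs/promises";

// Each write is queued behind the previous one, so concurrent callers can
// never interleave partial lines in the output file.
class SequentialJsonlWriter {
  private pending: Promise<void> = Promise.resolve();

  constructor(private readonly filePath: string) {}

  persist(record: Record<string, unknown>): Promise<void> {
    this.pending = this.pending.then(() =>
      fs.appendFile(this.filePath, `${JSON.stringify(record)}\n`, "utf-8"),
    );
    return this.pending;
  }

  close(): Promise<void> {
    return this.pending;
  }
}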
- - const contents = fs.readFileSync(path.join(tempDir, "demo.jsonl"), "utf-8"); - const lines = contents.trim().split("\n").map((line) => JSON.parse(line)); - - expect(lines).toContainEqual({ id: 1, title: "first" }); - expect(lines).toContainEqual({ id: 2, title: "second" }); - }); -}); diff --git a/basango/apps/crawler/src/__tests__/queue.test.ts b/basango/apps/crawler/src/__tests__/queue.test.ts deleted file mode 100644 index 804ce3d..0000000 --- a/basango/apps/crawler/src/__tests__/queue.test.ts +++ /dev/null @@ -1,57 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { createQueueManager, createQueueSettings } from "@/process/async/queue"; - -class InMemoryQueue { - public jobs: Array<{ name: string; data: unknown }> = []; - - async add(name: string, data: unknown) { - this.jobs.push({ name, data }); - return { id: `${name}-${this.jobs.length}` }; - } -} - -describe("createQueueManager", () => { - it("prefixes queue names", () => { - const manager = createQueueManager({ - settings: createQueueSettings({ prefix: "test" }), - queueFactory: (queueName) => { - expect(queueName).toBe("listing"); - return new InMemoryQueue(); - }, - connection: { - quit: async () => undefined, - } as any, - }); - - expect(manager.iterQueueNames()).toEqual([ - "test:listing", - "test:articles", - "test:processed", - ]); - }); - - it("enqueues listing job with validated payload", async () => { - const queue = new InMemoryQueue(); - const manager = createQueueManager({ - queueFactory: () => queue, - connection: { quit: async () => undefined } as any, - }); - - const job = await manager.enqueueListing({ - source_id: "radiookapi", - env: "test", - }); - - expect(job.id).toBe("collect_listing-1"); - expect(queue.jobs[0]).toEqual({ - name: "collect_listing", - data: { - source_id: "radiookapi", - env: "test", - page_range: undefined, - date_range: undefined, - category: undefined, - }, - }); - }); -}); diff --git a/basango/apps/crawler/src/__tests__/schema.test.ts b/basango/apps/crawler/src/__tests__/schema.test.ts deleted file mode 100644 index 26b4adc..0000000 --- a/basango/apps/crawler/src/__tests__/schema.test.ts +++ /dev/null @@ -1,37 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { - PageRangeSchema, - PageRangeSpecSchema, - PipelineConfigSchema, -} from "@/schema"; -import { - createDateRange, - formatDateRange, - isTimestampInRange, - schemaToJSON, -} from "@/utils"; - -describe("schema helpers", () => { - it("creates date range from spec", () => { - const range = createDateRange("2024-01-01:2024-01-31"); - expect(range.start).toBeLessThan(range.end); - expect(formatDateRange(range)).toBe("2024-01-01:2024-01-31"); - }); - - it("checks membership", () => { - const range = createDateRange("2024-01-01:2024-01-02"); - expect(isTimestampInRange(range, range.start)).toBe(true); - expect(isTimestampInRange(range, range.start - 1)).toBe(false); - }); - - it("parses page range spec", () => { - const range = PageRangeSchema.parse(PageRangeSpecSchema.parse("1:10")); - expect(range).toEqual({ start: 1, end: 10 }); - }); - - it("produces json schema", () => { - const json = schemaToJSON(PipelineConfigSchema); - // @ts-ignore - expect(json.type).toBe("object"); - }); -}); diff --git a/basango/apps/crawler/src/__tests__/tasks.test.ts b/basango/apps/crawler/src/__tests__/tasks.test.ts deleted file mode 100644 index b6f119a..0000000 --- a/basango/apps/crawler/src/__tests__/tasks.test.ts +++ /dev/null @@ -1,49 +0,0 @@ -import { describe, expect, it, vi } from "vitest"; -import { QueueManager } 
from "../process/async/queue"; -import { - collectListing, - registerCrawlerTaskHandlers, - scheduleAsyncCrawl, -} from "@/process/async/tasks"; - -describe("Async tasks", () => { - it("schedules crawl with provided manager", async () => { - const enqueueListing = vi.fn().mockResolvedValue({ id: "job-1" }); - const manager = { - enqueueListing, - } as unknown as QueueManager; - - const jobId = await scheduleAsyncCrawl({ - sourceId: "radiookapi", - queueManager: manager, - }); - - expect(jobId).toBe("job-1"); - expect(enqueueListing).toHaveBeenCalledWith({ - source_id: "radiookapi", - env: "development", - page_range: undefined, - date_range: undefined, - category: undefined, - }); - }); - - it("delegates listing collection to registered handler", async () => { - const handler = vi.fn().mockResolvedValue(5); - registerCrawlerTaskHandlers({ collectListing: handler }); - - const count = await collectListing({ - source_id: "radiookapi", - env: "development", - }); - - expect(count).toBe(5); - expect(handler).toHaveBeenCalledWith({ - source_id: "radiookapi", - env: "development", - page_range: undefined, - date_range: undefined, - category: undefined, - }); - }); -}); diff --git a/basango/apps/crawler/src/config.ts b/basango/apps/crawler/src/config.ts index 5f856e7..c7501df 100644 --- a/basango/apps/crawler/src/config.ts +++ b/basango/apps/crawler/src/config.ts @@ -1,152 +1,81 @@ -import fs from "node:fs"; import path from "node:path"; -import { logger } from "@basango/logger"; - -import { PipelineConfig, PipelineConfigSchema } from "@/schema"; +import { loadConfig } from "@devscast/config"; +import { z } from "zod"; import { - ensureDirectories, - mergePipelineConfig, - resolveConfigPath, - resolveProjectPaths, -} from "@/utils"; -import { DEFAULT_CONFIG_FILES } from "@/constants"; + DateRangeSchema, + HtmlSourceConfigSchema, + PageRangeSchema, + UpdateDirectionSchema, + WordPressSourceConfigSchema, +} from "@/schema"; -export interface LoadConfigOptions { - path?: string; - env?: string; -} +export const PROJECT_DIR = path.resolve(process.cwd(), "basango", "apps", "crawler"); -const readJsonFile = (filePath: string): unknown => { - const contents = fs.readFileSync(filePath, "utf-8"); - return contents.trim() === "" ? 
{} : JSON.parse(contents); -}; +export const PipelineConfigSchema = z.object({ + paths: z.object({ + root: z.string().default(PROJECT_DIR), + data: z.string().default(path.join(PROJECT_DIR, "data", "dataset")), + config: z.string().default(path.join(PROJECT_DIR, "config")), + }), + fetch: z.object({ + client: z.object({ + timeout: z.number().positive().default(20), + userAgent: z.string().default("Basango/0.1 (+https://github.com/bernard-ng/basango)"), + followRedirects: z.boolean().default(true), + verifySsl: z.boolean().default(true), + rotate: z.boolean().default(true), + maxRetries: z.number().int().nonnegative().default(3), + backoffInitial: z.number().nonnegative().default(1), + backoffMultiplier: z.number().positive().default(2), + backoffMax: z.number().nonnegative().default(30), + respectRetryAfter: z.boolean().default(true), + }), + crawler: z.object({ + source: z.union([HtmlSourceConfigSchema, WordPressSourceConfigSchema]).optional(), + pageRange: PageRangeSchema.optional(), + dateRange: DateRangeSchema.optional(), + category: z.string().optional(), + notify: z.boolean().default(false), + isUpdate: z.boolean().default(false), + useMultiThreading: z.boolean().default(false), + maxWorkers: z.number().int().positive().default(5), + direction: UpdateDirectionSchema.default("forward"), + }), + async: z.object({ + redisUrl: z.string().default("redis://localhost:6379/0"), + prefix: z.string().default("basango:crawler:queue"), + ttl: z.object({ + default: z.number().int().positive().default(600), + result: z.number().int().nonnegative().default(3600), + failure: z.number().int().nonnegative().default(3600), + }), + queues: z.object({ + listing: z.string().default("listing"), + details: z.string().default("details"), + processing: z.string().default("processing"), + }), + }), + }), + sources: z.object({ + html: z.array(HtmlSourceConfigSchema).default([]), + wordpress: z.array(WordPressSourceConfigSchema).default([]), + }), +}); -const locateConfigFile = (explicit?: string): string => { - if (explicit && fs.existsSync(explicit)) { - return explicit; - } +export const { config, env } = loadConfig({ + schema: PipelineConfigSchema, + cwd: process.cwd(), + env: { + path: path.join(PROJECT_DIR, ".env"), + }, + sources: [ + path.join(PROJECT_DIR, "config", "pipeline.json"), + path.join(PROJECT_DIR, "config", "sources.json"), + ], +}); - for (const candidate of DEFAULT_CONFIG_FILES) { - if (fs.existsSync(candidate)) { - return candidate; - } - } - - return DEFAULT_CONFIG_FILES[0]!; -}; - -const readPipelineConfig = (configPath: string): PipelineConfig => { - if (!fs.existsSync(configPath)) { - return PipelineConfigSchema.parse({ - paths: resolveProjectPaths(path.resolve(".")), - }); - } - - const raw = readJsonFile(configPath); - return PipelineConfigSchema.parse(raw); -}; - -const applyEnvironmentOverride = ( - baseConfig: PipelineConfig, - basePath: string, - env?: string, -): PipelineConfig => { - if (!env || env === "development") { - return baseConfig; - } - - const overridePath = resolveConfigPath(basePath, env); - if (!fs.existsSync(overridePath)) { - return baseConfig; - } - - const overrides = PipelineConfigSchema.parse(readJsonFile(overridePath)); - return mergePipelineConfig(baseConfig, overrides); -}; - -export const loadConfig = (options: LoadConfigOptions = {}): PipelineConfig => { - const basePath = locateConfigFile(options.path); - const config = applyEnvironmentOverride( - readPipelineConfig(basePath), - basePath, - options.env, - ); - - ensureDirectories(config.paths); - return 
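The loader being deleted here is replaced by the typed singleton defined above; downstream modules just import it. A usage sketch based on the new exports:

import { config, env } from "@/config";

console.log(config.fetch.client.timeout);       // 20 unless overridden
console.log(config.fetch.async.queues.listing); // "listing"

// env() reads variables that are not part of the schema, as handlers.ts
// does for the backend endpoint further down.
const endpoint = env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT");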
config; -}; - -export const dumpConfig = ( - config: PipelineConfig, - targetPath?: string, -): void => { - const destination = targetPath ?? locateConfigFile(); - const normalized = PipelineConfigSchema.parse(config); - fs.mkdirSync(path.dirname(destination), { recursive: true }); - fs.writeFileSync(destination, JSON.stringify(normalized, null, 2)); -}; - -export interface PipelineConfigManagerOptions { - path?: string; - env?: string; - autoLoad?: boolean; -} - -export class PipelineConfigManager { - private readonly explicitPath?: string; - - private readonly defaultEnv: string; - - private cache?: PipelineConfig; - - constructor(options: PipelineConfigManagerOptions = {}) { - this.explicitPath = options.path; - this.defaultEnv = options.env ?? "development"; - - if (options.autoLoad !== false) { - this.cache = loadConfig({ - path: this.explicitPath, - env: this.defaultEnv, - }); - } - } - - get(env?: string): PipelineConfig { - const resolvedEnv = env ?? this.defaultEnv; - - if (resolvedEnv !== this.defaultEnv) { - return loadConfig({ - path: this.explicitPath, - env: resolvedEnv, - }); - } - - if (!this.cache) { - this.cache = loadConfig({ - path: this.explicitPath, - env: resolvedEnv, - }); - } - - return this.cache; - } - - setupLogging(config?: PipelineConfig): void { - const pipeline = config ?? this.get(); - ensureDirectories(pipeline.paths); - - const level = pipeline.logging.level.toLowerCase(); - process.env.LOG_LEVEL = level; - logger.level = level as typeof logger.level; - - if (pipeline.logging.file_logging) { - const logDir = pipeline.paths.logs; - const destination = path.join(logDir, pipeline.logging.log_file); - fs.mkdirSync(path.dirname(destination), { recursive: true }); - if (!fs.existsSync(destination)) { - fs.writeFileSync(destination, ""); - } - } - } -} +export type PipelineConfig = z.infer; +export type FetchClientConfig = PipelineConfig["fetch"]["client"]; +export type FetchCrawlerConfig = PipelineConfig["fetch"]["crawler"]; +export type FetchAsyncConfig = PipelineConfig["fetch"]["async"]; diff --git a/basango/apps/crawler/src/constants.ts b/basango/apps/crawler/src/constants.ts index 0c4e0e1..793959f 100644 --- a/basango/apps/crawler/src/constants.ts +++ b/basango/apps/crawler/src/constants.ts @@ -1,15 +1,6 @@ -import path from "node:path"; - export const DEFAULT_DATE_FORMAT = "yyyy-LL-dd"; -export const DEFAULT_CONFIG_FILES = [ - path.join(process.cwd(), "config", "pipeline.json"), - path.join(process.cwd(), "pipeline.json"), -]; - -export const DEFAULT_USER_AGENT = - "Basango/0.1 (+https://github.com/bernard-ng/basango)"; +export const DEFAULT_USER_AGENT = "Basango/0.1 (+https://github.com/bernard-ng/basango)"; export const OPEN_GRAPH_USER_AGENT = "facebookexternalhit/1.1"; -export const TRANSIENT_HTTP_STATUSES = [429, 500, 502, 503, 504] as const; - +export const TRANSIENT_HTTP_STATUSES = [429, 500, 502, 503, 504]; export const DEFAULT_RETRY_AFTER_HEADER = "retry-after"; diff --git a/basango/apps/crawler/src/http/http-client.ts b/basango/apps/crawler/src/http/http-client.ts index 79b3c75..34c73b2 100644 --- a/basango/apps/crawler/src/http/http-client.ts +++ b/basango/apps/crawler/src/http/http-client.ts @@ -1,8 +1,12 @@ import { setTimeout as delay } from "node:timers/promises"; -import type { ClientConfig } from "@/schema"; -import { DEFAULT_RETRY_AFTER_HEADER, DEFAULT_USER_AGENT, TRANSIENT_HTTP_STATUSES } from "@/constants"; +import { + DEFAULT_RETRY_AFTER_HEADER, + DEFAULT_USER_AGENT, + TRANSIENT_HTTP_STATUSES, +} from "@/constants"; import { UserAgents 
} from "@/http/user-agent"; +import { FetchClientConfig } from "@/config"; export type HttpHeaders = Record; export type HttpParams = Record; @@ -34,13 +38,19 @@ export class HttpError extends Error { } } +/** + * Default sleep function using setTimeout. + * @param ms - Milliseconds to sleep + */ const defaultSleep = (ms: number): Promise => { - if (typeof Bun !== "undefined" && typeof Bun.sleep === "function") { - return Bun.sleep(ms); - } return delay(ms).then(() => undefined); }; +/** + * Builds a URL with query parameters. + * @param url - The base URL + * @param params - The query parameters to append + */ const buildUrl = (url: string, params?: HttpParams): string => { if (!params || Object.keys(params).length === 0) { return url; @@ -55,10 +65,15 @@ const buildUrl = (url: string, params?: HttpParams): string => { return target.toString(); }; -const computeBackoff = (config: ClientConfig, attempt: number): number => { +/** + * Computes the backoff time in milliseconds based on the configuration and attempt number. + * @param config - Fetch client configuration + * @param attempt - Current attempt number + */ +const computeBackoff = (config: FetchClientConfig, attempt: number): number => { const base = Math.min( - config.backoff_initial * Math.pow(config.backoff_multiplier, attempt), - config.backoff_max, + config.backoffInitial * Math.pow(config.backoffMultiplier, attempt), + config.backoffMax, ); const jitter = Math.random() * base * 0.25; return (base + jitter) * 1000; @@ -79,18 +94,23 @@ const parseRetryAfter = (header: string): number => { return delta > 0 ? delta : 0; }; +/** + * Base HTTP client providing common functionality. + * + * @author Bernard Ngandu + */ export class BaseHttpClient { - protected readonly config: ClientConfig; + protected readonly config: FetchClientConfig; protected readonly fetchImpl: typeof fetch; protected readonly sleep: (ms: number) => Promise; protected readonly headers: HttpHeaders; - constructor(config: ClientConfig, options: HttpClientOptions = {}) { + constructor(config: FetchClientConfig, options: HttpClientOptions = {}) { this.config = config; const provider = options.userAgentProvider ?? - new UserAgents(config.rotate, config.user_agent ?? DEFAULT_USER_AGENT); - const userAgent = provider.get() ?? config.user_agent ?? DEFAULT_USER_AGENT; + new UserAgents(config.rotate, config.userAgent ?? DEFAULT_USER_AGENT); + const userAgent = provider.get() ?? config.userAgent ?? DEFAULT_USER_AGENT; const baseHeaders: HttpHeaders = { "User-Agent": userAgent }; if (options.defaultHeaders) { @@ -115,7 +135,7 @@ export class BaseHttpClient { if (response) { const retryAfter = response.headers.get(retryAfterHeader); - if (retryAfter && this.config.respect_retry_after) { + if (retryAfter && this.config.respectRetryAfter) { waitMs = parseRetryAfter(retryAfter); } } @@ -130,16 +150,17 @@ export class BaseHttpClient { } } +/** + * Synchronous HTTP client with retry and timeout capabilities. + * + * @author Bernard Ngandu + */ export class SyncHttpClient extends BaseHttpClient { - async request( - method: string, - url: string, - options: HttpRequestOptions = {}, - ): Promise { + async request(method: string, url: string, options: HttpRequestOptions = {}): Promise { const retryAfterHeader = options.retryAfterHeader ?? 
DEFAULT_RETRY_AFTER_HEADER; const target = buildUrl(url, options.params); - const maxAttempts = this.config.max_retries + 1; + const maxAttempts = this.config.maxRetries + 1; let attempt = 0; let lastError: unknown; @@ -155,20 +176,19 @@ export class SyncHttpClient extends BaseHttpClient { headers, body: options.data as BodyInit | undefined, signal: controller.signal, - redirect: this.config.follow_redirects ? "follow" : "manual", + redirect: this.config.followRedirects ? "follow" : "manual", }; if (options.json !== undefined) { init.body = JSON.stringify(options.json); - (init.headers as Record)["Content-Type"] ??= - "application/json"; + (init.headers as Record)["Content-Type"] ??= "application/json"; } const response = await this.fetchImpl(target, init); if ( TRANSIENT_HTTP_STATUSES.includes(response.status as number) && - attempt < this.config.max_retries + attempt < this.config.maxRetries ) { await this.maybeDelay(attempt, response, retryAfterHeader); attempt += 1; @@ -188,12 +208,12 @@ export class SyncHttpClient extends BaseHttpClient { if (error instanceof DOMException && error.name === "AbortError") { lastError = error; - if (attempt >= this.config.max_retries) { + if (attempt >= this.config.maxRetries) { throw error; } } else { lastError = error; - if (attempt >= this.config.max_retries) { + if (attempt >= this.config.maxRetries) { throw error; } } @@ -207,9 +227,7 @@ export class SyncHttpClient extends BaseHttpClient { } } - throw lastError instanceof Error - ? lastError - : new Error("HTTP request failed after retries"); + throw lastError instanceof Error ? lastError : new Error("HTTP request failed after retries"); } get(url: string, options?: Omit): Promise { diff --git a/basango/apps/crawler/src/http/open-graph.ts b/basango/apps/crawler/src/http/open-graph.ts index 9aabae8..5024d6e 100644 --- a/basango/apps/crawler/src/http/open-graph.ts +++ b/basango/apps/crawler/src/http/open-graph.ts @@ -1,33 +1,30 @@ import { parse } from "node-html-parser"; import { OPEN_GRAPH_USER_AGENT } from "@/constants"; -import type { ClientConfig } from "@/schema"; import { SyncHttpClient } from "@/http/http-client"; import { UserAgents } from "@/http/user-agent"; +import { config } from "@/config"; +import { ArticleMetadata } from "@/schema"; -export interface OpenGraphMetadata { - title?: string | null; - description?: string | null; - image?: string | null; - url?: string | null; -} - -export interface OpenGraphProviderOptions { - client?: Pick; - clientConfig?: ClientConfig; - userAgentProvider?: UserAgents; -} - -const pick = (values: Array): string | null => { +/** + * Picks the first non-empty value from the provided array. + * @param values - An array of string values + */ +const pick = (values: Array): string | undefined => { for (const value of values) { if (value && value.trim().length > 0) { return value.trim(); } } - return null; + return undefined; }; -const extractMeta = (root: ReturnType, property: string): string | null => { +/** + * Extracts the content of a meta tag given its property or name. + * @param root - The root HTML element + * @param property - The property or name of the meta tag to extract + */ +const extract = (root: ReturnType, property: string): string | null => { const selector = `meta[property='${property}'], meta[name='${property}']`; const node = root.querySelector(selector); if (!node) { @@ -36,70 +33,64 @@ const extractMeta = (root: ReturnType, property: string): string | return node.getAttribute("content") ?? 
null; }; -export class OpenGraphProvider { +/** + * OpenGraph consumer for extracting Open Graph metadata from HTML pages. + * Uses a synchronous HTTP client to fetch the HTML content. + * + * @author Bernard Ngandu + */ +export class OpenGraph { private readonly client: Pick; - constructor(options: OpenGraphProviderOptions = {}) { - const provider = - options.userAgentProvider ?? new UserAgents(false, OPEN_GRAPH_USER_AGENT); - const clientConfig: ClientConfig = - options.clientConfig ?? ({ - timeout: 20, - user_agent: OPEN_GRAPH_USER_AGENT, - follow_redirects: true, - verify_ssl: true, - rotate: false, - max_retries: 2, - backoff_initial: 1, - backoff_multiplier: 2, - backoff_max: 5, - respect_retry_after: true, - } satisfies ClientConfig); + constructor() { + const settings = config.fetch.client; + const provider = new UserAgents(true, OPEN_GRAPH_USER_AGENT); - this.client = - options.client ?? - new SyncHttpClient(clientConfig, { - userAgentProvider: provider, - defaultHeaders: { "User-Agent": provider.og() }, - }); + this.client = new SyncHttpClient(settings, { + userAgentProvider: provider, + defaultHeaders: { "User-Agent": provider.og() }, + }); } - async consumeUrl(url: string): Promise { + /** + * Consume a URL and extract Open Graph metadata. + * @param url - The URL to fetch and parse + */ + async consumeUrl(url: string): Promise { try { const response = await this.client.get(url); const html = await response.text(); - return OpenGraphProvider.consumeHtml(html, url); + return OpenGraph.consumeHtml(html, url); } catch { - return null; + return undefined; } } - static consumeHtml(html: string, url?: string): OpenGraphMetadata | null { + /** + * Consume HTML content and extract Open Graph metadata. + * @param html - HTML content as a string + * @param url - Optional URL of the page + */ + static consumeHtml(html: string, url?: string): ArticleMetadata | undefined { if (!html) { - return null; + return undefined; } const root = parse(html); - const title = pick([ - extractMeta(root, "og:title"), - root.querySelector("title")?.text, - ]); - const description = pick([ - extractMeta(root, "og:description"), - extractMeta(root, "description"), - ]); + const title = pick([extract(root, "og:title"), root.querySelector("title")?.text]); + const description = pick([extract(root, "og:description"), extract(root, "description")]); const image = pick([ - extractMeta(root, "og:image"), + extract(root, "og:image"), root.querySelector("img")?.getAttribute("src") ?? null, ]); const canonical = pick([ - extractMeta(root, "og:url"), + extract(root, "og:url"), root.querySelector("link[rel='canonical']")?.getAttribute("href") ?? null, url ?? null, ]); if (!title && !description && !image && !canonical) { - return null; + return undefined; } return { diff --git a/basango/apps/crawler/src/http/user-agent.ts b/basango/apps/crawler/src/http/user-agent.ts index b7cb1c3..09a4c34 100644 --- a/basango/apps/crawler/src/http/user-agent.ts +++ b/basango/apps/crawler/src/http/user-agent.ts @@ -1,5 +1,12 @@ import { DEFAULT_USER_AGENT, OPEN_GRAPH_USER_AGENT } from "@/constants"; +/** + * User agent provider with optional rotation. + * Allows fetching a random user agent from a predefined list + * or using a fallback user agent. 
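A usage sketch for the OpenGraph consumer above, mirroring the HTML fixture from the deleted open-graph test:

import { OpenGraph } from "@/http/open-graph";

const html = `<html><head>
  <title>Example Article</title>
  <meta property="og:title" content="Open Graph Title" />
  <meta property="og:description" content="Summary" />
  <meta property="og:image" content="https://cdn.example.com/image.jpg" />
  <meta property="og:url" content="https://example.com/article" />
</head></html>`;

const metadata = OpenGraph.consumeHtml(html, "https://example.com/article");
// metadata?.title === "Open Graph Title"; the parser falls back to <title>,
// the first <img src>, and the canonical link when og:* tags are missing,
// and returns undefined when nothing usable is found.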
+ * + * @author Bernard Ngandu + */ export class UserAgents { private static readonly USER_AGENTS: string[] = [ "Mozilla/5.0 (iPhone; CPU iPhone OS 10_4_8; like Mac OS X) AppleWebKit/603.39 (KHTML, like Gecko) Chrome/52.0.3638.271 Mobile Safari/537.5", @@ -22,7 +29,7 @@ export class UserAgents { this.fallback = fallback; } - static og(): string { + og(): string { return OPEN_GRAPH_USER_AGENT; } diff --git a/basango/apps/crawler/src/persistence.ts b/basango/apps/crawler/src/persistence.ts deleted file mode 100644 index 5d45222..0000000 --- a/basango/apps/crawler/src/persistence.ts +++ /dev/null @@ -1,59 +0,0 @@ -import fs from "node:fs"; -import path from "node:path"; - -export interface PersistedRecord { - [key: string]: unknown; -} - -export interface Persistor { - persist(record: PersistedRecord): Promise | void; - close?: () => Promise | void; -} - -export interface JsonlPersistorOptions { - directory: string; - sourceId: string; - suffix?: string; - encoding?: BufferEncoding; -} - -export class JsonlPersistor implements Persistor { - private readonly filePath: string; - private readonly encoding: BufferEncoding; - private pending: Promise = Promise.resolve(); - private closed = false; - - constructor(options: JsonlPersistorOptions) { - const suffix = options.suffix ?? ".jsonl"; - this.encoding = options.encoding ?? "utf-8"; - - fs.mkdirSync(options.directory, { recursive: true }); - this.filePath = path.join(options.directory, `${options.sourceId}${suffix}`); - - if (!fs.existsSync(this.filePath)) { - fs.writeFileSync(this.filePath, "", { encoding: this.encoding }); - } - } - - persist(record: PersistedRecord): Promise { - if (this.closed) { - return Promise.reject(new Error("Persistor has been closed")); - } - - const payload = `${JSON.stringify(record)}\n`; - - this.pending = this.pending.then(async () => { - const file = Bun.file(this.filePath); - await Bun.write(file, payload, { append: true }); - }); - - return this.pending; - } - - async close(): Promise { - this.closed = true; - await this.pending; - } -} - -export type { JsonlPersistorOptions as JsonlOptions }; diff --git a/basango/apps/crawler/src/process/async/handlers.ts b/basango/apps/crawler/src/process/async/handlers.ts new file mode 100644 index 0000000..fe871cb --- /dev/null +++ b/basango/apps/crawler/src/process/async/handlers.ts @@ -0,0 +1,138 @@ +import { logger } from "@basango/logger"; + +import { config, env } from "@/config"; +import { Article, HtmlSourceConfig, SourceKindSchema, WordPressSourceConfig } from "@/schema"; +import { createDateRange, formatDateRange, formatPageRange, resolveSourceConfig } from "@/utils"; +import { + DetailsTaskPayload, + ListingTaskPayload, + ProcessingTaskPayload, +} from "@/process/async/schemas"; +import { createQueueManager, QueueManager } from "@/process/async/queue"; +import { HtmlCrawler } from "@/process/parsers/html"; +import { WordPressCrawler } from "@/process/parsers/wordpress"; +import { JsonlPersistor } from "@/process/persistence"; +import { SyncHttpClient } from "@/http/http-client"; + +import { resolveCrawlerConfig } from "@/process/crawler"; + +export const collectHtmlListing = async ( + payload: ListingTaskPayload, + manager: QueueManager = createQueueManager(), +): Promise => { + const source = resolveSourceConfig(payload.sourceId) as HtmlSourceConfig; + if (source.sourceKind !== "html") { + return await collectWordPressListing(payload, manager); + } + + const settings = resolveCrawlerConfig(source, payload); + const crawler = new HtmlCrawler(settings); + const 
pageRange = settings.pageRange ?? (await crawler.getPagination()); + + let queued = 0; + for (let page = pageRange.start; page <= pageRange.end; page += 1) { + const target = crawler.buildPageUrl(page) ?? `${source.sourceUrl}`; + + try { + const items = await crawler.fetchLinks(target, source.sourceSelectors.articles); + for (const node of items) { + const url = crawler.extractLink(node); + if (!url) continue; + + await manager.enqueueArticle({ + url, + sourceId: payload.sourceId, + category: payload.category, + dateRange: createDateRange(payload.dateRange), + } as DetailsTaskPayload); + queued += 1; + } + } catch (error) { + logger.error({ error, target }, "Failed to crawl page"); + } + } + + return queued; +}; + +export const collectWordPressListing = async ( + payload: ListingTaskPayload, + manager: QueueManager = createQueueManager(), +): Promise => { + const source = resolveSourceConfig(payload.sourceId) as WordPressSourceConfig; + if (source.sourceKind !== "wordpress") { + return await collectHtmlListing(payload, manager); + } + + const settings = resolveCrawlerConfig(source, payload); + const crawler = new WordPressCrawler(settings); + const pageRange = settings.pageRange ?? (await crawler.getPagination()); + + let queued = 0; + for (let page = pageRange.start; page <= pageRange.end; page += 1) { + const url = crawler.postsEndpoint(page); + + try { + const entries = await crawler.fetchLinks(url); + for (const data of entries) { + const url = data.link; + if (!url) continue; + + await manager.enqueueArticle({ + url, + data, + sourceId: payload.sourceId, + category: payload.category, + dateRange: createDateRange(payload.dateRange), + } as DetailsTaskPayload); + queued += 1; + } + } catch (error) { + logger.error({ error, page }, "Failed to fetch WordPress page"); + } + } + + return queued; +}; + +export const collectArticle = async (payload: DetailsTaskPayload): Promise => { + const source = resolveSourceConfig(payload.sourceId); + const settings = resolveCrawlerConfig(source, { + pageRange: payload.pageRange ? formatPageRange(payload.pageRange) : undefined, + dateRange: payload.dateRange ? formatDateRange(payload.dateRange) : undefined, + sourceId: payload.sourceId, + category: payload.category, + }); + const persistors = [ + new JsonlPersistor({ + directory: config.paths.data, + sourceId: String(source.sourceId), + }), + ]; + + if (source.sourceKind === SourceKindSchema.enum.html) { + if (!payload.url) throw new Error("Missing article url"); + const crawler = new HtmlCrawler(settings, { persistors }); + const html = await crawler.crawl(payload.url); + return await crawler.fetchOne(html, settings.dateRange); + } + + if (source.sourceKind === SourceKindSchema.enum.wordpress) { + const crawler = new WordPressCrawler(settings, { persistors }); + return await crawler.fetchOne(payload.data ?? {}, settings.dateRange); + } + + throw new Error(`Unsupported source kind`); +}; + +export const forwardForProcessing = async (payload: ProcessingTaskPayload): Promise
=> { + logger.info({ article: payload.article.title }, "Ready for downstream processing"); + + const client = new SyncHttpClient(config.fetch.client); + const endpoint = env("BASANGO_CRAWLER_BACKEND_API_ENDPOINT"); + + await client.post(endpoint, { json: payload.article }); + logger.info({ article: payload.article.title }, "Forwarded article to API"); + + return payload.article; +}; diff --git a/basango/apps/crawler/src/process/async/queue.ts b/basango/apps/crawler/src/process/async/queue.ts index 3839861..47e2002 100644 --- a/basango/apps/crawler/src/process/async/queue.ts +++ b/basango/apps/crawler/src/process/async/queue.ts @@ -2,49 +2,17 @@ import { randomUUID } from "node:crypto"; import IORedis from "ioredis"; import { JobsOptions, Queue, QueueOptions } from "bullmq"; -import { z } from "zod"; import { - ArticleTaskPayload, - ArticleTaskPayloadSchema, + DetailsTaskPayload, + DetailsTaskPayloadSchema, ListingTaskPayload, ListingTaskPayloadSchema, - ProcessedTaskPayload, - ProcessedTaskPayloadSchema, + ProcessingTaskPayload, + ProcessingTaskPayloadSchema, } from "@/process/async/schemas"; import { parseRedisUrl } from "@/utils"; - -const QueueSettingsSchema = z.object({ - redis_url: z - .string() - .default(process.env.BASANGO_REDIS_URL ?? "redis://localhost:6379/0"), - prefix: z.string().default(process.env.BASANGO_QUEUE_PREFIX ?? "crawler"), - default_timeout: z - .number() - .int() - .positive() - .default(Number(process.env.BASANGO_QUEUE_TIMEOUT ?? 600)), - result_ttl: z - .number() - .int() - .nonnegative() - .default(Number(process.env.BASANGO_QUEUE_RESULT_TTL ?? 3600)), - failure_ttl: z - .number() - .int() - .nonnegative() - .default(Number(process.env.BASANGO_QUEUE_FAILURE_TTL ?? 3600)), - listing_queue: z.string().default("listing"), - article_queue: z.string().default("articles"), - processed_queue: z.string().default("processed"), -}); - -export type QueueSettingsInput = z.input; -export type QueueSettings = z.output; - -export const createQueueSettings = ( - input?: QueueSettingsInput, -): QueueSettings => QueueSettingsSchema.parse(input ?? {}); +import { config, FetchAsyncConfig } from "@/config"; export interface QueueBackend { add: (name: string, data: T, opts?: JobsOptions) => Promise<{ id: string }>; @@ -52,14 +20,13 @@ export interface QueueBackend { export type QueueFactory = ( queueName: string, - settings: QueueSettings, + settings: FetchAsyncConfig, connection?: IORedis, ) => QueueBackend; const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => { const redisConnection = - connection ?? - new IORedis(settings.redis_url, parseRedisUrl(settings.redis_url)); + connection ?? new IORedis(settings.redisUrl, parseRedisUrl(settings.redisUrl)); const options: QueueOptions = { connection: redisConnection, prefix: settings.prefix, @@ -69,9 +36,8 @@ const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => { return { add: async (name, data, opts) => { const job = await queue.add(name, data, { - removeOnComplete: settings.result_ttl === 0 ? true : undefined, - removeOnFail: settings.failure_ttl === 0 ? true : undefined, - //timeout: settings.default_timeout * 1000, + removeOnComplete: settings.ttl.result === 0 ? true : undefined, + removeOnFail: settings.ttl.failure === 0 ? true : undefined, ...opts, }); return { id: job.id ?? 
randomUUID() }; @@ -80,59 +46,52 @@ const defaultQueueFactory: QueueFactory = (queueName, settings, connection) => { }; export interface CreateQueueManagerOptions { - settings?: QueueSettings | QueueSettingsInput; queueFactory?: QueueFactory; connection?: IORedis; } export interface QueueManager { - readonly settings: QueueSettings; + readonly settings: FetchAsyncConfig; readonly connection: IORedis; enqueueListing: (payload: ListingTaskPayload) => Promise<{ id: string }>; - enqueueArticle: (payload: ArticleTaskPayload) => Promise<{ id: string }>; - enqueueProcessed: (payload: ProcessedTaskPayload) => Promise<{ id: string }>; + enqueueArticle: (payload: DetailsTaskPayload) => Promise<{ id: string }>; + enqueueProcessed: (payload: ProcessingTaskPayload) => Promise<{ id: string }>; iterQueueNames: () => string[]; queueName: (suffix: string) => string; close: () => Promise; } -export const createQueueManager = ( - options: CreateQueueManagerOptions = {}, -): QueueManager => { - const settings = createQueueSettings( - options.settings as QueueSettingsInput | undefined, - ); +export const createQueueManager = (options: CreateQueueManagerOptions = {}): QueueManager => { + const settings = config.fetch.async; const connection = - options.connection ?? - new IORedis(settings.redis_url, parseRedisUrl(settings.redis_url)); + options.connection ?? new IORedis(settings.redisUrl, parseRedisUrl(settings.redisUrl)); const factory = options.queueFactory ?? defaultQueueFactory; - const ensureQueue = (queueName: string) => - factory(queueName, settings, connection); + const ensureQueue = (queueName: string) => factory(queueName, settings, connection); return { settings, connection, enqueueListing: (payload) => { const data = ListingTaskPayloadSchema.parse(payload); - const queue = ensureQueue(settings.listing_queue); + const queue = ensureQueue(settings.queues.listing); return queue.add("collect_listing", data); }, enqueueArticle: (payload) => { - const data = ArticleTaskPayloadSchema.parse(payload); - const queue = ensureQueue(settings.article_queue); + const data = DetailsTaskPayloadSchema.parse(payload); + const queue = ensureQueue(settings.queues.details); return queue.add("collect_article", data); }, enqueueProcessed: (payload) => { - const data = ProcessedTaskPayloadSchema.parse(payload); - const queue = ensureQueue(settings.processed_queue); + const data = ProcessingTaskPayloadSchema.parse(payload); + const queue = ensureQueue(settings.queues.processing); return queue.add("forward_for_processing", data); }, iterQueueNames: () => [ - `${settings.prefix}:${settings.listing_queue}`, - `${settings.prefix}:${settings.article_queue}`, - `${settings.prefix}:${settings.processed_queue}`, + `${settings.prefix}:${settings.queues.listing}`, + `${settings.prefix}:${settings.queues.details}`, + `${settings.prefix}:${settings.queues.processing}`, ], queueName: (suffix: string) => `${settings.prefix}:${suffix}`, close: async () => { diff --git a/basango/apps/crawler/src/process/async/schemas.ts b/basango/apps/crawler/src/process/async/schemas.ts index 8744235..4101247 100644 --- a/basango/apps/crawler/src/process/async/schemas.ts +++ b/basango/apps/crawler/src/process/async/schemas.ts @@ -1,36 +1,28 @@ import { z } from "zod"; -import { AnySourceConfig, DateRangeSchema, PageRangeSchema } from "@/schema"; +import { ArticleSchema, DateRangeSchema, PageRangeSchema } from "@/schema"; export const ListingTaskPayloadSchema = z.object({ - source_id: z.string(), - env: z.string().default("development"), - page_range: 
z.string().optional().nullable(), - date_range: z.string().optional().nullable(), - category: z.string().optional().nullable(), + sourceId: z.string(), + pageRange: z.string().optional(), + dateRange: z.string().optional(), + category: z.string().optional(), +}); + +export const DetailsTaskPayloadSchema = z.object({ + sourceId: z.string(), + url: z.url(), + data: z.any().optional(), + page: z.number().int().nonnegative().optional(), + pageRange: PageRangeSchema.optional(), + dateRange: DateRangeSchema.optional(), + category: z.string().optional(), +}); + +export const ProcessingTaskPayloadSchema = z.object({ + sourceId: z.string(), + article: ArticleSchema, }); export type ListingTaskPayload = z.infer; - -export const ArticleTaskPayloadSchema = z.object({ - source_id: z.string(), - env: z.string().default("development"), - url: z.url(), - page: z.number().int().nonnegative().optional(), - page_range: PageRangeSchema.optional().nullable(), - date_range: DateRangeSchema.optional().nullable(), - category: z.string().optional().nullable(), -}); - -export type ArticleTaskPayload = z.infer; - -export const ProcessedTaskPayloadSchema = z.object({ - source_id: z.string(), - env: z.string().default("development"), - article: z.any(), -}); - -export type ProcessedTaskPayload = z.infer; - -export interface ListingContext { - source: AnySourceConfig; -} +export type DetailsTaskPayload = z.infer; +export type ProcessingTaskPayload = z.infer; diff --git a/basango/apps/crawler/src/process/async/tasks.ts b/basango/apps/crawler/src/process/async/tasks.ts index a41f418..2941488 100644 --- a/basango/apps/crawler/src/process/async/tasks.ts +++ b/basango/apps/crawler/src/process/async/tasks.ts @@ -1,171 +1,61 @@ import { logger } from "@basango/logger"; import { - ArticleTaskPayload, - ArticleTaskPayloadSchema, - ListingTaskPayload, + DetailsTaskPayloadSchema, ListingTaskPayloadSchema, - ProcessedTaskPayload, - ProcessedTaskPayloadSchema, + ProcessingTaskPayloadSchema, } from "@/process/async/schemas"; -import { - createQueueManager, - QueueManager, - QueueSettings, - QueueSettingsInput, -} from "@/process/async/queue"; - -export interface CrawlerTaskHandlers { - collectListing: (payload: ListingTaskPayload) => Promise | number; - collectArticle: (payload: ArticleTaskPayload) => Promise | unknown; - forwardForProcessing: ( - payload: ProcessedTaskPayload, - ) => Promise | unknown; -} - -const notImplemented = (name: keyof CrawlerTaskHandlers) => () => { - throw new Error(`Crawler task handler '${name}' is not implemented`); -}; - -let handlers: CrawlerTaskHandlers = { - collectListing: notImplemented("collectListing"), - collectArticle: notImplemented("collectArticle"), - forwardForProcessing: notImplemented("forwardForProcessing"), -}; - -export const registerCrawlerTaskHandlers = ( - overrides: Partial, -): void => { - handlers = { ...handlers, ...overrides }; -}; - -export interface ScheduleAsyncCrawlOptions { - sourceId: string; - env?: string; - pageRange?: string | null; - dateRange?: string | null; - category?: string | null; - settings?: QueueSettings | QueueSettingsInput; - queueManager?: QueueManager; -} - -export const scheduleAsyncCrawl = async ({ - sourceId, - env = "development", - pageRange, - dateRange, - category, - settings, - queueManager, -}: ScheduleAsyncCrawlOptions): Promise => { - const payload = ListingTaskPayloadSchema.parse({ - source_id: sourceId, - env, - page_range: pageRange ?? undefined, - date_range: dateRange ?? undefined, - category: category ?? 
undefined, - }); - - const manager = queueManager ?? createQueueManager({ settings }); - logger.debug( - { - sourceId, - env: payload.env, - pageRange: payload.page_range, - dateRange: payload.date_range, - category: payload.category, - }, - "Scheduling listing collection job", - ); - try { - const job = await manager.enqueueListing(payload); - logger.info( - { jobId: job.id, sourceId, env: payload.env }, - "Scheduled listing collection job", - ); - return job.id; - } finally { - if (!queueManager) { - await manager.close(); - } - } -}; +import { createQueueManager } from "@/process/async/queue"; +import * as handlers from "@/process/async/handlers"; +import { CrawlingOptions } from "@/process/crawler"; export const collectListing = async (payload: unknown): Promise => { const data = ListingTaskPayloadSchema.parse(payload); - logger.debug( - { - sourceId: data.source_id, - env: data.env, - pageRange: data.page_range, - dateRange: data.date_range, - category: data.category, - }, - "Collecting listing", - ); + logger.debug({ data }, "Collecting listing"); - const result = await handlers.collectListing(data); - const count = typeof result === "number" ? result : 0; - - logger.info( - { - sourceId: data.source_id, - env: data.env, - queuedArticles: count, - }, - "Listing collection completed", - ); + const count = await handlers.collectHtmlListing(data); + logger.info({ count }, "Listing collection completed"); return count; }; export const collectArticle = async (payload: unknown): Promise => { - const data = ArticleTaskPayloadSchema.parse(payload); - logger.debug( - { - sourceId: data.source_id, - env: data.env, - url: data.url, - page: data.page, - }, - "Collecting article", - ); + const data = DetailsTaskPayloadSchema.parse(payload); + logger.info({ data }, "Collecting article"); const result = await handlers.collectArticle(data); - - logger.info( - { - sourceId: data.source_id, - env: data.env, - url: data.url, - }, - "Article collection completed", - ); + logger.info({ url: data.url }, "Article collection completed"); return result; }; -export const forwardForProcessing = async ( - payload: unknown, -): Promise => { - const data = ProcessedTaskPayloadSchema.parse(payload); - logger.debug( - { - sourceId: data.source_id, - env: data.env, - }, - "Forwarding article for processing", - ); +export const forwardForProcessing = async (payload: unknown): Promise => { + const data = ProcessingTaskPayloadSchema.parse(payload); + logger.debug({ sourceId: data.sourceId }, "Forwarding article for processing"); const result = await handlers.forwardForProcessing(data); - - logger.info( - { - sourceId: data.source_id, - env: data.env, - }, - "Article forwarded for processing", - ); + logger.info({ result }, "Article forwarded for processing"); return result; }; + +export const scheduleAsyncCrawl = async (options: CrawlingOptions): Promise => { + const payload = ListingTaskPayloadSchema.parse({ + sourceId: options.sourceId, + pageRange: options.pageRange, + dateRange: options.dateRange, + category: options.category, + }); + + const manager = createQueueManager(); + logger.info({ payload }, "Scheduling listing collection job"); + + try { + const job = await manager.enqueueListing(payload); + logger.info({ job }, "Scheduled listing collection job"); + + return job.id; + } finally { + await manager.close(); + } +}; diff --git a/basango/apps/crawler/src/process/async/worker.ts b/basango/apps/crawler/src/process/async/worker.ts index a6ed43d..27e51dd 100644 --- 
a/basango/apps/crawler/src/process/async/worker.ts +++ b/basango/apps/crawler/src/process/async/worker.ts @@ -1,27 +1,16 @@ import IORedis from "ioredis"; import { QueueEvents, Worker } from "bullmq"; -import { - createQueueManager, - QueueFactory, - QueueManager, - QueueSettings, - QueueSettingsInput, -} from "@/process/async/queue"; -import { - collectArticle, - collectListing, - forwardForProcessing, -} from "@/process/async/tasks"; +import { QueueFactory, QueueManager } from "@/process/async/queue"; +import { collectArticle, collectListing, forwardForProcessing } from "@/process/async/tasks"; export interface WorkerOptions { queueNames?: string[]; - settings?: QueueSettings | QueueSettingsInput; connection?: IORedis; queueFactory?: QueueFactory; concurrency?: number; onError?: (error: Error) => void; - queueManager?: QueueManager; + queueManager: QueueManager; } export interface WorkerHandle { @@ -30,15 +19,8 @@ export interface WorkerHandle { close: () => Promise; } -export const startWorker = (options: WorkerOptions = {}): WorkerHandle => { - const manager = - options.queueManager ?? - createQueueManager({ - settings: options.settings, - connection: options.connection, - queueFactory: options.queueFactory, - }); - +export const startWorker = (options: WorkerOptions): WorkerHandle => { + const manager = options.queueManager; const queueNames = options.queueNames ?? manager.iterQueueNames(); const workers: Worker[] = []; const events: QueueEvents[] = []; diff --git a/basango/apps/crawler/src/process/crawler.ts b/basango/apps/crawler/src/process/crawler.ts index dfe60d8..f0e71ef 100644 --- a/basango/apps/crawler/src/process/crawler.ts +++ b/basango/apps/crawler/src/process/crawler.ts @@ -1,158 +1,44 @@ -import { logger } from "@basango/logger"; +import { config, FetchCrawlerConfig } from "@/config"; +import { JsonlPersistor, Persistor } from "@/process/persistence"; +import { AnySourceConfig } from "@/schema"; +import logger from "@basango/logger"; +import { createDateRange, createPageRange } from "@/utils"; -import { PipelineConfigManager } from "@/config"; -import { JsonlPersistor, Persistor } from "@/persistence"; -import { - AnySourceConfig, - ClientConfig, - CrawlerConfig, - CrawlerConfigSchema, - PipelineConfig, - SourceKind, -} from "@/schema"; -import { createDateRange } from "@/utils"; -import { PageRangeSchema, PageRangeSpecSchema } from "@/schema"; - -export interface CrawlerInstance { - fetch: () => Promise | void; - close?: () => Promise | void; -} - -export interface CrawlerContext { - pipeline: PipelineConfig; - source: AnySourceConfig; - clientConfig: ClientConfig; - crawlerConfig: CrawlerConfig; - persistors: Persistor[]; -} - -export type CrawlerFactory = (context: CrawlerContext) => CrawlerInstance; - -const registry = new Map(); - -export const registerCrawler = (kind: SourceKind, factory: CrawlerFactory): void => { - registry.set(kind, factory); -}; - -export const clearCrawlerRegistry = (): void => { - registry.clear(); -}; - -export interface RunSyncCrawlOptions { +export interface CrawlingOptions { sourceId: string; - env?: string; - pageRange?: string | null; - dateRange?: string | null; - category?: string | null; - notify?: boolean; - manager?: PipelineConfigManager; - persistFactory?: (context: { - pipeline: PipelineConfig; - source: AnySourceConfig; - resolvedSourceId: string; - }) => Persistor[]; + pageRange?: string | undefined; + dateRange?: string | undefined; + category?: string | undefined; } -const resolvePageRange = (spec?: string | null) => { - if 
(!spec) return undefined; - const parsed = PageRangeSpecSchema.parse(spec); - return PageRangeSchema.parse(parsed); -}; - -const resolveCrawlerConfig = ( +export const resolveCrawlerConfig = ( source: AnySourceConfig, - options: RunSyncCrawlOptions, -): CrawlerConfig => { - const page_range = resolvePageRange(options.pageRange); - const date_range = options.dateRange ? createDateRange(options.dateRange) : undefined; - - return CrawlerConfigSchema.parse({ + options: CrawlingOptions, +): FetchCrawlerConfig => { + return { + ...config.fetch.crawler, source, - page_range, - date_range, - category: options.category ?? undefined, - notify: options.notify ?? false, - }); + dateRange: createDateRange(options.dateRange), + pageRange: createPageRange(options.pageRange), + category: options.category, + }; }; -const createPersistors = ( - context: { pipeline: PipelineConfig; source: AnySourceConfig; sourceId: string }, - factory?: RunSyncCrawlOptions["persistFactory"], -): Persistor[] => { - if (factory) { - return factory({ - pipeline: context.pipeline, - source: context.source, - resolvedSourceId: context.sourceId, - }); - } - +export const createPersistors = (source: AnySourceConfig): Persistor[] => { return [ new JsonlPersistor({ - directory: context.pipeline.paths.data, - sourceId: context.sourceId, + directory: config.paths.data, + sourceId: source.sourceId, }), ]; }; -export const runSyncCrawl = async (options: RunSyncCrawlOptions): Promise => { - const env = options.env ?? "development"; - const manager = options.manager ?? new PipelineConfigManager({ env }); - const pipeline = manager.get(env); - manager.setupLogging(pipeline); - - const source = pipeline.sources.find(options.sourceId); - if (!source) { - throw new Error(`Source '${options.sourceId}' not found in configuration`); - } - - const crawlerConfig = resolveCrawlerConfig(source, options); - const sourceId = source.source_id ?? 
options.sourceId; - const persistors = createPersistors({ pipeline, source, sourceId }, options.persistFactory); - - const factory = registry.get(source.source_kind as SourceKind); - if (!factory) { - throw new Error(`No crawler registered for source kind '${source.source_kind}'`); - } - - const context: CrawlerContext = { - pipeline, - source, - clientConfig: pipeline.fetch.client, - crawlerConfig, - persistors, - }; - - const crawler = factory(context); - if (!crawler || typeof crawler.fetch !== "function") { - throw new Error("Registered crawler did not return a valid instance"); - } - - try { - await crawler.fetch(); - logger.info( - { - sourceId: options.sourceId, - kind: source.source_kind, - env, - }, - "Synchronous crawl completed", - ); - } finally { - for (const persistor of persistors) { - try { - await persistor.close?.(); - } catch (error) { - logger.warn({ error }, "Failed to close persistor"); - } - } - - if (typeof crawler.close === "function") { - try { - await crawler.close(); - } catch (error) { - logger.warn({ error }, "Failed to close crawler"); - } +export const closePersistors = async (persistors: Persistor[]): Promise => { + for (const persistor of persistors) { + try { + await persistor.close(); + } catch (error) { + logger.warn({ error }, "Failed to close persistor"); } } }; diff --git a/basango/apps/crawler/src/process/parsers/base.ts b/basango/apps/crawler/src/process/parsers/base.ts new file mode 100644 index 0000000..d7a4c4f --- /dev/null +++ b/basango/apps/crawler/src/process/parsers/base.ts @@ -0,0 +1,108 @@ +import { parse as parseHtml, HTMLElement } from "node-html-parser"; + +import { SyncHttpClient } from "@/http/http-client"; +import { OpenGraph } from "@/http/open-graph"; +import type { Persistor } from "@/process/persistence"; +import { config, FetchCrawlerConfig } from "@/config"; +import { AnySourceConfig, Article } from "@/schema"; + +export interface CrawlerOptions { + persistors?: Persistor[]; +} + +export abstract class BaseCrawler { + protected readonly settings: FetchCrawlerConfig; + protected readonly source: AnySourceConfig; + protected readonly http: SyncHttpClient; + protected readonly persistors: Persistor[]; + protected readonly openGraph: OpenGraph; + + protected constructor(settings: FetchCrawlerConfig, options: CrawlerOptions = {}) { + if (!settings.source) { + throw new Error("Crawler requires a bound source"); + } + + this.http = new SyncHttpClient(config.fetch.client); + this.persistors = options.persistors ?? []; + this.openGraph = new OpenGraph(); + + this.settings = settings; + this.source = settings.source as AnySourceConfig; + } + + /** + * Fetch and process articles from the source. + */ + abstract fetch(): Promise | void; + + /** + * Crawl the given URL and return the HTML content as a string. + * @param url - The URL to crawl + */ + async crawl(url: string): Promise { + const response = await this.http.get(url); + return await response.text(); + } + + /** + * Extract text content from an HTML node. + * @param node - The HTML node + */ + protected textContent(node: HTMLElement | null | undefined): string | null { + if (!node) return null; + // innerText keeps spacing similar to browser rendering + const value = (node as any).innerText ?? node.text; + const text = typeof value === "string" ? value.trim() : String(value ?? "").trim(); + return text.length ? text : null; + } + + /** + * Extract the first matching element from the root using the selector. 
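+ * @example
+ * // Illustrative sketch, not from the original patch: selectors that fail
+ * // to compile are swallowed by the try/catch and simply yield null.
+ * const root = this.parseHtml("<div class='post'><a href='/a/1'>One</a></div>");
+ * this.extractFirst(root, "a")?.getAttribute("href"); // "/a/1"
+ * this.extractFirst(root, "%%not-a-selector%%");      // null instead of throwing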
+ * @param root - The root HTML element + * @param selector - The CSS selector + */ + protected extractFirst(root: HTMLElement, selector?: string | null): HTMLElement | null { + if (!selector) return null; + try { + return (root as any).querySelector?.(selector) ?? null; + } catch { + return null; + } + } + + /** + * Extract all matching elements from the root using the selector. + * @param root - The root HTML element + * @param selector - The CSS selector + */ + protected extractAll(root: HTMLElement, selector?: string | null): HTMLElement[] { + if (!selector) return []; + try { + return ((root as any).querySelectorAll?.(selector) ?? []) as HTMLElement[]; + } catch { + return []; + } + } + + /** + * Parse HTML string into an HTMLElement. + * @param html - The HTML string + */ + protected parseHtml(html: string): HTMLElement { + return parseHtml(html) as unknown as HTMLElement; + } + + /** + * Enrich the record with Open Graph metadata from the given URL. + * @param record - The article record + * @param url - The URL to fetch Open Graph data from + */ + protected async enrichWithOpenGraph(record: Article, url?: string): Promise
{ + try { + const metadata = url ? await this.openGraph.consumeUrl(url) : undefined; + return { ...record, metadata }; + } catch { + return { ...record, metadata: undefined }; + } + } +} diff --git a/basango/apps/crawler/src/process/parsers/html.ts b/basango/apps/crawler/src/process/parsers/html.ts new file mode 100644 index 0000000..e5c4292 --- /dev/null +++ b/basango/apps/crawler/src/process/parsers/html.ts @@ -0,0 +1,338 @@ +import { logger } from "@basango/logger"; +import { HTMLElement } from "node-html-parser"; +import { getUnixTime, isMatch as isDateMatch, parse as parseDateFns } from "date-fns"; + +import { isTimestampInRange, createAbsoluteUrl } from "@/utils"; +import { persist, Persistor } from "@/process/persistence"; +import { BaseCrawler } from "@/process/parsers/base"; +import TurndownService from "turndown"; +import { DateRange, HtmlSourceConfig } from "@/schema"; +import { FetchCrawlerConfig } from "@/config"; + +const md = new TurndownService({ + headingStyle: "atx", + hr: "---", + bulletListMarker: "-", +}); + +/** + * Create a safe RegExp from the given pattern. + * @param pattern + */ +const safeRegExp = (pattern?: string | null): RegExp | null => { + if (!pattern) return null; + try { + return new RegExp(pattern, "g"); + } catch { + return null; + } +}; + +/** + * Crawler for generic HTML pages. + */ +export class HtmlCrawler extends BaseCrawler { + readonly source: HtmlSourceConfig; + private currentArticleUrl: string | null = null; + + constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) { + super(settings, options); + + if (!settings.source || settings.source.sourceKind !== "html") { + throw new Error("HtmlCrawler requires a source of kind 'html'"); + } + this.source = this.settings.source as HtmlSourceConfig; + } + + async fetch(): Promise { + const pageRange = this.settings.pageRange ?? 
(await this.getPagination()); + const dateRange = this.settings.dateRange; + + const articleSelector = this.source.sourceSelectors.articles; + if (!articleSelector) { + logger.error( + { source: this.source.sourceId }, + "No article selector configured for HTML source", + ); + return; + } + + let stop = false; + for (let page = pageRange.start; page <= pageRange.end; page += 1) { + const pageUrl = this.buildPageUrl(page); + let html: string; + try { + html = await this.crawl(pageUrl); + } catch (error) { + logger.error({ error, page, pageUrl }, "> page %s => [failed]", page); + continue; + } + + const root = this.parseHtml(html); + const articles = this.extractAll(root, articleSelector); + if (!articles.length) { + logger.info({ page }, "No articles found on page"); + continue; + } + + for (const node of articles) { + try { + this.currentArticleUrl = this.extractLink(node); + let targetHtml = node.toString(); + + if (this.source.requiresDetails) { + if (!this.currentArticleUrl) { + logger.debug({ page }, "Skipping article without link for details"); + continue; + } + try { + targetHtml = await this.crawl(this.currentArticleUrl); + } catch (err) { + logger.error( + { error: err, url: this.currentArticleUrl }, + "Failed to fetch detail page", + ); + continue; + } + } + + const saved = await this.fetchOne(targetHtml, dateRange); + // stop early on first out-of-range if pages are sorted by date desc + if (saved === null) { + stop = true; + break; + } + } catch (error) { + logger.error({ error, pageUrl }, "Failed to process article on page"); + } finally { + this.currentArticleUrl = null; + } + } + + if (stop) break; + } + } + + /** + * Fetch and process a single HTML article. + * @param html - The HTML content of the article + * @param dateRange - Optional date range for filtering + */ + async fetchOne(html: string, dateRange?: DateRange | null) { + const root = this.parseHtml(html); + const sel = this.source.sourceSelectors; + + const titleText = this.extractText(root, sel.articleTitle) ?? "Untitled"; + const link = this.currentArticleUrl ?? this.extractLink(root); + if (!link) { + logger.warn({ title: titleText }, "Skipping article without link"); + return null; + } + + const body = this.extractBody(root, sel.articleBody); + const categories = this.extractCategories(root, sel.articleCategories); + const rawDate = this.extractText(root, sel.articleDate); + const timestamp = this.computeTimestamp(rawDate); + + if (dateRange && !isTimestampInRange(dateRange, timestamp)) { + logger.info( + { title: titleText, link, date: rawDate, timestamp }, + "Skipping article outside date range", + ); + return null; + } + + const enriched = await this.enrichWithOpenGraph( + { + title: titleText, + link, + body, + categories, + source: this.source.sourceId, + timestamp, + }, + link, + ); + + return await persist(enriched, this.persistors); + } + + /** + * Fetch links from the target URL using the given selector. + * @param target - The target URL to crawl + * @param selector - The CSS selector to extract links + */ + async fetchLinks(target: string, selector: string) { + const html = await this.crawl(target); + const root = this.parseHtml(html); + return this.extractAll(root, selector); + } + + /** + * Get the pagination range (start and end page numbers). + */ + async getPagination(): Promise<{ start: number; end: number }> { + return { start: 0, end: await this.getLastPage() }; + } + + /** + * Determine the last page number from pagination links. 
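+ * @example
+ * // Hedged sketch: with a pager whose last link is href="/actualite?page=120",
+ * // the digit heuristic resolves 120; a missing pager, an href without digits
+ * // or a "page" query param, or a fetch error all fall back to 1.
+ * const last = await this.getLastPage();    // 120
+ * const range = await this.getPagination(); // { start: 0, end: 120 }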
+ */ + private async getLastPage(): Promise { + const template = this.applyCategory(this.source.paginationTemplate); + const url = `${this.source.sourceUrl}${template}`; + try { + const html = await this.crawl(url); + const root = this.parseHtml(html); + const links = this.extractAll(root, this.source.sourceSelectors.pagination); + if (!links.length) return 1; + const last = links[links.length - 1]!; + const href = (last as any).getAttribute?.("href") as string | null; + if (!href) return 1; + + // Heuristic: prefer a number in the href, else "page" query param + const numberMatch = href.match(/(\d+)/); + if (numberMatch) { + const page = Number.parseInt(numberMatch[1]!, 10); + return Number.isFinite(page) && page > 0 ? page : 1; + } + const urlObj = new URL(createAbsoluteUrl(this.source.sourceUrl, href)); + const pageParam = urlObj.searchParams.get("page"); + if (pageParam) { + const page = Number.parseInt(pageParam, 10); + return Number.isFinite(page) && page > 0 ? page : 1; + } + return 1; + } catch { + return 1; + } + } + + /** + * Build the URL for a given page number. + * @param page - The page number + */ + buildPageUrl(page: number): string { + let template = this.applyCategory(this.source.paginationTemplate); + if (template.includes("{page}")) { + template = template.replace("{page}", String(page)); + } else if (page > 0) { + const sep = template.includes("?") ? "&" : "?"; + template = `${template}${sep}page=${page}`; + } + return createAbsoluteUrl(this.source.sourceUrl, template); + } + + /** + * Apply category replacement in the template if needed. + * @param template - The URL template + */ + private applyCategory(template: string): string { + if (template.includes("{category}")) { + const replacement = this.settings.category ?? ""; + return template.replace("{category}", replacement); + } + return template; + } + + /** + * Extract link URL from the given node using the selector. + * @param node - The HTML element + */ + extractLink(node: HTMLElement): string | null { + const selector = this.source.sourceSelectors.articleLink; + if (!selector) return null; + const target = this.extractFirst(node, selector); + if (!target) return null; + + const href = + (target.getAttribute?.("href") as string | null) ?? + ((target as any).getAttribute?.("data-href") as string | null) ?? + ((target as any).getAttribute?.("src") as string | null); + if (!href) return null; + const absolute = createAbsoluteUrl(this.source.sourceUrl, href); + return absolute; + } + + /** + * Extract text content from the root using the selector. + * @param root - The root HTML element + * @param selector - The CSS selector + */ + private extractText(root: HTMLElement, selector?: string | null): string | null { + if (!selector) return null; + const target = this.extractFirst(root, selector); + if (!target) return null; + + // If it's an image, prefer alt/title + const tag = (target as any).tagName?.toLowerCase?.() as string | undefined; + if (tag === "img") { + const alt = (target as any).getAttribute?.("alt") as string | null; + const title = (target as any).getAttribute?.("title") as string | null; + const pick = (alt ?? title ?? "").trim(); + if (pick.length > 0) return pick; + } + return this.textContent(target); + } + + /** + * Extract body content from the root using the selector. 
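+ * @example
+ * // Sketch (assumed markup): matching nodes are converted to Markdown with
+ * // Turndown (atx headings) and joined with newlines.
+ * const root = this.parseHtml(
+ *   "<article><div class='body'><h2>Titre</h2><p>Texte</p></div></article>",
+ * );
+ * this.extractBody(root, ".body"); // "## Titre\n\nTexte"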
+ * @param root - The root HTML element + * @param selector - The CSS selector + */ + private extractBody(root: HTMLElement, selector?: string | null): string { + if (selector) { + const nodes = this.extractAll(root, selector); + if (nodes.length) { + const parts = nodes.map((n) => md.turndown(n.toString())).filter(Boolean); + if (parts.length) return parts.join("\n"); + } + } + return md.turndown(root.toString()); + } + + /** + * Extract categories from the root using the selector. + * @param root - The root HTML element + * @param selector - The CSS selector + */ + private extractCategories(root: HTMLElement, selector?: string | null): string[] { + if (!selector) return []; + const values: string[] = []; + for (const node of this.extractAll(root, selector)) { + const text = this.textContent(node); + if (!text) continue; + const lower = text.toLowerCase(); + if (!values.includes(lower)) values.push(lower); + } + return values; + } + + /** + * Compute Unix timestamp from raw date string. + * @param raw - Raw date string + * @private + */ + private computeTimestamp(raw?: string | null): number { + if (!raw) return Math.floor(Date.now() / 1000); + let value = raw.trim(); + const pattern = safeRegExp(this.source.sourceDate?.pattern); + const replacement = this.source.sourceDate?.replacement ?? ""; + if (pattern) { + try { + value = value.replace(pattern, replacement); + } catch { + // ignore pattern failures + } + } + const format = this.source.sourceDate?.format ?? "yyyy-LL-dd HH:mm"; + if (!isDateMatch(value, format)) { + // fallback: try native Date.parse as last resort + const parsed = Date.parse(value); + return Number.isNaN(parsed) ? Math.floor(Date.now() / 1000) : Math.floor(parsed / 1000); + } + const date = parseDateFns(value, format, new Date()); + const ts = getUnixTime(date); + return Number.isFinite(ts) ? ts : Math.floor(Date.now() / 1000); + } +} diff --git a/basango/apps/crawler/src/process/parsers/wordpress.ts b/basango/apps/crawler/src/process/parsers/wordpress.ts new file mode 100644 index 0000000..c185a70 --- /dev/null +++ b/basango/apps/crawler/src/process/parsers/wordpress.ts @@ -0,0 +1,240 @@ +import { logger } from "@basango/logger"; + +import { DateRange, PageRange, WordPressSourceConfig } from "@/schema"; +import { BaseCrawler } from "@/process/parsers/base"; +import { persist, Persistor } from "@/process/persistence"; +import TurndownService from "turndown"; +import { FetchCrawlerConfig } from "@/config"; + +const md = new TurndownService({ + headingStyle: "atx", + hr: "---", + bulletListMarker: "-", +}); + +interface WordPressPost { + link?: string; + slug?: string; + title?: { rendered?: string }; + content?: { rendered?: string }; + date?: string; + categories?: number[]; +} + +/** + * Crawler for WordPress sites using the REST API. 
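+ * @example
+ * // Illustrative usage; "https://example.org" stands in for a configured source URL.
+ * const crawler = new WordPressCrawler(settings, { persistors });
+ * crawler.postsEndpoint(2);
+ * // -> "https://example.org/wp-json/wp/v2/posts?_fields=...&page=2&per_page=100"
+ * await crawler.fetch(); // pages through posts until the date range cuts off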
+ */
+export class WordPressCrawler extends BaseCrawler {
+  readonly source: WordPressSourceConfig;
+  private categoryMap: Map<number, string> = new Map();
+
+  private static readonly POST_QUERY =
+    "_fields=date,slug,link,title.rendered,content.rendered,categories&orderby=date&order=desc";
+  private static readonly CATEGORY_QUERY =
+    "_fields=id,slug,count&orderby=count&order=desc&per_page=100";
+  private static readonly TOTAL_PAGES_HEADER = "x-wp-totalpages";
+  private static readonly TOTAL_POSTS_HEADER = "x-wp-total";
+
+  constructor(settings: FetchCrawlerConfig, options: { persistors?: Persistor[] } = {}) {
+    super(settings, options);
+
+    if (!settings.source || settings.source.sourceKind !== "wordpress") {
+      throw new Error("WordPressCrawler requires a source of kind 'wordpress'");
+    }
+    this.source = this.settings.source as WordPressSourceConfig;
+  }
+
+  /**
+   * Fetch and process WordPress posts.
+   */
+  async fetch(): Promise<void> {
+    const pageRange = this.settings.pageRange ?? (await this.getPagination());
+    const dateRange = this.settings.dateRange;
+
+    let stop = false;
+    for (let page = pageRange.start; page <= pageRange.end; page += 1) {
+      const endpoint = this.postsEndpoint(page);
+      try {
+        const response = await this.http.get(endpoint);
+        const data = (await response.json()) as unknown;
+        const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
+        if (!Array.isArray(data)) {
+          logger.warn({ type: typeof data, page }, "Unexpected WordPress payload type");
+        }
+
+        for (const entry of articles) {
+          const saved = await this.fetchOne(entry, dateRange);
+          if (saved === null) {
+            stop = true;
+            break;
+          }
+        }
+      } catch (error) {
+        logger.error({ error, page }, "> page %s => [failed]", page);
+        continue;
+      }
+      if (stop) break;
+    }
+  }
+
+  /**
+   * Fetch links from a WordPress posts endpoint.
+   * @param url - The posts endpoint URL
+   */
+  async fetchLinks(url: string) {
+    const response = await this.http.get(url);
+    const data = (await response.json()) as unknown;
+    const articles = Array.isArray(data) ? (data as WordPressPost[]) : [];
+    if (!Array.isArray(data)) {
+      logger.warn({ type: typeof data }, "Unexpected WordPress payload type");
+    }
+    return articles;
+  }
+
+  /**
+   * Fetch and process a single WordPress post.
+   * @param input - Decoded JSON object or raw JSON string
+   * @param dateRange - Optional date range for filtering
+   */
+  async fetchOne(input: unknown, dateRange?: DateRange | null) {
+    // input can be the decoded JSON object or a raw JSON string
+    let data: WordPressPost | null = null;
+    try {
+      if (typeof input === "string") {
+        data = JSON.parse(input) as WordPressPost;
+      } else if (input && typeof input === "object") {
+        data = input as WordPressPost;
+      }
+    } catch (error) {
+      logger.error({ error }, "Failed to decode WordPress payload");
+      throw error;
+    }
+
+    if (!data || typeof data !== "object") {
+      throw new Error("Unexpected WordPress payload type");
+    }
+
+    const link = data.link;
+    if (!link) {
+      logger.error("Skipping WordPress article without link");
+      return null;
+    }
+
+    const titleHtml = data.title?.rendered ?? "";
+    const bodyHtml = data.content?.rendered ?? "";
+    const title = this.textContent(this.parseHtml(titleHtml)) ?? data.slug ?? "Untitled";
+    const body = md.turndown(bodyHtml);
+    const timestamp = this.computeTimestamp(data.date);
+    const categories = await this.mapCategories(data.categories ??
[]); + + // date range skip as in HTML crawler + if (dateRange) { + const { isTimestampInRange } = await import("@/utils"); + if (!isTimestampInRange(dateRange, timestamp)) { + logger.info( + { title, link, date: data.date, timestamp }, + "Skipping article outside date range", + ); + return null; + } + } + + const enriched = await this.enrichWithOpenGraph( + { + title, + link, + body, + categories, + source: this.source.sourceId, + timestamp, + }, + link, + ); + + return await persist(enriched, this.persistors); + } + + /** + * Get pagination info from WordPress API. + */ + async getPagination(): Promise { + try { + const url = `${this.baseUrl()}wp-json/wp/v2/posts?_fields=id&per_page=100`; + const response = await this.http.get(url); + const pages = Number.parseInt( + response.headers.get(WordPressCrawler.TOTAL_PAGES_HEADER) ?? "1", + 10, + ); + const posts = Number.parseInt( + response.headers.get(WordPressCrawler.TOTAL_POSTS_HEADER) ?? "0", + 10, + ); + logger.info({ posts, pages }, "WordPress pagination"); + const end = Number.isFinite(pages) && pages > 0 ? pages : 1; + return { start: 1, end }; + } catch { + return { start: 1, end: 1 }; + } + } + + /** + * Get base URL for WordPress REST API. + */ + private baseUrl(): string { + const base = String(this.source.sourceUrl); + return base.endsWith("/") ? base : `${base}/`; + } + + /** + * Construct posts endpoint URL for a given page. + * @param page - Page number + */ + postsEndpoint(page: number): string { + return `${this.baseUrl()}wp-json/wp/v2/posts?${WordPressCrawler.POST_QUERY}&page=${page}&per_page=100`; + } + + /** + * Fetch and cache WordPress categories. + */ + private async fetchCategories(): Promise { + const url = `${this.baseUrl()}wp-json/wp/v2/categories?${WordPressCrawler.CATEGORY_QUERY}`; + const response = await this.http.get(url); + const list = (await response.json()) as Array<{ id: number; slug: string }>; + for (const c of list) { + this.categoryMap.set(c.id, c.slug); + } + } + + /** + * Map category IDs to slugs. + * @param ids - Category IDs + */ + private async mapCategories(ids: number[]): Promise { + if (this.categoryMap.size === 0) { + try { + await this.fetchCategories(); + } catch (error) { + logger.warn({ error }, "Failed to fetch WordPress categories"); + } + } + const values: string[] = []; + for (const id of [...ids].sort((a, b) => a - b)) { + const slug = this.categoryMap.get(id); + if (slug && !values.includes(slug)) values.push(slug); + } + return values; + } + + /** + * Compute UNIX timestamp from WordPress date string. 
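+ * @example
+ * // Sketch: ISO dates parse directly; a trailing "Z" is first normalized
+ * // to "+00:00", and unparseable input falls back to "now".
+ * this.computeTimestamp("2024-05-01T08:30:00Z"); // 1714552200
+ * this.computeTimestamp(undefined);              // Math.floor(Date.now() / 1000)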
+ * @param raw - Raw date string + */ + private computeTimestamp(raw?: string | null): number { + if (!raw) return Math.floor(Date.now() / 1000); + // Normalize WordPress Z into +00:00 for Date parsing robustness + const cleaned = raw.replace("Z", "+00:00"); + const parsed = Date.parse(cleaned); + if (!Number.isNaN(parsed)) return Math.floor(parsed / 1000); + return Math.floor(Date.now() / 1000); + } +} diff --git a/basango/apps/crawler/src/process/persistence.ts b/basango/apps/crawler/src/process/persistence.ts new file mode 100644 index 0000000..d36c41d --- /dev/null +++ b/basango/apps/crawler/src/process/persistence.ts @@ -0,0 +1,81 @@ +import fs from "node:fs"; +import path from "node:path"; +import { Article } from "@/schema"; +import { countTokens } from "@/utils"; +import logger from "@basango/logger"; + +export interface Persistor { + persist(record: Article): Promise | void; + close: () => Promise | void; +} + +export interface PersistorOptions { + directory: string; + sourceId: string; + suffix?: string; + encoding?: BufferEncoding; +} + +export const persist = async (payload: Article, persistors: Persistor[]): Promise
<Article> => {
+  const article = {
+    ...payload,
+    tokenStatistics: {
+      title: countTokens(payload.title),
+      body: countTokens(payload.body),
+      excerpt: countTokens(payload.body.substring(0, 200)),
+      categories: countTokens(payload.categories.join(",")),
+    },
+  } as Article;
+
+  for (const persistor of persistors) {
+    try {
+      await persistor.persist(article);
+    } catch (error) {
+      logger.error({ error }, "Failed to persist article record");
+    }
+  }
+
+  logger.info({ url: article.link }, "article successfully persisted");
+  return article;
+};
+
+export class JsonlPersistor implements Persistor {
+  private readonly filePath: string;
+  private readonly encoding: BufferEncoding;
+  private pending: Promise<void> = Promise.resolve();
+  private closed = false;
+
+  constructor(options: PersistorOptions) {
+    const suffix = options.suffix ?? ".jsonl";
+    this.encoding = options.encoding ?? "utf-8";
+
+    fs.mkdirSync(options.directory, { recursive: true });
+    this.filePath = path.join(options.directory, `${options.sourceId}${suffix}`);
+
+    if (!fs.existsSync(this.filePath)) {
+      fs.writeFileSync(this.filePath, "", { encoding: this.encoding });
+    }
+  }
+
+  persist(record: Article): Promise<void> {
+    if (this.closed) {
+      return Promise.reject(new Error("Persistor has been closed"));
+    }
+
+    const payload = `${JSON.stringify(record)}\n`;
+
+    this.pending = this.pending.then(async () => {
+      // "flag: 'a'" opens the file in append mode; "mode" only sets file
+      // permissions and would overwrite the file on every write.
+      fs.writeFileSync(this.filePath, payload, {
+        encoding: this.encoding,
+        flag: "a",
+      });
+    });
+
+    return this.pending;
+  }
+
+  async close(): Promise<void> {
+    this.closed = true;
+    await this.pending;
+  }
+}
diff --git a/basango/apps/crawler/src/process/sync/tasks.ts b/basango/apps/crawler/src/process/sync/tasks.ts
new file mode 100644
index 0000000..bcf6fd3
--- /dev/null
+++ b/basango/apps/crawler/src/process/sync/tasks.ts
@@ -0,0 +1,29 @@
+import { resolveSourceConfig } from "@/utils";
+import {
+  closePersistors,
+  CrawlingOptions,
+  createPersistors,
+  resolveCrawlerConfig,
+} from "@/process/crawler";
+import logger from "@basango/logger";
+import { WordPressCrawler } from "@/process/parsers/wordpress";
+import { HtmlCrawler } from "@/process/parsers/html";
+
+export const runSyncCrawl = async (options: CrawlingOptions): Promise<void> => {
+  const source = resolveSourceConfig(options.sourceId);
+  const settings = resolveCrawlerConfig(source, options);
+  const persistors = createPersistors(source);
+
+  const crawler =
+    source.sourceKind === "wordpress"
+      ?
new WordPressCrawler(settings, { persistors }) + : new HtmlCrawler(settings, { persistors }); + + try { + await crawler.fetch(); + } finally { + await closePersistors(persistors); + } + + logger.info({ ...options }, "Synchronous crawl completed"); +}; diff --git a/basango/apps/crawler/src/schema.ts b/basango/apps/crawler/src/schema.ts index dc05ee1..1483e5a 100644 --- a/basango/apps/crawler/src/schema.ts +++ b/basango/apps/crawler/src/schema.ts @@ -1,51 +1,8 @@ -import {z} from "zod"; -import {createSourcesConfig, resolveProjectPaths} from "@/utils"; +import { z } from "zod"; export const UpdateDirectionSchema = z.enum(["forward", "backward"]); export const SourceKindSchema = z.enum(["wordpress", "html"]); -export const SourceDateSchema = z.object({ - format: z.string().default("yyyy-LL-dd HH:mm"), - pattern: z.string().nullable().optional(), - replacement: z.string().nullable().optional(), -}); - -export const SourceSelectorsSchema = z.object({ - articles: z.string().optional().nullable(), - article_title: z.string().optional().nullable(), - article_link: z.string().optional().nullable(), - article_body: z.string().optional().nullable(), - article_date: z.string().optional().nullable(), - article_categories: z.string().optional().nullable(), - pagination: z.string().default("ul.pagination > li a"), -}); - -const BaseSourceSchema = z.object({ - source_id: z.string(), - source_url: z.url(), - source_date: SourceDateSchema.default(SourceDateSchema.parse({})), - source_kind: SourceKindSchema, - categories: z.array(z.string()).default([]), - supports_categories: z.boolean().default(false), - requires_details: z.boolean().default(false), - requires_rate_limit: z.boolean().default(false), -}); - -export const HtmlSourceConfigSchema = BaseSourceSchema.extend({ - source_kind: z.literal("html"), - source_selectors: SourceSelectorsSchema.default( - SourceSelectorsSchema.parse({}), - ), - pagination_template: z.string(), -}); - -export const WordPressSourceConfigSchema = BaseSourceSchema.extend({ - source_kind: z.literal("wordpress"), - source_date: SourceDateSchema.default( - SourceDateSchema.parse({format: "yyyy-LL-dd'T'HH:mm:ss"}), - ), -}); - export const DateRangeSchema = z .object({ start: z.number().int(), @@ -96,102 +53,79 @@ export const DateRangeSpecSchema = z .regex(/.+:.+/, "Expected start:end format") .transform((spec) => { const [startRaw, endRaw] = spec.split(":"); - return {startRaw: String(startRaw), endRaw: String(endRaw)}; + return { startRaw: String(startRaw), endRaw: String(endRaw) }; }); -export const ProjectPathsSchema = z.object({ - root: z.string(), - data: z.string(), - logs: z.string(), - configs: z.string(), +export const SourceDateSchema = z.object({ + format: z.string().default("yyyy-LL-dd HH:mm"), + pattern: z.string().nullable().optional(), + replacement: z.string().nullable().optional(), }); -export const LoggingConfigSchema = z.object({ - level: z.string().default("INFO"), - format: z - .string() - .default("%(asctime)s - %(name)s - %(levelname)s - %(message)s"), - console_logging: z.boolean().default(true), - file_logging: z.boolean().default(false), - log_file: z.string().default("crawler.log"), - max_log_size: z - .number() - .int() - .positive() - .default(10 * 1024 * 1024), - backup_count: z.number().int().nonnegative().default(5), +const BaseSourceSchema = z.object({ + sourceId: z.string(), + sourceUrl: z.url(), + sourceDate: SourceDateSchema, + sourceKind: SourceKindSchema, + categories: z.array(z.string()).default([]), + supportsCategories: 
z.boolean().default(false), + requiresDetails: z.boolean().default(false), + requiresRateLimit: z.boolean().default(false), }); -export const ClientConfigSchema = z.object({ - timeout: z.number().positive().default(20), - user_agent: z - .string() - .default("Basango/0.1 (+https://github.com/bernard-ng/basango)"), - follow_redirects: z.boolean().default(true), - verify_ssl: z.boolean().default(true), - rotate: z.boolean().default(true), - max_retries: z.number().int().nonnegative().default(3), - backoff_initial: z.number().nonnegative().default(1), - backoff_multiplier: z.number().positive().default(2), - backoff_max: z.number().nonnegative().default(30), - respect_retry_after: z.boolean().default(true), +export const HtmlSourceConfigSchema = BaseSourceSchema.extend({ + sourceKind: z.literal("html"), + sourceSelectors: z.object({ + articles: z.string(), + articleTitle: z.string(), + articleLink: z.string(), + articleBody: z.string(), + articleDate: z.string(), + articleCategories: z.string().optional(), + pagination: z.string().default("ul.pagination > li a"), + }), + paginationTemplate: z.string(), }); -export const CrawlerConfigSchema = z.object({ - source: z - .union([HtmlSourceConfigSchema, WordPressSourceConfigSchema]) - .optional(), - page_range: PageRangeSchema.optional(), - date_range: DateRangeSchema.optional(), - category: z.string().optional(), - notify: z.boolean().default(false), - is_update: z.boolean().default(false), - use_multi_threading: z.boolean().default(false), - max_workers: z.number().int().positive().default(5), - direction: UpdateDirectionSchema.default("forward"), +export const WordPressSourceConfigSchema = BaseSourceSchema.extend({ + sourceKind: z.literal("wordpress"), + sourceDate: SourceDateSchema.default(SourceDateSchema.parse({ format: "yyyy-LL-dd'T'HH:mm:ss" })), }); -export const FetchConfigSchema = z.object({ - client: ClientConfigSchema.default(ClientConfigSchema.parse({})), - crawler: CrawlerConfigSchema.default(CrawlerConfigSchema.parse({})), +export const ArticleMetadataSchema = z.object({ + title: z.string().optional(), + description: z.string().optional(), + image: z.string().optional(), + url: z.url().optional(), }); -export const SourcesConfigSchema = z.object({ - html: z.array(HtmlSourceConfigSchema).default([]), - wordpress: z.array(WordPressSourceConfigSchema).default([]), +export const ArticleTokenStatisticsSchema = z.object({ + title: z.number().int().nonnegative().default(0), + body: z.number().int().nonnegative().default(0), + excerpt: z.number().int().nonnegative().default(0), + categories: z.number().int().nonnegative().default(0), }); -export const PipelineConfigSchema = z.object({ - paths: ProjectPathsSchema.default(resolveProjectPaths(process.cwd())), - logging: LoggingConfigSchema.default(LoggingConfigSchema.parse({})), - fetch: FetchConfigSchema.default(FetchConfigSchema.parse({})), - sources: z - .union([SourcesConfigSchema, z.undefined()]) - .transform((value) => createSourcesConfig(value ?? 
{})), +export const ArticleSchema = z.object({ + title: z.string(), + link: z.url(), + body: z.string(), + categories: z.array(z.string()).default([]), + source: z.string(), + timestamp: z.number().int(), + metadata: ArticleMetadataSchema.optional(), + tokenStatistics: ArticleTokenStatisticsSchema.optional(), }); -export type UpdateDirection = z.infer; -export type SourceKind = z.infer; -export type SourceDate = z.infer; -export type SourceSelectors = z.infer; +export type ArticleMetadata = z.infer; +export type Article = z.infer; +export type DateRange = z.infer; +export type PageRange = z.infer; export type HtmlSourceConfig = z.infer; export type WordPressSourceConfig = z.infer; export type AnySourceConfig = HtmlSourceConfig | WordPressSourceConfig; -export type DateRange = z.infer; -export type PageRange = z.infer; export interface CreateDateRangeOptions { format?: string; separator?: string; } -export type SourcesConfig = z.infer & { - find: (sourceId: string) => AnySourceConfig | undefined; -}; -export type ProjectPaths = z.infer; -export type LoggingConfig = z.infer; -export type ClientConfig = z.infer; -export type CrawlerConfig = z.infer & { - source?: AnySourceConfig; -}; -export type FetchConfig = z.infer; -export type PipelineConfig = z.infer; diff --git a/basango/apps/crawler/src/scripts/crawl.ts b/basango/apps/crawler/src/scripts/crawl.ts new file mode 100644 index 0000000..d92aa6c --- /dev/null +++ b/basango/apps/crawler/src/scripts/crawl.ts @@ -0,0 +1,22 @@ +import { logger } from "@basango/logger"; +import { runSyncCrawl } from "@/process/sync/tasks"; +import { parseCrawlingCliArgs, CRAWLING_USAGE } from "@/scripts/utils"; + +const main = async (): Promise => { + const options = parseCrawlingCliArgs(); + + if (options.sourceId === undefined) { + console.log(CRAWLING_USAGE); + process.exitCode = 1; + return; + } + + try { + await runSyncCrawl({ ...options }); + } catch (error) { + logger.error({ error }, "Synchronous crawl failed"); + process.exitCode = 1; + } +}; + +void main(); diff --git a/basango/apps/crawler/src/scripts/queue.ts b/basango/apps/crawler/src/scripts/queue.ts index 0294b32..364eb77 100644 --- a/basango/apps/crawler/src/scripts/queue.ts +++ b/basango/apps/crawler/src/scripts/queue.ts @@ -1,78 +1,20 @@ -import { parseArgs } from "node:util"; - import { logger } from "@basango/logger"; -import { PipelineConfigManager } from "@/config"; -import { createQueueSettings } from "@/process/async/queue"; import { scheduleAsyncCrawl } from "@/process/async/tasks"; - -interface QueueCliOptions { - source?: string; - env: string; - page?: string; - date?: string; - category?: string; - "redis-url"?: string; - help?: boolean; -} - -const usage = ` - Usage: bun run src/scripts/queue -- --source [options] - - Options: - --page Optional page range filter (e.g. 1:5) - --date Optional date range filter (e.g. 
2024-01-01:2024-01-31) - --category Optional category to crawl - --redis-url Override Redis connection URL - --env Environment to load (default: development) - -h, --help Show this message -`; - -const parseCliArgs = (): QueueCliOptions => { - const { values } = parseArgs({ - options: { - source: { type: "string" }, - page: { type: "string" }, - date: { type: "string" }, - category: { type: "string" }, - "redis-url": { type: "string" }, - env: { type: "string", default: "development" }, - help: { type: "boolean", short: "h" }, - }, - }); - - return values as QueueCliOptions; -}; +import { parseCrawlingCliArgs, CRAWLING_USAGE } from "@/scripts/utils"; const main = async (): Promise => { - const options = parseCliArgs(); + const options = parseCrawlingCliArgs(); - if (options.help || !options.source) { - console.log(usage); - if (!options.source) { - process.exitCode = 1; - } + if (options.sourceId === undefined) { + console.log(CRAWLING_USAGE); + process.exitCode = 1; return; } - const env = options.env ?? "development"; - const manager = new PipelineConfigManager({ env }); - manager.setupLogging(manager.get(env)); - - const settings = options["redis-url"] - ? createQueueSettings({ redis_url: options["redis-url"] }) - : undefined; - try { - const id = await scheduleAsyncCrawl({ - sourceId: options.source, - env, - pageRange: options.page ?? null, - dateRange: options.date ?? null, - category: options.category ?? null, - settings, - }); + const id = await scheduleAsyncCrawl({ ...options }); - logger.info({ id, ...options }, "Scheduled asynchronous crawl job"); + logger.info({ id, options }, "Scheduled asynchronous crawl job"); } catch (error) { logger.error({ error }, "Failed to schedule crawl job"); process.exitCode = 1; diff --git a/basango/apps/crawler/src/scripts/utils.ts b/basango/apps/crawler/src/scripts/utils.ts new file mode 100644 index 0000000..77bdb0e --- /dev/null +++ b/basango/apps/crawler/src/scripts/utils.ts @@ -0,0 +1,39 @@ +import { parseArgs } from "node:util"; +import { CrawlingOptions } from "@/process/crawler"; + +interface WorkerCliOptions { + queue?: string[]; +} + +export const CRAWLING_USAGE = ` + Usage: bun run crawl:[async|sync] -- --sourceId [options] + + Options: + --page Optional page range filter (e.g. 1:5) + --date Optional date range filter (e.g. 
2024-01-01:2024-01-31)
+    --category            Optional category to crawl
+    -h, --help            Show this message
+`;
+
+export const parseWorkerCliArgs = (): WorkerCliOptions => {
+  const { values } = parseArgs({
+    options: {
+      queue: { type: "string", multiple: true, short: "q" },
+    },
+  });
+
+  return values as WorkerCliOptions;
+};
+
+export const parseCrawlingCliArgs = (): CrawlingOptions => {
+  const { values } = parseArgs({
+    options: {
+      sourceId: { type: "string" },
+      page: { type: "string" },
+      date: { type: "string" },
+      category: { type: "string" },
+      help: { type: "boolean", short: "h" },
+    },
+  });
+
+  // Map CLI flag names onto CrawlingOptions fields; a bare cast would
+  // silently drop --page/--date, which CrawlingOptions expects as
+  // pageRange/dateRange.
+  return {
+    sourceId: values.sourceId,
+    pageRange: values.page,
+    dateRange: values.date,
+    category: values.category,
+  } as CrawlingOptions;
+};
diff --git a/basango/apps/crawler/src/scripts/worker.ts b/basango/apps/crawler/src/scripts/worker.ts
index f877b98..c34b9d4 100644
--- a/basango/apps/crawler/src/scripts/worker.ts
+++ b/basango/apps/crawler/src/scripts/worker.ts
@@ -1,93 +1,20 @@
-import { parseArgs } from "node:util";
-
 import { logger } from "@basango/logger";
 
-import { PipelineConfigManager } from "@/config";
-import { createQueueManager, createQueueSettings } from "@/process/async/queue";
+import { createQueueManager } from "@/process/async/queue";
 import { startWorker } from "@/process/async/worker";
-
-interface WorkerCliOptions {
-  env: string;
-  queue?: string[];
-  concurrency?: string;
-  "redis-url"?: string;
-  help?: boolean;
-}
-
-const usage = `
-  Usage: bun run src/scripts/worker [options]
-
-  Options:
-    --env                 Environment to load (default: development)
-    -q, --queue           Queue name to listen on (repeatable)
-    --concurrency         Number of concurrent jobs per worker
-    --redis-url           Override Redis connection URL
-    -h, --help            Show this message
-`;
-
-const parseCliArgs = (): WorkerCliOptions => {
-  const { values } = parseArgs({
-    options: {
-      env: { type: "string", default: "development" },
-      queue: { type: "string", multiple: true, short: "q" },
-      concurrency: { type: "string" },
-      "redis-url": { type: "string" },
-      help: { type: "boolean", short: "h" },
-    },
-  });
-
-  return values as WorkerCliOptions;
-};
-
-const parseConcurrency = (value?: string): number | undefined => {
-  if (!value) {
-    return undefined;
-  }
-
-  const parsed = Number.parseInt(value, 10);
-  if (Number.isNaN(parsed) || parsed <= 0) {
-    throw new Error(`Invalid concurrency value: ${value}`);
-  }
-
-  return parsed;
-};
+import { parseWorkerCliArgs } from "@/scripts/utils";
 
 const main = async (): Promise<void> => {
-  const options = parseCliArgs();
+  const options = parseWorkerCliArgs();
 
-  if (options.help) {
-    console.log(usage);
-    return;
-  }
-
-  const env = options.env ?? "development";
-  const manager = new PipelineConfigManager({ env });
-  manager.setupLogging(manager.get(env));
-
-  let concurrency: number | undefined;
-  try {
-    concurrency = parseConcurrency(options.concurrency);
-  } catch (error) {
-    logger.error(
-      error instanceof Error ? error : { error },
-      "Invalid concurrency value provided",
-    );
-    process.exitCode = 1;
-    return;
-  }
-  const settings = options["redis-url"]
-    ? createQueueSettings({ redis_url: options["redis-url"] })
-    : undefined;
-  const queueManager = createQueueManager({ settings });
-
-  const queueNames = options.queue?.length
-    ? options.queue.map((name) => queueManager.queueName(name))
+  const manager = createQueueManager();
+  const queues = options.queue?.length
+    ?
diff --git a/basango/apps/crawler/src/scripts/worker.ts b/basango/apps/crawler/src/scripts/worker.ts
index f877b98..c34b9d4 100644
--- a/basango/apps/crawler/src/scripts/worker.ts
+++ b/basango/apps/crawler/src/scripts/worker.ts
@@ -1,93 +1,20 @@
-import { parseArgs } from "node:util";
-
 import { logger } from "@basango/logger";
-import { PipelineConfigManager } from "@/config";
-import { createQueueManager, createQueueSettings } from "@/process/async/queue";
+import { createQueueManager } from "@/process/async/queue";
 import { startWorker } from "@/process/async/worker";
-
-interface WorkerCliOptions {
-  env: string;
-  queue?: string[];
-  concurrency?: string;
-  "redis-url"?: string;
-  help?: boolean;
-}
-
-const usage = `
-  Usage: bun run src/scripts/worker [options]
-
-  Options:
-    --env          Environment to load (default: development)
-    -q, --queue    Queue name to listen on (repeatable)
-    --concurrency  Number of concurrent jobs per worker
-    --redis-url    Override Redis connection URL
-    -h, --help     Show this message
-`;
-
-const parseCliArgs = (): WorkerCliOptions => {
-  const { values } = parseArgs({
-    options: {
-      env: { type: "string", default: "development" },
-      queue: { type: "string", multiple: true, short: "q" },
-      concurrency: { type: "string" },
-      "redis-url": { type: "string" },
-      help: { type: "boolean", short: "h" },
-    },
-  });
-
-  return values as WorkerCliOptions;
-};
-
-const parseConcurrency = (value?: string): number | undefined => {
-  if (!value) {
-    return undefined;
-  }
-
-  const parsed = Number.parseInt(value, 10);
-  if (Number.isNaN(parsed) || parsed <= 0) {
-    throw new Error(`Invalid concurrency value: ${value}`);
-  }
-
-  return parsed;
-};
+import { parseWorkerCliArgs } from "@/scripts/utils";
 
 const main = async (): Promise<void> => {
-  const options = parseCliArgs();
+  const options = parseWorkerCliArgs();
 
-  if (options.help) {
-    console.log(usage);
-    return;
-  }
-
-  const env = options.env ?? "development";
-  const manager = new PipelineConfigManager({ env });
-  manager.setupLogging(manager.get(env));
-
-  let concurrency: number | undefined;
-  try {
-    concurrency = parseConcurrency(options.concurrency);
-  } catch (error) {
-    logger.error(
-      error instanceof Error ? error : { error },
-      "Invalid concurrency value provided",
-    );
-    process.exitCode = 1;
-    return;
-  }
-
-  const settings = options["redis-url"]
-    ? createQueueSettings({ redis_url: options["redis-url"] })
-    : undefined;
-  const queueManager = createQueueManager({ settings });
-
-  const queueNames = options.queue?.length
-    ? options.queue.map((name) => queueManager.queueName(name))
+  const manager = createQueueManager();
+  const queues = options.queue?.length
+    ? options.queue.map((name) => manager.queueName(name))
     : undefined;
 
   const handle = startWorker({
-    queueManager,
-    queueNames,
-    concurrency,
+    queueManager: manager,
+    queueNames: queues,
   });
 
   const shutdown = async (signal: NodeJS.Signals) => {
@@ -95,26 +22,14 @@ const main = async (): Promise<void> => {
     try {
       await handle.close();
     } finally {
-      await queueManager.close();
+      await manager.close();
       process.exit(0);
     }
   };
 
-  process.once("SIGINT", (signal) => {
-    void shutdown(signal);
-  });
-  process.once("SIGTERM", (signal) => {
-    void shutdown(signal);
-  });
-
-  logger.info(
-    {
-      env,
-      queueNames: queueNames ?? queueManager.iterQueueNames(),
-      concurrency: concurrency ?? "default",
-    },
-    "Crawler workers started",
-  );
+  process.once("SIGINT", (signal) => void shutdown(signal));
+  process.once("SIGTERM", (signal) => void shutdown(signal));
+  logger.info({ queueNames: queues }, "Crawler workers started");
 };
 
 void main();
diff --git a/basango/apps/crawler/src/utils.ts b/basango/apps/crawler/src/utils.ts
index 0c5a4f8..de6bc02 100644
--- a/basango/apps/crawler/src/utils.ts
+++ b/basango/apps/crawler/src/utils.ts
@@ -1,32 +1,40 @@
-import fs from "node:fs";
-import path from "node:path";
-
 import type { RedisOptions } from "ioredis";
 import { get_encoding, TiktokenEncoding } from "tiktoken";
 import { format, getUnixTime, isMatch, parse } from "date-fns";
-import { z } from "zod";
 
 import {
+  AnySourceConfig,
   CreateDateRangeOptions,
   DateRange,
   DateRangeSchema,
   DateRangeSpecSchema,
-  PipelineConfig,
-  ProjectPaths,
-  ProjectPathsSchema,
-  SourcesConfig,
-  SourcesConfigSchema,
+  PageRange,
+  PageRangeSchema,
+  PageRangeSpecSchema,
 } from "@/schema";
 import { DEFAULT_DATE_FORMAT } from "@/constants";
+import { config } from "@/config";
 
-export const ensureDirectories = (paths: ProjectPaths): void => {
-  for (const dir of [paths.data, paths.logs, paths.configs]) {
-    if (!fs.existsSync(dir)) {
-      fs.mkdirSync(dir, { recursive: true });
-    }
+/**
+ * Resolve a source configuration by its ID.
+ * @param id - The source ID
+ */
+export const resolveSourceConfig = (id: string): AnySourceConfig => {
+  const source =
+    config.sources.html.find((s) => s.sourceId === id) ||
+    config.sources.wordpress.find((s) => s.sourceId === id);
+
+  if (source === undefined) {
+    throw new Error(`Source '${id}' not found in configuration`);
   }
+
+  return source;
 };
 
+/**
+ * Parse a Redis URL into RedisOptions.
+ * @param url - The Redis URL (e.g., "redis://:password@localhost:6379/0")
+ */
 export const parseRedisUrl = (url: string): RedisOptions => {
   if (!url.startsWith("redis://")) {
     return {};
@@ -40,20 +48,11 @@ export const parseRedisUrl = (url: string): RedisOptions => {
   };
 };
 
-export const countTokens = (
-  text: string,
-  encoding: TiktokenEncoding = "cl100k_base",
-): number => {
-  try {
-    const encoder = get_encoding(encoding);
-    const tokens = encoder.encode(text);
-    encoder.free();
-    return tokens.length;
-  } catch {
-    return text.length;
-  }
-};
-
+/**
+ * Parse a date string using the specified format.
+ * @param value - The date string to parse
+ * @param format - The date format
+ */
 const parseDate = (value: string, format: string): Date => {
   if (!isMatch(value, format)) {
     throw new Error(`Invalid date '${value}' for format '${format}'`);
@@ -65,10 +64,42 @@ const parseDate = (value: string, format: string): Date => {
   }
 
   return parsed;
 };
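A usage sketch for the resolver introduced above; "example-source" is hypothetical and would need an entry under sources.html or sources.wordpress in the loaded configuration:

    const source = resolveSourceConfig("example-source");
    source.sourceId; // "example-source"

    // Unknown ids fail fast rather than returning undefined:
    resolveSourceConfig("missing"); // throws: Source 'missing' not found in configuration
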
+/**
+ * Count the number of tokens in the given text using the specified encoding.
+ * @param text - The input text
+ * @param encoding - The token encoding (default: "cl100k_base")
+ */
+export const countTokens = (text: string, encoding: TiktokenEncoding = "cl100k_base"): number => {
+  try {
+    const encoder = get_encoding(encoding);
+    const tokens = encoder.encode(text);
+    encoder.free();
+    return tokens.length;
+  } catch {
+    return text.length;
+  }
+};
+
+/**
+ * Create a page range from a string specification.
+ * @param spec - The page range specification (e.g., "1:10")
+ */
+export const createPageRange = (spec: string | undefined): PageRange | undefined => {
+  if (!spec) return undefined;
+  const parsed = PageRangeSpecSchema.parse(spec);
+  return PageRangeSchema.parse(parsed);
+};
+
+/**
+ * Create a date range from a string specification.
+ * @param spec - The date range specification (e.g., "2023-01-01:2023-12-31")
+ * @param options - Options for date range creation
+ */
 export const createDateRange = (
-  spec: string,
+  spec: string | undefined,
   options: CreateDateRangeOptions = {},
-): DateRange => {
+): DateRange | undefined => {
+  if (!spec) return undefined;
   const { format = DEFAULT_DATE_FORMAT, separator = ":" } = options;
   if (!separator) {
     throw new Error("Separator cannot be empty");
@@ -88,95 +119,44 @@ export const createDateRange = (
   return DateRangeSchema.parse(range);
 };
 
-export const formatDateRange = (
-  range: DateRange,
-  fmt = DEFAULT_DATE_FORMAT,
-): string => {
+/**
+ * Format a date range into a string representation.
+ * @param range - The date range
+ * @param fmt - The date format (default: DEFAULT_DATE_FORMAT)
+ */
+export const formatDateRange = (range: DateRange, fmt = DEFAULT_DATE_FORMAT): string => {
   const start = format(new Date(range.start * 1000), fmt);
   const end = format(new Date(range.end * 1000), fmt);
   return `${start}:${end}`;
 };
 
-export const isTimestampInRange = (
-  range: DateRange,
-  timestamp: number,
-): boolean => {
+/**
+ * Format a page range into a string representation.
+ * @param range - The page range
+ */
+export const formatPageRange = (range: PageRange): string => {
+  return `${range.start}:${range.end}`;
+};
+
+/**
+ * Check if a timestamp is within a given date range.
+ * @param range - The date range
+ * @param timestamp - The timestamp to check
+ */
+export const isTimestampInRange = (range: DateRange, timestamp: number): boolean => {
   return range.start <= timestamp && timestamp <= range.end;
 };
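Taken together, the range helpers round-trip their string specs. A small sketch, assuming DEFAULT_DATE_FORMAT is a yyyy-MM-dd style pattern (as the usage examples suggest) and that the page spec parses to { start, end }:

    const pages = createPageRange("1:10");
    if (pages) formatPageRange(pages); // "1:10"

    const dates = createDateRange("2024-01-01:2024-01-31");
    if (dates) {
      formatDateRange(dates);                 // "2024-01-01:2024-01-31"
      isTimestampInRange(dates, dates.start); // true
    }
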
-export const resolveProjectPaths = (rootDir: string): ProjectPaths => {
-  return ProjectPathsSchema.parse({
-    root: rootDir,
-    data: path.join(rootDir, "data", "dataset"),
-    logs: path.join(rootDir, "data", "logs"),
-    configs: path.join(rootDir, "config"),
-  });
-};
-
-export const createSourcesConfig = (input: unknown): SourcesConfig => {
-  const parsed = SourcesConfigSchema.parse(input);
-  const resolver = (sourceId: string) =>
-    [...parsed.html, ...parsed.wordpress].find(
-      (source) => source.source_id === sourceId,
-    );
-  return Object.assign({ find: resolver }, parsed);
-};
-
-export const mergePipelineConfig = (
-  base: PipelineConfig,
-  overrides: Partial<PipelineConfig>,
-): PipelineConfig => {
-  const paths = overrides.paths ?? base.paths;
-  const logging = { ...base.logging, ...(overrides.logging ?? {}) };
-  const fetch = {
-    client: { ...base.fetch.client, ...(overrides.fetch?.client ?? {}) },
-    crawler: { ...base.fetch.crawler, ...(overrides.fetch?.crawler ?? {}) },
-  };
-
-  const sources = createSourcesConfig({
-    html: overrides.sources?.html ?? base.sources.html,
-    wordpress: overrides.sources?.wordpress ?? base.sources.wordpress,
-  });
-
-  return {
-    paths,
-    logging,
-    fetch,
-    sources,
-  };
-};
-
-export const resolveConfigPath = (basePath: string, env?: string): string => {
-  if (!env || env === "development") {
-    return basePath;
+/**
+ * Convert a relative URL to an absolute URL based on the base URL.
+ * @param base - The base URL
+ * @param href - The relative or absolute URL
+ */
+export const createAbsoluteUrl = (base: string, href: string): string => {
+  try {
+    // new URL handles relative paths with base
+    return new URL(href, base.endsWith("/") ? base : `${base}/`).toString();
+  } catch {
+    return href;
   }
-
-  const ext = path.extname(basePath);
-  const withoutExt = basePath.slice(0, basePath.length - ext.length);
-  return `${withoutExt}.${env}${ext}`;
-};
-
-export const schemaToJSON = <T extends z.ZodTypeAny>(schema: T): unknown => {
-  const toJSONSchema = (z as any).toJSONSchema as
-    | ((s: z.ZodTypeAny, opts?: Record<string, unknown>) => unknown)
-    | undefined;
-
-  if (typeof toJSONSchema === "function") {
-    try {
-      // target can be "draft-2020-12" | "draft-7" | "draft-4" | "openapi-3.0"
-      return toJSONSchema(schema, {
-        target: "draft-2020-12",
-        unrepresentable: "any",
-      });
-    } catch {
-      // fall through to minimal mapping
-    }
-  }
-
-  if (schema instanceof z.ZodObject) return { type: "object" };
-  if (schema instanceof z.ZodArray) return { type: "array" };
-  if (schema instanceof z.ZodString) return { type: "string" };
-  if (schema instanceof z.ZodNumber) return { type: "number" };
-  if (schema instanceof z.ZodBoolean) return { type: "boolean" };
-
-  return { type: "unknown" };
 };
diff --git a/basango/biome.json b/basango/biome.json
index 2bfef18..77cbd27 100644
--- a/basango/biome.json
+++ b/basango/biome.json
@@ -10,12 +10,22 @@
   },
   "formatter": {
     "enabled": true,
-    "indentStyle": "space"
+    "indentStyle": "space",
+    "indentWidth": 2,
+    "lineEnding": "lf",
+    "lineWidth": 100
   },
   "linter": {
     "enabled": true,
     "rules": {
-      "recommended": true
+      "recommended": true,
+      "style": {
+        "useImportType": "off"
+      },
+      "correctness": {
+        "noUnusedImports": "on",
+        "useImportExtensions": "off"
+      }
     }
   },
   "javascript": {
@@ -27,7 +37,10 @@
     "enabled": true,
     "actions": {
       "source": {
-        "organizeImports": "on"
+        "organizeImports": "on",
+        "useSortedKeys": "on",
+        "useSortedAttributes": "on",
+        "useSortedProperties": "on"
       }
     }
   }
diff --git a/basango/bun.lock b/basango/bun.lock
index b2fe0e8..bbf026d 100644
--- a/basango/bun.lock
+++ b/basango/bun.lock
@@ -13,13 +13,15 @@
     },
     "apps/crawler": {
       "name": "@basango/crawler",
-      "version": "0.1.0",
       "dependencies": {
         "@basango/logger": "workspace:*",
+        "@devscast/config": "^1.0.2",
         "bullmq": "^4.17.0",
-        "date-fns": "^3.6.0",
+        "date-fns": "catalog:",
         "ioredis": "^5.3.2",
+        "node-html-parser": "^7.0.1",
         "tiktoken": "^1.0.14",
+        "turndown": "^7.2.2",
         "zod": "catalog:",
       },
     },
@@ -33,7 +35,7 @@
         "snakecase-keys": "^9.0.2",
       },
       "devDependencies": {
-        "@types/bun": "^1.3.1",
+        "@types/bun": "catalog:",
         "@types/pg": "^8.15.6",
         "drizzle-kit": "^0.31.6",
         "typescript": "catalog:",
@@ -56,6 +58,7 @@
     },
     "catalog": {
       "@types/bun": "^1.3.1",
+      "date-fns": "^3.6.0",
       "typescript": "^5.9.3",
       "zod": "^4.0.0",
     },
@@ -88,6 +91,8 @@
 
     "@date-fns/utc": ["@date-fns/utc@2.1.1", "", {}, "sha512-SlJDfG6RPeEX8wEVv6ZB3kak4MmbtyiI2qX/5zuKdordbrhB/iaJ58GVMZgJ6P1sJaM1gMgENFYYeg1JWrCFrA=="],
 
+    "@devscast/config": ["@devscast/config@1.0.2", "", { "peerDependencies": { "ini": "^6.0.0", "yaml": "^2.8.1", "zod": "^4.1.12" }, "optionalPeers": ["ini", "yaml"] },
"sha512-1DR8GQogAOrR4B9mtZ24YIKlEZNvKOFeovw+XepfkXVx0MB1f1fAHtPAAXppV7RPMLSyQEMFJzve17x2HbohEw=="], + "@drizzle-team/brocli": ["@drizzle-team/brocli@0.10.2", "", {}, "sha512-z33Il7l5dKjUgGULTqBsQBQwckHh5AbIuxhdsIxDDiZAzBOrZO6q9ogcWC65kU382AfynTfgNumVcNIjuIua6w=="], "@esbuild-kit/core-utils": ["@esbuild-kit/core-utils@3.3.2", "", { "dependencies": { "esbuild": "~0.18.20", "source-map-support": "^0.5.21" } }, "sha512-sPRAnw9CdSsRmEtnsl2WXWdyquogVpB3yZ3dgwJfe8zrOzTsV7cJvmwrKVa+0ma5BoiGJ+BoqkMvawbayKUsqQ=="], @@ -156,6 +161,8 @@ "@manypkg/tools": ["@manypkg/tools@2.1.0", "", { "dependencies": { "jju": "^1.4.0", "js-yaml": "^4.1.0", "tinyglobby": "^0.2.13" } }, "sha512-0FOIepYR4ugPYaHwK7hDeHDkfPOBVvayt9QpvRbi2LT/h2b0GaE/gM9Gag7fsnyYyNaTZ2IGyOuVg07IYepvYQ=="], + "@mixmark-io/domino": ["@mixmark-io/domino@2.2.0", "", {}, "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw=="], + "@msgpackr-extract/msgpackr-extract-darwin-arm64": ["@msgpackr-extract/msgpackr-extract-darwin-arm64@3.0.3", "", { "os": "darwin", "cpu": "arm64" }, "sha512-QZHtlVgbAdy2zAqNA9Gu1UpIuI8Xvsd1v8ic6B2pZmeFnFcMWiPLfWXh7TVw4eGEZ/C9TH281KwhVoeQUKbyjw=="], "@msgpackr-extract/msgpackr-extract-darwin-x64": ["@msgpackr-extract/msgpackr-extract-darwin-x64@3.0.3", "", { "os": "darwin", "cpu": "x64" }, "sha512-mdzd3AVzYKuUmiWOQ8GNhl64/IoFGol569zNRdkLReh6LRLHOXxU4U8eq0JwaD8iFHdVGqSy4IjFL4reoWCDFw=="], @@ -190,6 +197,8 @@ "balanced-match": ["balanced-match@1.0.2", "", {}, "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw=="], + "boolbase": ["boolbase@1.0.0", "", {}, "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="], + "brace-expansion": ["brace-expansion@2.0.2", "", { "dependencies": { "balanced-match": "^1.0.0" } }, "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ=="], "buffer-from": ["buffer-from@1.1.2", "", {}, "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ=="], @@ -208,6 +217,10 @@ "cron-parser": ["cron-parser@4.9.0", "", { "dependencies": { "luxon": "^3.2.1" } }, "sha512-p0SaNjrHOnQeR8/VnfGbmg9te2kfyYSQ7Sc/j/6DtPL3JQvKxmjO9TSjNFpujqV3vEYYBvNNvXSxzyksBWAx1Q=="], + "css-select": ["css-select@5.2.2", "", { "dependencies": { "boolbase": "^1.0.0", "css-what": "^6.1.0", "domhandler": "^5.0.2", "domutils": "^3.0.1", "nth-check": "^2.0.1" } }, "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw=="], + + "css-what": ["css-what@6.2.2", "", {}, "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA=="], + "csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="], "date-fns": ["date-fns@3.6.0", "", {}, "sha512-fRHTG8g/Gif+kSh50gaGEdToemgfj74aRX3swtiouboip5JDLAyDE9F11nHMIcvOaXeOC6D7SpNhi7uFyB7Uww=="], @@ -224,12 +237,22 @@ "detect-libc": ["detect-libc@2.1.2", "", {}, "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ=="], + "dom-serializer": ["dom-serializer@2.0.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.2", "entities": "^4.2.0" } }, "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg=="], + + "domelementtype": ["domelementtype@2.3.0", "", {}, "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw=="], + 
+ "domhandler": ["domhandler@5.0.3", "", { "dependencies": { "domelementtype": "^2.3.0" } }, "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w=="], + + "domutils": ["domutils@3.2.2", "", { "dependencies": { "dom-serializer": "^2.0.0", "domelementtype": "^2.3.0", "domhandler": "^5.0.3" } }, "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw=="], + "drizzle-kit": ["drizzle-kit@0.31.6", "", { "dependencies": { "@drizzle-team/brocli": "^0.10.2", "@esbuild-kit/esm-loader": "^2.5.5", "esbuild": "^0.25.4", "esbuild-register": "^3.5.0" }, "bin": { "drizzle-kit": "bin.cjs" } }, "sha512-/B4e/4pwnx25QwD5xXgdpo1S+077a2VZdosXbItE/oNmUgQwZydGDz9qJYmnQl/b+5IX0rLfwRhrPnroGtrg8Q=="], "drizzle-orm": ["drizzle-orm@0.44.7", "", { "peerDependencies": { "@aws-sdk/client-rds-data": ">=3", "@cloudflare/workers-types": ">=4", "@electric-sql/pglite": ">=0.2.0", "@libsql/client": ">=0.10.0", "@libsql/client-wasm": ">=0.10.0", "@neondatabase/serverless": ">=0.10.0", "@op-engineering/op-sqlite": ">=2", "@opentelemetry/api": "^1.4.1", "@planetscale/database": ">=1.13", "@prisma/client": "*", "@tidbcloud/serverless": "*", "@types/better-sqlite3": "*", "@types/pg": "*", "@types/sql.js": "*", "@upstash/redis": ">=1.34.7", "@vercel/postgres": ">=0.8.0", "@xata.io/client": "*", "better-sqlite3": ">=7", "bun-types": "*", "expo-sqlite": ">=14.0.0", "gel": ">=2", "knex": "*", "kysely": "*", "mysql2": ">=2", "pg": ">=8", "postgres": ">=3", "sql.js": ">=1", "sqlite3": ">=5" }, "optionalPeers": ["@aws-sdk/client-rds-data", "@cloudflare/workers-types", "@electric-sql/pglite", "@libsql/client", "@libsql/client-wasm", "@neondatabase/serverless", "@op-engineering/op-sqlite", "@opentelemetry/api", "@planetscale/database", "@prisma/client", "@tidbcloud/serverless", "@types/better-sqlite3", "@types/pg", "@types/sql.js", "@upstash/redis", "@vercel/postgres", "@xata.io/client", "better-sqlite3", "bun-types", "expo-sqlite", "gel", "knex", "kysely", "mysql2", "pg", "postgres", "sql.js", "sqlite3"] }, "sha512-quIpnYznjU9lHshEOAYLoZ9s3jweleHlZIAWR/jX9gAWNg/JhQ1wj0KGRf7/Zm+obRrYd9GjPVJg790QY9N5AQ=="], "end-of-stream": ["end-of-stream@1.4.5", "", { "dependencies": { "once": "^1.4.0" } }, "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg=="], + "entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], + "esbuild": ["esbuild@0.25.11", "", { "optionalDependencies": { "@esbuild/aix-ppc64": "0.25.11", "@esbuild/android-arm": "0.25.11", "@esbuild/android-arm64": "0.25.11", "@esbuild/android-x64": "0.25.11", "@esbuild/darwin-arm64": "0.25.11", "@esbuild/darwin-x64": "0.25.11", "@esbuild/freebsd-arm64": "0.25.11", "@esbuild/freebsd-x64": "0.25.11", "@esbuild/linux-arm": "0.25.11", "@esbuild/linux-arm64": "0.25.11", "@esbuild/linux-ia32": "0.25.11", "@esbuild/linux-loong64": "0.25.11", "@esbuild/linux-mips64el": "0.25.11", "@esbuild/linux-ppc64": "0.25.11", "@esbuild/linux-riscv64": "0.25.11", "@esbuild/linux-s390x": "0.25.11", "@esbuild/linux-x64": "0.25.11", "@esbuild/netbsd-arm64": "0.25.11", "@esbuild/netbsd-x64": "0.25.11", "@esbuild/openbsd-arm64": "0.25.11", "@esbuild/openbsd-x64": "0.25.11", "@esbuild/openharmony-arm64": "0.25.11", "@esbuild/sunos-x64": "0.25.11", "@esbuild/win32-arm64": "0.25.11", "@esbuild/win32-ia32": "0.25.11", "@esbuild/win32-x64": "0.25.11" }, "bin": { "esbuild": "bin/esbuild" } }, 
"sha512-KohQwyzrKTQmhXDW1PjCv3Tyspn9n5GcY2RTDqeORIdIJY8yKIF7sTSopFmn/wpMPW4rdPXI0UE5LJLuq3bx0Q=="], "esbuild-register": ["esbuild-register@3.6.0", "", { "dependencies": { "debug": "^4.3.4" }, "peerDependencies": { "esbuild": ">=0.12 <1" } }, "sha512-H2/S7Pm8a9CL1uhp9OvjwrBh5Pvx0H8qVOxNu8Wed9Y7qv56MPtq+GGM8RJpq6glYJn9Wspr8uw7l55uyinNeg=="], @@ -248,6 +271,8 @@ "graceful-fs": ["graceful-fs@4.2.10", "", {}, "sha512-9ByhssR2fPVsNZj478qUUbKfmL0+t5BDVyjShtyZZLiK7ZDAArFFfopyOTj0M05wE2tJPisA4iTnnXl2YoPvOA=="], + "he": ["he@1.2.0", "", { "bin": { "he": "bin/he" } }, "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw=="], + "help-me": ["help-me@5.0.0", "", {}, "sha512-7xgomUX6ADmcYzFik0HzAxh/73YlKR9bmFzf51CZwR+b6YtzU2m0u49hQCqV6SvlqIqsaxovfwdvbnsw3b/zpg=="], "inflight": ["inflight@1.0.6", "", { "dependencies": { "once": "^1.3.0", "wrappy": "1" } }, "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA=="], @@ -290,8 +315,12 @@ "node-gyp-build-optional-packages": ["node-gyp-build-optional-packages@5.2.2", "", { "dependencies": { "detect-libc": "^2.0.1" }, "bin": { "node-gyp-build-optional-packages": "bin.js", "node-gyp-build-optional-packages-optional": "optional.js", "node-gyp-build-optional-packages-test": "build-test.js" } }, "sha512-s+w+rBWnpTMwSFbaE0UXsRlg7hU4FjekKU4eyAih5T8nJuNZT1nNsskXpxmeqSK9UzkBl6UgRlnKc8hz8IEqOw=="], + "node-html-parser": ["node-html-parser@7.0.1", "", { "dependencies": { "css-select": "^5.1.0", "he": "1.2.0" } }, "sha512-KGtmPY2kS0thCWGK0VuPyOS+pBKhhe8gXztzA2ilAOhbUbxa9homF1bOyKvhGzMLXUoRds9IOmr/v5lr/lqNmA=="], + "normalize-path": ["normalize-path@3.0.0", "", {}, "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA=="], + "nth-check": ["nth-check@2.1.1", "", { "dependencies": { "boolbase": "^1.0.0" } }, "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w=="], + "on-exit-leak-free": ["on-exit-leak-free@2.1.2", "", {}, "sha512-0eJJY6hXLGf1udHwfNftBqH+g73EU4B504nZeKpz1sYRKafAghwxEJunB2O7rDZkL4PGfsMVnTXZ2EjibbqcsA=="], "once": ["once@1.4.0", "", { "dependencies": { "wrappy": "1" } }, "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w=="], @@ -406,6 +435,8 @@ "turbo-windows-arm64": ["turbo-windows-arm64@2.5.8", "", { "os": "win32", "cpu": "arm64" }, "sha512-eFC5XzLmgXJfnAK3UMTmVECCwuBcORrWdewoiXBnUm934DY6QN8YowC/srhNnROMpaKaqNeRpoB5FxCww3eteQ=="], + "turndown": ["turndown@7.2.2", "", { "dependencies": { "@mixmark-io/domino": "^2.2.0" } }, "sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ=="], + "type-fest": ["type-fest@4.41.0", "", {}, "sha512-TeTSQ6H5YHvpqVwBRcnLDCBnDOHWYu7IvGbHT6N8AOymcr9PJGjc1GTtiWZTYg0NCgYwvnYWEkVChQAr9bjfwA=="], "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="], diff --git a/basango/packages/db/src/queries/articles.ts b/basango/packages/db/src/queries/articles.ts index c018ee6..308aaca 100644 --- a/basango/packages/db/src/queries/articles.ts +++ b/basango/packages/db/src/queries/articles.ts @@ -29,17 +29,17 @@ export interface ArticleFilters { export interface ArticleOverviewRow { article_id: string; - article_title: string; - article_link: string; - article_categories: string | null; + articleTitle: string; + articleLink: string; + articleCategories: 
string | null; article_excerpt: string | null; article_published_at: string; article_image: string | null; article_reading_time: number | null; - source_id: string; + sourceId: string; source_display_name: string | null; source_image: string; - source_url: string; + sourceUrl: string; source_name: string; source_created_at: string; article_is_bookmarked: boolean; @@ -52,10 +52,10 @@ export interface ArticleOverviewResult { export interface ArticleDetailsRow { article_id: string; - article_title: string; - article_link: string; - article_categories: string | null; - article_body: string; + articleTitle: string; + articleLink: string; + articleCategories: string | null; + articleBody: string; article_hash: string; article_published_at: string; article_crawled_at: string; @@ -66,10 +66,10 @@ export interface ArticleDetailsRow { article_sentiment: string; article_metadata: unknown; article_reading_time: number | null; - source_id: string; + sourceId: string; source_name: string; source_description: string | null; - source_url: string; + sourceUrl: string; source_updated_at: string | null; source_display_name: string | null; source_bias: string; @@ -269,18 +269,18 @@ async function fetchArticleOverview( const selectFields = { article_id: articles.id, - article_title: articles.title, - article_link: articles.link, - article_categories: sql`array_to_string + articleTitle: articles.title, + articleLink: articles.link, + articleCategories: sql`array_to_string (${articles.categories}, ',')`, article_excerpt: articles.excerpt, article_published_at: articles.publishedAt, article_image: articles.image, article_reading_time: articles.readingTime, - source_id: sources.id, + sourceId: sources.id, source_display_name: sources.displayName, source_image: sql`('${SOURCE_IMAGE_BASE}' || ${sources.name} || '.png')`, - source_url: sources.url, + sourceUrl: sources.url, source_name: sources.name, source_created_at: sources.createdAt, article_is_bookmarked: bookmarkExpression, @@ -405,18 +405,18 @@ export async function getBookmarkedArticleList( const selectFields = { article_id: articles.id, - article_title: articles.title, - article_link: articles.link, - article_categories: sql`array_to_string + articleTitle: articles.title, + articleLink: articles.link, + articleCategories: sql`array_to_string (${articles.categories}, ',')`, article_excerpt: articles.excerpt, article_published_at: articles.publishedAt, article_image: articles.image, article_reading_time: articles.readingTime, - source_id: sources.id, + sourceId: sources.id, source_display_name: sources.displayName, source_image: sql`('${SOURCE_IMAGE_BASE}' || ${sources.name} || '.png')`, - source_url: sources.url, + sourceUrl: sources.url, source_name: sources.name, source_created_at: sources.createdAt, article_is_bookmarked: sql`true`, @@ -492,11 +492,11 @@ export async function getArticleDetails( const [row] = await db .select({ article_id: articles.id, - article_title: articles.title, - article_link: articles.link, - article_categories: sql`array_to_string + articleTitle: articles.title, + articleLink: articles.link, + articleCategories: sql`array_to_string (${articles.categories}, ',')`, - article_body: articles.body, + articleBody: articles.body, article_hash: articles.hash, article_published_at: articles.publishedAt, article_crawled_at: articles.crawledAt, @@ -507,10 +507,10 @@ export async function getArticleDetails( article_sentiment: articles.sentiment, article_metadata: articles.metadata, article_reading_time: articles.readingTime, - source_id: 
sources.id,
+      sourceId: sources.id,
       source_name: sources.name,
       source_description: sources.description,
-      source_url: sources.url,
+      sourceUrl: sources.url,
       source_updated_at: sources.updatedAt,
       source_display_name: sources.displayName,
       source_bias: sources.bias,
diff --git a/basango/packages/db/src/queries/sources.ts b/basango/packages/db/src/queries/sources.ts
index 41d9ff1..cb1f857 100644
--- a/basango/packages/db/src/queries/sources.ts
+++ b/basango/packages/db/src/queries/sources.ts
@@ -13,10 +13,10 @@ import {
 import { PUBLICATION_GRAPH_DAYS, SOURCE_IMAGE_BASE } from "@/constant";
 
 export interface SourceOverviewRow {
-  source_id: string;
+  sourceId: string;
   source_display_name: string | null;
   source_image: string;
-  source_url: string;
+  sourceUrl: string;
   source_name: string;
   source_created_at: string;
   source_is_followed: boolean;
@@ -40,10 +40,10 @@ export interface CategoryShare {
 
 export interface SourceDetailsResult {
   source: {
-    source_id: string;
+    sourceId: string;
     source_name: string;
     source_description: string | null;
-    source_url: string;
+    sourceUrl: string;
     source_updated_at: string | null;
     source_display_name: string | null;
     source_bias: string;
@@ -148,7 +148,7 @@ function buildFollowExistsExpression(userId: string): SQL {
   return sql`EXISTS (SELECT 1
     FROM ${followedSources} f
-    WHERE f.source_id = ${sources.id}
+    WHERE f."sourceId" = ${sources.id}
       AND f.follower_id = ${userId})`;
 }
 
@@ -161,10 +161,10 @@ export async function getSourceOverviewList(
   let query = db
     .select({
-      source_id: sources.id,
+      sourceId: sources.id,
       source_display_name: sources.displayName,
       source_image: sql`('${SOURCE_IMAGE_BASE}' || ${sources.name} || '.png')`,
-      source_url: sources.url,
+      sourceUrl: sources.url,
       source_name: sources.name,
       source_created_at: sources.createdAt,
       source_is_followed: followExpression,
@@ -186,7 +186,7 @@
     .limit(page.limit + 1);
 
   return buildPaginationResult(rows, page, {
-    id: "source_id",
+    id: "sourceId",
     date: "source_created_at",
   });
 }
@@ -298,10 +298,10 @@
   const [row] = await db
     .select({
-      source_id: sources.id,
+      sourceId: sources.id,
       source_name: sources.name,
       source_description: sources.description,
-      source_url: sources.url,
+      sourceUrl: sources.url,
       source_updated_at: sources.updatedAt,
       source_display_name: sources.displayName,
       source_bias: sources.bias,
diff --git a/basango/packages/db/src/schema.ts b/basango/packages/db/src/schema.ts
index 2b71668..414b8a8 100644
--- a/basango/packages/db/src/schema.ts
+++ b/basango/packages/db/src/schema.ts
@@ -101,7 +101,7 @@
       sql`lower (${table.name})`,
     ),
-    uniqueIndex("unq_source_url").using(
+    uniqueIndex("unq_sourceUrl").using(
       "btree",
       sql`lower (${table.url})`,
@@ -113,7 +113,7 @@ export const articles = pgTable(
   "article",
   {
     id: uuid("id").notNull().defaultRandom().primaryKey(),
-    sourceId: uuid("source_id").notNull(),
+    sourceId: uuid("sourceId").notNull(),
     title: varchar("title", { length: 1024 }).notNull(),
     body: text("body").notNull(),
     hash: varchar("hash", { length: 32 }).notNull(),
@@ -143,7 +143,7 @@
     ),
   },
   (table) => [
-    index("article_source_id_idx").on(table.sourceId),
+    index("article_sourceId_idx").on(table.sourceId),
     index("idx_article_published_at").using("btree", table.publishedAt.desc()),
     index("idx_article_published_id").using(
       "btree",
@@ -152,16 +152,16 @@
     ),
     unique("unq_article_hash").on(table.hash),
    index("gin_article_tsv").using("gin", table.tsv),
-    index("gin_article_link_trgm").using("gin", table.link.op("gin_trgm_ops")),
-    index("gin_article_title_trgm").using(
+    index("gin_articleLink_trgm").using("gin", table.link.op("gin_trgm_ops")),
+    index("gin_articleTitle_trgm").using(
       "gin",
       table.title.op("gin_trgm_ops"),
     ),
-    index("gin_article_categories").using("gin", table.categories),
+    index("gin_articleCategories").using("gin", table.categories),
     foreignKey({
       columns: [table.sourceId],
       foreignColumns: [sources.id],
-      name: "article_source_id_fkey",
+      name: "article_sourceId_fkey",
     }).onDelete("cascade"),
     {
       kind: "check",
@@ -288,12 +288,12 @@
   {
     id: uuid("id").notNull().defaultRandom().primaryKey(),
     followerId: uuid("follower_id").notNull(),
-    sourceId: uuid("source_id").notNull(),
+    sourceId: uuid("sourceId").notNull(),
     createdAt: timestamp("created_at", { mode: "string" }).notNull(),
   },
   (table) => [
     index("followed_source_follower_idx").on(table.followerId),
-    index("followed_source_source_idx").on(table.sourceId),
+    index("followed_source_sourceId_idx").on(table.sourceId),
     index("idx_followed_source_follower_created").using(
       "btree",
       table.followerId,
@@ -307,7 +307,7 @@
     foreignKey({
       columns: [table.sourceId],
       foreignColumns: [sources.id],
-      name: "followed_source_source_id_fkey",
+      name: "followed_source_sourceId_fkey",
     }).onDelete("cascade"),
   ],
 );
diff --git a/basango/packages/tsconfig/base.json b/basango/packages/tsconfig/base.json
index 64cee93..f389323 100644
--- a/basango/packages/tsconfig/base.json
+++ b/basango/packages/tsconfig/base.json
@@ -20,7 +20,7 @@
     "erasableSyntaxOnly": true,
     "noFallthroughCasesInSwitch": true,
     "noUncheckedSideEffectImports": true,
-    "allowImportingTsExtensions": true,
+    "allowImportingTsExtensions": false,
     "strict": true,
     "target": "ES2022",
     "baseUrl": "."
diff --git a/package-lock.json b/package-lock.json
new file mode 100644
index 0000000..314a7b7
--- /dev/null
+++ b/package-lock.json
@@ -0,0 +1,6 @@
+{
+  "name": "basango",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {}
+}
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/package.json
@@ -0,0 +1 @@
+{}
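One note on the camelCase column names introduced in schema.ts above: Postgres folds unquoted identifiers to lowercase, so any raw SQL that references a column created as uuid("sourceId") must quote the identifier, as the follow-exists expression now does. A minimal drizzle-orm sketch:

    // f.sourceId would resolve to f.sourceid and miss the case-sensitive
    // "sourceId" column; quoting preserves the exact name.
    sql`EXISTS (SELECT 1
      FROM ${followedSources} f
      WHERE f."sourceId" = ${sources.id}
        AND f.follower_id = ${userId})`;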