From 05a78913c0eb1a3c75537d206787910a8d4f6c1a Mon Sep 17 00:00:00 2001
From: bernard-ng
Date: Fri, 14 Nov 2025 14:31:32 +0200
Subject: [PATCH] feat(crawler): sync local data

---
 apps/api/src/index.ts                         | 13 +---
 apps/api/src/schemas/articles.ts              | 14 ++--
 apps/crawler/config/sources.json              | 29 +++------
 apps/crawler/src/http/open-graph.ts           |  7 +-
 apps/crawler/src/process/parsers/html.ts      | 64 +++++++++----------
 apps/crawler/src/process/parsers/wordpress.ts |  3 +-
 apps/crawler/src/schema.ts                    |  2 -
 apps/crawler/src/scripts/sync.ts              | 10 ++-
 apps/crawler/src/utils.ts                     | 14 ++++
 9 files changed, 81 insertions(+), 75 deletions(-)

diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts
index 53f594a..d87bdc8 100644
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@@ -12,18 +12,7 @@ import { appRouter } from "#api/trpc/routers/_app";
 
 const app = new OpenAPIHono();
 
-app.use(async (c, next) => {
-  const data = await c.req.json();
-
-  console.log("Incoming Request:", {
-    data: data,
-    headers: c.req.header,
-    method: c.req.method,
-    url: c.req.url,
-  });
-
-  return next();
-});
+app.use(logger());
 
 app.use(secureHeaders());
 app.use(
diff --git a/apps/api/src/schemas/articles.ts b/apps/api/src/schemas/articles.ts
index 651feea..550d92a 100644
--- a/apps/api/src/schemas/articles.ts
+++ b/apps/api/src/schemas/articles.ts
@@ -38,10 +38,16 @@ export const createArticleSchema = z
       example: "https://example.com/article",
     }),
     metadata: metadataSchema.optional(),
-    publishedAt: z.date().openapi({
-      description: "The publication date of the article.",
-      example: "2023-01-01T00:00:00Z",
-    }),
+    publishedAt: z
+      .string()
+      .refine((value) => !Number.isNaN(Date.parse(value)), {
+        message: "Invalid date format",
+      })
+      .transform((value) => new Date(value))
+      .openapi({
+        description: "The publication date of the article in ISO 8601 format.",
+        example: "2023-01-01T00:00:00Z",
+      }),
     sourceId: z.string().openapi({
       description: "The unique identifier of the source from which the article was crawled.",
       example: "radiookapi.net",
diff --git a/apps/crawler/config/sources.json b/apps/crawler/config/sources.json
index 37193a7..6d838d0 100644
--- a/apps/crawler/config/sources.json
+++ b/apps/crawler/config/sources.json
@@ -5,16 +5,13 @@
     "paginationTemplate": "actualite",
     "requiresDetails": true,
     "requiresRateLimit": false,
-    "sourceDate": {
-      "pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
-      "replacement": "$3-$2-$1 $4"
-    },
+    "sourceDate": {},
     "sourceId": "radiookapi.net",
     "sourceKind": "html",
     "sourceSelectors": {
       "articleBody": ".field-name-body",
       "articleCategories": ".views-field-field-cat-gorie a",
-      "articleDate": ".views-field-created",
+      "articleDate": "head > meta[property=\"article:published_time\"]",
       "articleLink": ".views-field-title a",
       "articles": ".view-content > .views-row.content-row",
       "articleTitle": "h1.page-header",
@@ -26,17 +23,14 @@
   {
     "categories": ["politique", "economie", "culture", "sport", "societe"],
     "paginationTemplate": "index.php/category/{category}",
-    "requiresDetails": false,
+    "requiresDetails": true,
    "requiresRateLimit": false,
-    "sourceDate": {
-      "pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
-      "replacement": "$3-$2-$1 $4"
-    },
+    "sourceDate": {},
     "sourceId": "7sur7.cd",
     "sourceKind": "html",
     "sourceSelectors": {
-      "articleBody": ".field.field--name-body",
-      "articleDate": ".views-field-created",
+      "articleBody": "div[property=\"schema:text\"].field.field--name-body",
+      "articleDate": "head > meta[property=\"article:published_time\"]",
       "articleLink": ".views-field-title a",
       "articles": ".view-content > .row.views-row",
       "articleTitle": ".views-field-title a",
@@ -50,7 +44,7 @@
     "requiresDetails": true,
     "requiresRateLimit": false,
     "sourceDate": {
-      "format": "%d.%m.%Y %H:%M"
+      "format": "dd.MM.yyyy"
     },
     "sourceId": "mediacongo.net",
     "sourceKind": "html",
@@ -70,16 +64,13 @@
     "paginationTemplate": "actualite",
     "requiresDetails": true,
     "requiresRateLimit": false,
-    "sourceDate": {
-      "pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/",
-      "replacement": "$4-$3-$2 $5"
-    },
+    "sourceDate": {},
     "sourceId": "actualite.cd",
     "sourceKind": "html",
     "sourceSelectors": {
-      "articleBody": ".views-field.views-field-body",
+      "articleBody": ".views-field.views-field-body .field-content",
       "articleCategories": "#actu-cat",
-      "articleDate": "#p-date",
+      "articleDate": "head > meta[property=\"article:published_time\"]",
       "articleLink": "#actu-titre a",
       "articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
       "articleTitle": "h1.page-title"
diff --git a/apps/crawler/src/http/open-graph.ts b/apps/crawler/src/http/open-graph.ts
index 4dbe018..008da50 100644
--- a/apps/crawler/src/http/open-graph.ts
+++ b/apps/crawler/src/http/open-graph.ts
@@ -5,6 +5,7 @@ import { OPEN_GRAPH_USER_AGENT } from "#crawler/constants";
 import { SyncHttpClient } from "#crawler/http/http-client";
 import { UserAgents } from "#crawler/http/user-agent";
 import { ArticleMetadata } from "#crawler/schema";
+import { createAbsoluteUrl } from "#crawler/utils";
 
 /**
  * Picks the first non-empty value from the provided array.
@@ -71,7 +72,7 @@
    * @param html - HTML content as a string
    * @param url - Optional URL of the page
    */
-  static consumeHtml(html: string, url?: string): ArticleMetadata | undefined {
+  static consumeHtml(html: string, url: string): ArticleMetadata | undefined {
     if (!html) {
       return undefined;
     }
@@ -95,9 +96,9 @@
 
     return {
       description,
-      image,
+      image: createAbsoluteUrl(url, image ?? "") || undefined,
       title,
-      url: canonical,
+      url: createAbsoluteUrl(url, canonical ?? "") || undefined,
     };
   }
 }
diff --git a/apps/crawler/src/process/parsers/html.ts b/apps/crawler/src/process/parsers/html.ts
index 8504c2d..dea1da5 100644
--- a/apps/crawler/src/process/parsers/html.ts
+++ b/apps/crawler/src/process/parsers/html.ts
@@ -1,5 +1,5 @@
 import { logger } from "@basango/logger";
-import { getUnixTime, isMatch as isDateMatch, parse as parseDateFns } from "date-fns";
+import { fromUnixTime, getUnixTime, isMatch as isDateMatch, parse } from "date-fns";
 import { HTMLElement } from "node-html-parser";
 import TurndownService from "turndown";
 
@@ -21,19 +21,6 @@ const md = new TurndownService({
   hr: "---",
 });
 
-/**
- * Create a safe RegExp from the given pattern.
- * @param pattern
- */
-const safeRegExp = (pattern?: string | null): RegExp | null => {
-  if (!pattern) return null;
-  try {
-    return new RegExp(pattern, "g");
-  } catch {
-    return null;
-  }
-};
-
 /**
  * Crawler for generic HTML pages.
 */
@@ -123,10 +110,10 @@
     const root = this.parseHtml(html);
     const selectors = this.source.sourceSelectors;
 
-    const title = this.extractText(root, selectors.articleTitle) ?? "Untitled";
+    const title = this.extractText(root, selectors.articleTitle);
     const link = this.currentNode ?? this.extractLink(root);
-    if (!link) {
-      throw new InvalidArticleError("Missing article link");
+    if (!link || !title) {
+      throw new InvalidArticleError("Missing article link or title");
     }
 
     const body = this.extractBody(root, selectors.articleBody);
@@ -148,7 +135,7 @@
       body,
       categories,
       link,
-      publishedAt: new Date(timestamp * 1000),
+      publishedAt: fromUnixTime(timestamp),
       sourceId: this.source.sourceId,
       title,
     },
@@ -271,6 +258,19 @@
       const pick = (alt ?? title ?? "").trim();
       if (pick.length > 0) return pick;
     }
+
+    // If it's a time tag, prefer datetime attribute
+    if (tag === "time") {
+      const datetime = target.getAttribute("datetime");
+      if (datetime) return datetime.trim();
+    }
+
+    // If it's a meta tag, prefer content attribute
+    if (tag === "meta") {
+      const content = target.getAttribute("content");
+      if (content) return content.trim();
+    }
+
     return this.textContent(target);
   }
 
@@ -296,7 +296,9 @@
    * @param selector - The CSS selector
    */
   private extractCategories(root: HTMLElement, selector?: string | null): string[] {
+    if (!selector && this.settings.category) return [this.settings.category.toLowerCase()];
     if (!selector) return [];
+
     const values: string[] = [];
     for (const node of this.extractAll(root, selector)) {
       const text = this.textContent(node);
@@ -314,24 +316,22 @@
    */
   private computeTimestamp(raw?: string | null): number {
     if (!raw) return Math.floor(Date.now() / 1000);
-    let value = raw.trim();
-    const pattern = safeRegExp(this.source.sourceDate?.pattern);
-    const replacement = this.source.sourceDate?.replacement ?? "";
-    if (pattern) {
-      try {
-        value = value.replace(pattern, replacement);
-      } catch {
-        // ignore pattern failures
-      }
+    const value = raw.trim();
+
+    const format = this.source.sourceDate.format;
+    if (format === "dd.MM.yyyy") {
+      const [day, month, year] = raw.split(".").map(Number);
+      const timestamp = getUnixTime(new Date(year!, month! - 1, day));
+      return Number.isFinite(timestamp) ? timestamp : Math.floor(Date.now() / 1000);
     }
-    const format = this.source.sourceDate?.format ?? "yyyy-LL-dd HH:mm";
+
     if (!isDateMatch(value, format)) {
-      // fallback: try native Date.parse as last resort
       const parsed = Date.parse(value);
       return Number.isNaN(parsed) ? Math.floor(Date.now() / 1000) : Math.floor(parsed / 1000);
     }
-    const date = parseDateFns(value, format, new Date());
-    const ts = getUnixTime(date);
-    return Number.isFinite(ts) ? ts : Math.floor(Date.now() / 1000);
+
+    const date = parse(value, format, new Date());
+    const timestamp = getUnixTime(date);
+    return Number.isFinite(timestamp) ? timestamp : Math.floor(Date.now() / 1000);
   }
 }
diff --git a/apps/crawler/src/process/parsers/wordpress.ts b/apps/crawler/src/process/parsers/wordpress.ts
index 54f0391..dd48871 100644
--- a/apps/crawler/src/process/parsers/wordpress.ts
+++ b/apps/crawler/src/process/parsers/wordpress.ts
@@ -1,4 +1,5 @@
 import { logger } from "@basango/logger";
+import { fromUnixTime } from "date-fns";
 import TurndownService from "turndown";
 
 import { FetchCrawlerConfig } from "#crawler/config";
@@ -148,7 +149,7 @@
       body,
       categories,
       link,
-      publishedAt: new Date(timestamp * 1000),
+      publishedAt: fromUnixTime(timestamp),
       sourceId: this.source.sourceId,
       title,
     },
diff --git a/apps/crawler/src/schema.ts b/apps/crawler/src/schema.ts
index aef3a57..17a8308 100644
--- a/apps/crawler/src/schema.ts
+++ b/apps/crawler/src/schema.ts
@@ -58,8 +58,6 @@ export const DateRangeSpecSchema = z
 
 export const SourceDateSchema = z.object({
   format: z.string().default("yyyy-LL-dd HH:mm"),
-  pattern: z.string().nullable().optional(),
-  replacement: z.string().nullable().optional(),
 });
 
 const BaseSourceSchema = z.object({
diff --git a/apps/crawler/src/scripts/sync.ts b/apps/crawler/src/scripts/sync.ts
index 9b0a1cd..22bc632 100644
--- a/apps/crawler/src/scripts/sync.ts
+++ b/apps/crawler/src/scripts/sync.ts
@@ -6,7 +6,7 @@ import { parseArgs } from "node:util";
 import { logger } from "@basango/logger";
 
 import { config, env } from "#crawler/config";
-import { SyncHttpClient } from "#crawler/http/http-client";
+import { HttpError, SyncHttpClient } from "#crawler/http/http-client";
 import type { Article } from "#crawler/schema";
 
 const USAGE = `
@@ -43,7 +43,13 @@ const forwardArticle = async (article: Article): Promise<void> => {
 
     logger.error({ link: article.link, status: response.status }, "Forwarding failed");
   } catch (error) {
-    logger.error({ error, link: article.link }, "Failed to forward article");
+    if (error instanceof HttpError) {
+      const data = await error.response.json();
+      logger.error({ ...data, link: article.link }, "Error forwarding article");
+      return;
+    }
+
+    logger.error({ error, link: article.link }, "Error forwarding article");
   }
 };
 
diff --git a/apps/crawler/src/utils.ts b/apps/crawler/src/utils.ts
index ebac549..459fc78 100644
--- a/apps/crawler/src/utils.ts
+++ b/apps/crawler/src/utils.ts
@@ -145,3 +145,17 @@ export const createAbsoluteUrl = (base: string, href: string): string => {
     return href;
   }
 };
+
+/**
+ * Extracts the domain name from a URL.
+ * @param url - The URL string
+ * @returns The domain name, or null if the URL is invalid
+ */
+export const extractDomainName = (url: string): string | null => {
+  try {
+    const parsed = new URL(url);
+    return parsed.hostname;
+  } catch {
+    return null;
+  }
+};