feat(crawler): sync local data

2025-11-14 14:31:32 +02:00
parent 4ec2a608b1
commit 05a78913c0
9 changed files with 81 additions and 75 deletions
@@ -12,18 +12,7 @@ import { appRouter } from "#api/trpc/routers/_app";

 const app = new OpenAPIHono();

-app.use(async (c, next) => {
-  const data = await c.req.json();
-
-  console.log("Incoming Request:", {
-    data: data,
-    headers: c.req.header,
-    method: c.req.method,
-    url: c.req.url,
-  });
-
-  return next();
-});
+app.use(logger());
 app.use(secureHeaders());

 app.use(
@@ -38,10 +38,16 @@ export const createArticleSchema = z
      example: "https://example.com/article",
    }),
    metadata: metadataSchema.optional(),
-    publishedAt: z.date().openapi({
-      description: "The publication date of the article.",
-      example: "2023-01-01T00:00:00Z",
-    }),
+    publishedAt: z
+      .string()
+      .refine((value) => !Number.isNaN(Date.parse(value)), {
+        message: "Invalid date format",
+      })
+      .transform((value) => new Date(value))
+      .openapi({
+        description: "The publication date of the article in ISO 8601 format.",
+        example: "2023-01-01T00:00:00Z",
+      }),
    sourceId: z.string().openapi({
      description: "The unique identifier of the source from which the article was crawled.",
      example: "radiookapi.net",
@@ -5,16 +5,13 @@
        "paginationTemplate": "actualite",
        "requiresDetails": true,
        "requiresRateLimit": false,
-        "sourceDate": {
-          "pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
-          "replacement": "$3-$2-$1 $4"
-        },
+        "sourceDate": {},
        "sourceId": "radiookapi.net",
        "sourceKind": "html",
        "sourceSelectors": {
          "articleBody": ".field-name-body",
          "articleCategories": ".views-field-field-cat-gorie a",
-          "articleDate": ".views-field-created",
+          "articleDate": "head > meta[property=\"article:published_time\"]",
          "articleLink": ".views-field-title a",
          "articles": ".view-content > .views-row.content-row",
          "articleTitle": "h1.page-header",
@@ -26,17 +23,14 @@
      {
        "categories": ["politique", "economie", "culture", "sport", "societe"],
        "paginationTemplate": "index.php/category/{category}",
-        "requiresDetails": false,
+        "requiresDetails": true,
        "requiresRateLimit": false,
-        "sourceDate": {
-          "pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
-          "replacement": "$3-$2-$1 $4"
-        },
+        "sourceDate": {},
        "sourceId": "7sur7.cd",
        "sourceKind": "html",
        "sourceSelectors": {
-          "articleBody": ".field.field--name-body",
-          "articleDate": ".views-field-created",
+          "articleBody": "div[property=\"schema:text\"].field.field--name-body",
+          "articleDate": "head > meta[property=\"article:published_time\"]",
          "articleLink": ".views-field-title a",
          "articles": ".view-content > .row.views-row",
          "articleTitle": ".views-field-title a",
@@ -50,7 +44,7 @@
        "requiresDetails": true,
        "requiresRateLimit": false,
        "sourceDate": {
-          "format": "%d.%m.%Y %H:%M"
+          "format": "dd.MM.yyyy"
        },
        "sourceId": "mediacongo.net",
        "sourceKind": "html",
@@ -70,16 +64,13 @@
        "paginationTemplate": "actualite",
        "requiresDetails": true,
        "requiresRateLimit": false,
-        "sourceDate": {
-          "pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/",
-          "replacement": "$4-$3-$2 $5"
-        },
+        "sourceDate": {},
        "sourceId": "actualite.cd",
        "sourceKind": "html",
        "sourceSelectors": {
-          "articleBody": ".views-field.views-field-body",
+          "articleBody": ".views-field.views-field-body .field-content",
          "articleCategories": "#actu-cat",
-          "articleDate": "#p-date",
+          "articleDate": "head > meta[property=\"article:published_time\"]",
          "articleLink": "#actu-titre a",
          "articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
          "articleTitle": "h1.page-title"
@@ -5,6 +5,7 @@ import { OPEN_GRAPH_USER_AGENT } from "#crawler/constants";
 import { SyncHttpClient } from "#crawler/http/http-client";
 import { UserAgents } from "#crawler/http/user-agent";
 import { ArticleMetadata } from "#crawler/schema";
+import { createAbsoluteUrl } from "#crawler/utils";

 /**
 * Picks the first non-empty value from the provided array.
@@ -71,7 +72,7 @@ export class OpenGraph {
   * @param html - HTML content as a string
   * @param url - Optional URL of the page
   */
-  static consumeHtml(html: string, url?: string): ArticleMetadata | undefined {
+  static consumeHtml(html: string, url: string): ArticleMetadata | undefined {
    if (!html) {
      return undefined;
    }
@@ -95,9 +96,9 @@ export class OpenGraph {

    return {
      description,
-      image,
+      image: createAbsoluteUrl(url, image ?? "") || undefined,
      title,
-      url: canonical,
+      url: createAbsoluteUrl(url, canonical ?? "") || undefined,
    };
  }
 }
@@ -1,5 +1,5 @@
 import { logger } from "@basango/logger";
-import { getUnixTime, isMatch as isDateMatch, parse as parseDateFns } from "date-fns";
+import { fromUnixTime, getUnixTime, isMatch as isDateMatch, parse } from "date-fns";
 import { HTMLElement } from "node-html-parser";
 import TurndownService from "turndown";

@@ -21,19 +21,6 @@ const md = new TurndownService({
  hr: "---",
 });

-/**
- * Create a safe RegExp from the given pattern.
- * @param pattern
- */
-const safeRegExp = (pattern?: string | null): RegExp | null => {
-  if (!pattern) return null;
-  try {
-    return new RegExp(pattern, "g");
-  } catch {
-    return null;
-  }
-};
-
 /**
 * Crawler for generic HTML pages.
 */
@@ -123,10 +110,10 @@ export class HtmlCrawler extends BaseCrawler {
    const root = this.parseHtml(html);
    const selectors = this.source.sourceSelectors;

-    const title = this.extractText(root, selectors.articleTitle) ?? "Untitled";
+    const title = this.extractText(root, selectors.articleTitle);
    const link = this.currentNode ?? this.extractLink(root);
-    if (!link) {
-      throw new InvalidArticleError("Missing article link");
+    if (!link || !title) {
+      throw new InvalidArticleError("Missing article link or title");
    }

    const body = this.extractBody(root, selectors.articleBody);
@@ -148,7 +135,7 @@ export class HtmlCrawler extends BaseCrawler {
        body,
        categories,
        link,
-        publishedAt: new Date(timestamp * 1000),
+        publishedAt: fromUnixTime(timestamp),
        sourceId: this.source.sourceId,
        title,
      },
@@ -271,6 +258,19 @@ export class HtmlCrawler extends BaseCrawler {
      const pick = (alt ?? title ?? "").trim();
      if (pick.length > 0) return pick;
    }
+
+    // If it's a time tag, prefer datetime attribute
+    if (tag === "time") {
+      const datetime = target.getAttribute("datetime");
+      if (datetime) return datetime.trim();
+    }
+
+    // If it's a meta tag, prefer content attribute
+    if (tag === "meta") {
+      const content = target.getAttribute("content");
+      if (content) return content.trim();
+    }
+
    return this.textContent(target);
  }

@@ -296,7 +296,9 @@ export class HtmlCrawler extends BaseCrawler {
   * @param selector - The CSS selector
   */
  private extractCategories(root: HTMLElement, selector?: string | null): string[] {
+    if (!selector && this.settings.category) return [this.settings.category.toLowerCase()];
    if (!selector) return [];
+
    const values: string[] = [];
    for (const node of this.extractAll(root, selector)) {
      const text = this.textContent(node);
@@ -314,24 +316,22 @@ export class HtmlCrawler extends BaseCrawler {
   */
  private computeTimestamp(raw?: string | null): number {
    if (!raw) return Math.floor(Date.now() / 1000);
-    let value = raw.trim();
-    const pattern = safeRegExp(this.source.sourceDate?.pattern);
-    const replacement = this.source.sourceDate?.replacement ?? "";
-    if (pattern) {
-      try {
-        value = value.replace(pattern, replacement);
-      } catch {
-        // ignore pattern failures
-      }
+    const value = raw.trim(); // trimmed copy used by the isDateMatch / Date.parse / parse paths below
+
+    const format = this.source.sourceDate.format; // format string later handed to date-fns isMatch/parse
+    if (format === "dd.MM.yyyy") { // this one date-only format is parsed by hand instead of through date-fns
+      const [day, month, year] = raw.split(".").map(Number); // NOTE(review): splits `raw`, not `value`; a part carrying extra text (e.g. "2025 14:31") converts to NaN
+      const timestamp = getUnixTime(new Date(year!, month! - 1, day)); // Date months are 0-based, hence `month - 1`
+      return Number.isFinite(timestamp) ? timestamp : Math.floor(Date.now() / 1000); // non-finite result falls back to the current time
    }
-    const format = this.source.sourceDate?.format ?? "yyyy-LL-dd HH:mm";
+
    if (!isDateMatch(value, format)) {
-      // fallback: try native Date.parse as last resort
      const parsed = Date.parse(value);
      return Number.isNaN(parsed) ? Math.floor(Date.now() / 1000) : Math.floor(parsed / 1000);
    }
-    const date = parseDateFns(value, format, new Date());
-    const ts = getUnixTime(date);
-    return Number.isFinite(ts) ? ts : Math.floor(Date.now() / 1000);
+
+    const date = parse(value, format, new Date()); // reached only when `value` matched `format` above
+    const timestamp = getUnixTime(date);
+    return Number.isFinite(timestamp) ? timestamp : Math.floor(Date.now() / 1000); // same current-time fallback as the branches above
  }
 }
@@ -1,4 +1,5 @@
 import { logger } from "@basango/logger";
+import { fromUnixTime } from "date-fns";
 import TurndownService from "turndown";

 import { FetchCrawlerConfig } from "#crawler/config";
@@ -148,7 +149,7 @@ export class WordPressCrawler extends BaseCrawler {
        body,
        categories,
        link,
-        publishedAt: new Date(timestamp * 1000),
+        publishedAt: fromUnixTime(timestamp),
        sourceId: this.source.sourceId,
        title,
      },
@@ -58,8 +58,6 @@ export const DateRangeSpecSchema = z

 export const SourceDateSchema = z.object({
  format: z.string().default("yyyy-LL-dd HH:mm"),
-  pattern: z.string().nullable().optional(),
-  replacement: z.string().nullable().optional(),
 });

 const BaseSourceSchema = z.object({
@@ -6,7 +6,7 @@ import { parseArgs } from "node:util";
 import { logger } from "@basango/logger";

 import { config, env } from "#crawler/config";
-import { SyncHttpClient } from "#crawler/http/http-client";
+import { HttpError, SyncHttpClient } from "#crawler/http/http-client";
 import type { Article } from "#crawler/schema";

 const USAGE = `
@@ -43,7 +43,13 @@ const forwardArticle = async (article: Article): Promise<void> => {

    logger.error({ link: article.link, status: response.status }, "Forwarding failed");
  } catch (error) {
-    logger.error({ error, link: article.link }, "Failed to forward article");
+    if (error instanceof HttpError) {
+      const data = await error.response.json();
+      logger.error({ ...data, link: article.link }, "Error forwarding article");
+      return;
+    }
+
+    logger.error({ error, link: article.link }, "Error forwarding article");
  }
 };

@@ -145,3 +145,17 @@ export const createAbsoluteUrl = (base: string, href: string): string => {
    return href;
  }
 };
+
+/**
+ * Extract the hostname from an absolute URL string (returned exactly as `URL.hostname` reports it).
+ * @param url - The URL string, parsed with the `URL` constructor
+ * @returns The `hostname` of the parsed URL, or null if the `URL` constructor throws
+ */
+export const extractDomainName = (url: string): string | null => {
+  try {
+    const parsed = new URL(url);
+    return parsed.hostname;
+  } catch {
+    return null;
+  }
+};