feat(crawler): sync local data
This commit is contained in:
+1
-12
@@ -12,18 +12,7 @@ import { appRouter } from "#api/trpc/routers/_app";
|
||||
|
||||
const app = new OpenAPIHono();
|
||||
|
||||
app.use(async (c, next) => {
|
||||
const data = await c.req.json();
|
||||
|
||||
console.log("Incoming Request:", {
|
||||
data: data,
|
||||
headers: c.req.header,
|
||||
method: c.req.method,
|
||||
url: c.req.url,
|
||||
});
|
||||
|
||||
return next();
|
||||
});
|
||||
app.use(logger());
|
||||
app.use(secureHeaders());
|
||||
|
||||
app.use(
|
||||
|
||||
@@ -38,10 +38,16 @@ export const createArticleSchema = z
|
||||
example: "https://example.com/article",
|
||||
}),
|
||||
metadata: metadataSchema.optional(),
|
||||
publishedAt: z.date().openapi({
|
||||
description: "The publication date of the article.",
|
||||
example: "2023-01-01T00:00:00Z",
|
||||
}),
|
||||
publishedAt: z
|
||||
.string()
|
||||
.refine((value) => !Number.isNaN(Date.parse(value)), {
|
||||
message: "Invalid date format",
|
||||
})
|
||||
.transform((value) => new Date(value))
|
||||
.openapi({
|
||||
description: "The publication date of the article in ISO 8601 format.",
|
||||
example: "2023-01-01T00:00:00Z",
|
||||
}),
|
||||
sourceId: z.string().openapi({
|
||||
description: "The unique identifier of the source from which the article was crawled.",
|
||||
example: "radiookapi.net",
|
||||
|
||||
@@ -5,16 +5,13 @@
|
||||
"paginationTemplate": "actualite",
|
||||
"requiresDetails": true,
|
||||
"requiresRateLimit": false,
|
||||
"sourceDate": {
|
||||
"pattern": "/(\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
|
||||
"replacement": "$3-$2-$1 $4"
|
||||
},
|
||||
"sourceDate": {},
|
||||
"sourceId": "radiookapi.net",
|
||||
"sourceKind": "html",
|
||||
"sourceSelectors": {
|
||||
"articleBody": ".field-name-body",
|
||||
"articleCategories": ".views-field-field-cat-gorie a",
|
||||
"articleDate": ".views-field-created",
|
||||
"articleDate": "head > meta[property=\"article:published_time\"]",
|
||||
"articleLink": ".views-field-title a",
|
||||
"articles": ".view-content > .views-row.content-row",
|
||||
"articleTitle": "h1.page-header",
|
||||
@@ -26,17 +23,14 @@
|
||||
{
|
||||
"categories": ["politique", "economie", "culture", "sport", "societe"],
|
||||
"paginationTemplate": "index.php/category/{category}",
|
||||
"requiresDetails": false,
|
||||
"requiresDetails": true,
|
||||
"requiresRateLimit": false,
|
||||
"sourceDate": {
|
||||
"pattern": "/\\w{3} (\\d{2})/(\\d{2})/(\\d{4}) - (\\d{2}:\\d{2})/",
|
||||
"replacement": "$3-$2-$1 $4"
|
||||
},
|
||||
"sourceDate": {},
|
||||
"sourceId": "7sur7.cd",
|
||||
"sourceKind": "html",
|
||||
"sourceSelectors": {
|
||||
"articleBody": ".field.field--name-body",
|
||||
"articleDate": ".views-field-created",
|
||||
"articleBody": "div[property=\"schema:text\"].field.field--name-body",
|
||||
"articleDate": "head > meta[property=\"article:published_time\"]",
|
||||
"articleLink": ".views-field-title a",
|
||||
"articles": ".view-content > .row.views-row",
|
||||
"articleTitle": ".views-field-title a",
|
||||
@@ -50,7 +44,7 @@
|
||||
"requiresDetails": true,
|
||||
"requiresRateLimit": false,
|
||||
"sourceDate": {
|
||||
"format": "%d.%m.%Y %H:%M"
|
||||
"format": "dd.MM.yyyy"
|
||||
},
|
||||
"sourceId": "mediacongo.net",
|
||||
"sourceKind": "html",
|
||||
@@ -70,16 +64,13 @@
|
||||
"paginationTemplate": "actualite",
|
||||
"requiresDetails": true,
|
||||
"requiresRateLimit": false,
|
||||
"sourceDate": {
|
||||
"pattern": "/(\\d{1}) (\\d{1,2}) (\\d{2}) (\\d{4}) - (\\d{2}:\\d{2})/",
|
||||
"replacement": "$4-$3-$2 $5"
|
||||
},
|
||||
"sourceDate": {},
|
||||
"sourceId": "actualite.cd",
|
||||
"sourceKind": "html",
|
||||
"sourceSelectors": {
|
||||
"articleBody": ".views-field.views-field-body",
|
||||
"articleBody": ".views-field.views-field-body .field-content",
|
||||
"articleCategories": "#actu-cat",
|
||||
"articleDate": "#p-date",
|
||||
"articleDate": "head > meta[property=\"article:published_time\"]",
|
||||
"articleLink": "#actu-titre a",
|
||||
"articles": "#views-bootstrap-taxonomy-term-page-2 > div > div",
|
||||
"articleTitle": "h1.page-title"
|
||||
|
||||
@@ -5,6 +5,7 @@ import { OPEN_GRAPH_USER_AGENT } from "#crawler/constants";
|
||||
import { SyncHttpClient } from "#crawler/http/http-client";
|
||||
import { UserAgents } from "#crawler/http/user-agent";
|
||||
import { ArticleMetadata } from "#crawler/schema";
|
||||
import { createAbsoluteUrl } from "#crawler/utils";
|
||||
|
||||
/**
|
||||
* Picks the first non-empty value from the provided array.
|
||||
@@ -71,7 +72,7 @@ export class OpenGraph {
|
||||
* @param html - HTML content as a string
|
||||
* @param url - Optional URL of the page
|
||||
*/
|
||||
static consumeHtml(html: string, url?: string): ArticleMetadata | undefined {
|
||||
static consumeHtml(html: string, url: string): ArticleMetadata | undefined {
|
||||
if (!html) {
|
||||
return undefined;
|
||||
}
|
||||
@@ -95,9 +96,9 @@ export class OpenGraph {
|
||||
|
||||
return {
|
||||
description,
|
||||
image,
|
||||
image: createAbsoluteUrl(url, image ?? "") || undefined,
|
||||
title,
|
||||
url: canonical,
|
||||
url: createAbsoluteUrl(url, canonical ?? "") || undefined,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { logger } from "@basango/logger";
|
||||
import { getUnixTime, isMatch as isDateMatch, parse as parseDateFns } from "date-fns";
|
||||
import { fromUnixTime, getUnixTime, isMatch as isDateMatch, parse } from "date-fns";
|
||||
import { HTMLElement } from "node-html-parser";
|
||||
import TurndownService from "turndown";
|
||||
|
||||
@@ -21,19 +21,6 @@ const md = new TurndownService({
|
||||
hr: "---",
|
||||
});
|
||||
|
||||
/**
 * Compiles a pattern string into a global RegExp without ever throwing.
 *
 * Patterns may be given either as a bare expression source (`\d+`) or in
 * delimiter-wrapped notation (`/\d+/i`). The wrapped form is unwrapped first;
 * passing it to `new RegExp` as-is would make the slashes literal characters
 * that the input has to contain.
 *
 * @param pattern - Regular-expression source, optionally wrapped as `/source/flags`
 * @returns A RegExp carrying the `g` flag (plus any flags given in the wrapped
 *   form), or `null` when `pattern` is empty, `null`/`undefined`, or invalid
 */
const safeRegExp = (pattern?: string | null): RegExp | null => {
  if (!pattern) return null;

  // Greedy body: the closing delimiter is the LAST slash, so slashes inside
  // the expression itself (e.g. in dd/MM/yyyy patterns) stay part of the source.
  const wrapped = /^\/([\s\S]+)\/([dgimsuy]*)$/.exec(pattern);
  const source = wrapped?.[1] ?? pattern;

  // Always global; de-duplicate so a wrapped "g" does not make RegExp throw.
  const flags = [...new Set(`g${wrapped?.[2] ?? ""}`)].join("");

  try {
    return new RegExp(source, flags);
  } catch {
    // Invalid expression or unsupported flag: callers treat null as "no pattern".
    return null;
  }
};
|
||||
|
||||
/**
|
||||
* Crawler for generic HTML pages.
|
||||
*/
|
||||
@@ -123,10 +110,10 @@ export class HtmlCrawler extends BaseCrawler {
|
||||
const root = this.parseHtml(html);
|
||||
const selectors = this.source.sourceSelectors;
|
||||
|
||||
const title = this.extractText(root, selectors.articleTitle) ?? "Untitled";
|
||||
const title = this.extractText(root, selectors.articleTitle);
|
||||
const link = this.currentNode ?? this.extractLink(root);
|
||||
if (!link) {
|
||||
throw new InvalidArticleError("Missing article link");
|
||||
if (!link || !title) {
|
||||
throw new InvalidArticleError("Missing article link or title");
|
||||
}
|
||||
|
||||
const body = this.extractBody(root, selectors.articleBody);
|
||||
@@ -148,7 +135,7 @@ export class HtmlCrawler extends BaseCrawler {
|
||||
body,
|
||||
categories,
|
||||
link,
|
||||
publishedAt: new Date(timestamp * 1000),
|
||||
publishedAt: fromUnixTime(timestamp),
|
||||
sourceId: this.source.sourceId,
|
||||
title,
|
||||
},
|
||||
@@ -271,6 +258,19 @@ export class HtmlCrawler extends BaseCrawler {
|
||||
const pick = (alt ?? title ?? "").trim();
|
||||
if (pick.length > 0) return pick;
|
||||
}
|
||||
|
||||
// If it's a time tag, prefer datetime attribute
|
||||
if (tag === "time") {
|
||||
const datetime = target.getAttribute("datetime");
|
||||
if (datetime) return datetime.trim();
|
||||
}
|
||||
|
||||
// If it's a meta tag, prefer content attribute
|
||||
if (tag === "meta") {
|
||||
const content = target.getAttribute("content");
|
||||
if (content) return content.trim();
|
||||
}
|
||||
|
||||
return this.textContent(target);
|
||||
}
|
||||
|
||||
@@ -296,7 +296,9 @@ export class HtmlCrawler extends BaseCrawler {
|
||||
* @param selector - The CSS selector
|
||||
*/
|
||||
private extractCategories(root: HTMLElement, selector?: string | null): string[] {
|
||||
if (!selector && this.settings.category) return [this.settings.category.toLowerCase()];
|
||||
if (!selector) return [];
|
||||
|
||||
const values: string[] = [];
|
||||
for (const node of this.extractAll(root, selector)) {
|
||||
const text = this.textContent(node);
|
||||
@@ -314,24 +316,22 @@ export class HtmlCrawler extends BaseCrawler {
|
||||
*/
|
||||
private computeTimestamp(raw?: string | null): number {
  // Every failure path falls back to the current time, in Unix seconds.
  const now = (): number => Math.floor(Date.now() / 1000);

  const value = raw?.trim();
  if (!value) return now();

  const format = this.source.sourceDate.format;

  if (format === "dd.MM.yyyy") {
    // Read only the leading day.month.year. Splitting the whole string on "."
    // turns any trailing text (such as a time of day) into NaN, which would
    // silently replace a perfectly readable date with "now".
    const match = /^(\d{1,2})\.(\d{1,2})\.(\d{4})/.exec(value);
    if (!match) return now();

    const day = Number(match[1]);
    const month = Number(match[2]);
    const year = Number(match[3]);

    // Local midnight of that calendar day; Date months are zero-based.
    const timestamp = getUnixTime(new Date(year, month - 1, day));
    return Number.isFinite(timestamp) ? timestamp : now();
  }

  if (!isDateMatch(value, format)) {
    // fallback: try native Date.parse as last resort
    const parsed = Date.parse(value);
    return Number.isNaN(parsed) ? now() : Math.floor(parsed / 1000);
  }

  const timestamp = getUnixTime(parse(value, format, new Date()));
  return Number.isFinite(timestamp) ? timestamp : now();
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { logger } from "@basango/logger";
|
||||
import { fromUnixTime } from "date-fns";
|
||||
import TurndownService from "turndown";
|
||||
|
||||
import { FetchCrawlerConfig } from "#crawler/config";
|
||||
@@ -148,7 +149,7 @@ export class WordPressCrawler extends BaseCrawler {
|
||||
body,
|
||||
categories,
|
||||
link,
|
||||
publishedAt: new Date(timestamp * 1000),
|
||||
publishedAt: fromUnixTime(timestamp),
|
||||
sourceId: this.source.sourceId,
|
||||
title,
|
||||
},
|
||||
|
||||
@@ -58,8 +58,6 @@ export const DateRangeSpecSchema = z
|
||||
|
||||
export const SourceDateSchema = z.object({
|
||||
format: z.string().default("yyyy-LL-dd HH:mm"),
|
||||
pattern: z.string().nullable().optional(),
|
||||
replacement: z.string().nullable().optional(),
|
||||
});
|
||||
|
||||
const BaseSourceSchema = z.object({
|
||||
|
||||
@@ -6,7 +6,7 @@ import { parseArgs } from "node:util";
|
||||
import { logger } from "@basango/logger";
|
||||
|
||||
import { config, env } from "#crawler/config";
|
||||
import { SyncHttpClient } from "#crawler/http/http-client";
|
||||
import { HttpError, SyncHttpClient } from "#crawler/http/http-client";
|
||||
import type { Article } from "#crawler/schema";
|
||||
|
||||
const USAGE = `
|
||||
@@ -43,7 +43,13 @@ const forwardArticle = async (article: Article): Promise<void> => {
|
||||
|
||||
logger.error({ link: article.link, status: response.status }, "Forwarding failed");
|
||||
} catch (error) {
|
||||
logger.error({ error, link: article.link }, "Failed to forward article");
|
||||
if (error instanceof HttpError) {
|
||||
const data = await error.response.json();
|
||||
logger.error({ ...data, link: article.link }, "Error forwarding article");
|
||||
return;
|
||||
}
|
||||
|
||||
logger.error({ error, link: article.link }, "Error forwarding article");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -145,3 +145,17 @@ export const createAbsoluteUrl = (base: string, href: string): string => {
|
||||
return href;
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Extracts the host name from an absolute URL.
 *
 * @param url - The absolute URL string to inspect
 * @returns The URL's hostname (port excluded, subdomains such as `www.` kept),
 *   or `null` when `url` is not a parseable absolute URL or has no host
 *   (for example `mailto:` or `file:///` URLs)
 */
export const extractDomainName = (url: string): string | null => {
  try {
    const { hostname } = new URL(url);
    // Host-less schemes parse successfully but yield an empty hostname;
    // report them as "no domain" instead of leaking "" to callers.
    return hostname.length > 0 ? hostname : null;
  } catch {
    // `new URL` throws on relative or malformed input.
    return null;
  }
};
|
||||
|
||||
Reference in New Issue
Block a user