189 lines
5.4 KiB
TypeScript
189 lines
5.4 KiB
TypeScript
import {
|
|
AnySourceOptions,
|
|
HtmlSourceOptions,
|
|
WordPressSourceOptions,
|
|
config,
|
|
} from "@basango/domain/config";
|
|
import { DEFAULT_DATE_FORMAT } from "@basango/domain/constants";
|
|
import {
|
|
DateSpecSchema,
|
|
PageRange,
|
|
PageRangeSchema,
|
|
PageSpecSchema,
|
|
TimestampRange,
|
|
TimestampRangeSchema,
|
|
} from "@basango/domain/models";
|
|
import logger from "@basango/logger";
|
|
import { format, fromUnixTime, getUnixTime, isMatch, parse } from "date-fns";
|
|
import type { RedisOptions } from "ioredis";
|
|
|
|
import { getSourceUpdateDates } from "./process/persistence";
|
|
|
|
/**
 * Look up a crawler source definition by its identifier.
 *
 * The HTML sources are searched first; the WordPress sources are consulted
 * only when no HTML source matches.
 *
 * @param id - The source ID to look for
 * @returns The first configured source whose `sourceId` equals `id`
 * @throws Error when neither list contains a matching source
 */
export const resolveSourceConfig = (id: string): AnySourceOptions => {
  const { html, wordpress } = config.crawler.sources;
  const hasRequestedId = (candidate: HtmlSourceOptions | WordPressSourceOptions): boolean =>
    candidate.sourceId === id;

  const match = html.find(hasRequestedId) ?? wordpress.find(hasRequestedId);
  if (match !== undefined) {
    return match;
  }

  throw new Error(`Source '${id}' not found in configuration`);
};
|
|
|
|
/**
 * Fill in `settings.dateRange` from the update dates stored for the source.
 *
 * Nothing happens when a date range is already present or when no source is
 * given. Otherwise the dates returned by `getSourceUpdateDates` are used:
 * - "backward": `end` is the earliest date and `start` is the current time.
 * - "forward": `start` is the latest date and `end` is the current time; the
 *   range is left unset when no latest date is available.
 *
 * The `settings` object is mutated in place and nothing is returned.
 *
 * @param settings - Crawl settings whose `dateRange` may be filled in
 */
export const resolveSourceUpdateDates = async (settings: {
  dateRange?: TimestampRange;
  direction: "forward" | "backward";
  source?: AnySourceOptions;
}) => {
  const { direction, source } = settings;

  // An explicit range always wins, and without a source there is nothing to look up.
  if (settings.dateRange !== undefined || !source) {
    return;
  }

  const dates = await getSourceUpdateDates(source.sourceId);

  if (direction === "backward") {
    // NOTE(review): `start` (now) is later than `end` whenever the earliest
    // date lies in the past, and `dates.earliest` is used without the presence
    // check that the forward branch applies to `dates.latest` - confirm both
    // are intended.
    const range = {
      end: getUnixTime(dates.earliest),
      start: getUnixTime(new Date()),
    };
    settings.dateRange = range;
    logger.info(
      { dateRange: range, sourceId: source.sourceId },
      "Set date range start from earliest published date",
    );
    return;
  }

  if (direction === "forward" && dates.latest) {
    const range = {
      end: getUnixTime(new Date()),
      start: getUnixTime(dates.latest),
    };
    settings.dateRange = range;
    logger.info(
      { dateRange: range, sourceId: source.sourceId },
      "Set date range start from latest published date",
    );
  }
};
|
|
|
|
/**
 * Parse a Redis URL into RedisOptions.
 *
 * Only input starting with `redis://` is parsed; anything else (including
 * `rediss://`) yields an empty options object. The password is
 * percent-decoded because `URL#password` exposes the credential in its
 * encoded form (e.g. `p%40ss` for `p@ss`).
 *
 * @param url - The Redis URL (e.g., "redis://:password@localhost:6379/0")
 * @returns Options holding `db`, `host`, `password` and `port`; `db` falls
 *   back to 0 and `port` to 6379 when the URL omits them
 * @throws TypeError when `url` starts with `redis://` but cannot be parsed
 */
export const parseRedisUrl = (url: string): RedisOptions => {
  if (!url.startsWith("redis://")) {
    return {};
  }

  const parsed = new URL(url);

  // Decode reserved characters (`@`, `:`, `/`, ...) that must be escaped in a URL.
  let password: string | undefined;
  if (parsed.password) {
    try {
      password = decodeURIComponent(parsed.password);
    } catch {
      // Malformed escape sequence: keep the raw value instead of failing.
      password = parsed.password;
    }
  }

  return {
    // Only the leading slash is stripped; an empty path selects database 0.
    db: Number(parsed.pathname.replace("/", "") || 0),
    host: parsed.hostname,
    password,
    port: Number(parsed.port || 6379),
  };
};
|
|
|
|
/**
 * Turn a date string into a `Date`, rejecting anything that does not fit the
 * given format.
 *
 * @param value - The date string to parse
 * @param format - The date format the string must follow
 * @returns The parsed date
 * @throws Error when `value` does not match `format` or parses to an invalid date
 */
const parseDate = (value: string, format: string): Date => {
  const reject = (): never => {
    throw new Error(`Invalid date '${value}' for format '${format}'`);
  };

  if (!isMatch(value, format)) {
    return reject();
  }

  const result = parse(value, format, new Date());
  return Number.isNaN(result.getTime()) ? reject() : result;
};
|
|
|
|
/**
 * Build a page range from its textual form.
 *
 * The text is first run through `PageSpecSchema` and the result is then
 * validated by `PageRangeSchema`.
 *
 * @param spec - The page range specification (e.g., "1:10")
 * @returns The validated page range, or `undefined` when `spec` is empty or missing
 */
export const createPageRange = (spec: string | undefined): PageRange | undefined =>
  spec ? PageRangeSchema.parse(PageSpecSchema.parse(spec)) : undefined;
|
|
|
|
/**
 * Build a Unix-timestamp range from a textual date range.
 *
 * The first occurrence of the separator is replaced by ":" before the text is
 * handed to `DateSpecSchema`; both resulting dates are parsed with the chosen
 * format, converted to Unix time and validated by `TimestampRangeSchema`.
 *
 * @param spec - The date range specification (e.g., "2023-01-01:2023-12-31")
 * @param options - Optional `format` (defaults to DEFAULT_DATE_FORMAT) and
 *   `separator` (defaults to ":")
 * @returns The validated range, or `undefined` when `spec` is empty or missing
 * @throws Error when the separator is empty or a date does not fit the format
 */
export const createTimestampRange = (
  spec: string | undefined,
  options: {
    format?: string;
    separator?: string;
  } = {},
): TimestampRange | undefined => {
  if (!spec) {
    return undefined;
  }

  const dateFormat = options.format === undefined ? DEFAULT_DATE_FORMAT : options.format;
  const separator = options.separator === undefined ? ":" : options.separator;
  if (!separator) {
    throw new Error("Separator cannot be empty");
  }

  const { startRaw, endRaw } = DateSpecSchema.parse(spec.replace(separator, ":"));
  const start = getUnixTime(parseDate(startRaw, dateFormat));
  const end = getUnixTime(parseDate(endRaw, dateFormat));

  return TimestampRangeSchema.parse({ end, start });
};
|
|
|
|
/**
 * Render a timestamp range as "<start>:<end>".
 *
 * Each bound is converted from Unix time to a date and formatted with `fmt`.
 *
 * @param range - The timestamp range
 * @param fmt - The date format (default: DEFAULT_DATE_FORMAT)
 * @returns The two formatted dates joined by a colon
 */
export const formatTimestampRange = (range: TimestampRange, fmt = DEFAULT_DATE_FORMAT): string => {
  const render = (seconds: number): string => format(fromUnixTime(seconds), fmt);

  return [render(range.start), render(range.end)].join(":");
};
|
|
|
|
/**
 * Render a page range as "<start>:<end>".
 *
 * @param range - The page range
 * @returns The start and end values joined by a colon
 */
export const formatPageRange = (range: PageRange): string => {
  const { start, end } = range;
  return `${start}:${end}`;
};
|
|
|
|
/**
 * Tell whether a timestamp lies inside a range, both bounds included.
 *
 * @param range - The timestamp range
 * @param timestamp - The timestamp to check
 * @returns `true` when `range.start <= timestamp <= range.end`
 */
export const isTimestampInRange = (range: TimestampRange, timestamp: number): boolean => {
  const { start, end } = range;
  const notBeforeStart = timestamp >= start;
  const notAfterEnd = end >= timestamp;

  return notBeforeStart && notAfterEnd;
};
|
|
|
|
/**
 * Resolve a link against a base URL.
 *
 * A trailing slash is appended to the base when it lacks one, so a relative
 * link is resolved underneath the base path rather than next to it. When the
 * combination cannot be parsed as a URL, the link is returned unchanged.
 *
 * @param base - The base URL
 * @param href - The relative or absolute URL
 * @returns The absolute URL, or `href` itself when resolution fails
 */
export const createAbsoluteUrl = (base: string, href: string): string => {
  let resolved: string;

  try {
    const root = base.endsWith("/") ? base : `${base}/`;
    // The URL constructor resolves relative links against the given root.
    resolved = new URL(href, root).toString();
  } catch {
    resolved = href;
  }

  return resolved;
};
|