feat(db): cursor based pagination
This commit is contained in:
@@ -20,6 +20,20 @@ type TargetOptions = {
|
|||||||
ignoreColumns?: Record<string, string[]>;
|
ignoreColumns?: Record<string, string[]>;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
type CursorEncoding = "string" | "number" | "buffer";
|
||||||
|
|
||||||
|
type SyncProgress = {
|
||||||
|
offset: number;
|
||||||
|
cursor: string | null;
|
||||||
|
cursorEncoding: CursorEncoding | null;
|
||||||
|
};
|
||||||
|
|
||||||
|
type SourceRow = {
|
||||||
|
data: Record<string, unknown>;
|
||||||
|
cursor: string | null;
|
||||||
|
cursorEncoding: CursorEncoding | null;
|
||||||
|
};
|
||||||
|
|
||||||
const DEFAULT_IGNORE: Record<string, string[]> = {
|
const DEFAULT_IGNORE: Record<string, string[]> = {
|
||||||
article: ["tsv", "image", "excerpt", "bias", "reliability", "transparency"],
|
article: ["tsv", "image", "excerpt", "bias", "reliability", "transparency"],
|
||||||
source: ["bias", "reliability", "transparency"],
|
source: ["bias", "reliability", "transparency"],
|
||||||
@@ -56,19 +70,21 @@ class Engine {
|
|||||||
async import(table: string): Promise<number> {
|
async import(table: string): Promise<number> {
|
||||||
await this.ensureProgressTable();
|
await this.ensureProgressTable();
|
||||||
|
|
||||||
let startOffset = 0;
|
let startState: SyncProgress = { cursor: null, cursorEncoding: null, offset: 0 };
|
||||||
if (this.resume) {
|
if (this.resume) {
|
||||||
startOffset = await this.getProgressOffset(table);
|
startState = await this.getProgressState(table);
|
||||||
console.log(`Resuming import for ${table} from offset=${startOffset}`);
|
console.log(
|
||||||
|
`Resuming import for ${table} from offset=${startState.offset}, cursor=${startState.cursor ?? "null"}`,
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
await this.reset(table);
|
await this.reset(table);
|
||||||
await this.setProgressOffset(table, 0);
|
await this.setProgressState(table, startState);
|
||||||
}
|
}
|
||||||
|
|
||||||
return await this.paste(table, this.copy(table, startOffset), startOffset);
|
return await this.paste(table, this.copy(table, startState), startState);
|
||||||
}
|
}
|
||||||
|
|
||||||
private async *copy(table: string, startOffset = 0): AsyncGenerator<Record<string, unknown>> {
|
private async *copy(table: string, state: SyncProgress): AsyncGenerator<SourceRow> {
|
||||||
const mysql = await import("mysql2/promise");
|
const mysql = await import("mysql2/promise");
|
||||||
|
|
||||||
const source = mysql.createPool({
|
const source = mysql.createPool({
|
||||||
@@ -81,23 +97,49 @@ class Engine {
|
|||||||
user: this.sourceOptions.user,
|
user: this.sourceOptions.user,
|
||||||
});
|
});
|
||||||
|
|
||||||
let offset = startOffset;
|
let offset = state.offset ?? 0;
|
||||||
const size = this.pageSize;
|
const size = this.pageSize;
|
||||||
|
let cursorParam = this.decodeCursorValue(state.cursor, state.cursorEncoding);
|
||||||
|
let useCursor = cursorParam != null;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
while (true) {
|
while (true) {
|
||||||
const [rows] = await source.query<RowDataPacket[]>(
|
let sql: string;
|
||||||
`SELECT * FROM \`${this.escapeBacktick(table)}\` ORDER BY \`id\` LIMIT ? OFFSET ?`,
|
let params: unknown[];
|
||||||
[size, offset],
|
if (useCursor && cursorParam != null) {
|
||||||
);
|
sql = `SELECT * FROM \`${this.escapeBacktick(table)}\` WHERE \`id\` > ? ORDER BY \`id\` LIMIT ?`;
|
||||||
|
params = [cursorParam, size];
|
||||||
|
} else {
|
||||||
|
sql = `SELECT * FROM \`${this.escapeBacktick(table)}\` ORDER BY \`id\` LIMIT ? OFFSET ?`;
|
||||||
|
params = [size, offset];
|
||||||
|
}
|
||||||
|
|
||||||
|
const [rows] = await source.query<RowDataPacket[]>(sql, params);
|
||||||
|
|
||||||
if (!rows || rows.length === 0) break;
|
if (!rows || rows.length === 0) break;
|
||||||
|
|
||||||
for (const row of rows) {
|
for (const row of rows) {
|
||||||
yield row as Record<string, unknown>;
|
const cursorInfo = this.encodeCursorValue((row as Record<string, unknown>).id);
|
||||||
|
yield {
|
||||||
|
cursor: cursorInfo.value,
|
||||||
|
cursorEncoding: cursorInfo.encoding,
|
||||||
|
data: row as Record<string, unknown>,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
offset += rows.length;
|
offset += rows.length;
|
||||||
if (rows.length < size) break;
|
if (rows.length < size) break;
|
||||||
|
|
||||||
|
const last = rows[rows.length - 1] as Record<string, unknown>;
|
||||||
|
const lastCursor = this.encodeCursorValue(last.id);
|
||||||
|
const decoded = this.decodeCursorValue(lastCursor.value, lastCursor.encoding);
|
||||||
|
if (decoded != null) {
|
||||||
|
cursorParam = decoded;
|
||||||
|
useCursor = true;
|
||||||
|
} else if (useCursor) {
|
||||||
|
cursorParam = null;
|
||||||
|
useCursor = false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
try {
|
try {
|
||||||
@@ -108,8 +150,8 @@ class Engine {
|
|||||||
|
|
||||||
private async paste(
|
private async paste(
|
||||||
table: string,
|
table: string,
|
||||||
rows: AsyncGenerator<Record<string, unknown>>,
|
rows: AsyncGenerator<SourceRow>,
|
||||||
startOffset = 0,
|
startState: SyncProgress,
|
||||||
): Promise<number> {
|
): Promise<number> {
|
||||||
const target = await this.target.connect();
|
const target = await this.target.connect();
|
||||||
let total = 0;
|
let total = 0;
|
||||||
@@ -117,16 +159,19 @@ class Engine {
|
|||||||
let columns: string[] | null = null;
|
let columns: string[] | null = null;
|
||||||
let insertSql = "";
|
let insertSql = "";
|
||||||
let upsertSql = "";
|
let upsertSql = "";
|
||||||
|
let lastCursor = startState.cursor ?? null;
|
||||||
|
let lastCursorEncoding = startState.cursorEncoding ?? null;
|
||||||
|
|
||||||
const ignored = this.ignoredColumnsFor(table);
|
const ignored = this.ignoredColumnsFor(table);
|
||||||
const ignoredSet = new Set(ignored);
|
const ignoredSet = new Set(ignored);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for await (let row of rows) {
|
for await (const entry of rows) {
|
||||||
|
const row = entry.data;
|
||||||
|
const transformed = this.transformRowForTarget(table, row);
|
||||||
if (!columns) {
|
if (!columns) {
|
||||||
row = this.transformRowForTarget(table, row);
|
|
||||||
// Filter ignored columns and build column order
|
// Filter ignored columns and build column order
|
||||||
columns = Object.keys(row).filter((c) => !ignoredSet.has(c));
|
columns = Object.keys(transformed).filter((c) => !ignoredSet.has(c));
|
||||||
|
|
||||||
// If article target has credibility but source not, include computed credibility
|
// If article target has credibility but source not, include computed credibility
|
||||||
if (
|
if (
|
||||||
@@ -153,8 +198,6 @@ class Engine {
|
|||||||
await target.query("BEGIN");
|
await target.query("BEGIN");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Row transform and params in column order
|
|
||||||
const transformed = this.transformRowForTarget(table, row);
|
|
||||||
const params = columns!.map((c) => this.valueForColumn(c, transformed));
|
const params = columns!.map((c) => this.valueForColumn(c, transformed));
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -172,9 +215,21 @@ class Engine {
|
|||||||
}
|
}
|
||||||
total++;
|
total++;
|
||||||
inBatch++;
|
inBatch++;
|
||||||
|
if (entry.cursor != null) {
|
||||||
|
lastCursor = entry.cursor;
|
||||||
|
lastCursorEncoding = entry.cursorEncoding ?? null;
|
||||||
|
}
|
||||||
|
|
||||||
if (inBatch >= this.batchSize) {
|
if (inBatch >= this.batchSize) {
|
||||||
await this.setProgressOffset(table, startOffset + total, target);
|
await this.setProgressState(
|
||||||
|
table,
|
||||||
|
{
|
||||||
|
cursor: lastCursor,
|
||||||
|
cursorEncoding: lastCursorEncoding,
|
||||||
|
offset: startState.offset + total,
|
||||||
|
},
|
||||||
|
target,
|
||||||
|
);
|
||||||
await target.query("COMMIT");
|
await target.query("COMMIT");
|
||||||
inBatch = 0;
|
inBatch = 0;
|
||||||
await target.query("BEGIN");
|
await target.query("BEGIN");
|
||||||
@@ -183,7 +238,15 @@ class Engine {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (inBatch > 0) {
|
if (inBatch > 0) {
|
||||||
await this.setProgressOffset(table, startOffset + total, target);
|
await this.setProgressState(
|
||||||
|
table,
|
||||||
|
{
|
||||||
|
cursor: lastCursor,
|
||||||
|
cursorEncoding: lastCursorEncoding,
|
||||||
|
offset: startState.offset + total,
|
||||||
|
},
|
||||||
|
target,
|
||||||
|
);
|
||||||
await target.query("COMMIT");
|
await target.query("COMMIT");
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -398,6 +461,30 @@ class Engine {
|
|||||||
return new Date();
|
return new Date();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private encodeCursorValue(value: unknown): {
|
||||||
|
encoding: CursorEncoding | null;
|
||||||
|
value: string | null;
|
||||||
|
} {
|
||||||
|
if (value == null) return { encoding: null, value: null };
|
||||||
|
if (Buffer.isBuffer(value)) {
|
||||||
|
return { encoding: "buffer", value: value.toString("hex") };
|
||||||
|
}
|
||||||
|
if (typeof value === "number" || typeof value === "bigint") {
|
||||||
|
return { encoding: "number", value: String(value) };
|
||||||
|
}
|
||||||
|
return { encoding: "string", value: String(value) };
|
||||||
|
}
|
||||||
|
|
||||||
|
private decodeCursorValue(value: string | null, encoding: CursorEncoding | null): unknown {
|
||||||
|
if (!value || !encoding) return null;
|
||||||
|
if (encoding === "buffer") return Buffer.from(value, "hex");
|
||||||
|
if (encoding === "number") {
|
||||||
|
const parsed = Number(value);
|
||||||
|
return Number.isNaN(parsed) ? null : parsed;
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
private quote(id: string) {
|
private quote(id: string) {
|
||||||
const norm = this.normalizedName(id);
|
const norm = this.normalizedName(id);
|
||||||
return `"${norm.replaceAll('"', '""')}"`;
|
return `"${norm.replaceAll('"', '""')}"`;
|
||||||
@@ -414,38 +501,53 @@ class Engine {
|
|||||||
`CREATE TABLE IF NOT EXISTS "__sync_state" (
|
`CREATE TABLE IF NOT EXISTS "__sync_state" (
|
||||||
table_name text PRIMARY KEY,
|
table_name text PRIMARY KEY,
|
||||||
last_offset integer NOT NULL DEFAULT 0,
|
last_offset integer NOT NULL DEFAULT 0,
|
||||||
|
last_cursor text,
|
||||||
|
cursor_encoding text,
|
||||||
updated_at timestamp NOT NULL DEFAULT NOW()
|
updated_at timestamp NOT NULL DEFAULT NOW()
|
||||||
)`,
|
)`,
|
||||||
);
|
);
|
||||||
|
await client.query(`ALTER TABLE "__sync_state" ADD COLUMN IF NOT EXISTS last_cursor text`);
|
||||||
|
await client.query(
|
||||||
|
`ALTER TABLE "__sync_state" ADD COLUMN IF NOT EXISTS cursor_encoding text`,
|
||||||
|
);
|
||||||
} finally {
|
} finally {
|
||||||
client.release();
|
client.release();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private async getProgressOffset(table: string): Promise<number> {
|
private async getProgressState(table: string): Promise<SyncProgress> {
|
||||||
const client = await this.target.connect();
|
const client = await this.target.connect();
|
||||||
try {
|
try {
|
||||||
const { rows } = await client.query<{ last_offset: number }>(
|
const { rows } = await client.query<{
|
||||||
`SELECT last_offset FROM "__sync_state" WHERE table_name = $1`,
|
cursor_encoding: CursorEncoding | null;
|
||||||
|
last_cursor: string | null;
|
||||||
|
last_offset: number;
|
||||||
|
}>(
|
||||||
|
`SELECT last_offset, last_cursor, cursor_encoding FROM "__sync_state" WHERE table_name = $1`,
|
||||||
[this.normalizedName(table)],
|
[this.normalizedName(table)],
|
||||||
);
|
);
|
||||||
return rows[0]?.last_offset ?? 0;
|
const row = rows[0];
|
||||||
|
return {
|
||||||
|
cursor: row?.last_cursor ?? null,
|
||||||
|
cursorEncoding: row?.cursor_encoding ?? null,
|
||||||
|
offset: row?.last_offset ?? 0,
|
||||||
|
};
|
||||||
} finally {
|
} finally {
|
||||||
client.release();
|
client.release();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private async setProgressOffset(
|
private async setProgressState(
|
||||||
table: string,
|
table: string,
|
||||||
offset: number,
|
state: SyncProgress,
|
||||||
client?: PoolClient,
|
client?: PoolClient,
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
const run = async (c: PoolClient) => {
|
const run = async (c: PoolClient) => {
|
||||||
await c.query(
|
await c.query(
|
||||||
`INSERT INTO "__sync_state" (table_name, last_offset, updated_at)
|
`INSERT INTO "__sync_state" (table_name, last_offset, last_cursor, cursor_encoding, updated_at)
|
||||||
VALUES ($1, $2, NOW())
|
VALUES ($1, $2, $3, $4, NOW())
|
||||||
ON CONFLICT (table_name) DO UPDATE SET last_offset = EXCLUDED.last_offset, updated_at = NOW()`,
|
ON CONFLICT (table_name) DO UPDATE SET last_offset = EXCLUDED.last_offset, last_cursor = EXCLUDED.last_cursor, cursor_encoding = EXCLUDED.cursor_encoding, updated_at = NOW()`,
|
||||||
[this.normalizedName(table), offset],
|
[this.normalizedName(table), state.offset, state.cursor, state.cursorEncoding],
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
if (client) return run(client);
|
if (client) return run(client);
|
||||||
|
|||||||
Reference in New Issue
Block a user