feat(db): add tokens sync
This commit is contained in:
@@ -27,7 +27,8 @@
|
|||||||
"private": true,
|
"private": true,
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"clean": "rm -rf .turbo node_modules",
|
"clean": "rm -rf .turbo node_modules",
|
||||||
"sync:import": "bun ./src/importer/import.ts",
|
"sync:data": "bun ./src/synchronizers/data.ts",
|
||||||
|
"sync:tokens": "bun ./src/synchronizers/tokens.ts",
|
||||||
"typecheck": "tsc --noEmit"
|
"typecheck": "tsc --noEmit"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,36 +0,0 @@
|
|||||||
#!/usr/bin/env bun
|
|
||||||
|
|
||||||
import { env } from "@/config";
|
|
||||||
import { Engine } from "@/importer";
|
|
||||||
|
|
||||||
async function main() {
|
|
||||||
const engine = new Engine(
|
|
||||||
{
|
|
||||||
database: env("BASANGO_SOURCE_DATABASE_NAME"),
|
|
||||||
host: env("BASANGO_SOURCE_DATABASE_HOST"),
|
|
||||||
password: env("BASANGO_SOURCE_DATABASE_PASS"),
|
|
||||||
user: env("BASANGO_SOURCE_DATABASE_USER"),
|
|
||||||
},
|
|
||||||
{
|
|
||||||
database: env("BASANGO_DATABASE_URL"),
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
try {
|
|
||||||
const tables = process.argv.slice(2);
|
|
||||||
if (tables.length === 0) tables.push("user", "source", "article");
|
|
||||||
for (const t of tables) {
|
|
||||||
const count = await engine.import(t);
|
|
||||||
console.log(`Imported ${count} records into ${t} table.`);
|
|
||||||
}
|
|
||||||
console.log("Import completed successfully");
|
|
||||||
process.exit(0);
|
|
||||||
} finally {
|
|
||||||
await engine.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
main().catch((err) => {
|
|
||||||
console.error(err?.message ?? err);
|
|
||||||
process.exit(1);
|
|
||||||
});
|
|
||||||
@@ -1,2 +0,0 @@
|
|||||||
export * from "./engine";
|
|
||||||
export * from "./import";
|
|
||||||
@@ -1,6 +1,9 @@
|
|||||||
|
#!/usr/bin/env bun
|
||||||
|
|
||||||
import { RowDataPacket } from "mysql2/promise";
|
import { RowDataPacket } from "mysql2/promise";
|
||||||
import { Pool, PoolClient } from "pg";
|
import { Pool, PoolClient } from "pg";
|
||||||
|
|
||||||
|
import { env } from "@/config";
|
||||||
import { computeReadingTime } from "@/utils/computed";
|
import { computeReadingTime } from "@/utils/computed";
|
||||||
|
|
||||||
type SourceOptions = {
|
type SourceOptions = {
|
||||||
@@ -22,31 +25,7 @@ const DEFAULT_IGNORE: Record<string, string[]> = {
|
|||||||
source: ["bias", "reliability", "transparency"],
|
source: ["bias", "reliability", "transparency"],
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
class Engine {
|
||||||
* Engine
|
|
||||||
*
|
|
||||||
* Coordinates copying rows from a MySQL source into a PostgreSQL target in a
|
|
||||||
* controlled, transactional, batched manner.
|
|
||||||
*
|
|
||||||
* Responsibilities:
|
|
||||||
* - Establish and manage a connection pool to the target PostgreSQL database.
|
|
||||||
* - Stream rows from a MySQL source (via a temporary pool) using pagination.
|
|
||||||
* - Transform row values to match target expectations (UUID normalization,
|
|
||||||
* timestamp fallback, array parsing for categories/roles, computed JSON
|
|
||||||
* credibility, etc.).
|
|
||||||
* - Filter out ignored columns based on a configurable ignore map.
|
|
||||||
* - Insert rows into the target in configurable batch sizes with transactional
|
|
||||||
* commits every batch to limit long-running transactions.
|
|
||||||
* - Provide a safe reset operation that truncates the target table and manages
|
|
||||||
* session replication role toggling for Postgres.
|
|
||||||
*
|
|
||||||
* @param sourceOptions - connection and authentication options for the MySQL
|
|
||||||
* source (database, host, user, password, etc.).
|
|
||||||
* @param targetOptions - configuration for the Postgres target including
|
|
||||||
* connection string (database), optional pageSize, batchSize and per-table
|
|
||||||
* ignoreColumns map.
|
|
||||||
*/
|
|
||||||
export class Engine {
|
|
||||||
private readonly target: Pool;
|
private readonly target: Pool;
|
||||||
private readonly ignore: Record<string, string[]>;
|
private readonly ignore: Record<string, string[]>;
|
||||||
private readonly pageSize: number;
|
private readonly pageSize: number;
|
||||||
@@ -293,15 +272,6 @@ export class Engine {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: this is to heavy for the import process need to find a way to compute on creation or later
|
|
||||||
// if (t === "article") {
|
|
||||||
// clone.token_statistics = computeTokenStatistics({
|
|
||||||
// body: String(clone.body ?? ""),
|
|
||||||
// categories: Array.isArray(clone.categories) ? clone.categories : [],
|
|
||||||
// title: String(clone.title ?? ""),
|
|
||||||
// });
|
|
||||||
// }
|
|
||||||
|
|
||||||
return clone;
|
return clone;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -438,3 +408,35 @@ async function safeRollback(client: PoolClient) {
|
|||||||
await client.query("ROLLBACK");
|
await client.query("ROLLBACK");
|
||||||
} catch {}
|
} catch {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
const engine = new Engine(
|
||||||
|
{
|
||||||
|
database: env("BASANGO_SOURCE_DATABASE_NAME"),
|
||||||
|
host: env("BASANGO_SOURCE_DATABASE_HOST"),
|
||||||
|
password: env("BASANGO_SOURCE_DATABASE_PASS"),
|
||||||
|
user: env("BASANGO_SOURCE_DATABASE_USER"),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
database: env("BASANGO_DATABASE_URL"),
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const tables = process.argv.slice(2);
|
||||||
|
if (tables.length === 0) tables.push("user", "source", "article");
|
||||||
|
for (const t of tables) {
|
||||||
|
const count = await engine.import(t);
|
||||||
|
console.log(`Imported ${count} records into ${t} table.`);
|
||||||
|
}
|
||||||
|
console.log("Import completed successfully");
|
||||||
|
process.exit(0);
|
||||||
|
} finally {
|
||||||
|
await engine.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
main().catch((err) => {
|
||||||
|
console.error(err?.message ?? err);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
@@ -0,0 +1,131 @@
|
|||||||
|
#!/usr/bin/env bun
|
||||||
|
|
||||||
|
import { Pool } from "pg";
|
||||||
|
|
||||||
|
import { env } from "@/config";
|
||||||
|
import { computeTokenStatistics } from "@/utils/computed";
|
||||||
|
|
||||||
|
type ArticleRow = {
|
||||||
|
id: string;
|
||||||
|
title: string;
|
||||||
|
body: string;
|
||||||
|
categories: string[];
|
||||||
|
};
|
||||||
|
|
||||||
|
class Engine {
|
||||||
|
private readonly db: Pool;
|
||||||
|
private readonly pageSize: number = 1000;
|
||||||
|
private readonly batchSize: number = 50;
|
||||||
|
|
||||||
|
constructor(private readonly database: string) {
|
||||||
|
this.db = new Pool({
|
||||||
|
allowExitOnIdle: true,
|
||||||
|
connectionString: this.database,
|
||||||
|
max: 16,
|
||||||
|
});
|
||||||
|
console.log(
|
||||||
|
`Engine initialized with pageSize=${this.pageSize} and batchSize=${this.batchSize}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
async synchronize() {
|
||||||
|
const client = await this.db.connect();
|
||||||
|
console.log("Starting token statistics computation...");
|
||||||
|
|
||||||
|
try {
|
||||||
|
let cursor: string | null = null; // keyset pagination cursor on id (DESC)
|
||||||
|
for (;;) {
|
||||||
|
const params: Array<number | string> = [this.pageSize];
|
||||||
|
const sql: string = cursor
|
||||||
|
? `SELECT id, title, body, COALESCE(categories, ARRAY[]::text[]) AS categories
|
||||||
|
FROM article
|
||||||
|
WHERE token_statistics IS NULL AND id < $2
|
||||||
|
ORDER BY id DESC
|
||||||
|
LIMIT $1`
|
||||||
|
: `SELECT id, title, body, COALESCE(categories, ARRAY[]::text[]) AS categories
|
||||||
|
FROM article
|
||||||
|
WHERE token_statistics IS NULL
|
||||||
|
ORDER BY id DESC
|
||||||
|
LIMIT $1`;
|
||||||
|
if (cursor) params.push(cursor);
|
||||||
|
|
||||||
|
const { rows } = await client.query<ArticleRow>(sql, params);
|
||||||
|
|
||||||
|
if (rows.length === 0) break;
|
||||||
|
const ids: string[] = [];
|
||||||
|
const statistics: string[] = [];
|
||||||
|
|
||||||
|
for (const r of rows) {
|
||||||
|
ids.push(r.id);
|
||||||
|
statistics.push(
|
||||||
|
JSON.stringify(
|
||||||
|
computeTokenStatistics({
|
||||||
|
body: r.body,
|
||||||
|
categories: r.categories,
|
||||||
|
title: r.title,
|
||||||
|
}),
|
||||||
|
),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`Apply updates in transactional sub-batches...`);
|
||||||
|
for (let i = 0; i < ids.length; i += this.batchSize) {
|
||||||
|
const idsChunk = ids.slice(i, i + this.batchSize);
|
||||||
|
const statsChunk = statistics.slice(i, i + this.batchSize);
|
||||||
|
|
||||||
|
if (idsChunk.length === 0) continue;
|
||||||
|
|
||||||
|
try {
|
||||||
|
await client.query("BEGIN");
|
||||||
|
await client.query(
|
||||||
|
`UPDATE article AS a
|
||||||
|
SET token_statistics = u.token_statistics, updated_at = NOW()
|
||||||
|
FROM (
|
||||||
|
SELECT
|
||||||
|
UNNEST($1::uuid[]) AS id,
|
||||||
|
(UNNEST($2::text[]))::jsonb AS token_statistics
|
||||||
|
) AS u
|
||||||
|
WHERE a.id = u.id`,
|
||||||
|
[idsChunk, statsChunk],
|
||||||
|
);
|
||||||
|
await client.query("COMMIT");
|
||||||
|
console.log(`Updated batch: size=${idsChunk.length}`);
|
||||||
|
} catch (e) {
|
||||||
|
try {
|
||||||
|
await client.query("ROLLBACK");
|
||||||
|
} catch {}
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// advance cursor to the lowest id in this page (since we order DESC)
|
||||||
|
cursor = rows[rows.length - 1]!.id;
|
||||||
|
console.log(`Processed page: updated=${ids.length}`);
|
||||||
|
console.log(`Advancing cursor to id < ${cursor}`);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
client.release();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async close() {
|
||||||
|
await this.db.end();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
const engine = new Engine(env("BASANGO_DATABASE_URL"));
|
||||||
|
|
||||||
|
try {
|
||||||
|
await engine.synchronize();
|
||||||
|
console.log("Token statistics computation completed successfully");
|
||||||
|
process.exit(0);
|
||||||
|
} finally {
|
||||||
|
await engine.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
main().catch((err) => {
|
||||||
|
console.error(err?.message ?? err);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
Reference in New Issue
Block a user