feat: articles clusters

This commit is contained in:
2025-12-03 15:54:38 +02:00
parent 1d062f679b
commit 78c27b8220
20 changed files with 2113 additions and 51 deletions
@@ -1,4 +1,4 @@
import { SidebarInset, SidebarProvider } from "@basango/ui/components/sidebar"; import { SidebarProvider } from "@basango/ui/components/sidebar";
import { AppSidebar } from "#dashboard/components/sidebar/app-sidebar"; import { AppSidebar } from "#dashboard/components/sidebar/app-sidebar";
import { HydrateClient } from "#dashboard/trpc/server"; import { HydrateClient } from "#dashboard/trpc/server";
@@ -9,7 +9,7 @@ export default async function Layout({ children }: { children: React.ReactNode }
<SidebarProvider> <SidebarProvider>
<AppSidebar /> <AppSidebar />
<SidebarInset>{children}</SidebarInset> {children}
</SidebarProvider> </SidebarProvider>
</HydrateClient> </HydrateClient>
); );
@@ -4,11 +4,9 @@ import {
BreadcrumbList, BreadcrumbList,
BreadcrumbPage, BreadcrumbPage,
} from "@basango/ui/components/breadcrumb"; } from "@basango/ui/components/breadcrumb";
import { Separator } from "@basango/ui/components/separator";
import { SidebarTrigger } from "@basango/ui/components/sidebar"; import { SidebarTrigger } from "@basango/ui/components/sidebar";
import { Show } from "#dashboard/components/shell/show"; import { ThemeToggle } from "../theme-toggle";
import { ThemeToggle } from "#dashboard/components/theme-toggle";
type Props = { type Props = {
title?: string | React.ReactNode; title?: string | React.ReactNode;
@@ -16,20 +14,16 @@ type Props = {
export function PageHeader({ title }: Props) { export function PageHeader({ title }: Props) {
return ( return (
<header className="flex h-16 shrink-0 items-center justify-between gap-2"> <header className="w-full flex justify-between items-center border-b py-2 px-6 h-14">
<div className="flex items-center gap-2"> <div className="flex items-center gap-2">
<SidebarTrigger className="-ml-1" /> <SidebarTrigger className="-ml-1" />
<Breadcrumb>
<Show when={title !== undefined}> <BreadcrumbList>
<Separator className="mr-2 data-[orientation=vertical]:h-4" orientation="vertical" /> <BreadcrumbItem className="hidden md:block">
<Breadcrumb> <BreadcrumbPage className="font-bold">{title}</BreadcrumbPage>
<BreadcrumbList> </BreadcrumbItem>
<BreadcrumbItem className="hidden md:block"> </BreadcrumbList>
<BreadcrumbPage>{title}</BreadcrumbPage> </Breadcrumb>
</BreadcrumbItem>
</BreadcrumbList>
</Breadcrumb>
</Show>
</div> </div>
<div className="flex items-center gap-2"> <div className="flex items-center gap-2">
<ThemeToggle /> <ThemeToggle />
@@ -11,9 +11,13 @@ interface PageProps {
export const PageLayout = (props: React.PropsWithChildren<PageProps>) => { export const PageLayout = (props: React.PropsWithChildren<PageProps>) => {
const { title, header = <PageHeader title={title} />, children } = props; const { title, header = <PageHeader title={title} />, children } = props;
return ( return (
<div className="flex flex-1 flex-col gap-4 p-4 pt-0"> <div className="h-svh overflow-hidden lg:p-2 w-full">
{header} <div className="lg:border lg:rounded-md overflow-hidden flex flex-col items-center justify-start h-full w-full">
{children} {header}
<div className="overflow-auto w-full h-[calc(100svh-40px)] lg:h-[calc(100svh-56px)] p-4 space-y-6">
{children}
</div>
</div>
</div> </div>
); );
}; };
+21
View File
@@ -143,10 +143,13 @@
"packages/db": { "packages/db": {
"name": "@basango/db", "name": "@basango/db",
"dependencies": { "dependencies": {
"@ai-sdk/google": "^2.0.44",
"@ai-sdk/openai": "^2.0.75",
"@basango/domain": "workspace:*", "@basango/domain": "workspace:*",
"@basango/encryption": "workspace:*", "@basango/encryption": "workspace:*",
"@basango/logger": "workspace:*", "@basango/logger": "workspace:*",
"@date-fns/utc": "^2.1.1", "@date-fns/utc": "^2.1.1",
"ai": "^5.0.105",
"date-fns": "catalog:", "date-fns": "catalog:",
"drizzle-orm": "^0.44.7", "drizzle-orm": "^0.44.7",
"mysql2": "^3.15.3", "mysql2": "^3.15.3",
@@ -257,6 +260,16 @@
"packages": { "packages": {
"@0no-co/graphql.web": ["@0no-co/graphql.web@1.2.0", "", { "peerDependencies": { "graphql": "^14.0.0 || ^15.0.0 || ^16.0.0" }, "optionalPeers": ["graphql"] }, "sha512-/1iHy9TTr63gE1YcR5idjx8UREz1s0kFhydf3bBLCXyqjhkIc6igAzTOx3zPifCwFR87tsh/4Pa9cNts6d2otw=="], "@0no-co/graphql.web": ["@0no-co/graphql.web@1.2.0", "", { "peerDependencies": { "graphql": "^14.0.0 || ^15.0.0 || ^16.0.0" }, "optionalPeers": ["graphql"] }, "sha512-/1iHy9TTr63gE1YcR5idjx8UREz1s0kFhydf3bBLCXyqjhkIc6igAzTOx3zPifCwFR87tsh/4Pa9cNts6d2otw=="],
"@ai-sdk/gateway": ["@ai-sdk/gateway@2.0.17", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.18", "@vercel/oidc": "3.0.5" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-oVAG6q72KsjKlrYdLhWjRO7rcqAR8CjokAbYuyVZoCO4Uh2PH/VzZoxZav71w2ipwlXhHCNaInGYWNs889MMDA=="],
"@ai-sdk/google": ["@ai-sdk/google@2.0.44", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.18" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-c5dck36FjqiVoeeMJQLTEmUheoURcGTU/nBT6iJu8/nZiKFT/y8pD85KMDRB7RerRYaaQOtslR2d6/5PditiRw=="],
"@ai-sdk/openai": ["@ai-sdk/openai@2.0.75", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.18" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-ThDHg1+Jes7S0AOXa01EyLBSzZiZwzB5do9vAlufNkoiRHGTH1BmoShrCyci/TUsg4ky1HwbK4hPK+Z0isiE6g=="],
"@ai-sdk/provider": ["@ai-sdk/provider@2.0.0", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="],
"@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.18", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-ypv1xXMsgGcNKUP+hglKqtdDuMg68nWHucPPAhIENrbFAI+xCHiqPVN8Zllxyv1TNZwGWUghPxJXU+Mqps0YRQ=="],
"@alloc/quick-lru": ["@alloc/quick-lru@5.2.0", "", {}, "sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw=="], "@alloc/quick-lru": ["@alloc/quick-lru@5.2.0", "", {}, "sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw=="],
"@asteasolutions/zod-to-openapi": ["@asteasolutions/zod-to-openapi@8.1.0", "", { "dependencies": { "openapi3-ts": "^4.1.2" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-tQFxVs05J/6QXXqIzj6rTRk3nj1HFs4pe+uThwE95jL5II2JfpVXkK+CqkO7aT0Do5AYqO6LDrKpleLUFXgY+g=="], "@asteasolutions/zod-to-openapi": ["@asteasolutions/zod-to-openapi@8.1.0", "", { "dependencies": { "openapi3-ts": "^4.1.2" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-tQFxVs05J/6QXXqIzj6rTRk3nj1HFs4pe+uThwE95jL5II2JfpVXkK+CqkO7aT0Do5AYqO6LDrKpleLUFXgY+g=="],
@@ -1083,6 +1096,8 @@
"@urql/exchange-retry": ["@urql/exchange-retry@1.3.2", "", { "dependencies": { "@urql/core": "^5.1.2", "wonka": "^6.3.2" } }, "sha512-TQMCz2pFJMfpNxmSfX1VSfTjwUIFx/mL+p1bnfM1xjjdla7Z+KnGMW/EhFbpckp3LyWAH4PgOsMwOMnIN+MBFg=="], "@urql/exchange-retry": ["@urql/exchange-retry@1.3.2", "", { "dependencies": { "@urql/core": "^5.1.2", "wonka": "^6.3.2" } }, "sha512-TQMCz2pFJMfpNxmSfX1VSfTjwUIFx/mL+p1bnfM1xjjdla7Z+KnGMW/EhFbpckp3LyWAH4PgOsMwOMnIN+MBFg=="],
"@vercel/oidc": ["@vercel/oidc@3.0.5", "", {}, "sha512-fnYhv671l+eTTp48gB4zEsTW/YtRgRPnkI2nT7x6qw5rkI1Lq2hTmQIpHPgyThI0znLK+vX2n9XxKdXZ7BUbbw=="],
"@xmldom/xmldom": ["@xmldom/xmldom@0.8.11", "", {}, "sha512-cQzWCtO6C8TQiYl1ruKNn2U6Ao4o4WBBcbL61yJl84x+j5sOWWFU9X7DpND8XZG3daDppSsigMdfAIl2upQBRw=="], "@xmldom/xmldom": ["@xmldom/xmldom@0.8.11", "", {}, "sha512-cQzWCtO6C8TQiYl1ruKNn2U6Ao4o4WBBcbL61yJl84x+j5sOWWFU9X7DpND8XZG3daDppSsigMdfAIl2upQBRw=="],
"JSONStream": ["JSONStream@1.3.5", "", { "dependencies": { "jsonparse": "^1.2.0", "through": ">=2.2.7 <3" }, "bin": { "JSONStream": "./bin.js" } }, "sha512-E+iruNOY8VV9s4JEbe1aNEm6MiszPRr/UfcHMz0TQh1BXSxHK+ASV1R6W4HpjBhSeS+54PIsAMCBmwD06LLsqQ=="], "JSONStream": ["JSONStream@1.3.5", "", { "dependencies": { "jsonparse": "^1.2.0", "through": ">=2.2.7 <3" }, "bin": { "JSONStream": "./bin.js" } }, "sha512-E+iruNOY8VV9s4JEbe1aNEm6MiszPRr/UfcHMz0TQh1BXSxHK+ASV1R6W4HpjBhSeS+54PIsAMCBmwD06LLsqQ=="],
@@ -1099,6 +1114,8 @@
"aggregate-error": ["aggregate-error@3.1.0", "", { "dependencies": { "clean-stack": "^2.0.0", "indent-string": "^4.0.0" } }, "sha512-4I7Td01quW/RpocfNayFdFVk1qSuoh0E7JrbRJ16nH01HhKFQ88INq9Sd+nd72zqRySlr9BmDA8xlEJ6vJMrYA=="], "aggregate-error": ["aggregate-error@3.1.0", "", { "dependencies": { "clean-stack": "^2.0.0", "indent-string": "^4.0.0" } }, "sha512-4I7Td01quW/RpocfNayFdFVk1qSuoh0E7JrbRJ16nH01HhKFQ88INq9Sd+nd72zqRySlr9BmDA8xlEJ6vJMrYA=="],
"ai": ["ai@5.0.105", "", { "dependencies": { "@ai-sdk/gateway": "2.0.17", "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.18", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-waQZAvv44KYzys6S3l25ti2jcSuJnkyWFTliSKy3swASL6w6ttPxJTm80d+v9sLWoIxrqE3OwhTJbweNp065fg=="],
"ajv": ["ajv@8.17.1", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", "json-schema-traverse": "^1.0.0", "require-from-string": "^2.0.2" } }, "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g=="], "ajv": ["ajv@8.17.1", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", "json-schema-traverse": "^1.0.0", "require-from-string": "^2.0.2" } }, "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g=="],
"anser": ["anser@1.4.10", "", {}, "sha512-hCv9AqTQ8ycjpSd3upOJd7vFwW1JaoYQ7tpham03GJ1ca8/65rqn0RpaWpItOAd6ylW9wAw6luXYPJIyPFVOww=="], "anser": ["anser@1.4.10", "", {}, "sha512-hCv9AqTQ8ycjpSd3upOJd7vFwW1JaoYQ7tpham03GJ1ca8/65rqn0RpaWpItOAd6ylW9wAw6luXYPJIyPFVOww=="],
@@ -1457,6 +1474,8 @@
"eventemitter3": ["eventemitter3@5.0.1", "", {}, "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA=="], "eventemitter3": ["eventemitter3@5.0.1", "", {}, "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA=="],
"eventsource-parser": ["eventsource-parser@3.0.6", "", {}, "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg=="],
"exec-async": ["exec-async@2.2.0", "", {}, "sha512-87OpwcEiMia/DeiKFzaQNBNFeN3XkkpYIh9FyOqq5mS2oKv3CBE67PXoEKcr6nodWdXNogTiQ0jE2NGuoffXPw=="], "exec-async": ["exec-async@2.2.0", "", {}, "sha512-87OpwcEiMia/DeiKFzaQNBNFeN3XkkpYIh9FyOqq5mS2oKv3CBE67PXoEKcr6nodWdXNogTiQ0jE2NGuoffXPw=="],
"execa": ["execa@5.1.1", "", { "dependencies": { "cross-spawn": "^7.0.3", "get-stream": "^6.0.0", "human-signals": "^2.1.0", "is-stream": "^2.0.0", "merge-stream": "^2.0.0", "npm-run-path": "^4.0.1", "onetime": "^5.1.2", "signal-exit": "^3.0.3", "strip-final-newline": "^2.0.0" } }, "sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg=="], "execa": ["execa@5.1.1", "", { "dependencies": { "cross-spawn": "^7.0.3", "get-stream": "^6.0.0", "human-signals": "^2.1.0", "is-stream": "^2.0.0", "merge-stream": "^2.0.0", "npm-run-path": "^4.0.1", "onetime": "^5.1.2", "signal-exit": "^3.0.3", "strip-final-newline": "^2.0.0" } }, "sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg=="],
@@ -1763,6 +1782,8 @@
"json-parse-even-better-errors": ["json-parse-even-better-errors@2.3.1", "", {}, "sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w=="], "json-parse-even-better-errors": ["json-parse-even-better-errors@2.3.1", "", {}, "sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w=="],
"json-schema": ["json-schema@0.4.0", "", {}, "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="],
"json-schema-traverse": ["json-schema-traverse@1.0.0", "", {}, "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="], "json-schema-traverse": ["json-schema-traverse@1.0.0", "", {}, "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="],
"json5": ["json5@2.2.3", "", { "bin": { "json5": "lib/cli.js" } }, "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg=="], "json5": ["json5@2.2.3", "", { "bin": { "json5": "lib/cli.js" } }, "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg=="],
@@ -0,0 +1,20 @@
CREATE TABLE "category" (
"candidates" text[] NOT NULL,
"created_at" timestamp DEFAULT now() NOT NULL,
"description" varchar(512),
"embeddings" jsonb,
"id" uuid PRIMARY KEY NOT NULL,
"name" varchar(255) NOT NULL,
"slug" varchar(255) NOT NULL,
"updated_at" timestamp,
"weight" integer DEFAULT 0 NOT NULL
);
--> statement-breakpoint
ALTER TABLE "article" ADD COLUMN "category_id" uuid;--> statement-breakpoint
ALTER TABLE "article" ADD COLUMN "clustered" boolean DEFAULT false NOT NULL;--> statement-breakpoint
CREATE UNIQUE INDEX "unq_category_name" ON "category" USING btree (lower((name)::text));--> statement-breakpoint
CREATE UNIQUE INDEX "unq_category_slug" ON "category" USING btree (lower((slug)::text));--> statement-breakpoint
CREATE INDEX "idx_category_weight" ON "category" USING btree ("weight");--> statement-breakpoint
ALTER TABLE "article" ADD CONSTRAINT "fk_article_category_id" FOREIGN KEY ("category_id") REFERENCES "public"."category"("id") ON DELETE set null ON UPDATE no action;--> statement-breakpoint
CREATE INDEX "idx_article_category_id" ON "article" USING btree ("category_id");--> statement-breakpoint
CREATE INDEX "idx_article_clustered" ON "article" USING btree ("clustered");
File diff suppressed because it is too large Load Diff
@@ -21,6 +21,13 @@
"tag": "0002_modern_joseph", "tag": "0002_modern_joseph",
"version": "7", "version": "7",
"when": 1763920009482 "when": 1763920009482
},
{
"breakpoints": true,
"idx": 3,
"tag": "0003_categories",
"version": "7",
"when": 1764767993880
} }
], ],
"version": "7" "version": "7"
+4
View File
@@ -1,9 +1,12 @@
{ {
"dependencies": { "dependencies": {
"@ai-sdk/google": "^2.0.44",
"@ai-sdk/openai": "^2.0.75",
"@basango/domain": "workspace:*", "@basango/domain": "workspace:*",
"@basango/encryption": "workspace:*", "@basango/encryption": "workspace:*",
"@basango/logger": "workspace:*", "@basango/logger": "workspace:*",
"@date-fns/utc": "^2.1.1", "@date-fns/utc": "^2.1.1",
"ai": "^5.0.105",
"date-fns": "catalog:", "date-fns": "catalog:",
"drizzle-orm": "^0.44.7", "drizzle-orm": "^0.44.7",
"mysql2": "^3.15.3", "mysql2": "^3.15.3",
@@ -31,6 +34,7 @@
"private": true, "private": true,
"scripts": { "scripts": {
"clean": "rm -rf .turbo node_modules", "clean": "rm -rf .turbo node_modules",
"sync:categories": "bun ./src/synchronizers/categories.ts",
"sync:data": "bun ./src/synchronizers/data.ts", "sync:data": "bun ./src/synchronizers/data.ts",
"sync:tokens": "bun ./src/synchronizers/tokens.ts", "sync:tokens": "bun ./src/synchronizers/tokens.ts",
"typecheck": "tsc --noEmit" "typecheck": "tsc --noEmit"
+18 -5
View File
@@ -11,12 +11,12 @@ import {
} from "@basango/domain/models"; } from "@basango/domain/models";
import { md5 } from "@basango/encryption"; import { md5 } from "@basango/encryption";
import type { SQL } from "drizzle-orm"; import type { SQL } from "drizzle-orm";
import { count, desc, eq, getTableColumns, sql } from "drizzle-orm"; import { count, desc, eq, getTableColumns, or, sql } from "drizzle-orm";
import * as uuid from "uuid"; import * as uuid from "uuid";
import { Database } from "#db/client"; import { Database } from "#db/client";
import { getSourceIdByName } from "#db/queries/sources"; import { getSourceIdByName } from "#db/queries/sources";
import { articles, sources } from "#db/schema"; import { articles, categories, sources } from "#db/schema";
import { CreateArticleParams, GetArticlesParams } from "#db/types/articles"; import { CreateArticleParams, GetArticlesParams } from "#db/types/articles";
import { GetDistributionsParams, GetPublicationsParams } from "#db/types/shared"; import { GetDistributionsParams, GetPublicationsParams } from "#db/types/shared";
import { import {
@@ -41,15 +41,17 @@ export async function createArticle(db: Database, params: CreateArticleParams) {
}; };
} }
const categoryList = params.categories ?? [];
const data = { const data = {
...params, ...params,
categories: categoryList,
hash: md5(params.link), hash: md5(params.link),
readingTime: computeReadingTime(params.body), readingTime: computeReadingTime(params.body),
sentiment: "neutral" as Sentiment, sentiment: (params.sentiment ?? "neutral") as Sentiment,
sourceId: await getSourceIdByName(db, params.sourceId), sourceId: await getSourceIdByName(db, params.sourceId),
tokenStatistics: computeTokenStatistics({ tokenStatistics: computeTokenStatistics({
body: params.body, body: params.body,
categories: params.categories, categories: categoryList,
title: params.title, title: params.title,
}), }),
}; };
@@ -103,7 +105,14 @@ function buildFilters(params: GetArticlesParams, pagination: PaginationState) {
} }
if (params.category) { if (params.category) {
filters.push(sql`${params.category} = ANY(${articles.categories})`); const categoryFilter = or(
eq(categories.slug, params.category),
eq(articles.categoryId, params.category),
);
if (categoryFilter) {
filters.push(categoryFilter);
}
} }
if (params.search?.trim()) { if (params.search?.trim()) {
@@ -133,11 +142,15 @@ export async function getArticles(db: Database, params: GetArticlesParams) {
const query = db const query = db
.select({ .select({
...getTableColumns(articles), ...getTableColumns(articles),
category: {
...getTableColumns(categories),
},
source: { source: {
...getTableColumns(sources), ...getTableColumns(sources),
}, },
}) })
.from(articles) .from(articles)
.leftJoin(categories, eq(articles.categoryId, categories.id))
.innerJoin(sources, eq(articles.sourceId, sources.id)); .innerJoin(sources, eq(articles.sourceId, sources.id));
const rows = await applyFilters(query, filters) const rows = await applyFilters(query, filters)
+13 -12
View File
@@ -5,7 +5,7 @@ import * as uuid from "uuid";
import { Database } from "#db/client"; import { Database } from "#db/client";
import { NotFoundError } from "#db/errors"; import { NotFoundError } from "#db/errors";
import { articles, sources } from "#db/schema"; import { articles, categories, sources } from "#db/schema";
import { import {
CategoryShare, CategoryShare,
CategoryShares, CategoryShares,
@@ -144,17 +144,18 @@ export async function getSourceCategoryShares(
): Promise<CategoryShares> { ): Promise<CategoryShares> {
const data = await db.execute<CategoryShare>(sql` const data = await db.execute<CategoryShare>(sql`
SELECT SELECT
cat AS category, ${categories.id}::text AS "categoryId",
COUNT(*)::int AS count, ${categories.slug} AS slug,
ROUND((COUNT(*)::numeric / SUM(COUNT(*)) OVER ()) * 100, 2) AS percentage ${categories.name} AS category,
FROM ( COUNT(${articles.id})::int AS count,
SELECT NULLIF(BTRIM(c), '') AS cat COALESCE(
FROM ${articles} ROUND((COUNT(*)::numeric / NULLIF(SUM(COUNT(*)) OVER (), 0)) * 100, 2),
CROSS JOIN LATERAL UNNEST(COALESCE(${articles.categories}, ARRAY[]::text[])) AS c 0
WHERE ${articles.sourceId} = ${params.id} )::float AS percentage
) t FROM ${articles}
WHERE cat IS NOT NULL JOIN ${categories} ON ${categories.id} = ${articles.categoryId}
GROUP BY cat WHERE ${articles.sourceId} = ${params.id} AND ${articles.clustered} = true
GROUP BY ${categories.id}, ${categories.slug}, ${categories.name}
ORDER BY count DESC ORDER BY count DESC
LIMIT ${params.limit ?? DEFAULT_CATEGORY_SHARES_LIMIT} LIMIT ${params.limit ?? DEFAULT_CATEGORY_SHARES_LIMIT}
`); `);
+37
View File
@@ -94,11 +94,33 @@ export const sources = pgTable(
], ],
); );
export const categories = pgTable(
"category",
{
candidates: text().array().notNull(),
createdAt: timestamp("created_at").defaultNow().notNull(),
description: varchar({ length: 512 }),
embeddings: jsonb("embeddings").$type<number[]>(),
id: uuid().primaryKey().notNull(),
name: varchar({ length: 255 }).notNull(),
slug: varchar({ length: 255 }).notNull(),
updatedAt: timestamp("updated_at"),
weight: integer().default(0).notNull(),
},
(table) => [
uniqueIndex("unq_category_name").using("btree", sql`lower((name)::text)`),
uniqueIndex("unq_category_slug").using("btree", sql`lower((slug)::text)`),
index("idx_category_weight").using("btree", table.weight.asc().nullsLast()),
],
);
export const articles = pgTable( export const articles = pgTable(
"article", "article",
{ {
body: text().notNull(), body: text().notNull(),
categories: text().array(), categories: text().array(),
categoryId: uuid("category_id"),
clustered: boolean("clustered").default(false).notNull(),
crawledAt: timestamp("crawled_at").defaultNow().notNull(), crawledAt: timestamp("crawled_at").defaultNow().notNull(),
credibility: jsonb("credibility").$type<Credibility>(), credibility: jsonb("credibility").$type<Credibility>(),
excerpt: varchar({ length: 255 }).generatedAlwaysAs(sql`("left"(body, 200) || '...'::text)`), excerpt: varchar({ length: 255 }).generatedAlwaysAs(sql`("left"(body, 200) || '...'::text)`),
@@ -123,6 +145,8 @@ export const articles = pgTable(
"gin", "gin",
table.categories.asc().nullsLast().op("array_ops"), table.categories.asc().nullsLast().op("array_ops"),
), ),
index("idx_article_category_id").using("btree", table.categoryId.asc().nullsLast()),
index("idx_article_clustered").using("btree", table.clustered.asc().nullsLast()),
index("gin_article_link_trgm").using("gin", table.link.asc().nullsLast().op("gin_trgm_ops")), index("gin_article_link_trgm").using("gin", table.link.asc().nullsLast().op("gin_trgm_ops")),
index("gin_article_title_trgm").using("gin", table.title.asc().nullsLast().op("gin_trgm_ops")), index("gin_article_title_trgm").using("gin", table.title.asc().nullsLast().op("gin_trgm_ops")),
index("gin_article_tsv").using("gin", table.tsv.asc().nullsLast().op("tsvector_ops")), index("gin_article_tsv").using("gin", table.tsv.asc().nullsLast().op("tsvector_ops")),
@@ -133,6 +157,11 @@ export const articles = pgTable(
table.id.desc().nullsFirst(), table.id.desc().nullsFirst(),
), ),
uniqueIndex("unq_article_hash").using("btree", table.hash.asc().nullsLast()), uniqueIndex("unq_article_hash").using("btree", table.hash.asc().nullsLast()),
foreignKey({
columns: [table.categoryId],
foreignColumns: [categories.id],
name: "fk_article_category_id",
}).onDelete("set null"),
foreignKey({ foreignKey({
columns: [table.sourceId], columns: [table.sourceId],
foreignColumns: [sources.id], foreignColumns: [sources.id],
@@ -425,6 +454,10 @@ export const commentRelations = relations(comments, ({ one }) => ({
export const articleRelations = relations(articles, ({ one, many }) => ({ export const articleRelations = relations(articles, ({ one, many }) => ({
bookmarkArticles: many(bookmarkArticles), bookmarkArticles: many(bookmarkArticles),
category: one(categories, {
fields: [articles.categoryId],
references: [categories.id],
}),
comments: many(comments), comments: many(comments),
source: one(sources, { source: one(sources, {
fields: [articles.sourceId], fields: [articles.sourceId],
@@ -432,6 +465,10 @@ export const articleRelations = relations(articles, ({ one, many }) => ({
}), }),
})); }));
export const categoryRelations = relations(categories, ({ many }) => ({
articles: many(articles),
}));
export const bookmarkArticleRelations = relations(bookmarkArticles, ({ one }) => ({ export const bookmarkArticleRelations = relations(bookmarkArticles, ({ one }) => ({
article: one(articles, { article: one(articles, {
fields: [bookmarkArticles.articleId], fields: [bookmarkArticles.articleId],
@@ -0,0 +1,218 @@
import { logger } from "@basango/logger";
import { desc, eq, inArray, sql } from "drizzle-orm";
import { Database } from "#db/client";
import { articles, categories } from "#db/schema";
import { DEFAULT_CATEGORY } from "#domain/constants";
import { Categories } from "#domain/models";
/** Row shape returned when selecting from the `category` table. */
type CategoryRow = typeof categories.$inferSelect;
/** The only two article columns the classifier reads: the raw labels and the id. */
type ArticleCategories = Pick<typeof articles.$inferSelect, "categories" | "id">;
/**
 * Running tally for one canonical category while classifying a single article.
 * `matches` counts the distinct normalized labels that mapped to the category;
 * `score` is the category weight added once per match.
 */
type CategoryScore = {
category: (typeof Categories)[number];
matches: number;
score: number;
};
/** Default number of pending articles fetched by one `classifyPendingArticles` call. */
const BATCH_SIZE = 50_000;
/** Category slug -> position in `Categories`; used as the final tie-breaker when scoring. */
const CATEGORY_MAP = new Map(Categories.map((category, index) => [category.slug, index]));
/**
 * Normalized candidate label -> canonical categories listing that label.
 * Evaluated at module load; relies on `buildCandidateMap` being a hoisted function declaration.
 */
const CANDIDATE_MAP = buildCandidateMap();
/**
 * Category whose slug equals `DEFAULT_CATEGORY`, used when no label matches.
 * NOTE(review): the `!` only silences the compiler. If no entry of `Categories` carries that
 * slug this is `undefined` at runtime and later `.slug` reads will throw.
 */
const FALLBACK_CATEGORY = Categories.find((category) => category.slug === DEFAULT_CATEGORY)!;
/**
 * Assigns one canonical category to every article that has not been clustered yet.
 *
 * Classification itself is pure and in-memory (`classifyCategory`); this class only
 * loads the pending rows, makes sure the canonical categories exist in the database
 * and persists the outcome.
 */
export class CategoryClassifier {
  /**
   * Maximum number of article ids bound into a single UPDATE statement.
   * Keeps every statement far below PostgreSQL's 65,535 bound-parameter limit
   * while still collapsing thousands of single-row updates into a few round trips.
   */
  private static readonly UPDATE_CHUNK_SIZE = 1_000;

  constructor(private readonly db: Database) {}

  /**
   * Classifies up to `limit` articles whose `clustered` flag is still false,
   * newest `publishedAt` first.
   *
   * Every processed article ends up with `clustered = true`. Its `categoryId` is the
   * best-scoring canonical category, the fallback category when that row is missing,
   * or `null` when neither row exists in the database.
   *
   * @param limit Maximum number of pending articles to process in this run.
   * @returns Counters for this run: `processed` (rows fetched), `matched`
   *          (rows given a category) and `unmatched` (rows stored with a null category).
   */
  async classifyPendingArticles(limit: number = BATCH_SIZE) {
    const canonical = await this.ensureCanonicalCategories();
    if (canonical.size === 0) {
      logger.warn("No canonical categories available for clustering");
      return { matched: 0, processed: 0, unmatched: 0 };
    }

    const pending = await this.db
      .select({
        categories: articles.categories,
        id: articles.id,
      })
      .from(articles)
      .where(eq(articles.clustered, false))
      .orderBy(desc(articles.publishedAt), desc(articles.id))
      .limit(limit);

    if (pending.length === 0) {
      logger.info("No articles to cluster");
      return { matched: 0, processed: 0, unmatched: 0 };
    }

    const fallbackRow = canonical.get(FALLBACK_CATEGORY.slug);

    // Classify everything in memory first and bucket the articles by the category id
    // they will receive, so persistence needs one UPDATE per chunk instead of one
    // awaited UPDATE per article.
    const buckets = new Map<
      CategoryRow["id"] | null,
      { articleId: ArticleCategories["id"]; best: CategoryScore }[]
    >();
    for (const article of pending) {
      const best = classifyCategory(article);
      const targetRow = canonical.get(best.category.slug) ?? fallbackRow;
      const categoryId = targetRow?.id ?? null;

      const bucket = buckets.get(categoryId);
      if (bucket) {
        bucket.push({ articleId: article.id, best });
      } else {
        buckets.set(categoryId, [{ articleId: article.id, best }]);
      }
    }

    let matched = 0;
    let unmatched = 0;
    const chunkSize = CategoryClassifier.UPDATE_CHUNK_SIZE;

    for (const [categoryId, entries] of buckets) {
      for (let offset = 0; offset < entries.length; offset += chunkSize) {
        const chunk = entries.slice(offset, offset + chunkSize);

        await this.db
          .update(articles)
          .set({
            categoryId,
            clustered: true,
            updatedAt: sql`now()`,
          })
          .where(
            inArray(
              articles.id,
              chunk.map((entry) => entry.articleId),
            ),
          );

        // Count and log only after the chunk has been written, so the counters and
        // debug lines never describe rows that failed to persist.
        for (const { articleId, best } of chunk) {
          if (categoryId !== null) {
            matched++;
            logger.debug(
              {
                articleId,
                category: best.category.slug,
                matches: best.matches,
                score: best.score,
              },
              "Clustered article",
            );
          } else {
            unmatched++;
            logger.debug({ articleId }, "No category match found");
          }
        }
      }
    }

    const processed = pending.length;
    logger.info({ matched, processed, unmatched }, "Category clustering run completed");

    return { matched, processed, unmatched };
  }

  /**
   * Inserts the in-code `Categories` into the `category` table, leaving rows that
   * already exist untouched, then reads back the rows whose slug is canonical.
   *
   * @returns Canonical category rows keyed by slug. A warning is logged when the
   *          fallback category is not among them.
   */
  private async ensureCanonicalCategories(): Promise<Map<string, CategoryRow>> {
    const payload = Categories.map(
      (category) =>
        ({
          candidates: category.candidates,
          description: category.description ?? null,
          embeddings: null,
          id: category.id,
          name: category.name,
          slug: category.slug,
          weight: category.weight,
        }) satisfies typeof categories.$inferInsert,
    );

    // Conflicting rows are skipped, not updated: edits to an existing category in code
    // do not propagate to the database through this call.
    await this.db.insert(categories).values(payload).onConflictDoNothing();

    const existing = await this.db.query.categories.findMany({
      where: inArray(
        categories.slug,
        Categories.map((category) => category.slug),
      ),
    });

    const map = new Map<string, CategoryRow>();
    for (const row of existing) {
      map.set(row.slug, row);
    }

    if (!map.has(FALLBACK_CATEGORY.slug)) {
      logger.warn("Fallback main category is missing from canonical categories");
    }

    return map;
  }
}
/**
 * Picks the canonical category that best matches an article's raw category labels.
 *
 * Each distinct normalized label adds one match and the category weight to every
 * canonical category that lists the label as a candidate. The winner is decided by
 * total score, then category weight, then match count, then position in `Categories`.
 *
 * @param article Article id together with its raw (possibly null) category labels.
 * @returns The winning tally, or a zeroed tally for the fallback category when no label matches.
 */
function classifyCategory(article: ArticleCategories): CategoryScore {
  // Normalize and de-duplicate the labels, keeping first-seen order.
  const distinctLabels = new Set<string>();
  for (const raw of article.categories ?? []) {
    const label = normalizeCategory(raw);
    if (label) {
      distinctLabels.add(label);
    }
  }

  const tally = new Map<string, CategoryScore>();
  for (const label of distinctLabels) {
    for (const category of CANDIDATE_MAP.get(label) ?? []) {
      let entry = tally.get(category.slug);
      if (!entry) {
        entry = { category, matches: 0, score: 0 };
        tally.set(category.slug, entry);
      }
      entry.matches += 1;
      entry.score += category.weight;
    }
  }

  const positionOf = (slug: string): number => CATEGORY_MAP.get(slug) ?? Number.MAX_SAFE_INTEGER;

  // True only when `candidate` strictly beats `winner`; a full tie keeps the current winner.
  const outranks = (candidate: CategoryScore, winner: CategoryScore): boolean => {
    if (candidate.score !== winner.score) {
      return candidate.score > winner.score;
    }
    if (candidate.category.weight !== winner.category.weight) {
      return candidate.category.weight > winner.category.weight;
    }
    if (candidate.matches !== winner.matches) {
      return candidate.matches > winner.matches;
    }
    return positionOf(candidate.category.slug) < positionOf(winner.category.slug);
  };

  let best: CategoryScore | undefined;
  for (const candidate of tally.values()) {
    if (!best || outranks(candidate, best)) {
      best = candidate;
    }
  }

  return best ?? { category: FALLBACK_CATEGORY, matches: 0, score: 0 };
}
/**
 * Builds the lookup from a normalized candidate label to every canonical category
 * that lists it. A category appears at most once per label, and candidates that
 * normalize to nothing are ignored.
 *
 * Must remain a function declaration: it is invoked during module initialisation
 * above its own definition.
 */
function buildCandidateMap(): Map<string, (typeof Categories)[number][]> {
  const index = new Map<string, (typeof Categories)[number][]>();

  Categories.forEach((category) => {
    category.candidates.forEach((candidate) => {
      const key = normalizeCategory(candidate);
      if (!key) {
        return;
      }

      const bucket = index.get(key);
      if (bucket === undefined) {
        index.set(key, [category]);
        return;
      }

      if (bucket.every((item) => item.slug !== category.slug)) {
        bucket.push(category);
      }
    });
  });

  return index;
}
/**
 * Reduces a free-form category label to a comparable key: accents stripped,
 * lower-cased, every run of characters outside `a-z0-9` turned into one space,
 * and surrounding spaces removed.
 *
 * @param value Raw label; may be missing.
 * @returns The normalized key, or `null` when nothing usable remains.
 */
export function normalizeCategory(value?: string | null): string | null {
  if (value == null) {
    return null;
  }

  const source = value.trim();
  if (source.length === 0) {
    return null;
  }

  // Decompose accented letters so the combining marks can be dropped on their own.
  const unaccented = source.normalize("NFD").replace(/\p{Diacritic}/gu, "");
  const spaced = unaccented.toLowerCase().replace(/[^a-z0-9]+/g, " ");
  const collapsed = spaced.trim().replace(/\s+/g, " ");

  return collapsed === "" ? null : collapsed;
}
@@ -0,0 +1,18 @@
#!/usr/bin/env bun
import { logger } from "@basango/logger";
import { connectDb } from "#db/client";
import { CategoryClassifier } from "#db/services/category-classifier.js";
/** Connects to the database and runs one category clustering pass with the default batch size. */
async function main() {
  const classifier = new CategoryClassifier(await connectDb());
  await classifier.classifyPendingArticles();
}

/** Logs the failure and terminates the process with a non-zero exit code. */
const onFailure = (error: unknown) => {
  logger.error({ error }, "Category clustering failed");
  process.exit(1);
};

main().catch(onFailure);
+3 -1
View File
@@ -1,9 +1,11 @@
import { ArticleMetadata, ID, Sentiment, TokenStatistics } from "@basango/domain/models"; import { ArticleMetadata, ID, Sentiment, TokenStatistics } from "@basango/domain/models";
export type CreateArticleParams = { export type CreateArticleParams = {
categoryId?: string | null;
clustered?: boolean;
title: string; title: string;
body: string; body: string;
categories: string[]; categories?: string[];
link: string; link: string;
sourceId: string; sourceId: string;
publishedAt: Date; publishedAt: Date;
+2
View File
@@ -1,9 +1,11 @@
import { DateRange, ID } from "@basango/domain/models"; import { DateRange, ID } from "@basango/domain/models";
export type CategoryShare = { export type CategoryShare = {
categoryId: string;
category: string; category: string;
count: number; count: number;
percentage: number; percentage: number;
slug: string;
}; };
export type CategoryShares = { export type CategoryShares = {
+11 -12
View File
@@ -28,21 +28,20 @@ export const computeTokenCount = (
export const computeTokenStatistics = (data: { export const computeTokenStatistics = (data: {
title: string; title: string;
body: string; body: string;
categories: string[]; categories?: string[];
}): TokenStatistics => { }): TokenStatistics => {
const [title, body, categories, excerpt] = [ const normalizedCategories = data.categories ?? [];
computeTokenCount(data.title), const titleTokens = computeTokenCount(data.title);
computeTokenCount(data.body), const bodyTokens = computeTokenCount(data.body);
computeTokenCount(data.categories.join(",")), const categoryTokens = computeTokenCount(normalizedCategories.join(","));
computeTokenCount(data.body.substring(0, 200)), const excerptTokens = computeTokenCount(data.body.substring(0, 200));
];
return { return {
body, body: bodyTokens,
categories, categories: categoryTokens,
excerpt, excerpt: excerptTokens,
title, title: titleTokens,
total: title + body + categories + excerpt, total: titleTokens + bodyTokens + categoryTokens + excerptTokens,
}; };
}; };
+1
View File
@@ -32,3 +32,4 @@ export const DEFAULT_TOKEN_AUDIENCE = "basango_dashboard";
export const DEFAULT_TOKEN_ISSUER = "basango_api"; export const DEFAULT_TOKEN_ISSUER = "basango_api";
export const DEFAULT_ACCESS_TOKEN_TTL = "35m"; export const DEFAULT_ACCESS_TOKEN_TTL = "35m";
export const DEFAULT_REFRESH_TOKEN_TTL = "7d"; export const DEFAULT_REFRESH_TOKEN_TTL = "7d";
export const DEFAULT_CATEGORY = "divers-autres";
+5 -1
View File
@@ -1,5 +1,6 @@
import z from "zod"; import z from "zod";
import { categorySchema } from "./categories";
import { idSchema, sentimentSchema } from "./shared"; import { idSchema, sentimentSchema } from "./shared";
import { sourceSchema } from "./sources"; import { sourceSchema } from "./sources";
@@ -24,7 +25,10 @@ export const tokenStatisticsSchema = z.object({
export const articleSchema = z.object({ export const articleSchema = z.object({
body: z.string().min(1), body: z.string().min(1),
categories: z.array(z.string()), categories: z.array(z.string()).optional().default([]),
category: categorySchema.optional(),
categoryId: idSchema.optional(),
clustered: z.boolean().default(false),
createdAt: z.coerce.date(), createdAt: z.coerce.date(),
excerpt: z.string().optional(), excerpt: z.string().optional(),
hash: z.string().min(1), hash: z.string().min(1),
+296
View File
@@ -0,0 +1,296 @@
import z from "zod";
import { idSchema } from "./shared";
/** Non-empty text limited to 255 characters; the rule shared by `name` and `slug`. */
const shortTextSchema = z.string().min(1).max(255);

/** Date field that runs the input through zod's date coercion before validating it. */
const timestampSchema = z.coerce.date();

/**
 * Validation schema for a category record.
 *
 * Required fields: `candidates` (array of strings), `createdAt`, `id`, `name`,
 * `slug` and `weight` (an integer). Optional fields: `description` (at most
 * 512 characters), `embeddings` (array of numbers) and `updatedAt`.
 */
export const categorySchema = z.object({
  candidates: z.string().array(),
  createdAt: timestampSchema,
  description: z.string().max(512).optional(),
  embeddings: z.number().array().optional(),
  id: idSchema,
  name: shortTextSchema,
  slug: shortTextSchema,
  updatedAt: timestampSchema.optional(),
  weight: z.number().int(),
});

/** Static type of a value that has been parsed by {@link categorySchema}. */
export type Category = z.infer<typeof categorySchema>;
/**
 * Hand-maintained part of each built-in category: everything except `createdAt`,
 * which is stamped when the exported list is built below.
 *
 * NOTE(review): `candidates` looks like the set of raw labels that should be folded
 * into the category, and `weight` like a priority between categories — confirm both
 * against the code that consumes this list.
 *
 * NOTE(review): some labels are listed under two categories — "ukraine-conflict"
 * (politique-gouvernement and international-regions) and "beto-premium",
 * "example-1" … "example-4", "fiches" (actualites-faits-divers and divers-autres).
 * Confirm which category is meant to own them.
 */
const categorySeeds = [
  {
    candidates: [
      "accident", "actualite", "actualité", "a-la-une", "en bref", "en-clair", "encontinu",
      "flash", "faits-divers", "drame", "enquetes", "desintox", "archives", "insolite",
      "national", "featured", "related-featured", "top-featured", "top-trending",
      "news-dont-miss", "news-just-in", "la-rdc-a-la-une",
      "example-1", "example-2", "example-3", "example-4",
      "beto-premium", "fiches", "suspension",
    ],
    description: "Nouvelles de dernière minute, faits divers et informations générales.",
    id: "06930299-71a3-735e-9dcd-055c321f2ca9",
    name: "Actualités & Faits Divers",
    slug: "actualites-faits-divers",
    weight: 4,
  },
  {
    candidates: [
      "democratie", "dialogue entre congolais", "diplomatie", "diplomatie-et-securite",
      "election", "élections", "elections-2023", "legislatives", "presidentielle",
      "parlement", "politique", "serment", "si j'étais président", "spécial elections",
      "us-politics", "ukraine-conflict", "conférence des nations unies", "nations unies",
      "rebellion",
    ],
    description: "Élections, gouvernance, institutions, diplomatie et conflits politiques.",
    id: "06930299-71a3-7aa5-95a4-a7b39c421255",
    name: "Politique & Gouvernement",
    slug: "politique-gouvernement",
    weight: 10,
  },
  {
    candidates: [
      "agrobusiness", "banking", "banques-et-finances", "economico", "economie", "économie",
      "finances", "industrie", "investments", "mines", "pme-entrepreneuriat",
      "featured-economy", "featured-markets", "intl-markets", "us-business", "la-une-eco",
      "emploi",
    ],
    description: "Affaires, marchés financiers, entreprises, banques, emplois et entrepreneuriat.",
    id: "06930299-71a3-7c5b-98b0-d58c8308496d",
    name: "Économie & Finances",
    slug: "economie-finances",
    weight: 9,
  },
  {
    candidates: [
      "arts", "culture", "musique", "livre", "livres", "patrimoine-traditions",
      "identité culturelle", "caricature", "histoire",
    ],
    description: "Art, musique, patrimoine, histoire, littérature et expression culturelle.",
    id: "06930299-71a3-7d47-8df2-b201975437f4",
    name: "Culture & Arts",
    slug: "culture-arts",
    weight: 2,
  },
  {
    candidates: ["sport", "sports", "football", "boxe", "can", "okapi sports"],
    description: "Compétitions sportives nationales et internationales, analyses et résultats.",
    id: "06930299-71a3-7e65-9421-b418c8a161b7",
    name: "Sports",
    slug: "sports",
    weight: 5,
  },
  {
    candidates: [
      "famille-genre", "femme", "jeunes", "justice", "criminalite", "arrestation",
      "kidnapping", "viol", "vol", "manifestation", "marche", "salubrite", "denonciation",
      "evasion", "sante", "santé", "necrologie", "education", "éducation", "enseignement",
      "religion", "religion-spiritualite", "message-des-voeux", "style et beauté",
      "societe", "société",
    ],
    description: "Questions sociales, éducation, santé, justice, genre et vie quotidienne.",
    id: "06930299-71a3-7f8b-b5a3-413f512ec6d8",
    name: "Société & Vie Quotidienne",
    slug: "societe-vie-quotidienne",
    weight: 6,
  },
  {
    candidates: [
      "climat-et-environnement", "developpement-durable", "biodiversite", "ecologico",
      "environnement", "nature", "eau", "electricite", "energie", "inondation",
      "science & env.", "sciences", "technologie", "technologie-innovation", "mc geek !",
      "sur le net",
    ],
    description:
      "Recherche scientifique, innovation technologique, climat, environnement et énergie.",
    id: "06930299-71a4-7096-8a7f-d69920882d95",
    name: "Sciences, Technologies & Environnement",
    slug: "sciences-technologies-environnement",
    weight: 7,
  },
  {
    candidates: [
      "afrique", "congo-brazzaville", "congolais de l'étranger", "diaspora", "euro-zone",
      "se-asia", "middle-east", "monde", "world-news", "grands-lacs", "bandundu", "bukavu",
      "bunia", "ituri", "katanga", "kinshasa", "maniema", "mbujimayi", "provinces",
      "info kin", "tourisme", "transport", "route", "infrastructures", "ukraine-conflict",
    ],
    description: "Actualités internationales, régions du monde et provinces locales.",
    id: "06930299-71a4-724a-8975-ea7d21286c22",
    name: "International & Régions",
    slug: "international-regions",
    weight: 8,
  },
  {
    candidates: [
      "analyses", "opinion", "opinions", "tribune", "grand-angle", "grande interview",
      "le débat", "lettre-ouverte", "l'invité de la campagne", "l'invité du jour",
      "émissions", "magazine", "magazine un", "medias", "communication", "communications",
      "parole aux auditeurs", "parole d'enfant", "revue de presse", "tele-medias",
      "multimedia", "tv",
    ],
    description: "Chroniques, analyses, tribunes, programmes et contenus médiatiques.",
    id: "06930299-71a4-745b-8813-6bca9c6b3c56",
    name: "Opinions & Médias",
    slug: "opinions-medias",
    weight: 3,
  },
  {
    candidates: [
      "beto-premium", "example-1", "example-2", "example-3", "example-4", "fiches",
      "publicite", "okapi service", "petro-chem-example-3", "sans catégorie",
      "uncategorized", "lefonde", "jdc",
    ],
    description: "Rubriques expérimentales, catégories indéterminées et éléments divers.",
    id: "06930299-71a4-756a-948b-e4a244b5887e",
    name: "Divers & Autres",
    slug: "divers-autres",
    weight: 1,
  },
];

/**
 * The ten built-in categories, in declaration order.
 *
 * Every entry gets its own `createdAt`, taken from the clock at the moment this
 * module is evaluated, so the value is not stable across process starts. The
 * optional `embeddings` and `updatedAt` fields are left unset.
 */
export const Categories: Category[] = categorySeeds.map((seed) => ({
  candidates: seed.candidates,
  createdAt: new Date(),
  description: seed.description,
  id: seed.id,
  name: seed.name,
  slug: seed.slug,
  weight: seed.weight,
}));
+1
View File
@@ -1,5 +1,6 @@
export * from "./articles"; export * from "./articles";
export * from "./auth"; export * from "./auth";
export * from "./categories";
export * from "./crawler"; export * from "./crawler";
export * from "./reports"; export * from "./reports";
export * from "./shared"; export * from "./shared";