basango/packages/db/src/utils/computed.ts

import { TiktokenEncoding, get_encoding } from "tiktoken";

import { TokenStatistics } from "@/schema";

/**
 * Count the number of tokens in the given text using the specified encoding.
 *
 * The encoder obtained from `get_encoding` is always released with `free()`,
 * including when encoding the text fails. This is a best-effort helper: if the
 * encoder cannot be created or used, the function falls back to `text.length`
 * (the number of UTF-16 code units) instead of throwing.
 *
 * @param text - The input text
 * @param encoding - The token encoding (default: "cl100k_base")
 * @returns The token count, or `text.length` when tokenization fails
 */
export const computeTokenCount = (
  text: string,
  encoding: TiktokenEncoding = "cl100k_base",
): number => {
  try {
    const encoder = get_encoding(encoding);
    try {
      return encoder.encode(text).length;
    } finally {
      // Release the encoder even when encode() throws, so a failed call
      // does not leave it allocated.
      encoder.free();
    }
  } catch {
    // Deliberate best-effort fallback: approximate with the string length.
    return text.length;
  }
};

/**
 * Build per-field token counts for an item.
 *
 * Counts are produced with `computeTokenCount` (default encoding) for the
 * title, the full body, the categories joined with ",", and an excerpt made
 * of the first 200 characters of the body. `total` is the sum of those four
 * counts.
 *
 * @param data - The input data containing title, body, and categories
 * @returns TokenStatistics object
 */
export const computeTokenStatistics = (data: {
  title: string;
  body: string;
  categories: string[];
}): TokenStatistics => {
  const { title: titleText, body: bodyText, categories: categoryList } = data;

  const counts = {
    title: computeTokenCount(titleText),
    body: computeTokenCount(bodyText),
    categories: computeTokenCount(categoryList.join(",")),
    // The excerpt is the leading 200 characters of the body.
    excerpt: computeTokenCount(bodyText.substring(0, 200)),
  };

  return {
    ...counts,
    total: counts.title + counts.body + counts.categories + counts.excerpt,
  };
};

/**
 * Compute the estimated reading time for the given text.
 *
 * Words are runs of non-whitespace characters separated by whitespace. Empty
 * or whitespace-only text contains no words and yields 0 minutes; any other
 * text is rounded up to the next whole minute.
 *
 * @param text - The input text
 * @param wordsPerMinute - The reading speed in words per minute (default: 200);
 *   expected to be a positive number
 * @returns The estimated reading time in minutes
 */
export const computeReadingTime = (text: string, wordsPerMinute = 200): number => {
  const trimmed = text.trim();
  // "".split(/\s+/) yields [""] (length 1), which would report one minute of
  // reading for text that has no words at all.
  if (trimmed === "") {
    return 0;
  }
  const words = trimmed.split(/\s+/).length;
  return Math.ceil(words / wordsPerMinute);
};