diff --git a/package.json b/package.json
index 12b6e1a..9c48eb7 100644
--- a/package.json
+++ b/package.json
@@ -50,6 +50,7 @@
     "dotenv": "^16.4.5",
     "jimp": "^1.6.0",
     "joplin-turndown-plugin-gfm": "^1.0.12",
+    "js-tiktoken": "^1.0.21",
     "lodash": "^4.17.21",
     "minimatch": "^9.0.3",
     "openai": "^6.7.0",
diff --git a/scripts/test-token-counter.ts b/scripts/test-token-counter.ts
new file mode 100644
index 0000000..4f75a5e
--- /dev/null
+++ b/scripts/test-token-counter.ts
@@ -0,0 +1,74 @@
+import { countTokens, truncateToTokenLimit } from "../src/utils/token-counter";
+
+// Must stay in sync with the default `truncationMessage` of truncateToTokenLimit.
+const DEFAULT_MESSAGE = "\n[Content truncated due to length]";
+
+async function runTests(): Promise<void> {
+  console.log("Running token-counter verification...");
+  let failed = false;
+
+  // Record failures instead of throwing so every check is reported in one run.
+  const assert = (condition: boolean, msg: string): void => {
+    if (!condition) {
+      console.error(`FAIL: ${msg}`);
+      failed = true;
+    } else {
+      console.log(`PASS: ${msg}`);
+    }
+  };
+
+  // countTokens: per the cl100k_base encoding, "Hello world" is 2 tokens.
+  const countHello = countTokens("Hello world");
+  assert(countHello === 2, `countTokens("Hello world") === 2 (got ${countHello})`);
+  assert(countTokens("") === 0, 'countTokens("") === 0');
+
+  // truncateToTokenLimit: text already within the limit comes back untouched.
+  const shortText = "Hello world";
+  assert(truncateToTokenLimit(shortText, 10) === shortText, "text within limit is unchanged");
+
+  // truncateToTokenLimit with a custom one-token message.
+  // cl100k_base splits the text into 5 tokens:
+  //   "one", " two", " three", " four", " five"
+  // With limit 3 and one token reserved for "X", "one" + " two" must survive.
+  const text = "one two three four five";
+  const limit = 3;
+  const msg = "X";
+  const truncated = truncateToTokenLimit(text, limit, msg);
+  assert(countTokens(truncated) <= limit, "truncated text is within limit");
+  assert(truncated.endsWith(msg), "truncated text ends with message");
+  assert(truncated === "one twoX", `truncated text is correct ("${truncated}")`);
+
+  // truncateToTokenLimit with the default message.
+  const longText = "This is a longer text that should be truncated properly.";
+  const msgTokens = countTokens(DEFAULT_MESSAGE);
+  console.log(`Default message tokens: ${msgTokens}`);
+
+  // A limit that leaves room for the message plus two tokens of text.
+  const safeLimit = msgTokens + 2;
+  const res2 = truncateToTokenLimit(longText, safeLimit);
+  assert(res2.endsWith(DEFAULT_MESSAGE), "Ends with default message");
+  assert(
+    countTokens(res2) <= safeLimit,
+    `Result within safe limit (got ${countTokens(res2)}, limit ${safeLimit})`
+  );
+
+  // A limit smaller than the message itself. Whether the marker is kept in
+  // that case is left to the implementation, so only the shortening is checked.
+  const tinyLimit = 5;
+  if (tinyLimit < msgTokens) {
+    const res = truncateToTokenLimit(longText, tinyLimit);
+    assert(res.length < longText.length, `limit ${tinyLimit} shortens the text`);
+  }
+
+  if (failed) {
+    console.error("Some tests failed.");
+    process.exit(1);
+  } else {
+    console.log("All tests passed.");
+  }
+}
+
+runTests().catch((e: unknown) => {
+  console.error(e);
+  process.exit(1);
+});
diff --git a/src/agent/actions/extract.ts b/src/agent/actions/extract.ts
index 8fb366f..8ba451f 100644
--- a/src/agent/actions/extract.ts
+++ b/src/agent/actions/extract.ts
@@ -1,6 +1,7 @@
 import { z } from "zod";
 import { ActionContext, ActionOutput, AgentActionDefinition } from "@/types";
 import { parseMarkdown } from "@/utils/html-to-markdown";
+import { truncateToTokenLimit } from "@/utils";
 import fs from "fs";
 import { getCDPClient } from "@/cdp";
 
@@ -42,13 +43,7 @@ export const ExtractActionDefinition: AgentActionDefinition = {
     }
 
     // Trim markdown to stay within token limit
-    // TODO: this is a hack, we should use a better token counting method
-    const avgTokensPerChar = 0.75; // Conservative estimate of tokens per character
-    const maxChars = Math.floor(ctx.tokenLimit / avgTokensPerChar);
-    const trimmedMarkdown =
-      markdown.length > maxChars
-        ?
markdown.slice(0, maxChars) + "\n[Content truncated due to length]"
-        : markdown;
+    const trimmedMarkdown = truncateToTokenLimit(markdown, ctx.tokenLimit);
     if (ctx.debugDir) {
       fs.writeFileSync(
         `${ctx.debugDir}/extract-markdown-content.md`,
diff --git a/src/utils/index.ts b/src/utils/index.ts
index 598ffff..e77f8ab 100644
--- a/src/utils/index.ts
+++ b/src/utils/index.ts
@@ -1,5 +1,6 @@
 import { sleep } from "./sleep";
 import { retry } from "./retry";
 import { ErrorEmitter } from "./error-emitter";
+import { countTokens, truncateToTokenLimit } from "./token-counter";
 
-export { sleep, retry, ErrorEmitter };
+export { sleep, retry, ErrorEmitter, countTokens, truncateToTokenLimit };
diff --git a/src/utils/token-counter.ts b/src/utils/token-counter.ts
new file mode 100644
index 0000000..5d3a967
--- /dev/null
+++ b/src/utils/token-counter.ts
@@ -0,0 +1,84 @@
+import { getEncoding } from "js-tiktoken";
+
+type Encoder = ReturnType<typeof getEncoding>;
+
+const DEFAULT_TRUNCATION_MESSAGE = "\n[Content truncated due to length]";
+
+// Created on first use so that merely importing this module (it is re-exported
+// from the utils index) does not construct the encoder.
+let cachedEncoder: Encoder | undefined;
+
+/**
+ * Returns the shared encoder. cl100k_base is the standard for modern OpenAI
+ * models (GPT-3.5/4) and serves as a good approximation for other models.
+ */
+function getEncoder(): Encoder {
+  if (cachedEncoder === undefined) {
+    cachedEncoder = getEncoding("cl100k_base");
+  }
+  return cachedEncoder;
+}
+
+/**
+ * Encodes text, treating special-token markers such as "<|endoftext|>" as
+ * ordinary text. With the library defaults `encode` throws on them, and the
+ * text handled here is arbitrary page content.
+ */
+function encodeText(text: string): number[] {
+  return getEncoder().encode(text, [], []);
+}
+
+/**
+ * Decodes the first `count` tokens of `tokens`. A cut can land inside a
+ * multi-byte character, which is expected to decode to U+FFFD; trailing
+ * replacement characters are dropped unless the result is a prefix of `source`.
+ */
+function decodePrefix(source: string, tokens: number[], count: number): string {
+  const decoded = getEncoder().decode(tokens.slice(0, count));
+  return source.startsWith(decoded) ? decoded : decoded.replace(/\uFFFD+$/, "");
+}
+
+/**
+ * Counts the number of tokens in a text string.
+ */
+export function countTokens(text: string): number {
+  return encodeText(text).length;
+}
+
+/**
+ * Truncates text to a maximum number of tokens.
+ *
+ * @param text - Text to fit into the budget.
+ * @param tokenLimit - Maximum number of tokens in the result. Fractions are
+ *   rounded down; negative or NaN values are treated as 0.
+ * @param truncationMessage - Marker appended when text is cut. Its tokens count
+ *   against the limit; if the marker alone does not fit, it is omitted so the
+ *   limit still holds.
+ * @returns `text` unchanged when it already fits, otherwise a prefix of it
+ *   (plus the marker when it fits) of at most `tokenLimit` tokens.
+ */
+export function truncateToTokenLimit(
+  text: string,
+  tokenLimit: number,
+  truncationMessage = DEFAULT_TRUNCATION_MESSAGE
+): string {
+  const tokens = encodeText(text);
+  if (tokens.length <= tokenLimit) {
+    return text;
+  }
+
+  const limit = Number.isFinite(tokenLimit) ? Math.max(0, Math.floor(tokenLimit)) : 0;
+  const messageTokens = encodeText(truncationMessage).length;
+  const messageFits = messageTokens <= limit;
+  const suffix = messageFits ? truncationMessage : "";
+
+  let kept = messageFits ? limit - messageTokens : limit;
+  let result = decodePrefix(text, tokens, kept) + suffix;
+  // Re-tokenizing the joined string can differ from the sum of its parts, so
+  // verify the final count and give up text tokens until the limit holds.
+  while (kept > 0 && encodeText(result).length > limit) {
+    kept -= 1;
+    result = decodePrefix(text, tokens, kept) + suffix;
+  }
+  return result;
+}
diff --git a/yarn.lock b/yarn.lock
index 0903c18..3252739 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -1045,7 +1045,7 @@ balanced-match@^1.0.0:
   resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.2.tgz#e83e3a7e3f300b34cb9d87f615fa0cbf357690ee"
   integrity sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==
 
-base64-js@^1.3.0, base64-js@^1.3.1:
+base64-js@^1.3.0, base64-js@^1.3.1, base64-js@^1.5.1:
   version "1.5.1"
   resolved "https://registry.yarnpkg.com/base64-js/-/base64-js-1.5.1.tgz#1b1b440160a5bf7ad40b650f095963481903930a"
   integrity sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==
@@ -2168,6 +2168,13 @@ jpeg-js@^0.4.4:
   resolved "https://registry.yarnpkg.com/jpeg-js/-/jpeg-js-0.4.4.tgz#a9f1c6f1f9f0fa80cdb3484ed9635054d28936aa"
   integrity sha512-WZzeDOEtTOBK4Mdsar0IqEU5sMr3vSV2RqkAIzUEV2BHnUfKGyswWFPFwK5EeDo93K3FohSHbLAjj0s1Wzd+dg==
 
+js-tiktoken@^1.0.21:
+  version "1.0.21"
+  resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.21.tgz#368a9957591a30a62997dd0c4cf30866f00f8221"
+  integrity sha512-biOj/6M5qdgx5TKjDnFT1ymSpM5tbd3ylwDtrQvFQSu0Z7bBYko2dF+W/aUkXUPuk6IVpRxk/3Q2sHOzGlS36g==
+  dependencies:
+    base64-js "^1.5.1"
+
 js-yaml@^4.1.0:
   version "4.1.0"
   resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-4.1.0.tgz#c1fb65f8f5017901cdd2c951864ba18458a10602"