Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
"dotenv": "^16.4.5",
"jimp": "^1.6.0",
"joplin-turndown-plugin-gfm": "^1.0.12",
"js-tiktoken": "^1.0.21",
"lodash": "^4.17.21",
"minimatch": "^9.0.3",
"openai": "^6.7.0",
Expand Down
90 changes: 90 additions & 0 deletions scripts/test-token-counter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import { countTokens, truncateToTokenLimit } from "../src/utils/token-counter";

async function runTests() {
  console.log("Running token-counter verification...");
  let failed = false;

  // Minimal assertion helper: log PASS/FAIL and remember any failure so
  // the process can exit non-zero at the end.
  const assert = (condition: boolean, msg: string) => {
    if (condition) {
      console.log(`PASS: ${msg}`);
    } else {
      console.error(`FAIL: ${msg}`);
      failed = true;
    }
  };

  // countTokens: under cl100k_base, "Hello world" encodes to two tokens.
  const countHello = countTokens("Hello world");
  assert(countHello === 2, `countTokens("Hello world") === 2 (got ${countHello})`);
  assert(countTokens("") === 0, 'countTokens("") === 0');

  // truncateToTokenLimit with a custom 1-token message.
  // "one two three four five" is 5 cl100k_base tokens:
  // "one", " two", " three", " four", " five".
  const text = "one two three four five";
  const limit = 3;
  const msg = "X"; // encodes to a single token

  // limit 3 minus 1 message token leaves 2 tokens of text:
  // "one" + " two" + "X" => "one twoX".
  const truncated = truncateToTokenLimit(text, limit, msg);
  assert(countTokens(truncated) <= limit, "truncated text is within limit");
  assert(truncated.endsWith(msg), "truncated text ends with message");
  assert(truncated === "one twoX", `truncated text is correct ("${truncated}")`);

  // Default-message behaviour: measure the default marker's token cost,
  // then pick a limit that leaves room for it plus some content, and
  // verify both the suffix and the token budget.
  const longText = "This is a longer text that should be truncated properly.";
  const defaultMessage = "\n[Content truncated due to length]";
  const msgTokens = countTokens(defaultMessage);
  console.log(`Default message tokens: ${msgTokens}`);

  const safeLimit = msgTokens + 2;
  const res2 = truncateToTokenLimit(longText, safeLimit);
  assert(res2.endsWith(defaultMessage), "Ends with default message");
  assert(
    countTokens(res2) <= safeLimit,
    `Result within safe limit (got ${countTokens(res2)}, limit ${safeLimit})`
  );

  if (failed) {
    console.error("Some tests failed.");
    process.exit(1);
  } else {
    console.log("All tests passed.");
  }
}

runTests().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});
9 changes: 2 additions & 7 deletions src/agent/actions/extract.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { z } from "zod";
import { ActionContext, ActionOutput, AgentActionDefinition } from "@/types";
import { parseMarkdown } from "@/utils/html-to-markdown";
import { truncateToTokenLimit } from "@/utils";
import fs from "fs";
import { getCDPClient } from "@/cdp";

Expand Down Expand Up @@ -42,13 +43,7 @@ export const ExtractActionDefinition: AgentActionDefinition = {
}

// Trim markdown to stay within token limit
// TODO: this is a hack, we should use a better token counting method
const avgTokensPerChar = 0.75; // Conservative estimate of tokens per character
const maxChars = Math.floor(ctx.tokenLimit / avgTokensPerChar);
const trimmedMarkdown =
markdown.length > maxChars
? markdown.slice(0, maxChars) + "\n[Content truncated due to length]"
: markdown;
const trimmedMarkdown = truncateToTokenLimit(markdown, ctx.tokenLimit);
if (ctx.debugDir) {
fs.writeFileSync(
`${ctx.debugDir}/extract-markdown-content.md`,
Expand Down
3 changes: 2 additions & 1 deletion src/utils/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { sleep } from "./sleep";
import { retry } from "./retry";
import { ErrorEmitter } from "./error-emitter";
import { countTokens, truncateToTokenLimit } from "./token-counter";

export { sleep, retry, ErrorEmitter };
export { sleep, retry, ErrorEmitter, countTokens, truncateToTokenLimit };
39 changes: 39 additions & 0 deletions src/utils/token-counter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import { getEncoding } from "js-tiktoken";

// Use cl100k_base as it is the standard for modern OpenAI models (GPT-3.5/4)
// and serves as a good approximation for other models.
const enc = getEncoding("cl100k_base");

/**
 * Returns how many tokens `text` occupies under the module's
 * cl100k_base encoding.
 */
export function countTokens(text: string): number {
  const tokens = enc.encode(text);
  return tokens.length;
}

/**
 * Truncates `text` so the result encodes to at most `tokenLimit` tokens
 * (cl100k_base). When truncation happens, `truncationMessage` is appended
 * so downstream consumers can tell the content was cut.
 *
 * If `tokenLimit` is too small to fit even the truncation message, the
 * message itself is clamped to the limit, so the returned string never
 * exceeds `tokenLimit` tokens.
 */
export function truncateToTokenLimit(
  text: string,
  tokenLimit: number,
  truncationMessage = "\n[Content truncated due to length]"
): string {
  const tokens = enc.encode(text);

  if (tokens.length <= tokenLimit) {
    return text;
  }

  // Reserve tokens for the truncation message.
  const messageTokens = enc.encode(truncationMessage);

  if (messageTokens.length >= tokenLimit) {
    // The limit cannot fit both content and the full marker. Returning
    // the untrimmed marker would overflow the limit, so clamp the marker
    // itself instead (Math.max guards against a non-positive limit).
    return enc.decode(messageTokens.slice(0, Math.max(0, tokenLimit)));
  }

  const availableTokens = tokenLimit - messageTokens.length;
  const truncatedTokens = tokens.slice(0, availableTokens);
  return enc.decode(truncatedTokens) + truncationMessage;
}
9 changes: 8 additions & 1 deletion yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1045,7 +1045,7 @@ balanced-match@^1.0.0:
resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.2.tgz#e83e3a7e3f300b34cb9d87f615fa0cbf357690ee"
integrity sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==

base64-js@^1.3.0, base64-js@^1.3.1:
base64-js@^1.3.0, base64-js@^1.3.1, base64-js@^1.5.1:
version "1.5.1"
resolved "https://registry.yarnpkg.com/base64-js/-/base64-js-1.5.1.tgz#1b1b440160a5bf7ad40b650f095963481903930a"
integrity sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==
Expand Down Expand Up @@ -2168,6 +2168,13 @@ jpeg-js@^0.4.4:
resolved "https://registry.yarnpkg.com/jpeg-js/-/jpeg-js-0.4.4.tgz#a9f1c6f1f9f0fa80cdb3484ed9635054d28936aa"
integrity sha512-WZzeDOEtTOBK4Mdsar0IqEU5sMr3vSV2RqkAIzUEV2BHnUfKGyswWFPFwK5EeDo93K3FohSHbLAjj0s1Wzd+dg==

js-tiktoken@^1.0.21:
version "1.0.21"
resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.21.tgz#368a9957591a30a62997dd0c4cf30866f00f8221"
integrity sha512-biOj/6M5qdgx5TKjDnFT1ymSpM5tbd3ylwDtrQvFQSu0Z7bBYko2dF+W/aUkXUPuk6IVpRxk/3Q2sHOzGlS36g==
dependencies:
base64-js "^1.5.1"

js-yaml@^4.1.0:
version "4.1.0"
resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-4.1.0.tgz#c1fb65f8f5017901cdd2c951864ba18458a10602"
Expand Down