From 595afb73cbe262397a94d0adadddcc6caa16c8ee Mon Sep 17 00:00:00 2001 From: Roo Code Date: Thu, 24 Jul 2025 15:33:47 +0000 Subject: [PATCH 1/4] feat: add safeguard for large files in readFileTool when maxReadFileLine is -1 - Add token counting check using tiktoken for files over 1000 lines - Automatically switch to partial read (first 2000 lines) when token count exceeds 50k - Add fallback safeguard for very large files (>5000 lines) when token counting fails - Include informative notice explaining why partial read is being used - Add comprehensive test coverage for all safeguard scenarios This prevents consuming the entire context window when reading very large files. --- src/core/tools/__tests__/readFileTool.spec.ts | 334 +++++++++++++++++- src/core/tools/readFileTool.ts | 54 ++- 2 files changed, 380 insertions(+), 8 deletions(-) diff --git a/src/core/tools/__tests__/readFileTool.spec.ts b/src/core/tools/__tests__/readFileTool.spec.ts index 44be1d3b924..58e3ead445c 100644 --- a/src/core/tools/__tests__/readFileTool.spec.ts +++ b/src/core/tools/__tests__/readFileTool.spec.ts @@ -10,6 +10,7 @@ import { isBinaryFile } from "isbinaryfile" import { ReadFileToolUse, ToolParamName, ToolResponse } from "../../../shared/tools" import { readFileTool } from "../readFileTool" import { formatResponse } from "../../prompts/responses" +import { tiktoken } from "../../../utils/tiktoken" vi.mock("path", async () => { const originalPath = await vi.importActual("path") @@ -35,18 +36,27 @@ vi.mock("../../../integrations/misc/read-lines") let mockInputContent = "" // First create all the mocks -vi.mock("../../../integrations/misc/extract-text") +vi.mock("../../../integrations/misc/extract-text", () => ({ + extractTextFromFile: vi.fn(), + addLineNumbers: vi.fn(), + getSupportedBinaryFormats: vi.fn(() => [".pdf", ".docx", ".ipynb"]), +})) vi.mock("../../../services/tree-sitter") +vi.mock("../../../utils/tiktoken") + +// Import the mocked functions +import { addLineNumbers, 
getSupportedBinaryFormats } from "../../../integrations/misc/extract-text" // Then create the mock functions -const addLineNumbersMock = vi.fn().mockImplementation((text, startLine = 1) => { +const addLineNumbersMock = vi.mocked(addLineNumbers) +addLineNumbersMock.mockImplementation((text: string, startLine = 1) => { if (!text) return "" const lines = typeof text === "string" ? text.split("\n") : [text] - return lines.map((line, i) => `${startLine + i} | ${line}`).join("\n") + return lines.map((line: string, i: number) => `${startLine + i} | ${line}`).join("\n") }) -const extractTextFromFileMock = vi.fn() -const getSupportedBinaryFormatsMock = vi.fn(() => [".pdf", ".docx", ".ipynb"]) +const extractTextFromFileMock = vi.mocked(extractTextFromFile) +const getSupportedBinaryFormatsMock = vi.mocked(getSupportedBinaryFormats) vi.mock("../../ignore/RooIgnoreController", () => ({ RooIgnoreController: class { @@ -520,3 +530,317 @@ describe("read_file tool XML output structure", () => { }) }) }) + +describe("read_file tool with large file safeguard", () => { + // Test data + const testFilePath = "test/largefile.txt" + const absoluteFilePath = "/test/largefile.txt" + + // Mocked functions + const mockedCountFileLines = vi.mocked(countFileLines) + const mockedReadLines = vi.mocked(readLines) + const mockedExtractTextFromFile = vi.mocked(extractTextFromFile) + const mockedIsBinaryFile = vi.mocked(isBinaryFile) + const mockedPathResolve = vi.mocked(path.resolve) + const mockedTiktoken = vi.mocked(tiktoken) + + const mockCline: any = {} + let mockProvider: any + let toolResult: ToolResponse | undefined + + beforeEach(() => { + vi.clearAllMocks() + + mockedPathResolve.mockReturnValue(absoluteFilePath) + mockedIsBinaryFile.mockResolvedValue(false) + + mockProvider = { + getState: vi.fn(), + deref: vi.fn().mockReturnThis(), + } + + mockCline.cwd = "/" + mockCline.task = "Test" + mockCline.providerRef = mockProvider + mockCline.rooIgnoreController = { + validateAccess: 
vi.fn().mockReturnValue(true), + } + mockCline.say = vi.fn().mockResolvedValue(undefined) + mockCline.ask = vi.fn().mockResolvedValue({ response: "yesButtonClicked" }) + mockCline.fileContextTracker = { + trackFileContext: vi.fn().mockResolvedValue(undefined), + } + mockCline.recordToolUsage = vi.fn().mockReturnValue(undefined) + mockCline.recordToolError = vi.fn().mockReturnValue(undefined) + + toolResult = undefined + }) + + async function executeReadFileTool( + params: Partial = {}, + options: { + maxReadFileLine?: number + totalLines?: number + tokenCount?: number + } = {}, + ): Promise { + const maxReadFileLine = options.maxReadFileLine ?? -1 + const totalLines = options.totalLines ?? 5 + const tokenCount = options.tokenCount ?? 100 + + mockProvider.getState.mockResolvedValue({ maxReadFileLine }) + mockedCountFileLines.mockResolvedValue(totalLines) + mockedTiktoken.mockResolvedValue(tokenCount) + + const argsContent = `${testFilePath}` + + const toolUse: ReadFileToolUse = { + type: "tool_use", + name: "read_file", + params: { args: argsContent, ...params }, + partial: false, + } + + await readFileTool( + mockCline, + toolUse, + mockCline.ask, + vi.fn(), + (result: ToolResponse) => { + toolResult = result + }, + (_: ToolParamName, content?: string) => content ?? 
"", + ) + + return toolResult + } + + describe("when file has many lines and high token count", () => { + it("should apply safeguard and read only first 2000 lines", async () => { + // Setup - large file with high token count + const largeFileContent = Array(1500).fill("This is a line of text").join("\n") + const partialContent = Array(2000).fill("This is a line of text").join("\n") + + mockedExtractTextFromFile.mockResolvedValue(largeFileContent) + mockedReadLines.mockResolvedValue(partialContent) + + // Setup addLineNumbers mock for this test + addLineNumbersMock.mockImplementation((text: string) => { + const lines = text.split("\n") + return lines.map((line: string, i: number) => `${i + 1} | ${line}`).join("\n") + }) + + // Execute with high line count and token count + const result = await executeReadFileTool( + {}, + { + maxReadFileLine: -1, + totalLines: 1500, + tokenCount: 60000, // Above threshold + }, + ) + + // Verify safeguard was applied + expect(mockedTiktoken).toHaveBeenCalled() + expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0) + + // Verify the result contains the safeguard notice + expect(result).toContain("This file contains 1500 lines and approximately 60,000 tokens") + expect(result).toContain("Showing only the first 2000 lines to preserve context space") + expect(result).toContain(``) + }) + + it("should not apply safeguard when token count is below threshold", async () => { + // Setup - large file but with low token count + const fileContent = Array(1500).fill("Short").join("\n") + const numberedContent = fileContent + .split("\n") + .map((line, i) => `${i + 1} | ${line}`) + .join("\n") + + mockedExtractTextFromFile.mockImplementation(() => Promise.resolve(numberedContent)) + + // Execute with high line count but low token count + const result = await executeReadFileTool( + {}, + { + maxReadFileLine: -1, + totalLines: 1500, + tokenCount: 30000, // Below threshold + }, + ) + + // Verify safeguard was NOT applied + 
expect(mockedTiktoken).toHaveBeenCalled() + expect(mockedReadLines).not.toHaveBeenCalled() + expect(mockedExtractTextFromFile).toHaveBeenCalled() + + // Verify no safeguard notice + expect(result).not.toContain("preserve context space") + expect(result).toContain(``) + }) + + it("should not apply safeguard for files under 1000 lines", async () => { + // Setup - file with less than 1000 lines + const fileContent = Array(999).fill("This is a line of text").join("\n") + const numberedContent = fileContent + .split("\n") + .map((line, i) => `${i + 1} | ${line}`) + .join("\n") + + mockedExtractTextFromFile.mockImplementation(() => Promise.resolve(numberedContent)) + + // Execute + const result = await executeReadFileTool( + {}, + { + maxReadFileLine: -1, + totalLines: 999, + tokenCount: 100000, // Even with high token count + }, + ) + + // Verify tiktoken was NOT called (optimization) + expect(mockedTiktoken).not.toHaveBeenCalled() + expect(mockedReadLines).not.toHaveBeenCalled() + expect(mockedExtractTextFromFile).toHaveBeenCalled() + + // Verify no safeguard notice + expect(result).not.toContain("preserve context space") + expect(result).toContain(``) + }) + + it("should apply safeguard for very large files even if token counting fails", async () => { + // Setup - very large file and token counting fails + const partialContent = Array(2000).fill("This is a line of text").join("\n") + + mockedExtractTextFromFile.mockResolvedValue("Large content") + mockedReadLines.mockResolvedValue(partialContent) + + // Setup addLineNumbers mock for partial content + addLineNumbersMock.mockImplementation((text: string) => { + const lines = text.split("\n") + return lines.map((line: string, i: number) => `${i + 1} | ${line}`).join("\n") + }) + + // Set up the provider state + mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 }) + mockedCountFileLines.mockResolvedValue(6000) + + // IMPORTANT: Set up tiktoken to reject AFTER other mocks are set + 
mockedTiktoken.mockRejectedValue(new Error("Token counting failed")) + + const argsContent = `${testFilePath}` + + const toolUse: ReadFileToolUse = { + type: "tool_use", + name: "read_file", + params: { args: argsContent }, + partial: false, + } + + await readFileTool( + mockCline, + toolUse, + mockCline.ask, + vi.fn(), + (result: ToolResponse) => { + toolResult = result + }, + (_: ToolParamName, content?: string) => content ?? "", + ) + + // Verify safeguard was applied despite token counting failure + expect(mockedTiktoken).toHaveBeenCalled() + expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0) + + // Verify the result contains the safeguard notice (without token count) + expect(toolResult).toContain("This file contains 6000 lines") + expect(toolResult).toContain("Showing only the first 2000 lines to preserve context space") + expect(toolResult).toContain(``) + }) + + it("should not apply safeguard when maxReadFileLine is not -1", async () => { + // Setup + const fileContent = Array(2000).fill("This is a line of text").join("\n") + mockedExtractTextFromFile.mockResolvedValue(fileContent) + + // Execute with maxReadFileLine = 500 (not -1) + const result = await executeReadFileTool( + {}, + { + maxReadFileLine: 500, + totalLines: 2000, + tokenCount: 100000, + }, + ) + + // Verify tiktoken was NOT called + expect(mockedTiktoken).not.toHaveBeenCalled() + + // The normal maxReadFileLine logic should apply + expect(mockedReadLines).toHaveBeenCalled() + }) + + it("should handle line ranges correctly with safeguard", async () => { + // When line ranges are specified, safeguard should not apply + const rangeContent = "Line 100\nLine 101\nLine 102" + mockedReadLines.mockResolvedValue(rangeContent) + + const argsContent = `${testFilePath}100-102` + + const toolUse: ReadFileToolUse = { + type: "tool_use", + name: "read_file", + params: { args: argsContent }, + partial: false, + } + + mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 }) + 
mockedCountFileLines.mockResolvedValue(10000) + + await readFileTool( + mockCline, + toolUse, + mockCline.ask, + vi.fn(), + (result: ToolResponse) => { + toolResult = result + }, + (_: ToolParamName, content?: string) => content ?? "", + ) + + // Verify tiktoken was NOT called for range reads + expect(mockedTiktoken).not.toHaveBeenCalled() + expect(toolResult).toContain(``) + expect(toolResult).not.toContain("preserve context space") + }) + }) + + describe("safeguard thresholds", () => { + it("should use correct thresholds for line count and token count", async () => { + // Test boundary conditions + + // Just below line threshold - no token check + await executeReadFileTool({}, { totalLines: 1000, maxReadFileLine: -1 }) + expect(mockedTiktoken).not.toHaveBeenCalled() + + // Just above line threshold - token check performed + vi.clearAllMocks() + mockedExtractTextFromFile.mockResolvedValue("content") + await executeReadFileTool({}, { totalLines: 1001, maxReadFileLine: -1, tokenCount: 40000 }) + expect(mockedTiktoken).toHaveBeenCalled() + + // Token count just below threshold - no safeguard + expect(toolResult).not.toContain("preserve context space") + + // Token count just above threshold - safeguard applied + vi.clearAllMocks() + mockedExtractTextFromFile.mockResolvedValue("content") + mockedReadLines.mockResolvedValue("partial content") + await executeReadFileTool({}, { totalLines: 1001, maxReadFileLine: -1, tokenCount: 50001 }) + expect(mockedReadLines).toHaveBeenCalled() + expect(toolResult).toContain("preserve context space") + }) + }) +}) diff --git a/src/core/tools/readFileTool.ts b/src/core/tools/readFileTool.ts index 6de8dd56421..b9036351e02 100644 --- a/src/core/tools/readFileTool.ts +++ b/src/core/tools/readFileTool.ts @@ -14,6 +14,7 @@ import { readLines } from "../../integrations/misc/read-lines" import { extractTextFromFile, addLineNumbers, getSupportedBinaryFormats } from "../../integrations/misc/extract-text" import { 
parseSourceCodeDefinitionsForFile } from "../../services/tree-sitter" import { parseXml } from "../../utils/xml" +import { tiktoken } from "../../utils/tiktoken" export function getReadFileToolDescription(blockName: string, blockParams: any): string { // Handle both single path and multiple files via args @@ -516,13 +517,60 @@ export async function readFileTool( continue } - // Handle normal file read - const content = await extractTextFromFile(fullPath) - const lineRangeAttr = ` lines="1-${totalLines}"` + // Handle normal file read with safeguard for large files + // Define thresholds for the safeguard + const LARGE_FILE_LINE_THRESHOLD = 1000 // Consider files with more than 1000 lines as "large" + const MAX_TOKEN_THRESHOLD = 50000 // ~50% of a typical 100k context window + const FALLBACK_MAX_LINES = 2000 // Default number of lines to read when applying safeguard + + // Check if we should apply the safeguard + let shouldApplySafeguard = false + let safeguardNotice = "" + let linesToRead = totalLines + + if (maxReadFileLine === -1 && totalLines > LARGE_FILE_LINE_THRESHOLD) { + // File has many lines and we're trying to read the full file + // Perform token count check + try { + const fullContent = await extractTextFromFile(fullPath) + const tokenCount = await tiktoken([{ type: "text", text: fullContent }]) + + if (tokenCount > MAX_TOKEN_THRESHOLD) { + shouldApplySafeguard = true + linesToRead = FALLBACK_MAX_LINES + safeguardNotice = `This file contains ${totalLines} lines and approximately ${tokenCount.toLocaleString()} tokens, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. 
Use line_range if you need to read specific sections.\n` + } + } catch (error) { + // If token counting fails, apply safeguard based on line count alone + console.warn(`Failed to count tokens for large file ${relPath}:`, error) + if (totalLines > LARGE_FILE_LINE_THRESHOLD * 5) { + // For very large files (>5000 lines), apply safeguard anyway + shouldApplySafeguard = true + linesToRead = FALLBACK_MAX_LINES + safeguardNotice = `This file contains ${totalLines} lines, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. Use line_range if you need to read specific sections.\n` + } + } + } + + let content: string + let lineRangeAttr: string + + if (shouldApplySafeguard) { + // Read partial file with safeguard + content = addLineNumbers(await readLines(fullPath, linesToRead - 1, 0)) + lineRangeAttr = ` lines="1-${linesToRead}"` + } else { + // Read full file as normal + content = await extractTextFromFile(fullPath) + lineRangeAttr = ` lines="1-${totalLines}"` + } + let xmlInfo = totalLines > 0 ? 
`\n${content}\n` : `` if (totalLines === 0) { xmlInfo += `File is empty\n` + } else if (safeguardNotice) { + xmlInfo += safeguardNotice } // Track file read From 9fb73924e569926a31f69173451037195cd6c93d Mon Sep 17 00:00:00 2001 From: Roo Code Date: Thu, 24 Jul 2025 15:49:10 +0000 Subject: [PATCH 2/4] feat: use actual context window size and increase line threshold to 10K - Access actual context window size from Task object via cline.api.getModel().info.contextWindow - Increase LARGE_FILE_LINE_THRESHOLD from 1,000 to 10,000 lines - Calculate MAX_TOKEN_THRESHOLD as 50% of actual context window instead of hardcoded 50K - Update tests to reflect new thresholds and mock api.getModel() properly --- src/core/tools/__tests__/readFileTool.spec.ts | 144 +++++++++++++++--- src/core/tools/readFileTool.ts | 9 +- 2 files changed, 131 insertions(+), 22 deletions(-) diff --git a/src/core/tools/__tests__/readFileTool.spec.ts b/src/core/tools/__tests__/readFileTool.spec.ts index 58e3ead445c..cce86b33696 100644 --- a/src/core/tools/__tests__/readFileTool.spec.ts +++ b/src/core/tools/__tests__/readFileTool.spec.ts @@ -137,6 +137,15 @@ describe("read_file tool with maxReadFileLine setting", () => { mockCline.recordToolUsage = vi.fn().mockReturnValue(undefined) mockCline.recordToolError = vi.fn().mockReturnValue(undefined) + // Add default api mock + mockCline.api = { + getModel: vi.fn().mockReturnValue({ + info: { + contextWindow: 100000, + }, + }), + } + toolResult = undefined }) @@ -393,6 +402,15 @@ describe("read_file tool XML output structure", () => { mockCline.recordToolError = vi.fn().mockReturnValue(undefined) mockCline.didRejectTool = false + // Add default api mock + mockCline.api = { + getModel: vi.fn().mockReturnValue({ + info: { + contextWindow: 100000, + }, + }), + } + toolResult = undefined }) @@ -573,6 +591,15 @@ describe("read_file tool with large file safeguard", () => { mockCline.recordToolUsage = vi.fn().mockReturnValue(undefined) mockCline.recordToolError = 
vi.fn().mockReturnValue(undefined) + // Add default api mock + mockCline.api = { + getModel: vi.fn().mockReturnValue({ + info: { + contextWindow: 100000, + }, + }), + } + toolResult = undefined }) @@ -618,7 +645,7 @@ describe("read_file tool with large file safeguard", () => { describe("when file has many lines and high token count", () => { it("should apply safeguard and read only first 2000 lines", async () => { // Setup - large file with high token count - const largeFileContent = Array(1500).fill("This is a line of text").join("\n") + const largeFileContent = Array(15000).fill("This is a line of text").join("\n") const partialContent = Array(2000).fill("This is a line of text").join("\n") mockedExtractTextFromFile.mockResolvedValue(largeFileContent) @@ -630,12 +657,21 @@ describe("read_file tool with large file safeguard", () => { return lines.map((line: string, i: number) => `${i + 1} | ${line}`).join("\n") }) + // Mock the api.getModel() to return a model with context window + mockCline.api = { + getModel: vi.fn().mockReturnValue({ + info: { + contextWindow: 100000, + }, + }), + } + // Execute with high line count and token count const result = await executeReadFileTool( {}, { maxReadFileLine: -1, - totalLines: 1500, + totalLines: 15000, tokenCount: 60000, // Above threshold }, ) @@ -645,14 +681,14 @@ describe("read_file tool with large file safeguard", () => { expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0) // Verify the result contains the safeguard notice - expect(result).toContain("This file contains 1500 lines and approximately 60,000 tokens") + expect(result).toContain("This file contains 15000 lines and approximately 60,000 tokens") expect(result).toContain("Showing only the first 2000 lines to preserve context space") expect(result).toContain(``) }) it("should not apply safeguard when token count is below threshold", async () => { // Setup - large file but with low token count - const fileContent = 
Array(1500).fill("Short").join("\n") + const fileContent = Array(15000).fill("Short").join("\n") const numberedContent = fileContent .split("\n") .map((line, i) => `${i + 1} | ${line}`) @@ -660,12 +696,21 @@ describe("read_file tool with large file safeguard", () => { mockedExtractTextFromFile.mockImplementation(() => Promise.resolve(numberedContent)) + // Mock the api.getModel() to return a model with context window + mockCline.api = { + getModel: vi.fn().mockReturnValue({ + info: { + contextWindow: 100000, + }, + }), + } + // Execute with high line count but low token count const result = await executeReadFileTool( {}, { maxReadFileLine: -1, - totalLines: 1500, + totalLines: 15000, tokenCount: 30000, // Below threshold }, ) @@ -677,12 +722,12 @@ describe("read_file tool with large file safeguard", () => { // Verify no safeguard notice expect(result).not.toContain("preserve context space") - expect(result).toContain(``) + expect(result).toContain(``) }) - it("should not apply safeguard for files under 1000 lines", async () => { - // Setup - file with less than 1000 lines - const fileContent = Array(999).fill("This is a line of text").join("\n") + it("should not apply safeguard for files under 10000 lines", async () => { + // Setup - file with less than 10000 lines + const fileContent = Array(9999).fill("This is a line of text").join("\n") const numberedContent = fileContent .split("\n") .map((line, i) => `${i + 1} | ${line}`) @@ -690,12 +735,21 @@ describe("read_file tool with large file safeguard", () => { mockedExtractTextFromFile.mockImplementation(() => Promise.resolve(numberedContent)) + // Mock the api.getModel() to return a model with context window + mockCline.api = { + getModel: vi.fn().mockReturnValue({ + info: { + contextWindow: 100000, + }, + }), + } + // Execute const result = await executeReadFileTool( {}, { maxReadFileLine: -1, - totalLines: 999, + totalLines: 9999, tokenCount: 100000, // Even with high token count }, ) @@ -707,7 +761,7 @@ 
describe("read_file tool with large file safeguard", () => { // Verify no safeguard notice expect(result).not.toContain("preserve context space") - expect(result).toContain(``) + expect(result).toContain(``) }) it("should apply safeguard for very large files even if token counting fails", async () => { @@ -723,9 +777,18 @@ describe("read_file tool with large file safeguard", () => { return lines.map((line: string, i: number) => `${i + 1} | ${line}`).join("\n") }) + // Mock the api.getModel() to return a model with context window + mockCline.api = { + getModel: vi.fn().mockReturnValue({ + info: { + contextWindow: 100000, + }, + }), + } + // Set up the provider state mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 }) - mockedCountFileLines.mockResolvedValue(6000) + mockedCountFileLines.mockResolvedValue(60000) // IMPORTANT: Set up tiktoken to reject AFTER other mocks are set mockedTiktoken.mockRejectedValue(new Error("Token counting failed")) @@ -755,22 +818,31 @@ describe("read_file tool with large file safeguard", () => { expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0) // Verify the result contains the safeguard notice (without token count) - expect(toolResult).toContain("This file contains 6000 lines") + expect(toolResult).toContain("This file contains 60000 lines") expect(toolResult).toContain("Showing only the first 2000 lines to preserve context space") expect(toolResult).toContain(``) }) it("should not apply safeguard when maxReadFileLine is not -1", async () => { // Setup - const fileContent = Array(2000).fill("This is a line of text").join("\n") + const fileContent = Array(20000).fill("This is a line of text").join("\n") mockedExtractTextFromFile.mockResolvedValue(fileContent) + // Mock the api.getModel() to return a model with context window + mockCline.api = { + getModel: vi.fn().mockReturnValue({ + info: { + contextWindow: 100000, + }, + }), + } + // Execute with maxReadFileLine = 500 (not -1) const result = await 
executeReadFileTool( {}, { maxReadFileLine: 500, - totalLines: 2000, + totalLines: 20000, tokenCount: 100000, }, ) @@ -787,6 +859,15 @@ describe("read_file tool with large file safeguard", () => { const rangeContent = "Line 100\nLine 101\nLine 102" mockedReadLines.mockResolvedValue(rangeContent) + // Mock the api.getModel() to return a model with context window + mockCline.api = { + getModel: vi.fn().mockReturnValue({ + info: { + contextWindow: 100000, + }, + }), + } + const argsContent = `${testFilePath}100-102` const toolUse: ReadFileToolUse = { @@ -797,7 +878,7 @@ describe("read_file tool with large file safeguard", () => { } mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 }) - mockedCountFileLines.mockResolvedValue(10000) + mockedCountFileLines.mockResolvedValue(100000) await readFileTool( mockCline, @@ -819,16 +900,33 @@ describe("read_file tool with large file safeguard", () => { describe("safeguard thresholds", () => { it("should use correct thresholds for line count and token count", async () => { + // Mock the api.getModel() to return a model with context window + mockCline.api = { + getModel: vi.fn().mockReturnValue({ + info: { + contextWindow: 100000, + }, + }), + } + // Test boundary conditions // Just below line threshold - no token check - await executeReadFileTool({}, { totalLines: 1000, maxReadFileLine: -1 }) + await executeReadFileTool({}, { totalLines: 10000, maxReadFileLine: -1 }) expect(mockedTiktoken).not.toHaveBeenCalled() // Just above line threshold - token check performed vi.clearAllMocks() + // Re-mock the api.getModel() after clearAllMocks + mockCline.api = { + getModel: vi.fn().mockReturnValue({ + info: { + contextWindow: 100000, + }, + }), + } mockedExtractTextFromFile.mockResolvedValue("content") - await executeReadFileTool({}, { totalLines: 1001, maxReadFileLine: -1, tokenCount: 40000 }) + await executeReadFileTool({}, { totalLines: 10001, maxReadFileLine: -1, tokenCount: 40000 }) 
expect(mockedTiktoken).toHaveBeenCalled() // Token count just below threshold - no safeguard @@ -836,9 +934,17 @@ describe("read_file tool with large file safeguard", () => { // Token count just above threshold - safeguard applied vi.clearAllMocks() + // Re-mock the api.getModel() after clearAllMocks + mockCline.api = { + getModel: vi.fn().mockReturnValue({ + info: { + contextWindow: 100000, + }, + }), + } mockedExtractTextFromFile.mockResolvedValue("content") mockedReadLines.mockResolvedValue("partial content") - await executeReadFileTool({}, { totalLines: 1001, maxReadFileLine: -1, tokenCount: 50001 }) + await executeReadFileTool({}, { totalLines: 10001, maxReadFileLine: -1, tokenCount: 50001 }) expect(mockedReadLines).toHaveBeenCalled() expect(toolResult).toContain("preserve context space") }) diff --git a/src/core/tools/readFileTool.ts b/src/core/tools/readFileTool.ts index b9036351e02..e024ca632a7 100644 --- a/src/core/tools/readFileTool.ts +++ b/src/core/tools/readFileTool.ts @@ -519,10 +519,13 @@ export async function readFileTool( // Handle normal file read with safeguard for large files // Define thresholds for the safeguard - const LARGE_FILE_LINE_THRESHOLD = 1000 // Consider files with more than 1000 lines as "large" - const MAX_TOKEN_THRESHOLD = 50000 // ~50% of a typical 100k context window + const LARGE_FILE_LINE_THRESHOLD = 10000 // Consider files with more than 10000 lines as "large" const FALLBACK_MAX_LINES = 2000 // Default number of lines to read when applying safeguard + // Get the actual context window size from the model + const contextWindow = cline.api.getModel().info.contextWindow || 100000 // Default to 100k if not available + const MAX_TOKEN_THRESHOLD = Math.floor(contextWindow * 0.5) // Use 50% of the actual context window + // Check if we should apply the safeguard let shouldApplySafeguard = false let safeguardNotice = "" @@ -544,7 +547,7 @@ export async function readFileTool( // If token counting fails, apply safeguard based on line 
count alone console.warn(`Failed to count tokens for large file ${relPath}:`, error) if (totalLines > LARGE_FILE_LINE_THRESHOLD * 5) { - // For very large files (>5000 lines), apply safeguard anyway + // For very large files (>50000 lines), apply safeguard anyway shouldApplySafeguard = true linesToRead = FALLBACK_MAX_LINES safeguardNotice = `This file contains ${totalLines} lines, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. Use line_range if you need to read specific sections.\n` From 89790bc968d9c883d49c620843f4a0b9d40234c4 Mon Sep 17 00:00:00 2001 From: Roo Code Date: Thu, 24 Jul 2025 16:05:29 +0000 Subject: [PATCH 3/4] refactor: use file size instead of line count for large file detection - Replace line count threshold with file size threshold (100KB) - Files larger than 100KB trigger token count check - Files larger than 1MB automatically apply safeguard if token counting fails - Update tests to reflect new file size-based approach - This better handles files with large amounts of content on single lines As requested by @cte in PR comment --- src/core/tools/__tests__/readFileTool.spec.ts | 50 ++++++++++++------- src/core/tools/readFileTool.ts | 49 ++++++++++-------- 2 files changed, 59 insertions(+), 40 deletions(-) diff --git a/src/core/tools/__tests__/readFileTool.spec.ts b/src/core/tools/__tests__/readFileTool.spec.ts index cce86b33696..61701c93aef 100644 --- a/src/core/tools/__tests__/readFileTool.spec.ts +++ b/src/core/tools/__tests__/readFileTool.spec.ts @@ -1,6 +1,7 @@ // npx vitest src/core/tools/__tests__/readFileTool.spec.ts import * as path from "path" +import { stat } from "fs/promises" import { countFileLines } from "../../../integrations/misc/line-counter" import { readLines } from "../../../integrations/misc/read-lines" @@ -25,6 +26,7 @@ vi.mock("fs/promises", () => ({ mkdir: vi.fn().mockResolvedValue(undefined), writeFile: 
vi.fn().mockResolvedValue(undefined), readFile: vi.fn().mockResolvedValue("{}"), + stat: vi.fn().mockResolvedValue({ size: 1024 }), // Default 1KB file })) vi.mock("isbinaryfile") @@ -561,6 +563,7 @@ describe("read_file tool with large file safeguard", () => { const mockedIsBinaryFile = vi.mocked(isBinaryFile) const mockedPathResolve = vi.mocked(path.resolve) const mockedTiktoken = vi.mocked(tiktoken) + const mockedStat = vi.mocked(stat) const mockCline: any = {} let mockProvider: any @@ -609,15 +612,18 @@ describe("read_file tool with large file safeguard", () => { maxReadFileLine?: number totalLines?: number tokenCount?: number + fileSize?: number } = {}, ): Promise { const maxReadFileLine = options.maxReadFileLine ?? -1 const totalLines = options.totalLines ?? 5 const tokenCount = options.tokenCount ?? 100 + const fileSize = options.fileSize ?? 1024 // Default 1KB mockProvider.getState.mockResolvedValue({ maxReadFileLine }) mockedCountFileLines.mockResolvedValue(totalLines) mockedTiktoken.mockResolvedValue(tokenCount) + mockedStat.mockResolvedValue({ size: fileSize } as any) const argsContent = `${testFilePath}` @@ -642,7 +648,7 @@ describe("read_file tool with large file safeguard", () => { return toolResult } - describe("when file has many lines and high token count", () => { + describe("when file has large size and high token count", () => { it("should apply safeguard and read only first 2000 lines", async () => { // Setup - large file with high token count const largeFileContent = Array(15000).fill("This is a line of text").join("\n") @@ -666,13 +672,14 @@ describe("read_file tool with large file safeguard", () => { }), } - // Execute with high line count and token count + // Execute with large file size and high token count const result = await executeReadFileTool( {}, { maxReadFileLine: -1, totalLines: 15000, tokenCount: 60000, // Above threshold + fileSize: 200 * 1024, // 200KB - above threshold }, ) @@ -681,7 +688,7 @@ describe("read_file tool with large 
file safeguard", () => { expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0) // Verify the result contains the safeguard notice - expect(result).toContain("This file contains 15000 lines and approximately 60,000 tokens") + expect(result).toContain("This file is 200KB and contains approximately 60,000 tokens") expect(result).toContain("Showing only the first 2000 lines to preserve context space") expect(result).toContain(``) }) @@ -705,13 +712,14 @@ describe("read_file tool with large file safeguard", () => { }), } - // Execute with high line count but low token count + // Execute with large file size but low token count const result = await executeReadFileTool( {}, { maxReadFileLine: -1, totalLines: 15000, tokenCount: 30000, // Below threshold + fileSize: 200 * 1024, // 200KB - above threshold }, ) @@ -725,9 +733,9 @@ describe("read_file tool with large file safeguard", () => { expect(result).toContain(``) }) - it("should not apply safeguard for files under 10000 lines", async () => { - // Setup - file with less than 10000 lines - const fileContent = Array(9999).fill("This is a line of text").join("\n") + it("should not apply safeguard for small files", async () => { + // Setup - small file + const fileContent = Array(999).fill("This is a line of text").join("\n") const numberedContent = fileContent .split("\n") .map((line, i) => `${i + 1} | ${line}`) @@ -744,13 +752,14 @@ describe("read_file tool with large file safeguard", () => { }), } - // Execute + // Execute with small file size const result = await executeReadFileTool( {}, { maxReadFileLine: -1, - totalLines: 9999, + totalLines: 999, tokenCount: 100000, // Even with high token count + fileSize: 50 * 1024, // 50KB - below threshold }, ) @@ -761,7 +770,7 @@ describe("read_file tool with large file safeguard", () => { // Verify no safeguard notice expect(result).not.toContain("preserve context space") - expect(result).toContain(``) + expect(result).toContain(``) }) it("should apply safeguard 
for very large files even if token counting fails", async () => { @@ -788,7 +797,8 @@ describe("read_file tool with large file safeguard", () => { // Set up the provider state mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 }) - mockedCountFileLines.mockResolvedValue(60000) + mockedCountFileLines.mockResolvedValue(6000) + mockedStat.mockResolvedValue({ size: 2 * 1024 * 1024 } as any) // 2MB file // IMPORTANT: Set up tiktoken to reject AFTER other mocks are set mockedTiktoken.mockRejectedValue(new Error("Token counting failed")) @@ -818,7 +828,7 @@ describe("read_file tool with large file safeguard", () => { expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0) // Verify the result contains the safeguard notice (without token count) - expect(toolResult).toContain("This file contains 60000 lines") + expect(toolResult).toContain("This file is 2048KB") expect(toolResult).toContain("Showing only the first 2000 lines to preserve context space") expect(toolResult).toContain(``) }) @@ -844,6 +854,7 @@ describe("read_file tool with large file safeguard", () => { maxReadFileLine: 500, totalLines: 20000, tokenCount: 100000, + fileSize: 2 * 1024 * 1024, // 2MB }, ) @@ -878,7 +889,8 @@ describe("read_file tool with large file safeguard", () => { } mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 }) - mockedCountFileLines.mockResolvedValue(100000) + mockedCountFileLines.mockResolvedValue(10000) + mockedStat.mockResolvedValue({ size: 10 * 1024 * 1024 } as any) // 10MB file await readFileTool( mockCline, @@ -899,7 +911,7 @@ describe("read_file tool with large file safeguard", () => { }) describe("safeguard thresholds", () => { - it("should use correct thresholds for line count and token count", async () => { + it("should use correct thresholds for file size and token count", async () => { // Mock the api.getModel() to return a model with context window mockCline.api = { getModel: vi.fn().mockReturnValue({ @@ -911,11 +923,11 @@ 
describe("read_file tool with large file safeguard", () => { // Test boundary conditions - // Just below line threshold - no token check - await executeReadFileTool({}, { totalLines: 10000, maxReadFileLine: -1 }) + // Just below size threshold - no token check + await executeReadFileTool({}, { fileSize: 100 * 1024 - 1, maxReadFileLine: -1 }) // Just under 100KB expect(mockedTiktoken).not.toHaveBeenCalled() - // Just above line threshold - token check performed + // Just above size threshold - token check performed vi.clearAllMocks() // Re-mock the api.getModel() after clearAllMocks mockCline.api = { @@ -926,7 +938,7 @@ describe("read_file tool with large file safeguard", () => { }), } mockedExtractTextFromFile.mockResolvedValue("content") - await executeReadFileTool({}, { totalLines: 10001, maxReadFileLine: -1, tokenCount: 40000 }) + await executeReadFileTool({}, { fileSize: 100 * 1024 + 1, maxReadFileLine: -1, tokenCount: 40000 }) // Just over 100KB expect(mockedTiktoken).toHaveBeenCalled() // Token count just below threshold - no safeguard @@ -944,7 +956,7 @@ describe("read_file tool with large file safeguard", () => { } mockedExtractTextFromFile.mockResolvedValue("content") mockedReadLines.mockResolvedValue("partial content") - await executeReadFileTool({}, { totalLines: 10001, maxReadFileLine: -1, tokenCount: 50001 }) + await executeReadFileTool({}, { fileSize: 100 * 1024 + 1, maxReadFileLine: -1, tokenCount: 50001 }) expect(mockedReadLines).toHaveBeenCalled() expect(toolResult).toContain("preserve context space") }) diff --git a/src/core/tools/readFileTool.ts b/src/core/tools/readFileTool.ts index e024ca632a7..4117a9d8eeb 100644 --- a/src/core/tools/readFileTool.ts +++ b/src/core/tools/readFileTool.ts @@ -1,5 +1,6 @@ import path from "path" import { isBinaryFile } from "isbinaryfile" +import { stat } from "fs/promises" import { Task } from "../task/Task" import { ClineSayTool } from "../../shared/ExtensionMessage" @@ -519,7 +520,8 @@ export async function 
readFileTool( // Handle normal file read with safeguard for large files // Define thresholds for the safeguard - const LARGE_FILE_LINE_THRESHOLD = 10000 // Consider files with more than 10000 lines as "large" + const LARGE_FILE_SIZE_THRESHOLD = 100 * 1024 // 100KB - files larger than this will be checked for token count + const VERY_LARGE_FILE_SIZE = 1024 * 1024 // 1MB - apply safeguard automatically const FALLBACK_MAX_LINES = 2000 // Default number of lines to read when applying safeguard // Get the actual context window size from the model @@ -531,26 +533,31 @@ export async function readFileTool( let safeguardNotice = "" let linesToRead = totalLines - if (maxReadFileLine === -1 && totalLines > LARGE_FILE_LINE_THRESHOLD) { - // File has many lines and we're trying to read the full file - // Perform token count check - try { - const fullContent = await extractTextFromFile(fullPath) - const tokenCount = await tiktoken([{ type: "text", text: fullContent }]) - - if (tokenCount > MAX_TOKEN_THRESHOLD) { - shouldApplySafeguard = true - linesToRead = FALLBACK_MAX_LINES - safeguardNotice = `This file contains ${totalLines} lines and approximately ${tokenCount.toLocaleString()} tokens, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. Use line_range if you need to read specific sections.\n` - } - } catch (error) { - // If token counting fails, apply safeguard based on line count alone - console.warn(`Failed to count tokens for large file ${relPath}:`, error) - if (totalLines > LARGE_FILE_LINE_THRESHOLD * 5) { - // For very large files (>50000 lines), apply safeguard anyway - shouldApplySafeguard = true - linesToRead = FALLBACK_MAX_LINES - safeguardNotice = `This file contains ${totalLines} lines, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. 
Use line_range if you need to read specific sections.\n` + if (maxReadFileLine === -1) { + // Get file size + const fileStats = await stat(fullPath) + const fileSizeKB = Math.round(fileStats.size / 1024) + + if (fileStats.size > LARGE_FILE_SIZE_THRESHOLD) { + // File is large enough to warrant token count check + try { + const fullContent = await extractTextFromFile(fullPath) + const tokenCount = await tiktoken([{ type: "text", text: fullContent }]) + + if (tokenCount > MAX_TOKEN_THRESHOLD) { + shouldApplySafeguard = true + linesToRead = FALLBACK_MAX_LINES + safeguardNotice = `This file is ${fileSizeKB}KB and contains approximately ${tokenCount.toLocaleString()} tokens, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. Use line_range if you need to read specific sections.\n` + } + } catch (error) { + // If token counting fails, apply safeguard based on file size alone + console.warn(`Failed to count tokens for large file ${relPath}:`, error) + if (fileStats.size > VERY_LARGE_FILE_SIZE) { + // For very large files (>1MB), apply safeguard anyway + shouldApplySafeguard = true + linesToRead = FALLBACK_MAX_LINES + safeguardNotice = `This file is ${fileSizeKB}KB, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. 
Use line_range if you need to read specific sections.\n` + } } } } From f52f374fd4c7515622a0e4dda5001671f29acd38 Mon Sep 17 00:00:00 2001 From: Daniel Riccio Date: Mon, 28 Jul 2025 17:44:10 -0500 Subject: [PATCH 4/4] feat: improve readFileTool safeguard with character-based limiting and i18n - Replace line-based limiting with character-based limiting to handle files with very long lines - Move threshold constants to packages/types/src/file-limits.ts for better organization - Add readLinesWithCharLimit function that truncates at complete line boundaries - Optimize file reading to avoid double reads when checking token count - Add i18n support for safeguard notice messages in all 18 supported languages - Update tests to match new character-based implementation - Safeguard now limits by character count (200,000 characters by default) instead of line count - Ensures files are never truncated in the middle of a line --- packages/types/src/file-limits.ts | 39 +++ packages/types/src/index.ts | 1 + src/core/tools/__tests__/readFileTool.spec.ts | 56 +++-- src/core/tools/readFileTool.ts | 60 +++-- src/i18n/locales/ca/tools.json | 3 +- src/i18n/locales/de/tools.json | 3 +- src/i18n/locales/en/tools.json | 3 +- src/i18n/locales/es/tools.json | 3 +- src/i18n/locales/fr/tools.json | 3 +- src/i18n/locales/hi/tools.json | 3 +- src/i18n/locales/id/tools.json | 3 +- src/i18n/locales/it/tools.json | 3 +- src/i18n/locales/ja/tools.json | 3 +- src/i18n/locales/ko/tools.json | 3 +- src/i18n/locales/nl/tools.json | 3 +- src/i18n/locales/pl/tools.json | 3 +- src/i18n/locales/pt-BR/tools.json | 3 +- src/i18n/locales/ru/tools.json | 3 +- src/i18n/locales/tr/tools.json | 3 +- src/i18n/locales/vi/tools.json | 3 +- src/i18n/locales/zh-CN/tools.json | 3 +- src/i18n/locales/zh-TW/tools.json | 3 +- .../__tests__/read-lines-char-limit.spec.ts | 224 ++++++++++++++++++ .../misc/read-lines-char-limit.ts | 117 +++++++++ 24 files changed, 501 insertions(+), 50 deletions(-) create mode 100644 
packages/types/src/file-limits.ts create mode 100644 src/integrations/misc/__tests__/read-lines-char-limit.spec.ts create mode 100644 src/integrations/misc/read-lines-char-limit.ts diff --git a/packages/types/src/file-limits.ts b/packages/types/src/file-limits.ts new file mode 100644 index 00000000000..c215148c949 --- /dev/null +++ b/packages/types/src/file-limits.ts @@ -0,0 +1,39 @@ +/** + * File size and limit constants used across the application + */ + +/** + * Files larger than this threshold will be checked for token count + * to prevent consuming too much of the context window + */ +export const LARGE_FILE_SIZE_THRESHOLD = 100 * 1024 // 100KB + +/** + * If token counting fails, files larger than this size have the safeguard + * applied automatically based on file size alone + */ +export const VERY_LARGE_FILE_SIZE = 1024 * 1024 // 1MB + +/** + * Default number of lines to read when applying the large file safeguard + */ +export const FALLBACK_MAX_LINES = 2000 + +/** + * Maximum character count for file reading when safeguard is applied. + * Based on typical token-to-character ratio (1 token ≈ 4 characters), + * this ensures we don't consume too much of the context window. + * For a 100k token context window at 50%, this would be ~200k characters. 
+ */ +export const MAX_CHAR_LIMIT = 200_000 // 200k characters + +/** + * Percentage of the context window to use as the maximum token threshold + * for file reading operations + */ +export const CONTEXT_WINDOW_PERCENTAGE = 0.5 // 50% + +/** + * Average characters per token ratio used for estimation + */ +export const CHARS_PER_TOKEN_RATIO = 4 diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts index 44937da235b..952ad1fd0f4 100644 --- a/packages/types/src/index.ts +++ b/packages/types/src/index.ts @@ -4,6 +4,7 @@ export * from "./api.js" export * from "./codebase-index.js" export * from "./cloud.js" export * from "./experiment.js" +export * from "./file-limits.js" export * from "./followup.js" export * from "./global-settings.js" export * from "./history.js" diff --git a/src/core/tools/__tests__/readFileTool.spec.ts b/src/core/tools/__tests__/readFileTool.spec.ts index 61701c93aef..5f7a7a74d48 100644 --- a/src/core/tools/__tests__/readFileTool.spec.ts +++ b/src/core/tools/__tests__/readFileTool.spec.ts @@ -5,6 +5,7 @@ import { stat } from "fs/promises" import { countFileLines } from "../../../integrations/misc/line-counter" import { readLines } from "../../../integrations/misc/read-lines" +import { readLinesWithCharLimit } from "../../../integrations/misc/read-lines-char-limit" import { extractTextFromFile } from "../../../integrations/misc/extract-text" import { parseSourceCodeDefinitionsForFile } from "../../../services/tree-sitter" import { isBinaryFile } from "isbinaryfile" @@ -33,6 +34,7 @@ vi.mock("isbinaryfile") vi.mock("../../../integrations/misc/line-counter") vi.mock("../../../integrations/misc/read-lines") +vi.mock("../../../integrations/misc/read-lines-char-limit") // Mock input content for tests let mockInputContent = "" @@ -655,7 +657,15 @@ describe("read_file tool with large file safeguard", () => { const partialContent = Array(2000).fill("This is a line of text").join("\n") 
mockedExtractTextFromFile.mockResolvedValue(largeFileContent) - mockedReadLines.mockResolvedValue(partialContent) + + // Mock readLinesWithCharLimit + const mockedReadLinesWithCharLimit = vi.mocked(readLinesWithCharLimit) + mockedReadLinesWithCharLimit.mockResolvedValue({ + content: partialContent, + linesRead: 2000, + charactersRead: partialContent.length, + wasTruncated: true, + }) // Setup addLineNumbers mock for this test addLineNumbersMock.mockImplementation((text: string) => { @@ -685,11 +695,10 @@ describe("read_file tool with large file safeguard", () => { // Verify safeguard was applied expect(mockedTiktoken).toHaveBeenCalled() - expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0) + expect(mockedReadLinesWithCharLimit).toHaveBeenCalled() // Verify the result contains the safeguard notice - expect(result).toContain("This file is 200KB and contains approximately 60,000 tokens") - expect(result).toContain("Showing only the first 2000 lines to preserve context space") + expect(result).toContain("readFile.safeguardNotice") expect(result).toContain(``) }) @@ -725,7 +734,8 @@ describe("read_file tool with large file safeguard", () => { // Verify safeguard was NOT applied expect(mockedTiktoken).toHaveBeenCalled() - expect(mockedReadLines).not.toHaveBeenCalled() + const mockedReadLinesWithCharLimit = vi.mocked(readLinesWithCharLimit) + expect(mockedReadLinesWithCharLimit).not.toHaveBeenCalled() expect(mockedExtractTextFromFile).toHaveBeenCalled() // Verify no safeguard notice @@ -765,7 +775,8 @@ describe("read_file tool with large file safeguard", () => { // Verify tiktoken was NOT called (optimization) expect(mockedTiktoken).not.toHaveBeenCalled() - expect(mockedReadLines).not.toHaveBeenCalled() + const mockedReadLinesWithCharLimit = vi.mocked(readLinesWithCharLimit) + expect(mockedReadLinesWithCharLimit).not.toHaveBeenCalled() expect(mockedExtractTextFromFile).toHaveBeenCalled() // Verify no safeguard notice @@ -778,7 +789,15 @@ 
describe("read_file tool with large file safeguard", () => { const partialContent = Array(2000).fill("This is a line of text").join("\n") mockedExtractTextFromFile.mockResolvedValue("Large content") - mockedReadLines.mockResolvedValue(partialContent) + + // Mock readLinesWithCharLimit + const mockedReadLinesWithCharLimit = vi.mocked(readLinesWithCharLimit) + mockedReadLinesWithCharLimit.mockResolvedValue({ + content: partialContent, + linesRead: 2000, + charactersRead: partialContent.length, + wasTruncated: true, + }) // Setup addLineNumbers mock for partial content addLineNumbersMock.mockImplementation((text: string) => { @@ -825,11 +844,10 @@ describe("read_file tool with large file safeguard", () => { // Verify safeguard was applied despite token counting failure expect(mockedTiktoken).toHaveBeenCalled() - expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0) + expect(mockedReadLinesWithCharLimit).toHaveBeenCalled() - // Verify the result contains the safeguard notice (without token count) - expect(toolResult).toContain("This file is 2048KB") - expect(toolResult).toContain("Showing only the first 2000 lines to preserve context space") + // Verify the result contains the safeguard notice + expect(toolResult).toContain("readFile.safeguardNotice") expect(toolResult).toContain(``) }) @@ -861,8 +879,10 @@ describe("read_file tool with large file safeguard", () => { // Verify tiktoken was NOT called expect(mockedTiktoken).not.toHaveBeenCalled() - // The normal maxReadFileLine logic should apply + // The normal maxReadFileLine logic should apply (using readLines, not readLinesWithCharLimit) expect(mockedReadLines).toHaveBeenCalled() + const mockedReadLinesWithCharLimit = vi.mocked(readLinesWithCharLimit) + expect(mockedReadLinesWithCharLimit).not.toHaveBeenCalled() }) it("should handle line ranges correctly with safeguard", async () => { @@ -955,10 +975,16 @@ describe("read_file tool with large file safeguard", () => { }), } 
mockedExtractTextFromFile.mockResolvedValue("content") - mockedReadLines.mockResolvedValue("partial content") + const mockedReadLinesWithCharLimit = vi.mocked(readLinesWithCharLimit) + mockedReadLinesWithCharLimit.mockResolvedValue({ + content: "partial content", + linesRead: 2000, + charactersRead: 50000, + wasTruncated: true, + }) await executeReadFileTool({}, { fileSize: 100 * 1024 + 1, maxReadFileLine: -1, tokenCount: 50001 }) - expect(mockedReadLines).toHaveBeenCalled() - expect(toolResult).toContain("preserve context space") + expect(mockedReadLinesWithCharLimit).toHaveBeenCalled() + expect(toolResult).toContain("readFile.safeguardNotice") }) }) }) diff --git a/src/core/tools/readFileTool.ts b/src/core/tools/readFileTool.ts index 4117a9d8eeb..dc7328c24f7 100644 --- a/src/core/tools/readFileTool.ts +++ b/src/core/tools/readFileTool.ts @@ -16,6 +16,15 @@ import { extractTextFromFile, addLineNumbers, getSupportedBinaryFormats } from " import { parseSourceCodeDefinitionsForFile } from "../../services/tree-sitter" import { parseXml } from "../../utils/xml" import { tiktoken } from "../../utils/tiktoken" +import { + LARGE_FILE_SIZE_THRESHOLD, + VERY_LARGE_FILE_SIZE, + FALLBACK_MAX_LINES, + CONTEXT_WINDOW_PERCENTAGE, + MAX_CHAR_LIMIT, + CHARS_PER_TOKEN_RATIO, +} from "@roo-code/types" +import { readLinesWithCharLimit } from "../../integrations/misc/read-lines-char-limit" export function getReadFileToolDescription(blockName: string, blockParams: any): string { // Handle both single path and multiple files via args @@ -519,19 +528,16 @@ export async function readFileTool( } // Handle normal file read with safeguard for large files - // Define thresholds for the safeguard - const LARGE_FILE_SIZE_THRESHOLD = 100 * 1024 // 100KB - files larger than this will be checked for token count - const VERY_LARGE_FILE_SIZE = 1024 * 1024 // 1MB - apply safeguard automatically - const FALLBACK_MAX_LINES = 2000 // Default number of lines to read when applying safeguard - // Get the 
actual context window size from the model const contextWindow = cline.api.getModel().info.contextWindow || 100000 // Default to 100k if not available - const MAX_TOKEN_THRESHOLD = Math.floor(contextWindow * 0.5) // Use 50% of the actual context window + const MAX_TOKEN_THRESHOLD = Math.floor(contextWindow * CONTEXT_WINDOW_PERCENTAGE) + const MAX_CHAR_THRESHOLD = MAX_TOKEN_THRESHOLD * CHARS_PER_TOKEN_RATIO // Check if we should apply the safeguard let shouldApplySafeguard = false let safeguardNotice = "" - let linesToRead = totalLines + let fullContent: string | null = null + let actualLinesRead = totalLines if (maxReadFileLine === -1) { // Get file size @@ -541,22 +547,22 @@ export async function readFileTool( if (fileStats.size > LARGE_FILE_SIZE_THRESHOLD) { // File is large enough to warrant token count check try { - const fullContent = await extractTextFromFile(fullPath) + // Read the full content once + fullContent = await extractTextFromFile(fullPath) const tokenCount = await tiktoken([{ type: "text", text: fullContent }]) if (tokenCount > MAX_TOKEN_THRESHOLD) { shouldApplySafeguard = true - linesToRead = FALLBACK_MAX_LINES - safeguardNotice = `This file is ${fileSizeKB}KB and contains approximately ${tokenCount.toLocaleString()} tokens, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. 
Use line_range if you need to read specific sections.\n` + // Clear fullContent to avoid using it when we need partial content + fullContent = null } + // If tokenCount <= MAX_TOKEN_THRESHOLD, we keep fullContent to reuse it } catch (error) { // If token counting fails, apply safeguard based on file size alone console.warn(`Failed to count tokens for large file ${relPath}:`, error) if (fileStats.size > VERY_LARGE_FILE_SIZE) { // For very large files (>1MB), apply safeguard anyway shouldApplySafeguard = true - linesToRead = FALLBACK_MAX_LINES - safeguardNotice = `This file is ${fileSizeKB}KB, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. Use line_range if you need to read specific sections.\n` } } } @@ -566,12 +572,32 @@ export async function readFileTool( let lineRangeAttr: string if (shouldApplySafeguard) { - // Read partial file with safeguard - content = addLineNumbers(await readLines(fullPath, linesToRead - 1, 0)) - lineRangeAttr = ` lines="1-${linesToRead}"` + // Read partial file with character-based safeguard + // Use the smaller of MAX_CHAR_LIMIT or the calculated character threshold + const charLimit = Math.min(MAX_CHAR_LIMIT, MAX_CHAR_THRESHOLD) + const result = await readLinesWithCharLimit(fullPath, charLimit) + + content = addLineNumbers(result.content, 1) + actualLinesRead = result.linesRead + lineRangeAttr = ` lines="1-${actualLinesRead}"` + + const fileStats = await stat(fullPath) + const fileSizeKB = Math.round(fileStats.size / 1024) + + if (result.wasTruncated) { + safeguardNotice = `${t("tools:readFile.safeguardNotice", { + fileSizeKB, + actualLinesRead, + charactersRead: result.charactersRead.toLocaleString(), + })}\n` + } } else { - // Read full file as normal - content = await extractTextFromFile(fullPath) + // Read full file - reuse fullContent if we already have it + if (fullContent !== null) { + content = fullContent + } else { + content = await 
extractTextFromFile(fullPath) + } lineRangeAttr = ` lines="1-${totalLines}"` } diff --git a/src/i18n/locales/ca/tools.json b/src/i18n/locales/ca/tools.json index 5b3a228bdec..2916c755022 100644 --- a/src/i18n/locales/ca/tools.json +++ b/src/i18n/locales/ca/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (línies {{start}}-{{end}})", "definitionsOnly": " (només definicions)", - "maxLines": " (màxim {{max}} línies)" + "maxLines": " (màxim {{max}} línies)", + "safeguardNotice": "Aquest fitxer té {{fileSizeKB}}KB i consumiria una part significativa de la finestra de context. Mostrant només les primeres {{actualLinesRead}} línies completes ({{charactersRead}} caràcters) per preservar l'espai de context. Utilitza line_range si necessites llegir seccions específiques." }, "toolRepetitionLimitReached": "Roo sembla estar atrapat en un bucle, intentant la mateixa acció ({{toolName}}) repetidament. Això podria indicar un problema amb la seva estratègia actual. Considera reformular la tasca, proporcionar instruccions més específiques o guiar-lo cap a un enfocament diferent.", "codebaseSearch": { diff --git a/src/i18n/locales/de/tools.json b/src/i18n/locales/de/tools.json index eb1afbc0821..eebc218397b 100644 --- a/src/i18n/locales/de/tools.json +++ b/src/i18n/locales/de/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (Zeilen {{start}}-{{end}})", "definitionsOnly": " (nur Definitionen)", - "maxLines": " (maximal {{max}} Zeilen)" + "maxLines": " (maximal {{max}} Zeilen)", + "safeguardNotice": "Diese Datei ist {{fileSizeKB}}KB groß und würde einen erheblichen Teil des Kontextfensters verbrauchen. Es werden nur die ersten {{actualLinesRead}} vollständigen Zeilen ({{charactersRead}} Zeichen) angezeigt, um Kontextplatz zu sparen. Verwenden Sie line_range, wenn Sie bestimmte Abschnitte lesen müssen." }, "toolRepetitionLimitReached": "Roo scheint in einer Schleife festzustecken und versucht wiederholt dieselbe Aktion ({{toolName}}). 
Dies könnte auf ein Problem mit der aktuellen Strategie hindeuten. Überlege dir, die Aufgabe umzuformulieren, genauere Anweisungen zu geben oder Roo zu einem anderen Ansatz zu führen.", "codebaseSearch": { diff --git a/src/i18n/locales/en/tools.json b/src/i18n/locales/en/tools.json index 0265a843985..e6a99407948 100644 --- a/src/i18n/locales/en/tools.json +++ b/src/i18n/locales/en/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (lines {{start}}-{{end}})", "definitionsOnly": " (definitions only)", - "maxLines": " (max {{max}} lines)" + "maxLines": " (max {{max}} lines)", + "safeguardNotice": "This file is {{fileSizeKB}}KB and would consume a significant portion of the context window. Showing only the first {{actualLinesRead}} complete lines ({{charactersRead}} characters) to preserve context space. Use line_range if you need to read specific sections." }, "toolRepetitionLimitReached": "Roo appears to be stuck in a loop, attempting the same action ({{toolName}}) repeatedly. This might indicate a problem with its current strategy. Consider rephrasing the task, providing more specific instructions, or guiding it towards a different approach.", "codebaseSearch": { diff --git a/src/i18n/locales/es/tools.json b/src/i18n/locales/es/tools.json index 303f5365ed0..a01df99e300 100644 --- a/src/i18n/locales/es/tools.json +++ b/src/i18n/locales/es/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (líneas {{start}}-{{end}})", "definitionsOnly": " (solo definiciones)", - "maxLines": " (máximo {{max}} líneas)" + "maxLines": " (máximo {{max}} líneas)", + "safeguardNotice": "Este archivo tiene {{fileSizeKB}}KB y consumiría una parte significativa de la ventana de contexto. Mostrando solo las primeras {{actualLinesRead}} líneas completas ({{charactersRead}} caracteres) para preservar el espacio de contexto. Usa line_range si necesitas leer secciones específicas." 
}, "toolRepetitionLimitReached": "Roo parece estar atrapado en un bucle, intentando la misma acción ({{toolName}}) repetidamente. Esto podría indicar un problema con su estrategia actual. Considera reformular la tarea, proporcionar instrucciones más específicas o guiarlo hacia un enfoque diferente.", "codebaseSearch": { diff --git a/src/i18n/locales/fr/tools.json b/src/i18n/locales/fr/tools.json index a6c71aca333..113a7cdb27d 100644 --- a/src/i18n/locales/fr/tools.json +++ b/src/i18n/locales/fr/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (lignes {{start}}-{{end}})", "definitionsOnly": " (définitions uniquement)", - "maxLines": " (max {{max}} lignes)" + "maxLines": " (max {{max}} lignes)", + "safeguardNotice": "Ce fichier fait {{fileSizeKB}}KB et consommerait une partie importante de la fenêtre de contexte. Affichage uniquement des {{actualLinesRead}} premières lignes complètes ({{charactersRead}} caractères) pour préserver l'espace de contexte. Utilisez line_range si vous devez lire des sections spécifiques." }, "toolRepetitionLimitReached": "Roo semble être bloqué dans une boucle, tentant la même action ({{toolName}}) de façon répétée. Cela pourrait indiquer un problème avec sa stratégie actuelle. 
Envisage de reformuler la tâche, de fournir des instructions plus spécifiques ou de le guider vers une approche différente.", "codebaseSearch": { diff --git a/src/i18n/locales/hi/tools.json b/src/i18n/locales/hi/tools.json index 0cb4aeb14ec..f5260b6f542 100644 --- a/src/i18n/locales/hi/tools.json +++ b/src/i18n/locales/hi/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (पंक्तियाँ {{start}}-{{end}})", "definitionsOnly": " (केवल परिभाषाएँ)", - "maxLines": " (अधिकतम {{max}} पंक्तियाँ)" + "maxLines": " (अधिकतम {{max}} पंक्तियाँ)", + "safeguardNotice": "यह फ़ाइल {{fileSizeKB}}KB की है और संदर्भ विंडो का एक महत्वपूर्ण हिस्सा उपभोग करेगी। संदर्भ स्थान को संरक्षित करने के लिए केवल पहली {{actualLinesRead}} पूर्ण पंक्तियाँ ({{charactersRead}} वर्ण) दिखाई जा रही हैं। यदि आपको विशिष्ट अनुभाग पढ़ने की आवश्यकता है तो line_range का उपयोग करें।" }, "toolRepetitionLimitReached": "Roo एक लूप में फंसा हुआ लगता है, बार-बार एक ही क्रिया ({{toolName}}) को दोहरा रहा है। यह उसकी वर्तमान रणनीति में किसी समस्या का संकेत हो सकता है। कार्य को पुनः परिभाषित करने, अधिक विशिष्ट निर्देश देने, या उसे एक अलग दृष्टिकोण की ओर मार्गदर्शित करने पर विचार करें।", "codebaseSearch": { diff --git a/src/i18n/locales/id/tools.json b/src/i18n/locales/id/tools.json index 2e3c4f0c22e..3941d295c8b 100644 --- a/src/i18n/locales/id/tools.json +++ b/src/i18n/locales/id/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (baris {{start}}-{{end}})", "definitionsOnly": " (hanya definisi)", - "maxLines": " (maks {{max}} baris)" + "maxLines": " (maks {{max}} baris)", + "safeguardNotice": "File ini berukuran {{fileSizeKB}}KB dan akan menggunakan sebagian besar jendela konteks. Hanya menampilkan {{actualLinesRead}} baris lengkap pertama ({{charactersRead}} karakter) untuk menjaga ruang konteks. Gunakan line_range jika Anda perlu membaca bagian tertentu." }, "toolRepetitionLimitReached": "Roo tampaknya terjebak dalam loop, mencoba aksi yang sama ({{toolName}}) berulang kali. 
Ini mungkin menunjukkan masalah dengan strategi saat ini. Pertimbangkan untuk mengubah frasa tugas, memberikan instruksi yang lebih spesifik, atau mengarahkannya ke pendekatan yang berbeda.", "codebaseSearch": { diff --git a/src/i18n/locales/it/tools.json b/src/i18n/locales/it/tools.json index ffae474f1db..f0cbfc5571f 100644 --- a/src/i18n/locales/it/tools.json +++ b/src/i18n/locales/it/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (righe {{start}}-{{end}})", "definitionsOnly": " (solo definizioni)", - "maxLines": " (max {{max}} righe)" + "maxLines": " (max {{max}} righe)", + "safeguardNotice": "Questo file è di {{fileSizeKB}}KB e consumerebbe una parte significativa della finestra di contesto. Vengono mostrate solo le prime {{actualLinesRead}} righe complete ({{charactersRead}} caratteri) per preservare lo spazio di contesto. Usa line_range se devi leggere sezioni specifiche." }, "toolRepetitionLimitReached": "Roo sembra essere bloccato in un ciclo, tentando ripetutamente la stessa azione ({{toolName}}). Questo potrebbe indicare un problema con la sua strategia attuale. 
Considera di riformulare l'attività, fornire istruzioni più specifiche o guidarlo verso un approccio diverso.", "codebaseSearch": { diff --git a/src/i18n/locales/ja/tools.json b/src/i18n/locales/ja/tools.json index 04a5fcc0856..54c38eb5146 100644 --- a/src/i18n/locales/ja/tools.json +++ b/src/i18n/locales/ja/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " ({{start}}-{{end}}行目)", "definitionsOnly": " (定義のみ)", - "maxLines": " (最大{{max}}行)" + "maxLines": " (最大{{max}}行)", + "safeguardNotice": "このファイルは{{fileSizeKB}}KBで、コンテキストウィンドウの大部分を消費します。コンテキストスペースを保持するため、最初の{{actualLinesRead}}行({{charactersRead}}文字)のみを表示しています。特定のセクションを読む必要がある場合は、line_rangeを使用してください。" }, "toolRepetitionLimitReached": "Rooが同じ操作({{toolName}})を繰り返し試みるループに陥っているようです。これは現在の方法に問題がある可能性を示しています。タスクの言い換え、より具体的な指示の提供、または別のアプローチへの誘導を検討してください。", "codebaseSearch": { diff --git a/src/i18n/locales/ko/tools.json b/src/i18n/locales/ko/tools.json index e43a541794a..941bf0bdd8b 100644 --- a/src/i18n/locales/ko/tools.json +++ b/src/i18n/locales/ko/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " ({{start}}-{{end}}행)", "definitionsOnly": " (정의만)", - "maxLines": " (최대 {{max}}행)" + "maxLines": " (최대 {{max}}행)", + "safeguardNotice": "이 파일은 {{fileSizeKB}}KB로 컨텍스트 창의 상당 부분을 차지합니다. 컨텍스트 공간을 보존하기 위해 처음 {{actualLinesRead}}개의 완전한 줄({{charactersRead}}자)만 표시합니다. 특정 섹션을 읽어야 하는 경우 line_range를 사용하세요." }, "toolRepetitionLimitReached": "Roo가 같은 동작({{toolName}})을 반복적으로 시도하면서 루프에 갇힌 것 같습니다. 이는 현재 전략에 문제가 있을 수 있음을 나타냅니다. 
작업을 다시 표현하거나, 더 구체적인 지침을 제공하거나, 다른 접근 방식으로 안내해 보세요.", "codebaseSearch": { diff --git a/src/i18n/locales/nl/tools.json b/src/i18n/locales/nl/tools.json index 56a8cdbc466..f300f8ba1b0 100644 --- a/src/i18n/locales/nl/tools.json +++ b/src/i18n/locales/nl/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (regels {{start}}-{{end}})", "definitionsOnly": " (alleen definities)", - "maxLines": " (max {{max}} regels)" + "maxLines": " (max {{max}} regels)", + "safeguardNotice": "Dit bestand is {{fileSizeKB}}KB en zou een aanzienlijk deel van het contextvenster gebruiken. Er worden alleen de eerste {{actualLinesRead}} volledige regels ({{charactersRead}} tekens) weergegeven om contextruimte te behouden. Gebruik line_range als u specifieke secties moet lezen." }, "toolRepetitionLimitReached": "Roo lijkt vast te zitten in een lus, waarbij hij herhaaldelijk dezelfde actie ({{toolName}}) probeert. Dit kan duiden op een probleem met de huidige strategie. Overweeg de taak te herformuleren, specifiekere instructies te geven of Roo naar een andere aanpak te leiden.", "codebaseSearch": { diff --git a/src/i18n/locales/pl/tools.json b/src/i18n/locales/pl/tools.json index 62568826aae..57eaea3d2ed 100644 --- a/src/i18n/locales/pl/tools.json +++ b/src/i18n/locales/pl/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (linie {{start}}-{{end}})", "definitionsOnly": " (tylko definicje)", - "maxLines": " (maks. {{max}} linii)" + "maxLines": " (maks. {{max}} linii)", + "safeguardNotice": "Ten plik ma {{fileSizeKB}}KB i zużyłby znaczną część okna kontekstu. Wyświetlane jest tylko pierwsze {{actualLinesRead}} pełnych linii ({{charactersRead}} znaków), aby zachować przestrzeń kontekstu. Użyj line_range, jeśli musisz przeczytać określone sekcje." }, "toolRepetitionLimitReached": "Wygląda na to, że Roo utknął w pętli, wielokrotnie próbując wykonać tę samą akcję ({{toolName}}). Może to wskazywać na problem z jego obecną strategią. 
Rozważ przeformułowanie zadania, podanie bardziej szczegółowych instrukcji lub nakierowanie go na inne podejście.", "codebaseSearch": { diff --git a/src/i18n/locales/pt-BR/tools.json b/src/i18n/locales/pt-BR/tools.json index f74e0f8196e..aafbe71b41e 100644 --- a/src/i18n/locales/pt-BR/tools.json +++ b/src/i18n/locales/pt-BR/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (linhas {{start}}-{{end}})", "definitionsOnly": " (apenas definições)", - "maxLines": " (máx. {{max}} linhas)" + "maxLines": " (máx. {{max}} linhas)", + "safeguardNotice": "Este arquivo tem {{fileSizeKB}}KB e consumiria uma parte significativa da janela de contexto. Mostrando apenas as primeiras {{actualLinesRead}} linhas completas ({{charactersRead}} caracteres) para preservar o espaço de contexto. Use line_range se precisar ler seções específicas." }, "toolRepetitionLimitReached": "Roo parece estar preso em um loop, tentando a mesma ação ({{toolName}}) repetidamente. Isso pode indicar um problema com sua estratégia atual. Considere reformular a tarefa, fornecer instruções mais específicas ou guiá-lo para uma abordagem diferente.", "codebaseSearch": { diff --git a/src/i18n/locales/ru/tools.json b/src/i18n/locales/ru/tools.json index 1e59d10499c..e6a61f689ae 100644 --- a/src/i18n/locales/ru/tools.json +++ b/src/i18n/locales/ru/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (строки {{start}}-{{end}})", "definitionsOnly": " (только определения)", - "maxLines": " (макс. {{max}} строк)" + "maxLines": " (макс. {{max}} строк)", + "safeguardNotice": "Этот файл размером {{fileSizeKB}}КБ займет значительную часть контекстного окна. Показаны только первые {{actualLinesRead}} полных строк ({{charactersRead}} символов) для сохранения контекстного пространства. Используйте line_range, если нужно прочитать определенные разделы." }, "toolRepetitionLimitReached": "Похоже, что Roo застрял в цикле, многократно пытаясь выполнить одно и то же действие ({{toolName}}). 
Это может указывать на проблему с его текущей стратегией. Попробуйте переформулировать задачу, предоставить более конкретные инструкции или направить его к другому подходу.", "codebaseSearch": { diff --git a/src/i18n/locales/tr/tools.json b/src/i18n/locales/tr/tools.json index e4c73cdc4b2..d133826c56b 100644 --- a/src/i18n/locales/tr/tools.json +++ b/src/i18n/locales/tr/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (satır {{start}}-{{end}})", "definitionsOnly": " (sadece tanımlar)", - "maxLines": " (maks. {{max}} satır)" + "maxLines": " (maks. {{max}} satır)", + "safeguardNotice": "Bu dosya {{fileSizeKB}}KB boyutunda ve bağlam penceresinin önemli bir kısmını tüketecektir. Bağlam alanını korumak için yalnızca ilk {{actualLinesRead}} tam satır ({{charactersRead}} karakter) gösteriliyor. Belirli bölümleri okumanız gerekiyorsa line_range kullanın." }, "toolRepetitionLimitReached": "Roo bir döngüye takılmış gibi görünüyor, aynı eylemi ({{toolName}}) tekrar tekrar deniyor. Bu, mevcut stratejisinde bir sorun olduğunu gösterebilir. Görevi yeniden ifade etmeyi, daha spesifik talimatlar vermeyi veya onu farklı bir yaklaşıma yönlendirmeyi düşünün.", "codebaseSearch": { diff --git a/src/i18n/locales/vi/tools.json b/src/i18n/locales/vi/tools.json index 9811ee12c92..00f67fb19a9 100644 --- a/src/i18n/locales/vi/tools.json +++ b/src/i18n/locales/vi/tools.json @@ -2,7 +2,8 @@ "readFile": { "linesRange": " (dòng {{start}}-{{end}})", "definitionsOnly": " (chỉ định nghĩa)", - "maxLines": " (tối đa {{max}} dòng)" + "maxLines": " (tối đa {{max}} dòng)", + "safeguardNotice": "Tệp này có kích thước {{fileSizeKB}}KB và sẽ chiếm một phần đáng kể của cửa sổ ngữ cảnh. Chỉ hiển thị {{actualLinesRead}} dòng đầu tiên hoàn chỉnh ({{charactersRead}} ký tự) để bảo toàn không gian ngữ cảnh. Sử dụng line_range nếu bạn cần đọc các phần cụ thể." }, "toolRepetitionLimitReached": "Roo dường như đang bị mắc kẹt trong một vòng lặp, liên tục cố gắng thực hiện cùng một hành động ({{toolName}}). 
/*
 * Patch context preserved verbatim from the collapsed source lines that this spec shared
 * (locale hunks and diff headers; not part of the spec itself):
 *
 * Điều này có thể cho thấy vấn đề với chiến lược hiện tại. Hãy cân nhắc việc diễn đạt lại nhiệm vụ, cung cấp hướng dẫn cụ thể hơn, hoặc hướng Roo theo một cách tiếp cận khác.", "codebaseSearch": {
 *
 * diff --git a/src/i18n/locales/zh-CN/tools.json b/src/i18n/locales/zh-CN/tools.json
 * index 13641b8d43b..55bb52f36e8 100644
 * --- a/src/i18n/locales/zh-CN/tools.json
 * +++ b/src/i18n/locales/zh-CN/tools.json
 * @@ -2,7 +2,8 @@
 * "readFile": { "linesRange": " (第 {{start}}-{{end}} 行)", "definitionsOnly": " (仅定义)",
 * - "maxLines": " (最多 {{max}} 行)"
 * + "maxLines": " (最多 {{max}} 行)",
 * + "safeguardNotice": "此文件大小为 {{fileSizeKB}}KB,会占用上下文窗口的大部分空间。为了保留上下文空间,仅显示前 {{actualLinesRead}} 行完整内容({{charactersRead}} 个字符)。如需读取特定部分,请使用 line_range。"
 * }, "toolRepetitionLimitReached": "Roo 似乎陷入循环,反复尝试同一操作 ({{toolName}})。这可能表明当前策略存在问题。请考虑重新描述任务、提供更具体的指示或引导其尝试不同的方法。", "codebaseSearch": {
 *
 * diff --git a/src/i18n/locales/zh-TW/tools.json b/src/i18n/locales/zh-TW/tools.json
 * index a726e3c9192..43b079d32be 100644
 * --- a/src/i18n/locales/zh-TW/tools.json
 * +++ b/src/i18n/locales/zh-TW/tools.json
 * @@ -2,7 +2,8 @@
 * "readFile": { "linesRange": " (第 {{start}}-{{end}} 行)", "definitionsOnly": " (僅定義)",
 * - "maxLines": " (最多 {{max}} 行)"
 * + "maxLines": " (最多 {{max}} 行)",
 * + "safeguardNotice": "此檔案大小為 {{fileSizeKB}}KB,會佔用上下文視窗的大部分空間。為了保留上下文空間,僅顯示前 {{actualLinesRead}} 行完整內容({{charactersRead}} 個字元)。如需讀取特定部分,請使用 line_range。"
 * }, "toolRepetitionLimitReached": "Roo 似乎陷入循環,反覆嘗試同一操作 ({{toolName}})。這可能表明目前策略存在問題。請考慮重新描述工作、提供更具體的指示或引導其嘗試不同的方法。", "codebaseSearch": {
 *
 * diff --git a/src/integrations/misc/__tests__/read-lines-char-limit.spec.ts b/src/integrations/misc/__tests__/read-lines-char-limit.spec.ts
 * new file mode 100644
 * index 00000000000..a983bad705e
 * --- /dev/null
 * +++ b/src/integrations/misc/__tests__/read-lines-char-limit.spec.ts
 * @@ -0,0 +1,224 @@
 */
import { promises as fs } from "fs"
import os from "os"
import path from "path"
import { readLinesWithCharLimit } from "../read-lines-char-limit"

/**
 * Behavioural spec for readLinesWithCharLimit.
 *
 * All fixtures live in a unique directory under the OS temp dir so that the
 * suite never writes into (or recursively deletes from) the source tree and
 * cannot collide with a parallel run of the same spec.
 */
describe("readLinesWithCharLimit", () => {
	let testDir: string
	let testFile: string
	let longLineFile: string
	let mixedFile: string

	beforeAll(async () => {
		// Unique scratch directory per run
		testDir = await fs.mkdtemp(path.join(os.tmpdir(), "read-lines-char-limit-"))
		testFile = path.join(testDir, "char-limit-test.txt")
		longLineFile = path.join(testDir, "long-lines.txt")
		mixedFile = path.join(testDir, "mixed-content.txt")

		// 20 lines "Line 1" .. "Line 20", joined by "\n" (no trailing newline):
		//   lines 1-9   -> 6 chars + newline = 7 chars each
		//   lines 10-19 -> 7 chars + newline = 8 chars each
		//   line 20     -> 7 chars, no newline
		const lines = Array.from({ length: 20 }, (_, i) => `Line ${i + 1}`).join("\n")
		await fs.writeFile(testFile, lines)

		// 5 lines of 1000 chars each; every line but the last is followed by a newline
		const longLine = "A".repeat(1000)
		const longLines = Array.from({ length: 5 }, () => longLine).join("\n")
		await fs.writeFile(longLineFile, longLines)

		// Lines of differing lengths
		const mixedContent = [
			"Short", // 5 chars
			"Medium length line", // 18 chars
			"A".repeat(100), // 100 chars
			"Another short", // 13 chars
			"B".repeat(200), // 200 chars
		].join("\n")
		await fs.writeFile(mixedFile, mixedContent)
	})

	afterAll(async () => {
		// Remove the scratch directory and everything the tests created in it
		await fs.rm(testDir, { recursive: true, force: true })
	})

	describe("basic functionality", () => {
		it("should read complete file when char limit is not exceeded", async () => {
			const result = await readLinesWithCharLimit(testFile, 1000)

			expect(result.wasTruncated).toBe(false)
			expect(result.linesRead).toBe(20)
			// Lines 1-9: 9 * 7 = 63; lines 10-19: 10 * 8 = 80; line 20: 7 (no newline)
			// Total: 63 + 80 + 7 = 150
			expect(result.charactersRead).toBe(150)
			expect(result.content).toContain("Line 1")
			expect(result.content).toContain("Line 20")
		})

		it("should truncate at line boundary when char limit is exceeded", async () => {
			// 7 lines * 7 chars = 49 <= 50; an 8th line would make 56 > 50
			const result = await readLinesWithCharLimit(testFile, 50)

			expect(result.wasTruncated).toBe(true)
			expect(result.linesRead).toBe(7)
			expect(result.charactersRead).toBe(49)
			expect(result.content).toContain("Line 1")
			expect(result.content).toContain("Line 7")
			expect(result.content).not.toContain("Line 8")
		})

		it("should handle startLine parameter correctly", async () => {
			// Start from line 5 (0-based index 4)
			const result = await readLinesWithCharLimit(testFile, 50, 4)

			expect(result.wasTruncated).toBe(true)
			// Lines 5-9: 5 * 7 = 35; line 10: 8 -> 43 chars so far.
			// Line 11 (8 chars) would make 51 > 50, so reading stops after line 10.
			expect(result.linesRead).toBe(6)
			expect(result.charactersRead).toBe(43)
			expect(result.content).toContain("Line 5")
			expect(result.content).toContain("Line 10")
			expect(result.content).not.toContain("Line 4")
			expect(result.content).not.toContain("Line 11")
		})
	})

	describe("edge cases", () => {
		it("should handle empty files", async () => {
			const emptyFile = path.join(testDir, "empty.txt")
			await fs.writeFile(emptyFile, "")

			const result = await readLinesWithCharLimit(emptyFile, 100)

			expect(result.wasTruncated).toBe(false)
			expect(result.linesRead).toBe(0)
			expect(result.charactersRead).toBe(0)
			expect(result.content).toBe("")
		})

		it("should handle single character limit", async () => {
			const result = await readLinesWithCharLimit(testFile, 1)

			expect(result.wasTruncated).toBe(true)
			expect(result.linesRead).toBe(0) // Can't fit even one line
			expect(result.charactersRead).toBe(0)
			expect(result.content).toBe("")
		})

		it("should handle file with no newline at end", async () => {
			const noNewlineFile = path.join(testDir, "no-newline.txt")
			await fs.writeFile(noNewlineFile, "Line without newline")

			const result = await readLinesWithCharLimit(noNewlineFile, 100)

			expect(result.wasTruncated).toBe(false)
			expect(result.linesRead).toBe(1)
			expect(result.charactersRead).toBe(20)
			expect(result.content).toBe("Line without newline")
		})

		it("should reject negative maxChars", async () => {
			await expect(readLinesWithCharLimit(testFile, -1)).rejects.toThrow("maxChars must be positive")
		})

		it("should reject zero maxChars", async () => {
			await expect(readLinesWithCharLimit(testFile, 0)).rejects.toThrow("maxChars must be positive")
		})

		it("should reject negative startLine", async () => {
			await expect(readLinesWithCharLimit(testFile, 100, -1)).rejects.toThrow("startLine must be non-negative")
		})
	})

	describe("long lines handling", () => {
		it("should not include partial lines when they exceed char limit", async () => {
			// Each line is 1001 chars (1000 'A's + newline)
			// With 1500 char limit, should only include 1 complete line
			const result = await readLinesWithCharLimit(longLineFile, 1500)

			expect(result.wasTruncated).toBe(true)
			expect(result.linesRead).toBe(1)
			expect(result.charactersRead).toBe(1001)
			expect(result.content).toMatch(/^A{1000}\n$/)
		})

		it("should handle case where first line exceeds limit", async () => {
			// Limit is less than first line length
			const result = await readLinesWithCharLimit(longLineFile, 500)

			expect(result.wasTruncated).toBe(true)
			expect(result.linesRead).toBe(0)
			expect(result.charactersRead).toBe(0)
			expect(result.content).toBe("")
		})
	})

	describe("mixed content handling", () => {
		it("should correctly count characters with mixed line lengths", async () => {
			// First 3 lines: "Short\n" (6) + "Medium length line\n" (19) + 100 A's + \n (101) = 126 chars
			const result = await readLinesWithCharLimit(mixedFile, 130)

			expect(result.wasTruncated).toBe(true)
			expect(result.linesRead).toBe(3)
			expect(result.charactersRead).toBe(126)
			expect(result.content).toContain("Short")
			expect(result.content).toContain("Medium length line")
			expect(result.content).toContain("A".repeat(100))
			expect(result.content).not.toContain("Another short")
		})

		it("should handle exact character boundary", async () => {
			// "Short\n" (6) + "Medium length line\n" (19) = exactly 25
			const result = await readLinesWithCharLimit(mixedFile, 25)

			expect(result.wasTruncated).toBe(true)
			expect(result.linesRead).toBe(2)
			expect(result.charactersRead).toBe(25)
			expect(result.content).toBe("Short\nMedium length line\n")
		})
	})

	describe("unicode handling", () => {
		it("should handle unicode characters correctly", async () => {
			const unicodeFile = path.join(testDir, "unicode.txt")
			const unicodeContent = [
				"Hello 👋", // 8 UTF-16 code units (the emoji is a surrogate pair)
				"世界", // 2 code units
				"🌍🌎🌏", // 6 code units (3 surrogate pairs)
			].join("\n")
			await fs.writeFile(unicodeFile, unicodeContent)

			const result = await readLinesWithCharLimit(unicodeFile, 20)

			expect(result.wasTruncated).toBe(false)
			expect(result.linesRead).toBe(3)
			// Character count is the JavaScript string length: 8 + 1 + 2 + 1 + 6 = 18
			expect(result.charactersRead).toBe(18)
			expect(result.content).toContain("Hello 👋")
			expect(result.content).toContain("世界")
			expect(result.content).toContain("🌍🌎🌏")
		})
	})

	describe("performance considerations", () => {
		it("should handle large files efficiently", async () => {
			const largeFile = path.join(testDir, "large.txt")
			// ~10MB file: 10,000 lines of 1,001 chars each
			const chunk = "A".repeat(1000) + "\n"
			const chunks = Array(10000).fill(chunk).join("")
			await fs.writeFile(largeFile, chunks)

			const startTime = Date.now()
			const result = await readLinesWithCharLimit(largeFile, 10000)
			const duration = Date.now() - startTime

			expect(result.wasTruncated).toBe(true)
			expect(result.linesRead).toBe(9) // 9 complete lines
			expect(result.charactersRead).toBe(9009) // 9 * 1001
			// Generous bound: only guards against reading the whole 10MB file,
			// without being sensitive to scheduling jitter on a busy CI machine.
			expect(duration).toBeLessThan(1000)
		})
	})

	describe("file not found handling", () => {
		it("should reject when file does not exist", async () => {
			const nonExistentFile = path.join(testDir, "does-not-exist.txt")

			await expect(readLinesWithCharLimit(nonExistentFile, 100)).rejects.toThrow()
		})
	})
})

// Patch context preserved from the end of the collapsed source line (header of the next file in the patch):
//   diff --git a/src/integrations/misc/read-lines-char-limit.ts
// Patch context preserved from the collapsed source line (diff header remainder, not part of the module):
//   b/src/integrations/misc/read-lines-char-limit.ts
//   new file mode 100644
//   index 00000000000..850354956f2
//   --- /dev/null
//   +++ b/src/integrations/misc/read-lines-char-limit.ts
//   @@ -0,0 +1,117 @@
import { createReadStream } from "fs"

/**
 * Result of reading lines with character limit
 */
export interface ReadLinesCharLimitResult {
	/** The content that was read (complete lines only, newlines included) */
	content: string
	/** The number of complete lines that were read */
	linesRead: number
	/** Whether the file was truncated due to character limit */
	wasTruncated: boolean
	/** Total number of characters read (excluding any incomplete final line) */
	charactersRead: number
}

/**
 * Reads lines from a file up to a maximum character count, ensuring we don't
 * break in the middle of a line.
 *
 * Characters are counted as JavaScript string length (UTF-16 code units) and
 * each line's trailing "\n" counts towards the limit. A line that does not fit
 * in the remaining budget is dropped entirely and reading stops there.
 *
 * The stream is abandoned as soon as the outcome is known: this includes the
 * case where a line that has not been terminated yet is already too long to
 * fit, so a huge single-line file is never buffered in full.
 *
 * @param filepath - Path to the file to read
 * @param maxChars - Maximum number of characters to read (must be > 0)
 * @param startLine - Optional. The line number to start reading from (0-based, inclusive)
 * @returns Promise resolving to the read result with content and metadata
 * @throws RangeError (as a rejection) when maxChars is not positive or startLine is negative / NaN
 */
export function readLinesWithCharLimit(
	filepath: string,
	maxChars: number,
	startLine: number = 0,
): Promise<ReadLinesCharLimitResult> {
	return new Promise<ReadLinesCharLimitResult>((resolve, reject) => {
		// Validate inputs. The negated comparisons also reject NaN, which would
		// otherwise make every limit check below evaluate to false.
		if (!(maxChars > 0)) {
			return reject(new RangeError(`maxChars must be positive, got ${maxChars}`))
		}
		if (!(startLine >= 0)) {
			return reject(new RangeError(`startLine must be non-negative, got ${startLine}`))
		}

		const input = createReadStream(filepath, { encoding: "utf8" })
		let buffer = "" // tail of the current chunk that is not yet terminated by "\n"
		let currentLineNumber = 0
		let result = ""
		let charactersRead = 0
		let linesIncluded = 0

		// Resolve with whatever has been accumulated so far
		const settle = (wasTruncated: boolean): void => {
			resolve({
				content: result,
				linesRead: linesIncluded,
				wasTruncated,
				charactersRead,
			})
		}

		// Handle errors (missing file, permission problems, ...)
		input.on("error", reject)

		// Process data chunks
		input.on("data", (chunk) => {
			buffer += chunk.toString()

			let pos = 0
			let nextNewline = buffer.indexOf("\n", pos)

			// Process complete lines in the buffer
			while (nextNewline !== -1) {
				// Only lines at or after startLine count towards the result
				if (currentLineNumber >= startLine) {
					const lineWithNewline = buffer.substring(pos, nextNewline + 1)

					// Check if adding this line would exceed the character limit
					if (charactersRead + lineWithNewline.length > maxChars) {
						// We've hit the limit, stop reading
						input.destroy()
						settle(true)
						return
					}

					// Add the line to the result
					result += lineWithNewline
					charactersRead += lineWithNewline.length
					linesIncluded++
				}

				// Move to next line
				pos = nextNewline + 1
				currentLineNumber++
				nextNewline = buffer.indexOf("\n", pos)
			}

			if (currentLineNumber < startLine) {
				// The unfinished line is going to be skipped anyway; only its
				// terminating newline matters, so there is no need to retain it.
				buffer = ""
				return
			}

			// Keep the incomplete line in the buffer
			buffer = buffer.substring(pos)

			// The unfinished line can only grow. If it already overflows the
			// budget it can never be included, so stop now instead of buffering
			// the rest of the file.
			if (charactersRead + buffer.length > maxChars) {
				input.destroy()
				settle(true)
			}
		})

		// Handle end of file
		input.on("end", () => {
			let wasTruncated = false

			// Process any remaining data in buffer (last line without newline)
			if (buffer.length > 0 && currentLineNumber >= startLine) {
				// Check if adding this final line would exceed the limit
				if (charactersRead + buffer.length <= maxChars) {
					result += buffer
					charactersRead += buffer.length
					linesIncluded++
				} else {
					// Mark as truncated if we couldn't include the last line
					wasTruncated = true
				}
			}

			settle(wasTruncated)
		})
	})
}