From edb8809e2fb6aac94d5e4d41b74c490e01a316ef Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Fri, 24 Oct 2025 22:55:14 -0600 Subject: [PATCH 01/16] Browser Use 2.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary This release upgrades the in‑chat browsing experience with persistent sessions, clearer feedback, a dedicated browser panel, and more natural action descriptions — all fully localized. What's new - Persistent Browser Sessions - The browser stays open across steps so you can send follow‑ups without relaunching. - You’ll see a "Browser Session" header and a "Session started" note when active. - Dedicated Browser Session panel - Open a full‑size view when you need more space, while keeping the chat context in view. - Live, readable action feed - Actions are presented in plain language: Launch, Click, Type, Press, Hover, Scroll. - Keyboard events now appear as "Press Enter" or "Press Esc" for easier scanning. - Broader keyboard coverage: navigation keys and common shortcuts are supported for more natural control. - Inline console logs - Console output is surfaced inline with a clear "No new logs" state. - Noise-reduced by default: only new entries since the previous step are shown to cut repeat noise. - Filter by type (Errors, Warnings, Logs) so you can focus on what matters. - Clear session controls - A prominent Disconnect/Close control makes it easy to end a session when you’re done. - Interactive in-session controls - Follow-ups attach to the active session so you can guide the assistant mid-flow without restarting. - Suggested follow-ups appear inline to keep momentum. - More accurate interactions - Improved click, scroll, and hover reliability across screen sizes with a consistent preview aspect ratio. - Seamless follow‑ups - Keep chatting while the session is open; the assistant continues from the same context. 
- Fully localized - New labels and action text are translated across all supported languages. What you'll notice in the UI - "Browser Session" appears in chat when a session is active. - A "Session started" status line confirms the start. - Follow-up suggestions appear inside the Browser Session row when active. - Keyboard actions are summarized clearly (e.g., "Press Tab", "Shift+Tab", "Arrow keys"). - New action wording like "Press Enter" or "Hover (x, y)". - Console Logs are visible inline, with a "No new logs" indicator and a noise‑reduced view that shows only new entries since the last step. - Type filters (All, Errors, Warnings, Logs) above the log list to quickly narrow the feed. - A quick Disconnect button to end the session. --- packages/types/src/message.ts | 1 + .../presentAssistantMessage.ts | 28 +- .../__tests__/getEnvironmentDetails.spec.ts | 19 +- src/core/environment/getEnvironmentDetails.ts | 32 + .../with-computer-use-support.snap | 37 +- src/core/prompts/tools/browser-action.ts | 35 +- src/core/task/Task.ts | 73 +- src/core/tools/BrowserActionTool.ts | 244 --- ...rowserActionTool.coordinateScaling.spec.ts | 148 ++ .../webview/BrowserSessionPanelManager.ts | 310 ++++ src/core/webview/ClineProvider.ts | 5 + .../webview/__tests__/ClineProvider.spec.ts | 1 + src/core/webview/webviewMessageHandler.ts | 99 +- src/services/browser/BrowserSession.ts | 288 +++- src/services/browser/UrlContentFetcher.ts | 6 +- .../browser/__tests__/BrowserSession.spec.ts | 165 ++ src/shared/ExtensionMessage.ts | 10 + src/shared/WebviewMessage.ts | 10 + webview-ui/browser-panel.html | 12 + webview-ui/src/browser-panel.tsx | 12 + .../BrowserPanelStateProvider.tsx | 60 + .../browser-session/BrowserSessionPanel.tsx | 109 ++ .../src/components/chat/BrowserActionRow.tsx | 247 +++ .../src/components/chat/BrowserSessionRow.tsx | 1442 ++++++++++++----- .../chat/BrowserSessionStatusRow.tsx | 34 + webview-ui/src/components/chat/ChatRow.tsx | 22 +- 
.../src/components/chat/ChatTextArea.tsx | 35 +- webview-ui/src/components/chat/ChatView.tsx | 195 ++- .../BrowserSessionRow.aspect-ratio.spec.tsx | 55 + ...owserSessionRow.disconnect-button.spec.tsx | 42 + .../ChatView.followup-in-session.spec.tsx | 119 ++ .../src/context/ExtensionStateContext.tsx | 1 + .../__tests__/ExtensionStateContext.spec.tsx | 1 + webview-ui/src/i18n/locales/ca/chat.json | 3 + webview-ui/src/i18n/locales/de/chat.json | 3 + webview-ui/src/i18n/locales/en/chat.json | 4 +- webview-ui/src/i18n/locales/es/chat.json | 3 + webview-ui/src/i18n/locales/fr/chat.json | 3 + webview-ui/src/i18n/locales/hi/chat.json | 3 + webview-ui/src/i18n/locales/id/chat.json | 3 + webview-ui/src/i18n/locales/it/chat.json | 3 + webview-ui/src/i18n/locales/ja/chat.json | 3 + webview-ui/src/i18n/locales/ko/chat.json | 3 + webview-ui/src/i18n/locales/nl/chat.json | 3 + webview-ui/src/i18n/locales/pl/chat.json | 3 + webview-ui/src/i18n/locales/pt-BR/chat.json | 3 + webview-ui/src/i18n/locales/ru/chat.json | 3 + webview-ui/src/i18n/locales/tr/chat.json | 3 + webview-ui/src/i18n/locales/vi/chat.json | 3 + webview-ui/src/i18n/locales/zh-CN/chat.json | 3 + webview-ui/src/i18n/locales/zh-TW/chat.json | 3 + webview-ui/vite.config.ts | 4 + 52 files changed, 3155 insertions(+), 801 deletions(-) delete mode 100644 src/core/tools/BrowserActionTool.ts create mode 100644 src/core/tools/__tests__/browserActionTool.coordinateScaling.spec.ts create mode 100644 src/core/webview/BrowserSessionPanelManager.ts create mode 100644 webview-ui/browser-panel.html create mode 100644 webview-ui/src/browser-panel.tsx create mode 100644 webview-ui/src/components/browser-session/BrowserPanelStateProvider.tsx create mode 100644 webview-ui/src/components/browser-session/BrowserSessionPanel.tsx create mode 100644 webview-ui/src/components/chat/BrowserActionRow.tsx create mode 100644 webview-ui/src/components/chat/BrowserSessionStatusRow.tsx create mode 100644 
webview-ui/src/components/chat/__tests__/BrowserSessionRow.aspect-ratio.spec.tsx create mode 100644 webview-ui/src/components/chat/__tests__/BrowserSessionRow.disconnect-button.spec.tsx create mode 100644 webview-ui/src/components/chat/__tests__/ChatView.followup-in-session.spec.tsx diff --git a/packages/types/src/message.ts b/packages/types/src/message.ts index 09737f9ea6d..548da850b62 100644 --- a/packages/types/src/message.ts +++ b/packages/types/src/message.ts @@ -166,6 +166,7 @@ export const clineSays = [ "shell_integration_warning", "browser_action", "browser_action_result", + "browser_session_status", "mcp_server_request_started", "mcp_server_response", "subtask_result", diff --git a/src/core/assistant-message/presentAssistantMessage.ts b/src/core/assistant-message/presentAssistantMessage.ts index 0955d5d111f..df0371cb7dd 100644 --- a/src/core/assistant-message/presentAssistantMessage.ts +++ b/src/core/assistant-message/presentAssistantMessage.ts @@ -437,8 +437,32 @@ export async function presentAssistantMessage(cline: Task) { return text.replace(tagRegex, "") } - if (block.name !== "browser_action") { - await cline.browserSession.closeBrowser() + // Keep browser open during an active session so other tools can run. + // Session is active if we've seen any browser_action_result and the last browser_action is not "close". 
+ try { + const messages = cline.clineMessages || [] + const hasStarted = messages.some((m: any) => m.say === "browser_action_result") + let isClosed = false + for (let i = messages.length - 1; i >= 0; i--) { + const m = messages[i] + if (m.say === "browser_action") { + try { + const act = JSON.parse(m.text || "{}") + isClosed = act.action === "close" + } catch {} + break + } + } + const sessionActive = hasStarted && !isClosed + // Only auto-close when no active browser session is present, and this isn't a browser_action + if (!sessionActive && block.name !== "browser_action") { + await cline.browserSession.closeBrowser() + } + } catch { + // On any unexpected error, fall back to conservative behavior + if (block.name !== "browser_action") { + await cline.browserSession.closeBrowser() + } } if (!block.partial) { diff --git a/src/core/environment/__tests__/getEnvironmentDetails.spec.ts b/src/core/environment/__tests__/getEnvironmentDetails.spec.ts index 9b346aeea9f..0d6a4da591f 100644 --- a/src/core/environment/__tests__/getEnvironmentDetails.spec.ts +++ b/src/core/environment/__tests__/getEnvironmentDetails.spec.ts @@ -118,6 +118,9 @@ describe("getEnvironmentDetails", () => { deref: vi.fn().mockReturnValue(mockProvider), [Symbol.toStringTag]: "WeakRef", } as unknown as WeakRef, + browserSession: { + isSessionActive: vi.fn().mockReturnValue(false), + } as any, } // Mock other dependencies. 
@@ -393,7 +396,6 @@ describe("getEnvironmentDetails", () => { const result = await getEnvironmentDetails(cline as Task) expect(result).toContain("REMINDERS") }) - it("should include git status when maxGitStatusFiles > 0", async () => { ;(getGitStatus as Mock).mockResolvedValue("## main\nM file1.ts") mockProvider.getState.mockResolvedValue({ @@ -456,4 +458,19 @@ describe("getEnvironmentDetails", () => { expect(getGitStatus).toHaveBeenCalledWith(mockCwd, 5) }) + + it("should include Browser Session Status when inactive", async () => { + const result = await getEnvironmentDetails(mockCline as Task) + expect(result).toContain("# Browser Session Status") + expect(result).toContain("Inactive - Browser is not launched") + }) + + it("should include Browser Session Status with current viewport when active", async () => { + ;(mockCline.browserSession as any).isSessionActive = vi.fn().mockReturnValue(true) + ;(mockCline.browserSession as any).getViewportSize = vi.fn().mockReturnValue({ width: 1280, height: 720 }) + + const result = await getEnvironmentDetails(mockCline as Task) + expect(result).toContain("Active - A browser session is currently open and ready for browser_action commands") + expect(result).toContain("Current viewport size: 1280x720 pixels.") + }) }) diff --git a/src/core/environment/getEnvironmentDetails.ts b/src/core/environment/getEnvironmentDetails.ts index bf0e3c8392b..4c529e65e10 100644 --- a/src/core/environment/getEnvironmentDetails.ts +++ b/src/core/environment/getEnvironmentDetails.ts @@ -248,6 +248,38 @@ export async function getEnvironmentDetails(cline: Task, includeFileDetails: boo } } + // Add browser session status - Always show to prevent LLM from trying browser actions when no session is active + const isBrowserActive = cline.browserSession.isSessionActive() + + // Build viewport info for status (prefer actual viewport if available, else fallback to configured setting) + const configuredViewport = (state?.browserViewportSize as string | 
undefined) ?? "900x600" + let configuredWidth: number | undefined + let configuredHeight: number | undefined + if (configuredViewport.includes("x")) { + const parts = configuredViewport.split("x").map((v) => Number(v)) + configuredWidth = parts[0] + configuredHeight = parts[1] + } + + let actualWidth: number | undefined + let actualHeight: number | undefined + // Use optional chaining to avoid issues with tests that stub browserSession + const vp = isBrowserActive ? (cline.browserSession as any).getViewportSize?.() : undefined + if (vp) { + actualWidth = vp.width + actualHeight = vp.height + } + + const width = actualWidth ?? configuredWidth + const height = actualHeight ?? configuredHeight + const viewportInfo = isBrowserActive && width && height ? `\nCurrent viewport size: ${width}x${height} pixels.` : "" + + details += `\n# Browser Session Status\n${ + isBrowserActive + ? "Active - A browser session is currently open and ready for browser_action commands" + : "Inactive - Browser is not launched. Using any browser action except the browser_action with action='launch' to start a new session will result in an error." + }${viewportInfo}\n` + if (includeFileDetails) { details += `\n\n# Current Workspace Directory (${cline.cwd.toPosix()}) Files\n` const isDesktop = arePathsEqual(cline.cwd, path.join(os.homedir(), "Desktop")) diff --git a/src/core/prompts/__tests__/__snapshots__/system-prompt/with-computer-use-support.snap b/src/core/prompts/__tests__/__snapshots__/system-prompt/with-computer-use-support.snap index 03e66365c7c..cea59da7f57 100644 --- a/src/core/prompts/__tests__/__snapshots__/system-prompt/with-computer-use-support.snap +++ b/src/core/prompts/__tests__/__snapshots__/system-prompt/with-computer-use-support.snap @@ -228,10 +228,12 @@ Example for appending to the end of file: ## browser_action Description: Request to interact with a Puppeteer-controlled browser. 
Every action, except `close`, will be responded to with a screenshot of the browser's current state, along with any new console logs. You may only perform one browser action per message, and wait for the user's response including a screenshot and logs to determine the next action. -- The sequence of actions **must always start with** launching the browser at a URL, and **must always end with** closing the browser. If you need to visit a new URL that is not possible to navigate to from the current webpage, you must first close the browser, then launch again at the new URL. -- While the browser is active, only the `browser_action` tool can be used. No other tools should be called during this time. You may proceed to use other tools only after closing the browser. For example if you run into an error and need to fix a file, you must close the browser, then use other tools to make the necessary changes, then re-launch the browser to verify the result. -- The browser window has a resolution of **1280x800** pixels. When performing any click actions, ensure the coordinates are within this resolution range. -- Before clicking on any elements such as icons, links, or buttons, you must consult the provided screenshot of the page to determine the coordinates of the element. The click should be targeted at the **center of the element**, not on its edges. + +**Browser Session Lifecycle:** +- Browser sessions **start** with `launch` and **end** with `close` +- The session remains active across multiple messages and tool uses +- You can use other tools while the browser session is active - it will stay open in the background + Parameters: - action: (required) The action to perform. The available actions are: * launch: Launch a new Puppeteer-controlled browser instance at the specified URL. This **must always be the first action**. @@ -245,6 +247,12 @@ Parameters: - Always click in the center of an element (icon, button, link, etc.) based on coordinates derived from a screenshot. 
* type: Type a string of text on the keyboard. You might use this after clicking on a text field to input text. - Use with the `text` parameter to provide the string to type. + * press: Press a single keyboard key or key combination (e.g., Enter, Tab, Escape, Cmd+K, Shift+Enter). + - Use with the `text` parameter to provide the key name or combination. + - For single keys: Enter, Tab, Escape, etc. + - For key combinations: Cmd+K, Ctrl+C, Shift+Enter, Alt+F4, etc. + - Supported modifiers: Cmd/Command/Meta, Ctrl/Control, Shift, Alt/Option + - Example: Cmd+K or Shift+Enter * resize: Resize the viewport to a specific w,h size. - Use with the `size` parameter to specify the new size. * scroll_down: Scroll down the page by one page height. @@ -253,17 +261,24 @@ Parameters: - Example: `close` - url: (optional) Use this for providing the URL for the `launch` action. * Example: https://example.com -- coordinate: (optional) The X and Y coordinates for the `click` and `hover` actions. Coordinates should be within the **1280x800** resolution. - * Example: 450,300 +- coordinate: (optional) The X and Y coordinates for the `click` and `hover` actions. 
+ * **CRITICAL**: Screenshot dimensions are NOT the same as the browser viewport dimensions + * Format: x,y@widthxheight + * Measure x,y on the screenshot image you see in chat + * The widthxheight MUST be the EXACT pixel size of that screenshot image (never the browser viewport) + * Never use the browser viewport size for widthxheight - the viewport is only a reference and is often larger than the screenshot + * Images are often downscaled before you see them, so the screenshot's dimensions will likely be smaller than the viewport + * Example A: If the screenshot you see is 1094x1092 and you want to click (450,300) on that image, use: 450,300@1094x1092 + * Example B: If the browser viewport is 1280x800 but the screenshot is 1000x625 and you want to click (500,300) on the screenshot, use: 500,300@1000x625 - size: (optional) The width and height for the `resize` action. * Example: 1280,720 - text: (optional) Use this for providing the text for the `type` action. * Example: Hello, world! Usage: -Action to perform (e.g., launch, click, type, scroll_down, scroll_up, close) +Action to perform (e.g., launch, click, type, press, scroll_down, scroll_up, close) URL to launch the browser at (optional) -x,y coordinates (optional) +x,y@widthxheight coordinates (optional) Text to type (optional) @@ -273,10 +288,10 @@ Example: Requesting to launch a browser at https://example.com https://example.com -Example: Requesting to click on the element at coordinates 450,300 +Example: Requesting to click on the element at coordinates 450,300 on a 1024x768 image click -450,300 +450,300@1024x768 ## ask_followup_question @@ -505,7 +520,7 @@ RULES - At the end of each user message, you will automatically receive environment_details. This information is not written by the user themselves, but is auto-generated to provide potentially relevant context about the project structure and environment. 
While this information can be valuable for understanding the project context, do not treat it as a direct part of the user's request or response. Use it to inform your actions and decisions, but don't assume the user is explicitly asking about or referring to this information unless they clearly do so in their message. When using environment_details, explain your actions clearly to ensure the user understands, as they may not be aware of these details. - Before executing commands, check the "Actively Running Terminals" section in environment_details. If present, consider how these active processes might impact your task. For example, if a local development server is already running, you wouldn't need to start it again. If no active terminals are listed, proceed with command execution as normal. - MCP operations should be used one at a time, similar to other tool usage. Wait for confirmation of success before proceeding with additional operations. -- It is critical you wait for the user's response after each tool use, in order to confirm the success of the tool use. For example, if asked to make a todo app, you would create a file, wait for the user's response it was created successfully, then create another file if needed, wait for the user's response it was created successfully, etc. Then if you want to test your work, you might use browser_action to launch the site, wait for the user's response confirming the site was launched along with a screenshot, then perhaps e.g., click a button to test functionality if needed, wait for the user's response confirming the button was clicked along with a screenshot of the new state, before finally closing the browser. +- It is critical you wait for the user's response after each tool use, in order to confirm the success of the tool use. 
For example, if asked to make a todo app, you would create a file, wait for the user's response it was created successfully, then create another file if needed, wait for the user's response it was created successfully, etc. ==== diff --git a/src/core/prompts/tools/browser-action.ts b/src/core/prompts/tools/browser-action.ts index e1b33b9d7d1..3f9a5c1ae29 100644 --- a/src/core/prompts/tools/browser-action.ts +++ b/src/core/prompts/tools/browser-action.ts @@ -6,10 +6,12 @@ export function getBrowserActionDescription(args: ToolArgs): string | undefined } return `## browser_action Description: Request to interact with a Puppeteer-controlled browser. Every action, except \`close\`, will be responded to with a screenshot of the browser's current state, along with any new console logs. You may only perform one browser action per message, and wait for the user's response including a screenshot and logs to determine the next action. -- The sequence of actions **must always start with** launching the browser at a URL, and **must always end with** closing the browser. If you need to visit a new URL that is not possible to navigate to from the current webpage, you must first close the browser, then launch again at the new URL. -- While the browser is active, only the \`browser_action\` tool can be used. No other tools should be called during this time. You may proceed to use other tools only after closing the browser. For example if you run into an error and need to fix a file, you must close the browser, then use other tools to make the necessary changes, then re-launch the browser to verify the result. -- The browser window has a resolution of **${args.browserViewportSize}** pixels. When performing any click actions, ensure the coordinates are within this resolution range. -- Before clicking on any elements such as icons, links, or buttons, you must consult the provided screenshot of the page to determine the coordinates of the element. 
The click should be targeted at the **center of the element**, not on its edges. + +**Browser Session Lifecycle:** +- Browser sessions **start** with \`launch\` and **end** with \`close\` +- The session remains active across multiple messages and tool uses +- You can use other tools while the browser session is active - it will stay open in the background + Parameters: - action: (required) The action to perform. The available actions are: * launch: Launch a new Puppeteer-controlled browser instance at the specified URL. This **must always be the first action**. @@ -23,6 +25,12 @@ Parameters: - Always click in the center of an element (icon, button, link, etc.) based on coordinates derived from a screenshot. * type: Type a string of text on the keyboard. You might use this after clicking on a text field to input text. - Use with the \`text\` parameter to provide the string to type. + * press: Press a single keyboard key or key combination (e.g., Enter, Tab, Escape, Cmd+K, Shift+Enter). + - Use with the \`text\` parameter to provide the key name or combination. + - For single keys: Enter, Tab, Escape, etc. + - For key combinations: Cmd+K, Ctrl+C, Shift+Enter, Alt+F4, etc. + - Supported modifiers: Cmd/Command/Meta, Ctrl/Control, Shift, Alt/Option + - Example: Cmd+K or Shift+Enter * resize: Resize the viewport to a specific w,h size. - Use with the \`size\` parameter to specify the new size. * scroll_down: Scroll down the page by one page height. @@ -31,17 +39,24 @@ Parameters: - Example: \`close\` - url: (optional) Use this for providing the URL for the \`launch\` action. * Example: https://example.com -- coordinate: (optional) The X and Y coordinates for the \`click\` and \`hover\` actions. Coordinates should be within the **${args.browserViewportSize}** resolution. - * Example: 450,300 +- coordinate: (optional) The X and Y coordinates for the \`click\` and \`hover\` actions. 
+ * **CRITICAL**: Screenshot dimensions are NOT the same as the browser viewport dimensions + * Format: x,y@widthxheight + * Measure x,y on the screenshot image you see in chat + * The widthxheight MUST be the EXACT pixel size of that screenshot image (never the browser viewport) + * Never use the browser viewport size for widthxheight - the viewport is only a reference and is often larger than the screenshot + * Images are often downscaled before you see them, so the screenshot's dimensions will likely be smaller than the viewport + * Example A: If the screenshot you see is 1094x1092 and you want to click (450,300) on that image, use: 450,300@1094x1092 + * Example B: If the browser viewport is 1280x800 but the screenshot is 1000x625 and you want to click (500,300) on the screenshot, use: 500,300@1000x625 - size: (optional) The width and height for the \`resize\` action. * Example: 1280,720 - text: (optional) Use this for providing the text for the \`type\` action. * Example: Hello, world! 
Usage: -Action to perform (e.g., launch, click, type, scroll_down, scroll_up, close) +Action to perform (e.g., launch, click, type, press, scroll_down, scroll_up, close) URL to launch the browser at (optional) -x,y coordinates (optional) +x,y@widthxheight coordinates (optional) Text to type (optional) @@ -51,9 +66,9 @@ Example: Requesting to launch a browser at https://example.com https://example.com -Example: Requesting to click on the element at coordinates 450,300 +Example: Requesting to click on the element at coordinates 450,300 on a 1024x768 image click -450,300 +450,300@1024x768 ` } diff --git a/src/core/task/Task.ts b/src/core/task/Task.ts index 7c0355e4982..925f4bf7e86 100644 --- a/src/core/task/Task.ts +++ b/src/core/task/Task.ts @@ -385,7 +385,28 @@ export class Task extends EventEmitter implements TaskLike { this.autoApprovalHandler = new AutoApprovalHandler() this.urlContentFetcher = new UrlContentFetcher(provider.context) - this.browserSession = new BrowserSession(provider.context) + this.browserSession = new BrowserSession(provider.context, (isActive: boolean) => { + // Add a message to indicate browser session status change + this.say("browser_session_status", isActive ? "Browser session opened" : "Browser session closed") + // Broadcast to browser panel + this.broadcastBrowserSessionUpdate() + + // When a browser session becomes active, automatically open/reveal the Browser Session tab + if (isActive) { + try { + // Lazy-load to avoid circular imports at module load time + const { BrowserSessionPanelManager } = require("../webview/BrowserSessionPanelManager") + const providerRef = this.providerRef.deref() + if (providerRef) { + BrowserSessionPanelManager.getInstance(providerRef) + .show() + .catch(() => {}) + } + } catch (err) { + console.error("[Task] Failed to auto-open Browser Session panel:", err) + } + } + }) this.diffEnabled = enableDiff this.fuzzyMatchThreshold = fuzzyMatchThreshold this.consecutiveMistakeLimit = consecutiveMistakeLimit ?? 
DEFAULT_CONSECUTIVE_MISTAKE_LIMIT @@ -1366,6 +1387,11 @@ export class Task extends EventEmitter implements TaskLike { contextCondense, }) } + + // Broadcast browser session updates to panel when browser-related messages are added + if (type === "browser_action" || type === "browser_action_result" || type === "browser_session_status") { + this.broadcastBrowserSessionUpdate() + } } async sayAndCreateMissingParamError(toolName: ToolName, paramName: string, relPath?: string) { @@ -1786,6 +1812,16 @@ export class Task extends EventEmitter implements TaskLike { } catch (error) { console.error("Error closing browser session:", error) } + // Also close the Browser Session panel when the task is disposed + try { + const provider = this.providerRef.deref() + if (provider) { + const { BrowserSessionPanelManager } = require("../webview/BrowserSessionPanelManager") + BrowserSessionPanelManager.getInstance(provider).dispose() + } + } catch (error) { + console.error("Error closing browser session panel:", error) + } try { if (this.rooIgnoreController) { @@ -3488,6 +3524,41 @@ export class Task extends EventEmitter implements TaskLike { return this.workspacePath } + /** + * Broadcast browser session updates to the browser panel (if open) + */ + private broadcastBrowserSessionUpdate(): void { + const provider = this.providerRef.deref() + if (!provider) { + return + } + + try { + const { BrowserSessionPanelManager } = require("../webview/BrowserSessionPanelManager") + const panelManager = BrowserSessionPanelManager.getInstance(provider) + + // Get browser session messages + const browserSessionStartIndex = this.clineMessages.findIndex( + (m) => + m.ask === "browser_action_launch" || + (m.say === "browser_session_status" && m.text?.includes("opened")), + ) + + const browserSessionMessages = + browserSessionStartIndex !== -1 ? this.clineMessages.slice(browserSessionStartIndex) : [] + + const isBrowserSessionActive = this.browserSession?.isSessionActive() ?? 
false + + // Update the panel asynchronously + panelManager.updateBrowserSession(browserSessionMessages, isBrowserSessionActive).catch((error: Error) => { + console.error("Failed to broadcast browser session update:", error) + }) + } catch (error) { + // Silently fail if panel manager is not available + console.debug("Browser panel not available for update:", error) + } + } + /** * Process any queued messages by dequeuing and submitting them. * This ensures that queued user messages are sent when appropriate, diff --git a/src/core/tools/BrowserActionTool.ts b/src/core/tools/BrowserActionTool.ts deleted file mode 100644 index 3e8f6f176e4..00000000000 --- a/src/core/tools/BrowserActionTool.ts +++ /dev/null @@ -1,244 +0,0 @@ -import type { BrowserActionParams, Coordinate, Size } from "@roo-code/types" -import { Task } from "../task/Task" -import { BaseTool, ToolCallbacks } from "./BaseTool" -import type { ToolUse } from "../../shared/tools" -import { - BrowserAction, - BrowserActionResult, - browserActions, - ClineSayBrowserAction, -} from "../../shared/ExtensionMessage" -import { formatResponse } from "../prompts/responses" - -export class BrowserActionTool extends BaseTool<"browser_action"> { - readonly name = "browser_action" as const - - parseLegacy(params: Partial>): BrowserActionParams { - const action = params.action as BrowserAction | undefined - - // Parse coordinate if present - XML protocol sends "x,y" format - let coordinate: Coordinate | undefined - if (params.coordinate) { - // Try parsing as "x,y" string first (XML protocol) - const parts = params.coordinate.split(",") - if (parts.length === 2) { - const x = parseInt(parts[0], 10) - const y = parseInt(parts[1], 10) - if (!isNaN(x) && !isNaN(y)) { - coordinate = { x, y } - } - } else { - // Try parsing as JSON object (fallback) - try { - const parsed = JSON.parse(params.coordinate) - if (parsed && typeof parsed.x === "number" && typeof parsed.y === "number") { - coordinate = { x: parsed.x, y: parsed.y } 
- } - } catch (error) { - // Invalid coordinate format, leave undefined - } - } - } - - // Parse size if present - XML protocol sends "width,height" format - let size: Size | undefined - if (params.size) { - // Try parsing as "width,height" string first (XML protocol) - const parts = params.size.split(",") - if (parts.length === 2) { - const width = parseInt(parts[0], 10) - const height = parseInt(parts[1], 10) - if (!isNaN(width) && !isNaN(height)) { - size = { width, height } - } - } else { - // Try parsing as JSON object (fallback) - try { - const parsed = JSON.parse(params.size) - if (parsed && typeof parsed.width === "number" && typeof parsed.height === "number") { - size = { width: parsed.width, height: parsed.height } - } - } catch (error) { - // Invalid size format, leave undefined - } - } - } - - return { - action: action!, - url: params.url, - coordinate, - size, - text: params.text, - } - } - - async execute(params: BrowserActionParams, task: Task, callbacks: ToolCallbacks): Promise { - const { action, url, coordinate, text, size } = params - const { handleError, pushToolResult } = callbacks - - // Validate action - if (!action || !browserActions.includes(action)) { - task.consecutiveMistakeCount++ - task.recordToolError("browser_action") - pushToolResult(await task.sayAndCreateMissingParamError("browser_action", "action")) - await task.browserSession.closeBrowser() - return - } - - try { - let browserActionResult: BrowserActionResult = {} - - if (action === "launch") { - if (!url) { - task.consecutiveMistakeCount++ - task.recordToolError("browser_action") - pushToolResult(await task.sayAndCreateMissingParamError("browser_action", "url")) - await task.browserSession.closeBrowser() - return - } - - task.consecutiveMistakeCount = 0 - const didApprove = await callbacks.askApproval("browser_action_launch", url) - - if (!didApprove) { - return - } - - await task.say("browser_action_result", "") - await task.browserSession.launchBrowser() - browserActionResult 
= await task.browserSession.navigateToUrl(url) - } else { - // Validate parameters for specific actions - if (action === "click" || action === "hover") { - if (!coordinate) { - task.consecutiveMistakeCount++ - task.recordToolError("browser_action") - pushToolResult(await task.sayAndCreateMissingParamError("browser_action", "coordinate")) - await task.browserSession.closeBrowser() - return - } - } - - if (action === "type") { - if (!text) { - task.consecutiveMistakeCount++ - task.recordToolError("browser_action") - pushToolResult(await task.sayAndCreateMissingParamError("browser_action", "text")) - await task.browserSession.closeBrowser() - return - } - } - - if (action === "resize") { - if (!size) { - task.consecutiveMistakeCount++ - task.recordToolError("browser_action") - pushToolResult(await task.sayAndCreateMissingParamError("browser_action", "size")) - await task.browserSession.closeBrowser() - return - } - } - - task.consecutiveMistakeCount = 0 - - await task.say( - "browser_action", - JSON.stringify({ - action: action as BrowserAction, - coordinate: coordinate ? `${coordinate.x},${coordinate.y}` : undefined, - text, - } satisfies ClineSayBrowserAction), - undefined, - false, - ) - - switch (action) { - case "click": - browserActionResult = await task.browserSession.click(`${coordinate!.x},${coordinate!.y}`) - break - case "hover": - browserActionResult = await task.browserSession.hover(`${coordinate!.x},${coordinate!.y}`) - break - case "type": - browserActionResult = await task.browserSession.type(text!) 
- break - case "scroll_down": - browserActionResult = await task.browserSession.scrollDown() - break - case "scroll_up": - browserActionResult = await task.browserSession.scrollUp() - break - case "resize": - browserActionResult = await task.browserSession.resize(`${size!.width},${size!.height}`) - break - case "close": - browserActionResult = await task.browserSession.closeBrowser() - break - } - } - - switch (action) { - case "launch": - case "click": - case "hover": - case "type": - case "scroll_down": - case "scroll_up": - case "resize": - await task.say("browser_action_result", JSON.stringify(browserActionResult)) - - pushToolResult( - formatResponse.toolResult( - `The browser action has been executed. The console logs and screenshot have been captured for your analysis.\n\nConsole logs:\n${ - browserActionResult?.logs || "(No new logs)" - }\n\n(REMEMBER: if you need to proceed to using non-\`browser_action\` tools or launch a new browser, you MUST first close cline browser. For example, if after analyzing the logs and screenshot you need to edit a file, you must first close the browser before you can use the write_to_file tool.)`, - browserActionResult?.screenshot ? [browserActionResult.screenshot] : [], - ), - ) - break - - case "close": - pushToolResult( - formatResponse.toolResult( - `The browser has been closed. 
// Test coordinate scaling functionality in browser actions
import { describe, it, expect, vi, beforeEach } from "vitest"

// The production scaleCoordinate function is not exported, so this suite checks a
// local copy of its regex pattern and scaling arithmetic. The copy is defined once
// here so that every test exercises exactly the same logic.

/** Accepts "x,y@WIDTHxHEIGHT" or "x,y@WIDTH,HEIGHT", with optional whitespace around each token. */
const COORDINATE_PATTERN = /^\s*(\d+)\s*,\s*(\d+)\s*@\s*(\d+)\s*[x,]\s*(\d+)\s*$/

/**
 * Maps a point given in image pixels onto a viewport of the given size.
 *
 * @param coordinate - "x,y@widthxheight", where width/height are the image dimensions.
 * @param viewportWidth - Target viewport width in pixels.
 * @param viewportHeight - Target viewport height in pixels.
 * @returns The scaled point as "x,y", each component rounded to the nearest integer.
 * @throws Error when `coordinate` does not match COORDINATE_PATTERN.
 */
function scaleCoordinate(coordinate: string, viewportWidth: number, viewportHeight: number): string {
	const match = coordinate.match(COORDINATE_PATTERN)
	if (!match) {
		throw new Error(
			`Invalid coordinate format: "${coordinate}". ` +
				`Expected format: "x,y@widthxheight" (e.g., "450,300@1024x768")`,
		)
	}

	const [, xStr, yStr, imgWidthStr, imgHeightStr] = match
	const x = parseInt(xStr, 10)
	const y = parseInt(yStr, 10)
	const imgWidth = parseInt(imgWidthStr, 10)
	const imgHeight = parseInt(imgHeightStr, 10)

	// Scale each axis independently: position as a fraction of the image, times the viewport size.
	const scaledX = Math.round((x / imgWidth) * viewportWidth)
	const scaledY = Math.round((y / imgHeight) * viewportHeight)

	return `${scaledX},${scaledY}`
}

describe("Browser Action Coordinate Scaling", () => {
	describe("Coordinate format validation", () => {
		it("should match valid coordinate format with image dimensions", () => {
			const validFormats = [
				"450,300@1024x768",
				"0,0@1920x1080",
				"1920,1080@1920x1080",
				"100,200@800x600",
				" 273 , 273 @ 1280x800 ",
				"267,273@1280,800", // comma separator for dimensions
				"450,300@1024,768", // comma separator for dimensions
			]

			for (const coord of validFormats) {
				expect(coord).toMatch(COORDINATE_PATTERN)
			}
		})

		it("should not match invalid coordinate formats", () => {
			const invalidFormats = [
				"450,300", // missing image dimensions
				"450,300@", // incomplete dimensions
				"450,300@1024", // missing height
				"450,300@1024x", // missing height value
				"@1024x768", // missing coordinates
				"450@1024x768", // missing y coordinate
				",300@1024x768", // missing x coordinate
				"450,300@1024x768x2", // extra dimension
				"a,b@1024x768", // non-numeric coordinates
				"450,300@axb", // non-numeric dimensions
			]

			for (const coord of invalidFormats) {
				expect(coord).not.toMatch(COORDINATE_PATTERN)
			}
		})
	})

	describe("Coordinate scaling logic", () => {
		it("should correctly scale coordinates from image to viewport", () => {
			// Test case 1: Same dimensions (no scaling)
			expect(scaleCoordinate("450,300@900x600", 900, 600)).toBe("450,300")

			// Test case 2: Half dimensions (2x upscale)
			expect(scaleCoordinate("225,150@450x300", 900, 600)).toBe("450,300")

			// Test case 3: Double dimensions (0.5x downscale)
			expect(scaleCoordinate("900,600@1800x1200", 900, 600)).toBe("450,300")

			// Test case 4: Different aspect ratio
			expect(scaleCoordinate("512,384@1024x768", 1920, 1080)).toBe("960,540")

			// Test case 5: Edge cases (0,0)
			expect(scaleCoordinate("0,0@1024x768", 1920, 1080)).toBe("0,0")

			// Test case 6: Edge cases (max coordinates)
			expect(scaleCoordinate("1024,768@1024x768", 1920, 1080)).toBe("1920,1080")
		})

		it("should throw error for invalid coordinate format", () => {
			expect(() => scaleCoordinate("450,300", 900, 600)).toThrow("Invalid coordinate format")
			expect(() => scaleCoordinate("450,300@1024", 900, 600)).toThrow("Invalid coordinate format")
			expect(() => scaleCoordinate("invalid", 900, 600)).toThrow("Invalid coordinate format")
		})

		it("should handle rounding correctly", () => {
			// 333 / 1000 * 900 = 299.7 -> rounds to 300
			expect(scaleCoordinate("333,333@1000x1000", 900, 900)).toBe("300,300")

			// 666 / 1000 * 900 = 599.4 -> rounds to 599
			expect(scaleCoordinate("666,666@1000x1000", 900, 900)).toBe("599,599")

			// 500 / 1000 * 900 = 450.0 -> rounds to 450
			expect(scaleCoordinate("500,500@1000x1000", 900, 900)).toBe("450,450")
		})
	})
})
BrowserSessionPanelManager.instances.get(provider) + if (!instance) { + instance = new BrowserSessionPanelManager(provider) + BrowserSessionPanelManager.instances.set(provider, instance) + } + return instance + } + + /** + * Show the browser session panel, creating it if necessary + */ + public async show(): Promise { + await this.createOrShowPanel() + + // Send initial browser session data + const task = this.provider.getCurrentTask() + if (task) { + const messages = task.clineMessages || [] + const browserSessionStartIndex = messages.findIndex( + (m) => + m.ask === "browser_action_launch" || + (m.say === "browser_session_status" && m.text?.includes("opened")), + ) + const browserSessionMessages = + browserSessionStartIndex !== -1 ? messages.slice(browserSessionStartIndex) : [] + const isBrowserSessionActive = task.browserSession?.isSessionActive() ?? false + + await this.updateBrowserSession(browserSessionMessages, isBrowserSessionActive) + } + } + + private async createOrShowPanel(): Promise { + // If panel already exists, show it + if (this.panel) { + this.panel.reveal(vscode.ViewColumn.One) + return + } + + const extensionUri = this.provider.context.extensionUri + const extensionMode = this.provider.context.extensionMode + + // Create new panel + this.panel = vscode.window.createWebviewPanel("roo.browserSession", "Browser Session", vscode.ViewColumn.One, { + enableScripts: true, + retainContextWhenHidden: true, + localResourceRoots: [extensionUri], + }) + + // Set up the webview's HTML content + this.panel.webview.html = + extensionMode === vscode.ExtensionMode.Development + ? 
await this.getHMRHtmlContent(this.panel.webview, extensionUri) + : this.getHtmlContent(this.panel.webview, extensionUri) + + // Wire message channel for this panel (state handshake + actions) + this.panel.webview.onDidReceiveMessage( + async (message: any) => { + try { + // Let the shared handler process commands that work for any webview + if (message?.type) { + await webviewMessageHandler(this.provider as any, message) + } + // Panel-specific readiness and initial state + if (message?.type === "webviewDidLaunch") { + this.isReady = true + // Send full extension state to this panel (the sidebar postState targets the main webview) + const state = await (this.provider as any).getStateToPostToWebview?.() + if (state) { + await this.panel?.webview.postMessage({ type: "state", state }) + } + // Flush any pending browser session update queued before readiness + if (this.pendingUpdate) { + await this.updateBrowserSession(this.pendingUpdate.messages, this.pendingUpdate.isActive) + this.pendingUpdate = undefined + } + // Flush any pending navigation request queued before readiness + if (this.pendingNavigateIndex !== undefined) { + await this.navigateToStep(this.pendingNavigateIndex) + this.pendingNavigateIndex = undefined + } + } + } catch (err) { + console.error("[BrowserSessionPanel] onDidReceiveMessage error:", err) + } + }, + undefined, + this.disposables, + ) + + // Handle panel disposal - track that user closed it manually + this.panel.onDidDispose( + () => { + // Mark that user manually closed the panel (unless we're programmatically disposing) + if (this.panel) { + this.userManuallyClosedPanel = true + } + this.panel = undefined + this.dispose() + }, + null, + this.disposables, + ) + } + + public async updateBrowserSession(messages: ClineMessage[], isBrowserSessionActive: boolean): Promise { + if (!this.panel) { + return + } + // If the panel isn't ready yet, queue the latest snapshot to post after handshake + if (!this.isReady) { + this.pendingUpdate = { messages, 
isActive: isBrowserSessionActive } + return + } + + await this.panel.webview.postMessage({ + type: "browserSessionUpdate", + browserSessionMessages: messages, + isBrowserSessionActive, + }) + } + + /** + * Navigate the Browser Session panel to a specific step index. + * If the panel isn't ready yet, queue the navigation to run after handshake. + */ + public async navigateToStep(stepIndex: number): Promise { + if (!this.panel) { + return + } + if (!this.isReady) { + this.pendingNavigateIndex = stepIndex + return + } + + await this.panel.webview.postMessage({ + type: "browserSessionNavigate", + stepIndex, + }) + } + + /** + * Reset the manual close flag (call this when a new browser session launches) + */ + public resetManualCloseFlag(): void { + this.userManuallyClosedPanel = false + } + + /** + * Check if auto-opening should be allowed (not manually closed by user) + */ + public shouldAllowAutoOpen(): boolean { + return !this.userManuallyClosedPanel + } + + /** + * Whether the Browser Session panel is currently open. + */ + public isOpen(): boolean { + return !!this.panel + } + + /** + * Toggle the Browser Session panel visibility. 
/**
 * Tears down the Browser Session panel and resets all per-panel state.
 *
 * Safe to call when no panel exists. Called both programmatically (e.g. from
 * toggle()) and from the panel's own onDidDispose handler.
 */
public dispose(): void {
	// Clear the panel reference before disposing to prevent marking as manual close
	const panelToDispose = this.panel
	this.panel = undefined

	// Unregister listeners (including the onDidDispose handler registered in
	// this.disposables) before disposing the panel itself.
	while (this.disposables.length) {
		const disposable = this.disposables.pop()
		if (disposable) {
			disposable.dispose()
		}
	}

	try {
		panelToDispose?.dispose()
	} catch (err) {
		// Best-effort teardown: report the failure but still reset state below.
		console.error("[BrowserSessionPanel] dispose error:", err)
	}

	this.isReady = false
	this.pendingUpdate = undefined
	// Drop any navigation queued before the handshake; otherwise it would be
	// replayed against the next panel when that panel reports webviewDidLaunch.
	this.pendingNavigateIndex = undefined
}
+ ${reactRefresh} + + + + ` + } + + private getHtmlContent(webview: vscode.Webview, extensionUri: vscode.Uri): string { + const stylesUri = getUri(webview, extensionUri, ["webview-ui", "build", "assets", "index.css"]) + const scriptUri = getUri(webview, extensionUri, ["webview-ui", "build", "assets", "browser-panel.js"]) + const codiconsUri = getUri(webview, extensionUri, ["assets", "codicons", "codicon.css"]) + + const nonce = getNonce() + + const csp = [ + "default-src 'none'", + `font-src ${webview.cspSource} data:`, + `style-src ${webview.cspSource} 'unsafe-inline'`, + `img-src ${webview.cspSource} data:`, + `script-src ${webview.cspSource} 'wasm-unsafe-eval' 'nonce-${nonce}'`, + `connect-src ${webview.cspSource}`, + ] + + return ` + + + + + + + + + Browser Session + + +
+ + + + ` + } +} diff --git a/src/core/webview/ClineProvider.ts b/src/core/webview/ClineProvider.ts index ff97d5f030a..9a10387dc6b 100644 --- a/src/core/webview/ClineProvider.ts +++ b/src/core/webview/ClineProvider.ts @@ -1974,6 +1974,7 @@ export class ClineProvider alwaysAllowModeSwitch: alwaysAllowModeSwitch ?? false, alwaysAllowSubtasks: alwaysAllowSubtasks ?? false, alwaysAllowUpdateTodoList: alwaysAllowUpdateTodoList ?? false, + isBrowserSessionActive: this.getCurrentTask()?.browserSession?.isSessionActive() ?? false, allowedMaxRequests, allowedMaxCost, autoCondenseContext: autoCondenseContext ?? true, @@ -2187,6 +2188,9 @@ export class ClineProvider ) } + // Get actual browser session state + const isBrowserSessionActive = this.getCurrentTask()?.browserSession?.isSessionActive() ?? false + // Return the same structure as before. return { apiConfiguration: providerSettings, @@ -2205,6 +2209,7 @@ export class ClineProvider alwaysAllowSubtasks: stateValues.alwaysAllowSubtasks ?? false, alwaysAllowFollowupQuestions: stateValues.alwaysAllowFollowupQuestions ?? false, alwaysAllowUpdateTodoList: stateValues.alwaysAllowUpdateTodoList ?? false, + isBrowserSessionActive, followupAutoApproveTimeoutMs: stateValues.followupAutoApproveTimeoutMs ?? 60000, diagnosticsEnabled: stateValues.diagnosticsEnabled ?? 
true, allowedMaxRequests: stateValues.allowedMaxRequests, diff --git a/src/core/webview/__tests__/ClineProvider.spec.ts b/src/core/webview/__tests__/ClineProvider.spec.ts index d494715691c..70876373feb 100644 --- a/src/core/webview/__tests__/ClineProvider.spec.ts +++ b/src/core/webview/__tests__/ClineProvider.spec.ts @@ -503,6 +503,7 @@ describe("ClineProvider", () => { const mockState: ExtensionState = { version: "1.0.0", + isBrowserSessionActive: false, clineMessages: [], taskHistory: [], shouldShowAnnouncement: false, diff --git a/src/core/webview/webviewMessageHandler.ts b/src/core/webview/webviewMessageHandler.ts index 8f89a9ec516..5f7c4fcc3f1 100644 --- a/src/core/webview/webviewMessageHandler.ts +++ b/src/core/webview/webviewMessageHandler.ts @@ -23,6 +23,7 @@ import { type ApiMessage } from "../task-persistence/apiMessages" import { saveTaskMessages } from "../task-persistence" import { ClineProvider } from "./ClineProvider" +import { BrowserSessionPanelManager } from "./BrowserSessionPanelManager" import { handleCheckpointRestoreOperation } from "./checkpointRestoreHandler" import { changeLanguage, t } from "../../i18n" import { Package } from "../../shared/package" @@ -1116,6 +1117,101 @@ export const webviewMessageHandler = async ( case "cancelTask": await provider.cancelTask() break + case "killBrowserSession": + { + const task = provider.getCurrentTask() + if (task?.browserSession) { + await task.browserSession.closeBrowser() + await provider.postStateToWebview() + } + } + break + case "openBrowserSessionPanel": + { + // Toggle the Browser Session panel (open if closed, close if open) + const panelManager = BrowserSessionPanelManager.getInstance(provider) + await panelManager.toggle() + } + break + case "showBrowserSessionPanelAtStep": + { + const panelManager = BrowserSessionPanelManager.getInstance(provider) + + // If this is a launch action, reset the manual close flag + if (message.isLaunchAction) { + panelManager.resetManualCloseFlag() + } + + // 
Show panel if: + // 1. Manual click (forceShow) - always show + // 2. Launch action - always show and reset flag + // 3. Auto-open for non-launch action - only if user hasn't manually closed + if (message.forceShow || message.isLaunchAction || panelManager.shouldAllowAutoOpen()) { + // Ensure panel is shown and populated + await panelManager.show() + + // Navigate to a specific step if provided + // For launch actions: navigate to step 0 + // For manual clicks: navigate to the clicked step + // For auto-opens of regular actions: don't navigate, let BrowserSessionRow's + // internal auto-advance logic handle it (only advances if user is on most recent step) + if (typeof message.stepIndex === "number" && message.stepIndex >= 0) { + await panelManager.navigateToStep(message.stepIndex) + } + } + } + break + case "refreshBrowserSessionPanel": + { + // Re-send the latest browser session snapshot to the panel + const panelManager = BrowserSessionPanelManager.getInstance(provider) + const task = provider.getCurrentTask() + if (task) { + const messages = task.clineMessages || [] + const browserSessionStartIndex = messages.findIndex( + (m) => + m.ask === "browser_action_launch" || + (m.say === "browser_session_status" && m.text?.includes("opened")), + ) + const browserSessionMessages = + browserSessionStartIndex !== -1 ? messages.slice(browserSessionStartIndex) : [] + const isBrowserSessionActive = task.browserSession?.isSessionActive() ?? false + await panelManager.updateBrowserSession(browserSessionMessages, isBrowserSessionActive) + } + } + break + case "allowedCommands": { + // Validate and sanitize the commands array + const commands = message.commands ?? [] + const validCommands = Array.isArray(commands) + ? commands.filter((cmd) => typeof cmd === "string" && cmd.trim().length > 0) + : [] + + await updateGlobalState("allowedCommands", validCommands) + + // Also update workspace settings. 
+ await vscode.workspace + .getConfiguration(Package.name) + .update("allowedCommands", validCommands, vscode.ConfigurationTarget.Global) + + break + } + case "deniedCommands": { + // Validate and sanitize the commands array + const commands = message.commands ?? [] + const validCommands = Array.isArray(commands) + ? commands.filter((cmd) => typeof cmd === "string" && cmd.trim().length > 0) + : [] + + await updateGlobalState("deniedCommands", validCommands) + + // Also update workspace settings. + await vscode.workspace + .getConfiguration(Package.name) + .update("deniedCommands", validCommands, vscode.ConfigurationTarget.Global) + + break + } case "openCustomModesSettings": { const customModesFilePath = await provider.customModesManager.getCustomModesFilePath() @@ -2043,7 +2139,8 @@ export const webviewMessageHandler = async ( provider.postMessageToWebview({ type: "importModeResult", success: true, - slug: result.slug, + // Cast to any to support older ImportResult types that may not declare slug + slug: (result as any)?.slug, }) // Show success message diff --git a/src/services/browser/BrowserSession.ts b/src/services/browser/BrowserSession.ts index 75b432f01d2..fdd897c5ac6 100644 --- a/src/services/browser/BrowserSession.ts +++ b/src/services/browser/BrowserSession.ts @@ -1,7 +1,7 @@ import * as vscode from "vscode" import * as fs from "fs/promises" import * as path from "path" -import { Browser, Page, ScreenshotOptions, TimeoutError, launch, connect } from "puppeteer-core" +import { Browser, Page, ScreenshotOptions, TimeoutError, launch, connect, KeyInput } from "puppeteer-core" // @ts-ignore import PCR from "puppeteer-chromium-resolver" import pWaitFor from "p-wait-for" @@ -25,9 +25,15 @@ export class BrowserSession { private currentMousePosition?: string private lastConnectionAttempt?: number private isUsingRemoteBrowser: boolean = false + private onStateChange?: (isActive: boolean) => void - constructor(context: vscode.ExtensionContext) { + // Track last 
known viewport to surface in environment details + private lastViewportWidth?: number + private lastViewportHeight?: number + + constructor(context: vscode.ExtensionContext, onStateChange?: (isActive: boolean) => void) { this.context = context + this.onStateChange = onStateChange } private async ensureChromiumExists(): Promise { @@ -189,13 +195,20 @@ export class BrowserSession { await this.launchLocalBrowser() } } + + // Notify that browser session is now active + if (this.browser && this.onStateChange) { + this.onStateChange(true) + } } /** * Closes the browser and resets browser state */ async closeBrowser(): Promise { - if (this.browser || this.page) { + const wasActive = !!(this.browser || this.page) + + if (wasActive) { console.log("closing browser...") if (this.isUsingRemoteBrowser && this.browser) { @@ -204,6 +217,11 @@ export class BrowserSession { await this.browser?.close().catch(() => {}) } this.resetBrowserState() + + // Notify that browser session is now inactive + if (this.onStateChange) { + this.onStateChange(false) + } } return {} } @@ -216,12 +234,14 @@ export class BrowserSession { this.page = undefined this.currentMousePosition = undefined this.isUsingRemoteBrowser = false + this.lastViewportWidth = undefined + this.lastViewportHeight = undefined } async doAction(action: (page: Page) => Promise): Promise { if (!this.page) { throw new Error( - "Browser is not launched. This may occur if the browser was automatically closed by a non-`browser_action` tool.", + "Cannot perform browser action: no active browser session. 
The browser must be launched first using the 'launch' action before other browser actions can be performed.", ) } @@ -260,6 +280,11 @@ export class BrowserSession { interval: 100, }).catch(() => {}) + // Draw cursor indicator if we have a cursor position + if (this.currentMousePosition) { + await this.drawCursorIndicator(this.page, this.currentMousePosition) + } + let options: ScreenshotOptions = { encoding: "base64", @@ -291,15 +316,29 @@ export class BrowserSession { throw new Error("Failed to take screenshot.") } + // Remove cursor indicator after taking screenshot + if (this.currentMousePosition) { + await this.removeCursorIndicator(this.page) + } + // this.page.removeAllListeners() <- causes the page to crash! this.page.off("console", consoleListener) this.page.off("pageerror", errorListener) + // Get actual viewport dimensions + const viewport = this.page.viewport() + + // Persist last known viewport dimensions + this.lastViewportWidth = viewport?.width + this.lastViewportHeight = viewport?.height + return { screenshot, logs: logs.join("\n"), currentUrl: this.page.url(), currentMousePosition: this.currentMousePosition, + viewportWidth: viewport?.width, + viewportHeight: viewport?.height, } } @@ -453,6 +492,64 @@ export class BrowserSession { } } + /** + * Force links and window.open to navigate in the same tab. + * This makes clicks on anchors with target="_blank" stay in the current page + * and also intercepts window.open so SPA/open-in-new-tab patterns don't spawn popups. 
/**
 * Presses a single key or a key combination on the current page, then waits
 * briefly for any resulting navigation / network activity to settle.
 *
 * @param key - Key name or "+"-separated combination (e.g., "Enter", "Cmd+K",
 *   "Shift+Enter"). The last segment is the main key; preceding segments are
 *   modifiers. Recognized modifiers: cmd/command/meta, ctrl/control, shift,
 *   alt/option (case-insensitive). Unrecognized modifier segments are ignored.
 * @returns The result produced by doAction for this action.
 */
async press(key: string): Promise<BrowserActionResult> {
	return this.doAction(async (page) => {
		// Parse key combinations (e.g., "Cmd+K", "Shift+Enter")
		const parts = key.split("+").map((k) => k.trim())
		const modifiers: KeyInput[] = []

		// Identify modifiers (every segment except the last)
		for (const rawPart of parts.slice(0, -1)) {
			const part = rawPart.toLowerCase()
			if (part === "cmd" || part === "command" || part === "meta") {
				modifiers.push("Meta")
			} else if (part === "ctrl" || part === "control") {
				modifiers.push("Control")
			} else if (part === "shift") {
				modifiers.push("Shift")
			} else if (part === "alt" || part === "option") {
				modifiers.push("Alt")
			}
		}

		// Map common key aliases to Puppeteer KeyInput values
		const mapping: Record<string, KeyInput> = {
			esc: "Escape",
			return: "Enter",
			escape: "Escape",
			enter: "Enter",
			tab: "Tab",
			space: "Space",
			arrowup: "ArrowUp",
			arrowdown: "ArrowDown",
			arrowleft: "ArrowLeft",
			arrowright: "ArrowRight",
		}
		const rawMainKey = parts[parts.length - 1]
		const mainKey = (mapping[rawMainKey.toLowerCase()] ?? rawMainKey) as KeyInput

		// Avoid new-tab behavior from Enter on links/buttons
		await this.forceLinksToSameTab(page)

		// Track inflight requests so we can detect brief network bursts
		let inflight = 0
		const onRequest = () => {
			inflight++
		}
		const onRequestDone = () => {
			inflight = Math.max(0, inflight - 1)
		}
		page.on("request", onRequest)
		page.on("requestfinished", onRequestDone)
		page.on("requestfailed", onRequestDone)

		try {
			// Start a short navigation wait in parallel; if no nav, it times out harmlessly
			const HARD_CAP_MS = 3000
			const navPromise = page
				.waitForNavigation({
					// domcontentloaded is enough to confirm a submit navigated
					waitUntil: ["domcontentloaded"],
					timeout: HARD_CAP_MS,
				})
				.catch(() => undefined)

			// Track which modifiers were actually pressed so that exactly those are
			// released, even when a later down()/press() call throws.
			const heldModifiers: KeyInput[] = []
			try {
				for (const modifier of modifiers) {
					await page.keyboard.down(modifier)
					heldModifiers.push(modifier)
				}

				await page.keyboard.press(mainKey)
			} finally {
				for (const modifier of heldModifiers) {
					// Log instead of throwing so a release failure cannot mask the original error
					await page.keyboard
						.up(modifier)
						.catch((err) => console.log("Failed to release modifier key:", err))
				}
			}

			// Give time for any requests to kick off
			await delay(120)

			// Hard-cap the wait to avoid UI hangs
			await Promise.race([
				navPromise,
				pWaitFor(() => inflight === 0, { timeout: HARD_CAP_MS, interval: 100 }).catch(() => {}),
				delay(HARD_CAP_MS),
			])

			// Stabilize DOM briefly before capturing screenshot (shorter cap)
			await this.waitTillHTMLStable(page, 2_000)
		} finally {
			// Cleanup: always detach listeners so they do not accumulate on the
			// page across actions when this action fails part-way.
			page.off("request", onRequest)
			page.off("requestfinished", onRequestDone)
			page.off("requestfailed", onRequestDone)
		}
	})
}
+ /** + * Returns whether a browser session is currently active + */ + isSessionActive(): boolean { + return !!(this.browser && this.page) + } + + /** + * Returns the last known viewport size (if any) + */ + getViewportSize(): { width?: number; height?: number } { + return { + width: this.lastViewportWidth, + height: this.lastViewportHeight, + } + } } diff --git a/src/services/browser/UrlContentFetcher.ts b/src/services/browser/UrlContentFetcher.ts index b271bc2ef41..2d8e4a3de84 100644 --- a/src/services/browser/UrlContentFetcher.ts +++ b/src/services/browser/UrlContentFetcher.ts @@ -90,9 +90,9 @@ export class UrlContentFetcher { throw new Error("Browser not initialized") } /* - - networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms. - - domcontentloaded is when the basic DOM is loaded - this should be sufficient for most doc sites + - In Puppeteer, "networkidle2" waits until there are no more than 2 network connections for at least 500 ms (roughly equivalent to Playwright's "networkidle"). + - "domcontentloaded" is when the basic DOM is loaded. + This should be sufficient for most doc sites. 
*/ try { await this.page.goto(url, { diff --git a/src/services/browser/__tests__/BrowserSession.spec.ts b/src/services/browser/__tests__/BrowserSession.spec.ts index b69fb2d1406..d3784c3afff 100644 --- a/src/services/browser/__tests__/BrowserSession.spec.ts +++ b/src/services/browser/__tests__/BrowserSession.spec.ts @@ -229,4 +229,169 @@ describe("BrowserSession", () => { expect(mockBrowser.close).not.toHaveBeenCalled() }) }) + + it("forces same-tab behavior before click", async () => { + // Prepare a minimal mock page with required APIs + const page: any = { + on: vi.fn(), + off: vi.fn(), + screenshot: vi.fn().mockResolvedValue("mockScreenshotBase64"), + url: vi.fn().mockReturnValue("https://example.com"), + viewport: vi.fn().mockReturnValue({ width: 900, height: 600 }), + waitForNavigation: vi.fn().mockResolvedValue(undefined), + evaluate: vi.fn().mockResolvedValue(undefined), + mouse: { + click: vi.fn().mockResolvedValue(undefined), + move: vi.fn().mockResolvedValue(undefined), + }, + } + + ;(browserSession as any).page = page + + // Spy on the forceLinksToSameTab helper to ensure it's invoked + const forceSpy = vi.fn().mockResolvedValue(undefined) + ;(browserSession as any).forceLinksToSameTab = forceSpy + + await browserSession.click("10,20") + + expect(forceSpy).toHaveBeenCalledTimes(1) + expect(forceSpy).toHaveBeenCalledWith(page) + expect(page.mouse.click).toHaveBeenCalledWith(10, 20) + }) +}) + +describe("keyboard press", () => { + it("presses a keyboard key", async () => { + // Prepare a minimal mock page with required APIs + const page: any = { + on: vi.fn(), + off: vi.fn(), + screenshot: vi.fn().mockResolvedValue("mockScreenshotBase64"), + url: vi.fn().mockReturnValue("https://example.com"), + viewport: vi.fn().mockReturnValue({ width: 900, height: 600 }), + waitForNavigation: vi.fn().mockResolvedValue(undefined), + evaluate: vi.fn().mockResolvedValue(undefined), + keyboard: { + press: vi.fn().mockResolvedValue(undefined), + type: 
vi.fn().mockResolvedValue(undefined), + }, + } + + // Create a fresh BrowserSession with a mock context + const mockCtx: any = { + globalState: { get: vi.fn(), update: vi.fn() }, + globalStorageUri: { fsPath: "/mock/global/storage/path" }, + extensionUri: { fsPath: "/mock/extension/path" }, + } + const session = new BrowserSession(mockCtx) + + ;(session as any).page = page + + await session.press("Enter") + + expect(page.keyboard.press).toHaveBeenCalledTimes(1) + expect(page.keyboard.press).toHaveBeenCalledWith("Enter") + }) +}) + +describe("cursor visualization", () => { + it("should draw cursor indicator when cursor position exists", async () => { + // Prepare a minimal mock page with required APIs + const page: any = { + on: vi.fn(), + off: vi.fn(), + screenshot: vi.fn().mockResolvedValue("mockScreenshotBase64"), + url: vi.fn().mockReturnValue("https://example.com"), + viewport: vi.fn().mockReturnValue({ width: 900, height: 600 }), + evaluate: vi.fn().mockResolvedValue(undefined), + mouse: { + click: vi.fn().mockResolvedValue(undefined), + }, + } + + // Create a fresh BrowserSession with a mock context + const mockCtx: any = { + globalState: { get: vi.fn(), update: vi.fn() }, + globalStorageUri: { fsPath: "/mock/global/storage/path" }, + extensionUri: { fsPath: "/mock/extension/path" }, + } + const session = new BrowserSession(mockCtx) + + ;(session as any).page = page + + // Perform a click action which sets cursor position + const result = await session.click("100,200") + + // Verify cursor indicator was drawn and removed + // evaluate is called 3 times: 1 for forceLinksToSameTab, 1 for draw cursor, 1 for remove cursor + expect(page.evaluate).toHaveBeenCalled() + + // Verify the result includes cursor position + expect(result.currentMousePosition).toBe("100,200") + }) + + it("should include cursor position in action result", async () => { + // Prepare a minimal mock page with required APIs + const page: any = { + on: vi.fn(), + off: vi.fn(), + screenshot: 
vi.fn().mockResolvedValue("mockScreenshotBase64"), + url: vi.fn().mockReturnValue("https://example.com"), + viewport: vi.fn().mockReturnValue({ width: 900, height: 600 }), + evaluate: vi.fn().mockResolvedValue(undefined), + mouse: { + move: vi.fn().mockResolvedValue(undefined), + }, + } + + // Create a fresh BrowserSession with a mock context + const mockCtx: any = { + globalState: { get: vi.fn(), update: vi.fn() }, + globalStorageUri: { fsPath: "/mock/global/storage/path" }, + extensionUri: { fsPath: "/mock/extension/path" }, + } + const session = new BrowserSession(mockCtx) + + ;(session as any).page = page + + // Perform a hover action which sets cursor position + const result = await session.hover("150,250") + + // Verify the result includes cursor position + expect(result.currentMousePosition).toBe("150,250") + expect(result.viewportWidth).toBe(900) + expect(result.viewportHeight).toBe(600) + }) + + it("should not draw cursor indicator when no cursor position exists", async () => { + // Prepare a minimal mock page with required APIs + const page: any = { + on: vi.fn(), + off: vi.fn(), + screenshot: vi.fn().mockResolvedValue("mockScreenshotBase64"), + url: vi.fn().mockReturnValue("https://example.com"), + viewport: vi.fn().mockReturnValue({ width: 900, height: 600 }), + evaluate: vi.fn().mockResolvedValue(undefined), + } + + // Create a fresh BrowserSession with a mock context + const mockCtx: any = { + globalState: { get: vi.fn(), update: vi.fn() }, + globalStorageUri: { fsPath: "/mock/global/storage/path" }, + extensionUri: { fsPath: "/mock/extension/path" }, + } + const session = new BrowserSession(mockCtx) + + ;(session as any).page = page + + // Perform scroll action which doesn't set cursor position + const result = await session.scrollDown() + + // Verify evaluate was called only for scroll operation (not for cursor drawing/removal) + // scrollDown calls evaluate once for scrolling + expect(page.evaluate).toHaveBeenCalledTimes(1) + + // Verify no cursor 
position in result + expect(result.currentMousePosition).toBeUndefined() + }) }) diff --git a/src/shared/ExtensionMessage.ts b/src/shared/ExtensionMessage.ts index 59745b9cf99..e456452fe2c 100644 --- a/src/shared/ExtensionMessage.ts +++ b/src/shared/ExtensionMessage.ts @@ -129,6 +129,8 @@ export interface ExtensionMessage { | "dismissedUpsells" | "organizationSwitchResult" | "interactionRequired" + | "browserSessionUpdate" + | "browserSessionNavigate" text?: string payload?: any // Add a generic payload for now, can refine later // Checkpoint warning message @@ -213,6 +215,9 @@ export interface ExtensionMessage { queuedMessages?: QueuedMessage[] list?: string[] // For dismissedUpsells organizationId?: string | null // For organizationSwitchResult + browserSessionMessages?: ClineMessage[] // For browser session panel updates + isBrowserSessionActive?: boolean // For browser session panel updates + stepIndex?: number // For browserSessionNavigate: the target step index to display } export type ExtensionState = Pick< @@ -333,6 +338,8 @@ export type ExtensionState = Pick< organizationAllowList: OrganizationAllowList organizationSettingsVersion?: number + isBrowserSessionActive: boolean // Actual browser session state + autoCondenseContext: boolean autoCondenseContextPercent: number marketplaceItems?: MarketplaceItem[] @@ -420,6 +427,7 @@ export const browserActions = [ "click", "hover", "type", + "press", "scroll_down", "scroll_up", "resize", @@ -440,6 +448,8 @@ export type BrowserActionResult = { logs?: string currentUrl?: string currentMousePosition?: string + viewportWidth?: number + viewportHeight?: number } export interface ClineAskUseMcpServer { diff --git a/src/shared/WebviewMessage.ts b/src/shared/WebviewMessage.ts index 1d403f16caa..5806da8e973 100644 --- a/src/shared/WebviewMessage.ts +++ b/src/shared/WebviewMessage.ts @@ -166,6 +166,13 @@ export interface WebviewMessage { | "dismissUpsell" | "getDismissedUpsells" | "updateSettings" + | "allowedCommands" + | 
"deniedCommands" + | "killBrowserSession" + | "openBrowserSessionPanel" + | "showBrowserSessionPanelAtStep" + | "refreshBrowserSessionPanel" + | "browserPanelDidLaunch" text?: string editedMessageContent?: string tab?: "settings" | "history" | "mcp" | "modes" | "chat" | "marketplace" | "cloud" @@ -177,6 +184,9 @@ export interface WebviewMessage { images?: string[] bool?: boolean value?: number + stepIndex?: number + isLaunchAction?: boolean + forceShow?: boolean commands?: string[] audioType?: AudioType serverName?: string diff --git a/webview-ui/browser-panel.html b/webview-ui/browser-panel.html new file mode 100644 index 00000000000..92943abfe34 --- /dev/null +++ b/webview-ui/browser-panel.html @@ -0,0 +1,12 @@ + + + + + + Browser Session + + +
+ + + \ No newline at end of file diff --git a/webview-ui/src/browser-panel.tsx b/webview-ui/src/browser-panel.tsx new file mode 100644 index 00000000000..a7f5af891e6 --- /dev/null +++ b/webview-ui/src/browser-panel.tsx @@ -0,0 +1,12 @@ +import { StrictMode } from "react" +import { createRoot } from "react-dom/client" + +import "./index.css" +import BrowserSessionPanel from "./components/browser-session/BrowserSessionPanel" +import "../node_modules/@vscode/codicons/dist/codicon.css" + +createRoot(document.getElementById("root")!).render( + + + , +) diff --git a/webview-ui/src/components/browser-session/BrowserPanelStateProvider.tsx b/webview-ui/src/components/browser-session/BrowserPanelStateProvider.tsx new file mode 100644 index 00000000000..50b078c7402 --- /dev/null +++ b/webview-ui/src/components/browser-session/BrowserPanelStateProvider.tsx @@ -0,0 +1,60 @@ +import React, { createContext, useContext, useState, useEffect, useCallback } from "react" +import { ExtensionMessage } from "@roo/ExtensionMessage" + +interface BrowserPanelState { + browserViewportSize: string + isBrowserSessionActive: boolean + language: string +} + +const BrowserPanelStateContext = createContext(undefined) + +export const BrowserPanelStateProvider: React.FC<{ children: React.ReactNode }> = ({ children }) => { + const [state, setState] = useState({ + browserViewportSize: "900x600", + isBrowserSessionActive: false, + language: "en", + }) + + const handleMessage = useCallback((event: MessageEvent) => { + const message: ExtensionMessage = event.data + + switch (message.type) { + case "state": + if (message.state) { + setState((prev) => ({ + ...prev, + browserViewportSize: message.state?.browserViewportSize || "900x600", + isBrowserSessionActive: message.state?.isBrowserSessionActive || false, + language: message.state?.language || "en", + })) + } + break + case "browserSessionUpdate": + if (message.isBrowserSessionActive !== undefined) { + setState((prev) => ({ + ...prev, + 
isBrowserSessionActive: message.isBrowserSessionActive || false, + })) + } + break + } + }, []) + + useEffect(() => { + window.addEventListener("message", handleMessage) + return () => { + window.removeEventListener("message", handleMessage) + } + }, [handleMessage]) + + return {children} +} + +export const useBrowserPanelState = () => { + const context = useContext(BrowserPanelStateContext) + if (context === undefined) { + throw new Error("useBrowserPanelState must be used within a BrowserPanelStateProvider") + } + return context +} diff --git a/webview-ui/src/components/browser-session/BrowserSessionPanel.tsx b/webview-ui/src/components/browser-session/BrowserSessionPanel.tsx new file mode 100644 index 00000000000..00f3e176b6c --- /dev/null +++ b/webview-ui/src/components/browser-session/BrowserSessionPanel.tsx @@ -0,0 +1,109 @@ +import React, { useEffect, useState, useCallback } from "react" +import { type ClineMessage } from "@roo-code/types" +import BrowserSessionRow from "../chat/BrowserSessionRow" +import { TooltipProvider } from "@src/components/ui/tooltip" +import ErrorBoundary from "../ErrorBoundary" +import TranslationProvider from "@src/i18n/TranslationContext" +import { ExtensionMessage } from "@roo/ExtensionMessage" +import { BrowserPanelStateProvider, useBrowserPanelState } from "./BrowserPanelStateProvider" +import { vscode } from "@src/utils/vscode" +import { ExtensionStateContextProvider } from "@/context/ExtensionStateContext" + +interface BrowserSessionPanelState { + messages: ClineMessage[] +} + +const BrowserSessionPanelContent: React.FC = () => { + const { browserViewportSize, isBrowserSessionActive } = useBrowserPanelState() + const [state, setState] = useState({ + messages: [], + }) + // Target page index to navigate BrowserSessionRow to + const [navigateToStepIndex, setNavigateToStepIndex] = useState(undefined) + + const [expandedRows, setExpandedRows] = useState>({}) + + useEffect(() => { + const handleMessage = (event: MessageEvent) => { 
+ const message: ExtensionMessage = event.data + + switch (message.type) { + case "browserSessionUpdate": + if (message.browserSessionMessages) { + setState((prev) => ({ + ...prev, + messages: message.browserSessionMessages || [], + })) + } + break + case "browserSessionNavigate": + if (typeof message.stepIndex === "number" && message.stepIndex >= 0) { + setNavigateToStepIndex(message.stepIndex) + } + break + } + } + + window.addEventListener("message", handleMessage) + + return () => { + window.removeEventListener("message", handleMessage) + } + }, []) + + const handleHeightChange = useCallback(() => { + // No-op for panel - no scrolling needed + }, []) + + return ( +
+
+ expandedRows[messageTs] ?? false} + onToggleExpand={(messageTs: number) => { + setExpandedRows((prev: Record) => ({ + ...prev, + [messageTs]: !prev[messageTs], + })) + }} + fullScreen={true} + browserViewportSizeProp={browserViewportSize} + isBrowserSessionActiveProp={isBrowserSessionActive} + navigateToPageIndex={navigateToStepIndex} + /> +
+
+ ) +} + +const BrowserSessionPanel: React.FC = () => { + // Ensure the panel receives initial state and becomes "ready" without needing a second click + useEffect(() => { + try { + vscode.postMessage({ type: "webviewDidLaunch" }) + } catch { + // Ignore errors during initial launch + } + }, []) + + return ( + + + + + + + + + + + + ) +} + +export default BrowserSessionPanel diff --git a/webview-ui/src/components/chat/BrowserActionRow.tsx b/webview-ui/src/components/chat/BrowserActionRow.tsx new file mode 100644 index 00000000000..9b13b9426c0 --- /dev/null +++ b/webview-ui/src/components/chat/BrowserActionRow.tsx @@ -0,0 +1,247 @@ +import { memo, useMemo, useEffect, useRef } from "react" +import { ClineMessage } from "@roo-code/types" +import { ClineSayBrowserAction } from "@roo/ExtensionMessage" +import { vscode } from "@src/utils/vscode" +import { + MousePointer as MousePointerIcon, + Keyboard, + ArrowDown, + ArrowUp, + Pointer, + Play, + Check, + Maximize2, +} from "lucide-react" +import { useExtensionState } from "@src/context/ExtensionStateContext" + +const prettyKey = (k?: string): string => { + if (!k) return "" + return k + .split("+") + .map((part) => { + const p = part.trim() + const lower = p.toLowerCase() + const map: Record = { + enter: "Enter", + tab: "Tab", + escape: "Esc", + esc: "Esc", + backspace: "Backspace", + space: "Space", + shift: "Shift", + control: "Ctrl", + ctrl: "Ctrl", + alt: "Alt", + meta: "Meta", + command: "Cmd", + cmd: "Cmd", + arrowup: "Arrow Up", + arrowdown: "Arrow Down", + arrowleft: "Arrow Left", + arrowright: "Arrow Right", + pageup: "Page Up", + pagedown: "Page Down", + home: "Home", + end: "End", + } + if (map[lower]) return map[lower] + const keyMatch = /^Key([A-Z])$/.exec(p) + if (keyMatch) return keyMatch[1].toUpperCase() + const digitMatch = /^Digit([0-9])$/.exec(p) + if (digitMatch) return digitMatch[1] + const spaced = p.replace(/([a-z])([A-Z])/g, "$1 $2") + return spaced.charAt(0).toUpperCase() + spaced.slice(1) + }) + 
.join(" + ") +} + +interface BrowserActionRowProps { + message: ClineMessage + nextMessage?: ClineMessage + actionIndex?: number + totalActions?: number +} + +// Get icon for each action type +const getActionIcon = (action: string) => { + switch (action) { + case "click": + return + case "type": + case "press": + return + case "scroll_down": + return + case "scroll_up": + return + case "launch": + return + case "close": + return + case "resize": + return + case "hover": + default: + return + } +} + +const BrowserActionRow = memo(({ message, nextMessage, actionIndex, totalActions }: BrowserActionRowProps) => { + const { isBrowserSessionActive } = useExtensionState() + const hasHandledAutoOpenRef = useRef(false) + + // Parse this specific browser action + const browserAction = useMemo(() => { + try { + return JSON.parse(message.text || "{}") as ClineSayBrowserAction + } catch { + return null + } + }, [message.text]) + + // Get viewport dimensions from the result message if available + const viewportDimensions = useMemo(() => { + if (!nextMessage || nextMessage.say !== "browser_action_result") return null + try { + const result = JSON.parse(nextMessage.text || "{}") + return { + width: result.viewportWidth, + height: result.viewportHeight, + } + } catch { + return null + } + }, [nextMessage]) + + // Format action display text + const actionText = useMemo(() => { + if (!browserAction) return "Browser action" + + // Helper to scale coordinates from screenshot dimensions to viewport dimensions + // Matches the backend's scaleCoordinate function logic + const getViewportCoordinate = (coord?: string): string => { + if (!coord) return "" + + // Parse "x,y@widthxheight" format + const match = /^\s*(\d+)\s*,\s*(\d+)\s*@\s*(\d+)\s*[x,]\s*(\d+)\s*$/.exec(coord) + if (!match) { + // If no @dimensions, return as-is (might be plain x,y format) + const simpleMatch = /^\s*(\d+)\s*,\s*(\d+)/.exec(coord) + return simpleMatch ? 
`${simpleMatch[1]},${simpleMatch[2]}` : coord + } + + const x = parseInt(match[1], 10) + const y = parseInt(match[2], 10) + const imgWidth = parseInt(match[3], 10) + const imgHeight = parseInt(match[4], 10) + + // If we don't have viewport dimensions, just return the screenshot coordinates + if (!viewportDimensions?.width || !viewportDimensions?.height) { + return `${x},${y}` + } + + // Scale coordinates from image dimensions to viewport dimensions (same as backend) + const scaledX = Math.round((x / imgWidth) * viewportDimensions.width) + const scaledY = Math.round((y / imgHeight) * viewportDimensions.height) + + return `${scaledX},${scaledY}` + } + + switch (browserAction.action) { + case "launch": + return `Launched browser` + case "click": + return `Clicked at: ${getViewportCoordinate(browserAction.coordinate)}` + case "type": + return `Typed: ${browserAction.text}` + case "press": + return `Pressed key: ${prettyKey(browserAction.text)}` + case "hover": + return `Hovered at: ${getViewportCoordinate(browserAction.coordinate)}` + case "scroll_down": + return "Scrolled down" + case "scroll_up": + return "Scrolled up" + case "resize": + return `Resized to: ${browserAction.size?.split(/[x,]/).join(" x ")}` + case "close": + return "Closed browser" + default: + return browserAction.action + } + }, [browserAction, viewportDimensions]) + + // Auto-open Browser Session panel when: + // 1. This is a "launch" action (new browser session) - always opens and navigates to launch + // 2. 
Regular actions - only open panel if user hasn't manually closed it, let internal auto-advance logic handle step + // Only run this once per action to avoid re-sending messages when scrolling + useEffect(() => { + if (!isBrowserSessionActive || hasHandledAutoOpenRef.current) { + return + } + + const isLaunchAction = browserAction?.action === "launch" + + if (isLaunchAction) { + // Launch action: navigate to step 0 (the launch) + vscode.postMessage({ + type: "showBrowserSessionPanelAtStep", + stepIndex: 0, + isLaunchAction: true, + }) + hasHandledAutoOpenRef.current = true + } else { + // Regular actions: just show panel, don't navigate + // BrowserSessionRow's internal auto-advance logic will handle jumping to new steps + // only if user is currently on the most recent step + vscode.postMessage({ + type: "showBrowserSessionPanelAtStep", + isLaunchAction: false, + }) + hasHandledAutoOpenRef.current = true + } + }, [isBrowserSessionActive, browserAction]) + + const headerStyle: React.CSSProperties = { + display: "flex", + alignItems: "center", + gap: "10px", + marginBottom: "10px", + wordBreak: "break-word", + } + + return ( +
+ {/* Header with action description - clicking opens Browser Session panel at this step */} +
{ + const idx = typeof actionIndex === "number" ? Math.max(0, actionIndex - 1) : 0 + vscode.postMessage({ type: "showBrowserSessionPanelAtStep", stepIndex: idx, forceShow: true }) + }}> + + Browser Action + {actionIndex !== undefined && totalActions !== undefined && ( + + {" "} + - {actionIndex}/{totalActions} -{" "} + + )} + {browserAction && ( + <> + {getActionIcon(browserAction.action)} + {actionText} + + )} +
+
+ ) +}) + +BrowserActionRow.displayName = "BrowserActionRow" + +export default BrowserActionRow diff --git a/webview-ui/src/components/chat/BrowserSessionRow.tsx b/webview-ui/src/components/chat/BrowserSessionRow.tsx index 57cb0cf2432..695d40bc848 100644 --- a/webview-ui/src/components/chat/BrowserSessionRow.tsx +++ b/webview-ui/src/components/chat/BrowserSessionRow.tsx @@ -1,20 +1,160 @@ import React, { memo, useEffect, useMemo, useRef, useState } from "react" -import { useSize } from "react-use" import deepEqual from "fast-deep-equal" import { useTranslation } from "react-i18next" - import type { ClineMessage } from "@roo-code/types" import { BrowserAction, BrowserActionResult, ClineSayBrowserAction } from "@roo/ExtensionMessage" import { vscode } from "@src/utils/vscode" import { useExtensionState } from "@src/context/ExtensionStateContext" -import { Button } from "@src/components/ui" -import CodeBlock, { CODE_BLOCK_BG_COLOR } from "../common/CodeBlock" -import { ChatRowContent } from "./ChatRow" -import { ProgressIndicator } from "./ProgressIndicator" -import { Globe, Pointer, SquareTerminal } from "lucide-react" +import CodeBlock from "../common/CodeBlock" +import { Button, StandardTooltip } from "@src/components/ui" +import { + Globe, + Pointer, + SquareTerminal, + MousePointer as MousePointerIcon, + Keyboard, + ArrowDown, + ArrowUp, + Play, + Check, + Maximize2, + OctagonX, + ArrowLeft, + ArrowRight, + ChevronsLeft, + ChevronsRight, + ExternalLink, + Copy, +} from "lucide-react" + +const prettyKey = (k?: string): string => { + if (!k) return "" + return k + .split("+") + .map((part) => { + const p = part.trim() + const lower = p.toLowerCase() + const map: Record = { + enter: "Enter", + tab: "Tab", + escape: "Esc", + esc: "Esc", + backspace: "Backspace", + space: "Space", + shift: "Shift", + control: "Ctrl", + ctrl: "Ctrl", + alt: "Alt", + meta: "Meta", + command: "Cmd", + cmd: "Cmd", + arrowup: "Arrow Up", + arrowdown: "Arrow Down", + arrowleft: "Arrow 
Left", + arrowright: "Arrow Right", + pageup: "Page Up", + pagedown: "Page Down", + home: "Home", + end: "End", + } + if (map[lower]) return map[lower] + const keyMatch = /^Key([A-Z])$/.exec(p) + if (keyMatch) return keyMatch[1].toUpperCase() + const digitMatch = /^Digit([0-9])$/.exec(p) + if (digitMatch) return digitMatch[1] + const spaced = p.replace(/([a-z])([A-Z])/g, "$1 $2") + return spaced.charAt(0).toUpperCase() + spaced.slice(1) + }) + .join(" + ") +} + +const getBrowserActionText = ( + action: BrowserAction, + coordinate?: string, + text?: string, + size?: string, + viewportWidth?: number, + viewportHeight?: number, +) => { + // Helper to scale coordinates from screenshot dimensions to viewport dimensions + // Matches the backend's scaleCoordinate function logic + const getViewportCoordinate = (coord?: string): string => { + if (!coord) return "" + + // Parse "x,y@widthxheight" format + const match = /^\s*(\d+)\s*,\s*(\d+)\s*@\s*(\d+)\s*[x,]\s*(\d+)\s*$/.exec(coord) + if (!match) { + // If no @dimensions, return as-is (might be plain x,y format) + const simpleMatch = /^\s*(\d+)\s*,\s*(\d+)/.exec(coord) + return simpleMatch ? 
`${simpleMatch[1]},${simpleMatch[2]}` : coord + } + + const x = parseInt(match[1], 10) + const y = parseInt(match[2], 10) + const imgWidth = parseInt(match[3], 10) + const imgHeight = parseInt(match[4], 10) + + // If we don't have viewport dimensions, just return the screenshot coordinates + if (!viewportWidth || !viewportHeight) { + return `${x},${y}` + } + + // Scale coordinates from image dimensions to viewport dimensions (same as backend) + const scaledX = Math.round((x / imgWidth) * viewportWidth) + const scaledY = Math.round((y / imgHeight) * viewportHeight) + + return `${scaledX},${scaledY}` + } + + switch (action) { + case "launch": + return `Launched browser` + case "click": + return `Clicked at: ${getViewportCoordinate(coordinate)}` + case "type": + return `Typed: ${text}` + case "press": + return `Pressed key: ${prettyKey(text)}` + case "scroll_down": + return "Scrolled down" + case "scroll_up": + return "Scrolled up" + case "hover": + return `Hovered at: ${getViewportCoordinate(coordinate)}` + case "resize": + return `Resized to: ${size?.split(/[x,]/).join(" x ")}` + case "close": + return "Closed browser" + default: + return action + } +} + +const getActionIcon = (action: BrowserAction) => { + switch (action) { + case "click": + return + case "type": + case "press": + return + case "scroll_down": + return + case "scroll_up": + return + case "launch": + return + case "close": + return + case "resize": + return + case "hover": + default: + return + } +} interface BrowserSessionRowProps { messages: ClineMessage[] @@ -24,18 +164,65 @@ interface BrowserSessionRowProps { isLast: boolean onHeightChange: (isTaller: boolean) => void isStreaming: boolean + onExpandChange?: (expanded: boolean) => void + fullScreen?: boolean + // Optional props for standalone panel (when not using ExtensionStateContext) + browserViewportSizeProp?: string + isBrowserSessionActiveProp?: boolean + // Optional: navigate to a specific page index (used by Browser Session panel) + 
navigateToPageIndex?: number } const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { - const { messages, isLast, onHeightChange, lastModifiedMessage } = props + const { messages, isLast, onHeightChange, lastModifiedMessage, onExpandChange, fullScreen } = props const { t } = useTranslation() const prevHeightRef = useRef(0) - const [maxActionHeight, setMaxActionHeight] = useState(0) const [consoleLogsExpanded, setConsoleLogsExpanded] = useState(false) + const [nextActionsExpanded, setNextActionsExpanded] = useState(false) + const [logFilter, setLogFilter] = useState<"all" | "debug" | "info" | "warn" | "error" | "log">("all") + // Track screenshot container size for precise cursor positioning with object-fit: contain + const screenshotRef = useRef(null) + const [sW, setSW] = useState(0) + const [sH, setSH] = useState(0) + + // Auto-expand drawer when in fullScreen takeover mode so content is visible immediately + useEffect(() => { + if (fullScreen) { + setNextActionsExpanded(true) + } + }, [fullScreen]) + + // Observe screenshot container size to align cursor correctly with letterboxing + useEffect(() => { + const el = screenshotRef.current + if (!el) return + const update = () => { + const r = el.getBoundingClientRect() + setSW(r.width) + setSH(r.height) + } + update() + const ro = + typeof window !== "undefined" && "ResizeObserver" in window ? 
new ResizeObserver(() => update()) : null + if (ro) ro.observe(el) + return () => { + if (ro) ro.disconnect() + } + }, []) + + // Try to use ExtensionStateContext if available, otherwise use props + let browserViewportSize = props.browserViewportSizeProp || "900x600" + let isBrowserSessionActive = props.isBrowserSessionActiveProp || false + + try { + const extensionState = useExtensionState() + browserViewportSize = extensionState.browserViewportSize || "900x600" + isBrowserSessionActive = extensionState.isBrowserSessionActive || false + } catch (_e) { + // Not in ExtensionStateContext, use props + } - const { browserViewportSize = "900x600" } = useExtensionState() const [viewportWidth, viewportHeight] = browserViewportSize.split("x").map(Number) - const aspectRatio = ((viewportHeight / viewportWidth) * 100).toFixed(2) const defaultMousePosition = `${Math.round(viewportWidth / 2)},${Math.round(viewportHeight / 2)}` const isLastApiReqInterrupted = useMemo(() => { @@ -58,93 +245,106 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { return isLast && messages.some((m) => m.say === "browser_action_result") && !isLastApiReqInterrupted // after user approves, browser_action_result with "" is sent to indicate that the session has started }, [isLast, messages, isLastApiReqInterrupted]) - // Organize messages into pages with current state and next action + // Organize messages into pages based on ALL browser actions (including those without screenshots) const pages = useMemo(() => { const result: { - currentState: { - url?: string - screenshot?: string - mousePosition?: string - consoleLogs?: string - messages: ClineMessage[] // messages up to and including the result - } - nextAction?: { - messages: ClineMessage[] // messages leading to next result - } + url?: string + screenshot?: string + mousePosition?: string + consoleLogs?: string + action?: ClineSayBrowserAction + size?: string + viewportWidth?: number + viewportHeight?: number }[] = [] - let 
currentStateMessages: ClineMessage[] = [] - let nextActionMessages: ClineMessage[] = [] - + // Build pages from browser_action messages and pair with results messages.forEach((message) => { - if (message.ask === "browser_action_launch") { - // Start first page - currentStateMessages = [message] - } else if (message.say === "browser_action_result") { - if (message.text === "") { - // first browser_action_result is an empty string that signals that session has started - return + if (message.say === "browser_action") { + try { + const action = JSON.parse(message.text || "{}") as ClineSayBrowserAction + // Find the corresponding result message + const resultMessage = messages.find( + (m) => m.say === "browser_action_result" && m.ts > message.ts && m.text !== "", + ) + + if (resultMessage) { + const resultData = JSON.parse(resultMessage.text || "{}") as BrowserActionResult + result.push({ + url: resultData.currentUrl, + screenshot: resultData.screenshot, + mousePosition: resultData.currentMousePosition, + consoleLogs: resultData.logs, + action, + size: action.size, + viewportWidth: resultData.viewportWidth, + viewportHeight: resultData.viewportHeight, + }) + } else { + // For actions without results (like close), add a page without screenshot + result.push({ action, size: action.size }) + } + } catch { + // ignore parse errors } - // Complete current state - currentStateMessages.push(message) - const resultData = JSON.parse(message.text || "{}") as BrowserActionResult - - // Add page with current state and previous next actions - result.push({ - currentState: { - url: resultData.currentUrl, - screenshot: resultData.screenshot, - mousePosition: resultData.currentMousePosition, - consoleLogs: resultData.logs, - messages: [...currentStateMessages], - }, - nextAction: - nextActionMessages.length > 0 - ? 
{ - messages: [...nextActionMessages], - } - : undefined, - }) - - // Reset for next page - currentStateMessages = [] - nextActionMessages = [] - } else if ( - message.say === "api_req_started" || - message.say === "text" || - message.say === "browser_action" - ) { - // These messages lead to the next result, so they should always go in nextActionMessages - nextActionMessages.push(message) - } else { - // Any other message types - currentStateMessages.push(message) } }) - // Add incomplete page if exists - if (currentStateMessages.length > 0 || nextActionMessages.length > 0) { - result.push({ - currentState: { - messages: [...currentStateMessages], - }, - nextAction: - nextActionMessages.length > 0 - ? { - messages: [...nextActionMessages], - } - : undefined, - }) + // Add placeholder page if no actions yet + if (result.length === 0) { + result.push({}) } return result }, [messages]) - // Auto-advance to latest page + // Page index + user navigation guard (don't auto-jump while exploring history) const [currentPageIndex, setCurrentPageIndex] = useState(0) + const hasUserNavigatedRef = useRef(false) + const didInitIndexRef = useRef(false) + const prevPagesLengthRef = useRef(0) + useEffect(() => { - setCurrentPageIndex(pages.length - 1) - }, [pages.length]) + // Initialize to last page on mount + if (!didInitIndexRef.current && pages.length > 0) { + didInitIndexRef.current = true + setCurrentPageIndex(pages.length - 1) + prevPagesLengthRef.current = pages.length + return + } + + // Auto-advance if user is on the most recent step and a new step arrives + if (pages.length > prevPagesLengthRef.current) { + const wasOnLastPage = currentPageIndex === prevPagesLengthRef.current - 1 + if (wasOnLastPage && !hasUserNavigatedRef.current) { + // User was on the most recent step, auto-advance to the new step + setCurrentPageIndex(pages.length - 1) + } + prevPagesLengthRef.current = pages.length + } + }, [pages.length, currentPageIndex]) + + // External navigation request (from 
panel host) + // Only navigate when navigateToPageIndex actually changes, not when pages.length changes + const prevNavigateToPageIndexRef = useRef() + useEffect(() => { + if ( + typeof props.navigateToPageIndex === "number" && + props.navigateToPageIndex !== prevNavigateToPageIndexRef.current && + pages.length > 0 + ) { + const idx = Math.max(0, Math.min(pages.length - 1, props.navigateToPageIndex)) + setCurrentPageIndex(idx) + // Only reset manual navigation guard if navigating to the last page + // This allows auto-advance to work when clicking to the most recent step + // but prevents unwanted auto-advance when viewing historical steps + if (idx === pages.length - 1) { + hasUserNavigatedRef.current = false + } + prevNavigateToPageIndexRef.current = props.navigateToPageIndex + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [props.navigateToPageIndex]) // Get initial URL from launch message const initialUrl = useMemo(() => { @@ -152,240 +352,790 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { return launchMessage?.text || "" }, [messages]) - // Find the latest available URL and screenshot - const latestState = useMemo(() => { + const currentPage = pages[currentPageIndex] + + // Use actual viewport dimensions from result if available, otherwise fall back to settings + + // Find the last available screenshot and its associated data to use as placeholders + const lastPageWithScreenshot = useMemo(() => { for (let i = pages.length - 1; i >= 0; i--) { - const page = pages[i] - if (page.currentState.url || page.currentState.screenshot) { - return { - url: page.currentState.url, - mousePosition: page.currentState.mousePosition, - consoleLogs: page.currentState.consoleLogs, - screenshot: page.currentState.screenshot, - } + if (pages[i].screenshot) { + return pages[i] } } - return { url: undefined, mousePosition: undefined, consoleLogs: undefined, screenshot: undefined } + return undefined }, [pages]) - const currentPage = 
pages[currentPageIndex] - const isLastPage = currentPageIndex === pages.length - 1 - - // Use latest state if we're on the last page and don't have a state yet - const displayState = isLastPage - ? { - url: currentPage?.currentState.url || latestState.url || initialUrl, - mousePosition: - currentPage?.currentState.mousePosition || latestState.mousePosition || defaultMousePosition, - consoleLogs: currentPage?.currentState.consoleLogs, - screenshot: currentPage?.currentState.screenshot || latestState.screenshot, - } - : { - url: currentPage?.currentState.url || initialUrl, - mousePosition: currentPage?.currentState.mousePosition || defaultMousePosition, - consoleLogs: currentPage?.currentState.consoleLogs, - screenshot: currentPage?.currentState.screenshot, + // Find last mouse position up to current page (not from future pages) + const lastPageWithMousePositionUpToCurrent = useMemo(() => { + for (let i = currentPageIndex; i >= 0; i--) { + if (pages[i].mousePosition) { + return pages[i] } + } + return undefined + }, [pages, currentPageIndex]) - const [actionContent, { height: actionHeight }] = useSize( -
- {currentPage?.nextAction?.messages.map((message) => ( - - ))} - {!isBrowsing && messages.some((m) => m.say === "browser_action_result") && currentPageIndex === 0 && ( - - )} -
, - ) + // Display state from current page, with smart fallbacks + const displayState = { + url: currentPage?.url || initialUrl, + mousePosition: + currentPage?.mousePosition || lastPageWithMousePositionUpToCurrent?.mousePosition || defaultMousePosition, + consoleLogs: currentPage?.consoleLogs, + screenshot: currentPage?.screenshot || lastPageWithScreenshot?.screenshot, + } - useEffect(() => { - if (actionHeight === 0 || actionHeight === Infinity) { - return + // Parse logs for counts and filtering + const parsedLogs = useMemo(() => { + const counts = { debug: 0, info: 0, warn: 0, error: 0, log: 0 } + const byType: Record<"debug" | "info" | "warn" | "error" | "log", string[]> = { + debug: [], + info: [], + warn: [], + error: [], + log: [], } - if (actionHeight > maxActionHeight) { - setMaxActionHeight(actionHeight) + const raw = displayState.consoleLogs || "" + raw.split(/\r?\n/).forEach((line) => { + const trimmed = line.trim() + if (!trimmed) return + const m = /^\[([^\]]+)\]\s*/i.exec(trimmed) + let type = (m?.[1] || "").toLowerCase() + if (type === "warning") type = "warn" + if (!["debug", "info", "warn", "error", "log"].includes(type)) type = "log" + counts[type as keyof typeof counts]++ + byType[type as keyof typeof byType].push(line) + }) + return { counts, byType } + }, [displayState.consoleLogs]) + + const logsToShow = useMemo(() => { + if (!displayState.consoleLogs) return t("chat:browser.noNewLogs") as string + if (logFilter === "all") return displayState.consoleLogs + const arr = parsedLogs.byType[logFilter] + return arr.length ? 
arr.join("\n") : (t("chat:browser.noNewLogs") as string) + }, [displayState.consoleLogs, logFilter, parsedLogs, t]) + + // Meta for log badges (include "All" first) + const logTypeMeta = [ + { key: "all", label: "All" }, + { key: "debug", label: "Debug" }, + { key: "info", label: "Info" }, + { key: "warn", label: "Warn" }, + { key: "error", label: "Error" }, + { key: "log", label: "Log" }, + ] as const + + // Use a fixed standard aspect ratio and dimensions for the drawer to prevent flickering + // Even if viewport changes, the drawer maintains consistent size + const fixedDrawerWidth = 900 + const fixedDrawerHeight = 600 + const drawerAspectRatio = (fixedDrawerHeight / fixedDrawerWidth) * 100 + + // For cursor positioning, use the viewport dimensions from the same page as the data we're displaying + // This ensures cursor position matches the screenshot/mouse position being shown + let cursorViewportWidth: number + let cursorViewportHeight: number + + if (currentPage?.screenshot) { + // Current page has screenshot - use its dimensions + cursorViewportWidth = currentPage.viewportWidth ?? viewportWidth + cursorViewportHeight = currentPage.viewportHeight ?? viewportHeight + } else if (lastPageWithScreenshot) { + // Using placeholder screenshot - use dimensions from that page + cursorViewportWidth = lastPageWithScreenshot.viewportWidth ?? viewportWidth + cursorViewportHeight = lastPageWithScreenshot.viewportHeight ?? 
viewportHeight + } else { + // No screenshot available - use default settings + cursorViewportWidth = viewportWidth + cursorViewportHeight = viewportHeight + } + + // Get browser action for current page (now stored in pages array) + const currentPageAction = useMemo(() => { + return pages[currentPageIndex]?.action + }, [pages, currentPageIndex]) + + // Latest non-close browser_action for header summary (fallback) + + // Determine if the overall browser session is still active (spins until 'close') + const lastBrowserActionOverall = useMemo(() => { + const all = messages.filter((m) => m.say === "browser_action") + return all.at(-1) + }, [messages]) + + // Use actual Playwright session state from extension (not message parsing) + const isBrowserSessionOpen = isBrowserSessionActive + + // Check if currently performing a browser action (for spinner) + const _isSessionActive = useMemo(() => { + // Only show active spinner if a session has started + const started = messages.some((m) => m.say === "browser_action_result") + if (!started) return false + // If the last API request got interrupted/cancelled, treat session as inactive + if (isLastApiReqInterrupted) return false + if (!lastBrowserActionOverall) return true + try { + const act = JSON.parse(lastBrowserActionOverall.text || "{}") as ClineSayBrowserAction + return act.action !== "close" + } catch { + return true } - }, [actionHeight, maxActionHeight]) + }, [messages, lastBrowserActionOverall, isLastApiReqInterrupted]) - // Track latest click coordinate - const latestClickPosition = useMemo(() => { - if (!isBrowsing) return undefined + // Browser session drawer never auto-expands - user must manually toggle it - // Look through current page's next actions for the latest browser_action - const actions = currentPage?.nextAction?.messages || [] - for (let i = actions.length - 1; i >= 0; i--) { - const message = actions[i] - if (message.say === "browser_action") { - const browserAction = JSON.parse(message.text || "{}") 
as ClineSayBrowserAction - if (browserAction.action === "click" && browserAction.coordinate) { - return browserAction.coordinate + // Calculate total API cost for the browser session + const totalApiCost = useMemo(() => { + let total = 0 + messages.forEach((message) => { + if (message.say === "api_req_started" && message.text) { + try { + const data = JSON.parse(message.text) + if (data.cost && typeof data.cost === "number") { + total += data.cost + } + } catch { + // Ignore parsing errors } } + }) + return total + }, [messages]) + + // Local size tracking without react-use to avoid timers after unmount in tests + const containerRef = useRef(null) + const [rowHeight, setRowHeight] = useState(0) + useEffect(() => { + const el = containerRef.current + if (!el) return + let mounted = true + const setH = (h: number) => { + if (mounted) setRowHeight(h) } - return undefined - }, [isBrowsing, currentPage?.nextAction?.messages]) - - // Use latest click position while browsing, otherwise use display state - const mousePosition = isBrowsing - ? latestClickPosition || displayState.mousePosition - : displayState.mousePosition || defaultMousePosition - - const [browserSessionRow, { height: rowHeight }] = useSize( -
-
- {isBrowsing ? : } - - <>{t("chat:browser.rooWantsToUse")} - -
+ const ro = + typeof window !== "undefined" && "ResizeObserver" in window + ? new ResizeObserver((entries) => { + const entry = entries[0] + setH(entry?.contentRect?.height ?? el.getBoundingClientRect().height) + }) + : null + // initial + setH(el.getBoundingClientRect().height) + if (ro) ro.observe(el) + return () => { + mounted = false + if (ro) ro.disconnect() + } + }, []) + + const browserSessionRow = ( +
+ {/* Main header - clickable to expand/collapse, mimics TodoList style */}
- {/* URL Bar */} -
+ setNextActionsExpanded((v) => { + const nv = !v + onExpandChange?.(nv) + return nv + }), + })} + /> + + {/* Simple text: "Browser Session" with step counter */} + + setNextActionsExpanded((v) => { + const nv = !v + onExpandChange?.(nv) + return nv + }), + })} style={{ - margin: "0px auto", - width: "calc(100%)", - boxSizing: "border-box", // includes padding in width calculation - borderRadius: "4px 4px 0 0", - padding: "5px", + flex: 1, + fontSize: 13, + fontWeight: 500, + lineHeight: "22px", + color: "var(--vscode-editor-foreground)", + cursor: fullScreen ? "default" : "pointer", display: "flex", alignItems: "center", - justifyContent: "center", - color: "var(--vscode-descriptionForeground)", - fontSize: "12px", + gap: 8, }}> + {t("chat:browser.session")} + {pages.length > 0 && ( + + {currentPageIndex + 1}/{pages.length} + + )} + {/* Inline action summary to the right, similar to ChatView */} + + {(() => { + const action = currentPageAction + const pageSize = pages[currentPageIndex]?.size + const pageViewportWidth = pages[currentPageIndex]?.viewportWidth + const pageViewportHeight = pages[currentPageIndex]?.viewportHeight + if (action) { + return ( + <> + {getActionIcon(action.action)} + + {getBrowserActionText( + action.action, + action.coordinate, + action.text, + pageSize, + pageViewportWidth, + pageViewportHeight, + )} + + + ) + } else if (initialUrl) { + return ( + <> + {getActionIcon("launch" as any)} + {getBrowserActionText("launch", undefined, initialUrl, undefined)} + + ) + } + return null + })()} + + + + {/* Right side: cost badge and chevron */} + {totalApiCost > 0 && (
- - {displayState.url || "http"} + ${totalApiCost.toFixed(4)}
-
+ )} + + {/* Chevron toggle hidden in fullScreen */} + {!fullScreen && ( + + setNextActionsExpanded((v) => { + const nv = !v + onExpandChange?.(nv) + return nv + }) + } + className={`codicon ${nextActionsExpanded ? "codicon-chevron-up" : "codicon-chevron-down"}`} + style={{ + fontSize: 13, + fontWeight: 500, + lineHeight: "22px", + color: "var(--vscode-editor-foreground)", + cursor: "pointer", + display: "inline-block", + transition: "transform 150ms ease", + }} + /> + )} + + {/* Kill browser button hidden from header in fullScreen; kept in toolbar */} + {isBrowserSessionOpen && !fullScreen && ( + + + + )} +
- {/* Screenshot Area */} + {/* Expanded drawer content - inline/fullscreen */} + {nextActionsExpanded && (
- {displayState.screenshot ? ( - {t("chat:browser.screenshot")} - vscode.postMessage({ - type: "openImage", - text: displayState.screenshot, - }) - } - /> - ) : ( + {/* Browser-like Toolbar */} +
+ {/* Go to beginning */} + + + + + {/* Back */} + + + + + {/* Forward */} + + + + + {/* Go to end */} + + + + + {/* Address Bar */}
+ + style={{ + fontSize: 12, + lineHeight: "18px", + textOverflow: "ellipsis", + overflow: "hidden", + whiteSpace: "nowrap", + color: "var(--vscode-foreground)", + }}> + {displayState.url || "about:blank"} + + {/* Step counter removed */}
- )} - {displayState.mousePosition && ( - - )} -
- {/* Console Logs Accordion */} -
{ - setConsoleLogsExpanded(!consoleLogsExpanded) - }} - className="flex items-center justify-between gap-2 text-vscode-editor-foreground/50 hover:text-vscode-editor-foreground transition-colors" - style={{ - width: "100%", - cursor: "pointer", - padding: `9px 10px ${consoleLogsExpanded ? 0 : 8}px 10px`, - }}> - - {t("chat:browser.consoleLogs")} - -
- {consoleLogsExpanded && ( - - )} -
+ {/* Kill (Disconnect) replaces Reload */} + + + - {/* Action content with min height */} -
{actionContent}
+ {/* Open External */} + + + - {/* Pagination moved to bottom */} - {pages.length > 1 && ( -
-
- {t("chat:browser.navigation.step", { current: currentPageIndex + 1, total: pages.length })} + {/* Copy URL */} + + +
-
- - + {/* Screenshot Area */} +
+ {displayState.screenshot ? ( + {t("chat:browser.screenshot")} + vscode.postMessage({ + type: "openImage", + text: displayState.screenshot, + }) + } + /> + ) : ( +
+ +
+ )} + {displayState.mousePosition && + (() => { + // Use measured size if available; otherwise fall back to current client size so cursor remains visible + const containerW = sW || (screenshotRef.current?.clientWidth ?? 0) + const containerH = sH || (screenshotRef.current?.clientHeight ?? 0) + if (containerW <= 0 || containerH <= 0) { + // Minimal fallback to keep cursor visible before first measurement + return ( + + ) + } + + // Compute displayed image box within the container for object-fit: contain; objectPosition: top center + const imgAspect = cursorViewportWidth / cursorViewportHeight + const containerAspect = containerW / containerH + let displayW = containerW + let displayH = containerH + let offsetX = 0 + let offsetY = 0 + if (containerAspect > imgAspect) { + // Full height, letterboxed left/right; top aligned + displayH = containerH + displayW = containerH * imgAspect + offsetX = (containerW - displayW) / 2 + offsetY = 0 + } else { + // Full width, potential space below; top aligned + displayW = containerW + displayH = containerW / imgAspect + offsetX = 0 + offsetY = 0 + } + + // Parse "x,y" or "x,y@widthxheight" for original basis + const m = /^\s*(\d+)\s*,\s*(\d+)(?:\s*@\s*(\d+)\s*[x,]\s*(\d+))?\s*$/.exec( + displayState.mousePosition || "", + ) + const mx = parseInt(m?.[1] || "0", 10) + const my = parseInt(m?.[2] || "0", 10) + const baseW = m?.[3] ? parseInt(m[3], 10) : cursorViewportWidth + const baseH = m?.[4] ? parseInt(m[4], 10) : cursorViewportHeight + + const leftPx = offsetX + (baseW > 0 ? (mx / baseW) * displayW : 0) + const topPx = offsetY + (baseH > 0 ? (my / baseH) * displayH : 0) + + return ( + + ) + })()} +
+ + {/* Browser Action summary moved inline to header; row removed */} + + {/* Console Logs Section (collapsible, default collapsed) */} +
+
{ + e.stopPropagation() + setConsoleLogsExpanded((v) => !v) + }} + className="text-vscode-editor-foreground/70 hover:text-vscode-editor-foreground transition-colors" + style={{ + display: "flex", + alignItems: "center", + gap: "8px", + marginBottom: consoleLogsExpanded ? "6px" : 0, + cursor: "pointer", + }}> + + + {t("chat:browser.consoleLogs")} + + + {/* Log type indicators */} +
e.stopPropagation()} + style={{ display: "flex", alignItems: "center", gap: 6, marginLeft: "auto" }}> + {logTypeMeta.map(({ key, label }) => { + const isAll = key === "all" + const count = isAll + ? (Object.values(parsedLogs.counts) as number[]).reduce((a, b) => a + b, 0) + : parsedLogs.counts[key as "debug" | "info" | "warn" | "error" | "log"] + const isActive = logFilter === (key as any) + const disabled = count === 0 + return ( + + ) + })} + setConsoleLogsExpanded((v) => !v)} + className={`codicon codicon-chevron-${consoleLogsExpanded ? "down" : "right"}`} + style={{ marginLeft: 6 }} + /> +
+
+ {consoleLogsExpanded && ( +
+ +
+ )}
)} -
, +
) // Height change effect @@ -402,150 +1152,6 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { return browserSessionRow }, deepEqual) -interface BrowserSessionRowContentProps extends Omit { - message: ClineMessage - setMaxActionHeight: (height: number) => void - isStreaming: boolean -} - -const BrowserSessionRowContent = ({ - message, - isExpanded, - onToggleExpand, - lastModifiedMessage, - isLast, - setMaxActionHeight, - isStreaming, -}: BrowserSessionRowContentProps) => { - const { t } = useTranslation() - const headerStyle: React.CSSProperties = { - display: "flex", - alignItems: "center", - gap: "10px", - marginBottom: "10px", - wordBreak: "break-word", - } - - switch (message.type) { - case "say": - switch (message.say) { - case "api_req_started": - case "text": - return ( -
- { - if (message.say === "api_req_started") { - setMaxActionHeight(0) - } - onToggleExpand(message.ts) - }} - lastModifiedMessage={lastModifiedMessage} - isLast={isLast} - isStreaming={isStreaming} - /> -
- ) - - case "browser_action": - const browserAction = JSON.parse(message.text || "{}") as ClineSayBrowserAction - return ( - - ) - - default: - return null - } - - case "ask": - switch (message.ask) { - case "browser_action_launch": - return ( - <> -
- {t("chat:browser.sessionStarted")} -
-
- -
- - ) - - default: - return null - } - } -} - -const BrowserActionBox = ({ - action, - coordinate, - text, -}: { - action: BrowserAction - coordinate?: string - text?: string -}) => { - const { t } = useTranslation() - const getBrowserActionText = (action: BrowserAction, coordinate?: string, text?: string) => { - switch (action) { - case "launch": - return t("chat:browser.actions.launch", { url: text }) - case "click": - return t("chat:browser.actions.click", { coordinate: coordinate?.replace(",", ", ") }) - case "type": - return t("chat:browser.actions.type", { text }) - case "scroll_down": - return t("chat:browser.actions.scrollDown") - case "scroll_up": - return t("chat:browser.actions.scrollUp") - case "close": - return t("chat:browser.actions.close") - default: - return action - } - } - return ( -
-
-
- - {t("chat:browser.actions.title")} - {getBrowserActionText(action, coordinate, text)} - -
-
-
- ) -} - const BrowserCursor: React.FC<{ style?: React.CSSProperties }> = ({ style }) => { const { t } = useTranslation() // (can't use svgs in vsc extensions) diff --git a/webview-ui/src/components/chat/BrowserSessionStatusRow.tsx b/webview-ui/src/components/chat/BrowserSessionStatusRow.tsx new file mode 100644 index 00000000000..862dc80a62f --- /dev/null +++ b/webview-ui/src/components/chat/BrowserSessionStatusRow.tsx @@ -0,0 +1,34 @@ +import { memo } from "react" +import { Globe } from "lucide-react" +import { ClineMessage } from "@roo-code/types" + +interface BrowserSessionStatusRowProps { + message: ClineMessage +} + +const BrowserSessionStatusRow = memo(({ message }: BrowserSessionStatusRowProps) => { + const isOpened = message.text?.includes("opened") + + return ( +
+ + + {message.text} + +
+ ) +}) + +BrowserSessionStatusRow.displayName = "BrowserSessionStatusRow" + +export default BrowserSessionStatusRow diff --git a/webview-ui/src/components/chat/ChatRow.tsx b/webview-ui/src/components/chat/ChatRow.tsx index aaaaaae09d7..f2b1dc08728 100644 --- a/webview-ui/src/components/chat/ChatRow.tsx +++ b/webview-ui/src/components/chat/ChatRow.tsx @@ -160,6 +160,7 @@ export const ChatRowContent = ({ onSuggestionClick, onFollowUpUnmount, onBatchFileResponse, + editable, isFollowUpAnswered, }: ChatRowContentProps) => { const { t } = useTranslation() @@ -536,11 +537,24 @@ export const ChatRowContent = ({ } case "updateTodoList" as any: { const todos = (tool as any).todos || [] - // Get previous todos from the latest todos in the task context const previousTodos = getPreviousTodos(clineMessages, message.ts) - return + return ( + <> + + { + if (typeof vscode !== "undefined" && vscode?.postMessage) { + vscode.postMessage({ type: "updateTodoList", payload: { todos: updatedTodos } }) + } + }} + editable={!!(editable && isLast)} + /> + + ) } case "newFileCreated": return ( @@ -1381,6 +1395,10 @@ export const ChatRowContent = ({
) + case "browser_action": + case "browser_action_result": + // Handled by BrowserSessionRow; prevent raw JSON (action/result) from rendering here + return null default: return ( <> diff --git a/webview-ui/src/components/chat/ChatTextArea.tsx b/webview-ui/src/components/chat/ChatTextArea.tsx index 0b8c89388c9..c9e017a3169 100644 --- a/webview-ui/src/components/chat/ChatTextArea.tsx +++ b/webview-ui/src/components/chat/ChatTextArea.tsx @@ -1,7 +1,7 @@ import React, { forwardRef, useCallback, useEffect, useLayoutEffect, useMemo, useRef, useState } from "react" import { useEvent } from "react-use" import DynamicTextArea from "react-textarea-autosize" -import { VolumeX, Image, WandSparkles, SendHorizontal, MessageSquareX } from "lucide-react" +import { VolumeX, Image, WandSparkles, SendHorizontal, MessageSquareX, Globe } from "lucide-react" import { mentionRegex, mentionRegexGlobal, commandRegexGlobal, unescapeSpaces } from "@roo/context-mentions" import { WebviewMessage } from "@roo/WebviewMessage" @@ -21,7 +21,7 @@ import { } from "@src/utils/context-mentions" import { cn } from "@src/lib/utils" import { convertToMentionPath } from "@src/utils/path-mentions" -import { StandardTooltip } from "@src/components/ui" +import { StandardTooltip, Button } from "@src/components/ui" import Thumbnails from "../common/Thumbnails" import { ModeSelector } from "./ModeSelector" @@ -51,6 +51,9 @@ interface ChatTextAreaProps { // Edit mode props isEditMode?: boolean onCancel?: () => void + // Browser session status + isBrowserSessionActive?: boolean + showBrowserDockToggle?: boolean } export const ChatTextArea = forwardRef( @@ -71,6 +74,8 @@ export const ChatTextArea = forwardRef( modeShortcutText, isEditMode = false, onCancel, + isBrowserSessionActive = false, + showBrowserDockToggle = false, }, ref, ) => { @@ -1236,7 +1241,7 @@ export const ChatTextArea = forwardRef(
{isTtsPlaying && ( @@ -1259,6 +1264,30 @@ export const ChatTextArea = forwardRef( )} + {!isEditMode && showBrowserDockToggle && ( + + + + )} {!isEditMode ? : null} {!isEditMode && cloudUserInfo && }
diff --git a/webview-ui/src/components/chat/ChatView.tsx b/webview-ui/src/components/chat/ChatView.tsx index 9adf603ee4b..dbec6979f07 100644 --- a/webview-ui/src/components/chat/ChatView.tsx +++ b/webview-ui/src/components/chat/ChatView.tsx @@ -1,5 +1,5 @@ import React, { forwardRef, useCallback, useEffect, useImperativeHandle, useMemo, useRef, useState } from "react" -import { useDeepCompareEffect, useEvent, useMount } from "react-use" +import { useDeepCompareEffect, useEvent } from "react-use" import debounce from "debounce" import { Virtuoso, type VirtuosoHandle } from "react-virtuoso" import removeMd from "remove-markdown" @@ -13,7 +13,8 @@ import { appendImages } from "@src/utils/imageUtils" import type { ClineAsk, ClineMessage } from "@roo-code/types" -import { ClineSayBrowserAction, ClineSayTool, ExtensionMessage } from "@roo/ExtensionMessage" +import { ClineSayTool, ExtensionMessage } from "@roo/ExtensionMessage" +import { McpServer, McpTool } from "@roo/mcp" import { findLast } from "@roo/array" import { SuggestionItem } from "@roo-code/types" import { combineApiRequests } from "@roo/combineApiRequests" @@ -38,6 +39,8 @@ import VersionIndicator from "../common/VersionIndicator" import HistoryPreview from "../history/HistoryPreview" import Announcement from "./Announcement" import BrowserSessionRow from "./BrowserSessionRow" +import BrowserActionRow from "./BrowserActionRow" +import BrowserSessionStatusRow from "./BrowserSessionStatusRow" import ChatRow from "./ChatRow" import { ChatTextArea } from "./ChatTextArea" import TaskHeader from "./TaskHeader" @@ -95,6 +98,7 @@ const ChatViewComponent: React.ForwardRefRenderFunction textAreaRef.current?.focus()) - const visibleMessages = useMemo(() => { // Pre-compute checkpoint hashes that have associated user messages for O(1) lookup const userMessageCheckpointHashes = new Set() @@ -965,97 +966,58 @@ const ChatViewComponent: React.ForwardRefRenderFunction { - // Which of visible messages are browser session 
messages, see above. - if (message.type === "ask") { - return ["browser_action_launch"].includes(message.ask!) - } - - if (message.type === "say") { - return ["api_req_started", "text", "browser_action", "browser_action_result"].includes(message.say!) - } - - return false - } - - const groupedMessages = useMemo(() => { - const result: (ClineMessage | ClineMessage[])[] = [] - let currentGroup: ClineMessage[] = [] - let isInBrowserSession = false - - const endBrowserSession = () => { - if (currentGroup.length > 0) { - result.push([...currentGroup]) - currentGroup = [] - isInBrowserSession = false + // Compute current browser session messages for the top banner (not grouped into chat stream) + // Find the FIRST browser session from the beginning to show ALL sessions + const browserSessionStartIndex = useMemo(() => { + for (let i = 0; i < messages.length; i++) { + if (messages[i].ask === "browser_action_launch") { + return i + } + // Also check for browser_session_status as a fallback indicator + if (messages[i].say === "browser_session_status" && messages[i].text?.includes("opened")) { + return i } } + return -1 + }, [messages]) - visibleMessages.forEach((message: ClineMessage) => { - if (message.ask === "browser_action_launch") { - // Complete existing browser session if any. - endBrowserSession() - // Start new. - isInBrowserSession = true - currentGroup.push(message) - } else if (isInBrowserSession) { - // End session if `api_req_started` is cancelled. - - if (message.say === "api_req_started") { - // Get last `api_req_started` in currentGroup to check if - // it's cancelled. If it is then this api req is not part - // of the current browser session. 
- const lastApiReqStarted = [...currentGroup].reverse().find((m) => m.say === "api_req_started") - - if (lastApiReqStarted?.text !== null && lastApiReqStarted?.text !== undefined) { - const info = JSON.parse(lastApiReqStarted.text) - const isCancelled = info.cancelReason !== null && info.cancelReason !== undefined - - if (isCancelled) { - endBrowserSession() - result.push(message) - return - } - } - } - - if (isBrowserSessionMessage(message)) { - currentGroup.push(message) + const _browserSessionMessages = useMemo(() => { + if (browserSessionStartIndex === -1) return [] + return messages.slice(browserSessionStartIndex) + }, [browserSessionStartIndex, messages]) - // Check if this is a close action - if (message.say === "browser_action") { - const browserAction = JSON.parse(message.text || "{}") as ClineSayBrowserAction - if (browserAction.action === "close") { - endBrowserSession() - } - } - } else { - // complete existing browser session if any - endBrowserSession() - result.push(message) - } - } else { - result.push(message) - } - }) + // Show globe toggle only when in a task that has a browser session (active or inactive) + const showBrowserDockToggle = useMemo( + () => Boolean(task && (browserSessionStartIndex !== -1 || isBrowserSessionActive)), + [task, browserSessionStartIndex, isBrowserSessionActive], + ) - // Handle case where browser session is the last group - if (currentGroup.length > 0) { - result.push([...currentGroup]) + const isBrowserSessionMessage = useCallback((message: ClineMessage): boolean => { + // Only the launch ask should be hidden from chat (it's shown in the drawer header) + if (message.type === "ask" && message.ask === "browser_action_launch") { + return true } + // browser_action_result messages are paired with browser_action and should not appear independently + if (message.type === "say" && message.say === "browser_action_result") { + return true + } + return false + }, []) + + const groupedMessages = useMemo(() => { + // Only filter 
out the launch ask and result messages - browser actions appear in chat + const result: ClineMessage[] = visibleMessages.filter((msg) => !isBrowserSessionMessage(msg)) if (isCondensing) { - // Show indicator after clicking condense button result.push({ type: "say", say: "condense_context", ts: Date.now(), partial: true, - }) + } as any) } - return result - }, [isCondensing, visibleMessages]) + }, [isCondensing, visibleMessages, isBrowserSessionMessage]) // scrolling @@ -1212,7 +1174,7 @@ const ChatViewComponent: React.ForwardRefRenderFunction { - // browser session group + // browser session group - this should never be called now since we don't group messages if (Array.isArray(messageOrGroup)) { return ( ) } + const hasCheckpoint = modifiedMessages.some((message) => message.say === "checkpoint_saved") + // Check if this is a browser action message + if (messageOrGroup.type === "say" && messageOrGroup.say === "browser_action") { + // Find the corresponding result message by looking for the next browser_action_result after this action's timestamp + const nextMessage = modifiedMessages.find( + (m) => m.ts > messageOrGroup.ts && m.say === "browser_action_result", + ) + + // Calculate action index and total count + const browserActions = modifiedMessages.filter((m) => m.say === "browser_action") + const actionIndex = browserActions.findIndex((m) => m.ts === messageOrGroup.ts) + 1 + const totalActions = browserActions.length + + return ( + + ) + } + + // Check if this is a browser session status message + if (messageOrGroup.type === "say" && messageOrGroup.say === "browser_session_status") { + return + } + // regular message return ( -
- { - setIsAtBottom(isAtBottom) - if (isAtBottom) { - disableAutoScrollRef.current = false - } - setShowScrollToBottom(disableAutoScrollRef.current && !isAtBottom) - }} - atBottomThreshold={10} - initialTopMostItemIndex={groupedMessages.length - 1} - /> +
+
+ { + setIsAtBottom(isAtBottom) + if (isAtBottom) { + disableAutoScrollRef.current = false + } + setShowScrollToBottom(disableAutoScrollRef.current && !isAtBottom) + }} + atBottomThreshold={10} + initialTopMostItemIndex={groupedMessages.length - 1} + /> +
{areButtonsVisible && (
{isProfileDisabled && ( diff --git a/webview-ui/src/components/chat/__tests__/BrowserSessionRow.aspect-ratio.spec.tsx b/webview-ui/src/components/chat/__tests__/BrowserSessionRow.aspect-ratio.spec.tsx new file mode 100644 index 00000000000..87465862032 --- /dev/null +++ b/webview-ui/src/components/chat/__tests__/BrowserSessionRow.aspect-ratio.spec.tsx @@ -0,0 +1,55 @@ +import { render, screen, fireEvent } from "@testing-library/react" +import React from "react" +import BrowserSessionRow from "../BrowserSessionRow" +import { ExtensionStateContext } from "@src/context/ExtensionStateContext" +import { TooltipProvider } from "@src/components/ui/tooltip" + +describe("BrowserSessionRow - screenshot area", () => { + const renderRow = (messages: any[]) => { + const mockExtState: any = { + // Ensure known viewport so expected aspect ratio is deterministic (600/900 = 66.67%) + browserViewportSize: "900x600", + isBrowserSessionActive: false, + } + + return render( + + + true} + onToggleExpand={() => {}} + lastModifiedMessage={undefined as any} + isLast={true} + onHeightChange={() => {}} + isStreaming={false} + /> + + , + ) + } + + it("reserves height while screenshot is loading (no layout collapse)", () => { + // Only a launch action, no corresponding browser_action_result yet (no screenshot) + const messages = [ + { + ts: 1, + say: "browser_action", + text: JSON.stringify({ action: "launch", url: "http://localhost:3000" }), + }, + ] + + renderRow(messages) + + // Open the browser session drawer + const globe = screen.getByLabelText("Browser interaction") + fireEvent.click(globe) + + const container = screen.getByTestId("screenshot-container") as HTMLDivElement + // padding-bottom should reflect aspect ratio (600/900 * 100) even without an image + const pb = parseFloat(container.style.paddingBottom || "0") + expect(pb).toBeGreaterThan(0) + // Be tolerant of rounding + expect(Math.round(pb)).toBe(67) + }) +}) diff --git 
a/webview-ui/src/components/chat/__tests__/BrowserSessionRow.disconnect-button.spec.tsx b/webview-ui/src/components/chat/__tests__/BrowserSessionRow.disconnect-button.spec.tsx new file mode 100644 index 00000000000..0c2b4762c4e --- /dev/null +++ b/webview-ui/src/components/chat/__tests__/BrowserSessionRow.disconnect-button.spec.tsx @@ -0,0 +1,42 @@ +import React from "react" +import { render, screen } from "@testing-library/react" +import BrowserSessionRow from "../BrowserSessionRow" +import { ExtensionStateContext } from "@src/context/ExtensionStateContext" +import { TooltipProvider } from "@radix-ui/react-tooltip" + +describe("BrowserSessionRow - Disconnect session button", () => { + const renderRow = (isActive: boolean) => { + const mockExtState: any = { + browserViewportSize: "900x600", + isBrowserSessionActive: isActive, + } + + return render( + + + false} + onToggleExpand={() => {}} + lastModifiedMessage={undefined as any} + isLast={true} + onHeightChange={() => {}} + isStreaming={false} + /> + + , + ) + } + + it("shows the Disconnect session button when a session is active", () => { + renderRow(true) + const btn = screen.getByLabelText("Disconnect session") + expect(btn).toBeInTheDocument() + }) + + it("does not render the button when no session is active", () => { + renderRow(false) + const btn = screen.queryByLabelText("Disconnect session") + expect(btn).toBeNull() + }) +}) diff --git a/webview-ui/src/components/chat/__tests__/ChatView.followup-in-session.spec.tsx b/webview-ui/src/components/chat/__tests__/ChatView.followup-in-session.spec.tsx new file mode 100644 index 00000000000..e870e8df3c2 --- /dev/null +++ b/webview-ui/src/components/chat/__tests__/ChatView.followup-in-session.spec.tsx @@ -0,0 +1,119 @@ +// npx vitest run src/components/chat/__tests__/ChatView.followup-in-session.spec.tsx + +import { render, waitFor } from "@/utils/test-utils" +import { QueryClient, QueryClientProvider } from "@tanstack/react-query" +import { 
ExtensionStateContextProvider } from "@src/context/ExtensionStateContext" +import ChatView, { ChatViewProps } from "../ChatView" + +vi.mock("@src/utils/vscode", () => ({ + vscode: { postMessage: vi.fn() }, +})) + +vi.mock("rehype-highlight", () => ({ default: () => () => {} })) +vi.mock("hast-util-to-text", () => ({ default: () => "" })) + +vi.mock("../BrowserSessionRow", () => ({ + default: function MockBrowserSessionRow({ messages }: { messages: any[] }) { + return
{JSON.stringify(messages)}
+ }, +})) + +vi.mock("../ChatRow", () => ({ + default: function MockChatRow({ message }: { message: any }) { + return
{JSON.stringify(message)}
+ }, +})) + +vi.mock("../TaskHeader", () => ({ + default: function MockTaskHeader() { + return
+ }, +})) + +vi.mock("@src/components/common/CodeBlock", () => ({ + default: () => null, + CODE_BLOCK_BG_COLOR: "rgb(30, 30, 30)", +})) + +const queryClient = new QueryClient() + +const defaultProps: ChatViewProps = { + isHidden: false, + showAnnouncement: false, + hideAnnouncement: () => {}, +} + +const renderChatView = (props: Partial = {}) => { + return render( + + + + + , + ) +} + +const mockPostMessage = (state: any) => { + window.postMessage( + { + type: "state", + state: { + version: "1.0.0", + clineMessages: [], + taskHistory: [], + shouldShowAnnouncement: false, + allowedCommands: [], + autoApprovalEnabled: true, + ...state, + }, + }, + "*", + ) +} + +describe("ChatView followup inside browser session", () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + it.skip("renders followup ask as a regular ChatRow while session banner is visible", async () => { + renderChatView() + + const ts = Date.now() + + // Send initial message with browser session and followup + mockPostMessage({ + alwaysAllowBrowser: true, + clineMessages: [ + { type: "say", say: "task", ts: ts - 4000, text: "Initial task" }, + { + type: "ask", + ask: "browser_action_launch", + ts: ts - 3000, + text: "http://example.com", + partial: false, + }, + { type: "say", say: "browser_action_result", ts: ts - 2000, text: "" }, + { + type: "ask", + ask: "followup", + ts: ts, + text: JSON.stringify({ question: "Continue?", suggest: [{ answer: "Yes" }, { answer: "No" }] }), + partial: false, + }, + ], + }) + + // Banner should be present (only contains browser_action_launch and browser_action_result) + await waitFor(() => { + const banner = document.querySelector('[data-testid="browser-session"]') + expect(banner).not.toBeNull() + }) + + // At least one ChatRow should render (the followup question) + await waitFor(() => { + const chatRows = document.querySelectorAll('[data-testid="chat-row"]') + expect(chatRows.length).toBeGreaterThan(0) + }) + }) +}) diff --git 
a/webview-ui/src/context/ExtensionStateContext.tsx b/webview-ui/src/context/ExtensionStateContext.tsx index 6443ccad93d..4bc03e259c7 100644 --- a/webview-ui/src/context/ExtensionStateContext.tsx +++ b/webview-ui/src/context/ExtensionStateContext.tsx @@ -200,6 +200,7 @@ export const ExtensionStateContextProvider: React.FC<{ children: React.ReactNode deniedCommands: [], soundEnabled: false, soundVolume: 0.5, + isBrowserSessionActive: false, ttsEnabled: false, ttsSpeed: 1.0, diffEnabled: false, diff --git a/webview-ui/src/context/__tests__/ExtensionStateContext.spec.tsx b/webview-ui/src/context/__tests__/ExtensionStateContext.spec.tsx index a0bdc51e75e..28899e342a0 100644 --- a/webview-ui/src/context/__tests__/ExtensionStateContext.spec.tsx +++ b/webview-ui/src/context/__tests__/ExtensionStateContext.spec.tsx @@ -214,6 +214,7 @@ describe("mergeExtensionState", () => { remoteControlEnabled: false, taskSyncEnabled: false, featureRoomoteControlEnabled: false, + isBrowserSessionActive: false, checkpointTimeout: DEFAULT_CHECKPOINT_TIMEOUT_SECONDS, // Add the checkpoint timeout property } diff --git a/webview-ui/src/i18n/locales/ca/chat.json b/webview-ui/src/i18n/locales/ca/chat.json index 9a7ea4d5404..25886e9649d 100644 --- a/webview-ui/src/i18n/locales/ca/chat.json +++ b/webview-ui/src/i18n/locales/ca/chat.json @@ -306,6 +306,7 @@ "socialLinks": "Uneix-te a nosaltres a X, Discord, o r/RooCode 🚀" }, "browser": { + "session": "Sessió del navegador", "rooWantsToUse": "Roo vol utilitzar el navegador", "consoleLogs": "Registres de consola", "noNewLogs": "(Cap registre nou)", @@ -322,8 +323,10 @@ "launch": "Iniciar navegador a {{url}}", "click": "Clic ({{coordinate}})", "type": "Escriure \"{{text}}\"", + "press": "Prem {{key}}", "scrollDown": "Desplaçar avall", "scrollUp": "Desplaçar amunt", + "hover": "Plana sobre ({{coordinate}})", "close": "Tancar navegador" } }, diff --git a/webview-ui/src/i18n/locales/de/chat.json b/webview-ui/src/i18n/locales/de/chat.json index 
95412e0fdb5..6d87fa600be 100644 --- a/webview-ui/src/i18n/locales/de/chat.json +++ b/webview-ui/src/i18n/locales/de/chat.json @@ -306,6 +306,7 @@ "socialLinks": "Folge uns auf X, Discord oder r/RooCode 🚀" }, "browser": { + "session": "Browser-Sitzung", "rooWantsToUse": "Roo möchte den Browser verwenden", "consoleLogs": "Konsolenprotokolle", "noNewLogs": "(Keine neuen Protokolle)", @@ -322,8 +323,10 @@ "launch": "Browser starten auf {{url}}", "click": "Klicken ({{coordinate}})", "type": "Eingeben \"{{text}}\"", + "press": "{{key}} drücken", "scrollDown": "Nach unten scrollen", "scrollUp": "Nach oben scrollen", + "hover": "Hover ({{coordinate}})", "close": "Browser schließen" } }, diff --git a/webview-ui/src/i18n/locales/en/chat.json b/webview-ui/src/i18n/locales/en/chat.json index 68ec8ebb2fe..f4a8b2b153c 100644 --- a/webview-ui/src/i18n/locales/en/chat.json +++ b/webview-ui/src/i18n/locales/en/chat.json @@ -321,6 +321,7 @@ "countdownDisplay": "{{count}}s" }, "browser": { + "session": "Browser Session", "rooWantsToUse": "Roo wants to use the browser", "consoleLogs": "Console Logs", "noNewLogs": "(No new logs)", @@ -333,12 +334,13 @@ }, "sessionStarted": "Browser Session Started", "actions": { - "title": "Browse Action: ", "launch": "Launch browser at {{url}}", "click": "Click ({{coordinate}})", "type": "Type \"{{text}}\"", + "press": "Press {{key}}", "scrollDown": "Scroll down", "scrollUp": "Scroll up", + "hover": "Hover ({{coordinate}})", "close": "Close browser" } }, diff --git a/webview-ui/src/i18n/locales/es/chat.json b/webview-ui/src/i18n/locales/es/chat.json index 7accb141f6e..8e68db10e2c 100644 --- a/webview-ui/src/i18n/locales/es/chat.json +++ b/webview-ui/src/i18n/locales/es/chat.json @@ -306,6 +306,7 @@ "socialLinks": "Únete a nosotros en X, Discord, o r/RooCode 🚀" }, "browser": { + "session": "Sesión del navegador", "rooWantsToUse": "Roo quiere usar el navegador", "consoleLogs": "Registros de la consola", "noNewLogs": "(No hay nuevos registros)", @@ 
-322,8 +323,10 @@ "launch": "Iniciar navegador en {{url}}", "click": "Clic ({{coordinate}})", "type": "Escribir \"{{text}}\"", + "press": "Pulsar {{key}}", "scrollDown": "Desplazar hacia abajo", "scrollUp": "Desplazar hacia arriba", + "hover": "Flotar ({{coordinate}})", "close": "Cerrar navegador" } }, diff --git a/webview-ui/src/i18n/locales/fr/chat.json b/webview-ui/src/i18n/locales/fr/chat.json index aab98f2ce69..46387f18523 100644 --- a/webview-ui/src/i18n/locales/fr/chat.json +++ b/webview-ui/src/i18n/locales/fr/chat.json @@ -306,6 +306,7 @@ "socialLinks": "Rejoins-nous sur X, Discord, ou r/RooCode 🚀" }, "browser": { + "session": "Session du navigateur", "rooWantsToUse": "Roo veut utiliser le navigateur", "consoleLogs": "Journaux de console", "noNewLogs": "(Pas de nouveaux journaux)", @@ -322,8 +323,10 @@ "launch": "Lancer le navigateur sur {{url}}", "click": "Cliquer ({{coordinate}})", "type": "Saisir \"{{text}}\"", + "press": "Appuyer sur {{key}}", "scrollDown": "Défiler vers le bas", "scrollUp": "Défiler vers le haut", + "hover": "Survoler ({{coordinate}})", "close": "Fermer le navigateur" } }, diff --git a/webview-ui/src/i18n/locales/hi/chat.json b/webview-ui/src/i18n/locales/hi/chat.json index 3ca7516d725..09cc68cf975 100644 --- a/webview-ui/src/i18n/locales/hi/chat.json +++ b/webview-ui/src/i18n/locales/hi/chat.json @@ -306,6 +306,7 @@ "socialLinks": "X, Discord, या r/RooCode पर हमसे जुड़ें 🚀" }, "browser": { + "session": "ब्राउज़र सत्र", "rooWantsToUse": "Roo ब्राउज़र का उपयोग करना चाहता है", "consoleLogs": "कंसोल लॉग", "noNewLogs": "(कोई नया लॉग नहीं)", @@ -322,8 +323,10 @@ "launch": "{{url}} पर ब्राउज़र लॉन्च करें", "click": "क्लिक करें ({{coordinate}})", "type": "टाइप करें \"{{text}}\"", + "press": "{{key}} दबाएँ", "scrollDown": "नीचे स्क्रॉल करें", "scrollUp": "ऊपर स्क्रॉल करें", + "hover": "होवर करें ({{coordinate}})", "close": "ब्राउज़र बंद करें" } }, diff --git a/webview-ui/src/i18n/locales/id/chat.json b/webview-ui/src/i18n/locales/id/chat.json 
index e1836e61be1..789929d2a5a 100644 --- a/webview-ui/src/i18n/locales/id/chat.json +++ b/webview-ui/src/i18n/locales/id/chat.json @@ -327,6 +327,7 @@ "countdownDisplay": "{{count}}dtk" }, "browser": { + "session": "Sesi Browser", "rooWantsToUse": "Roo ingin menggunakan browser", "consoleLogs": "Log Konsol", "noNewLogs": "(Tidak ada log baru)", @@ -343,8 +344,10 @@ "launch": "Luncurkan browser di {{url}}", "click": "Klik ({{coordinate}})", "type": "Ketik \"{{text}}\"", + "press": "Tekan {{key}}", "scrollDown": "Gulir ke bawah", "scrollUp": "Gulir ke atas", + "hover": "Arahkan ({{coordinate}})", "close": "Tutup browser" } }, diff --git a/webview-ui/src/i18n/locales/it/chat.json b/webview-ui/src/i18n/locales/it/chat.json index 8c5544c9717..b45fa19d756 100644 --- a/webview-ui/src/i18n/locales/it/chat.json +++ b/webview-ui/src/i18n/locales/it/chat.json @@ -306,6 +306,7 @@ "socialLinks": "Unisciti a noi su X, Discord, o r/RooCode 🚀" }, "browser": { + "session": "Sessione del browser", "rooWantsToUse": "Roo vuole utilizzare il browser", "consoleLogs": "Log della console", "noNewLogs": "(Nessun nuovo log)", @@ -322,8 +323,10 @@ "launch": "Avvia browser su {{url}}", "click": "Clic ({{coordinate}})", "type": "Digita \"{{text}}\"", + "press": "Premi {{key}}", "scrollDown": "Scorri verso il basso", "scrollUp": "Scorri verso l'alto", + "hover": "Passa il mouse ({{coordinate}})", "close": "Chiudi browser" } }, diff --git a/webview-ui/src/i18n/locales/ja/chat.json b/webview-ui/src/i18n/locales/ja/chat.json index 4f6e40cadf7..4e686e74d43 100644 --- a/webview-ui/src/i18n/locales/ja/chat.json +++ b/webview-ui/src/i18n/locales/ja/chat.json @@ -306,6 +306,7 @@ "socialLinks": "XDiscord、またはr/RooCodeでフォローしてください 🚀" }, "browser": { + "session": "ブラウザセッション", "rooWantsToUse": "Rooはブラウザを使用したい", "consoleLogs": "コンソールログ", "noNewLogs": "(新しいログはありません)", @@ -322,8 +323,10 @@ "launch": "{{url}} でブラウザを起動", "click": "クリック ({{coordinate}})", "type": "入力 \"{{text}}\"", + "press": "{{key}}を押す", 
"scrollDown": "下にスクロール", "scrollUp": "上にスクロール", + "hover": "ホバー ({{coordinate}})", "close": "ブラウザを閉じる" } }, diff --git a/webview-ui/src/i18n/locales/ko/chat.json b/webview-ui/src/i18n/locales/ko/chat.json index 7d71191b017..430b9ccf743 100644 --- a/webview-ui/src/i18n/locales/ko/chat.json +++ b/webview-ui/src/i18n/locales/ko/chat.json @@ -306,6 +306,7 @@ "socialLinks": "X, Discord, 또는 r/RooCode에서 만나요 🚀" }, "browser": { + "session": "브라우저 세션", "rooWantsToUse": "Roo가 브라우저를 사용하고 싶어합니다", "consoleLogs": "콘솔 로그", "noNewLogs": "(새 로그 없음)", @@ -322,8 +323,10 @@ "launch": "{{url}}에서 브라우저 실행", "click": "클릭 ({{coordinate}})", "type": "입력 \"{{text}}\"", + "press": "{{key}} 누르기", "scrollDown": "아래로 스크롤", "scrollUp": "위로 스크롤", + "hover": "가리키기 ({{coordinate}})", "close": "브라우저 닫기" } }, diff --git a/webview-ui/src/i18n/locales/nl/chat.json b/webview-ui/src/i18n/locales/nl/chat.json index 82cdd0c46f9..55c9ed1dffe 100644 --- a/webview-ui/src/i18n/locales/nl/chat.json +++ b/webview-ui/src/i18n/locales/nl/chat.json @@ -306,6 +306,7 @@ "countdownDisplay": "{{count}}s" }, "browser": { + "session": "Browsersessie", "rooWantsToUse": "Roo wil de browser gebruiken", "consoleLogs": "Console-logboeken", "noNewLogs": "(Geen nieuwe logboeken)", @@ -322,8 +323,10 @@ "launch": "Browser starten op {{url}}", "click": "Klik ({{coordinate}})", "type": "Typ \"{{text}}\"", + "press": "Druk op {{key}}", "scrollDown": "Scroll naar beneden", "scrollUp": "Scroll naar boven", + "hover": "Zweven ({{coordinate}})", "close": "Browser sluiten" } }, diff --git a/webview-ui/src/i18n/locales/pl/chat.json b/webview-ui/src/i18n/locales/pl/chat.json index 7fe61db2a01..f5820122210 100644 --- a/webview-ui/src/i18n/locales/pl/chat.json +++ b/webview-ui/src/i18n/locales/pl/chat.json @@ -306,6 +306,7 @@ "socialLinks": "Dołącz do nas na X, Discord, lub r/RooCode 🚀" }, "browser": { + "session": "Sesja przeglądarki", "rooWantsToUse": "Roo chce użyć przeglądarki", "consoleLogs": "Logi konsoli", "noNewLogs": "(Brak nowych 
logów)", @@ -322,8 +323,10 @@ "launch": "Uruchom przeglądarkę na {{url}}", "click": "Kliknij ({{coordinate}})", "type": "Wpisz \"{{text}}\"", + "press": "Naciśnij {{key}}", "scrollDown": "Przewiń w dół", "scrollUp": "Przewiń w górę", + "hover": "Najedź ({{coordinate}})", "close": "Zamknij przeglądarkę" } }, diff --git a/webview-ui/src/i18n/locales/pt-BR/chat.json b/webview-ui/src/i18n/locales/pt-BR/chat.json index 4f287e6b538..661c7507c7f 100644 --- a/webview-ui/src/i18n/locales/pt-BR/chat.json +++ b/webview-ui/src/i18n/locales/pt-BR/chat.json @@ -306,6 +306,7 @@ "socialLinks": "Junte-se a nós no X, Discord, ou r/RooCode 🚀" }, "browser": { + "session": "Sessão do Navegador", "rooWantsToUse": "Roo quer usar o navegador", "consoleLogs": "Logs do console", "noNewLogs": "(Sem novos logs)", @@ -322,8 +323,10 @@ "launch": "Iniciar navegador em {{url}}", "click": "Clique ({{coordinate}})", "type": "Digitar \"{{text}}\"", + "press": "Pressione {{key}}", "scrollDown": "Rolar para baixo", "scrollUp": "Rolar para cima", + "hover": "Pairar ({{coordinate}})", "close": "Fechar navegador" } }, diff --git a/webview-ui/src/i18n/locales/ru/chat.json b/webview-ui/src/i18n/locales/ru/chat.json index 6cd11c1223b..f748820753e 100644 --- a/webview-ui/src/i18n/locales/ru/chat.json +++ b/webview-ui/src/i18n/locales/ru/chat.json @@ -307,6 +307,7 @@ "countdownDisplay": "{{count}}с" }, "browser": { + "session": "Сеанс браузера", "rooWantsToUse": "Roo хочет использовать браузер", "consoleLogs": "Логи консоли", "noNewLogs": "(Новых логов нет)", @@ -323,8 +324,10 @@ "launch": "Открыть браузер по адресу {{url}}", "click": "Клик ({{coordinate}})", "type": "Ввести \"{{text}}\"", + "press": "Нажать {{key}}", "scrollDown": "Прокрутить вниз", "scrollUp": "Прокрутить вверх", + "hover": "Навести ({{coordinate}})", "close": "Закрыть браузер" } }, diff --git a/webview-ui/src/i18n/locales/tr/chat.json b/webview-ui/src/i18n/locales/tr/chat.json index 44b9c5ae35b..7ed0a4b1bea 100644 --- 
a/webview-ui/src/i18n/locales/tr/chat.json +++ b/webview-ui/src/i18n/locales/tr/chat.json @@ -307,6 +307,7 @@ "socialLinks": "Bize X, Discord, veya r/RooCode'da katılın 🚀" }, "browser": { + "session": "Tarayıcı Oturumu", "rooWantsToUse": "Roo tarayıcıyı kullanmak istiyor", "consoleLogs": "Konsol Kayıtları", "noNewLogs": "(Yeni kayıt yok)", @@ -323,8 +324,10 @@ "launch": "{{url}} adresinde tarayıcı başlat", "click": "Tıkla ({{coordinate}})", "type": "Yaz \"{{text}}\"", + "press": "{{key}} tuşuna bas", "scrollDown": "Aşağı kaydır", "scrollUp": "Yukarı kaydır", + "hover": "Üzerine gel ({{coordinate}})", "close": "Tarayıcıyı kapat" } }, diff --git a/webview-ui/src/i18n/locales/vi/chat.json b/webview-ui/src/i18n/locales/vi/chat.json index 1359d48b593..2488ba9cb39 100644 --- a/webview-ui/src/i18n/locales/vi/chat.json +++ b/webview-ui/src/i18n/locales/vi/chat.json @@ -307,6 +307,7 @@ "socialLinks": "Tham gia với chúng tôi trên X, Discord, hoặc r/RooCode 🚀" }, "browser": { + "session": "Phiên trình duyệt", "rooWantsToUse": "Roo muốn sử dụng trình duyệt", "consoleLogs": "Nhật ký bảng điều khiển", "noNewLogs": "(Không có nhật ký mới)", @@ -323,8 +324,10 @@ "launch": "Khởi chạy trình duyệt tại {{url}}", "click": "Nhấp ({{coordinate}})", "type": "Gõ \"{{text}}\"", + "press": "Nhấn {{key}}", "scrollDown": "Cuộn xuống", "scrollUp": "Cuộn lên", + "hover": "Di chuột ({{coordinate}})", "close": "Đóng trình duyệt" } }, diff --git a/webview-ui/src/i18n/locales/zh-CN/chat.json b/webview-ui/src/i18n/locales/zh-CN/chat.json index fa35b68590f..a03048e5833 100644 --- a/webview-ui/src/i18n/locales/zh-CN/chat.json +++ b/webview-ui/src/i18n/locales/zh-CN/chat.json @@ -307,6 +307,7 @@ "socialLinks": "在 XDiscordr/RooCode 上关注我们 🚀" }, "browser": { + "session": "浏览器会话", "rooWantsToUse": "Roo想使用浏览器", "consoleLogs": "控制台日志", "noNewLogs": "(没有新日志)", @@ -323,8 +324,10 @@ "launch": "访问 {{url}}", "click": "点击 ({{coordinate}})", "type": "输入 \"{{text}}\"", + "press": "按 {{key}}", "scrollDown": "向下滚动", 
"scrollUp": "向上滚动", + "hover": "悬停 ({{coordinate}})", "close": "关闭浏览器" } }, diff --git a/webview-ui/src/i18n/locales/zh-TW/chat.json b/webview-ui/src/i18n/locales/zh-TW/chat.json index a0dee75d306..a4d7b71b08c 100644 --- a/webview-ui/src/i18n/locales/zh-TW/chat.json +++ b/webview-ui/src/i18n/locales/zh-TW/chat.json @@ -325,6 +325,7 @@ "countdownDisplay": "{{count}} 秒" }, "browser": { + "session": "瀏覽器會話", "rooWantsToUse": "Roo 想要使用瀏覽器", "consoleLogs": "主控台記錄", "noNewLogs": "(沒有新記錄)", @@ -341,8 +342,10 @@ "launch": "在 {{url}} 啟動瀏覽器", "click": "點選 ({{coordinate}})", "type": "輸入「{{text}}」", + "press": "按下 {{key}}", "scrollDown": "向下捲動", "scrollUp": "向上捲動", + "hover": "懸停 ({{coordinate}})", "close": "關閉瀏覽器" } }, diff --git a/webview-ui/vite.config.ts b/webview-ui/vite.config.ts index b38452a9902..6bf6412bfb0 100644 --- a/webview-ui/vite.config.ts +++ b/webview-ui/vite.config.ts @@ -101,6 +101,10 @@ export default defineConfig(({ mode }) => { // Ensure source maps are properly included in the build minify: mode === "production" ? 
"esbuild" : false, rollupOptions: { + input: { + index: resolve(__dirname, "index.html"), + "browser-panel": resolve(__dirname, "browser-panel.html"), + }, output: { entryFileNames: `assets/[name].js`, chunkFileNames: (chunkInfo) => { From c1cda4615856e0daca92315da0baa941650c224c Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Wed, 5 Nov 2025 12:57:36 -0700 Subject: [PATCH 02/16] ui(chat): move browser globe to task header; show grey when inactive and green Active when session active --- .../src/components/chat/ChatTextArea.tsx | 34 ++------- webview-ui/src/components/chat/TaskHeader.tsx | 74 ++++++++++++++++--- 2 files changed, 71 insertions(+), 37 deletions(-) diff --git a/webview-ui/src/components/chat/ChatTextArea.tsx b/webview-ui/src/components/chat/ChatTextArea.tsx index c9e017a3169..58f42a367bc 100644 --- a/webview-ui/src/components/chat/ChatTextArea.tsx +++ b/webview-ui/src/components/chat/ChatTextArea.tsx @@ -1,7 +1,7 @@ import React, { forwardRef, useCallback, useEffect, useLayoutEffect, useMemo, useRef, useState } from "react" import { useEvent } from "react-use" import DynamicTextArea from "react-textarea-autosize" -import { VolumeX, Image, WandSparkles, SendHorizontal, MessageSquareX, Globe } from "lucide-react" +import { VolumeX, Image, WandSparkles, SendHorizontal, MessageSquareX } from "lucide-react" import { mentionRegex, mentionRegexGlobal, commandRegexGlobal, unescapeSpaces } from "@roo/context-mentions" import { WebviewMessage } from "@roo/WebviewMessage" @@ -21,7 +21,7 @@ import { } from "@src/utils/context-mentions" import { cn } from "@src/lib/utils" import { convertToMentionPath } from "@src/utils/path-mentions" -import { StandardTooltip, Button } from "@src/components/ui" +import { StandardTooltip } from "@src/components/ui" import Thumbnails from "../common/Thumbnails" import { ModeSelector } from "./ModeSelector" @@ -1264,32 +1264,14 @@ export const ChatTextArea = forwardRef( )} - {!isEditMode && showBrowserDockToggle && ( - - - - )} 
{!isEditMode ? : null} {!isEditMode && cloudUserInfo && } + {/* keep props referenced after moving browser button */} +
diff --git a/webview-ui/src/components/chat/TaskHeader.tsx b/webview-ui/src/components/chat/TaskHeader.tsx index b06d6e64f5f..7010feac93e 100644 --- a/webview-ui/src/components/chat/TaskHeader.tsx +++ b/webview-ui/src/components/chat/TaskHeader.tsx @@ -1,4 +1,4 @@ -import { memo, useEffect, useRef, useState } from "react" +import { memo, useEffect, useRef, useState, useMemo } from "react" import { useTranslation } from "react-i18next" import { useCloudUpsell } from "@src/hooks/useCloudUpsell" import { CloudUpsellDialog } from "@src/components/cloud/CloudUpsellDialog" @@ -10,7 +10,8 @@ import { Coins, HardDriveDownload, HardDriveUpload, - FoldVerticalIcon, + FoldVertical, + Globe, } from "lucide-react" import prettyBytes from "pretty-bytes" @@ -21,9 +22,10 @@ import { findLastIndex } from "@roo/array" import { formatLargeNumber } from "@src/utils/format" import { cn } from "@src/lib/utils" -import { StandardTooltip } from "@src/components/ui" +import { StandardTooltip, Button } from "@src/components/ui" import { useExtensionState } from "@src/context/ExtensionStateContext" import { useSelectedModel } from "@/components/ui/hooks/useSelectedModel" +import { vscode } from "@src/utils/vscode" import Thumbnails from "../common/Thumbnails" @@ -59,7 +61,7 @@ const TaskHeader = ({ todos, }: TaskHeaderProps) => { const { t } = useTranslation() - const { apiConfiguration, currentTaskItem, clineMessages } = useExtensionState() + const { apiConfiguration, currentTaskItem, clineMessages, isBrowserSessionActive } = useExtensionState() const { id: modelId, info: model } = useSelectedModel(apiConfiguration) const [isTaskExpanded, setIsTaskExpanded] = useState(false) const [showLongRunningTaskMessage, setShowLongRunningTaskMessage] = useState(false) @@ -95,14 +97,29 @@ const TaskHeader = ({ const textRef = useRef(null) const contextWindow = model?.contextWindow || 1 + // Detect if this task had any browser session activity so we can show a grey globe when inactive + const 
browserSessionStartIndex = useMemo(() => { + const msgs = clineMessages || [] + for (let i = 0; i < msgs.length; i++) { + const m = msgs[i] as any + if (m?.ask === "browser_action_launch") return i + if (m?.say === "browser_session_status" && typeof m.text === "string" && m.text.includes("opened")) { + return i + } + } + return -1 + }, [clineMessages]) + + const showBrowserGlobe = browserSessionStartIndex !== -1 || !!isBrowserSessionActive + const condenseButton = ( - currentTaskItem && handleCondenseContext(currentTaskItem.id)} - /> - ) + currentTaskItem && handleCondenseContext(currentTaskItem.id)} + /> + ) const hasTodos = todos && Array.isArray(todos) && todos.length > 0 @@ -355,6 +372,41 @@ const TaskHeader = ({ )} {/* Todo list - always shown at bottom when todos exist */} {hasTodos && } + + {/* Browser session status moved from bottom bar to header (bottom-right) */} + {showBrowserGlobe && ( +
e.stopPropagation()}> + + + + {isBrowserSessionActive && ( + + Active + + )} +
+ )}
From 683e1d169708a925466a0a58ac3f498898104fba Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Wed, 12 Nov 2025 17:18:33 -0700 Subject: [PATCH 03/16] ci(knip): include webview browser panel in knip.json to avoid false positives; chore(webview): remove unused imports in ChatView --- knip.json | 2 +- webview-ui/src/components/chat/ChatView.tsx | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/knip.json b/knip.json index a847111981b..e15c62bda1b 100644 --- a/knip.json +++ b/knip.json @@ -16,7 +16,7 @@ "project": ["**/*.ts"] }, "webview-ui": { - "entry": ["src/index.tsx"], + "entry": ["src/index.tsx", "src/browser-panel.tsx"], "project": ["src/**/*.{ts,tsx}", "../src/shared/*.ts"] }, "packages/{build,cloud,evals,ipc,telemetry,types}": { diff --git a/webview-ui/src/components/chat/ChatView.tsx b/webview-ui/src/components/chat/ChatView.tsx index dbec6979f07..a7676c01392 100644 --- a/webview-ui/src/components/chat/ChatView.tsx +++ b/webview-ui/src/components/chat/ChatView.tsx @@ -14,7 +14,6 @@ import { appendImages } from "@src/utils/imageUtils" import type { ClineAsk, ClineMessage } from "@roo-code/types" import { ClineSayTool, ExtensionMessage } from "@roo/ExtensionMessage" -import { McpServer, McpTool } from "@roo/mcp" import { findLast } from "@roo/array" import { SuggestionItem } from "@roo-code/types" import { combineApiRequests } from "@roo/combineApiRequests" From 67c654031f99bca7d635a8e0296b486461a18b59 Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Wed, 12 Nov 2025 18:27:23 -0700 Subject: [PATCH 04/16] Browser Use 2.0: backend emits executedCoordinate for click/hover; FE consumes it; remove fragile session-status parsing; ensure ResizeObserver cleanup; all tests passing (#8941) --- src/core/tools/browserActionTool.ts | 290 ++++++++++++++++++ src/shared/ExtensionMessage.ts | 1 + .../src/components/chat/BrowserActionRow.tsx | 4 +- .../src/components/chat/BrowserSessionRow.tsx | 6 +- webview-ui/src/components/chat/ChatView.tsx | 4 - 
/**
 * Parses a coordinate string and scales it from image dimensions to viewport dimensions.
 *
 * The LLM examines the screenshot it receives (which may be downscaled by the API)
 * and reports coordinates as "x,y@widthxheight", where widthxheight is the size of
 * the image the LLM observed. The point is mapped proportionally onto the viewport.
 *
 * @param coordinate - Coordinate in the form "x,y@widthxheight" (e.g. "450,300@1024x768").
 *   Either `x` or `,` is accepted as the separator between width and height.
 * @param viewportWidth - Width of the viewport the point is mapped onto.
 * @param viewportHeight - Height of the viewport the point is mapped onto.
 * @returns The scaled coordinate as "x,y", each component rounded to the nearest integer.
 * @throws Error if the format is invalid, the image dimensions are missing, or either
 *   image dimension is zero.
 */
function scaleCoordinate(coordinate: string, viewportWidth: number, viewportHeight: number): string {
	// Parse coordinate with required image dimensions (accepts both 'x' and ',' as dimension separators)
	const match = coordinate.match(/^\s*(\d+)\s*,\s*(\d+)\s*@\s*(\d+)\s*[x,]\s*(\d+)\s*$/)

	if (!match) {
		throw new Error(
			`Invalid coordinate format: "${coordinate}". ` +
				`Expected format: "x,y@widthxheight" (e.g., "450,300@1024x768")`,
		)
	}

	const [, xStr, yStr, imgWidthStr, imgHeightStr] = match
	const x = parseInt(xStr, 10)
	const y = parseInt(yStr, 10)
	const imgWidth = parseInt(imgWidthStr, 10)
	const imgHeight = parseInt(imgHeightStr, 10)

	// The regex only guarantees digits, so "0" is still possible here. Dividing by a
	// zero dimension would yield NaN/Infinity and produce a bogus "NaN,NaN" coordinate,
	// so reject it explicitly with the same error type used for malformed input.
	if (imgWidth === 0 || imgHeight === 0) {
		throw new Error(
			`Invalid coordinate format: "${coordinate}". ` +
				`Image dimensions must be greater than zero (e.g., "450,300@1024x768")`,
		)
	}

	// Scale coordinates from image dimensions to viewport dimensions
	const scaledX = Math.round((x / imgWidth) * viewportWidth)
	const scaledY = Math.round((y / imgHeight) * viewportHeight)

	return `${scaledX},${scaledY}`
}
} else { + // Initialize with empty object to avoid "used before assigned" errors + let browserActionResult: BrowserActionResult = {} + + if (action === "launch") { + if (!url) { + cline.consecutiveMistakeCount++ + cline.recordToolError("browser_action") + pushToolResult(await cline.sayAndCreateMissingParamError("browser_action", "url")) + // Do not close the browser on parameter validation errors + return + } + + cline.consecutiveMistakeCount = 0 + const didApprove = await askApproval("browser_action_launch", url) + + if (!didApprove) { + return + } + + // NOTE: It's okay that we call cline message since the partial inspect_site is finished streaming. + // The only scenario we have to avoid is sending messages WHILE a partial message exists at the end of the messages array. + // For example the api_req_finished message would interfere with the partial message, so we needed to remove that. + + // Launch browser first (this triggers "Browser session opened" status message) + await cline.browserSession.launchBrowser() + + // Create browser_action say message AFTER launching so status appears first + await cline.say( + "browser_action", + JSON.stringify({ + action: "launch" as BrowserAction, + text: url, + } satisfies ClineSayBrowserAction), + undefined, + false, + ) + + browserActionResult = await cline.browserSession.navigateToUrl(url) + } else { + // Variables to hold validated and processed parameters + let processedCoordinate = coordinate + + if (action === "click" || action === "hover") { + if (!coordinate) { + cline.consecutiveMistakeCount++ + cline.recordToolError("browser_action") + pushToolResult(await cline.sayAndCreateMissingParamError("browser_action", "coordinate")) + // Do not close the browser on parameter validation errors + return // can't be within an inner switch + } + + // Get viewport dimensions from the browser session + const viewportSize = cline.browserSession.getViewportSize() + const viewportWidth = viewportSize.width || 900 // default to 
900 if not available + const viewportHeight = viewportSize.height || 600 // default to 600 if not available + + // Scale coordinate from image dimensions to viewport dimensions + try { + processedCoordinate = scaleCoordinate(coordinate, viewportWidth, viewportHeight) + } catch (error) { + cline.consecutiveMistakeCount++ + cline.recordToolError("browser_action") + pushToolResult( + await cline.sayAndCreateMissingParamError( + "browser_action", + "coordinate", + error instanceof Error ? error.message : String(error), + ), + ) + return + } + } + + if (action === "type" || action === "press") { + if (!text) { + cline.consecutiveMistakeCount++ + cline.recordToolError("browser_action") + pushToolResult(await cline.sayAndCreateMissingParamError("browser_action", "text")) + // Do not close the browser on parameter validation errors + return + } + } + + if (action === "resize") { + if (!size) { + cline.consecutiveMistakeCount++ + cline.recordToolError("browser_action") + pushToolResult(await cline.sayAndCreateMissingParamError("browser_action", "size")) + // Do not close the browser on parameter validation errors + return + } + } + + cline.consecutiveMistakeCount = 0 + + // Prepare say payload; include executedCoordinate for pointer actions + const sayPayload: ClineSayBrowserAction & { executedCoordinate?: string } = { + action: action as BrowserAction, + coordinate, + text, + size, + } + if ((action === "click" || action === "hover") && processedCoordinate) { + sayPayload.executedCoordinate = processedCoordinate + } + await cline.say("browser_action", JSON.stringify(sayPayload), undefined, false) + + switch (action) { + case "click": + browserActionResult = await cline.browserSession.click(processedCoordinate!) + break + case "hover": + browserActionResult = await cline.browserSession.hover(processedCoordinate!) + break + case "type": + browserActionResult = await cline.browserSession.type(text!) 
+ break + case "press": + browserActionResult = await cline.browserSession.press(text!) + break + case "scroll_down": + browserActionResult = await cline.browserSession.scrollDown() + break + case "scroll_up": + browserActionResult = await cline.browserSession.scrollUp() + break + case "resize": + browserActionResult = await cline.browserSession.resize(size!) + break + case "close": + browserActionResult = await cline.browserSession.closeBrowser() + break + } + } + + switch (action) { + case "launch": + case "click": + case "hover": + case "type": + case "press": + case "scroll_down": + case "scroll_up": + case "resize": { + await cline.say("browser_action_result", JSON.stringify(browserActionResult)) + + const images = browserActionResult?.screenshot ? [browserActionResult.screenshot] : [] + + let messageText = `The browser action has been executed.` + + messageText += `\n\n**CRITICAL**: When providing click/hover coordinates:` + messageText += `\n1. Screenshot dimensions != Browser viewport dimensions` + messageText += `\n2. Measure x,y on the screenshot image you see below` + messageText += `\n3. Use format: x,y@WIDTHxHEIGHT where WIDTHxHEIGHT is the EXACT pixel size of the screenshot image` + messageText += `\n4. Never use the browser viewport size for WIDTHxHEIGHT - it is only for reference and is often larger than the screenshot` + messageText += `\n5. 
Screenshots are often downscaled - always use the dimensions you see in the image` + messageText += `\nExample: Viewport 1280x800, screenshot 1000x625, click (500,300) -> 500,300@1000x625` + + // Include browser viewport dimensions (for reference only) + if (browserActionResult?.viewportWidth && browserActionResult?.viewportHeight) { + messageText += `\n\nBrowser viewport: ${browserActionResult.viewportWidth}x${browserActionResult.viewportHeight}` + } + + // Include cursor position if available + if (browserActionResult?.currentMousePosition) { + messageText += `\nCursor position: ${browserActionResult.currentMousePosition}` + } + + messageText += `\n\nConsole logs:\n${browserActionResult?.logs || "(No new logs)"}\n` + + if (images.length > 0) { + const blocks = [ + ...formatResponse.imageBlocks(images), + { type: "text", text: messageText } as Anthropic.TextBlockParam, + ] + pushToolResult(blocks) + } else { + pushToolResult(messageText) + } + + break + } + case "close": + pushToolResult( + formatResponse.toolResult( + `The browser has been closed. 
You may now proceed to using other tools.`, + ), + ) + + break + } + + return + } + } catch (error) { + // Keep the browser session alive on errors; report the error without terminating the session + await handleError("executing browser action", error) + return + } +} diff --git a/src/shared/ExtensionMessage.ts b/src/shared/ExtensionMessage.ts index e456452fe2c..5b6214b1219 100644 --- a/src/shared/ExtensionMessage.ts +++ b/src/shared/ExtensionMessage.ts @@ -441,6 +441,7 @@ export interface ClineSayBrowserAction { coordinate?: string size?: string text?: string + executedCoordinate?: string } export type BrowserActionResult = { diff --git a/webview-ui/src/components/chat/BrowserActionRow.tsx b/webview-ui/src/components/chat/BrowserActionRow.tsx index 9b13b9426c0..0727e0c765f 100644 --- a/webview-ui/src/components/chat/BrowserActionRow.tsx +++ b/webview-ui/src/components/chat/BrowserActionRow.tsx @@ -151,13 +151,13 @@ const BrowserActionRow = memo(({ message, nextMessage, actionIndex, totalActions case "launch": return `Launched browser` case "click": - return `Clicked at: ${getViewportCoordinate(browserAction.coordinate)}` + return `Clicked at: ${browserAction.executedCoordinate || getViewportCoordinate(browserAction.coordinate)}` case "type": return `Typed: ${browserAction.text}` case "press": return `Pressed key: ${prettyKey(browserAction.text)}` case "hover": - return `Hovered at: ${getViewportCoordinate(browserAction.coordinate)}` + return `Hovered at: ${browserAction.executedCoordinate || getViewportCoordinate(browserAction.coordinate)}` case "scroll_down": return "Scrolled down" case "scroll_up": diff --git a/webview-ui/src/components/chat/BrowserSessionRow.tsx b/webview-ui/src/components/chat/BrowserSessionRow.tsx index 695d40bc848..79322dadb53 100644 --- a/webview-ui/src/components/chat/BrowserSessionRow.tsx +++ b/webview-ui/src/components/chat/BrowserSessionRow.tsx @@ -73,6 +73,7 @@ const prettyKey = (k?: string): string => { const getBrowserActionText = ( 
action: BrowserAction, + executedCoordinate?: string, coordinate?: string, text?: string, size?: string, @@ -113,7 +114,7 @@ const getBrowserActionText = ( case "launch": return `Launched browser` case "click": - return `Clicked at: ${getViewportCoordinate(coordinate)}` + return `Clicked at: ${executedCoordinate || getViewportCoordinate(coordinate)}` case "type": return `Typed: ${text}` case "press": @@ -123,7 +124,7 @@ const getBrowserActionText = ( case "scroll_up": return "Scrolled up" case "hover": - return `Hovered at: ${getViewportCoordinate(coordinate)}` + return `Hovered at: ${executedCoordinate || getViewportCoordinate(coordinate)}` case "resize": return `Resized to: ${size?.split(/[x,]/).join(" x ")}` case "close": @@ -623,6 +624,7 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { {getBrowserActionText( action.action, + pages[currentPageIndex]?.mousePosition, action.coordinate, action.text, pageSize, diff --git a/webview-ui/src/components/chat/ChatView.tsx b/webview-ui/src/components/chat/ChatView.tsx index a7676c01392..3b53e016841 100644 --- a/webview-ui/src/components/chat/ChatView.tsx +++ b/webview-ui/src/components/chat/ChatView.tsx @@ -972,10 +972,6 @@ const ChatViewComponent: React.ForwardRefRenderFunction currentTaskItem && handleCondenseContext(currentTaskItem.id)} - /> - ) + currentTaskItem && handleCondenseContext(currentTaskItem.id)} + /> + ) const hasTodos = todos && Array.isArray(todos) && todos.length > 0 From afdafb33698edbec9bd15524a2730af9041380e4 Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Wed, 12 Nov 2025 18:59:06 -0700 Subject: [PATCH 05/16] Update webview-ui/src/components/chat/BrowserSessionRow.tsx Co-authored-by: roomote[bot] <219738659+roomote[bot]@users.noreply.github.com> --- .../src/components/chat/BrowserSessionRow.tsx | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/webview-ui/src/components/chat/BrowserSessionRow.tsx 
b/webview-ui/src/components/chat/BrowserSessionRow.tsx index 79322dadb53..5f4665126d3 100644 --- a/webview-ui/src/components/chat/BrowserSessionRow.tsx +++ b/webview-ui/src/components/chat/BrowserSessionRow.tsx @@ -622,15 +622,15 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { <> {getActionIcon(action.action)} - {getBrowserActionText( - action.action, - pages[currentPageIndex]?.mousePosition, - action.coordinate, - action.text, - pageSize, - pageViewportWidth, - pageViewportHeight, - )} + {getBrowserActionText( + action.action, + action.executedCoordinate, + action.coordinate, + action.text, + pageSize, + pageViewportWidth, + pageViewportHeight, + )} ) From 31eab6212e8a2736831ca9aaeca13c990473753a Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Mon, 17 Nov 2025 22:34:54 -0700 Subject: [PATCH 06/16] feat: show browser session spinner while browser action is running --- .../src/components/chat/BrowserSessionRow.tsx | 51 ++++--- .../chat/__tests__/BrowserSessionRow.spec.tsx | 126 ++++++++++++++++++ 2 files changed, 155 insertions(+), 22 deletions(-) create mode 100644 webview-ui/src/components/chat/__tests__/BrowserSessionRow.spec.tsx diff --git a/webview-ui/src/components/chat/BrowserSessionRow.tsx b/webview-ui/src/components/chat/BrowserSessionRow.tsx index 5f4665126d3..191febc7a15 100644 --- a/webview-ui/src/components/chat/BrowserSessionRow.tsx +++ b/webview-ui/src/components/chat/BrowserSessionRow.tsx @@ -9,6 +9,7 @@ import { vscode } from "@src/utils/vscode" import { useExtensionState } from "@src/context/ExtensionStateContext" import CodeBlock from "../common/CodeBlock" +import { ProgressIndicator } from "./ProgressIndicator" import { Button, StandardTooltip } from "@src/components/ui" import { Globe, @@ -459,7 +460,6 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { // Latest non-close browser_action for header summary (fallback) - // Determine if the overall browser session is still active (spins until 'close') 
const lastBrowserActionOverall = useMemo(() => { const all = messages.filter((m) => m.say === "browser_action") return all.at(-1) @@ -468,20 +468,22 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { // Use actual Playwright session state from extension (not message parsing) const isBrowserSessionOpen = isBrowserSessionActive - // Check if currently performing a browser action (for spinner) - const _isSessionActive = useMemo(() => { - // Only show active spinner if a session has started - const started = messages.some((m) => m.say === "browser_action_result") - if (!started) return false - // If the last API request got interrupted/cancelled, treat session as inactive - if (isLastApiReqInterrupted) return false - if (!lastBrowserActionOverall) return true - try { - const act = JSON.parse(lastBrowserActionOverall.text || "{}") as ClineSayBrowserAction - return act.action !== "close" - } catch { + // Check if a browser action is currently in flight (for spinner) + const isActionRunning = useMemo(() => { + if (!lastBrowserActionOverall || isLastApiReqInterrupted) { + return false + } + + // Find the last browser_action_result (including empty text) to detect completion + const lastBrowserActionResult = [...messages].reverse().find((m) => m.say === "browser_action_result") + + if (!lastBrowserActionResult) { + // We have at least one action, but haven't seen any result yet return true } + + // If the last action happened after the last result, it's still running + return lastBrowserActionOverall.ts > lastBrowserActionResult.ts }, [messages, lastBrowserActionOverall, isLastApiReqInterrupted]) // Browser session drawer never auto-expands - user must manually toggle it @@ -592,6 +594,11 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { gap: 8, }}> {t("chat:browser.session")} + {isActionRunning && ( + + )} {pages.length > 0 && ( { <> {getActionIcon(action.action)} - {getBrowserActionText( - action.action, - 
action.executedCoordinate, - action.coordinate, - action.text, - pageSize, - pageViewportWidth, - pageViewportHeight, - )} + {getBrowserActionText( + action.action, + action.executedCoordinate, + action.coordinate, + action.text, + pageSize, + pageViewportWidth, + pageViewportHeight, + )} ) diff --git a/webview-ui/src/components/chat/__tests__/BrowserSessionRow.spec.tsx b/webview-ui/src/components/chat/__tests__/BrowserSessionRow.spec.tsx new file mode 100644 index 00000000000..684145f2556 --- /dev/null +++ b/webview-ui/src/components/chat/__tests__/BrowserSessionRow.spec.tsx @@ -0,0 +1,126 @@ +import React from "react" +import { describe, it, expect, vi } from "vitest" +import { render, screen } from "@testing-library/react" + +import BrowserSessionRow from "../BrowserSessionRow" + +// Mock ExtensionStateContext so BrowserSessionRow falls back to props +vi.mock("@src/context/ExtensionStateContext", () => ({ + useExtensionState: () => { + throw new Error("No ExtensionStateContext in test environment") + }, +})) + +// Simplify i18n usage and provide initReactI18next for i18n setup +vi.mock("react-i18next", () => ({ + useTranslation: () => ({ + t: (key: string) => key, + }), + initReactI18next: { + type: "3rdParty", + init: () => {}, + }, +})) + +// Replace ProgressIndicator with a simple test marker +vi.mock("../ProgressIndicator", () => ({ + ProgressIndicator: () =>
, +})) + +const baseProps = { + isExpanded: () => false, + onToggleExpand: () => {}, + lastModifiedMessage: undefined, + isLast: true, + onHeightChange: () => {}, + isStreaming: false, +} + +describe("BrowserSessionRow - action spinner", () => { + it("does not show spinner when there are no browser actions", () => { + const messages = [ + { + type: "say", + say: "task", + ts: 1, + text: "Task started", + } as any, + ] + + render() + + expect(screen.queryByTestId("browser-session-spinner")).toBeNull() + }) + + it("shows spinner while the latest browser action is still running", () => { + const messages = [ + { + type: "say", + say: "task", + ts: 1, + text: "Task started", + } as any, + { + type: "say", + say: "browser_action", + ts: 2, + text: JSON.stringify({ action: "click" }), + } as any, + { + type: "say", + say: "browser_action_result", + ts: 3, + text: JSON.stringify({ currentUrl: "https://example.com" }), + } as any, + { + type: "say", + say: "browser_action", + ts: 4, + text: JSON.stringify({ action: "scroll_down" }), + } as any, + ] + + render() + + expect(screen.getByTestId("browser-session-spinner")).toBeInTheDocument() + }) + + it("hides spinner once the latest browser action has a result", () => { + const messages = [ + { + type: "say", + say: "task", + ts: 1, + text: "Task started", + } as any, + { + type: "say", + say: "browser_action", + ts: 2, + text: JSON.stringify({ action: "click" }), + } as any, + { + type: "say", + say: "browser_action_result", + ts: 3, + text: JSON.stringify({ currentUrl: "https://example.com" }), + } as any, + { + type: "say", + say: "browser_action", + ts: 4, + text: JSON.stringify({ action: "scroll_down" }), + } as any, + { + type: "say", + say: "browser_action_result", + ts: 5, + text: JSON.stringify({ currentUrl: "https://example.com/page2" }), + } as any, + ] + + render() + + expect(screen.queryByTestId("browser-session-spinner")).toBeNull() + }) +}) From 2b9ea040f060fbdd15f3eee6bb0963996f28927a Mon Sep 17 00:00:00 2001 
From: Hannes Rudolph Date: Mon, 17 Nov 2025 22:43:39 -0700 Subject: [PATCH 07/16] chore: address remaining browser review feedback --- src/services/browser/BrowserSession.ts | 6 +- .../ChatView.followup-in-session.spec.tsx | 119 ------------------ 2 files changed, 2 insertions(+), 123 deletions(-) delete mode 100644 webview-ui/src/components/chat/__tests__/ChatView.followup-in-session.spec.tsx diff --git a/src/services/browser/BrowserSession.ts b/src/services/browser/BrowserSession.ts index fdd897c5ac6..d76f6768c4e 100644 --- a/src/services/browser/BrowserSession.ts +++ b/src/services/browser/BrowserSession.ts @@ -209,8 +209,6 @@ export class BrowserSession { const wasActive = !!(this.browser || this.page) if (wasActive) { - console.log("closing browser...") - if (this.isUsingRemoteBrowser && this.browser) { await this.browser.disconnect().catch(() => {}) } else { @@ -801,7 +799,7 @@ export class BrowserSession { y, ) } catch (error) { - console.log("Failed to draw cursor indicator:", error) + console.error("Failed to draw cursor indicator:", error) } } @@ -817,7 +815,7 @@ export class BrowserSession { } }) } catch (error) { - console.log("Failed to remove cursor indicator:", error) + console.error("Failed to remove cursor indicator:", error) } } diff --git a/webview-ui/src/components/chat/__tests__/ChatView.followup-in-session.spec.tsx b/webview-ui/src/components/chat/__tests__/ChatView.followup-in-session.spec.tsx deleted file mode 100644 index e870e8df3c2..00000000000 --- a/webview-ui/src/components/chat/__tests__/ChatView.followup-in-session.spec.tsx +++ /dev/null @@ -1,119 +0,0 @@ -// npx vitest run src/components/chat/__tests__/ChatView.followup-in-session.spec.tsx - -import { render, waitFor } from "@/utils/test-utils" -import { QueryClient, QueryClientProvider } from "@tanstack/react-query" -import { ExtensionStateContextProvider } from "@src/context/ExtensionStateContext" -import ChatView, { ChatViewProps } from "../ChatView" - 
-vi.mock("@src/utils/vscode", () => ({ - vscode: { postMessage: vi.fn() }, -})) - -vi.mock("rehype-highlight", () => ({ default: () => () => {} })) -vi.mock("hast-util-to-text", () => ({ default: () => "" })) - -vi.mock("../BrowserSessionRow", () => ({ - default: function MockBrowserSessionRow({ messages }: { messages: any[] }) { - return
{JSON.stringify(messages)}
- }, -})) - -vi.mock("../ChatRow", () => ({ - default: function MockChatRow({ message }: { message: any }) { - return
{JSON.stringify(message)}
- }, -})) - -vi.mock("../TaskHeader", () => ({ - default: function MockTaskHeader() { - return
- }, -})) - -vi.mock("@src/components/common/CodeBlock", () => ({ - default: () => null, - CODE_BLOCK_BG_COLOR: "rgb(30, 30, 30)", -})) - -const queryClient = new QueryClient() - -const defaultProps: ChatViewProps = { - isHidden: false, - showAnnouncement: false, - hideAnnouncement: () => {}, -} - -const renderChatView = (props: Partial = {}) => { - return render( - - - - - , - ) -} - -const mockPostMessage = (state: any) => { - window.postMessage( - { - type: "state", - state: { - version: "1.0.0", - clineMessages: [], - taskHistory: [], - shouldShowAnnouncement: false, - allowedCommands: [], - autoApprovalEnabled: true, - ...state, - }, - }, - "*", - ) -} - -describe("ChatView followup inside browser session", () => { - beforeEach(() => { - vi.clearAllMocks() - }) - - it.skip("renders followup ask as a regular ChatRow while session banner is visible", async () => { - renderChatView() - - const ts = Date.now() - - // Send initial message with browser session and followup - mockPostMessage({ - alwaysAllowBrowser: true, - clineMessages: [ - { type: "say", say: "task", ts: ts - 4000, text: "Initial task" }, - { - type: "ask", - ask: "browser_action_launch", - ts: ts - 3000, - text: "http://example.com", - partial: false, - }, - { type: "say", say: "browser_action_result", ts: ts - 2000, text: "" }, - { - type: "ask", - ask: "followup", - ts: ts, - text: JSON.stringify({ question: "Continue?", suggest: [{ answer: "Yes" }, { answer: "No" }] }), - partial: false, - }, - ], - }) - - // Banner should be present (only contains browser_action_launch and browser_action_result) - await waitFor(() => { - const banner = document.querySelector('[data-testid="browser-session"]') - expect(banner).not.toBeNull() - }) - - // At least one ChatRow should render (the followup question) - await waitFor(() => { - const chatRows = document.querySelectorAll('[data-testid="chat-row"]') - expect(chatRows.length).toBeGreaterThan(0) - }) - }) -}) From 90de34b1567b6b6c568a2f6c2368a7895299cced 
Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Mon, 17 Nov 2025 23:15:53 -0700 Subject: [PATCH 08/16] refactor: split BrowserSessionRow header and drawer --- .../src/components/chat/BrowserSessionRow.tsx | 1107 +++++++++-------- 1 file changed, 557 insertions(+), 550 deletions(-) diff --git a/webview-ui/src/components/chat/BrowserSessionRow.tsx b/webview-ui/src/components/chat/BrowserSessionRow.tsx index 191febc7a15..37e323242a3 100644 --- a/webview-ui/src/components/chat/BrowserSessionRow.tsx +++ b/webview-ui/src/components/chat/BrowserSessionRow.tsx @@ -532,618 +532,625 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { } }, []) - const browserSessionRow = ( + const BrowserSessionHeader: React.FC = () => (
- {/* Main header - clickable to expand/collapse, mimics TodoList style */} -
+ setNextActionsExpanded((v) => { + const nv = !v + onExpandChange?.(nv) + return nv + }), + })} + /> + + {/* Simple text: "Browser Session" with step counter */} + + setNextActionsExpanded((v) => { + const nv = !v + onExpandChange?.(nv) + return nv + }), + })} + style={{ + flex: 1, + fontSize: 13, + fontWeight: 500, + lineHeight: "22px", + color: "var(--vscode-editor-foreground)", + cursor: fullScreen ? "default" : "pointer", display: "flex", alignItems: "center", gap: 8, - marginBottom: 0, - userSelect: "none", }}> - {/* Globe icon - green when browser session is open */} - + )} + {pages.length > 0 && ( + + {currentPageIndex + 1}/{pages.length} + + )} + {/* Inline action summary to the right, similar to ChatView */} + - setNextActionsExpanded((v) => { - const nv = !v - onExpandChange?.(nv) - return nv - }), - })} - /> + display: "inline-flex", + alignItems: "center", + gap: 6, + fontSize: 12, + color: "var(--vscode-descriptionForeground)", + fontWeight: 400, + }}> + {(() => { + const action = currentPageAction + const pageSize = pages[currentPageIndex]?.size + const pageViewportWidth = pages[currentPageIndex]?.viewportWidth + const pageViewportHeight = pages[currentPageIndex]?.viewportHeight + if (action) { + return ( + <> + {getActionIcon(action.action)} + + {getBrowserActionText( + action.action, + action.executedCoordinate, + action.coordinate, + action.text, + pageSize, + pageViewportWidth, + pageViewportHeight, + )} + + + ) + } else if (initialUrl) { + return ( + <> + {getActionIcon("launch" as any)} + {getBrowserActionText("launch", undefined, initialUrl, undefined)} + + ) + } + return null + })()} + + - {/* Simple text: "Browser Session" with step counter */} + {/* Right side: cost badge and chevron */} + {totalApiCost > 0 && ( +
+ ${totalApiCost.toFixed(4)} +
+ )} + + {/* Chevron toggle hidden in fullScreen */} + {!fullScreen && ( - setNextActionsExpanded((v) => { - const nv = !v - onExpandChange?.(nv) - return nv - }), - })} + onClick={() => + setNextActionsExpanded((v) => { + const nv = !v + onExpandChange?.(nv) + return nv + }) + } + className={`codicon ${nextActionsExpanded ? "codicon-chevron-up" : "codicon-chevron-down"}`} style={{ - flex: 1, fontSize: 13, fontWeight: 500, lineHeight: "22px", color: "var(--vscode-editor-foreground)", - cursor: fullScreen ? "default" : "pointer", + cursor: "pointer", + display: "inline-block", + transition: "transform 150ms ease", + }} + /> + )} + + {/* Kill browser button hidden from header in fullScreen; kept in toolbar */} + {isBrowserSessionOpen && !fullScreen && ( + + + + )} +
+ ) + + const BrowserSessionDrawer: React.FC = () => { + if (!nextActionsExpanded) return null + + return ( +
+ {/* Browser-like Toolbar */} +
- {t("chat:browser.session")} - {isActionRunning && ( - - )} - {pages.length > 0 && ( - + + - {/* Right side: cost badge and chevron */} - {totalApiCost > 0 && ( + {/* Back */} + + + + + {/* Forward */} + + + + + {/* Go to end */} + + + + + {/* Address Bar */}
- ${totalApiCost.toFixed(4)} + + + {displayState.url || "about:blank"} + + {/* Step counter removed */}
- )} - {/* Chevron toggle hidden in fullScreen */} - {!fullScreen && ( - - setNextActionsExpanded((v) => { - const nv = !v - onExpandChange?.(nv) - return nv - }) - } - className={`codicon ${nextActionsExpanded ? "codicon-chevron-up" : "codicon-chevron-down"}`} - style={{ - fontSize: 13, - fontWeight: 500, - lineHeight: "22px", - color: "var(--vscode-editor-foreground)", - cursor: "pointer", - display: "inline-block", - transition: "transform 150ms ease", - }} - /> - )} - - {/* Kill browser button hidden from header in fullScreen; kept in toolbar */} - {isBrowserSessionOpen && !fullScreen && ( + {/* Kill (Disconnect) replaces Reload */} - + + - )} -
- {/* Expanded drawer content - inline/fullscreen */} - {nextActionsExpanded && ( + {/* Open External */} + + + + + {/* Copy URL */} + + + +
+ {/* Screenshot Area */}
- {/* Browser-like Toolbar */} -
- {/* Go to beginning */} - - - - - {/* Back */} - - - - - {/* Forward */} - - - - - {/* Go to end */} - - - - - {/* Address Bar */} + {displayState.screenshot ? ( + {t("chat:browser.screenshot")} + vscode.postMessage({ + type: "openImage", + text: displayState.screenshot, + }) + } + /> + ) : (
- - {displayState.url || "about:blank"} - - {/* Step counter removed */} -
- - {/* Kill (Disconnect) replaces Reload */} - - - - - {/* Open External */} - - - - - {/* Copy URL */} - - - -
- {/* Screenshot Area */} -
- {displayState.screenshot ? ( - {t("chat:browser.screenshot")} - vscode.postMessage({ - type: "openImage", - text: displayState.screenshot, - }) - } + className="codicon codicon-globe" + style={{ fontSize: "80px", color: "var(--vscode-descriptionForeground)" }} /> - ) : ( -
- -
- )} - {displayState.mousePosition && - (() => { - // Use measured size if available; otherwise fall back to current client size so cursor remains visible - const containerW = sW || (screenshotRef.current?.clientWidth ?? 0) - const containerH = sH || (screenshotRef.current?.clientHeight ?? 0) - if (containerW <= 0 || containerH <= 0) { - // Minimal fallback to keep cursor visible before first measurement - return ( - - ) - } - - // Compute displayed image box within the container for object-fit: contain; objectPosition: top center - const imgAspect = cursorViewportWidth / cursorViewportHeight - const containerAspect = containerW / containerH - let displayW = containerW - let displayH = containerH - let offsetX = 0 - let offsetY = 0 - if (containerAspect > imgAspect) { - // Full height, letterboxed left/right; top aligned - displayH = containerH - displayW = containerH * imgAspect - offsetX = (containerW - displayW) / 2 - offsetY = 0 - } else { - // Full width, potential space below; top aligned - displayW = containerW - displayH = containerW / imgAspect - offsetX = 0 - offsetY = 0 - } - - // Parse "x,y" or "x,y@widthxheight" for original basis - const m = /^\s*(\d+)\s*,\s*(\d+)(?:\s*@\s*(\d+)\s*[x,]\s*(\d+))?\s*$/.exec( - displayState.mousePosition || "", - ) - const mx = parseInt(m?.[1] || "0", 10) - const my = parseInt(m?.[2] || "0", 10) - const baseW = m?.[3] ? parseInt(m[3], 10) : cursorViewportWidth - const baseH = m?.[4] ? parseInt(m[4], 10) : cursorViewportHeight - - const leftPx = offsetX + (baseW > 0 ? (mx / baseW) * displayW : 0) - const topPx = offsetY + (baseH > 0 ? (my / baseH) * displayH : 0) - +
+ )} + {displayState.mousePosition && + (() => { + // Use measured size if available; otherwise fall back to current client size so cursor remains visible + const containerW = sW || (screenshotRef.current?.clientWidth ?? 0) + const containerH = sH || (screenshotRef.current?.clientHeight ?? 0) + if (containerW <= 0 || containerH <= 0) { + // Minimal fallback to keep cursor visible before first measurement return ( ) - })()} -
+ } + + // Compute displayed image box within the container for object-fit: contain; objectPosition: top center + const imgAspect = cursorViewportWidth / cursorViewportHeight + const containerAspect = containerW / containerH + let displayW = containerW + let displayH = containerH + let offsetX = 0 + let offsetY = 0 + if (containerAspect > imgAspect) { + // Full height, letterboxed left/right; top aligned + displayH = containerH + displayW = containerH * imgAspect + offsetX = (containerW - displayW) / 2 + offsetY = 0 + } else { + // Full width, potential space below; top aligned + displayW = containerW + displayH = containerW / imgAspect + offsetX = 0 + offsetY = 0 + } - {/* Browser Action summary moved inline to header; row removed */} + // Parse "x,y" or "x,y@widthxheight" for original basis + const m = /^\s*(\d+)\s*,\s*(\d+)(?:\s*@\s*(\d+)\s*[x,]\s*(\d+))?\s*$/.exec( + displayState.mousePosition || "", + ) + const mx = parseInt(m?.[1] || "0", 10) + const my = parseInt(m?.[2] || "0", 10) + const baseW = m?.[3] ? parseInt(m[3], 10) : cursorViewportWidth + const baseH = m?.[4] ? parseInt(m[4], 10) : cursorViewportHeight + + const leftPx = offsetX + (baseW > 0 ? (mx / baseW) * displayW : 0) + const topPx = offsetY + (baseH > 0 ? (my / baseH) * displayH : 0) + + return ( + + ) + })()} +
- {/* Console Logs Section (collapsible, default collapsed) */} + {/* Browser Action summary moved inline to header; row removed */} + + {/* Console Logs Section (collapsible, default collapsed) */} +
{ + e.stopPropagation() + setConsoleLogsExpanded((v) => !v) + }} + className="text-vscode-editor-foreground/70 hover:text-vscode-editor-foreground transition-colors" style={{ - padding: "8px 10px", - // Pin logs to bottom of the fullscreen drawer - marginTop: fullScreen ? "auto" : undefined, + display: "flex", + alignItems: "center", + gap: "8px", + marginBottom: consoleLogsExpanded ? "6px" : 0, + cursor: "pointer", }}> + + + {t("chat:browser.consoleLogs")} + + + {/* Log type indicators */}
{ - e.stopPropagation() - setConsoleLogsExpanded((v) => !v) - }} - className="text-vscode-editor-foreground/70 hover:text-vscode-editor-foreground transition-colors" - style={{ - display: "flex", - alignItems: "center", - gap: "8px", - marginBottom: consoleLogsExpanded ? "6px" : 0, - cursor: "pointer", - }}> - - - {t("chat:browser.consoleLogs")} - - - {/* Log type indicators */} -
e.stopPropagation()} - style={{ display: "flex", alignItems: "center", gap: 6, marginLeft: "auto" }}> - {logTypeMeta.map(({ key, label }) => { - const isAll = key === "all" - const count = isAll - ? (Object.values(parsedLogs.counts) as number[]).reduce((a, b) => a + b, 0) - : parsedLogs.counts[key as "debug" | "info" | "warn" | "error" | "log"] - const isActive = logFilter === (key as any) - const disabled = count === 0 - return ( - - ) - })} - setConsoleLogsExpanded((v) => !v)} - className={`codicon codicon-chevron-${consoleLogsExpanded ? "down" : "right"}`} - style={{ marginLeft: 6 }} - /> -
+ onClick={(e) => e.stopPropagation()} + style={{ display: "flex", alignItems: "center", gap: 6, marginLeft: "auto" }}> + {logTypeMeta.map(({ key, label }) => { + const isAll = key === "all" + const count = isAll + ? (Object.values(parsedLogs.counts) as number[]).reduce((a, b) => a + b, 0) + : parsedLogs.counts[key as "debug" | "info" | "warn" | "error" | "log"] + const isActive = logFilter === (key as any) + const disabled = count === 0 + return ( + + ) + })} + setConsoleLogsExpanded((v) => !v)} + className={`codicon codicon-chevron-${consoleLogsExpanded ? "down" : "right"}`} + style={{ marginLeft: 6 }} + />
- {consoleLogsExpanded && ( -
- -
- )}
+ {consoleLogsExpanded && ( +
+ +
+ )}
- )} +
+ ) + } + + const browserSessionRow = ( +
+ + + {/* Expanded drawer content - inline/fullscreen */} +
) From 8de74a041c54d8be16e66a8ce33cc6de1598d044 Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Wed, 19 Nov 2025 16:26:50 -0700 Subject: [PATCH 09/16] fixed file names --- src/core/tools/{browserActionTool.ts => BrowserActionTool.ts} | 0 ...caling.spec.ts => BrowserActionTool.coordinateScaling.spec.ts} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename src/core/tools/{browserActionTool.ts => BrowserActionTool.ts} (100%) rename src/core/tools/__tests__/{browserActionTool.coordinateScaling.spec.ts => BrowserActionTool.coordinateScaling.spec.ts} (100%) diff --git a/src/core/tools/browserActionTool.ts b/src/core/tools/BrowserActionTool.ts similarity index 100% rename from src/core/tools/browserActionTool.ts rename to src/core/tools/BrowserActionTool.ts diff --git a/src/core/tools/__tests__/browserActionTool.coordinateScaling.spec.ts b/src/core/tools/__tests__/BrowserActionTool.coordinateScaling.spec.ts similarity index 100% rename from src/core/tools/__tests__/browserActionTool.coordinateScaling.spec.ts rename to src/core/tools/__tests__/BrowserActionTool.coordinateScaling.spec.ts From c9ce87479cec40f5295a91de55ea608b072539ba Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Wed, 19 Nov 2025 16:48:37 -0700 Subject: [PATCH 10/16] Fix browserActionTool call site --- src/core/assistant-message/presentAssistantMessage.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/core/assistant-message/presentAssistantMessage.ts b/src/core/assistant-message/presentAssistantMessage.ts index df0371cb7dd..171209f6fba 100644 --- a/src/core/assistant-message/presentAssistantMessage.ts +++ b/src/core/assistant-message/presentAssistantMessage.ts @@ -669,13 +669,14 @@ export async function presentAssistantMessage(cline: Task) { }) break case "browser_action": - await browserActionTool.handle(cline, block as ToolUse<"browser_action">, { + await browserActionTool( + cline, + block as ToolUse<"browser_action">, askApproval, handleError, pushToolResult, 
removeClosingTag, - toolProtocol, - }) + ) break case "execute_command": await executeCommandTool.handle(cline, block as ToolUse<"execute_command">, { From 791544f2164239b058fd59756f1ca6c51c3c73a6 Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Wed, 19 Nov 2025 17:09:38 -0700 Subject: [PATCH 11/16] Update system prompt snapshot --- .../__snapshots__/system-prompt/with-computer-use-support.snap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/prompts/__tests__/__snapshots__/system-prompt/with-computer-use-support.snap b/src/core/prompts/__tests__/__snapshots__/system-prompt/with-computer-use-support.snap index cea59da7f57..323aa0bdbe4 100644 --- a/src/core/prompts/__tests__/__snapshots__/system-prompt/with-computer-use-support.snap +++ b/src/core/prompts/__tests__/__snapshots__/system-prompt/with-computer-use-support.snap @@ -520,7 +520,7 @@ RULES - At the end of each user message, you will automatically receive environment_details. This information is not written by the user themselves, but is auto-generated to provide potentially relevant context about the project structure and environment. While this information can be valuable for understanding the project context, do not treat it as a direct part of the user's request or response. Use it to inform your actions and decisions, but don't assume the user is explicitly asking about or referring to this information unless they clearly do so in their message. When using environment_details, explain your actions clearly to ensure the user understands, as they may not be aware of these details. - Before executing commands, check the "Actively Running Terminals" section in environment_details. If present, consider how these active processes might impact your task. For example, if a local development server is already running, you wouldn't need to start it again. If no active terminals are listed, proceed with command execution as normal. 
- MCP operations should be used one at a time, similar to other tool usage. Wait for confirmation of success before proceeding with additional operations. -- It is critical you wait for the user's response after each tool use, in order to confirm the success of the tool use. For example, if asked to make a todo app, you would create a file, wait for the user's response it was created successfully, then create another file if needed, wait for the user's response it was created successfully, etc. +- It is critical you wait for the user's response after each tool use, in order to confirm the success of the tool use. For example, if asked to make a todo app, you would create a file, wait for the user's response it was created successfully, then create another file if needed, wait for the user's response it was created successfully, etc. Then if you want to test your work, you might use browser_action to launch the site, wait for the user's response confirming the site was launched along with a screenshot, then perhaps e.g., click a button to test functionality if needed, wait for the user's response confirming the button was clicked along with a screenshot of the new state, before finally closing the browser. ==== From ff44f0b45198e26df19def0dcace3f7f120bd14e Mon Sep 17 00:00:00 2001 From: daniel-lxs Date: Thu, 20 Nov 2025 13:51:09 -0500 Subject: [PATCH 12/16] fix(browser_action): align tool definition with implementation - Change coordinate from object to string format 'x,y@WIDTHxHEIGHT' - Change size from object to string format 'WIDTHxHEIGHT' - Update required params to only include 'action' (others are conditionally required) - Add missing 'press' action to enum - Update text parameter description to clarify usage for both type and press actions Fixes issue where assistant was correctly following schema but receiving 'Missing value for required parameter' errors. 
--- .../tools/native-tools/browser_action.ts | 42 +++++-------------- 1 file changed, 10 insertions(+), 32 deletions(-) diff --git a/src/core/prompts/tools/native-tools/browser_action.ts b/src/core/prompts/tools/native-tools/browser_action.ts index 6f5df50a0c3..64977780b7a 100644 --- a/src/core/prompts/tools/native-tools/browser_action.ts +++ b/src/core/prompts/tools/native-tools/browser_action.ts @@ -5,7 +5,7 @@ export default { function: { name: "browser_action", description: - "Interact with a Puppeteer-controlled browser session. Always start by launching at a URL and always finish by closing the browser. While the browser is active, do not call any other tools. Use coordinates within the viewport to hover or click, provide text for typing, and ensure actions are grounded in the latest screenshot and console logs.", + "Interact with a browser session. Always start by launching at a URL and always finish by closing the browser. While the browser is active, do not call any other tools. 
Use coordinates within the viewport to hover or click, provide text for typing, and ensure actions are grounded in the latest screenshot and console logs.", strict: true, parameters: { type: "object", @@ -13,51 +13,29 @@ export default { action: { type: "string", description: "Browser action to perform", - enum: ["launch", "hover", "click", "type", "resize", "scroll_down", "scroll_up", "close"], + enum: ["launch", "click", "hover", "type", "press", "scroll_down", "scroll_up", "resize", "close"], }, url: { type: ["string", "null"], description: "URL to open when performing the launch action; must include protocol", }, coordinate: { - type: ["object", "null"], + type: ["string", "null"], description: - "Screen coordinate for hover or click actions; target the center of the desired element", - properties: { - x: { - type: "number", - description: "Horizontal pixel position within the current viewport", - }, - y: { - type: "number", - description: "Vertical pixel position within the current viewport", - }, - }, - required: ["x", "y"], - additionalProperties: false, + "Screen coordinate for hover or click actions in format 'x,y@WIDTHxHEIGHT' where x,y is the target position on the screenshot image and WIDTHxHEIGHT is the exact pixel dimensions of the screenshot image (not the browser viewport). Example: '450,203@900x600' means click at (450,203) on a 900x600 screenshot. The coordinates will be automatically scaled to match the actual viewport dimensions.", }, size: { - type: ["object", "null"], - description: "Viewport dimensions to apply when performing the resize action", - properties: { - width: { - type: "number", - description: "Viewport width in pixels", - }, - height: { - type: "number", - description: "Viewport height in pixels", - }, - }, - required: ["width", "height"], - additionalProperties: false, + type: ["string", "null"], + description: + "Viewport dimensions for the resize action in format 'WIDTHxHEIGHT' or 'WIDTH,HEIGHT'. 
Example: '1280x800' or '1280,800'", }, text: { type: ["string", "null"], - description: "Text to type when performing the type action", + description: + "Text to type when performing the type action, or key name to press when performing the press action (e.g., 'Enter', 'Tab', 'Escape')", }, }, - required: ["action", "url", "coordinate", "size", "text"], + required: ["action"], additionalProperties: false, }, }, From 8a289d9f39b6a2b8d9f1f2d8c9be5961c0331893 Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Thu, 20 Nov 2025 14:14:27 -0700 Subject: [PATCH 13/16] Corrected location of the browser session indicator in the task header. --- webview-ui/src/components/chat/TaskHeader.tsx | 165 +++++++++--------- 1 file changed, 85 insertions(+), 80 deletions(-) diff --git a/webview-ui/src/components/chat/TaskHeader.tsx b/webview-ui/src/components/chat/TaskHeader.tsx index 6c9ac4fcc64..de499b9aade 100644 --- a/webview-ui/src/components/chat/TaskHeader.tsx +++ b/webview-ui/src/components/chat/TaskHeader.tsx @@ -196,53 +196,93 @@ const TaskHeader = ({
{!isTaskExpanded && contextWindow > 0 && (
e.stopPropagation()}> - - -
- {t("chat:tokenProgress.tokensUsed", { - used: formatLargeNumber(contextTokens || 0), - total: formatLargeNumber(contextWindow), - })} -
- {(() => { - const maxTokens = model - ? getModelMaxOutputTokens({ modelId, model, settings: apiConfiguration }) - : 0 - const reservedForOutput = maxTokens || 0 - const availableSpace = contextWindow - (contextTokens || 0) - reservedForOutput +
+ + +
+ {t("chat:tokenProgress.tokensUsed", { + used: formatLargeNumber(contextTokens || 0), + total: formatLargeNumber(contextWindow), + })} +
+ {(() => { + const maxTokens = model + ? getModelMaxOutputTokens({ + modelId, + model, + settings: apiConfiguration, + }) + : 0 + const reservedForOutput = maxTokens || 0 + const availableSpace = + contextWindow - (contextTokens || 0) - reservedForOutput - return ( - <> - {reservedForOutput > 0 && ( -
- {t("chat:tokenProgress.reservedForResponse", { - amount: formatLargeNumber(reservedForOutput), - })} -
- )} - {availableSpace > 0 && ( -
- {t("chat:tokenProgress.availableSpace", { - amount: formatLargeNumber(availableSpace), - })} -
- )} - - ) - })()} -
- } - side="top" - sideOffset={8}> - - {formatLargeNumber(contextTokens || 0)} / {formatLargeNumber(contextWindow)} - -
- {!!totalCost && ${totalCost.toFixed(2)}} + return ( + <> + {reservedForOutput > 0 && ( +
+ {t("chat:tokenProgress.reservedForResponse", { + amount: formatLargeNumber(reservedForOutput), + })} +
+ )} + {availableSpace > 0 && ( +
+ {t("chat:tokenProgress.availableSpace", { + amount: formatLargeNumber(availableSpace), + })} +
+ )} + + ) + })()} +
+ } + side="top" + sideOffset={8}> + + {formatLargeNumber(contextTokens || 0)} / {formatLargeNumber(contextWindow)} + + + {!!totalCost && ${totalCost.toFixed(2)}} + + {showBrowserGlobe && ( +
e.stopPropagation()}> + + + + {isBrowserSessionActive && ( + + Active + + )} +
+ )} )} {/* Expanded state: Show task text and images */} @@ -369,41 +409,6 @@ const TaskHeader = ({ )} {/* Todo list - always shown at bottom when todos exist */} {hasTodos && } - - {/* Browser session status moved from bottom bar to header (bottom-right) */} - {showBrowserGlobe && ( -
e.stopPropagation()}> - - - - {isBrowserSessionActive && ( - - Active - - )} -
- )} From 20bdf5379cb67ab66febb5d256912c57db3ad7b0 Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Fri, 21 Nov 2025 11:43:23 -0700 Subject: [PATCH 14/16] refactor: Restore browser action title in all locales and update component to use it --- .../__tests__/getEnvironmentDetails.spec.ts | 6 +- src/core/environment/getEnvironmentDetails.ts | 49 +++++----- src/core/tools/BrowserActionTool.ts | 34 +------ ...rowserActionTool.coordinateScaling.spec.ts | 74 +-------------- src/core/webview/ClineProvider.ts | 28 ++++-- src/core/webview/webviewMessageHandler.ts | 3 +- src/shared/browserUtils.ts | 95 +++++++++++++++++++ .../browser-session/BrowserSessionPanel.tsx | 43 ++++----- .../src/components/chat/BrowserActionRow.tsx | 75 ++------------- .../src/components/chat/BrowserSessionRow.tsx | 75 +-------------- webview-ui/src/components/chat/ChatView.tsx | 31 +----- webview-ui/src/i18n/locales/ca/chat.json | 2 +- webview-ui/src/i18n/locales/en/chat.json | 1 + webview-ui/src/i18n/locales/es/chat.json | 2 +- webview-ui/src/i18n/locales/fr/chat.json | 2 +- webview-ui/src/i18n/locales/id/chat.json | 2 +- webview-ui/src/i18n/locales/ja/chat.json | 2 +- webview-ui/src/i18n/locales/nl/chat.json | 2 +- webview-ui/src/i18n/locales/ru/chat.json | 2 +- webview-ui/src/i18n/locales/tr/chat.json | 2 +- webview-ui/src/i18n/locales/zh-TW/chat.json | 2 +- 21 files changed, 188 insertions(+), 344 deletions(-) create mode 100644 src/shared/browserUtils.ts diff --git a/src/core/environment/__tests__/getEnvironmentDetails.spec.ts b/src/core/environment/__tests__/getEnvironmentDetails.spec.ts index 0d6a4da591f..ef6d0513d47 100644 --- a/src/core/environment/__tests__/getEnvironmentDetails.spec.ts +++ b/src/core/environment/__tests__/getEnvironmentDetails.spec.ts @@ -120,6 +120,7 @@ describe("getEnvironmentDetails", () => { } as unknown as WeakRef, browserSession: { isSessionActive: vi.fn().mockReturnValue(false), + getViewportSize: vi.fn().mockReturnValue({ width: 900, height: 600 }), } as any, 
} @@ -459,10 +460,9 @@ describe("getEnvironmentDetails", () => { expect(getGitStatus).toHaveBeenCalledWith(mockCwd, 5) }) - it("should include Browser Session Status when inactive", async () => { + it("should NOT include Browser Session Status when inactive", async () => { const result = await getEnvironmentDetails(mockCline as Task) - expect(result).toContain("# Browser Session Status") - expect(result).toContain("Inactive - Browser is not launched") + expect(result).not.toContain("# Browser Session Status") }) it("should include Browser Session Status with current viewport when active", async () => { diff --git a/src/core/environment/getEnvironmentDetails.ts b/src/core/environment/getEnvironmentDetails.ts index 4c529e65e10..e42db79d402 100644 --- a/src/core/environment/getEnvironmentDetails.ts +++ b/src/core/environment/getEnvironmentDetails.ts @@ -248,37 +248,34 @@ export async function getEnvironmentDetails(cline: Task, includeFileDetails: boo } } - // Add browser session status - Always show to prevent LLM from trying browser actions when no session is active + // Add browser session status - Only show when active to prevent cluttering context const isBrowserActive = cline.browserSession.isSessionActive() - // Build viewport info for status (prefer actual viewport if available, else fallback to configured setting) - const configuredViewport = (state?.browserViewportSize as string | undefined) ?? "900x600" - let configuredWidth: number | undefined - let configuredHeight: number | undefined - if (configuredViewport.includes("x")) { - const parts = configuredViewport.split("x").map((v) => Number(v)) - configuredWidth = parts[0] - configuredHeight = parts[1] - } + if (isBrowserActive) { + // Build viewport info for status (prefer actual viewport if available, else fallback to configured setting) + const configuredViewport = (state?.browserViewportSize as string | undefined) ?? 
"900x600" + let configuredWidth: number | undefined + let configuredHeight: number | undefined + if (configuredViewport.includes("x")) { + const parts = configuredViewport.split("x").map((v) => Number(v)) + configuredWidth = parts[0] + configuredHeight = parts[1] + } - let actualWidth: number | undefined - let actualHeight: number | undefined - // Use optional chaining to avoid issues with tests that stub browserSession - const vp = isBrowserActive ? (cline.browserSession as any).getViewportSize?.() : undefined - if (vp) { - actualWidth = vp.width - actualHeight = vp.height - } + let actualWidth: number | undefined + let actualHeight: number | undefined + const vp = cline.browserSession.getViewportSize?.() + if (vp) { + actualWidth = vp.width + actualHeight = vp.height + } - const width = actualWidth ?? configuredWidth - const height = actualHeight ?? configuredHeight - const viewportInfo = isBrowserActive && width && height ? `\nCurrent viewport size: ${width}x${height} pixels.` : "" + const width = actualWidth ?? configuredWidth + const height = actualHeight ?? configuredHeight + const viewportInfo = width && height ? `\nCurrent viewport size: ${width}x${height} pixels.` : "" - details += `\n# Browser Session Status\n${ - isBrowserActive - ? "Active - A browser session is currently open and ready for browser_action commands" - : "Inactive - Browser is not launched. Using any browser action except the browser_action with action='launch' to start a new session will result in an error." 
- }${viewportInfo}\n` + details += `\n# Browser Session Status\nActive - A browser session is currently open and ready for browser_action commands${viewportInfo}\n` + } if (includeFileDetails) { details += `\n\n# Current Workspace Directory (${cline.cwd.toPosix()}) Files\n` diff --git a/src/core/tools/BrowserActionTool.ts b/src/core/tools/BrowserActionTool.ts index 0ff6fd04a48..b9afae2fdb1 100644 --- a/src/core/tools/BrowserActionTool.ts +++ b/src/core/tools/BrowserActionTool.ts @@ -8,39 +8,7 @@ import { } from "../../shared/ExtensionMessage" import { formatResponse } from "../prompts/responses" import { Anthropic } from "@anthropic-ai/sdk" - -/** - * Parses coordinate string and scales from image dimensions to viewport dimensions - * The LLM examines the screenshot it receives (which may be downscaled by the API) - * and reports coordinates in format: "x,y@widthxheight" where widthxheight is what the LLM observed - * - * Format: "x,y@widthxheight" (required) - * Returns: scaled coordinate string "x,y" in viewport coordinates - * Throws: Error if format is invalid or missing image dimensions - */ -function scaleCoordinate(coordinate: string, viewportWidth: number, viewportHeight: number): string { - // Parse coordinate with required image dimensions (accepts both 'x' and ',' as dimension separators) - const match = coordinate.match(/^\s*(\d+)\s*,\s*(\d+)\s*@\s*(\d+)\s*[x,]\s*(\d+)\s*$/) - - if (!match) { - throw new Error( - `Invalid coordinate format: "${coordinate}". 
` + - `Expected format: "x,y@widthxheight" (e.g., "450,300@1024x768")`, - ) - } - - const [, xStr, yStr, imgWidthStr, imgHeightStr] = match - const x = parseInt(xStr, 10) - const y = parseInt(yStr, 10) - const imgWidth = parseInt(imgWidthStr, 10) - const imgHeight = parseInt(imgHeightStr, 10) - - // Scale coordinates from image dimensions to viewport dimensions - const scaledX = Math.round((x / imgWidth) * viewportWidth) - const scaledY = Math.round((y / imgHeight) * viewportHeight) - - return `${scaledX},${scaledY}` -} +import { scaleCoordinate } from "../../shared/browserUtils" export async function browserActionTool( cline: Task, diff --git a/src/core/tools/__tests__/BrowserActionTool.coordinateScaling.spec.ts b/src/core/tools/__tests__/BrowserActionTool.coordinateScaling.spec.ts index 08604026745..4294fff4d3a 100644 --- a/src/core/tools/__tests__/BrowserActionTool.coordinateScaling.spec.ts +++ b/src/core/tools/__tests__/BrowserActionTool.coordinateScaling.spec.ts @@ -1,9 +1,6 @@ // Test coordinate scaling functionality in browser actions -import { describe, it, expect, vi, beforeEach } from "vitest" - -// Mock the scaleCoordinate function by extracting it -// In a real scenario, we'd export it or test through the main function -// For now, we'll test the regex pattern and logic +import { describe, it, expect } from "vitest" +import { scaleCoordinate } from "../../../shared/browserUtils" describe("Browser Action Coordinate Scaling", () => { describe("Coordinate format validation", () => { @@ -18,10 +15,9 @@ describe("Browser Action Coordinate Scaling", () => { "450,300@1024,768", // comma separator for dimensions ] - const regex = /^\s*(\d+)\s*,\s*(\d+)\s*@\s*(\d+)\s*[x,]\s*(\d+)\s*$/ - validFormats.forEach((coord) => { - expect(coord).toMatch(regex) + // Should not throw + expect(() => scaleCoordinate(coord, 900, 600)).not.toThrow() }) }) @@ -39,35 +35,14 @@ describe("Browser Action Coordinate Scaling", () => { "450,300@axb", // non-numeric dimensions ] - const 
regex = /^\s*(\d+)\s*,\s*(\d+)\s*@\s*(\d+)\s*[x,]\s*(\d+)\s*$/ - invalidFormats.forEach((coord) => { - expect(coord).not.toMatch(regex) + expect(() => scaleCoordinate(coord, 900, 600)).toThrow() }) }) }) describe("Coordinate scaling logic", () => { it("should correctly scale coordinates from image to viewport", () => { - // Simulate the scaling logic - const scaleCoordinate = (coordinate: string, viewportWidth: number, viewportHeight: number): string => { - const match = coordinate.match(/^\s*(\d+)\s*,\s*(\d+)\s*@\s*(\d+)\s*[x,]\s*(\d+)\s*$/) - if (!match) { - throw new Error(`Invalid coordinate format: "${coordinate}"`) - } - - const [, xStr, yStr, imgWidthStr, imgHeightStr] = match - const x = parseInt(xStr, 10) - const y = parseInt(yStr, 10) - const imgWidth = parseInt(imgWidthStr, 10) - const imgHeight = parseInt(imgHeightStr, 10) - - const scaledX = Math.round((x / imgWidth) * viewportWidth) - const scaledY = Math.round((y / imgHeight) * viewportHeight) - - return `${scaledX},${scaledY}` - } - // Test case 1: Same dimensions (no scaling) expect(scaleCoordinate("450,300@900x600", 900, 600)).toBe("450,300") @@ -88,27 +63,6 @@ describe("Browser Action Coordinate Scaling", () => { }) it("should throw error for invalid coordinate format", () => { - const scaleCoordinate = (coordinate: string, viewportWidth: number, viewportHeight: number): string => { - const match = coordinate.match(/^\s*(\d+)\s*,\s*(\d+)\s*@\s*(\d+)\s*[x,]\s*(\d+)\s*$/) - if (!match) { - throw new Error( - `Invalid coordinate format: "${coordinate}". 
` + - `Expected format: "x,y@widthxheight" (e.g., "450,300@1024x768")`, - ) - } - - const [, xStr, yStr, imgWidthStr, imgHeightStr] = match - const x = parseInt(xStr, 10) - const y = parseInt(yStr, 10) - const imgWidth = parseInt(imgWidthStr, 10) - const imgHeight = parseInt(imgHeightStr, 10) - - const scaledX = Math.round((x / imgWidth) * viewportWidth) - const scaledY = Math.round((y / imgHeight) * viewportHeight) - - return `${scaledX},${scaledY}` - } - // Test invalid formats expect(() => scaleCoordinate("450,300", 900, 600)).toThrow("Invalid coordinate format") expect(() => scaleCoordinate("450,300@1024", 900, 600)).toThrow("Invalid coordinate format") @@ -116,24 +70,6 @@ describe("Browser Action Coordinate Scaling", () => { }) it("should handle rounding correctly", () => { - const scaleCoordinate = (coordinate: string, viewportWidth: number, viewportHeight: number): string => { - const match = coordinate.match(/^\s*(\d+)\s*,\s*(\d+)\s*@\s*(\d+)\s*[x,]\s*(\d+)\s*$/) - if (!match) { - throw new Error(`Invalid coordinate format: "${coordinate}"`) - } - - const [, xStr, yStr, imgWidthStr, imgHeightStr] = match - const x = parseInt(xStr, 10) - const y = parseInt(yStr, 10) - const imgWidth = parseInt(imgWidthStr, 10) - const imgHeight = parseInt(imgHeightStr, 10) - - const scaledX = Math.round((x / imgWidth) * viewportWidth) - const scaledY = Math.round((y / imgHeight) * viewportHeight) - - return `${scaledX},${scaledY}` - } - // Test rounding behavior // 333 / 1000 * 900 = 299.7 -> rounds to 300 expect(scaleCoordinate("333,333@1000x1000", 900, 900)).toBe("300,300") diff --git a/src/core/webview/ClineProvider.ts b/src/core/webview/ClineProvider.ts index 9a10387dc6b..8f6c8c7631d 100644 --- a/src/core/webview/ClineProvider.ts +++ b/src/core/webview/ClineProvider.ts @@ -1925,6 +1925,7 @@ export class ClineProvider openRouterImageGenerationSelectedModel, openRouterUseMiddleOutTransform, featureRoomoteControlEnabled, + isBrowserSessionActive, } = await this.getState() 
let cloudOrganizations: CloudOrganizationMembership[] = [] @@ -1974,7 +1975,7 @@ export class ClineProvider alwaysAllowModeSwitch: alwaysAllowModeSwitch ?? false, alwaysAllowSubtasks: alwaysAllowSubtasks ?? false, alwaysAllowUpdateTodoList: alwaysAllowUpdateTodoList ?? false, - isBrowserSessionActive: this.getCurrentTask()?.browserSession?.isSessionActive() ?? false, + isBrowserSessionActive, allowedMaxRequests, allowedMaxCost, autoCondenseContext: autoCondenseContext ?? true, @@ -2125,10 +2126,13 @@ export class ClineProvider providerSettings.apiProvider = apiProvider } + const cloudService = CloudService.hasInstance() ? CloudService.instance : undefined let organizationAllowList = ORGANIZATION_ALLOW_ALL try { - organizationAllowList = await CloudService.instance.getAllowList() + if (cloudService) { + organizationAllowList = await cloudService.getAllowList() + } } catch (error) { console.error( `[getState] failed to get organization allow list: ${error instanceof Error ? error.message : String(error)}`, @@ -2138,7 +2142,9 @@ export class ClineProvider let cloudUserInfo: CloudUserInfo | null = null try { - cloudUserInfo = CloudService.instance.getUserInfo() + if (cloudService) { + cloudUserInfo = cloudService.getUserInfo() + } } catch (error) { console.error( `[getState] failed to get cloud user info: ${error instanceof Error ? error.message : String(error)}`, @@ -2148,7 +2154,9 @@ export class ClineProvider let cloudIsAuthenticated: boolean = false try { - cloudIsAuthenticated = CloudService.instance.isAuthenticated() + if (cloudService) { + cloudIsAuthenticated = cloudService.isAuthenticated() + } } catch (error) { console.error( `[getState] failed to get cloud authentication state: ${error instanceof Error ? 
error.message : String(error)}`, @@ -2158,7 +2166,9 @@ export class ClineProvider let sharingEnabled: boolean = false try { - sharingEnabled = await CloudService.instance.canShareTask() + if (cloudService) { + sharingEnabled = await cloudService.canShareTask() + } } catch (error) { console.error( `[getState] failed to get sharing enabled state: ${error instanceof Error ? error.message : String(error)}`, @@ -2168,8 +2178,8 @@ export class ClineProvider let organizationSettingsVersion: number = -1 try { - if (CloudService.hasInstance()) { - const settings = CloudService.instance.getOrganizationSettings() + if (cloudService) { + const settings = cloudService.getOrganizationSettings() organizationSettingsVersion = settings?.version ?? -1 } } catch (error) { @@ -2181,7 +2191,9 @@ export class ClineProvider let taskSyncEnabled: boolean = false try { - taskSyncEnabled = CloudService.instance.isTaskSyncEnabled() + if (cloudService) { + taskSyncEnabled = cloudService.isTaskSyncEnabled() + } } catch (error) { console.error( `[getState] failed to get task sync enabled state: ${error instanceof Error ? 
error.message : String(error)}`, diff --git a/src/core/webview/webviewMessageHandler.ts b/src/core/webview/webviewMessageHandler.ts index 5f7c4fcc3f1..2a8d225f0e6 100644 --- a/src/core/webview/webviewMessageHandler.ts +++ b/src/core/webview/webviewMessageHandler.ts @@ -2139,8 +2139,7 @@ export const webviewMessageHandler = async ( provider.postMessageToWebview({ type: "importModeResult", success: true, - // Cast to any to support older ImportResult types that may not declare slug - slug: (result as any)?.slug, + slug: result.slug, }) // Show success message diff --git a/src/shared/browserUtils.ts b/src/shared/browserUtils.ts new file mode 100644 index 00000000000..4e071121c1b --- /dev/null +++ b/src/shared/browserUtils.ts @@ -0,0 +1,95 @@ +/** + * Parses coordinate string and scales from image dimensions to viewport dimensions + * The LLM examines the screenshot it receives (which may be downscaled by the API) + * and reports coordinates in format: "x,y@widthxheight" where widthxheight is what the LLM observed + * + * Format: "x,y@widthxheight" (required) + * Returns: scaled coordinate string "x,y" in viewport coordinates + * Throws: Error if format is invalid or missing image dimensions + */ +export function scaleCoordinate(coordinate: string, viewportWidth: number, viewportHeight: number): string { + // Parse coordinate with required image dimensions (accepts both 'x' and ',' as dimension separators) + const match = coordinate.match(/^\s*(\d+)\s*,\s*(\d+)\s*@\s*(\d+)\s*[x,]\s*(\d+)\s*$/) + + if (!match) { + throw new Error( + `Invalid coordinate format: "${coordinate}". 
` + + `Expected format: "x,y@widthxheight" (e.g., "450,300@1024x768")`, + ) + } + + const [, xStr, yStr, imgWidthStr, imgHeightStr] = match + const x = parseInt(xStr, 10) + const y = parseInt(yStr, 10) + const imgWidth = parseInt(imgWidthStr, 10) + const imgHeight = parseInt(imgHeightStr, 10) + + // Scale coordinates from image dimensions to viewport dimensions + const scaledX = Math.round((x / imgWidth) * viewportWidth) + const scaledY = Math.round((y / imgHeight) * viewportHeight) + + return `${scaledX},${scaledY}` +} + +/** + * Formats a key string into a more readable format (e.g., "Control+c" -> "Ctrl + C") + */ +export function prettyKey(k?: string): string { + if (!k) return "" + return k + .split("+") + .map((part) => { + const p = part.trim() + const lower = p.toLowerCase() + const map: Record = { + enter: "Enter", + tab: "Tab", + escape: "Esc", + esc: "Esc", + backspace: "Backspace", + space: "Space", + shift: "Shift", + control: "Ctrl", + ctrl: "Ctrl", + alt: "Alt", + meta: "Meta", + command: "Cmd", + cmd: "Cmd", + arrowup: "Arrow Up", + arrowdown: "Arrow Down", + arrowleft: "Arrow Left", + arrowright: "Arrow Right", + pageup: "Page Up", + pagedown: "Page Down", + home: "Home", + end: "End", + } + if (map[lower]) return map[lower] + const keyMatch = /^Key([A-Z])$/.exec(p) + if (keyMatch) return keyMatch[1].toUpperCase() + const digitMatch = /^Digit([0-9])$/.exec(p) + if (digitMatch) return digitMatch[1] + const spaced = p.replace(/([a-z])([A-Z])/g, "$1 $2") + return spaced.charAt(0).toUpperCase() + spaced.slice(1) + }) + .join(" + ") +} + +/** + * Wrapper around scaleCoordinate that handles failures gracefully by checking for simple coordinates + */ +export function getViewportCoordinate( + coord: string | undefined, + viewportWidth: number, + viewportHeight: number, +): string { + if (!coord) return "" + + try { + return scaleCoordinate(coord, viewportWidth, viewportHeight) + } catch (e) { + // Fallback to simple x,y parsing or return as is + const 
simpleMatch = /^\s*(\d+)\s*,\s*(\d+)/.exec(coord) + return simpleMatch ? `${simpleMatch[1]},${simpleMatch[2]}` : coord + } +} diff --git a/webview-ui/src/components/browser-session/BrowserSessionPanel.tsx b/webview-ui/src/components/browser-session/BrowserSessionPanel.tsx index 00f3e176b6c..fe88106ad27 100644 --- a/webview-ui/src/components/browser-session/BrowserSessionPanel.tsx +++ b/webview-ui/src/components/browser-session/BrowserSessionPanel.tsx @@ -1,4 +1,4 @@ -import React, { useEffect, useState, useCallback } from "react" +import React, { useEffect, useState } from "react" import { type ClineMessage } from "@roo-code/types" import BrowserSessionRow from "../chat/BrowserSessionRow" import { TooltipProvider } from "@src/components/ui/tooltip" @@ -51,32 +51,25 @@ const BrowserSessionPanelContent: React.FC = () => { } }, []) - const handleHeightChange = useCallback(() => { - // No-op for panel - no scrolling needed - }, []) - return (
-
- expandedRows[messageTs] ?? false} - onToggleExpand={(messageTs: number) => { - setExpandedRows((prev: Record) => ({ - ...prev, - [messageTs]: !prev[messageTs], - })) - }} - fullScreen={true} - browserViewportSizeProp={browserViewportSize} - isBrowserSessionActiveProp={isBrowserSessionActive} - navigateToPageIndex={navigateToStepIndex} - /> -
+ expandedRows[messageTs] ?? false} + onToggleExpand={(messageTs: number) => { + setExpandedRows((prev: Record) => ({ + ...prev, + [messageTs]: !prev[messageTs], + })) + }} + fullScreen={true} + browserViewportSizeProp={browserViewportSize} + isBrowserSessionActiveProp={isBrowserSessionActive} + navigateToPageIndex={navigateToStepIndex} + />
) } diff --git a/webview-ui/src/components/chat/BrowserActionRow.tsx b/webview-ui/src/components/chat/BrowserActionRow.tsx index 0727e0c765f..4eecc284ae0 100644 --- a/webview-ui/src/components/chat/BrowserActionRow.tsx +++ b/webview-ui/src/components/chat/BrowserActionRow.tsx @@ -2,6 +2,7 @@ import { memo, useMemo, useEffect, useRef } from "react" import { ClineMessage } from "@roo-code/types" import { ClineSayBrowserAction } from "@roo/ExtensionMessage" import { vscode } from "@src/utils/vscode" +import { getViewportCoordinate as getViewportCoordinateShared, prettyKey } from "@roo/browserUtils" import { MousePointer as MousePointerIcon, Keyboard, @@ -13,47 +14,7 @@ import { Maximize2, } from "lucide-react" import { useExtensionState } from "@src/context/ExtensionStateContext" - -const prettyKey = (k?: string): string => { - if (!k) return "" - return k - .split("+") - .map((part) => { - const p = part.trim() - const lower = p.toLowerCase() - const map: Record = { - enter: "Enter", - tab: "Tab", - escape: "Esc", - esc: "Esc", - backspace: "Backspace", - space: "Space", - shift: "Shift", - control: "Ctrl", - ctrl: "Ctrl", - alt: "Alt", - meta: "Meta", - command: "Cmd", - cmd: "Cmd", - arrowup: "Arrow Up", - arrowdown: "Arrow Down", - arrowleft: "Arrow Left", - arrowright: "Arrow Right", - pageup: "Page Up", - pagedown: "Page Down", - home: "Home", - end: "End", - } - if (map[lower]) return map[lower] - const keyMatch = /^Key([A-Z])$/.exec(p) - if (keyMatch) return keyMatch[1].toUpperCase() - const digitMatch = /^Digit([0-9])$/.exec(p) - if (digitMatch) return digitMatch[1] - const spaced = p.replace(/([a-z])([A-Z])/g, "$1 $2") - return spaced.charAt(0).toUpperCase() + spaced.slice(1) - }) - .join(" + ") -} +import { useTranslation } from "react-i18next" interface BrowserActionRowProps { message: ClineMessage @@ -87,6 +48,7 @@ const getActionIcon = (action: string) => { } const BrowserActionRow = memo(({ message, nextMessage, actionIndex, totalActions }: 
BrowserActionRowProps) => { + const { t } = useTranslation() const { isBrowserSessionActive } = useExtensionState() const hasHandledAutoOpenRef = useRef(false) @@ -119,33 +81,8 @@ const BrowserActionRow = memo(({ message, nextMessage, actionIndex, totalActions // Helper to scale coordinates from screenshot dimensions to viewport dimensions // Matches the backend's scaleCoordinate function logic - const getViewportCoordinate = (coord?: string): string => { - if (!coord) return "" - - // Parse "x,y@widthxheight" format - const match = /^\s*(\d+)\s*,\s*(\d+)\s*@\s*(\d+)\s*[x,]\s*(\d+)\s*$/.exec(coord) - if (!match) { - // If no @dimensions, return as-is (might be plain x,y format) - const simpleMatch = /^\s*(\d+)\s*,\s*(\d+)/.exec(coord) - return simpleMatch ? `${simpleMatch[1]},${simpleMatch[2]}` : coord - } - - const x = parseInt(match[1], 10) - const y = parseInt(match[2], 10) - const imgWidth = parseInt(match[3], 10) - const imgHeight = parseInt(match[4], 10) - - // If we don't have viewport dimensions, just return the screenshot coordinates - if (!viewportDimensions?.width || !viewportDimensions?.height) { - return `${x},${y}` - } - - // Scale coordinates from image dimensions to viewport dimensions (same as backend) - const scaledX = Math.round((x / imgWidth) * viewportDimensions.width) - const scaledY = Math.round((y / imgHeight) * viewportDimensions.height) - - return `${scaledX},${scaledY}` - } + const getViewportCoordinate = (coord?: string): string => + getViewportCoordinateShared(coord, viewportDimensions?.width ?? 0, viewportDimensions?.height ?? 
0) switch (browserAction.action) { case "launch": @@ -224,7 +161,7 @@ const BrowserActionRow = memo(({ message, nextMessage, actionIndex, totalActions className="codicon codicon-globe text-vscode-testing-iconPassed shrink-0" style={{ marginBottom: "-1.5px" }} /> - Browser Action + {t("chat:browser.actions.title")} {actionIndex !== undefined && totalActions !== undefined && ( {" "} diff --git a/webview-ui/src/components/chat/BrowserSessionRow.tsx b/webview-ui/src/components/chat/BrowserSessionRow.tsx index 37e323242a3..8fc23c6d0b2 100644 --- a/webview-ui/src/components/chat/BrowserSessionRow.tsx +++ b/webview-ui/src/components/chat/BrowserSessionRow.tsx @@ -11,6 +11,7 @@ import { useExtensionState } from "@src/context/ExtensionStateContext" import CodeBlock from "../common/CodeBlock" import { ProgressIndicator } from "./ProgressIndicator" import { Button, StandardTooltip } from "@src/components/ui" +import { getViewportCoordinate as getViewportCoordinateShared, prettyKey } from "@roo/browserUtils" import { Globe, Pointer, @@ -31,47 +32,6 @@ import { Copy, } from "lucide-react" -const prettyKey = (k?: string): string => { - if (!k) return "" - return k - .split("+") - .map((part) => { - const p = part.trim() - const lower = p.toLowerCase() - const map: Record = { - enter: "Enter", - tab: "Tab", - escape: "Esc", - esc: "Esc", - backspace: "Backspace", - space: "Space", - shift: "Shift", - control: "Ctrl", - ctrl: "Ctrl", - alt: "Alt", - meta: "Meta", - command: "Cmd", - cmd: "Cmd", - arrowup: "Arrow Up", - arrowdown: "Arrow Down", - arrowleft: "Arrow Left", - arrowright: "Arrow Right", - pageup: "Page Up", - pagedown: "Page Down", - home: "Home", - end: "End", - } - if (map[lower]) return map[lower] - const keyMatch = /^Key([A-Z])$/.exec(p) - if (keyMatch) return keyMatch[1].toUpperCase() - const digitMatch = /^Digit([0-9])$/.exec(p) - if (digitMatch) return digitMatch[1] - const spaced = p.replace(/([a-z])([A-Z])/g, "$1 $2") - return spaced.charAt(0).toUpperCase() + 
spaced.slice(1) - }) - .join(" + ") -} - const getBrowserActionText = ( action: BrowserAction, executedCoordinate?: string, @@ -83,33 +43,8 @@ const getBrowserActionText = ( ) => { // Helper to scale coordinates from screenshot dimensions to viewport dimensions // Matches the backend's scaleCoordinate function logic - const getViewportCoordinate = (coord?: string): string => { - if (!coord) return "" - - // Parse "x,y@widthxheight" format - const match = /^\s*(\d+)\s*,\s*(\d+)\s*@\s*(\d+)\s*[x,]\s*(\d+)\s*$/.exec(coord) - if (!match) { - // If no @dimensions, return as-is (might be plain x,y format) - const simpleMatch = /^\s*(\d+)\s*,\s*(\d+)/.exec(coord) - return simpleMatch ? `${simpleMatch[1]},${simpleMatch[2]}` : coord - } - - const x = parseInt(match[1], 10) - const y = parseInt(match[2], 10) - const imgWidth = parseInt(match[3], 10) - const imgHeight = parseInt(match[4], 10) - - // If we don't have viewport dimensions, just return the screenshot coordinates - if (!viewportWidth || !viewportHeight) { - return `${x},${y}` - } - - // Scale coordinates from image dimensions to viewport dimensions (same as backend) - const scaledX = Math.round((x / imgWidth) * viewportWidth) - const scaledY = Math.round((y / imgHeight) * viewportHeight) - - return `${scaledX},${scaledY}` - } + const getViewportCoordinate = (coord?: string): string => + getViewportCoordinateShared(coord, viewportWidth ?? 0, viewportHeight ?? 
0) switch (action) { case "launch": @@ -164,7 +99,7 @@ interface BrowserSessionRowProps { onToggleExpand: (messageTs: number) => void lastModifiedMessage?: ClineMessage isLast: boolean - onHeightChange: (isTaller: boolean) => void + onHeightChange?: (isTaller: boolean) => void isStreaming: boolean onExpandChange?: (expanded: boolean) => void fullScreen?: boolean @@ -1159,7 +1094,7 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { const isInitialRender = prevHeightRef.current === 0 if (isLast && rowHeight !== 0 && rowHeight !== Infinity && rowHeight !== prevHeightRef.current) { if (!isInitialRender) { - onHeightChange(rowHeight > prevHeightRef.current) + onHeightChange?.(rowHeight > prevHeightRef.current) } prevHeightRef.current = rowHeight } diff --git a/webview-ui/src/components/chat/ChatView.tsx b/webview-ui/src/components/chat/ChatView.tsx index 3b53e016841..e09cdc557a3 100644 --- a/webview-ui/src/components/chat/ChatView.tsx +++ b/webview-ui/src/components/chat/ChatView.tsx @@ -37,7 +37,6 @@ import TelemetryBanner from "../common/TelemetryBanner" import VersionIndicator from "../common/VersionIndicator" import HistoryPreview from "../history/HistoryPreview" import Announcement from "./Announcement" -import BrowserSessionRow from "./BrowserSessionRow" import BrowserActionRow from "./BrowserActionRow" import BrowserSessionStatusRow from "./BrowserSessionStatusRow" import ChatRow from "./ChatRow" @@ -1161,34 +1160,8 @@ const ChatViewComponent: React.ForwardRefRenderFunction { - // Mark that user has responded - userRespondedRef.current = true - }, []) - const itemContent = useCallback( - (index: number, messageOrGroup: ClineMessage | ClineMessage[]) => { - // browser session group - this should never be called now since we don't group messages - if (Array.isArray(messageOrGroup)) { - return ( - expandedRows[messageTs] ?? 
false} - onToggleExpand={(messageTs: number) => { - setExpandedRows((prev: Record) => ({ - ...prev, - [messageTs]: !prev[messageTs], - })) - }} - /> - ) - } - + (index: number, messageOrGroup: ClineMessage) => { const hasCheckpoint = modifiedMessages.some((message) => message.say === "checkpoint_saved") // Check if this is a browser action message @@ -1232,7 +1205,6 @@ const ChatViewComponent: React.ForwardRefRenderFunction Date: Fri, 21 Nov 2025 13:46:16 -0700 Subject: [PATCH 15/16] Fixed issue where coordinates were incorrect after resize event. --- src/services/browser/BrowserSession.ts | 29 +++++++++- .../browser/__tests__/BrowserSession.spec.ts | 57 +++++++++++++++++++ 2 files changed, 83 insertions(+), 3 deletions(-) diff --git a/src/services/browser/BrowserSession.ts b/src/services/browser/BrowserSession.ts index d76f6768c4e..98f6f85e037 100644 --- a/src/services/browser/BrowserSession.ts +++ b/src/services/browser/BrowserSession.ts @@ -828,11 +828,34 @@ export class BrowserSession { /** * Returns the last known viewport size (if any) + * + * Prefer the live page viewport when available so we stay accurate after: + * - browser_action resize + * - manual window resizes (especially with remote browsers) + * + * Falls back to the configured default viewport when no prior information exists. */ getViewportSize(): { width?: number; height?: number } { - return { - width: this.lastViewportWidth, - height: this.lastViewportHeight, + // If we have an active page, ask Puppeteer for the current viewport. + // This keeps us in sync with any resizes that happen outside of our own + // browser_action lifecycle (e.g. user dragging the window). + if (this.page) { + const vp = this.page.viewport() + if (vp?.width) this.lastViewportWidth = vp.width + if (vp?.height) this.lastViewportHeight = vp.height + } + + // If we've ever observed a viewport, use that. 
+ if (this.lastViewportWidth && this.lastViewportHeight) { + return { + width: this.lastViewportWidth, + height: this.lastViewportHeight, + } } + + // Otherwise fall back to the configured default so the tool can still + // operate before the first screenshot-based action has run. + const { width, height } = this.getViewport() + return { width, height } } } diff --git a/src/services/browser/__tests__/BrowserSession.spec.ts b/src/services/browser/__tests__/BrowserSession.spec.ts index d3784c3afff..a7d9707ab39 100644 --- a/src/services/browser/__tests__/BrowserSession.spec.ts +++ b/src/services/browser/__tests__/BrowserSession.spec.ts @@ -394,4 +394,61 @@ describe("cursor visualization", () => { // Verify no cursor position in result expect(result.currentMousePosition).toBeUndefined() }) + + describe("getViewportSize", () => { + it("falls back to configured viewport when no page or last viewport is available", () => { + const localCtx: any = { + globalState: { + get: vi.fn((key: string) => { + if (key === "browserViewportSize") return "1024x768" + return undefined + }), + update: vi.fn(), + }, + globalStorageUri: { fsPath: "/mock/global/storage/path" }, + extensionUri: { fsPath: "/mock/extension/path" }, + } + + const session = new BrowserSession(localCtx) + const vp = (session as any).getViewportSize() + expect(vp).toEqual({ width: 1024, height: 768 }) + }) + + it("returns live page viewport when available and updates lastViewport cache", () => { + const localCtx: any = { + globalState: { + get: vi.fn(), + update: vi.fn(), + }, + globalStorageUri: { fsPath: "/mock/global/storage/path" }, + extensionUri: { fsPath: "/mock/extension/path" }, + } + const session = new BrowserSession(localCtx) + ;(session as any).page = { + viewport: vi.fn().mockReturnValue({ width: 1111, height: 555 }), + } + + const vp = (session as any).getViewportSize() + expect(vp).toEqual({ width: 1111, height: 555 }) + expect((session as any).lastViewportWidth).toBe(1111) + expect((session as 
any).lastViewportHeight).toBe(555) + }) + + it("returns cached last viewport when page no longer exists", () => { + const localCtx: any = { + globalState: { + get: vi.fn(), + update: vi.fn(), + }, + globalStorageUri: { fsPath: "/mock/global/storage/path" }, + extensionUri: { fsPath: "/mock/extension/path" }, + } + const session = new BrowserSession(localCtx) + ;(session as any).lastViewportWidth = 800 + ;(session as any).lastViewportHeight = 600 + + const vp = (session as any).getViewportSize() + expect(vp).toEqual({ width: 800, height: 600 }) + }) + }) }) From d3a5304c919e8d58454ca7a38f91b0a6d2e92ea2 Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Fri, 21 Nov 2025 14:02:38 -0700 Subject: [PATCH 16/16] revert out of scope changes. --- src/core/webview/ClineProvider.ts | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/src/core/webview/ClineProvider.ts b/src/core/webview/ClineProvider.ts index 8f6c8c7631d..deecc6c1f87 100644 --- a/src/core/webview/ClineProvider.ts +++ b/src/core/webview/ClineProvider.ts @@ -2126,13 +2126,10 @@ export class ClineProvider providerSettings.apiProvider = apiProvider } - const cloudService = CloudService.hasInstance() ? CloudService.instance : undefined let organizationAllowList = ORGANIZATION_ALLOW_ALL try { - if (cloudService) { - organizationAllowList = await cloudService.getAllowList() - } + organizationAllowList = await CloudService.instance.getAllowList() } catch (error) { console.error( `[getState] failed to get organization allow list: ${error instanceof Error ? error.message : String(error)}`, @@ -2142,9 +2139,7 @@ export class ClineProvider let cloudUserInfo: CloudUserInfo | null = null try { - if (cloudService) { - cloudUserInfo = cloudService.getUserInfo() - } + cloudUserInfo = CloudService.instance.getUserInfo() } catch (error) { console.error( `[getState] failed to get cloud user info: ${error instanceof Error ? 
error.message : String(error)}`, @@ -2154,9 +2149,7 @@ export class ClineProvider let cloudIsAuthenticated: boolean = false try { - if (cloudService) { - cloudIsAuthenticated = cloudService.isAuthenticated() - } + cloudIsAuthenticated = CloudService.instance.isAuthenticated() } catch (error) { console.error( `[getState] failed to get cloud authentication state: ${error instanceof Error ? error.message : String(error)}`, @@ -2166,9 +2159,7 @@ export class ClineProvider let sharingEnabled: boolean = false try { - if (cloudService) { - sharingEnabled = await cloudService.canShareTask() - } + sharingEnabled = await CloudService.instance.canShareTask() } catch (error) { console.error( `[getState] failed to get sharing enabled state: ${error instanceof Error ? error.message : String(error)}`, @@ -2178,8 +2169,8 @@ export class ClineProvider let organizationSettingsVersion: number = -1 try { - if (cloudService) { - const settings = cloudService.getOrganizationSettings() + if (CloudService.hasInstance()) { + const settings = CloudService.instance.getOrganizationSettings() organizationSettingsVersion = settings?.version ?? -1 } } catch (error) { @@ -2191,9 +2182,7 @@ export class ClineProvider let taskSyncEnabled: boolean = false try { - if (cloudService) { - taskSyncEnabled = cloudService.isTaskSyncEnabled() - } + taskSyncEnabled = CloudService.instance.isTaskSyncEnabled() } catch (error) { console.error( `[getState] failed to get task sync enabled state: ${error instanceof Error ? error.message : String(error)}`,